├── dummy.csv ├── images ├── list.png ├── editor.png ├── compiler.png ├── fibonacci.png ├── recursion.png ├── freeze-melt.png ├── interpreter.png ├── variables1.png ├── variables2.png ├── auto-completion.png ├── collections_uml.png ├── lectures_sketch.png └── syntax-highlight.png ├── numpy └── images │ ├── ndarray.png │ ├── reference.png │ ├── storage_index.png │ ├── storage_simple.png │ └── ndarray_with_details.png ├── pandas ├── img │ ├── df_inside.png │ ├── df_outside.png │ ├── join-inner.png │ ├── join-left.png │ ├── join-outer.png │ ├── join-right.png │ ├── df_inside_numpy.png │ └── xlsxwriterexample.png ├── data │ ├── blooth_sales_data.xlsx │ ├── blooth_sales_data_clean.xlsx │ └── sampledf.json └── 1_intro_pandas.ipynb ├── scikit-learn ├── images │ ├── iris_setosa.jpg │ ├── petal_sepal.jpg │ ├── iris_virginica.jpg │ ├── ml-wordle-436.jpg │ ├── iris_versicolor.jpg │ ├── cluster_comparison.png │ ├── ml_supervised_example.png │ ├── ml_unsupervised_example.png │ └── scikit-learn-cheatsheet.png ├── 01.4 Review of Scikit-learn API.ipynb ├── 01.1 Introduction to Machine Learning.ipynb ├── 02.2 Supervised Learning - Regression.ipynb ├── 02.4 Unsupervised Learning - Clustering.ipynb └── 02.1 Supervised Learning - Classification.ipynb ├── requirements.txt ├── pyds.yml ├── LICENSE ├── .gitignore ├── data.py ├── Markdown_notebook_showcase.ipynb ├── data ├── inflammation-01.csv ├── inflammation-02.csv ├── inflammation-03.csv └── inflammation-04.csv ├── working_notebook_dataclasses.ipynb ├── Exercises.ipynb ├── README.md ├── python_extras ├── pep8.ipynb ├── functions-objects.ipynb └── data_model.ipynb ├── data_abstraction.ipynb ├── working_notebook_lect1.ipynb └── programming_with_python └── exceptions.ipynb /dummy.csv: -------------------------------------------------------------------------------- 1 | 0,0,1,3,1,2,4 2 | 0,1,2,1,2,1,3 3 | 0,1,1,3,3,2,6 -------------------------------------------------------------------------------- /images/list.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/images/list.png -------------------------------------------------------------------------------- /images/editor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/images/editor.png -------------------------------------------------------------------------------- /images/compiler.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/images/compiler.png -------------------------------------------------------------------------------- /images/fibonacci.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/images/fibonacci.png -------------------------------------------------------------------------------- /images/recursion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/images/recursion.png -------------------------------------------------------------------------------- /images/freeze-melt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/images/freeze-melt.png -------------------------------------------------------------------------------- /images/interpreter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/images/interpreter.png -------------------------------------------------------------------------------- /images/variables1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/images/variables1.png -------------------------------------------------------------------------------- /images/variables2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/images/variables2.png -------------------------------------------------------------------------------- /numpy/images/ndarray.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/numpy/images/ndarray.png -------------------------------------------------------------------------------- /pandas/img/df_inside.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/pandas/img/df_inside.png -------------------------------------------------------------------------------- /pandas/img/df_outside.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/pandas/img/df_outside.png -------------------------------------------------------------------------------- /pandas/img/join-inner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/pandas/img/join-inner.png -------------------------------------------------------------------------------- /pandas/img/join-left.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/pandas/img/join-left.png -------------------------------------------------------------------------------- /pandas/img/join-outer.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/pandas/img/join-outer.png -------------------------------------------------------------------------------- /pandas/img/join-right.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/pandas/img/join-right.png -------------------------------------------------------------------------------- /images/auto-completion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/images/auto-completion.png -------------------------------------------------------------------------------- /images/collections_uml.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/images/collections_uml.png -------------------------------------------------------------------------------- /images/lectures_sketch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/images/lectures_sketch.png -------------------------------------------------------------------------------- /images/syntax-highlight.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/images/syntax-highlight.png -------------------------------------------------------------------------------- /numpy/images/reference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/numpy/images/reference.png -------------------------------------------------------------------------------- 
/numpy/images/storage_index.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/numpy/images/storage_index.png -------------------------------------------------------------------------------- /pandas/img/df_inside_numpy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/pandas/img/df_inside_numpy.png -------------------------------------------------------------------------------- /numpy/images/storage_simple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/numpy/images/storage_simple.png -------------------------------------------------------------------------------- /pandas/img/xlsxwriterexample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/pandas/img/xlsxwriterexample.png -------------------------------------------------------------------------------- /pandas/data/blooth_sales_data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/pandas/data/blooth_sales_data.xlsx -------------------------------------------------------------------------------- /scikit-learn/images/iris_setosa.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/scikit-learn/images/iris_setosa.jpg -------------------------------------------------------------------------------- /scikit-learn/images/petal_sepal.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/scikit-learn/images/petal_sepal.jpg -------------------------------------------------------------------------------- /numpy/images/ndarray_with_details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/numpy/images/ndarray_with_details.png -------------------------------------------------------------------------------- /scikit-learn/images/iris_virginica.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/scikit-learn/images/iris_virginica.jpg -------------------------------------------------------------------------------- /scikit-learn/images/ml-wordle-436.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/scikit-learn/images/ml-wordle-436.jpg -------------------------------------------------------------------------------- /pandas/data/blooth_sales_data_clean.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/pandas/data/blooth_sales_data_clean.xlsx -------------------------------------------------------------------------------- /scikit-learn/images/iris_versicolor.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/scikit-learn/images/iris_versicolor.jpg -------------------------------------------------------------------------------- /scikit-learn/images/cluster_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/scikit-learn/images/cluster_comparison.png 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | jupyterlab 3 | ipython 4 | matplotlib 5 | numpy 6 | openpyxl 7 | pandas 8 | pip 9 | scikit-learn 10 | setuptools 11 | -------------------------------------------------------------------------------- /scikit-learn/images/ml_supervised_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/scikit-learn/images/ml_supervised_example.png -------------------------------------------------------------------------------- /scikit-learn/images/ml_unsupervised_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/scikit-learn/images/ml_unsupervised_example.png -------------------------------------------------------------------------------- /scikit-learn/images/scikit-learn-cheatsheet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-data-science/HEAD/scikit-learn/images/scikit-learn-cheatsheet.png -------------------------------------------------------------------------------- /pyds.yml: -------------------------------------------------------------------------------- 1 | name: pyds 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - jupyter 7 | - jupyterlab 8 | - ipython 9 | - matplotlib 10 | - numpy 11 | - openpyxl 12 | - pandas 13 | - pip 14 | - python>=3.9 15 | - scikit-learn 16 | - setuptools 17 | -------------------------------------------------------------------------------- /pandas/data/sampledf.json: -------------------------------------------------------------------------------- 1 | 
{"0":{"0":79,"1":25,"2":37,"3":74,"4":79,"5":45,"6":12,"7":36,"8":55,"9":46},"1":{"0":19,"1":39,"2":64,"3":61,"4":60,"5":26,"6":29,"7":32,"8":53,"9":74},"2":{"0":21,"1":89,"2":31,"3":100,"4":83,"5":73,"6":18,"7":22,"8":89,"9":36},"3":{"0":99,"1":66,"2":69,"3":6,"4":85,"5":73,"6":98,"7":4,"8":13,"9":54},"4":{"0":35,"1":9,"2":61,"3":58,"4":16,"5":100,"6":62,"7":66,"8":84,"9":21},"5":{"0":59,"1":41,"2":97,"3":80,"4":5,"5":60,"6":68,"7":25,"8":87,"9":12},"6":{"0":44,"1":6,"2":5,"3":95,"4":16,"5":21,"6":92,"7":63,"8":74,"9":68},"7":{"0":25,"1":69,"2":11,"3":50,"4":69,"5":19,"6":29,"7":51,"8":3,"9":33},"8":{"0":75,"1":63,"2":76,"3":15,"4":5,"5":95,"6":74,"7":59,"8":2,"9":80},"9":{"0":58,"1":3,"2":57,"3":51,"4":20,"5":12,"6":96,"7":14,"8":64,"9":25}} -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 WebValley 2021 ReImagined 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | #osx rubbish 2 | .DS_Store 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | from pathlib import Path 5 | 6 | BASE_FOLDER = Path(os.path.abspath(os.path.curdir)) 7 | DATA_FOLDER = BASE_FOLDER / "data" 8 | 9 | from typing import Sequence 10 | 11 | # from collections import namedtuple 12 | from dataclasses import dataclass 13 | 14 | 15 | @dataclass 16 | class Patient: 17 | pid: str 18 | sex: str 19 | group: str 20 | age: int 21 | inf_data: Sequence[int] 22 | 23 | def stratification_label(self) -> str: 24 | """ 25 | Return the group label of a Patient obtained 26 | by concatenation of sex and group. 27 | """ 28 | return f"{self.sex}-{self.group}" 29 | 30 | 31 | # Patient = namedtuple("Patient", ["pid", "sex", "group", "age", "inf_data"]) 32 | # Dataset = Sequence[Patient] 33 | 34 | 35 | class Dataset: 36 | def __init__(self, patients: Sequence[Patient]): 37 | self.patients = patients 38 | 39 | def __len__(self): 40 | return len(self.patients) 41 | 42 | def __getitem__(self, index) -> Patient: 43 | return self.patients[index] 44 | 45 | 46 | # expect reading inflammation-04.csv format 47 | def read_inflammation_data(filename: str) -> Dataset: 48 | """ 49 | Read dataset in ver4 format. 
50 | 51 | Parameters 52 | ---------- 53 | filename : str 54 | Name of the datafile to use in input. 55 | The file is assumed to be located in DATA_FOLDER 56 | 57 | Raises 58 | ------ 59 | ValueError 60 | When input filename does not correspond to 61 | any file in DATA FOLDER 62 | 63 | Returns 64 | ------- 65 | Dataset 66 | Sequence of `Patient` tuple 67 | """ 68 | 69 | datafilepath = DATA_FOLDER / filename 70 | if not datafilepath.exists(): 71 | raise ValueError( 72 | f"Input filename {filename} has not been found in Data Folder!" 73 | ) 74 | with open(datafilepath) as datafile: 75 | patients_list = list() 76 | for i, line in enumerate(datafile): 77 | if i == 0: 78 | continue 79 | line = line.strip() 80 | pinfo = line.split(",") 81 | patient = Patient( 82 | pid=pinfo[0], 83 | group=pinfo[-1], 84 | age=int(pinfo[-2]), 85 | sex=pinfo[-3], 86 | inf_data=np.asarray(pinfo[1:-3]).astype(int), 87 | ) 88 | patients_list.append(patient) 89 | dataset = Dataset(patients=patients_list) 90 | return dataset 91 | -------------------------------------------------------------------------------- /Markdown_notebook_showcase.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "raw", 5 | "id": "cc211f50-80da-452c-922b-4e6a31888335", 6 | "metadata": {}, 7 | "source": [ 8 | "asdasdasdasd RAW TEXT" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "2f78e7a3-6655-4144-9e87-2e84e8e41715", 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "ename": "NameError", 19 | "evalue": "name 'sasdasd' is not defined", 20 | "output_type": "error", 21 | "traceback": [ 22 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 23 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 24 | "\u001b[0;32m/var/folders/16/xdrp9d8s5510rt6fwr4j6llr0000gn/T/ipykernel_3657/3841536159.py\u001b[0m in 
\u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msasdasd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 25 | "\u001b[0;31mNameError\u001b[0m: name 'sasdasd' is not defined" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "sasdasd" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "55ad43ac-1a9c-46de-acb7-74322d7c351e", 36 | "metadata": {}, 37 | "source": [ 38 | "# header h1\n", 39 | "## header h2\n", 40 | "### header h3" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "4233abff-3a89-4868-b9e8-b142180fb11e", 46 | "metadata": {}, 47 | "source": [ 48 | "- first item\n", 49 | "- second item\n", 50 | " * second first\n", 51 | " * second second\n", 52 | "- third item\n", 53 | "\n", 54 | "this is normal text \n" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "id": "bebd3938-e2fe-4a52-9482-5bf61e66dd1f", 60 | "metadata": {}, 61 | "source": [ 62 | "**this is a bold text**\n", 63 | "\n", 64 | "*this is italic*\n", 65 | "\n", 66 | "`this is monospace`" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "id": "5af5fd1c-c15c-4a1b-b3cd-e7065af00313", 72 | "metadata": {}, 73 | "source": [ 74 | "```python\n", 75 | "\n", 76 | "for (int i =0; i < 10; i++){\n", 77 | " printf(i);\n", 78 | "}\n", 79 | "\n", 80 | "```" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "53a7601b-42a6-4c8e-9378-39d04ce0a47e", 86 | "metadata": {}, 87 | "source": [ 88 | "$ \\alpha = \\sum_{i=0}^{N} x^2$" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "id": "736d9d9d-9bb9-4c79-a061-ba7fa865d913", 94 | "metadata": {}, 95 | "source": [ 96 | "### Keyboard shortcuts to remember" 97 | ] 98 | }, 99 | { 100 | "cell_type": "raw", 101 | "id": "f04b61c3-426c-484f-8c9d-c722d8e793c8", 102 | "metadata": {}, 103 | "source": [ 104 | "add new cell above: ESC ; A\n", 105 | "add new cell below: ESC ; B\n", 106 | "del a cell (ViM style): ESC ; DD\n", 107 | "\n", 108 | "Run a cell: Shift+Enter" 109 | ] 110 | }, 
111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "id": "e75e6d6d-da39-4ed7-8b05-8b63d7ab2ff5", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [] 118 | } 119 | ], 120 | "metadata": { 121 | "kernelspec": { 122 | "display_name": "Python 3 (ipykernel)", 123 | "language": "python", 124 | "name": "python3" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 3 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": "text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython3", 136 | "version": "3.9.7" 137 | } 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 5 141 | } 142 | -------------------------------------------------------------------------------- /pandas/1_intro_pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pandas\n", 8 | "\n", 9 | "Pandas is the Swiss-Multipurpose Knife for Data Analysis in Python. With Pandas dealing with data-analysis is easy and simple but there are some things you need to get your head around first, such as Data-Frames and Data-Series. \n", 10 | "\n", 11 | "The tutorial provides a compact introduction to Pandas for beginners for I/O, data visualisation, statistical data analysis and aggregation within Jupyter notebooks."
12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Content at a glance\n", 19 | "\n", 20 | "#### A Practical Start: Reading and Writing Data Across Multiple Formats \n", 21 | "\n", 22 | "* CSV\n", 23 | "* Excel\n", 24 | "* JSON\n", 25 | "* Clipboard\n", 26 | " \n", 27 | "* data\n", 28 | " * .info\n", 29 | " * .describe\n", 30 | "\n", 31 | "#### DataSeries & DataFrames / NumPy\n", 32 | "\n", 33 | "* Ode to NumPy\n", 34 | "* Data-Series\n", 35 | "* Data-Frames\n", 36 | "\n", 37 | "#### Data selection & Indexing\n", 38 | "\n", 39 | "* Data-Series: \n", 40 | " * Slicing\n", 41 | " * Access by label\n", 42 | " * Index\n", 43 | "* Data-Frames: \n", 44 | " * Slicing\n", 45 | " * Access by label\n", 46 | " * Peek into joining data\n", 47 | "* Returns a copy / inplace\n", 48 | "* Boolean indexing\n", 49 | "\n", 50 | "#### Operations\n", 51 | " \n", 52 | " * add/subtract\n", 53 | " * multiply\n", 54 | " * mention Index but don't go deep\n", 55 | "\n", 56 | "#### Data Visualisation\n", 57 | "\n", 58 | " * plot your data directly into your notebook\n", 59 | " \n", 60 | "#### Anti Patterns\n", 61 | "\n", 62 | " * a collection of (anti-)patterns when using Pandas" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "---" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "## Brief Introduction to Pandas\n", 77 | "\n", 78 | "Pandas builds on top of two main data structures: **Data Frame** and **Series**" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "### Data Frame _from the outside_\n", 86 | "\n", 87 | "" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "### Data Frame _from the inside_\n", 95 | "\n", 96 | "" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "### Data Frame vs Numpy Array" 104 | ] 105 |
}, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "#### Numpy Array\n", 111 | "\n", 112 | "" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "#### Pandas Data Frame" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [] 135 | } 136 | ], 137 | "metadata": { 138 | "kernelspec": { 139 | "display_name": "Python 3 (ipykernel)", 140 | "language": "python", 141 | "name": "python3" 142 | }, 143 | "language_info": { 144 | "codemirror_mode": { 145 | "name": "ipython", 146 | "version": 3 147 | }, 148 | "file_extension": ".py", 149 | "mimetype": "text/x-python", 150 | "name": "python", 151 | "nbconvert_exporter": "python", 152 | "pygments_lexer": "ipython3", 153 | "version": "3.9.7" 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 4 158 | } 159 | -------------------------------------------------------------------------------- /data/inflammation-01.csv: -------------------------------------------------------------------------------- 1 | 0,0,1,3,1,2,4,7,8,3,3,3,10,5,7,4,7,7,12,18,6,13,11,11,7,7,4,6,8,8,4,4,5,7,3,4,2,3,0,0 2 | 0,1,2,1,2,1,3,2,2,6,10,11,5,9,4,4,7,16,8,6,18,4,12,5,12,7,11,5,11,3,3,5,4,4,5,5,1,1,0,1 3 | 0,1,1,3,3,2,6,2,5,9,5,7,4,5,4,15,5,11,9,10,19,14,12,17,7,12,11,7,4,2,10,5,4,2,2,3,2,2,1,1 4 | 0,0,2,0,4,2,2,1,6,7,10,7,9,13,8,8,15,10,10,7,17,4,4,7,6,15,6,4,9,11,3,5,6,3,3,4,2,3,2,1 5 | 0,1,1,3,3,1,3,5,2,4,4,7,6,5,3,10,8,10,6,17,9,14,9,7,13,9,12,6,7,7,9,6,3,2,2,4,2,0,1,1 6 | 0,0,1,2,2,4,2,1,6,4,7,6,6,9,9,15,4,16,18,12,12,5,18,9,5,3,10,3,12,7,8,4,7,3,5,4,4,3,2,1 7 | 0,0,2,2,4,2,2,5,5,8,6,5,11,9,4,13,5,12,10,6,9,17,15,8,9,3,13,7,8,2,8,8,4,2,3,5,4,1,1,1 8 | 0,0,1,2,3,1,2,3,5,3,7,8,8,5,10,9,15,11,18,19,20,8,5,13,15,10,6,10,6,7,4,9,3,5,2,5,3,2,2,1 9 | 
0,0,0,3,1,5,6,5,5,8,2,4,11,12,10,11,9,10,17,11,6,16,12,6,8,14,6,13,10,11,4,6,4,7,6,3,2,1,0,0 10 | 0,1,1,2,1,3,5,3,5,8,6,8,12,5,13,6,13,8,16,8,18,15,16,14,12,7,3,8,9,11,2,5,4,5,1,4,1,2,0,0 11 | 0,1,0,0,4,3,3,5,5,4,5,8,7,10,13,3,7,13,15,18,8,15,15,16,11,14,12,4,10,10,4,3,4,5,5,3,3,2,2,1 12 | 0,1,0,0,3,4,2,7,8,5,2,8,11,5,5,8,14,11,6,11,9,16,18,6,12,5,4,3,5,7,8,3,5,4,5,5,4,0,1,1 13 | 0,0,2,1,4,3,6,4,6,7,9,9,3,11,6,12,4,17,13,15,13,12,8,7,4,7,12,9,5,6,5,4,7,3,5,4,2,3,0,1 14 | 0,0,0,0,1,3,1,6,6,5,5,6,3,6,13,3,10,13,9,16,15,9,11,4,6,4,11,11,12,3,5,8,7,4,6,4,1,3,0,0 15 | 0,1,2,1,1,1,4,1,5,2,3,3,10,7,13,5,7,17,6,9,12,13,10,4,12,4,6,7,6,10,8,2,5,1,3,4,2,0,2,0 16 | 0,1,1,0,1,2,4,3,6,4,7,5,5,7,5,10,7,8,18,17,9,8,12,11,11,11,14,6,11,2,10,9,5,6,5,3,4,2,2,0 17 | 0,0,0,0,2,3,6,5,7,4,3,2,10,7,9,11,12,5,12,9,13,19,14,17,5,13,8,11,5,10,9,8,7,5,3,1,4,0,2,1 18 | 0,0,0,1,2,1,4,3,6,7,4,2,12,6,12,4,14,7,8,14,13,19,6,9,12,6,4,13,6,7,2,3,6,5,4,2,3,0,1,0 19 | 0,0,2,1,2,5,4,2,7,8,4,7,11,9,8,11,15,17,11,12,7,12,7,6,7,4,13,5,7,6,6,9,2,1,1,2,2,0,1,0 20 | 0,1,2,0,1,4,3,2,2,7,3,3,12,13,11,13,6,5,9,16,9,19,16,11,8,9,14,12,11,9,6,6,6,1,1,2,4,3,1,1 21 | 0,1,1,3,1,4,4,1,8,2,2,3,12,12,10,15,13,6,5,5,18,19,9,6,11,12,7,6,3,6,3,2,4,3,1,5,4,2,2,0 22 | 0,0,2,3,2,3,2,6,3,8,7,4,6,6,9,5,12,12,8,5,12,10,16,7,14,12,5,4,6,9,8,5,6,6,1,4,3,0,2,0 23 | 0,0,0,3,4,5,1,7,7,8,2,5,12,4,10,14,5,5,17,13,16,15,13,6,12,9,10,3,3,7,4,4,8,2,6,5,1,0,1,0 24 | 0,1,1,1,1,3,3,2,6,3,9,7,8,8,4,13,7,14,11,15,14,13,5,13,7,14,9,10,5,11,5,3,5,1,1,4,4,1,2,0 25 | 0,1,1,1,2,3,5,3,6,3,7,10,3,8,12,4,12,9,15,5,17,16,5,10,10,15,7,5,3,11,5,5,6,1,1,1,1,0,2,1 26 | 0,0,2,1,3,3,2,7,4,4,3,8,12,9,12,9,5,16,8,17,7,11,14,7,13,11,7,12,12,7,8,5,7,2,2,4,1,1,1,0 27 | 0,0,1,2,4,2,2,3,5,7,10,5,5,12,3,13,4,13,7,15,9,12,18,14,16,12,3,11,3,2,7,4,8,2,2,1,3,0,1,1 28 | 0,0,1,1,1,5,1,5,2,2,4,10,4,8,14,6,15,6,12,15,15,13,7,17,4,5,11,4,8,7,9,4,5,3,2,5,4,3,2,1 29 | 0,0,2,2,3,4,6,3,7,6,4,5,8,4,7,7,6,11,12,19,20,18,9,5,4,7,14,8,4,3,7,7,8,3,5,4,1,3,1,0 30 | 
0,0,0,1,4,4,6,3,8,6,4,10,12,3,3,6,8,7,17,16,14,15,17,4,14,13,4,4,12,11,6,9,5,5,2,5,2,1,0,1 31 | 0,1,1,0,3,2,4,6,8,6,2,3,11,3,14,14,12,8,8,16,13,7,6,9,15,7,6,4,10,8,10,4,2,6,5,5,2,3,2,1 32 | 0,0,2,3,3,4,5,3,6,7,10,5,10,13,14,3,8,10,9,9,19,15,15,6,8,8,11,5,5,7,3,6,6,4,5,2,2,3,0,0 33 | 0,1,2,2,2,3,6,6,6,7,6,3,11,12,13,15,15,10,14,11,11,8,6,12,10,5,12,7,7,11,5,8,5,2,5,5,2,0,2,1 34 | 0,0,2,1,3,5,6,7,5,8,9,3,12,10,12,4,12,9,13,10,10,6,10,11,4,15,13,7,3,4,2,9,7,2,4,2,1,2,1,1 35 | 0,0,1,2,4,1,5,5,2,3,4,8,8,12,5,15,9,17,7,19,14,18,12,17,14,4,13,13,8,11,5,6,6,2,3,5,2,1,1,1 36 | 0,0,0,3,1,3,6,4,3,4,8,3,4,8,3,11,5,7,10,5,15,9,16,17,16,3,8,9,8,3,3,9,5,1,6,5,4,2,2,0 37 | 0,1,2,2,2,5,5,1,4,6,3,6,5,9,6,7,4,7,16,7,16,13,9,16,12,6,7,9,10,3,6,4,5,4,6,3,4,3,2,1 38 | 0,1,1,2,3,1,5,1,2,2,5,7,6,6,5,10,6,7,17,13,15,16,17,14,4,4,10,10,10,11,9,9,5,4,4,2,1,0,1,0 39 | 0,1,0,3,2,4,1,1,5,9,10,7,12,10,9,15,12,13,13,6,19,9,10,6,13,5,13,6,7,2,5,5,2,1,1,1,1,3,0,1 40 | 0,1,1,3,1,1,5,5,3,7,2,2,3,12,4,6,8,15,16,16,15,4,14,5,13,10,7,10,6,3,2,3,6,3,3,5,4,3,2,1 41 | 0,0,0,2,2,1,3,4,5,5,6,5,5,12,13,5,7,5,11,15,18,7,9,10,14,12,11,9,10,3,2,9,6,2,2,5,3,0,0,1 42 | 0,0,1,3,3,1,2,1,8,9,2,8,10,3,8,6,10,13,11,17,19,6,4,11,6,12,7,5,5,4,4,8,2,6,6,4,2,2,0,0 43 | 0,1,1,3,4,5,2,1,3,7,9,6,10,5,8,15,11,12,15,6,12,16,6,4,14,3,12,9,6,11,5,8,5,5,6,1,2,1,2,0 44 | 0,0,1,3,1,4,3,6,7,8,5,7,11,3,6,11,6,10,6,19,18,14,6,10,7,9,8,5,8,3,10,2,5,1,5,4,2,1,0,1 45 | 0,1,1,3,3,4,4,6,3,4,9,9,7,6,8,15,12,15,6,11,6,18,5,14,15,12,9,8,3,6,10,6,8,7,2,5,4,3,1,1 46 | 0,1,2,2,4,3,1,4,8,9,5,10,10,3,4,6,7,11,16,6,14,9,11,10,10,7,10,8,8,4,5,8,4,4,5,2,4,1,1,0 47 | 0,0,2,3,4,5,4,6,2,9,7,4,9,10,8,11,16,12,15,17,19,10,18,13,15,11,8,4,7,11,6,7,6,5,1,3,1,0,0,0 48 | 0,1,1,3,1,4,6,2,8,2,10,3,11,9,13,15,5,15,6,10,10,5,14,15,12,7,4,5,11,4,6,9,5,6,1,1,2,1,2,1 49 | 0,0,1,3,2,5,1,2,7,6,6,3,12,9,4,14,4,6,12,9,12,7,11,7,16,8,13,6,7,6,10,7,6,3,1,5,4,3,0,0 50 | 0,0,1,2,3,4,5,7,5,4,10,5,12,12,5,4,7,9,18,16,16,10,15,15,10,4,3,7,5,9,4,6,2,4,1,4,2,2,2,1 51 | 
0,1,2,1,1,3,5,3,6,3,10,10,11,10,13,10,13,6,6,14,5,4,5,5,9,4,12,7,7,4,7,9,3,3,6,3,4,1,2,0 52 | 0,1,2,2,3,5,2,4,5,6,8,3,5,4,3,15,15,12,16,7,20,15,12,8,9,6,12,5,8,3,8,5,4,1,3,2,1,3,1,0 53 | 0,0,0,2,4,4,5,3,3,3,10,4,4,4,14,11,15,13,10,14,11,17,9,11,11,7,10,12,10,10,10,8,7,5,2,2,4,1,2,1 54 | 0,0,2,1,1,4,4,7,2,9,4,10,12,7,6,6,11,12,9,15,15,6,6,13,5,12,9,6,4,7,7,6,5,4,1,4,2,2,2,1 55 | 0,1,2,1,1,4,5,4,4,5,9,7,10,3,13,13,8,9,17,16,16,15,12,13,5,12,10,9,11,9,4,5,5,2,2,5,1,0,0,1 56 | 0,0,1,3,2,3,6,4,5,7,2,4,11,11,3,8,8,16,5,13,16,5,8,8,6,9,10,10,9,3,3,5,3,5,4,5,3,3,0,1 57 | 0,1,1,2,2,5,1,7,4,2,5,5,4,6,6,4,16,11,14,16,14,14,8,17,4,14,13,7,6,3,7,7,5,6,3,4,2,2,1,1 58 | 0,1,1,1,4,1,6,4,6,3,6,5,6,4,14,13,13,9,12,19,9,10,15,10,9,10,10,7,5,6,8,6,6,4,3,5,2,1,1,1 59 | 0,0,0,1,4,5,6,3,8,7,9,10,8,6,5,12,15,5,10,5,8,13,18,17,14,9,13,4,10,11,10,8,8,6,5,5,2,0,2,0 60 | 0,0,1,0,3,2,5,4,8,2,9,3,3,10,12,9,14,11,13,8,6,18,11,9,13,11,8,5,5,2,8,5,3,5,4,1,3,1,1,0 61 | -------------------------------------------------------------------------------- /data/inflammation-02.csv: -------------------------------------------------------------------------------- 1 | 669f,0,0,1,3,1,2,4,7,8,3,3,3,10,5,7,4,7,7,12,18,6,13,11,11,7,7,4,6,8,8,4,4,5,7,3,4,2,3,0,0 2 | 2edf,0,1,2,1,2,1,3,2,2,6,10,11,5,9,4,4,7,16,8,6,18,4,12,5,12,7,11,5,11,3,3,5,4,4,5,5,1,1,0,1 3 | 0355,0,1,1,3,3,2,6,2,5,9,5,7,4,5,4,15,5,11,9,10,19,14,12,17,7,12,11,7,4,2,10,5,4,2,2,3,2,2,1,1 4 | 5968,0,0,2,0,4,2,2,1,6,7,10,7,9,13,8,8,15,10,10,7,17,4,4,7,6,15,6,4,9,11,3,5,6,3,3,4,2,3,2,1 5 | c760,0,1,1,3,3,1,3,5,2,4,4,7,6,5,3,10,8,10,6,17,9,14,9,7,13,9,12,6,7,7,9,6,3,2,2,4,2,0,1,1 6 | 6b51,0,0,1,2,2,4,2,1,6,4,7,6,6,9,9,15,4,16,18,12,12,5,18,9,5,3,10,3,12,7,8,4,7,3,5,4,4,3,2,1 7 | dbaf,0,0,2,2,4,2,2,5,5,8,6,5,11,9,4,13,5,12,10,6,9,17,15,8,9,3,13,7,8,2,8,8,4,2,3,5,4,1,1,1 8 | b3b7,0,0,1,2,3,1,2,3,5,3,7,8,8,5,10,9,15,11,18,19,20,8,5,13,15,10,6,10,6,7,4,9,3,5,2,5,3,2,2,1 9 | 
3995,0,0,0,3,1,5,6,5,5,8,2,4,11,12,10,11,9,10,17,11,6,16,12,6,8,14,6,13,10,11,4,6,4,7,6,3,2,1,0,0 10 | d6ff,0,1,1,2,1,3,5,3,5,8,6,8,12,5,13,6,13,8,16,8,18,15,16,14,12,7,3,8,9,11,2,5,4,5,1,4,1,2,0,0 11 | 2d58,0,1,0,0,4,3,3,5,5,4,5,8,7,10,13,3,7,13,15,18,8,15,15,16,11,14,12,4,10,10,4,3,4,5,5,3,3,2,2,1 12 | a1d4,0,1,0,0,3,4,2,7,8,5,2,8,11,5,5,8,14,11,6,11,9,16,18,6,12,5,4,3,5,7,8,3,5,4,5,5,4,0,1,1 13 | 71e9,0,0,2,1,4,3,6,4,6,7,9,9,3,11,6,12,4,17,13,15,13,12,8,7,4,7,12,9,5,6,5,4,7,3,5,4,2,3,0,1 14 | 65c1,0,0,0,0,1,3,1,6,6,5,5,6,3,6,13,3,10,13,9,16,15,9,11,4,6,4,11,11,12,3,5,8,7,4,6,4,1,3,0,0 15 | 1edd,0,1,2,1,1,1,4,1,5,2,3,3,10,7,13,5,7,17,6,9,12,13,10,4,12,4,6,7,6,10,8,2,5,1,3,4,2,0,2,0 16 | 277b,0,1,1,0,1,2,4,3,6,4,7,5,5,7,5,10,7,8,18,17,9,8,12,11,11,11,14,6,11,2,10,9,5,6,5,3,4,2,2,0 17 | fe0e,0,0,0,0,2,3,6,5,7,4,3,2,10,7,9,11,12,5,12,9,13,19,14,17,5,13,8,11,5,10,9,8,7,5,3,1,4,0,2,1 18 | 66d3,0,0,0,1,2,1,4,3,6,7,4,2,12,6,12,4,14,7,8,14,13,19,6,9,12,6,4,13,6,7,2,3,6,5,4,2,3,0,1,0 19 | 3ff3,0,0,2,1,2,5,4,2,7,8,4,7,11,9,8,11,15,17,11,12,7,12,7,6,7,4,13,5,7,6,6,9,2,1,1,2,2,0,1,0 20 | 4102,0,1,2,0,1,4,3,2,2,7,3,3,12,13,11,13,6,5,9,16,9,19,16,11,8,9,14,12,11,9,6,6,6,1,1,2,4,3,1,1 21 | 12c9,0,1,1,3,1,4,4,1,8,2,2,3,12,12,10,15,13,6,5,5,18,19,9,6,11,12,7,6,3,6,3,2,4,3,1,5,4,2,2,0 22 | 5b04,0,0,2,3,2,3,2,6,3,8,7,4,6,6,9,5,12,12,8,5,12,10,16,7,14,12,5,4,6,9,8,5,6,6,1,4,3,0,2,0 23 | 1fef,0,0,0,3,4,5,1,7,7,8,2,5,12,4,10,14,5,5,17,13,16,15,13,6,12,9,10,3,3,7,4,4,8,2,6,5,1,0,1,0 24 | 01c0,0,1,1,1,1,3,3,2,6,3,9,7,8,8,4,13,7,14,11,15,14,13,5,13,7,14,9,10,5,11,5,3,5,1,1,4,4,1,2,0 25 | 57b5,0,1,1,1,2,3,5,3,6,3,7,10,3,8,12,4,12,9,15,5,17,16,5,10,10,15,7,5,3,11,5,5,6,1,1,1,1,0,2,1 26 | 226c,0,0,2,1,3,3,2,7,4,4,3,8,12,9,12,9,5,16,8,17,7,11,14,7,13,11,7,12,12,7,8,5,7,2,2,4,1,1,1,0 27 | c653,0,0,1,2,4,2,2,3,5,7,10,5,5,12,3,13,4,13,7,15,9,12,18,14,16,12,3,11,3,2,7,4,8,2,2,1,3,0,1,1 28 | 94fd,0,0,1,1,1,5,1,5,2,2,4,10,4,8,14,6,15,6,12,15,15,13,7,17,4,5,11,4,8,7,9,4,5,3,2,5,4,3,2,1 29 | 
ebf2,0,0,2,2,3,4,6,3,7,6,4,5,8,4,7,7,6,11,12,19,20,18,9,5,4,7,14,8,4,3,7,7,8,3,5,4,1,3,1,0 30 | fc73,0,0,0,1,4,4,6,3,8,6,4,10,12,3,3,6,8,7,17,16,14,15,17,4,14,13,4,4,12,11,6,9,5,5,2,5,2,1,0,1 31 | d4a0,0,1,1,0,3,2,4,6,8,6,2,3,11,3,14,14,12,8,8,16,13,7,6,9,15,7,6,4,10,8,10,4,2,6,5,5,2,3,2,1 32 | a9f2,0,0,2,3,3,4,5,3,6,7,10,5,10,13,14,3,8,10,9,9,19,15,15,6,8,8,11,5,5,7,3,6,6,4,5,2,2,3,0,0 33 | dc22,0,1,2,2,2,3,6,6,6,7,6,3,11,12,13,15,15,10,14,11,11,8,6,12,10,5,12,7,7,11,5,8,5,2,5,5,2,0,2,1 34 | a6e7,0,0,2,1,3,5,6,7,5,8,9,3,12,10,12,4,12,9,13,10,10,6,10,11,4,15,13,7,3,4,2,9,7,2,4,2,1,2,1,1 35 | 3fb2,0,0,1,2,4,1,5,5,2,3,4,8,8,12,5,15,9,17,7,19,14,18,12,17,14,4,13,13,8,11,5,6,6,2,3,5,2,1,1,1 36 | 11cc,0,0,0,3,1,3,6,4,3,4,8,3,4,8,3,11,5,7,10,5,15,9,16,17,16,3,8,9,8,3,3,9,5,1,6,5,4,2,2,0 37 | c9f5,0,1,2,2,2,5,5,1,4,6,3,6,5,9,6,7,4,7,16,7,16,13,9,16,12,6,7,9,10,3,6,4,5,4,6,3,4,3,2,1 38 | a73f,0,1,1,2,3,1,5,1,2,2,5,7,6,6,5,10,6,7,17,13,15,16,17,14,4,4,10,10,10,11,9,9,5,4,4,2,1,0,1,0 39 | dab2,0,1,0,3,2,4,1,1,5,9,10,7,12,10,9,15,12,13,13,6,19,9,10,6,13,5,13,6,7,2,5,5,2,1,1,1,1,3,0,1 40 | 65a1,0,1,1,3,1,1,5,5,3,7,2,2,3,12,4,6,8,15,16,16,15,4,14,5,13,10,7,10,6,3,2,3,6,3,3,5,4,3,2,1 41 | 8bcb,0,0,0,2,2,1,3,4,5,5,6,5,5,12,13,5,7,5,11,15,18,7,9,10,14,12,11,9,10,3,2,9,6,2,2,5,3,0,0,1 42 | 4004,0,0,1,3,3,1,2,1,8,9,2,8,10,3,8,6,10,13,11,17,19,6,4,11,6,12,7,5,5,4,4,8,2,6,6,4,2,2,0,0 43 | c2af,0,1,1,3,4,5,2,1,3,7,9,6,10,5,8,15,11,12,15,6,12,16,6,4,14,3,12,9,6,11,5,8,5,5,6,1,2,1,2,0 44 | 8037,0,0,1,3,1,4,3,6,7,8,5,7,11,3,6,11,6,10,6,19,18,14,6,10,7,9,8,5,8,3,10,2,5,1,5,4,2,1,0,1 45 | cb49,0,1,1,3,3,4,4,6,3,4,9,9,7,6,8,15,12,15,6,11,6,18,5,14,15,12,9,8,3,6,10,6,8,7,2,5,4,3,1,1 46 | 2b4b,0,1,2,2,4,3,1,4,8,9,5,10,10,3,4,6,7,11,16,6,14,9,11,10,10,7,10,8,8,4,5,8,4,4,5,2,4,1,1,0 47 | 80a8,0,0,2,3,4,5,4,6,2,9,7,4,9,10,8,11,16,12,15,17,19,10,18,13,15,11,8,4,7,11,6,7,6,5,1,3,1,0,0,0 48 | ac50,0,1,1,3,1,4,6,2,8,2,10,3,11,9,13,15,5,15,6,10,10,5,14,15,12,7,4,5,11,4,6,9,5,6,1,1,2,1,2,1 49 | 
57ef,0,0,1,3,2,5,1,2,7,6,6,3,12,9,4,14,4,6,12,9,12,7,11,7,16,8,13,6,7,6,10,7,6,3,1,5,4,3,0,0 50 | cc45,0,0,1,2,3,4,5,7,5,4,10,5,12,12,5,4,7,9,18,16,16,10,15,15,10,4,3,7,5,9,4,6,2,4,1,4,2,2,2,1 51 | 9184,0,1,2,1,1,3,5,3,6,3,10,10,11,10,13,10,13,6,6,14,5,4,5,5,9,4,12,7,7,4,7,9,3,3,6,3,4,1,2,0 52 | 84be,0,1,2,2,3,5,2,4,5,6,8,3,5,4,3,15,15,12,16,7,20,15,12,8,9,6,12,5,8,3,8,5,4,1,3,2,1,3,1,0 53 | 0af0,0,0,0,2,4,4,5,3,3,3,10,4,4,4,14,11,15,13,10,14,11,17,9,11,11,7,10,12,10,10,10,8,7,5,2,2,4,1,2,1 54 | bf77,0,0,2,1,1,4,4,7,2,9,4,10,12,7,6,6,11,12,9,15,15,6,6,13,5,12,9,6,4,7,7,6,5,4,1,4,2,2,2,1 55 | c56c,0,1,2,1,1,4,5,4,4,5,9,7,10,3,13,13,8,9,17,16,16,15,12,13,5,12,10,9,11,9,4,5,5,2,2,5,1,0,0,1 56 | 7d0c,0,0,1,3,2,3,6,4,5,7,2,4,11,11,3,8,8,16,5,13,16,5,8,8,6,9,10,10,9,3,3,5,3,5,4,5,3,3,0,1 57 | c736,0,1,1,2,2,5,1,7,4,2,5,5,4,6,6,4,16,11,14,16,14,14,8,17,4,14,13,7,6,3,7,7,5,6,3,4,2,2,1,1 58 | c5c8,0,1,1,1,4,1,6,4,6,3,6,5,6,4,14,13,13,9,12,19,9,10,15,10,9,10,10,7,5,6,8,6,6,4,3,5,2,1,1,1 59 | 050a,0,0,0,1,4,5,6,3,8,7,9,10,8,6,5,12,15,5,10,5,8,13,18,17,14,9,13,4,10,11,10,8,8,6,5,5,2,0,2,0 60 | a085,0,0,1,0,3,2,5,4,8,2,9,3,3,10,12,9,14,11,13,8,6,18,11,9,13,11,8,5,5,2,8,5,3,5,4,1,3,1,1,0 61 | -------------------------------------------------------------------------------- /data/inflammation-03.csv: -------------------------------------------------------------------------------- 1 | 669f,0,0,1,3,1,2,4,7,8,3,3,3,10,5,7,4,7,7,12,18,6,13,11,11,7,7,4,6,8,8,4,4,5,7,3,4,2,3,0,0 2 | 2edf,0,1,2,1,2,1,3,2,2,6,10,11,5,9,4,4,7,16,8,6,18,4,12,5,12,7,11,5,11,3,3,5,4,4,5,5,1,1,0,1 3 | 0355,0,1,1,3,3,2,6,2,5,9,5,7,4,5,4,15,5,11,9,10,19,14,12,17,7,12,11,7,4,2,10,5,4,2,2,3,2,2,1,1 4 | 5968,0,0,2,0,4,2,2,1,6,7,10,7,9,13,8,8,15,10,10,7,17,4,4,7,6,15,6,4,9,11,3,5,6,3,3,4,2,3,2,1 5 | c760,0,1,1,3,3,1,3,5,2,4,4,7,6,5,3,10,8,10,6,17,9,14,9,"7",13,9,12,6,7,7,9,6,3,2,2,4,2,0,1,1 6 | 6b51,0,0,1,2,2,4,2,1,6,4,7,6,6,9,9,15,4,16,18,12,12,5,18,9,5,3,10,3,12,7,8,4,7,3,5,4,4,3,2,1 7 | 
dbaf,0,0,2,2,4,2,2,5,5,8,6,5,11,9,4,13,5,12,10,6,9,17,15,8,9,3,13,7,8,2,8,8,4,2,3,5,4,1,1,1 8 | b3b7,0,0,1,2,3,1,2,3,5,3,7,8,8,5,10,9,15,11,18,19,20,8,5,13,15,10,6,10,6,7,4,9,3,5,2,5,3,2,2,1 9 | 3995,0,0,0,3,1,5,6,5,8,2,4,11,12,10,11,9,10,17,11,6,16,12,6,8,14,6,13,10,11,4,6,4,7,6,3,2,1,0,0 10 | d6ff,0,1,1,2,1,3,5,3,5,8,6,8,12,5,13,6,13,8,16,8,18,15,16,14,12,7,3,8,9,11,2,5,4,5,1,4,1,2,0,0 11 | 2d58,0,1,0,0,4,3,3,5,5,4,5,8,7,10,13,13,15,18,8,15,15,16,11,14,12,4,10,10,4,3,4,5,5,3,3,2,2,1 12 | a1d4,0,1,0,0,3,4,2,7,828,11,5,5,8,14,11,6,11,9,16,18,6,12,5,4,3,5,7,8,3,5,4,5,5,4,0,1,1 13 | 71e9,0,0,2,1,4,3,6,4,6,7,9,9,3,11,6,12,4,17,13,15,13,12,8,7,4,7,12,9,5,6,5,4,7,3,5,4,2,3,0,1 14 | 65c1,0,0,0,0,1,3,1,6,6,5,5,6,3,6,13,3,10,13,9,16,15,9,11,4,6,4,11,-11,12,3,5,8,7,4,6,4,1,3,0,0 15 | 1edd,0,1,2,1,1,1,5,2,3,3,10,7,13,5,7,17,6,9,12,13,10,4,12,4,6,7,6,10,8,2,5,1,3,4,2,0,2,0 16 | 277b,0,1,1,0,1,2,4,3,6,4,7,5,5,7,5,10,7,8,18,17,9,8,12,11,11,11,14,6,11,2,10,9,5,6,5,3,4,2,2,0 17 | fe0e,0,0,0,0,2,3,6,5,7,4,3,2,10,7,9,11,12,5,12,9,13,19,14,17,5,13,8,11,5,10,9,8,7,5,3,1,4,0,2,1 18 | 66d3,0,0,0,1,2,1,4,3,6,7,4,2,12,6,12,4,14,7,8,14,13,19,6,9,12,6,4,13,6,7,2,3,6,5,4,2,3,0,1,0 19 | 3ff3,0,0,2,1,2,5,4,2,7,8,4,7,11,9,8,11,15,17,11,12,7,12,7,6,7,4,13,5,7,6,6,9,2,1,1,2,2,0,1,0 20 | 4102,0,1,2,0,1,4,3,2,2,7,3,3,12,13,11,13,6,5,9,16,9,19,16,11,8,9,14,12,11,9,6,6,6,1,1,2,4,3,1,1 21 | 12c9,0,1,1,3,1,4,4,1,8,2,2,3,12,12,10,15,13,6,5,5,18,19,9,6,11,12,7,6,3,6,3,2,4,3,1,5,4,2,2,0 22 | 5b04,0,0,2,3,2,3,2,6,3,8,7,4,6,6,9,5,12,12,8,5,12,10,16,7,14,12,5,4,6,9,8,5,6,6,1,4,3,0,2,0 23 | 1fef,0,0,0,3,4,5,1,7,7,8,2,5,12,4,10,14,5,5,17,13,16,15,13,6,12,9,10,a,3,7,4,4,8,2,6,5,1,0,1,0 24 | 01c0,0,1,1,1,1,3,3,2,6,3,9,7,8,8,4,13,7,14,11,15,14,13,5,13,7,14,9,10,5,11,5,3,5,1,1,4,4,1,2,0 25 | 57b5,0,1,1,1,2,3,5,3,6,3,7,10,3,8,12,4,12,9,15,5,17,16,5,10,10,15,7,5,3,11,5,5,6,1,1,1,1,0,2,1 26 | 226c,0,0,2,1,3,3,2,7,4,4,3,8,12,9,12,9,5,16,8,17,7,11,14,7,13,11,7,12,12,7,8,5,7,2,2,4,1,1,1,0 27 | 
c653,0,0,1,2,4,2,2,3,5,7,10,5,5,12,3,13,4,13,7,15,9,12,18,14,16,12,3,11,3,2,7,4,8,2,2,1,3,0,1,1 28 | 94fd,0,0,1,1,1,5,1,5,2,2,4,10,4,8,14,6,15,6,12,15,15,13,7,17,4,5,11,4,8,7,9,4,5,3,2,5,4,3,2,1 29 | ebf2,0,0,2,2,3,4,6,3,7,6,4,5,8,4,7,7,6,11,12,19,20,18,9,5,4,7,14,8,4,3,7,7,8,3,5,4,1,3,1,0 30 | fc73,0,0,0,1,4,4,6,3,8,6,4,10,12,3,3,6,8,7,17,16,14,15,17,4,14,13,4,4,12,11,6,9,5,5,2,5,2,1,0,1 31 | d4a0,0,1,1,0,3,2,4,6,8,6,2,3,11,3,14,14,12,8,8,16,13.2,7,6,9,15,7,6,4,10,8,10,4,2,6,5,5,2,3,2,1 32 | a9f2,0,0,2,3,3,4,5,3,6,7,10,5,10,13,14,3,8,10,9,9,19,15,15,6,8,8,11,5,5,7,3,6,6,4,5,2,2,3,0,0 33 | dc22,0,1,2,2,2,3,6,6,6,7,6,3,11,12,13,15,15,10,14,11,11,8,6,12,10,5,12,7,7,11,5,8,5,2,5,5,2,0,2,1 34 | a6e7,0,0,2,1,3,5,6,7,5,8,9,3,12,10,12,4,12,9,13,10,10,6,10,11,4,15,13,7,3,4,2,9,7,2,4,2,1,2,1,1 35 | 3fb2,0,0,1,2,4,1,5,5,2,3,4,8,8,12,5,15,9,17,7,19,14,18,12,17,14,4,13,13,8,11,5,6,6,2,3,5,2,1,1,1 36 | 11cc,0,0,0,3,1,3,6,4,3,4,8,3,4,8,3,11,5,7,10,5,15,9,16,17,16,3,8,9,8,3,3,9,5,1,6,5,4,2,2,0 37 | c9f5,0,1,2,2,2,5,5,1,4,6,3,6,5,9,6,7,4,7,16,7,16,13,9,16,12,6,7,9,10,3,6,4,5,4,6,3,4,3,2,1 38 | a73f,0,1,1,2,3,1,5,1,2,2,5,7,6,6,5,10,6,7,17,13,15,16,17,14,4,4,10,10,10,11,9,9,5,4,4,2,1,0,1,0 39 | dab2,0,1,0,3,2,4,1,1,5,9,10,7,12,10,9,15,12,13,13,6,19,9,10,6,13,5,13,6,7,2,5,5,2,1,1,1,1,3,0,1 40 | 65a1,0,1,1,3\t,1,1,5,5,3,7,2,2,3,12,4,6,8,15,16,16,15,4,14,5,13,10,7,10,6,3,2,3,6,3,3,5,4,3,2,1 41 | 8bcb,0,0,0,2,2,1,3,4,5,5,6,5,5,12,13,5,7,5,11,15,18,7,9,10,14,12,11,9,10,3,2,9,6,2,2,5,3,0,0,1 42 | 4004,0,0,1,3,3,1,2,1,8,9,2,8,10,3,8,6,10,13,11,17,19,6,4,11,6,12,7,5,5,4,4,8,2,6,6,4,2,2,0,0 43 | c2af,0,1,1,3,4,5,2,1,3,7,9,6,10,5,8,15,11,12,15,6,12,16,6,4,14,3,12,9,6,11,5,8,5,5,6,1,2,1,2,0 44 | 8037,0,0,1,3,1,4,3,6,7,8,5,7,11,3,6,11,6,10,6,19,18,14,6,10,7,9,8,5,8,3,10,2,5,1,5,4,2,1,0,1 45 | cb49,0,1,1,3,3,4,4,6,3,-4,9,9,7,6,8,15,12,15,6,11,6,18,5,14,15,12,9,8,3,6,10,6,8,7,2,5,4,3,1,1 46 | 2b4b,0,1,2,2,4,3,1,4,8,9,5,10,10,3,4,6,7,11,16,6,14,9,11,10,10,7,10,8,8,4,5,8,4,4,5,2,4,1,1,0 47 | 
80a8,0,0,2,3,4,5,4,6,2,9,7,4,9,10,8,11,16,12,15,17,19,10,18,13,15,11,8,4,7,11,6,7,6,5,1,3,1,0,0,0 48 | ac50,0,1,1,3,1,4,6,2,8,2,10,3,11,9,13,15,5,15,6,10,10,5,14,15,12,7,4,5,11,4,6,9,5,6,1,1,2,1,2,1 49 | 57ef,0,0,1,3,2,5,1,2,7,f,6,3,12,9,4,14,4,6,12,9,12,7,11,-7,16,8,13,6,7,6,10,7,6,3,1,5,4,3,0,0 50 | cc45,0,0,1,2,3,4,5,7,5,4,10,5,12,12,5,4,7,9,18,16,16,10,15,15,10,4,3,7,5,9,4,6,2,4,1,4,2,2,2,1 51 | 9184,0,1,2,1,1,3,5,3,6,3,10,10,11,10,13,10,13,6,6,14,5,4,5,5,9,4,12,7,7,4,7,9,3,3,6,3,4,1,2,0 52 | 84be,0,1,2,2,3,5,2,4,5,6,8,3,5,4,3,15,15,12,16,7,20,15,12,8,9,6,12,5,8,3,8,5,4,1,3,2,1,3,1,0 53 | 0af0,0,0,0,2,4,4,5,3,3,3,10,4,4,4,14,11,15,13,10,14,11,17,9,11,11,7,10,12,10,10,10,8,7,5,2,2,4,1,2,1 54 | bf77,0,0,2,1,1,4,4,7,2,9,4,10,12,7,6,6,11,12,9,15,15,6,6,13,5,12,9,6,4,7,7,6,5,4,1,4,2,2,2,1 55 | c56c,0,1,2,1,1,4,5,4,4,5,9,7,10,3,13,13,8,9,17,16,16,15,12,13,5,12,10,9,11,9,4,5,5,2,2,5,1,0,0,1 56 | 7d0c,0,0,1,3,2,3,6,4,5,7,2,4,11,11,3,8,8,16,5,13,16,5,8,8,6,9,1010,9,3,3,5,3,5,4,5,3,3,0,1 57 | c736,0,1,1,2,2,5,1,7,4,2,5,5,4,6,6,4,16,11,14,16,14,14,8,17,4,14,13,7,6,3,7,7,5,6,3,4,2,2,1,1 58 | c5c8,0,1,1,1,4,1a,6,4,6,3,6,5,6,4,14,13,13,9,12,19,9,10,15,10,9,10,10,7,5,6,8,6,6,4,3,5,2,1,1,1 59 | 050a,0,0,0,1,4,5,6,3,8,7,9,10,8,6,5,12,15,5,10,5,8,13,18,'17',14,9,13,4,10,11,10,8,8,6,5,5,2,0,2,0 60 | 3995,0,0,1,0,3,2,5,4,8,2,9,3,3,10,12,9,14,11,13,8,6,18,11,9,13,11,8,5,5,2,8,5,3,5,4,1,3,1,1,0 61 | 62 | 63 | -------------------------------------------------------------------------------- /data/inflammation-04.csv: -------------------------------------------------------------------------------- 1 | PatientID,Day1,Day2,Day3,Day4,Day5,Day6,Day7,Day8,Day9,Day10,Day11,Day12,Day13,Day14,Day15,Day16,Day17,Day18,Day19,Day20,Day21,Day22,Day23,Day24,Day25,Day26,Day27,Day28,Day29,Day30,Day31,Day32,Day33,Day34,Day35,Day36,Day37,Day38,Day39,Day40,Sex,Age,Group 2 | 669f,0,0,1,3,1,2,4,7,8,3,3,3,10,5,7,4,7,7,12,18,6,13,11,11,7,7,4,6,8,8,4,4,5,7,3,4,2,3,0,0,F,76,G3 3 | 
2edf,0,1,2,1,2,1,3,2,2,6,10,11,5,9,4,4,7,16,8,6,18,4,12,5,12,7,11,5,11,3,3,5,4,4,5,5,1,1,0,1,F,42,G3 4 | 0355,0,1,1,3,3,2,6,2,5,9,5,7,4,5,4,15,5,11,9,10,19,14,12,17,7,12,11,7,4,2,10,5,4,2,2,3,2,2,1,1,F,59,G3 5 | 5968,0,0,2,0,4,2,2,1,6,7,10,7,9,13,8,8,15,10,10,7,17,4,4,7,6,15,6,4,9,11,3,5,6,3,3,4,2,3,2,1,M,25,G1 6 | c760,0,1,1,3,3,1,3,5,2,4,4,7,6,5,3,10,8,10,6,17,9,14,9,7,13,9,12,6,7,7,9,6,3,2,2,4,2,0,1,1,M,60,G2 7 | 6b51,0,0,1,2,2,4,2,1,6,4,7,6,6,9,9,15,4,16,18,12,12,5,18,9,5,3,10,3,12,7,8,4,7,3,5,4,4,3,2,1,F,65,G3 8 | dbaf,0,0,2,2,4,2,2,5,5,8,6,5,11,9,4,13,5,12,10,6,9,17,15,8,9,3,13,7,8,2,8,8,4,2,3,5,4,1,1,1,M,38,G3 9 | b3b7,0,0,1,2,3,1,2,3,5,3,7,8,8,5,10,9,15,11,18,19,20,8,5,13,15,10,6,10,6,7,4,9,3,5,2,5,3,2,2,1,M,39,G1 10 | 3995,0,0,0,3,1,5,6,5,5,8,2,4,11,12,10,11,9,10,17,11,6,16,12,6,8,14,6,13,10,11,4,6,4,7,6,3,2,1,0,0,F,33,G1 11 | d6ff,0,1,1,2,1,3,5,3,5,8,6,8,12,5,13,6,13,8,16,8,18,15,16,14,12,7,3,8,9,11,2,5,4,5,1,4,1,2,0,0,F,26,G3 12 | 2d58,0,1,0,0,4,3,3,5,5,4,5,8,7,10,13,3,7,13,15,18,8,15,15,16,11,14,12,4,10,10,4,3,4,5,5,3,3,2,2,1,M,66,G3 13 | a1d4,0,1,0,0,3,4,2,7,8,5,2,8,11,5,5,8,14,11,6,11,9,16,18,6,12,5,4,3,5,7,8,3,5,4,5,5,4,0,1,1,F,60,G3 14 | 71e9,0,0,2,1,4,3,6,4,6,7,9,9,3,11,6,12,4,17,13,15,13,12,8,7,4,7,12,9,5,6,5,4,7,3,5,4,2,3,0,1,M,36,G3 15 | 65c1,0,0,0,0,1,3,1,6,6,5,5,6,3,6,13,3,10,13,9,16,15,9,11,4,6,4,11,11,12,3,5,8,7,4,6,4,1,3,0,0,M,59,G3 16 | 1edd,0,1,2,1,1,1,4,1,5,2,3,3,10,7,13,5,7,17,6,9,12,13,10,4,12,4,6,7,6,10,8,2,5,1,3,4,2,0,2,0,F,42,G2 17 | 277b,0,1,1,0,1,2,4,3,6,4,7,5,5,7,5,10,7,8,18,17,9,8,12,11,11,11,14,6,11,2,10,9,5,6,5,3,4,2,2,0,F,40,G2 18 | fe0e,0,0,0,0,2,3,6,5,7,4,3,2,10,7,9,11,12,5,12,9,13,19,14,17,5,13,8,11,5,10,9,8,7,5,3,1,4,0,2,1,M,43,G1 19 | 66d3,0,0,0,1,2,1,4,3,6,7,4,2,12,6,12,4,14,7,8,14,13,19,6,9,12,6,4,13,6,7,2,3,6,5,4,2,3,0,1,0,F,44,G1 20 | 3ff3,0,0,2,1,2,5,4,2,7,8,4,7,11,9,8,11,15,17,11,12,7,12,7,6,7,4,13,5,7,6,6,9,2,1,1,2,2,0,1,0,F,55,G2 21 | 
4102,0,1,2,0,1,4,3,2,2,7,3,3,12,13,11,13,6,5,9,16,9,19,16,11,8,9,14,12,11,9,6,6,6,1,1,2,4,3,1,1,M,25,G3 22 | 12c9,0,1,1,3,1,4,4,1,8,2,2,3,12,12,10,15,13,6,5,5,18,19,9,6,11,12,7,6,3,6,3,2,4,3,1,5,4,2,2,0,M,59,G3 23 | 5b04,0,0,2,3,2,3,2,6,3,8,7,4,6,6,9,5,12,12,8,5,12,10,16,7,14,12,5,4,6,9,8,5,6,6,1,4,3,0,2,0,F,79,G1 24 | 1fef,0,0,0,3,4,5,1,7,7,8,2,5,12,4,10,14,5,5,17,13,16,15,13,6,12,9,10,3,3,7,4,4,8,2,6,5,1,0,1,0,F,64,G1 25 | 01c0,0,1,1,1,1,3,3,2,6,3,9,7,8,8,4,13,7,14,11,15,14,13,5,13,7,14,9,10,5,11,5,3,5,1,1,4,4,1,2,0,F,79,G2 26 | 57b5,0,1,1,1,2,3,5,3,6,3,7,10,3,8,12,4,12,9,15,5,17,16,5,10,10,15,7,5,3,11,5,5,6,1,1,1,1,0,2,1,F,35,G2 27 | 226c,0,0,2,1,3,3,2,7,4,4,3,8,12,9,12,9,5,16,8,17,7,11,14,7,13,11,7,12,12,7,8,5,7,2,2,4,1,1,1,0,M,49,G1 28 | c653,0,0,1,2,4,2,2,3,5,7,10,5,5,12,3,13,4,13,7,15,9,12,18,14,16,12,3,11,3,2,7,4,8,2,2,1,3,0,1,1,F,58,G2 29 | 94fd,0,0,1,1,1,5,1,5,2,2,4,10,4,8,14,6,15,6,12,15,15,13,7,17,4,5,11,4,8,7,9,4,5,3,2,5,4,3,2,1,M,80,G1 30 | ebf2,0,0,2,2,3,4,6,3,7,6,4,5,8,4,7,7,6,11,12,19,20,18,9,5,4,7,14,8,4,3,7,7,8,3,5,4,1,3,1,0,F,71,G3 31 | fc73,0,0,0,1,4,4,6,3,8,6,4,10,12,3,3,6,8,7,17,16,14,15,17,4,14,13,4,4,12,11,6,9,5,5,2,5,2,1,0,1,F,33,G3 32 | d4a0,0,1,1,0,3,2,4,6,8,6,2,3,11,3,14,14,12,8,8,16,13,7,6,9,15,7,6,4,10,8,10,4,2,6,5,5,2,3,2,1,M,63,G3 33 | a9f2,0,0,2,3,3,4,5,3,6,7,10,5,10,13,14,3,8,10,9,9,19,15,15,6,8,8,11,5,5,7,3,6,6,4,5,2,2,3,0,0,M,27,G3 34 | dc22,0,1,2,2,2,3,6,6,6,7,6,3,11,12,13,15,15,10,14,11,11,8,6,12,10,5,12,7,7,11,5,8,5,2,5,5,2,0,2,1,M,67,G3 35 | a6e7,0,0,2,1,3,5,6,7,5,8,9,3,12,10,12,4,12,9,13,10,10,6,10,11,4,15,13,7,3,4,2,9,7,2,4,2,1,2,1,1,M,58,G3 36 | 3fb2,0,0,1,2,4,1,5,5,2,3,4,8,8,12,5,15,9,17,7,19,14,18,12,17,14,4,13,13,8,11,5,6,6,2,3,5,2,1,1,1,M,79,G1 37 | 11cc,0,0,0,3,1,3,6,4,3,4,8,3,4,8,3,11,5,7,10,5,15,9,16,17,16,3,8,9,8,3,3,9,5,1,6,5,4,2,2,0,M,25,G3 38 | c9f5,0,1,2,2,2,5,5,1,4,6,3,6,5,9,6,7,4,7,16,7,16,13,9,16,12,6,7,9,10,3,6,4,5,4,6,3,4,3,2,1,M,66,G1 39 | 
a73f,0,1,1,2,3,1,5,1,2,2,5,7,6,6,5,10,6,7,17,13,15,16,17,14,4,4,10,10,10,11,9,9,5,4,4,2,1,0,1,0,F,77,G1 40 | dab2,0,1,0,3,2,4,1,1,5,9,10,7,12,10,9,15,12,13,13,6,19,9,10,6,13,5,13,6,7,2,5,5,2,1,1,1,1,3,0,1,F,41,G2 41 | 65a1,0,1,1,3,1,1,5,5,3,7,2,2,3,12,4,6,8,15,16,16,15,4,14,5,13,10,7,10,6,3,2,3,6,3,3,5,4,3,2,1,M,57,G1 42 | 8bcb,0,0,0,2,2,1,3,4,5,5,6,5,5,12,13,5,7,5,11,15,18,7,9,10,14,12,11,9,10,3,2,9,6,2,2,5,3,0,0,1,M,24,G2 43 | 4004,0,0,1,3,3,1,2,1,8,9,2,8,10,3,8,6,10,13,11,17,19,6,4,11,6,12,7,5,5,4,4,8,2,6,6,4,2,2,0,0,M,34,G3 44 | c2af,0,1,1,3,4,5,2,1,3,7,9,6,10,5,8,15,11,12,15,6,12,16,6,4,14,3,12,9,6,11,5,8,5,5,6,1,2,1,2,0,F,65,G3 45 | 8037,0,0,1,3,1,4,3,6,7,8,5,7,11,3,6,11,6,10,6,19,18,14,6,10,7,9,8,5,8,3,10,2,5,1,5,4,2,1,0,1,M,53,G2 46 | cb49,0,1,1,3,3,4,4,6,3,4,9,9,7,6,8,15,12,15,6,11,6,18,5,14,15,12,9,8,3,6,10,6,8,7,2,5,4,3,1,1,F,63,G2 47 | 2b4b,0,1,2,2,4,3,1,4,8,9,5,10,10,3,4,6,7,11,16,6,14,9,11,10,10,7,10,8,8,4,5,8,4,4,5,2,4,1,1,0,F,60,G3 48 | 80a8,0,0,2,3,4,5,4,6,2,9,7,4,9,10,8,11,16,12,15,17,19,10,18,13,15,11,8,4,7,11,6,7,6,5,1,3,1,0,0,0,M,24,G2 49 | ac50,0,1,1,3,1,4,6,2,8,2,10,3,11,9,13,15,5,15,6,10,10,5,14,15,12,7,4,5,11,4,6,9,5,6,1,1,2,1,2,1,M,68,G1 50 | 57ef,0,0,1,3,2,5,1,2,7,6,6,3,12,9,4,14,4,6,12,9,12,7,11,7,16,8,13,6,7,6,10,7,6,3,1,5,4,3,0,0,F,75,G1 51 | cc45,0,0,1,2,3,4,5,7,5,4,10,5,12,12,5,4,7,9,18,16,16,10,15,15,10,4,3,7,5,9,4,6,2,4,1,4,2,2,2,1,F,37,G2 52 | 9184,0,1,2,1,1,3,5,3,6,3,10,10,11,10,13,10,13,6,6,14,5,4,5,5,9,4,12,7,7,4,7,9,3,3,6,3,4,1,2,0,F,37,G1 53 | 84be,0,1,2,2,3,5,2,4,5,6,8,3,5,4,3,15,15,12,16,7,20,15,12,8,9,6,12,5,8,3,8,5,4,1,3,2,1,3,1,0,M,27,G3 54 | 0af0,0,0,0,2,4,4,5,3,3,3,10,4,4,4,14,11,15,13,10,14,11,17,9,11,11,7,10,12,10,10,10,8,7,5,2,2,4,1,2,1,F,68,G1 55 | bf77,0,0,2,1,1,4,4,7,2,9,4,10,12,7,6,6,11,12,9,15,15,6,6,13,5,12,9,6,4,7,7,6,5,4,1,4,2,2,2,1,F,52,G3 56 | c56c,0,1,2,1,1,4,5,4,4,5,9,7,10,3,13,13,8,9,17,16,16,15,12,13,5,12,10,9,11,9,4,5,5,2,2,5,1,0,0,1,M,48,G1 57 | 
7d0c,0,0,1,3,2,3,6,4,5,7,2,4,11,11,3,8,8,16,5,13,16,5,8,8,6,9,10,10,9,3,3,5,3,5,4,5,3,3,0,1,F,40,G1 58 | c736,0,1,1,2,2,5,1,7,4,2,5,5,4,6,6,4,16,11,14,16,14,14,8,17,4,14,13,7,6,3,7,7,5,6,3,4,2,2,1,1,M,30,G1 59 | c5c8,0,1,1,1,4,1,6,4,6,3,6,5,6,4,14,13,13,9,12,19,9,10,15,10,9,10,10,7,5,6,8,6,6,4,3,5,2,1,1,1,M,77,G3 60 | 050a,0,0,0,1,4,5,6,3,8,7,9,10,8,6,5,12,15,5,10,5,8,13,18,17,14,9,13,4,10,11,10,8,8,6,5,5,2,0,2,0,M,49,G1 61 | a085,0,0,1,0,3,2,5,4,8,2,9,3,3,10,12,9,14,11,13,8,6,18,11,9,13,11,8,5,5,2,8,5,3,5,4,1,3,1,1,0,F,29,G3 62 | -------------------------------------------------------------------------------- /scikit-learn/01.4 Review of Scikit-learn API.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# A recap on Scikit-learn's estimator interface" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "slideshow": { 14 | "slide_type": "subslide" 15 | } 16 | }, 17 | "source": [ 18 | "Scikit-learn strives to have a uniform interface across all methods. Given a scikit-learn *estimator*\n", 19 | "object named `model`, the following methods are available (not all for each model)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": { 25 | "slideshow": { 26 | "slide_type": "subslide" 27 | } 28 | }, 29 | "source": [ 30 | "- Available in **all Estimators**\n", 31 | " + `model.fit()` : fit training data. For supervised learning applications,\n", 32 | " this accepts two arguments: the data `X` and the labels `y` (e.g. `model.fit(X, y)`).\n", 33 | " For unsupervised learning applications, ``fit`` takes only a single argument,\n", 34 | " the data `X` (e.g. `model.fit(X)`)." 
35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "slideshow": { 41 | "slide_type": "subslide" 42 | } 43 | }, 44 | "source": [ 45 | "- Available in **supervised estimators**\n", 46 | " + `model.predict()` : given a trained model, predict the label of a new set of data.\n", 47 | " This method accepts one argument, the new data `X_new` (e.g. `model.predict(X_new)`),\n", 48 | " and returns the learned label for each object in the array.\n", 49 | " + `model.predict_proba()` : For classification problems, some estimators also provide\n", 50 | " this method, which returns the probability that a new observation has each categorical label.\n", 51 | " In this case, the label with the highest probability is returned by `model.predict()`.\n", 52 | " " 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": { 58 | "slideshow": { 59 | "slide_type": "subslide" 60 | } 61 | }, 62 | "source": [ 63 | "- Available in **supervised estimators** (cont.)\n", 64 | " \n", 65 | " + `model.decision_function()` : For classification problems, some estimators provide an uncertainty estimate that is not a probability. For binary classification, a decision_function >= 0 means the positive class will be predicted, while < 0 means the negative class.\n", 66 | " + `model.score()` : for classification or regression problems, most (all?) estimators implement\n", 67 | " a score method. Scores are between 0 and 1, with a larger score indicating a better fit." 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": { 73 | "slideshow": { 74 | "slide_type": "subslide" 75 | } 76 | }, 77 | "source": [ 78 | "- Available in **supervised estimators** (cont.)\n", 79 | "\n", 80 | " + `model.transform()` : For feature selection algorithms, this will reduce the dataset to the selected features. For some classification and regression models such as some linear models and random forests, this method reduces the dataset to the most informative features. 
These classification and regression models can therefore also be used as feature selection methods." 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": { 86 | "slideshow": { 87 | "slide_type": "subslide" 88 | } 89 | }, 90 | "source": [ 91 | "- Available in **unsupervised estimators**\n", 92 | " + `model.transform()` : given an unsupervised model, transform new data into the new basis.\n", 93 | " This also accepts one argument `X_new`, and returns the new representation of the data based\n", 94 | " on the unsupervised model.\n", 95 | " + `model.fit_transform()` : some estimators implement this method,\n", 96 | " which more efficiently performs a fit and a transform on the same input data." 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": { 102 | "slideshow": { 103 | "slide_type": "subslide" 104 | } 105 | }, 106 | "source": [ 107 | "- Available in **unsupervised estimators** (cont.)\n", 108 | "\n", 109 | " + `model.predict()` : for clustering algorithms, the predict method will produce cluster labels for new data points. Not all clustering methods have this functionality.\n", 110 | " + `model.predict_proba()` : Gaussian mixture models (GMMs) provide the probability for each point to be generated by a given mixture component.\n", 111 | " + `model.score()` : Density models like KDE and GMMs provide the likelihood of the data under the model." 
112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": { 117 | "slideshow": { 118 | "slide_type": "subslide" 119 | } 120 | }, 121 | "source": [ 122 | "Apart from ``fit``, the two most important functions are arguably ``predict``, which produces a target variable (a ``y``), and ``transform``, which produces a new representation of the data (an ``X``).\n", 123 | "The following table shows which function applies to which class of models:\n", 124 | "\n" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "\n", 132 | "\n", 133 | "\n", 134 | "\n", 135 | "\n", 136 | "\n", 137 | "\n", 138 | "
``model.predict````model.transform``
ClassificationPreprocessing
RegressionDimensionality Reduction
ClusteringFeature Extraction
 Feature selection
\n", 139 | "\n", 140 | "\n" 141 | ] 142 | } 143 | ], 144 | "metadata": { 145 | "kernelspec": { 146 | "display_name": "Python 3 (ipykernel)", 147 | "language": "python", 148 | "name": "python3" 149 | }, 150 | "language_info": { 151 | "codemirror_mode": { 152 | "name": "ipython", 153 | "version": 3 154 | }, 155 | "file_extension": ".py", 156 | "mimetype": "text/x-python", 157 | "name": "python", 158 | "nbconvert_exporter": "python", 159 | "pygments_lexer": "ipython3", 160 | "version": "3.9.7" 161 | } 162 | }, 163 | "nbformat": 4, 164 | "nbformat_minor": 4 165 | } 166 | -------------------------------------------------------------------------------- /working_notebook_dataclasses.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "688b4b2b-9a80-42ee-9235-322e718a686f", 6 | "metadata": {}, 7 | "source": [ 8 | "## Classes, Data Classes and Abstractions" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "bf607701-26d6-4bd2-bfb2-6047395194cb", 14 | "metadata": {}, 15 | "source": [ 16 | "### Recap Data Problem" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "1c091974-113c-4bfc-a260-ca0dff020734", 22 | "metadata": {}, 23 | "source": [ 24 | "$\\rightarrow$ _Adapted from_ : [**Software Carpentries: Programming with Python**]()\n", 25 | "\n", 26 | "## Arthritis Inflammation\n", 27 | "We are studying **inflammation in patients** who have been given a new treatment for arthritis.\n", 28 | "\n", 29 | "There are `60` patients, who had their inflammation levels recorded for `40` days.\n", 30 | "We want to analyze these recordings to study the effect of the new arthritis treatment.\n", 31 | "\n", 32 | "To see how the treatment is affecting the patients in general, we would like to:\n", 33 | "\n", 34 | "1. Process the file to extract data for each patient;\n", 35 | "2. Calculate some statistics on each patient;\n", 36 | " - e.g. 
average inflammation over the `40` days (or `min`, `max`, and so on)\n", 37 | " - e.g. average statistics per week (we will assume `40` days account for `5` weeks)\n", 38 | " - `...` (open to ideas)\n", 39 | "3. Calculate some statistics on the dataset.\n", 40 | " - e.g. min and max inflammation registered overall in the clinical study;\n", 41 | " - e.g. the average inflammation per day across all patients.\n", 42 | " - `...` (open to ideas)\n", 43 | "\n", 44 | "\n", 45 | "![3-step flowchart shows inflammation data records for patients moving to the Analysis step\n", 46 | "where a heat map of provided data is generated moving to the Conclusion step that asks the\n", 47 | "question, How does the medication affect patients?](\n", 48 | "https://raw.githubusercontent.com/swcarpentry/python-novice-inflammation/gh-pages/fig/lesson-overview.svg \"Lesson Overview\")\n", 49 | "\n", 50 | "\n", 51 | "### Data Format\n", 52 | "\n", 53 | "The data sets are stored in\n", 54 | "[comma-separated values] (CSV) format:\n", 55 | "\n", 56 | "- each row holds information for a single patient,\n", 57 | "- columns represent successive days.\n", 58 | "\n", 59 | "The first three rows of our first file look like this:\n", 60 | "~~~\n", 61 | "0,0,1,3,1,2,4,7,8,3,3,3,10,5,7,4,7,7,12,18,6,13,11,11,7,7,4,6,8,8,4,4,5,7,3,4,2,3,0,0\n", 62 | "0,1,2,1,2,1,3,2,2,6,10,11,5,9,4,4,7,16,8,6,18,4,12,5,12,7,11,5,11,3,3,5,4,4,5,5,1,1,0,1\n", 63 | "0,1,1,3,3,2,6,2,5,9,5,7,4,5,4,15,5,11,9,10,19,14,12,17,7,12,11,7,4,2,10,5,4,2,2,3,2,2,1,1\n", 64 | "~~~\n", 65 | "\n", 66 | "Each number represents the number of inflammation bouts that a particular patient experienced on a\n", 67 | "given day.\n", 68 | "\n", 69 | "For example, value \"6\" at row 3 column 7 of the data set above means that the third\n", 70 | "patient was experiencing inflammation six times on the seventh day of the clinical study.\n", 71 | "\n", 72 | "Our **task** is to gather as much information as possible from the dataset, and to report back 
to colleagues to foster future discussions." 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "dee40b14-d3a8-41a5-8320-9e0b3b2e8c69", 78 | "metadata": {}, 79 | "source": [ 80 | "---" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "b502cf0d-9b3b-48f0-83c0-23541c7eff0a", 86 | "metadata": {}, 87 | "source": [ 88 | "Dealing with more _realistic cases_ ❌" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "id": "56bbe9dd-4051-436e-b7a2-45a02087c35c", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "id": "4a5513e7-28d6-43c7-8b91-0254b6950f48", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "728e2bef-d3e5-45bc-87c4-15873d68eefb", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "id": "4886c01e-06dd-4276-a84d-c80d454754d0", 118 | "metadata": {}, 119 | "source": [ 120 | "Putting our helmets on (_with some testing_) ⛑" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "id": "12bbe647-95c3-45d7-9518-e15cca04f53d", 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "id": "89cdaeee-3ae4-4276-a62c-394f210f8cf5", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "id": "b96221bf-b216-4125-9a4e-33d0463c5204", 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "id": "28748dd5-7169-45b9-9281-99b1c93f4fdc", 150 | "metadata": {}, 151 | "source": [ 152 | "Now it's time to rethink about our Data (Abstractions): let's define our own **new type**!" 
153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "id": "4d2b3aed-2b98-4a09-b566-78c9be28e1e1", 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "id": "9b50b042-976b-411f-9348-4571df8e4b1d", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "id": "51038696-c461-4408-9d94-295d63423e1d", 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "id": "34a5b1c8-94b6-4e1c-96dd-311ea4c0c3d2", 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "id": "73712490-22ea-4283-8451-065787eef7fb", 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [] 194 | } 195 | ], 196 | "metadata": { 197 | "kernelspec": { 198 | "display_name": "Python 3 (ipykernel)", 199 | "language": "python", 200 | "name": "python3" 201 | }, 202 | "language_info": { 203 | "codemirror_mode": { 204 | "name": "ipython", 205 | "version": 3 206 | }, 207 | "file_extension": ".py", 208 | "mimetype": "text/x-python", 209 | "name": "python", 210 | "nbconvert_exporter": "python", 211 | "pygments_lexer": "ipython3", 212 | "version": "3.9.7" 213 | } 214 | }, 215 | "nbformat": 4, 216 | "nbformat_minor": 5 217 | } 218 | -------------------------------------------------------------------------------- /Exercises.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "7c60b622-9fca-4b68-8ec5-25dcd8e3d505", 6 | "metadata": {}, 7 | "source": [ 8 | "# Exercises Lect 1" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "ca8293d2-9b49-4432-bcf7-6307e2a8d829", 14 | "metadata": {}, 15 | "source": [ 16 | "## Before we 
start\n", 17 | "\n", 18 | "Please **do not modify this notebook** as it will be updated with additional exercises later on.\n", 19 | "\n", 20 | "So, _before starting to work on the exercises_, please **duplicate** this notebook into a new one and rename it as `exercises_work.ipynb`" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "bcee788b-4438-47de-840e-653c78333d88", 26 | "metadata": {}, 27 | "source": [ 28 | "### Familiarise with Notebook" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "2b248fb6-c978-4d8e-9bab-1137be828796", 34 | "metadata": {}, 35 | "source": [ 36 | "Look at what we did in [Markdown Notebook Showcase](./Markdown_notebook_showcase.ipynb) as a refresher." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "id": "65e23547-9b04-43d6-81db-7fe5164278b2", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# make the current cell a markdown cell" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "id": "f97fe553-41de-4140-af5e-a08f8cede65f", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "# Now try to turn this into a Markdown cell using only Keyboard shortcut" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "id": "f5a87c56-c232-4fc4-973d-7731d2dac81d", 62 | "metadata": {}, 63 | "source": [ 64 | "---" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "f9f6567f-65ac-4a91-bcb8-37e19e980755", 70 | "metadata": {}, 71 | "source": [ 72 | "**Add** a cell below this one, and type `well done` in it, as a Python comment\n", 73 | "\n", 74 | "Note: The cell will be by default a _code cell_, therefore you won't need to change its type.\n", 75 | "\n", 76 | "A comment in Python is marked using the `#` (`sharp`) character." 
77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "id": "46f2607a-3dab-469e-9ee8-ab6fc64a95db", 82 | "metadata": {}, 83 | "source": [ 84 | "---" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "f7cadc47-7b24-4de0-9fae-8617c337a7a2", 90 | "metadata": {}, 91 | "source": [ 92 | "Please turn the following cell into a Markdown Cell, so that the code will be displayed in `monospace`, with `syntax highlighting`" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "6cd139ca-0415-4028-a754-2f13f65fabae", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# this is a code cell\n", 103 | "for i in range(10):\n", 104 | " print(i)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "91e4ccca-24ab-4b5b-a6b4-6ec04e3ecadc", 110 | "metadata": {}, 111 | "source": [ 112 | "---" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "id": "6d207600-ccdf-4996-ad8c-35e733c1cf42", 118 | "metadata": {}, 119 | "source": [ 120 | "Make this text in BOLD" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "id": "58eaf9ff-1f7a-49f8-8808-3ad1fe410975", 126 | "metadata": {}, 127 | "source": [ 128 | "Turn this text in ITALIC" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "id": "eb6f24fd-d102-41c5-8639-d7633a486f43", 134 | "metadata": {}, 135 | "source": [ 136 | "---" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "id": "d3b60cec-3cb5-413b-8e4c-df043708744b", 142 | "metadata": {}, 143 | "source": [ 144 | "Create a list of items in Markdown (it can be either an _ordered_ or _unordered_ list. 
You choose !-)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "id": "2bdefff8-59f2-4f65-871a-a461e887e62d", 150 | "metadata": {}, 151 | "source": [] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "id": "e6155f59-1bfe-49ea-96ea-0519c60ea966", 156 | "metadata": {}, 157 | "source": [ 158 | "---" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "id": "a77fa826-1686-406a-974a-31cd06b335b8", 164 | "metadata": {}, 165 | "source": [ 166 | "## Basic Python" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "id": "fa99ab1e-7a29-495e-8568-4944f8924395", 172 | "metadata": {}, 173 | "source": [ 174 | "Generate a sequence of the first `20` numbers, iterate them, and `print` them on screen. \n", 175 | "\n", 176 | "Hint: Use the `range(20)` function to generate the numbers" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "id": "b6d8cc08-bedc-4833-9ad4-6ebae4dddf40", 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "id": "5d2c7728-1bd3-42c0-ba60-b5a954e31d6a", 190 | "metadata": {}, 191 | "source": [ 192 | "Create a list of `pets` (containing as many elements as you want) and `print` each item in the list" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "id": "70bbb2cb-767b-4468-91a0-bc6ee5468a40", 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "id": "da8b7269-e8b0-459f-9d28-bdeb64fb4f6d", 206 | "metadata": {}, 207 | "source": [ 208 | "Given the following code:\n", 209 | " \n", 210 | "```python \n", 211 | "\n", 212 | "stationery = [\"pen\", \"pencil\", \"glue\", \"notebook\", \"clipper\"]\n", 213 | "\n", 214 | "```\n", 215 | "\n", 216 | "1. Print the length of the list (use `len`)\n", 217 | "2. 
Iterate the list, and print each item, along with its corresponding length" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "id": "b211718c-2554-4733-a037-af0f7382c9c7", 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "id": "d4149660-ad54-403f-bb18-3762c399c498", 231 | "metadata": {}, 232 | "source": [ 233 | "Repeat the previous exercise but this time with this list instead\n", 234 | "\n", 235 | "```python\n", 236 | "\n", 237 | "data_info = [\"time\", \"space\", \"age\", \"height\", \"\", \" \", 3, 1.8]\n", 238 | "\n", 239 | "```" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "id": "6cfa4891-1205-44bd-9bcb-da39656fe970", 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "id": "cc4437c0-eff8-4f25-bd94-8f8665f481de", 253 | "metadata": {}, 254 | "source": [ 255 | "Copy the following code into the cell below, and FIX it to make it running without errors:\n", 256 | "\n", 257 | "```python\n", 258 | "\n", 259 | "dummy_data = \"\"\n", 260 | "\n", 261 | "with open(\"dummy.csv\") as f:\n", 262 | " \n", 263 | "for l in dummy_file:\n", 264 | " line = line.strip()\n", 265 | " \n", 266 | "dummy_data.append(line)\n", 267 | "\n", 268 | "print(dummy)\n", 269 | "```" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "id": "1b625548-bfde-4402-a1e1-9b89158c1e53", 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [] 279 | } 280 | ], 281 | "metadata": { 282 | "kernelspec": { 283 | "display_name": "Python 3 (ipykernel)", 284 | "language": "python", 285 | "name": "python3" 286 | }, 287 | "language_info": { 288 | "codemirror_mode": { 289 | "name": "ipython", 290 | "version": 3 291 | }, 292 | "file_extension": ".py", 293 | "mimetype": "text/x-python", 294 | "name": "python", 295 | "nbconvert_exporter": "python", 296 | 
"pygments_lexer": "ipython3", 297 | "version": "3.9.7" 298 | } 299 | }, 300 | "nbformat": 4, 301 | "nbformat_minor": 5 302 | } 303 | -------------------------------------------------------------------------------- /scikit-learn/01.1 Introduction to Machine Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Introduction to Machine Learning in Python" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "subslide" 19 | } 20 | }, 21 | "source": [ 22 | "## What is Machine Learning?" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "slideshow": { 29 | "slide_type": "subslide" 30 | } 31 | }, 32 | "source": [ 33 | "### Machine Learning at a Glance" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "slideshow": { 40 | "slide_type": "-" 41 | } 42 | }, 43 | "source": [ 44 | "" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "slideshow": { 51 | "slide_type": "subslide" 52 | } 53 | }, 54 | "source": [ 55 | "> Machine learning teaches machines how to carry out tasks by themselves. It is that simple.\n", 56 | "The complexity comes with the details.\n", 57 | "\n", 58 | "_W. Richert & L.P. Coelho, 2013\n", 59 | "Building Machine Learning Systems with Python_" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": { 65 | "slideshow": { 66 | "slide_type": "subslide" 67 | } 68 | }, 69 | "source": [ 70 | "Machine learning is the process to automatically **extract knowledge** from data, usually with the goal of making **predictions** on _new_, _unseen_ data. \n", 71 | "\n", 72 | "A classical example is a _spam filter_, for which the user keeps labeling incoming mails as either spam or not spam. 
\n", 73 | "\n", 74 | "A machine learning algorithm then \"learns\" what distinguishes spam from normal emails, and can predict for new emails whether they are spam or not." 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": { 80 | "slideshow": { 81 | "slide_type": "subslide" 82 | } 83 | }, 84 | "source": [ 85 | "Central to machine learning is the concept of **making decisions automatically** from data, **without the user specifying explicit rules** for how these decisions should be made.\n", 86 | "\n", 87 | "For the case of emails, the user doesn't provide a list of words or characteristics that make an email spam. Instead, the user provides examples of spam and non-spam emails." 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "slideshow": { 94 | "slide_type": "subslide" 95 | } 96 | }, 97 | "source": [ 98 | "The second central concept is **generalization**. \n", 99 | "\n", 100 | "The goal of a machine learning algorithm is to predict on new, previously unseen data. We are not interested in marking as spam or not an email that a human has already labeled. Instead, we want to make the user's life easier by making an automatic decision for new incoming mail." 
101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": { 106 | "slideshow": { 107 | "slide_type": "subslide" 108 | } 109 | }, 110 | "source": [ 111 | "There are two kinds of machine learning we will talk about in these notebooks: \n", 112 | "\n", 113 | "* **Supervised learning;** \n", 114 | "* **Unsupervised learning.**" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": { 120 | "slideshow": { 121 | "slide_type": "slide" 122 | } 123 | }, 124 | "source": [ 125 | "### Supervised Learning\n", 126 | "\n", 127 | "In **Supervised Learning**, we have a dataset consisting of both input features and a desired output, such as in the spam / no-spam example.\n", 128 | "\n", 129 | "The task is to construct a model (or program) which is able to predict the desired output of an unseen object\n", 130 | "given the set of features." 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": { 136 | "slideshow": { 137 | "slide_type": "subslide" 138 | } 139 | }, 140 | "source": [ 141 | "" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": { 147 | "slideshow": { 148 | "slide_type": "subslide" 149 | } 150 | }, 151 | "source": [ 152 | "Supervised learning is further broken down into two categories, **classification** and **regression**.\n", 153 | "\n", 154 | "In classification, the label is discrete (a.k.a. _Categorical Data_, i.e. _Integer values_), such as \"spam\" or \"no spam\". \n", 155 | "\n", 156 | "In other words, it provides a clear-cut distinction between categories. \n", 157 | "\n", 158 | "In regression, the label is continuous, i.e. _Float output_." 
159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": { 164 | "slideshow": { 165 | "slide_type": "subslide" 166 | } 167 | }, 168 | "source": [ 169 | "### Other Examples\n", 170 | "\n", 171 | "Some more complicated examples are:\n", 172 | "\n", 173 | "- given a multicolor image of an object through a telescope, determine\n", 174 | " whether that object is a star, a quasar, or a galaxy.\n", 175 | "- given a photograph of a person, identify the person in the photo.\n", 176 | "- given a list of movies a person has watched and their personal rating\n", 177 | " of the movie, recommend a list of movies they would like.\n", 178 | "- given a person's age, education and position, infer their salary" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": { 184 | "slideshow": { 185 | "slide_type": "subslide" 186 | } 187 | }, 188 | "source": [ 189 | "What these tasks have in common is that there are one or more unknown\n", 190 | "quantities associated with the object which need to be determined from other\n", 191 | "observed quantities." 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": { 197 | "slideshow": { 198 | "slide_type": "subslide" 199 | } 200 | }, 201 | "source": [ 202 | "### For example\n", 203 | "\n", 204 | "* In astronomy, the task of determining whether an object is a star, a galaxy, or a quasar is a **classification problem**: the label is from three distinct categories. \n", 205 | "\n", 206 | "* On the other hand, we might wish to estimate the age of an object based on such observations: this would be a **regression problem**, because the label (age) is a continuous quantity." 
207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": { 212 | "slideshow": { 213 | "slide_type": "slide" 214 | } 215 | }, 216 | "source": [ 217 | "### Unsupervised Learning\n", 218 | "\n", 219 | "In **Unsupervised Learning** there is no desired output associated with the data.\n", 220 | "\n", 221 | "Instead, we are interested in extracting some form of knowledge or model from the given data.\n", 222 | "\n", 223 | "In a sense, you can think of unsupervised learning as a means of discovering labels from the data itself.\n", 224 | "\n", 225 | "Unsupervised learning comprises tasks such as *dimensionality reduction*, *clustering*, and\n", 226 | "*density estimation*. " 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": { 232 | "slideshow": { 233 | "slide_type": "subslide" 234 | } 235 | }, 236 | "source": [ 237 | "" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": { 243 | "slideshow": { 244 | "slide_type": "fragment" 245 | } 246 | }, 247 | "source": [ 248 | "Unsupervised learning is often harder to understand and to evaluate.\n", 249 | "\n", 250 | "Sometimes the two may even be combined: e.g. Unsupervised learning can be used to find useful\n", 251 | "features in heterogeneous data, and then these features can be used within a supervised\n", 252 | "framework. We call these methods semi-supervised learning." 
253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": { 258 | "slideshow": { 259 | "slide_type": "subslide" 260 | } 261 | }, 262 | "source": [ 263 | "### Other Examples\n", 264 | "\n", 265 | "Some more involved unsupervised learning problems are:\n", 266 | "\n", 267 | "- given detailed observations of distant galaxies, determine which features or combinations of\n", 268 | " features summarize best the information.\n", 269 | "- given a mixture of two sound sources (for example, a person talking over some music),\n", 270 | " separate the two (this is called the [blind source separation](http://en.wikipedia.org/wiki/Blind_signal_separation) problem).\n", 271 | "- given a large collection of news articles, find recurring topics inside these articles.\n", 272 | "- given a collection of images, cluster similar images together (for example to group them when visualizing a collection)." 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": { 278 | "slideshow": { 279 | "slide_type": "slide" 280 | } 281 | }, 282 | "source": [ 283 | "# Scikit-learn at a Glance" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "" 291 | ] 292 | } 293 | ], 294 | "metadata": { 295 | "celltoolbar": "Slideshow", 296 | "kernelspec": { 297 | "display_name": "Python 3 (ipykernel)", 298 | "language": "python", 299 | "name": "python3" 300 | }, 301 | "language_info": { 302 | "codemirror_mode": { 303 | "name": "ipython", 304 | "version": 3 305 | }, 306 | "file_extension": ".py", 307 | "mimetype": "text/x-python", 308 | "name": "python", 309 | "nbconvert_exporter": "python", 310 | "pygments_lexer": "ipython3", 311 | "version": "3.9.7" 312 | } 313 | }, 314 | "nbformat": 4, 315 | "nbformat_minor": 4 316 | } 317 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Programming for Data 
Science @ FBK Academy 2 | 3 | This is a programming tutorial aimed at researchers and practitioners with (potentially) no prior programming experience, as well as at those with previous programming skills. 4 | 5 | We will walk through several concepts to give you an introduction to some of the principal programming concepts like _conditionals_, _functions_, _iterations_, as well as more specialised topics like _classes_, _objects_ and what's sometimes called _defensive programming_. 6 | 7 | _If all these terms sound like [gibberish](https://en.wikipedia.org/wiki/Gibberish) to you, don't worry!_ 8 | 9 | I'll try to show everything with simple code examples: no long and complicated explanations with fancy words. At the end of this tutorial, I am sure you will master all these concepts like a _pro_ 🙌 10 | 11 | ### Why Programming for _Data Science_ ? 12 | 13 | In this tutorial we will be using **Python 3**. Python is nowadays considered as **"the"** language of choice for Data Science. 14 | There are indeed many reasons for that, and many articles have been written on the subject. 15 | This [article](https://analyticsindiamag.com/heres-why-python-continues-to-be-the-language-of-choice-for-data-scientists/) looks like a good and clear example on the topic. 16 | 17 | #### A Few notes before we start 18 | 19 | * `Q:` _Yes, ok.. but.. is this a tutorial on Data Science?_ 20 | * `A:` **No**. This is a **tutorial** on programming with Python. The _perspective_ though is that of a _wanna-be_ data scientist. 21 | 22 | * `Q:` _Cool... but.. is this a tutorial on the Python Language ?_ 23 | * `A:` **Ehm, No again. Sorry**. 24 | We will focus on programming concepts _using_ Python as a language. Most of the concepts you will learn are shared by most other languages (_just the syntax will be different, ed._) _Although_ there is a section in the Lecture materials named `Python Extras` that is **specifically** focusing on features of the Python language. 
You could read it, if interested :) 25 | 26 | #### Here is what I have in mind for this course (HTH) 27 | 28 | ![lecture sketch](./images/lectures_sketch.png) 29 | 30 | _I do hope that this (very simple) mind-map look-alike clarifies a bit the perspective I chose when I thought about this course._ 31 | 32 | `tl;dr` We will dive into programming focusing on two main aspects: the _Algorithmic_ perspective, that is "what are the steps we need to implement to solve a specific problem", and the _Data Structure_ perspective, that is "what is the data structure that would simplify as much as possible our algorithm implementation". These two perspectives led in the past decades to two completely different approaches to programming: **Procedural** vs **Object-Oriented**, respectively. 33 | 34 | Python allows for _a lot_ of flexibility, and this flexibility will be our [swiss-knife](https://www.ctotech.io/blog/python/why-python3-insights-in-the-swiss-army-knife-of-coding/). In fact, Python supports _multiple programming paradigms_ at once (i.e. _imperative_, _OOP_, _functional_ [1]), and we will be (seemingly) shifting our focus on those as we go along with the lecture materials. 35 | 36 | --- 37 | 38 | `1`: functional programming only for the intrepid programmers among you :) See this [video](https://www.youtube.com/watch?v=ThS4juptJjQ) 39 | 40 | ## Outline of the Course (at a glance) 41 | 42 | The course is organised into **six parts** (lectures), with the following learning path in mind: 43 | 44 | 1. Python Programming (part 1): Introduction to Python Main Data structures, and functions; 45 | 46 | 2. Python Programming (part 2): Advanced Data Structures and Object-Oriented Programming 47 | 48 | 3. Scientific Python Programming and Data Processing: Numerical Processing with `NumPy` & Data Processing with `Pandas` 49 | 50 | 4. Advanced Data Objects and Data Plotting: Introduction to `dataclasses` and `matplotlib` / `bokeh` for interactive plotting 51 | 52 | 5. 
Introduction to Scikit-Learn (`sklearn`) and Machine Learning Modules 53 | 54 | 6. Project-Team work on real-case Data Science scenarios 55 | 56 | 57 | ## Lecture Materials 58 | 59 | _Note: The following section is currently incomplete, and will be updated throughout the rest of the course._ 60 | 61 | ### Introductory Readings (`intro` folder) 62 | 63 | This part will introduce you to the concept of computer programming, and to the 64 | very basics of the Python programming language: 65 | 66 | 1. [The Way of the Program](intro/1-the-way-of-the-program.html) 67 | 2. [Variables, Statements and Expressions](intro/2-variables-statements-expressions.html) 68 | 3. [Introduction to Functions](intro/3-intro-functions.html) 69 | 4. [Setting up an editor](intro/4-setup-editor.html) 70 | 5. [Conditional Statements](basics/5-conditionals.html) 71 | 72 | Regardless of whether you have programmed before, using Python or not, I would suggest taking a look at this introductory section anyway. There is always time to **skip**, based on your learning pace. 73 | 74 | **Alternatively**, a good starting point would be this online course: [Intro to Python by Microsoft](https://docs.microsoft.com/en-us/learn/modules/intro-to-python/) 75 | 76 | ### Programming with Python (`programming_with_python` folder) 77 | This section contains the materials for the main topics that will be covered in our first two lectures. These are (in no specific order): 78 | 79 | 1. [Pythonic Functions](programming_with_python/functions.ipynb) 80 | 2. [Collections and Sequences](programming_with_python/collections.ipynb) 81 | 3. [Dictionaries](programming_with_python/dictionaries.ipynb) 82 | 4. [Iterators, Generators, Comprehensions](programming_with_python/iterators.ipynb) 83 | 5. [Classes and OOP](programming_with_python/classes.ipynb) 84 | 6. 
[Errors and Exceptions](programming_with_python/exceptions.ipynb) 85 | 86 | #### Python Extras (`python_extras` folder) 87 | This section contains some extra notebooks you could go through to read more about some specific aspects of the Python programming language. 88 | 89 | **Note:** This is the only part of the course specifically focused on _how Python does things_. 90 | 91 | 1. [Modules](python_extras/modules.ipynb) 92 | 2. [Python Data Model](python_extras/data_model.ipynb) 93 | 3. [Functions as Objects](python_extras/functions-objects.ipynb) 94 | 4. [Magic Methods](python_extras/magic.ipynb) 95 | 5. [Pythonic Coding Style](python_extras/pep8.ipynb) 96 | 97 | ## Instructions 98 | 99 | ### 1. Get the material 100 | 101 | **Option A**: `Clone` (or `fork`) the Repository using `git` (**Recommended**) 102 | 103 | ⚠️ Note: It is necessary to have `Git` installed in order to proceed. If you don't have `git` installed on your system, you need to **install git** first. 104 | [Instructions to Install Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) 105 | 106 | 107 | 108 | 💡 Please also consider looking at [**Git CheatSheet**](https://education.github.com/git-cheat-sheet-education.pdf) 109 | 110 | 111 | 112 | To acquire the lecture material it is highly recommended to use `git` to **clone** the current repository. Since the repository will be constantly updated after each lesson, using the git method will allow for an easier synchronisation of the material. 113 | 114 | To clone the repository, type the following command in the terminal prompt: 115 | ```bash 116 | git clone https://github.com/leriomaggio/python-data-science.git 117 | ``` 118 | 119 | ⚠️ Note for **Windows users**: Once `git` is installed, please make sure to run the _Git Terminal_ (or _Git Prompt_) 120 | 121 | Once completed, this will create a new folder named `python-data-science` (_presumably in your Home folder_). 122 | 123 | Well done! 
Now please bear with me for another few minutes, and follow the instructions reported below 🙏 124 | 125 | Please now proceed to **2. Setting up your Environment** 126 | 127 | 128 | 129 | 130 | 131 | 132 | **Option B**: Downloading the material in a ZIP archive from GitHub (**Not Recommended**) 133 | 134 | It is indeed possible to download the whole material from GitHub as a ZIP archive. 135 | Link [here](https://github.com/leriomaggio/python-data-science#:~:text=with%20GitHub%20Desktop-,Download%20ZIP,-Latest%20commit) 136 | 137 | However, this method is **not recommended** as it will be required to download the archive every time there is an update (which means at the end of each lesson)! 138 | 139 | ### 2. Setting up your Environment 140 | 141 | We will be using [**Jupyter lab**](https://jupyter.org) as our _interactive programming environment_ for this course. 142 | 143 | This will have the great advantage of lowering the barriers in setting up the environment, and installing specialised tools. If you're not familiar with _jupyter notebooks_, no worries: we will get the time to familiarise ourselves with the environment as the first thing we will do! 144 | 145 | Meanwhile, it is necessary to set up the Python **Virtual Environment** to run the code contained in this repository _smoothly_ and with no _headaches_. 146 | 147 | If you don't know what a Python [virtual environment](https://docs.python.org/3/tutorial/venv.html) is, think of it as a sandbox Python installation you can have on your machine that is fully controllable and fully independent from any other Python environment you may have on your local machine. 148 | 149 | To execute the notebooks in this repository, a few packages are required, but installing them in your Conda environment is super easy. 150 | 151 | **Step 1:** Download [Anaconda Python Distribution](https://www.anaconda.com/products/individual). 
152 | 153 | Note for **Windows Users**: More information is available in the [official documentation](https://docs.anaconda.com/anaconda/user-guide/getting-started/#open-nav-win) 154 | 155 | **Step 2:** Set up the virtual environment: 156 | 157 | Open a Terminal (or **Anaconda Prompt** on Windows) and **move** to the `python-data-science` folder, i.e. the main folder of this repository. 158 | 159 | ```bash 160 | cd python-data-science 161 | ``` 162 | 163 | Now create the conda environment by typing the following command: 164 | 165 | ```bash 166 | conda env create -f pyds.yml 167 | ``` 168 | This will install a **new** Conda environment named `pyds`. 169 | 170 | **Step 2.1**: If you'd like to double check that the creation of the environment completed successfully, you can type: 171 | 172 | ```bash 173 | conda info --envs 174 | ``` 175 | This will list all the virtual environments conda can find within your installation. `pyds` should appear in the list as well. 176 | 177 | **Step 3:** Activate the environment: 178 | 179 | Once the environment is set, we need to **activate** it in order to use it. 180 | 181 | ```bash 182 | conda activate pyds 183 | ``` 184 | 185 | 🎉 You should now be ready to go! 186 | 187 | The last bit is to run your `jupyter lab` server, and open the notebooks: 188 | 189 | ```bash 190 | jupyter lab 191 | ``` 192 | 193 | #### (Alternative) Setup Environment via `pip` 194 | 195 | The repository also includes a `requirements.txt` file that can be used to install all the required packages using `pip`: 196 | 197 | ```bash 198 | pip install -r requirements.txt 199 | ``` 200 | 201 | However, this is recommended only if (A) it is not possible to install Anaconda on your machine; (B) the setup of the Anaconda environment is unsuccessful. 
202 | 203 | ⚠️ **In either case**, it is important that the version of Python used is `Python >=3.9` 204 | 205 | ## Colophon 206 | 207 | **Author**: Valerio Maggio ([`@leriomaggio`](https://twitter.com/leriomaggio)), Senior Research Associate, University of Bristol. 208 | 209 | All the **Code** material is distributed under the terms of the GNU GPLv3 License. See the [LICENSE](./LICENSE) file for additional details. 210 | 211 | All the instructional materials in this repository are free to use, and are made available under the [Creative Commons Attribution 212 | license](https://creativecommons.org/licenses/by/4.0/). The following is a human-readable summary of (and not a substitute for) the [full legal text of the CC BY 4.0 213 | license](https://creativecommons.org/licenses/by/4.0/legalcode). 214 | 215 | You are free: 216 | 217 | * to **Share**---copy and redistribute the material in any medium or format 218 | * to **Adapt**---remix, transform, and build upon the material 219 | 220 | for any purpose, even commercially. 221 | 222 | The licensor cannot revoke these freedoms as long as you follow the 223 | license terms. 224 | 225 | Under the following terms: 226 | 227 | * **Attribution**---You must give appropriate credit (mentioning that 228 | your work is derived from work that is Copyright © Software 229 | Carpentry and, where practical, linking to 230 | http://software-carpentry.org/), provide a [link to the 231 | license](https://creativecommons.org/licenses/by/4.0/), and indicate if changes were made. You may do 232 | so in any reasonable manner, but not in any way that suggests the 233 | licensor endorses you or your use. 234 | 235 | **No additional restrictions**---You may not apply legal terms or 236 | technological measures that legally restrict others from doing 237 | anything the license permits. 
238 | 239 | ### Contacts 240 | 241 | For any questions or doubts, feel free to open an [issue](https://github.com/leriomaggio/python-data-science/issues) in the repository, or drop me an email @ `valerio.maggio_at_bristol.ac.uk` -------------------------------------------------------------------------------- /python_extras/pep8.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Coding Style: PEP 8\n", 12 | "\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": { 18 | "slideshow": { 19 | "slide_type": "fragment" 20 | } 21 | }, 22 | "source": [ 23 | "You are now starting to write Python programs that have a little substance. Your programs are growing a little longer, and there is a little more structure to your programs. This is a really good time to consider your overall style in writing code." 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": { 29 | "slideshow": { 30 | "slide_type": "subslide" 31 | } 32 | }, 33 | "source": [ 34 | "Why do we need style conventions?\n", 35 | "---\n", 36 | "\n", 37 | "The people who originally developed Python made some of their decisions based on the realization that code is read much more often than it is written. The original developers paid as much attention to making the language easy to read, as well as easy to write. " 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": { 43 | "slideshow": { 44 | "slide_type": "fragment" 45 | } 46 | }, 47 | "source": [ 48 | "Python has gained a lot of respect as a programming language because of how readable the code is. You have seen that Python uses indentation to show which lines in a program are grouped together. This makes the structure of your code visible to anyone who reads it." 
49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": { 54 | "slideshow": { 55 | "slide_type": "fragment" 56 | } 57 | }, 58 | "source": [ 59 | "There are, however, some styling decisions we get to make as programmers that can make our programs more readable for ourselves, and for others." 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": { 65 | "slideshow": { 66 | "slide_type": "fragment" 67 | } 68 | }, 69 | "source": [ 70 | "There are several audiences to consider when you think about how readable your code is." 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": { 76 | "slideshow": { 77 | "slide_type": "subslide" 78 | } 79 | }, 80 | "source": [ 81 | "#### Yourself, 6 months from now\n", 82 | "\n", 83 | "- You know what you are thinking when you write code for the first time. But how easily will you recall what you were thinking when you come back to that code tomorrow, next week, or six months from now? We want our code to be as easy to read as possible six months from now, so we can jump back into our projects when we want to." 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "slideshow": { 90 | "slide_type": "fragment" 91 | } 92 | }, 93 | "source": [ 94 | "#### Other programmers you might want to collaborate with\n", 95 | "\n", 96 | "- Every significant project is the result of collaboration these days. If you stay in programming, you will work with others in jobs and in open source projects. If you write readable code with good comments, people will be happy to work with you in any setting." 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": { 102 | "slideshow": { 103 | "slide_type": "fragment" 104 | } 105 | }, 106 | "source": [ 107 | "#### Potential employers\n", 108 | "\n", 109 | "- Most people who hire programmers will ask to see some code you have written, and they will probably ask you to write some code during your interview.
If you are in the habit of writing code that is easy to read, you will do well in these situations." 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": { 115 | "slideshow": { 116 | "slide_type": "subslide" 117 | } 118 | }, 119 | "source": [ 120 | "What is a PEP?\n", 121 | "---\n", 122 | "\n", 123 | "A PEP is a *Python Enhancement Proposal*. " 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": { 129 | "slideshow": { 130 | "slide_type": "fragment" 131 | } 132 | }, 133 | "source": [ 134 | "One of the earliest PEPs was a collection of guidelines for writing code that is easy to read. It was PEP 8, the [Style Guide for Python Code](http://www.python.org/dev/peps/pep-0008/). " 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "slideshow": { 141 | "slide_type": "fragment" 142 | } 143 | }, 144 | "source": [ 145 | "When people want to suggest changes to the actual Python language, someone drafts a Python Enhancement Proposal. \n", 146 | "\n", 147 | "There is a lot in there that won't make sense to you for some time yet, but there are some suggestions that you should be aware of from the beginning. Starting with good style habits now will help you write clean code from the beginning, which will help you make sense of your code as well." 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": { 153 | "slideshow": { 154 | "slide_type": "subslide" 155 | } 156 | }, 157 | "source": [ 158 | "Basic Python style guidelines\n", 159 | "---\n" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": { 165 | "slideshow": { 166 | "slide_type": "fragment" 167 | } 168 | }, 169 | "source": [ 170 | "#### Indentation\n", 171 | "- Use 4 spaces for indentation. This is enough space to give your code some visual structure, while leaving room for multiple indentation levels. 
There are configuration settings in most editors to automatically convert tabs to 4 spaces, and it is a good idea to check this setting. On Geany, this is under Edit>Preferences>Editor>Indentation; set Width to 4, and Type to *Spaces*." 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": { 177 | "slideshow": { 178 | "slide_type": "fragment" 179 | } 180 | }, 181 | "source": [ 182 | "#### Line Length\n", 183 | "- Use up to 79 characters per line of code, and 72 characters for comments. This is a style guideline that some people adhere to and others completely ignore. This used to relate to a limit on the display size of most monitors. Now almost every monitor is capable of showing much more than 80 characters per line. But we often work in terminals, which are not always high-resolution. We also like to have multiple code files open, next to each other. It turns out this is still a useful guideline to follow in most cases. There is a secondary guideline of sticking to 99 characters per line, if you want longer lines." 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": { 189 | "slideshow": { 190 | "slide_type": "fragment" 191 | } 192 | }, 193 | "source": [ 194 | "Many editors have a setting that shows a vertical line that helps you keep your lines to a certain length. " 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": { 200 | "slideshow": { 201 | "slide_type": "subslide" 202 | } 203 | }, 204 | "source": [ 205 | "#### Blank Lines\n", 206 | "- Use single blank lines to break up your code into meaningful blocks. You have seen this in many examples so far. You can use two blank lines in longer programs, but don't get excessive with blank lines." 
207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": { 212 | "slideshow": { 213 | "slide_type": "fragment" 214 | } 215 | }, 216 | "source": [ 217 | "#### Comments\n", 218 | "- Use a single space after the pound sign at the beginning of a line. If you are writing more than one paragraph, use an empty line with a pound sign between paragraphs." 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": { 224 | "slideshow": { 225 | "slide_type": "fragment" 226 | } 227 | }, 228 | "source": [ 229 | "#### Naming Variables\n", 230 | "- Name variables and program files using only lowercase letters, underscores, and numbers. Python won't complain or throw errors if you use capitalization, but you will mislead other programmers if you use capital letters in variables at this point." 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": { 236 | "slideshow": { 237 | "slide_type": "subslide" 238 | } 239 | }, 240 | "source": [ 241 | "That's all for now. We will go over more style guidelines as we introduce more complicated programming structures. If you follow these guidelines for now, you will be well on your way to writing readable code that professionals will respect." 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "Import statements\n", 249 | "---\n", 250 | "PEP8 provides clear guidelines about [where](http://www.python.org/dev/peps/pep-0008/#imports) import statements should appear in a file. 
The names of modules should be on separate lines:" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 1, 256 | "metadata": { 257 | "collapsed": true 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "# this\n", 262 | "import sys\n", 263 | "import os\n", 264 | "\n", 265 | "# not this\n", 266 | "import sys, os" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "The names of classes can be on the same line:" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 3, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "from rocket import Rocket, Shuttle" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "Imports should always be placed at the top of the file. When you are working on a longer program, you might have an idea that requires an import statement. You might write the import statement in the code block you are working on to see if your idea works. If you end up keeping the import, make sure you move the import statement to the top of the file. This lets anyone who works with your program see what modules are required for the program to work.\n", 290 | "\n", 291 | "Your import statements should be in a predictable order:\n", 292 | "\n", 293 | "- The first imports should be standard Python modules such as *sys*, *os*, and *math*.\n", 294 | "- The second set of imports should be \"third-party\" libraries. These are libraries that are written and maintained by independent programmers, which are not part of the official Python language." 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "Module and class names\n", 302 | "---\n", 303 | "Modules should have [short, lowercase names](http://www.python.org/dev/peps/pep-0008/#package-and-module-names). 
If you want to have a space in the module name, use an underscore.\n", 304 | "\n", 305 | "[Class names](http://www.python.org/dev/peps/pep-0008/#class-names) should be written in *CamelCase*, with an initial capital letter and any new word capitalized. There should be no underscores in your class names.\n", 306 | "\n", 307 | "This convention helps distinguish modules from classes, for example when you are writing import statements." 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": { 313 | "slideshow": { 314 | "slide_type": "slide" 315 | } 316 | }, 317 | "source": [ 318 | "Exercises\n", 319 | "---\n", 320 | "#### Skim PEP 8\n", 321 | "- If you haven't done so already, skim [PEP 8 - Style Guide for Python Code](http://www.python.org/dev/peps/pep-0008/#block-comments). As you continue to learn Python, go back and look at this every once in a while. I can't stress enough that many good programmers will take you much more seriously from the start if you are following community-wide conventions as you write your code.\n", 322 | "\n", 323 | "#### Implement PEP 8\n", 324 | "- Take three of your longest programs, and add the extension *\\_pep8.py* to the filename of each program. Revise your code so that it meets the styling conventions listed above." 
325 | ] 326 | } 327 | ], 328 | "metadata": { 329 | "kernelspec": { 330 | "display_name": "Python 3", 331 | "language": "python", 332 | "name": "python3" 333 | }, 334 | "language_info": { 335 | "codemirror_mode": { 336 | "name": "ipython", 337 | "version": 3 338 | }, 339 | "file_extension": ".py", 340 | "mimetype": "text/x-python", 341 | "name": "python", 342 | "nbconvert_exporter": "python", 343 | "pygments_lexer": "ipython3", 344 | "version": "3.7.6" 345 | } 346 | }, 347 | "nbformat": 4, 348 | "nbformat_minor": 2 349 | } 350 | -------------------------------------------------------------------------------- /scikit-learn/02.2 Supervised Learning - Regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false, 8 | "jupyter": { 9 | "outputs_hidden": false 10 | }, 11 | "slideshow": { 12 | "slide_type": "skip" 13 | } 14 | }, 15 | "outputs": [], 16 | "source": [ 17 | "%matplotlib inline\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "import numpy as np" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": { 25 | "slideshow": { 26 | "slide_type": "slide" 27 | } 28 | }, 29 | "source": [ 30 | "Regression\n", 31 | "==========\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": { 37 | "slideshow": { 38 | "slide_type": "subslide" 39 | } 40 | }, 41 | "source": [ 42 | "In regression we try to predict a continuous output variable. \n", 43 | "\n", 44 | "This can be most easily visualized in one dimension.\n", 45 | "\n", 46 | "We will start with a very simple toy example. 
We will create a dataset out of a sine curve with some noise:" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": false, 54 | "jupyter": { 55 | "outputs_hidden": false 56 | }, 57 | "slideshow": { 58 | "slide_type": "subslide" 59 | } 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "x = np.linspace(-3, 3, 100)\n", 64 | "print(x)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "collapsed": false, 72 | "jupyter": { 73 | "outputs_hidden": false 74 | }, 75 | "slideshow": { 76 | "slide_type": "subslide" 77 | } 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "rng = np.random.RandomState(42)\n", 82 | "y = np.sin(4 * x) + x + rng.uniform(size=len(x))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": false, 90 | "jupyter": { 91 | "outputs_hidden": false 92 | }, 93 | "slideshow": { 94 | "slide_type": "fragment" 95 | } 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "plt.plot(x, y, 'o')" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": { 105 | "slideshow": { 106 | "slide_type": "subslide" 107 | } 108 | }, 109 | "source": [ 110 | "Linear Regression\n", 111 | "=================\n", 112 | "One of the simplest models is, again, a linear one, which simply tries to predict the data as lying on a line. One way to find such a line is LinearRegression (also known as ordinary least squares).\n", 113 | "The interface for LinearRegression is exactly the same as for the classifiers before, only that ``y`` now contains float values, instead of classes."
114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": { 119 | "slideshow": { 120 | "slide_type": "subslide" 121 | } 122 | }, 123 | "source": [ 124 | "To apply a scikit-learn model, we need to make X be a 2d-array:" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": { 131 | "collapsed": false, 132 | "jupyter": { 133 | "outputs_hidden": false 134 | } 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "print(x.shape)\n", 139 | "X = x[:, np.newaxis]\n", 140 | "print(X.shape)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": { 146 | "slideshow": { 147 | "slide_type": "subslide" 148 | } 149 | }, 150 | "source": [ 151 | "We split our data into a training and a test set again:" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "collapsed": false, 159 | "jupyter": { 160 | "outputs_hidden": false 161 | } 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "from sklearn.model_selection import train_test_split\n", 166 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": { 172 | "slideshow": { 173 | "slide_type": "subslide" 174 | } 175 | }, 176 | "source": [ 177 | "Then we can build our regression model:" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "collapsed": false, 185 | "jupyter": { 186 | "outputs_hidden": false 187 | } 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "from sklearn.linear_model import LinearRegression\n", 192 | "regressor = LinearRegression()\n", 193 | "regressor.fit(X_train, y_train)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": { 199 | "slideshow": { 200 | "slide_type": "subslide" 201 | } 202 | }, 203 | "source": [ 204 | "And predict.
First let us try the training set:" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "tags": [] 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "y_pred_train = regressor.predict(X_train)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": { 222 | "collapsed": false, 223 | "jupyter": { 224 | "outputs_hidden": false 225 | }, 226 | "slideshow": { 227 | "slide_type": "subslide" 228 | } 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "plt.plot(X_train, y_train, 'o', label=\"data\")\n", 233 | "plt.plot(X_train, y_pred_train, 'o', label=\"prediction\")\n", 234 | "plt.legend(loc='best')" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": { 240 | "slideshow": { 241 | "slide_type": "fragment" 242 | } 243 | }, 244 | "source": [ 245 | "The line is able to capture the general slope of the data, but not many details." 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": { 251 | "slideshow": { 252 | "slide_type": "subslide" 253 | } 254 | }, 255 | "source": [ 256 | "Let's try the test set:" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "tags": [] 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "y_pred_test = regressor.predict(X_test)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": { 274 | "collapsed": false, 275 | "jupyter": { 276 | "outputs_hidden": false 277 | }, 278 | "slideshow": { 279 | "slide_type": "subslide" 280 | } 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "plt.plot(X_test, y_test, 'o', label=\"data\")\n", 285 | "plt.plot(X_test, y_pred_test, 'o', label=\"prediction\")\n", 286 | "plt.legend(loc='best')" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": { 292 | "slideshow": { 293 | "slide_type": "subslide" 294 | } 295 | }, 296 | "source": [ 297 | "Again, 
scikit-learn provides an easy way to evaluate the prediction quantitatively using the ``score`` method. \n", 298 | "\n", 299 | "For regression tasks, this is the **R2 score**:\n", 300 | "\n", 301 | "$$ R^2 = 1 - \\frac{\\sum_{i} (y_i - f_i)^2}{\\sum_i (y_i - \\hat{y})^2} \\text{ where } \\hat{y} = \\frac{1}{n}\\sum_i^n y_i$$\n", 302 | "\n", 303 | "Another popular way would be the **mean squared error**." 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": { 310 | "collapsed": false, 311 | "jupyter": { 312 | "outputs_hidden": false 313 | }, 314 | "slideshow": { 315 | "slide_type": "fragment" 316 | } 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "regressor.score(X_test, y_test)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": { 326 | "slideshow": { 327 | "slide_type": "slide" 328 | } 329 | }, 330 | "source": [ 331 | "KNeighborsRegression\n", 332 | "=======================\n" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": { 338 | "slideshow": { 339 | "slide_type": "subslide" 340 | } 341 | }, 342 | "source": [ 343 | "As for classification, we can also use a neighbor based method for regression. \n", 344 | "\n", 345 | "We can simply take the output of the nearest point, or we could average several nearest points. \n", 346 | "\n", 347 | "This method is less popular for regression than for classification, but still a good baseline." 
348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": { 354 | "collapsed": false, 355 | "jupyter": { 356 | "outputs_hidden": false 357 | }, 358 | "slideshow": { 359 | "slide_type": "subslide" 360 | } 361 | }, 362 | "outputs": [], 363 | "source": [ 364 | "from sklearn.neighbors import KNeighborsRegressor\n", 365 | "kneighbor_regression = KNeighborsRegressor(n_neighbors=1)\n", 366 | "kneighbor_regression.fit(X_train, y_train)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "slideshow": { 373 | "slide_type": "fragment" 374 | } 375 | }, 376 | "source": [ 377 | "Again, let us look at the behavior on training and test set:" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": { 384 | "collapsed": false, 385 | "jupyter": { 386 | "outputs_hidden": false 387 | }, 388 | "slideshow": { 389 | "slide_type": "subslide" 390 | } 391 | }, 392 | "outputs": [], 393 | "source": [ 394 | "y_pred_train = kneighbor_regression.predict(X_train)\n", 395 | "\n", 396 | "plt.plot(X_train, y_train, 'o', label=\"data\")\n", 397 | "plt.plot(X_train, y_pred_train, 'o', label=\"prediction\")\n", 398 | "plt.legend(loc='best')" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "metadata": { 405 | "collapsed": false, 406 | "jupyter": { 407 | "outputs_hidden": false 408 | } 409 | }, 410 | "outputs": [], 411 | "source": [ 412 | "kneighbor_regression.score(X_train, y_train)" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": { 418 | "slideshow": { 419 | "slide_type": "-" 420 | } 421 | }, 422 | "source": [ 423 | "On the training set, we do a perfect job: each point is its own nearest neighbor!" 
424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": { 430 | "collapsed": false, 431 | "jupyter": { 432 | "outputs_hidden": false 433 | }, 434 | "slideshow": { 435 | "slide_type": "subslide" 436 | } 437 | }, 438 | "outputs": [], 439 | "source": [ 440 | "y_pred_test = kneighbor_regression.predict(X_test)\n", 441 | "\n", 442 | "plt.plot(X_test, y_test, 'o', label=\"data\")\n", 443 | "plt.plot(X_test, y_pred_test, 'o', label=\"prediction\")\n", 444 | "plt.legend(loc='best')" 445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "metadata": { 450 | "slideshow": { 451 | "slide_type": "subslide" 452 | } 453 | }, 454 | "source": [ 455 | "On the test set, we also do a better job of capturing the variation, but our estimates look much messier than before.\n", 456 | "Let us look at the R2 score:" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": { 463 | "collapsed": false, 464 | "jupyter": { 465 | "outputs_hidden": false 466 | } 467 | }, 468 | "outputs": [], 469 | "source": [ 470 | "kneighbor_regression.score(X_test, y_test)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": { 476 | "slideshow": { 477 | "slide_type": "fragment" 478 | } 479 | }, 480 | "source": [ 481 | "Much better than before! Here, the linear model was not a good fit for our problem." 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "metadata": {}, 487 | "source": [ 488 | "Exercise\n", 489 | "=========\n", 490 | "Compare the KNeighborsRegressor and LinearRegression on the Boston housing dataset. You can load the dataset using ``sklearn.datasets.load_boston``. (Note: ``load_boston`` was removed in scikit-learn 1.2; with newer versions, use another regression dataset such as ``sklearn.datasets.fetch_california_housing``.)"
491 | ] 492 | } 493 | ], 494 | "metadata": { 495 | "kernelspec": { 496 | "display_name": "Python 3 (ipykernel)", 497 | "language": "python", 498 | "name": "python3" 499 | }, 500 | "language_info": { 501 | "codemirror_mode": { 502 | "name": "ipython", 503 | "version": 3 504 | }, 505 | "file_extension": ".py", 506 | "mimetype": "text/x-python", 507 | "name": "python", 508 | "nbconvert_exporter": "python", 509 | "pygments_lexer": "ipython3", 510 | "version": "3.9.7" 511 | } 512 | }, 513 | "nbformat": 4, 514 | "nbformat_minor": 4 515 | } 516 | -------------------------------------------------------------------------------- /data_abstraction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from data import read_inflammation_data" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "inf04_data = \"inflammation-04.csv\"\n", 19 | "\n", 20 | "dataset = read_inflammation_data(filename=inf04_data)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 3, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "data.Dataset" 32 | ] 33 | }, 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "output_type": "execute_result" 37 | } 38 | ], 39 | "source": [ 40 | "type(dataset)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 4, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/plain": [ 51 | "data.Patient" 52 | ] 53 | }, 54 | "execution_count": 4, 55 | "metadata": {}, 56 | "output_type": "execute_result" 57 | } 58 | ], 59 | "source": [ 60 | "type(dataset[0])" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 5, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "patient = dataset[1]" 70 | ] 71 | 
}, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 6, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "'F'" 81 | ] 82 | }, 83 | "execution_count": 6, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "patient.sex" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 7, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "G3\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "print(patient.group)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 8, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "F-G3\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "print(patient.stratification_label())" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 10, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "from sklearn import datasets\n", 133 | "\n", 134 | "iris = datasets.load_iris()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 11, 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "sklearn.utils.Bunch" 146 | ] 147 | }, 148 | "execution_count": 11, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "type(iris)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 12, 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | ".. 
_iris_dataset:\n", 167 | "\n", 168 | "Iris plants dataset\n", 169 | "--------------------\n", 170 | "\n", 171 | "**Data Set Characteristics:**\n", 172 | "\n", 173 | " :Number of Instances: 150 (50 in each of three classes)\n", 174 | " :Number of Attributes: 4 numeric, predictive attributes and the class\n", 175 | " :Attribute Information:\n", 176 | " - sepal length in cm\n", 177 | " - sepal width in cm\n", 178 | " - petal length in cm\n", 179 | " - petal width in cm\n", 180 | " - class:\n", 181 | " - Iris-Setosa\n", 182 | " - Iris-Versicolour\n", 183 | " - Iris-Virginica\n", 184 | " \n", 185 | " :Summary Statistics:\n", 186 | "\n", 187 | " ============== ==== ==== ======= ===== ====================\n", 188 | " Min Max Mean SD Class Correlation\n", 189 | " ============== ==== ==== ======= ===== ====================\n", 190 | " sepal length: 4.3 7.9 5.84 0.83 0.7826\n", 191 | " sepal width: 2.0 4.4 3.05 0.43 -0.4194\n", 192 | " petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)\n", 193 | " petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)\n", 194 | " ============== ==== ==== ======= ===== ====================\n", 195 | "\n", 196 | " :Missing Attribute Values: None\n", 197 | " :Class Distribution: 33.3% for each of 3 classes.\n", 198 | " :Creator: R.A. Fisher\n", 199 | " :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n", 200 | " :Date: July, 1988\n", 201 | "\n", 202 | "The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken\n", 203 | "from Fisher's paper. Note that it's the same as in R, but not as in the UCI\n", 204 | "Machine Learning Repository, which has two wrong data points.\n", 205 | "\n", 206 | "This is perhaps the best known database to be found in the\n", 207 | "pattern recognition literature. Fisher's paper is a classic in the field and\n", 208 | "is referenced frequently to this day. (See Duda & Hart, for example.) 
The\n", 209 | "data set contains 3 classes of 50 instances each, where each class refers to a\n", 210 | "type of iris plant. One class is linearly separable from the other 2; the\n", 211 | "latter are NOT linearly separable from each other.\n", 212 | "\n", 213 | ".. topic:: References\n", 214 | "\n", 215 | " - Fisher, R.A. \"The use of multiple measurements in taxonomic problems\"\n", 216 | " Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to\n", 217 | " Mathematical Statistics\" (John Wiley, NY, 1950).\n", 218 | " - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.\n", 219 | " (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.\n", 220 | " - Dasarathy, B.V. (1980) \"Nosing Around the Neighborhood: A New System\n", 221 | " Structure and Classification Rule for Recognition in Partially Exposed\n", 222 | " Environments\". IEEE Transactions on Pattern Analysis and Machine\n", 223 | " Intelligence, Vol. PAMI-2, No. 1, 67-71.\n", 224 | " - Gates, G.W. (1972) \"The Reduced Nearest Neighbor Rule\". IEEE Transactions\n", 225 | " on Information Theory, May 1972, 431-433.\n", 226 | " - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al\"s AUTOCLASS II\n", 227 | " conceptual clustering system finds 3 classes in the data.\n", 228 | " - Many, many more ...\n" 229 | ] 230 | } 231 | ], 232 | "source": [ 233 | "print(iris.DESCR)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 13, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "name": "stdout", 243 | "output_type": "stream", 244 | "text": [ 245 | "[[5.1 3.5 1.4 0.2]\n", 246 | " [4.9 3. 1.4 0.2]\n", 247 | " [4.7 3.2 1.3 0.2]\n", 248 | " [4.6 3.1 1.5 0.2]\n", 249 | " [5. 3.6 1.4 0.2]\n", 250 | " [5.4 3.9 1.7 0.4]\n", 251 | " [4.6 3.4 1.4 0.3]\n", 252 | " [5. 3.4 1.5 0.2]\n", 253 | " [4.4 2.9 1.4 0.2]\n", 254 | " [4.9 3.1 1.5 0.1]\n", 255 | " [5.4 3.7 1.5 0.2]\n", 256 | " [4.8 3.4 1.6 0.2]\n", 257 | " [4.8 3. 
1.4 0.1]\n", 258 | " [4.3 3. 1.1 0.1]\n", 259 | " [5.8 4. 1.2 0.2]\n", 260 | " [5.7 4.4 1.5 0.4]\n", 261 | " [5.4 3.9 1.3 0.4]\n", 262 | " [5.1 3.5 1.4 0.3]\n", 263 | " [5.7 3.8 1.7 0.3]\n", 264 | " [5.1 3.8 1.5 0.3]\n", 265 | " [5.4 3.4 1.7 0.2]\n", 266 | " [5.1 3.7 1.5 0.4]\n", 267 | " [4.6 3.6 1. 0.2]\n", 268 | " [5.1 3.3 1.7 0.5]\n", 269 | " [4.8 3.4 1.9 0.2]\n", 270 | " [5. 3. 1.6 0.2]\n", 271 | " [5. 3.4 1.6 0.4]\n", 272 | " [5.2 3.5 1.5 0.2]\n", 273 | " [5.2 3.4 1.4 0.2]\n", 274 | " [4.7 3.2 1.6 0.2]\n", 275 | " [4.8 3.1 1.6 0.2]\n", 276 | " [5.4 3.4 1.5 0.4]\n", 277 | " [5.2 4.1 1.5 0.1]\n", 278 | " [5.5 4.2 1.4 0.2]\n", 279 | " [4.9 3.1 1.5 0.2]\n", 280 | " [5. 3.2 1.2 0.2]\n", 281 | " [5.5 3.5 1.3 0.2]\n", 282 | " [4.9 3.6 1.4 0.1]\n", 283 | " [4.4 3. 1.3 0.2]\n", 284 | " [5.1 3.4 1.5 0.2]\n", 285 | " [5. 3.5 1.3 0.3]\n", 286 | " [4.5 2.3 1.3 0.3]\n", 287 | " [4.4 3.2 1.3 0.2]\n", 288 | " [5. 3.5 1.6 0.6]\n", 289 | " [5.1 3.8 1.9 0.4]\n", 290 | " [4.8 3. 1.4 0.3]\n", 291 | " [5.1 3.8 1.6 0.2]\n", 292 | " [4.6 3.2 1.4 0.2]\n", 293 | " [5.3 3.7 1.5 0.2]\n", 294 | " [5. 3.3 1.4 0.2]\n", 295 | " [7. 3.2 4.7 1.4]\n", 296 | " [6.4 3.2 4.5 1.5]\n", 297 | " [6.9 3.1 4.9 1.5]\n", 298 | " [5.5 2.3 4. 1.3]\n", 299 | " [6.5 2.8 4.6 1.5]\n", 300 | " [5.7 2.8 4.5 1.3]\n", 301 | " [6.3 3.3 4.7 1.6]\n", 302 | " [4.9 2.4 3.3 1. ]\n", 303 | " [6.6 2.9 4.6 1.3]\n", 304 | " [5.2 2.7 3.9 1.4]\n", 305 | " [5. 2. 3.5 1. ]\n", 306 | " [5.9 3. 4.2 1.5]\n", 307 | " [6. 2.2 4. 1. ]\n", 308 | " [6.1 2.9 4.7 1.4]\n", 309 | " [5.6 2.9 3.6 1.3]\n", 310 | " [6.7 3.1 4.4 1.4]\n", 311 | " [5.6 3. 4.5 1.5]\n", 312 | " [5.8 2.7 4.1 1. ]\n", 313 | " [6.2 2.2 4.5 1.5]\n", 314 | " [5.6 2.5 3.9 1.1]\n", 315 | " [5.9 3.2 4.8 1.8]\n", 316 | " [6.1 2.8 4. 1.3]\n", 317 | " [6.3 2.5 4.9 1.5]\n", 318 | " [6.1 2.8 4.7 1.2]\n", 319 | " [6.4 2.9 4.3 1.3]\n", 320 | " [6.6 3. 4.4 1.4]\n", 321 | " [6.8 2.8 4.8 1.4]\n", 322 | " [6.7 3. 5. 1.7]\n", 323 | " [6. 2.9 4.5 1.5]\n", 324 | " [5.7 2.6 3.5 1. 
]\n", 325 | " [5.5 2.4 3.8 1.1]\n", 326 | " [5.5 2.4 3.7 1. ]\n", 327 | " [5.8 2.7 3.9 1.2]\n", 328 | " [6. 2.7 5.1 1.6]\n", 329 | " [5.4 3. 4.5 1.5]\n", 330 | " [6. 3.4 4.5 1.6]\n", 331 | " [6.7 3.1 4.7 1.5]\n", 332 | " [6.3 2.3 4.4 1.3]\n", 333 | " [5.6 3. 4.1 1.3]\n", 334 | " [5.5 2.5 4. 1.3]\n", 335 | " [5.5 2.6 4.4 1.2]\n", 336 | " [6.1 3. 4.6 1.4]\n", 337 | " [5.8 2.6 4. 1.2]\n", 338 | " [5. 2.3 3.3 1. ]\n", 339 | " [5.6 2.7 4.2 1.3]\n", 340 | " [5.7 3. 4.2 1.2]\n", 341 | " [5.7 2.9 4.2 1.3]\n", 342 | " [6.2 2.9 4.3 1.3]\n", 343 | " [5.1 2.5 3. 1.1]\n", 344 | " [5.7 2.8 4.1 1.3]\n", 345 | " [6.3 3.3 6. 2.5]\n", 346 | " [5.8 2.7 5.1 1.9]\n", 347 | " [7.1 3. 5.9 2.1]\n", 348 | " [6.3 2.9 5.6 1.8]\n", 349 | " [6.5 3. 5.8 2.2]\n", 350 | " [7.6 3. 6.6 2.1]\n", 351 | " [4.9 2.5 4.5 1.7]\n", 352 | " [7.3 2.9 6.3 1.8]\n", 353 | " [6.7 2.5 5.8 1.8]\n", 354 | " [7.2 3.6 6.1 2.5]\n", 355 | " [6.5 3.2 5.1 2. ]\n", 356 | " [6.4 2.7 5.3 1.9]\n", 357 | " [6.8 3. 5.5 2.1]\n", 358 | " [5.7 2.5 5. 2. ]\n", 359 | " [5.8 2.8 5.1 2.4]\n", 360 | " [6.4 3.2 5.3 2.3]\n", 361 | " [6.5 3. 5.5 1.8]\n", 362 | " [7.7 3.8 6.7 2.2]\n", 363 | " [7.7 2.6 6.9 2.3]\n", 364 | " [6. 2.2 5. 1.5]\n", 365 | " [6.9 3.2 5.7 2.3]\n", 366 | " [5.6 2.8 4.9 2. ]\n", 367 | " [7.7 2.8 6.7 2. ]\n", 368 | " [6.3 2.7 4.9 1.8]\n", 369 | " [6.7 3.3 5.7 2.1]\n", 370 | " [7.2 3.2 6. 1.8]\n", 371 | " [6.2 2.8 4.8 1.8]\n", 372 | " [6.1 3. 4.9 1.8]\n", 373 | " [6.4 2.8 5.6 2.1]\n", 374 | " [7.2 3. 5.8 1.6]\n", 375 | " [7.4 2.8 6.1 1.9]\n", 376 | " [7.9 3.8 6.4 2. ]\n", 377 | " [6.4 2.8 5.6 2.2]\n", 378 | " [6.3 2.8 5.1 1.5]\n", 379 | " [6.1 2.6 5.6 1.4]\n", 380 | " [7.7 3. 6.1 2.3]\n", 381 | " [6.3 3.4 5.6 2.4]\n", 382 | " [6.4 3.1 5.5 1.8]\n", 383 | " [6. 3. 4.8 1.8]\n", 384 | " [6.9 3.1 5.4 2.1]\n", 385 | " [6.7 3.1 5.6 2.4]\n", 386 | " [6.9 3.1 5.1 2.3]\n", 387 | " [5.8 2.7 5.1 1.9]\n", 388 | " [6.8 3.2 5.9 2.3]\n", 389 | " [6.7 3.3 5.7 2.5]\n", 390 | " [6.7 3. 5.2 2.3]\n", 391 | " [6.3 2.5 5. 
1.9]\n", 392 | " [6.5 3. 5.2 2. ]\n", 393 | " [6.2 3.4 5.4 2.3]\n", 394 | " [5.9 3. 5.1 1.8]]\n" 395 | ] 396 | } 397 | ], 398 | "source": [ 399 | "print(iris.data)" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [] 408 | } 409 | ], 410 | "metadata": { 411 | "interpreter": { 412 | "hash": "7c1591eedb000bd053220f2fc60ff960636f649b4010d915c615a93c42c7f36c" 413 | }, 414 | "kernelspec": { 415 | "display_name": "Python 3.9.7 64-bit ('pyds': conda)", 416 | "language": "python", 417 | "name": "python3" 418 | }, 419 | "language_info": { 420 | "codemirror_mode": { 421 | "name": "ipython", 422 | "version": 3 423 | }, 424 | "file_extension": ".py", 425 | "mimetype": "text/x-python", 426 | "name": "python", 427 | "nbconvert_exporter": "python", 428 | "pygments_lexer": "ipython3", 429 | "version": "3.9.7" 430 | }, 431 | "orig_nbformat": 4 432 | }, 433 | "nbformat": 4, 434 | "nbformat_minor": 2 435 | } 436 | -------------------------------------------------------------------------------- /working_notebook_lect1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "8ab55536-1c75-4728-8a58-a7a66dc3d7db", 6 | "metadata": {}, 7 | "source": [ 8 | "# Working Notebook\n", 9 | "\n", 10 | "Welcome to the _Programming with Python_ course! We will be using this notebook to go through the lecture materials, as well as to work _together_ on practical examples and exercises." 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "8376b782-cfd1-48d8-91db-1d53565675cf", 16 | "metadata": {}, 17 | "source": [ 18 | "## first thing: let's familiarise with the environment\n", 19 | "\n", 20 | "Let's talk about **Jupyter Notebooks** for a second." 
21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "id": "f53bb072-e218-492a-bef6-e6d78dc9ffdf", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# code cell" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "39a01f33-d622-4c65-a545-ed1b6daebbbe", 36 | "metadata": {}, 37 | "source": [ 38 | "Text Cell" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "f0b90233-2a73-46a4-9717-9a68c365be69", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "id": "060f82c9-e4b8-45e8-b05c-2f969447fb61", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "id": "fd36f8e1-5b40-411f-beb7-5e6a3e3d47e7", 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "id": "05bf6893-81bf-4863-a024-b029dad9614a", 68 | "metadata": {}, 69 | "source": [ 70 | "---" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "id": "8a59ce6b-9cdc-4009-a6c3-f453a8ed00ae", 76 | "metadata": {}, 77 | "source": [ 78 | "$\\rightarrow$ _Adapted from_ : [**Software Carpentries: Programming with Python**]()\n", 79 | "\n", 80 | "## Arthritis Inflammation\n", 81 | "We are studying **inflammation in patients** who have been given a new treatment for arthritis.\n", 82 | "\n", 83 | "There are `60` patients, who had their inflammation levels recorded for `40` days.\n", 84 | "We want to analyze these recordings to study the effect of the new arthritis treatment.\n", 85 | "\n", 86 | "To see how the treatment is affecting the patients in general, we would like to:\n", 87 | "\n", 88 | "1. Process the file to extract data for each patient;\n", 89 | "2. Calculate some statistics on each patient;\n", 90 | " - e.g. average inflammation over the `40` days (or `min`, `max` .. 
and so on)\n", 91 | " - e.g average statistics per week (we will assume `40` days account for `5` weeks)\n", 92 | " - `...` (open to ideas)\n", 93 | "3. Calculate some statistics on the dataset.\n", 94 | " - e.g. min and max inflammation registered overall in the clinical study;\n", 95 | " - e.g. the average inflammation per day across all patients.\n", 96 | " - `...` (open to ideas)\n", 97 | "\n", 98 | "\n", 99 | "![3-step flowchart shows inflammation data records for patients moving to the Analysis step\n", 100 | "where a heat map of provided data is generated moving to the Conclusion step that asks the\n", 101 | "question, How does the medication affect patients?](\n", 102 | "https://raw.githubusercontent.com/swcarpentry/python-novice-inflammation/gh-pages/fig/lesson-overview.svg \"Lesson Overview\")\n", 103 | "\n", 104 | "\n", 105 | "### Data Format\n", 106 | "\n", 107 | "The data sets are stored in\n", 108 | "[comma-separated values] (CSV) format:\n", 109 | "\n", 110 | "- each row holds information for a single patient,\n", 111 | "- columns represent successive days.\n", 112 | "\n", 113 | "The first three rows of our first file look like this:\n", 114 | "~~~\n", 115 | "0,0,1,3,1,2,4,7,8,3,3,3,10,5,7,4,7,7,12,18,6,13,11,11,7,7,4,6,8,8,4,4,5,7,3,4,2,3,0,0\n", 116 | "0,1,2,1,2,1,3,2,2,6,10,11,5,9,4,4,7,16,8,6,18,4,12,5,12,7,11,5,11,3,3,5,4,4,5,5,1,1,0,1\n", 117 | "0,1,1,3,3,2,6,2,5,9,5,7,4,5,4,15,5,11,9,10,19,14,12,17,7,12,11,7,4,2,10,5,4,2,2,3,2,2,1,1\n", 118 | "~~~\n", 119 | "\n", 120 | "Each number represents the number of inflammation bouts that a particular patient experienced on a\n", 121 | "given day.\n", 122 | "\n", 123 | "For example, value \"6\" at row 3 column 7 of the data set above means that the third\n", 124 | "patient was experiencing inflammation six times on the seventh day of the clinical study.\n", 125 | "\n", 126 | "Our **task** is to gather as much information as possible from the dataset, and to report back to colleagues to foster future 
discussions." 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "id": "31f65289-4826-49db-b213-5152e363f9db", 132 | "metadata": {}, 133 | "source": [ 134 | "### Let's make a plan\n", 135 | "\n", 136 | "- Problem description (step by step) in NATURAL LANGUAGE (**strict rule**) - imagine you're explaining this to someone who doesn't know **anything** about programming.\n", 137 | "- What do we need to start\n", 138 | "- Where do we start" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "id": "1741f9bc-95a7-4174-9301-d8a2b7e5bb4d", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# I'll go first - let's create a dummy file to practice named dummy, two rows, ten values" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "id": "6cf6afcb-d86e-47de-b9f8-4a088e95392f", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "id": "0b4c4c53-9451-455a-806b-52561209b644", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "id": "84761636-aa4d-4e4d-b3cf-4f8b6a32bee1", 170 | "metadata": {}, 171 | "source": [ 172 | "Read the file" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "id": "83f71ca3-14ec-4a00-b0e9-88ba1a9e5748", 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "id": "a75846a5-c0f6-49c9-ace5-b1f364e4846b", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "id": "aa16a789-4cca-4ca5-b232-5d67ffedc296", 194 | "metadata": {}, 195 | "source": [ 196 | "How to collect information from data file" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "id": 
"60493a2b-6a43-45d6-b67c-6bfd967211c5", 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "id": "25285a44-dd0c-41a5-a7cc-3e22f74b0a5b", 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "id": "8772ca9b-80a6-4794-8039-654e0294450d", 218 | "metadata": {}, 219 | "source": [ 220 | "Play with what we have so far: iteration" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "id": "38f35ae1-ce44-47f5-9265-2f32ec3bfad1", 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "id": "1e200c2d-49de-4e82-a919-78eef3e68d2b", 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "id": "68e0ce79-78ee-40a6-b8e0-eef3ff3ec819", 242 | "metadata": {}, 243 | "source": [ 244 | "(_fancy word_) **Slicing**" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "id": "b17eb294-bd53-4489-9138-10a84e17bf08", 250 | "metadata": {}, 251 | "source": [ 252 | "![slicing example](https://swcarpentry.github.io/python-novice-inflammation/fig/python-zero-index.svg)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "id": "712bb90e-c2f9-4f22-be2b-65afc16400a4", 258 | "metadata": {}, 259 | "source": [ 260 | "Source: [Software Carpentries](https://swcarpentry.github.io/python-novice-inflammation/02-numpy/index.html)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "id": "868a1b97-f825-488a-bf3a-984949111895", 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "id": "61b2cb3c-9686-434c-8eba-efffd0cae647", 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [] 278 | }, 279 | { 280 | "cell_type": "markdown", 
281 | "id": "a947bde6-c61a-4aef-b1af-a9e6b31d33ce", 282 | "metadata": {}, 283 | "source": [ 284 | "Now let's move to the _real_ data file: **how can we re-use the same algorithm?**" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "id": "6866f778-3b98-4114-8bbb-8764eccf0c1c", 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "id": "84e9f8df-5a87-47f8-8fcb-e22e951e52d6", 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "id": "c6f34a5d-3bc7-47a2-bb31-4272970e5299", 306 | "metadata": {}, 307 | "source": [ 308 | "_now we have 60 patients to deal with_ - how can we do that?" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "id": "257289a9-adfd-462c-9731-0971255fd016", 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "id": "4df67359-ea08-40ff-8b79-d077aa117de0", 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "id": "7545c8de-4946-47e4-868d-51e3f05eb8fa", 330 | "metadata": {}, 331 | "source": [ 332 | "What if we also add in a reference ID for each patient? 
(see `data/inflammation-02.csv`)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "id": "401d50af-6438-4dc0-ac94-712cefb201c8", 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "id": "6138dbb3-7226-4bf0-a422-7d050b3f8784", 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "id": "f2643109-a925-49f3-a80e-a830b5a99f23", 354 | "metadata": {}, 355 | "source": [ 356 | "Let's practice with our new data structure" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "id": "4f03168d-d223-487a-a73c-40e05a40da76", 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "id": "efe6dab4-fbd9-4b20-baa9-897fdf52dc7a", 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "id": "d658d0c1-a451-4a5a-96e3-c0bc56c90431", 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "id": "ae5424a8-53d5-4a0f-9a28-9fa1b8714777", 386 | "metadata": {}, 387 | "source": [ 388 | "Let's get on with the _real deal_ : let's gather some statistics!" 
389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "id": "1ba867f5-daf0-415b-8a65-6178fa74a8f8", 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "id": "c7c221bf-62c9-4640-a1e9-18ac0bdfb263", 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "id": "bded191e-e0aa-4406-934c-4873de21e3b9", 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "id": "c94458ea-bf76-41ab-93a6-949f40fbc10d", 419 | "metadata": {}, 420 | "outputs": [], 421 | "source": [] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "id": "d234ec24-3e1b-4408-bffd-f74634967ae6", 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [] 430 | } 431 | ], 432 | "metadata": { 433 | "kernelspec": { 434 | "display_name": "Python 3 (ipykernel)", 435 | "language": "python", 436 | "name": "python3" 437 | }, 438 | "language_info": { 439 | "codemirror_mode": { 440 | "name": "ipython", 441 | "version": 3 442 | }, 443 | "file_extension": ".py", 444 | "mimetype": "text/x-python", 445 | "name": "python", 446 | "nbconvert_exporter": "python", 447 | "pygments_lexer": "ipython3", 448 | "version": "3.9.7" 449 | } 450 | }, 451 | "nbformat": 4, 452 | "nbformat_minor": 5 453 | } 454 | -------------------------------------------------------------------------------- /scikit-learn/02.4 Unsupervised Learning - Clustering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "slideshow": { 8 | "slide_type": "skip" 9 | }, 10 | "tags": [] 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "%matplotlib inline\n", 15 | "import matplotlib.pyplot as plt\n", 16 
| "import numpy as np" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "slideshow": { 23 | "slide_type": "slide" 24 | } 25 | }, 26 | "source": [ 27 | "# Clustering" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "slideshow": { 34 | "slide_type": "subslide" 35 | } 36 | }, 37 | "source": [ 38 | "Clustering is the task of gathering samples into groups of similar\n", 39 | "samples according to some predefined similarity or dissimilarity\n", 40 | "measure (such as the Euclidean distance).\n", 41 | "In this section we will explore a basic clustering task on some synthetic and real datasets.\n", 42 | "\n", 43 | "Here are some common applications of clustering algorithms:\n", 44 | "\n", 45 | "- Compression, in a data reduction sense\n", 46 | "- Can be used as a preprocessing step for recommender systems\n", 47 | "- Similarly:\n", 48 | " - grouping related web news (e.g. Google News) and web search results\n", 49 | " - grouping related stock quotes for investment portfolio management\n", 50 | " - building customer profiles for market analysis\n", 51 | "- Building a code book of prototype samples for unsupervised feature extraction\n", 52 | "\n" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": { 58 | "slideshow": { 59 | "slide_type": "subslide" 60 | } 61 | }, 62 | "source": [ 63 | "Let's start off with a very simple and obvious example:" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "collapsed": false, 71 | "jupyter": { 72 | "outputs_hidden": false 73 | } 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "from sklearn.datasets import make_blobs\n", 78 | "X, y = make_blobs(random_state=42)\n", 79 | "X.shape" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "collapsed": false, 87 | "jupyter": { 88 | "outputs_hidden": false 89 | }, 90 | "slideshow": { 91 | "slide_type": "subslide" 92 | } 93 | }, 
94 | "outputs": [], 95 | "source": [ 96 | "plt.scatter(X[:, 0], X[:, 1])" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": { 102 | "slideshow": { 103 | "slide_type": "subslide" 104 | } 105 | }, 106 | "source": [ 107 | "There are clearly three separate groups of points in the data, and we would like to recover them using clustering.\n", 108 | "Even if the groups are obvious in the data, it is hard to find them when the data lives in a high-dimensional space." 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": { 114 | "slideshow": { 115 | "slide_type": "fragment" 116 | } 117 | }, 118 | "source": [ 119 | "Now we will use one of the simplest clustering algorithms, K-means.\n", 120 | "This is an iterative algorithm which searches for three cluster\n", 121 | "centers such that the distance from each point to its cluster is\n", 122 | "minimized.\n", 123 | "**Question:** what would you expect the output to look like?" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "collapsed": false, 131 | "jupyter": { 132 | "outputs_hidden": false 133 | }, 134 | "slideshow": { 135 | "slide_type": "subslide" 136 | } 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "from sklearn.cluster import KMeans\n", 141 | "\n", 142 | "kmeans = KMeans(n_clusters=3, random_state=42)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": { 148 | "slideshow": { 149 | "slide_type": "subslide" 150 | } 151 | }, 152 | "source": [ 153 | "We can get the cluster labels either by calling fit and then accessing the \n", 154 | "``labels_`` attribute of the K means estimator, or by calling ``fit_predict``.\n", 155 | "Either way, the result contains the ID of the cluster that each point is assigned to." 
156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": { 162 | "collapsed": false, 163 | "jupyter": { 164 | "outputs_hidden": false 165 | }, 166 | "slideshow": { 167 | "slide_type": "fragment" 168 | } 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "labels = kmeans.fit_predict(X)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "collapsed": false, 180 | "jupyter": { 181 | "outputs_hidden": false 182 | }, 183 | "slideshow": { 184 | "slide_type": "fragment" 185 | } 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "labels" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": { 196 | "collapsed": false, 197 | "jupyter": { 198 | "outputs_hidden": false 199 | }, 200 | "slideshow": { 201 | "slide_type": "fragment" 202 | } 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "all(labels == kmeans.labels_)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": { 212 | "slideshow": { 213 | "slide_type": "subslide" 214 | } 215 | }, 216 | "source": [ 217 | "Let's visualize the assignments that have been found" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": { 224 | "collapsed": false, 225 | "jupyter": { 226 | "outputs_hidden": false 227 | } 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "plt.scatter(X[:, 0], X[:, 1], c=labels)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": { 237 | "slideshow": { 238 | "slide_type": "subslide" 239 | } 240 | }, 241 | "source": [ 242 | "Here, we are probably satisfied with the clustering. But in general we might want to have a more quantitative evaluation. How about we compare our cluster labels with the ground truth we got when generating the blobs?" 
243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": false, 250 | "jupyter": { 251 | "outputs_hidden": false 252 | } 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "from sklearn.metrics import confusion_matrix, accuracy_score\n", 257 | "print(accuracy_score(y, labels))\n", 258 | "print(confusion_matrix(y, labels))\n" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": { 265 | "collapsed": false, 266 | "jupyter": { 267 | "outputs_hidden": false 268 | }, 269 | "slideshow": { 270 | "slide_type": "fragment" 271 | } 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "np.mean(y == labels)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": { 281 | "slideshow": { 282 | "slide_type": "subslide" 283 | } 284 | }, 285 | "source": [ 286 | "Even though we recovered the partitioning of the data into clusters perfectly, the cluster IDs we assigned were arbitrary,\n", 287 | "and we cannot hope to recover them. Therefore, we must use a different scoring metric, such as ``adjusted_rand_score``, which is invariant to permutations of the labels:" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": { 294 | "collapsed": false, 295 | "jupyter": { 296 | "outputs_hidden": false 297 | } 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "from sklearn.metrics import adjusted_rand_score\n", 302 | "adjusted_rand_score(y, labels)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": { 308 | "slideshow": { 309 | "slide_type": "subslide" 310 | } 311 | }, 312 | "source": [ 313 | "**Clustering comes with assumptions**: A clustering algorithm finds clusters by making assumptions about which samples should be grouped together. 
\n", 314 | "\n", 315 | "Each algorithm makes different assumptions and the quality and interpretability of your results will depend on whether the assumptions are satisfied for your goal. \n", 316 | "\n", 317 | "For K-means clustering, the model is that all clusters have equal, spherical variance." 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": { 323 | "slideshow": { 324 | "slide_type": "subslide" 325 | } 326 | }, 327 | "source": [ 328 | "**In general, there is no guarantee that structure found by a clustering algorithm has anything to do with what you were interested in**." 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": { 334 | "slideshow": { 335 | "slide_type": "fragment" 336 | } 337 | }, 338 | "source": [ 339 | "We can easily create a dataset that has non-isotropic clusters, on which kmeans will fail:" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": false, 347 | "jupyter": { 348 | "outputs_hidden": false 349 | }, 350 | "slideshow": { 351 | "slide_type": "subslide" 352 | } 353 | }, 354 | "outputs": [], 355 | "source": [ 356 | "from sklearn.datasets import make_blobs\n", 357 | "\n", 358 | "X, y = make_blobs(random_state=170, n_samples=600)\n", 359 | "rng = np.random.RandomState(74)\n", 360 | "\n", 361 | "transformation = rng.normal(size=(2, 2))\n", 362 | "X = np.dot(X, transformation)\n", 363 | "\n", 364 | "y_pred = KMeans(n_clusters=3).fit_predict(X)\n", 365 | "\n", 366 | "plt.scatter(X[:, 0], X[:, 1], c=y_pred)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "slideshow": { 373 | "slide_type": "slide" 374 | } 375 | }, 376 | "source": [ 377 | "## Some Notable Clustering Routines" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": { 383 | "slideshow": { 384 | "slide_type": "subslide" 385 | } 386 | }, 387 | "source": [ 388 | "The following are two well-known clustering algorithms. 
\n", 389 | "\n", 390 | "- `sklearn.cluster.KMeans`:
\n", 391 | " The simplest, yet effective clustering algorithm. Needs to be provided with the\n", 392 | " number of clusters in advance, and assumes that the input data is normalized\n", 393 | " (a PCA model can be used as a preprocessor).\n", 394 | "- `sklearn.cluster.MeanShift`:
\n", 395 | " Can find better looking clusters than KMeans but does not scale to a large number of samples." 396 | ] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "metadata": { 401 | "slideshow": { 402 | "slide_type": "subslide" 403 | } 404 | }, 405 | "source": [ 406 | "- `sklearn.cluster.DBSCAN`:
\n", 407 | " Can detect irregularly shaped clusters based on density, i.e. sparse regions in\n", 408 | " the input space are likely to become inter-cluster boundaries. Can also detect\n", 409 | " outliers (samples that are not part of a cluster).\n", 410 | "- `sklearn.cluster.AffinityPropagation`:
\n", 411 | " Clustering algorithm based on message passing between data points." 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": { 417 | "slideshow": { 418 | "slide_type": "subslide" 419 | } 420 | }, 421 | "source": [ 422 | "- `sklearn.cluster.SpectralClustering`:
\n", 423 | " KMeans applied to a projection of the normalized graph Laplacian: finds\n", 424 | " normalized graph cuts if the affinity matrix is interpreted as an adjacency matrix of a graph.\n", 425 | "- `sklearn.cluster.Ward`:
\n", 426 | " Ward implements hierarchical clustering based on the Ward algorithm,\n", 427 | " a variance-minimizing approach. At each step, it minimizes the sum of\n", 428 | " squared differences within all clusters (inertia criterion). In recent scikit-learn releases this is provided by `sklearn.cluster.AgglomerativeClustering` with `linkage='ward'`.\n" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": { 434 | "slideshow": { 435 | "slide_type": "fragment" 436 | } 437 | }, 438 | "source": [ 439 | "Of these, Ward, SpectralClustering, DBSCAN and Affinity propagation can also work with precomputed similarity matrices." 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": { 445 | "slideshow": { 446 | "slide_type": "subslide" 447 | } 448 | }, 449 | "source": [ 450 | "" 451 | ] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | "metadata": { 456 | "slideshow": { 457 | "slide_type": "slide" 458 | } 459 | }, 460 | "source": [ 461 | "## Exercise: digits clustering" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": { 467 | "slideshow": { 468 | "slide_type": "-" 469 | } 470 | }, 471 | "source": [ 472 | "Perform K-means clustering on the digits data, searching for ten clusters.\n", 473 | "Visualize the cluster centers as images (i.e. reshape each to 8x8 and use\n", 474 | "``plt.imshow``). Do the clusters seem to be correlated with particular digits? What is the ``adjusted_rand_score``?\n", 475 | "\n", 476 | "Visualize the projected digits as in the last notebook, but this time use the\n", 477 | "cluster labels as the color. What do you notice?" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": { 484 | "collapsed": false, 485 | "jupyter": { 486 | "outputs_hidden": false 487 | }, 488 | "slideshow": { 489 | "slide_type": "skip" 490 | } 491 | }, 492 | "outputs": [], 493 | "source": [ 494 | "from sklearn.datasets import load_digits\n", 495 | "digits = load_digits()\n", 496 | "# ..." 
497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "metadata": { 503 | "slideshow": { 504 | "slide_type": "skip" 505 | }, 506 | "tags": [] 507 | }, 508 | "outputs": [], 509 | "source": [] 510 | } 511 | ], 512 | "metadata": { 513 | "kernelspec": { 514 | "display_name": "Python 3 (ipykernel)", 515 | "language": "python", 516 | "name": "python3" 517 | }, 518 | "language_info": { 519 | "codemirror_mode": { 520 | "name": "ipython", 521 | "version": 3 522 | }, 523 | "file_extension": ".py", 524 | "mimetype": "text/x-python", 525 | "name": "python", 526 | "nbconvert_exporter": "python", 527 | "pygments_lexer": "ipython3", 528 | "version": "3.9.7" 529 | } 530 | }, 531 | "nbformat": 4, 532 | "nbformat_minor": 4 533 | } 534 | -------------------------------------------------------------------------------- /python_extras/functions-objects.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Functions as Objects" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "subslide" 19 | } 20 | }, 21 | "source": [ 22 | "Functions in Python are **first-class objects**. \n", 23 | "\n", 24 | "Programming language theorists define a **first-class object** as a program entity that can be:\n", 25 | "\n", 26 | "- Created at runtime\n", 27 | "- Assigned to a variable or element in a data structure\n", 28 | "- Passed as an argument to a function\n", 29 | "- Returned as the result of a function" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": { 35 | "slideshow": { 36 | "slide_type": "subslide" 37 | } 38 | }, 39 | "source": [ 40 | "Integers, strings, and dictionaries are other examples of first-class objects in Python — nothing fancy here. 
\n", 41 | "\n", 42 | "But if you came to Python from a language where functions are **not** first-class citizens, this notebook and the rest focus on the implications and practical applications of treating functions as objects." 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": { 48 | "slideshow": { 49 | "slide_type": "subslide" 50 | } 51 | }, 52 | "source": [ 53 | "#### Treating a Function like an Object" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 1, 59 | "metadata": { 60 | "slideshow": { 61 | "slide_type": "fragment" 62 | } 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "def factorial(n):\n", 67 | " '''returns n!'''\n", 68 | " return 1 if n < 2 else n * factorial(n-1)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 2, 74 | "metadata": { 75 | "slideshow": { 76 | "slide_type": "subslide" 77 | } 78 | }, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "1405006117752879898543142606244511569936384000000000" 84 | ] 85 | }, 86 | "execution_count": 2, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "factorial(42)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 3, 98 | "metadata": { 99 | "slideshow": { 100 | "slide_type": "fragment" 101 | } 102 | }, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "'returns n!'" 108 | ] 109 | }, 110 | "execution_count": 3, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "factorial.__doc__" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 4, 122 | "metadata": { 123 | "slideshow": { 124 | "slide_type": "fragment" 125 | } 126 | }, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "function" 132 | ] 133 | }, 134 | "execution_count": 4, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "type(factorial)" 141 | 
] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": { 146 | "slideshow": { 147 | "slide_type": "subslide" 148 | } 149 | }, 150 | "source": [ 151 | "##### Introspection" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 8, 157 | "metadata": { 158 | "slideshow": { 159 | "slide_type": "fragment" 160 | } 161 | }, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "['__annotations__',\n", 167 | " '__call__',\n", 168 | " '__class__',\n", 169 | " '__closure__',\n", 170 | " '__code__',\n", 171 | " '__defaults__',\n", 172 | " '__delattr__',\n", 173 | " '__dict__',\n", 174 | " '__dir__',\n", 175 | " '__doc__',\n", 176 | " '__eq__',\n", 177 | " '__format__',\n", 178 | " '__ge__',\n", 179 | " '__get__',\n", 180 | " '__getattribute__',\n", 181 | " '__globals__',\n", 182 | " '__gt__',\n", 183 | " '__hash__',\n", 184 | " '__init__',\n", 185 | " '__kwdefaults__',\n", 186 | " '__le__',\n", 187 | " '__lt__',\n", 188 | " '__module__',\n", 189 | " '__name__',\n", 190 | " '__ne__',\n", 191 | " '__new__',\n", 192 | " '__qualname__',\n", 193 | " '__reduce__',\n", 194 | " '__reduce_ex__',\n", 195 | " '__repr__',\n", 196 | " '__setattr__',\n", 197 | " '__sizeof__',\n", 198 | " '__str__',\n", 199 | " '__subclasshook__']" 200 | ] 201 | }, 202 | "execution_count": 8, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "dir(factorial)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 9, 214 | "metadata": { 215 | "slideshow": { 216 | "slide_type": "fragment" 217 | } 218 | }, 219 | "outputs": [ 220 | { 221 | "data": { 222 | "text/plain": [ 223 | "function" 224 | ] 225 | }, 226 | "execution_count": 9, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "factorial.__class__" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": { 238 | "slideshow": { 239 | "slide_type": "subslide" 240 | } 241 
| }, 242 | "source": [ 243 | "#### Use function through a different name, and pass function as argument" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 5, 249 | "metadata": { 250 | "slideshow": { 251 | "slide_type": "-" 252 | } 253 | }, 254 | "outputs": [ 255 | { 256 | "data": { 257 | "text/plain": [ 258 | "" 259 | ] 260 | }, 261 | "execution_count": 5, 262 | "metadata": {}, 263 | "output_type": "execute_result" 264 | } 265 | ], 266 | "source": [ 267 | "fact = factorial\n", 268 | "fact" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 6, 274 | "metadata": { 275 | "slideshow": { 276 | "slide_type": "fragment" 277 | } 278 | }, 279 | "outputs": [ 280 | { 281 | "data": { 282 | "text/plain": [ 283 | "120" 284 | ] 285 | }, 286 | "execution_count": 6, 287 | "metadata": {}, 288 | "output_type": "execute_result" 289 | } 290 | ], 291 | "source": [ 292 | "fact(5)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": { 298 | "slideshow": { 299 | "slide_type": "fragment" 300 | } 301 | }, 302 | "source": [ 303 | "**Note**: Having first-class functions enables programming in a **functional style**. \n", 304 | "\n", 305 | "One of the hallmarks of functional programming is the use of **higher-order functions**.\n" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": { 311 | "slideshow": { 312 | "slide_type": "slide" 313 | } 314 | }, 315 | "source": [ 316 | "## Higher-Order Functions" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": { 322 | "slideshow": { 323 | "slide_type": "subslide" 324 | } 325 | }, 326 | "source": [ 327 | ">A function that takes a function as argument or returns a function as the result is a higher-order function. \n", 328 | "\n", 329 | "One example is `map`. 
Another is the built-in function `sorted`: an optional `key` argument lets you provide a function to be applied to each item for sorting; the `list.sort` method accepts the same argument.\n", 330 | "\n", 331 | "For example, to sort a list of words by length, simply pass the `len` function as the key:" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 7, 337 | "metadata": { 338 | "slideshow": { 339 | "slide_type": "subslide" 340 | } 341 | }, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "text/plain": [ 346 | "['fig', 'apple', 'cherry', 'banana', 'raspberry', 'strawberry']" 347 | ] 348 | }, 349 | "execution_count": 7, 350 | "metadata": {}, 351 | "output_type": "execute_result" 352 | } 353 | ], 354 | "source": [ 355 | "fruits = ['strawberry', 'fig', 'apple', 'cherry', 'raspberry', 'banana']\n", 356 | "sorted(fruits, key=len)" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": { 362 | "slideshow": { 363 | "slide_type": "slide" 364 | } 365 | }, 366 | "source": [ 367 | "## Anonymous Functions" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": { 373 | "slideshow": { 374 | "slide_type": "subslide" 375 | } 376 | }, 377 | "source": [ 378 | "The `lambda` keyword creates an anonymous function within a Python expression.\n", 379 | "\n", 380 | "However, the simple syntax of Python limits the body of `lambda` functions to be pure expressions. \n", 381 | "\n", 382 | "In other words, the body of a lambda cannot make assignments or use any other Python statement such as `while`, etc.\n", 383 | "\n", 384 | "The best use of anonymous functions is in the context of an argument list." 
385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 10, 390 | "metadata": { 391 | "slideshow": { 392 | "slide_type": "fragment" 393 | } 394 | }, 395 | "outputs": [ 396 | { 397 | "data": { 398 | "text/plain": [ 399 | "['banana', 'apple', 'fig', 'raspberry', 'strawberry', 'cherry']" 400 | ] 401 | }, 402 | "execution_count": 10, 403 | "metadata": {}, 404 | "output_type": "execute_result" 405 | } 406 | ], 407 | "source": [ 408 | "fruits = ['strawberry', 'fig', 'apple', 'cherry', 'raspberry', 'banana']\n", 409 | "sorted(fruits, key=lambda word: word[::-1])" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": { 415 | "slideshow": { 416 | "slide_type": "fragment" 417 | } 418 | }, 419 | "source": [ 420 | "Outside the limited context of arguments to higher-order functions, anonymous functions are rarely useful in Python. \n", 421 | "\n", 422 | "The syntactic restrictions tend to make nontrivial lambdas either unreadable or unworkable.\n", 423 | "\n" 424 | ] 425 | }, 426 | { 427 | "cell_type": "markdown", 428 | "metadata": { 429 | "slideshow": { 430 | "slide_type": "subslide" 431 | } 432 | }, 433 | "source": [ 434 | "**Lundh’s lambda Refactoring Recipe**\n", 435 | "\n", 436 | "If you find a piece of code hard to understand because of a lambda, Fredrik Lundh suggests this refactoring procedure:\n", 437 | "\n", 438 | "> 1. Write a comment explaining what the heck that lambda does. \n", 439 | "> 2. Study the comment for a while, and think of a name that captures the essence of\n", 440 | "the comment.\n", 441 | "> 3. Convert the lambda to a def statement, using that name.\n", 442 | "> 4. Remove the comment.\n", 443 | "\n", 444 | "These steps are quoted from the [Functional Programming HOWTO](https://docs.python.org/3/howto/functional.html), a **must read**." 
445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "metadata": { 450 | "slideshow": { 451 | "slide_type": "fragment" 452 | } 453 | }, 454 | "source": [ 455 | "The `lambda` syntax is just syntactic sugar: a lambda expression creates a function object just like the def statement. " 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": { 461 | "slideshow": { 462 | "slide_type": "slide" 463 | } 464 | }, 465 | "source": [ 466 | "## Function Annotations" 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": { 472 | "slideshow": { 473 | "slide_type": "subslide" 474 | } 475 | }, 476 | "source": [ 477 | "**Python 3** provides syntax to attach _metadata_ to the parameters of a function declaration and its return value. \n" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 11, 483 | "metadata": { 484 | "slideshow": { 485 | "slide_type": "fragment" 486 | } 487 | }, 488 | "outputs": [], 489 | "source": [ 490 | "def clip(text:str, max_len:'int > 0'=80) -> str:\n", 491 | " \"\"\"Return text clipped at the last space before or after max_len\n", 492 | " \"\"\"\n", 493 | " end = None\n", 494 | " if len(text) > max_len:\n", 495 | " space_before = text.rfind(' ', 0, max_len)\n", 496 | " if space_before >= 0:\n", 497 | " end = space_before\n", 498 | " else:\n", 499 | " space_after = text.rfind(' ', max_len)\n", 500 | " if space_after >= 0:\n", 501 | " end = space_after\n", 502 | " if end is None: # no spaces were found\n", 503 | " end = len(text)\n", 504 | " return text[:end].rstrip()" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 12, 510 | "metadata": { 511 | "slideshow": { 512 | "slide_type": "fragment" 513 | } 514 | }, 515 | "outputs": [ 516 | { 517 | "data": { 518 | "text/plain": [ 519 | "{'max_len': 'int > 0', 'return': str, 'text': str}" 520 | ] 521 | }, 522 | "execution_count": 12, 523 | "metadata": {}, 524 | "output_type": "execute_result" 525 | } 526 | ], 527 | 
"source": [ 528 | "clip.__annotations__" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": { 534 | "slideshow": { 535 | "slide_type": "subslide" 536 | } 537 | }, 538 | "source": [ 539 | "**Annotations Overview:**\n", 540 | "\n", 541 | "- Each argument in the function declaration may have an annotation expression preceded by `:`. \n", 542 | "- If there is a _default value_, the annotation goes between the argument name and the `=` sign. \n", 543 | "- To annotate the `return` value, add `->` and another expression between the `)` and the `:` at the tail of the function declaration. \n", 544 | "\n", 545 | "The expressions may be of any type. The most common types used in annotations are classes, like `str` or `int`, or strings, like `'int > 0'`." 546 | ] 547 | } 548 | ], 549 | "metadata": { 550 | "kernelspec": { 551 | "display_name": "Python 3", 552 | "language": "python", 553 | "name": "python3" 554 | }, 555 | "language_info": { 556 | "codemirror_mode": { 557 | "name": "ipython", 558 | "version": 3 559 | }, 560 | "file_extension": ".py", 561 | "mimetype": "text/x-python", 562 | "name": "python", 563 | "nbconvert_exporter": "python", 564 | "pygments_lexer": "ipython3", 565 | "version": "3.7.6" 566 | } 567 | }, 568 | "nbformat": 4, 569 | "nbformat_minor": 2 570 | } 571 | -------------------------------------------------------------------------------- /programming_with_python/exceptions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Exceptions" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "subslide" 19 | } 20 | }, 21 | "source": [ 22 | "Exceptions are events that can modify the *flow* of control through a program. 
\n", 23 | "\n", 24 | "In Python, exceptions are triggered automatically on errors, and they can be triggered and intercepted by your code.\n", 25 | "\n", 26 | "They are processed by **four** statements we’ll study in this notebook, the first of which has two variations (listed separately here) and the last of which was an optional extension until Python 2.6 and 3.0:\n", 27 | "\n", 28 | "* `try/except`:\n", 29 | " * Catch and recover from exceptions raised by Python, or by you\n", 30 | " \n", 31 | "* `try/finally`:\n", 32 | " * Perform cleanup actions, whether exceptions occur or not.\n", 33 | "\n", 34 | "* `raise`:\n", 35 | " * Trigger an exception manually in your code.\n", 36 | " \n", 37 | "* `assert`:\n", 38 | " * Conditionally trigger an exception in your code.\n", 39 | " \n", 40 | "* `with/as`:\n", 41 | " * Implement context managers in Python 2.6, 3.0, and later (optional in 2.5)." 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": { 47 | "slideshow": { 48 | "slide_type": "slide" 49 | } 50 | }, 51 | "source": [ 52 | "# `try/except` Statement" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": { 58 | "slideshow": { 59 | "slide_type": "fragment" 60 | } 61 | }, 62 | "source": [ 63 | "```\n", 64 | "try:\n", 65 | " statements # Run this main action first\n", 66 | "except name1: \n", 67 | " # Run if name1 is raised during try block\n", 68 | " statements\n", 69 | "except (name2, name3): \n", 70 | " # Run if any of these exceptions occur\n", 71 | " statements \n", 72 | "except name4 as var: \n", 73 | " # Run if name4 is raised, assign instance raised to var \n", 74 | " statements\n", 75 | "except: # Run for all other exceptions raised\n", 76 | " statements\n", 77 | "else:\n", 78 | " statements # Run if no exception was raised during try block\n", 79 | "```" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 1, 85 | "metadata": { 86 | "scrolled": true, 87 | "slideshow": { 88 | "slide_type": "subslide" 89 | } 90 
| }, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "list_of_numbers = [number for number in range(1, 100)]\n", 102 | "print(list_of_numbers)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 2, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "dictionary_of_numbers = {}\n", 112 | "for number in list_of_numbers:\n", 113 | " dictionary_of_numbers[number**2] = number" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 3, 119 | "metadata": { 120 | "scrolled": true 121 | }, 122 | "outputs": [ 123 | { 124 | "data": { 125 | "text/plain": [ 126 | "{1: 1,\n", 127 | " 4: 2,\n", 128 | " 9: 3,\n", 129 | " 16: 4,\n", 130 | " 25: 5,\n", 131 | " 36: 6,\n", 132 | " 49: 7,\n", 133 | " 64: 8,\n", 134 | " 81: 9,\n", 135 | " 100: 10,\n", 136 | " 121: 11,\n", 137 | " 144: 12,\n", 138 | " 169: 13,\n", 139 | " 196: 14,\n", 140 | " 225: 15,\n", 141 | " 256: 16,\n", 142 | " 289: 17,\n", 143 | " 324: 18,\n", 144 | " 361: 19,\n", 145 | " 400: 20,\n", 146 | " 441: 21,\n", 147 | " 484: 22,\n", 148 | " 529: 23,\n", 149 | " 576: 24,\n", 150 | " 625: 25,\n", 151 | " 676: 26,\n", 152 | " 729: 27,\n", 153 | " 784: 28,\n", 154 | " 841: 29,\n", 155 | " 900: 30,\n", 156 | " 961: 31,\n", 157 | " 1024: 32,\n", 158 | " 1089: 33,\n", 159 | " 1156: 34,\n", 160 | " 1225: 35,\n", 161 | " 1296: 36,\n", 162 | " 1369: 37,\n", 163 | " 1444: 38,\n", 164 | " 1521: 39,\n", 165 | " 1600: 40,\n", 166 | " 1681: 41,\n", 167 | " 1764: 42,\n", 168 | " 1849: 43,\n", 169 
| " 1936: 44,\n", 170 | " 2025: 45,\n", 171 | " 2116: 46,\n", 172 | " 2209: 47,\n", 173 | " 2304: 48,\n", 174 | " 2401: 49,\n", 175 | " 2500: 50,\n", 176 | " 2601: 51,\n", 177 | " 2704: 52,\n", 178 | " 2809: 53,\n", 179 | " 2916: 54,\n", 180 | " 3025: 55,\n", 181 | " 3136: 56,\n", 182 | " 3249: 57,\n", 183 | " 3364: 58,\n", 184 | " 3481: 59,\n", 185 | " 3600: 60,\n", 186 | " 3721: 61,\n", 187 | " 3844: 62,\n", 188 | " 3969: 63,\n", 189 | " 4096: 64,\n", 190 | " 4225: 65,\n", 191 | " 4356: 66,\n", 192 | " 4489: 67,\n", 193 | " 4624: 68,\n", 194 | " 4761: 69,\n", 195 | " 4900: 70,\n", 196 | " 5041: 71,\n", 197 | " 5184: 72,\n", 198 | " 5329: 73,\n", 199 | " 5476: 74,\n", 200 | " 5625: 75,\n", 201 | " 5776: 76,\n", 202 | " 5929: 77,\n", 203 | " 6084: 78,\n", 204 | " 6241: 79,\n", 205 | " 6400: 80,\n", 206 | " 6561: 81,\n", 207 | " 6724: 82,\n", 208 | " 6889: 83,\n", 209 | " 7056: 84,\n", 210 | " 7225: 85,\n", 211 | " 7396: 86,\n", 212 | " 7569: 87,\n", 213 | " 7744: 88,\n", 214 | " 7921: 89,\n", 215 | " 8100: 90,\n", 216 | " 8281: 91,\n", 217 | " 8464: 92,\n", 218 | " 8649: 93,\n", 219 | " 8836: 94,\n", 220 | " 9025: 95,\n", 221 | " 9216: 96,\n", 222 | " 9409: 97,\n", 223 | " 9604: 98,\n", 224 | " 9801: 99}" 225 | ] 226 | }, 227 | "execution_count": 3, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "dictionary_of_numbers" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "slideshow": { 241 | "slide_type": "fragment" 242 | } 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "try:\n", 247 | " index = list_of_numbers.index(2)\n", 248 | " value = dictionary_of_numbers[index]\n", 249 | "except (ValueError, KeyError):\n", 250 | " print('Error Raised, but Controlled! 
')\n", 251 | "else: \n", 252 | " # This executes ONLY if no exception is raised\n", 253 | " print('Getting number at position %d : %d' % (index, value))\n", 254 | "finally:\n", 255 | " # Do cleanup operations\n", 256 | " print('Cleaning UP')" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": { 262 | "slideshow": { 263 | "slide_type": "slide" 264 | } 265 | }, 266 | "source": [ 267 | "# `try/finally` Statement" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": { 273 | "slideshow": { 274 | "slide_type": "subslide" 275 | } 276 | }, 277 | "source": [ 278 | "The other flavor of the try statement is a specialization that has to do with finalization (a.k.a. termination) actions. If a finally clause is included in a try, Python will always run its block of statements “on the way out” of the try statement, whether an exception occurred while the try block was running or not. \n", 279 | "\n", 280 | "In its general form, it is:\n", 281 | "\n", 282 | "```\n", 283 | "try:\n", 284 | " statements # Run this action first \n", 285 | "finally:\n", 286 | " statements # Always run this code on the way out\n", 287 | "```" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": { 293 | "slideshow": { 294 | "slide_type": "subslide" 295 | } 296 | }, 297 | "source": [ 298 | "# `with/as` Context Managers" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": { 304 | "slideshow": { 305 | "slide_type": "fragment" 306 | } 307 | }, 308 | "source": [ 309 | "Python 2.6 and 3.0 introduced a new exception-related statement—the with, and its optional as clause. This statement is designed to work with context manager objects, which support a new method-based protocol, similar in spirit to the way that iteration tools work with methods of the iteration protocol. 
" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": { 315 | "slideshow": { 316 | "slide_type": "subslide" 317 | } 318 | }, 319 | "source": [ 320 | "## Context Manager Intro" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": { 326 | "slideshow": { 327 | "slide_type": "fragment" 328 | } 329 | }, 330 | "source": [ 331 | "### Basic Usage:\n", 332 | "\n", 333 | "```\n", 334 | "with expression [as variable]: \n", 335 | " with-block\n", 336 | "```" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": { 342 | "slideshow": { 343 | "slide_type": "subslide" 344 | } 345 | }, 346 | "source": [ 347 | "### Classical Usage\n", 348 | "\n", 349 | "```python\n", 350 | "\n", 351 | "with open(r'C:\\misc\\data') as myfile: \n", 352 | " for line in myfile:\n", 353 | " print(line)\n", 354 | " # ...more code here...\n", 355 | "```\n", 356 | "\n", 357 | "... even using multiple context managers:\n", 358 | "\n", 359 | "```python\n", 360 | "with open('script1.py') as f1, open('script2.py') as f2: \n", 361 | " for (linenum, (line1, line2)) in enumerate(zip(f1, f2)):\n", 362 | " if line1 != line2:\n", 363 | " print('%s\\n%r\\n%r' % (linenum, line1, line2))\n", 364 | "```" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": { 370 | "slideshow": { 371 | "slide_type": "subslide" 372 | } 373 | }, 374 | "source": [ 375 | "### How it works\n", 376 | "\n", 377 | "1. The expression is evaluated,resulting in an object known as a **context manager** that must have `__enter__` and `__exit__` methods\n", 378 | "\n", 379 | "2. The context manager’s `__enter__` method is called. The value it returns is assigned to the variable in the as clause if present, or simply discarded otherwise\n", 380 | "\n", 381 | "3. The code in the nested with block is executed.\n", 382 | "\n", 383 | "4. If the with block raises an exception, the `__exit__(type,value,traceback)` method is called with the exception details. 
These are the same three values returned by `sys.exc_info` (Python function). If this method returns a `false` value, the exception is **re-raised**; otherwise, the exception is terminated. The exception should normally be reraised so that it is propagated outside the with statement.\n", 384 | "\n", 385 | "5. If the with block does not raise an exception, the `__exit__` method is still called, but its type, value, and traceback arguments are all passed in as `None`." 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": { 391 | "slideshow": { 392 | "slide_type": "slide" 393 | } 394 | }, 395 | "source": [ 396 | "## Usage with Exceptions" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": { 403 | "collapsed": true, 404 | "jupyter": { 405 | "outputs_hidden": true 406 | }, 407 | "slideshow": { 408 | "slide_type": "subslide" 409 | } 410 | }, 411 | "outputs": [], 412 | "source": [ 413 | "class TraceBlock:\n", 414 | " def message(self, arg):\n", 415 | " print('running ' + arg) \n", 416 | " \n", 417 | " def __enter__(self):\n", 418 | " print('starting with block')\n", 419 | " return self\n", 420 | " \n", 421 | " def __exit__(self, exc_type, exc_value, exc_tb):\n", 422 | " if exc_type is None: \n", 423 | " print('exited normally\\n')\n", 424 | " else:\n", 425 | " print('raise an exception! 
' + str(exc_type)) \n", 426 | " return False # Propagate" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": { 433 | "slideshow": { 434 | "slide_type": "fragment" 435 | } 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "with TraceBlock() as action: \n", 440 | " action.message('test 1')\n", 441 | " print('reached')" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": { 448 | "slideshow": { 449 | "slide_type": "fragment" 450 | } 451 | }, 452 | "outputs": [], 453 | "source": [ 454 | "with TraceBlock() as action: \n", 455 | " action.message('test 2') \n", 456 | " raise TypeError()\n", 457 | " print('not reached')" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": { 463 | "slideshow": { 464 | "slide_type": "subslide" 465 | } 466 | }, 467 | "source": [ 468 | "## User Defined Exceptions" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": null, 474 | "metadata": { 475 | "collapsed": true, 476 | "jupyter": { 477 | "outputs_hidden": true 478 | }, 479 | "slideshow": { 480 | "slide_type": "fragment" 481 | } 482 | }, 483 | "outputs": [], 484 | "source": [ 485 | "class AlreadyGotOne(Exception): \n", 486 | " pass\n", 487 | "\n", 488 | "def gail():\n", 489 | " raise AlreadyGotOne()" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": { 496 | "slideshow": { 497 | "slide_type": "fragment" 498 | } 499 | }, 500 | "outputs": [], 501 | "source": [ 502 | "try:\n", 503 | " gail()\n", 504 | "except AlreadyGotOne:\n", 505 | " print('got exception')" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": { 512 | "scrolled": true, 513 | "slideshow": { 514 | "slide_type": "subslide" 515 | } 516 | }, 517 | "outputs": [], 518 | "source": [ 519 | "class Career(Exception):\n", 520 | " \n", 521 | " def __init__(self, job, *args, **kwargs):\n", 522 | " 
super(Career, self).__init__(*args, **kwargs)\n", 523 | " self._job = job\n", 524 | " \n", 525 | " def __str__(self): \n", 526 | " return 'So I became a waiter of {}'.format(self._job)\n", 527 | " \n", 528 | "raise Career('Engineer')" 529 | ] 530 | } 531 | ], 532 | "metadata": { 533 | "kernelspec": { 534 | "display_name": "Python 3 (ipykernel)", 535 | "language": "python", 536 | "name": "python3" 537 | }, 538 | "language_info": { 539 | "codemirror_mode": { 540 | "name": "ipython", 541 | "version": 3 542 | }, 543 | "file_extension": ".py", 544 | "mimetype": "text/x-python", 545 | "name": "python", 546 | "nbconvert_exporter": "python", 547 | "pygments_lexer": "ipython3", 548 | "version": "3.9.6" 549 | } 550 | }, 551 | "nbformat": 4, 552 | "nbformat_minor": 4 553 | } 554 | -------------------------------------------------------------------------------- /scikit-learn/02.1 Supervised Learning - Classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false, 8 | "jupyter": { 9 | "outputs_hidden": false 10 | }, 11 | "slideshow": { 12 | "slide_type": "skip" 13 | } 14 | }, 15 | "outputs": [], 16 | "source": [ 17 | "%matplotlib inline\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "import numpy as np" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": { 25 | "slideshow": { 26 | "slide_type": "slide" 27 | } 28 | }, 29 | "source": [ 30 | "Classification\n", 31 | "========\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": { 37 | "slideshow": { 38 | "slide_type": "subslide" 39 | } 40 | }, 41 | "source": [ 42 | "To visualize the workings of machine learning algorithms, it is often helpful to study two-dimensional or one-dimensional data, that is data with only one or two features. 
\n", 43 | "\n", 44 | "While in practice, datasets usually have many more features, it is hard to plot high-dimensional data on two-dimensional screens.\n", 45 | "\n", 46 | "We will illustrate some very simple examples before we move on to more \"real world\" data sets." 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "slideshow": { 53 | "slide_type": "subslide" 54 | } 55 | }, 56 | "source": [ 57 | "First, we will look at a two class classification problem in two dimensions. We use the synthetic data generated by the ``make_blobs`` function." 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "collapsed": false, 65 | "jupyter": { 66 | "outputs_hidden": false 67 | }, 68 | "slideshow": { 69 | "slide_type": "fragment" 70 | } 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "from sklearn.datasets import make_blobs\n", 75 | "X, y = make_blobs(centers=2, random_state=0)\n", 76 | "print(X.shape)\n", 77 | "print(y.shape)\n", 78 | "print(X[:5, :])\n", 79 | "print(y[:5])" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": { 85 | "slideshow": { 86 | "slide_type": "subslide" 87 | } 88 | }, 89 | "source": [ 90 | "As the data is two-dimensional, we can plot each sample as a point in two-dimensional space, with the first feature being the x-axis and the second feature being the y-axis." 
91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "collapsed": false, 98 | "jupyter": { 99 | "outputs_hidden": false 100 | }, 101 | "slideshow": { 102 | "slide_type": "fragment" 103 | } 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "plt.scatter(X[:, 0], X[:, 1], c=y, s=40)\n", 108 | "plt.xlabel(\"first feature\")\n", 109 | "plt.ylabel(\"second feature\")" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": { 115 | "slideshow": { 116 | "slide_type": "subslide" 117 | } 118 | }, 119 | "source": [ 120 | "As classification is a supervised task, and we are interested in how well the model generalizes, we split our data into a training set,\n", 121 | "to build the model from, and a test-set, to evaluate how well our model performs on new data. \n", 122 | "The ``train_test_split`` function from the ``model_selection`` module does that for us, by randomly splitting off 25% of the data for testing.\n", 123 | "\n", 124 | "\n" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": { 131 | "slideshow": { 132 | "slide_type": "subslide" 133 | }, 134 | "tags": [] 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "from sklearn.model_selection import train_test_split\n", 139 | "\n", 140 | "X_train, X_test, y_train, y_test = train_test_split(X, y, \n", 141 | " random_state=0)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": { 147 | "slideshow": { 148 | "slide_type": "subslide" 149 | } 150 | }, 151 | "source": [ 152 | "\n" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": { 158 | "slideshow": { 159 | "slide_type": "subslide" 160 | } 161 | }, 162 | "source": [ 163 | "## Scikit-Learn Estimator API\n", 164 | "\n", 165 | "Every algorithm is exposed in scikit-learn via an ''Estimator'' object. 
For instance a logistic regression is:" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "collapsed": false, 173 | "jupyter": { 174 | "outputs_hidden": false 175 | }, 176 | "slideshow": { 177 | "slide_type": "fragment" 178 | } 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "from sklearn.linear_model import LogisticRegression" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": { 188 | "slideshow": { 189 | "slide_type": "subslide" 190 | } 191 | }, 192 | "source": [ 193 | "All models in scikit-learn have a very consistent interface.\n", 194 | "First, we instantiate the estimator object." 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": { 201 | "collapsed": false, 202 | "jupyter": { 203 | "outputs_hidden": false 204 | }, 205 | "slideshow": { 206 | "slide_type": "fragment" 207 | } 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "classifier = LogisticRegression()" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": { 218 | "collapsed": false, 219 | "jupyter": { 220 | "outputs_hidden": false 221 | }, 222 | "slideshow": { 223 | "slide_type": "fragment" 224 | } 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "X_train.shape" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": { 235 | "collapsed": false, 236 | "jupyter": { 237 | "outputs_hidden": false 238 | }, 239 | "slideshow": { 240 | "slide_type": "fragment" 241 | } 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "y_train.shape" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": { 251 | "slideshow": { 252 | "slide_type": "subslide" 253 | } 254 | }, 255 | "source": [ 256 | "To build the model from our data, that is to learn how to classify new points, we call the ``fit`` function with the training data, and the corresponding training labels (the 
desired output for the training data point):" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "collapsed": false, 264 | "jupyter": { 265 | "outputs_hidden": false 266 | }, 267 | "slideshow": { 268 | "slide_type": "fragment" 269 | } 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "classifier = classifier.fit(X_train, y_train)\n", 274 | "classifier" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": { 280 | "slideshow": { 281 | "slide_type": "subslide" 282 | } 283 | }, 284 | "source": [ 285 | "We can then apply the model to unseen data and use the model to predict the estimated outcome using the ``predict`` method:" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": { 292 | "collapsed": false, 293 | "jupyter": { 294 | "outputs_hidden": false 295 | }, 296 | "slideshow": { 297 | "slide_type": "fragment" 298 | } 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "prediction = classifier.predict(X_test)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": { 308 | "slideshow": { 309 | "slide_type": "subslide" 310 | } 311 | }, 312 | "source": [ 313 | "We can compare these against the true labels:" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": { 320 | "collapsed": false, 321 | "jupyter": { 322 | "outputs_hidden": false 323 | }, 324 | "slideshow": { 325 | "slide_type": "fragment" 326 | } 327 | }, 328 | "outputs": [], 329 | "source": [ 330 | "print(prediction)\n", 331 | "print(y_test)" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": { 337 | "slideshow": { 338 | "slide_type": "subslide" 339 | } 340 | }, 341 | "source": [ 342 | "We can evaluate our classifier quantitatively by measuring what fraction of predictions is correct (i.e. 
**accuracy**):" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": { 349 | "collapsed": false, 350 | "jupyter": { 351 | "outputs_hidden": false 352 | } 353 | }, 354 | "outputs": [], 355 | "source": [ 356 | "np.mean(prediction == y_test)" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": { 362 | "slideshow": { 363 | "slide_type": "subslide" 364 | } 365 | }, 366 | "source": [ 367 | "There is also a convenience function, ``score``, that all scikit-learn classifiers have to compute this directly from the test data:\n", 368 | " " 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": { 375 | "collapsed": false, 376 | "jupyter": { 377 | "outputs_hidden": false 378 | } 379 | }, 380 | "outputs": [], 381 | "source": [ 382 | "classifier.score(X_test, y_test)" 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": { 388 | "slideshow": { 389 | "slide_type": "subslide" 390 | } 391 | }, 392 | "source": [ 393 | "It is often helpful to compare the generalization performance (on the test set) to the performance on the training set:" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": { 400 | "collapsed": false, 401 | "jupyter": { 402 | "outputs_hidden": false 403 | } 404 | }, 405 | "outputs": [], 406 | "source": [ 407 | "classifier.score(X_train, y_train)" 408 | ] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "metadata": { 413 | "slideshow": { 414 | "slide_type": "subslide" 415 | } 416 | }, 417 | "source": [ 418 | "LogisticRegression is a so-called linear model,\n", 419 | "that means it will create a decision boundary that is linear in the input space. 
\n", 420 | "\n", 421 | "In 2D, this simply means it finds a line to separate the blue from the red:" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None):\n", 431 | " if eps is None:\n", 432 | " eps = X.std() / 2.\n", 433 | " x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps\n", 434 | " y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps\n", 435 | " xx = np.linspace(x_min, x_max, 100)\n", 436 | " yy = np.linspace(y_min, y_max, 100)\n", 437 | "\n", 438 | " X1, X2 = np.meshgrid(xx, yy)\n", 439 | " X_grid = np.c_[X1.ravel(), X2.ravel()]\n", 440 | " try:\n", 441 | " decision_values = classifier.decision_function(X_grid)\n", 442 | " levels = [0]\n", 443 | " fill_levels = [decision_values.min(), 0, decision_values.max()]\n", 444 | " except AttributeError:\n", 445 | " # no decision_function\n", 446 | " decision_values = classifier.predict_proba(X_grid)[:, 1]\n", 447 | " levels = [.5]\n", 448 | " fill_levels = [0, .5, 1]\n", 449 | "\n", 450 | " if ax is None:\n", 451 | " ax = plt.gca()\n", 452 | " if fill:\n", 453 | " ax.contourf(X1, X2, decision_values.reshape(X1.shape),\n", 454 | " levels=fill_levels, colors=['blue', 'red'])\n", 455 | " else:\n", 456 | " ax.contour(X1, X2, decision_values.reshape(X1.shape), levels=levels,\n", 457 | " colors=\"black\")\n", 458 | " ax.set_xlim(x_min, x_max)\n", 459 | " ax.set_ylim(y_min, y_max)\n", 460 | " ax.set_xticks(())\n", 461 | " ax.set_yticks(())" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": { 468 | "collapsed": false, 469 | "jupyter": { 470 | "outputs_hidden": false 471 | } 472 | }, 473 | "outputs": [], 474 | "source": [ 475 | "plt.scatter(X[:, 0], X[:, 1], c=y, s=40)\n", 476 | "plt.xlabel(\"first feature\")\n", 477 | "plt.ylabel(\"second feature\")\n", 478 | "plot_2d_separator(classifier, X)" 479 | ] 
480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "metadata": { 484 | "slideshow": { 485 | "slide_type": "subslide" 486 | } 487 | }, 488 | "source": [ 489 | "**Estimated parameters**: All the estimated parameters are attributes of the estimator object ending with an underscore. " 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": { 495 | "slideshow": { 496 | "slide_type": "fragment" 497 | } 498 | }, 499 | "source": [ 500 | "Here, these are the coefficients and the offset of the line:" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": { 507 | "collapsed": false, 508 | "jupyter": { 509 | "outputs_hidden": false 510 | }, 511 | "slideshow": { 512 | "slide_type": "fragment" 513 | } 514 | }, 515 | "outputs": [], 516 | "source": [ 517 | "print(classifier.coef_)\n", 518 | "print(classifier.intercept_)" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": { 524 | "slideshow": { 525 | "slide_type": "slide" 526 | } 527 | }, 528 | "source": [ 529 | "Another classifier: K Nearest Neighbors\n", 530 | "------------------------------------------------" 531 | ] 532 | }, 533 | { 534 | "cell_type": "markdown", 535 | "metadata": { 536 | "slideshow": { 537 | "slide_type": "subslide" 538 | } 539 | }, 540 | "source": [ 541 | "Another popular and easy to understand classifier is K nearest neighbors (kNN). \n", 542 | "\n", 543 | "It has one of the simplest learning strategies: given a new, unknown observation, look up in your reference database which ones have the closest features and assign the predominant class.\n", 544 | "\n", 545 | "The interface is exactly the same as for ``LogisticRegression``." 
546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": null, 551 | "metadata": { 552 | "collapsed": false, 553 | "jupyter": { 554 | "outputs_hidden": false 555 | }, 556 | "slideshow": { 557 | "slide_type": "fragment" 558 | } 559 | }, 560 | "outputs": [], 561 | "source": [ 562 | "from sklearn.neighbors import KNeighborsClassifier" 563 | ] 564 | }, 565 | { 566 | "cell_type": "markdown", 567 | "metadata": { 568 | "slideshow": { 569 | "slide_type": "subslide" 570 | } 571 | }, 572 | "source": [ 573 | "This time we set a parameter of the KNeighborsClassifier to tell it we only want to look at one nearest neighbor:" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": { 580 | "collapsed": false, 581 | "jupyter": { 582 | "outputs_hidden": false 583 | } 584 | }, 585 | "outputs": [], 586 | "source": [ 587 | "knn = KNeighborsClassifier(n_neighbors=1)" 588 | ] 589 | }, 590 | { 591 | "cell_type": "markdown", 592 | "metadata": { 593 | "slideshow": { 594 | "slide_type": "subslide" 595 | } 596 | }, 597 | "source": [ 598 | "We fit the model with our training data" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": null, 604 | "metadata": { 605 | "collapsed": false, 606 | "jupyter": { 607 | "outputs_hidden": false 608 | } 609 | }, 610 | "outputs": [], 611 | "source": [ 612 | "knn.fit(X_train, y_train)" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": null, 618 | "metadata": { 619 | "collapsed": false, 620 | "jupyter": { 621 | "outputs_hidden": false 622 | }, 623 | "slideshow": { 624 | "slide_type": "subslide" 625 | } 626 | }, 627 | "outputs": [], 628 | "source": [ 629 | "plt.scatter(X[:, 0], X[:, 1], c=y, s=40)\n", 630 | "plt.xlabel(\"first feature\")\n", 631 | "plt.ylabel(\"second feature\")\n", 632 | "plot_2d_separator(knn, X)" 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": null, 638 | "metadata": { 639 | "collapsed": false, 640 
| "jupyter": { 641 | "outputs_hidden": false 642 | }, 643 | "slideshow": { 644 | "slide_type": "fragment" 645 | } 646 | }, 647 | "outputs": [], 648 | "source": [ 649 | "knn.score(X_test, y_test)" 650 | ] 651 | }, 652 | { 653 | "cell_type": "markdown", 654 | "metadata": { 655 | "slideshow": { 656 | "slide_type": "slide" 657 | } 658 | }, 659 | "source": [ 660 | "Exercise\n", 661 | "=========\n", 662 | "Apply the KNeighborsClassifier to the ``iris`` dataset. Play with different values of the ``n_neighbors`` and observe how training and test score change." 663 | ] 664 | } 665 | ], 666 | "metadata": { 667 | "kernelspec": { 668 | "display_name": "Python 3 (ipykernel)", 669 | "language": "python", 670 | "name": "python3" 671 | }, 672 | "language_info": { 673 | "codemirror_mode": { 674 | "name": "ipython", 675 | "version": 3 676 | }, 677 | "file_extension": ".py", 678 | "mimetype": "text/x-python", 679 | "name": "python", 680 | "nbconvert_exporter": "python", 681 | "pygments_lexer": "ipython3", 682 | "version": "3.9.7" 683 | } 684 | }, 685 | "nbformat": 4, 686 | "nbformat_minor": 4 687 | } 688 | -------------------------------------------------------------------------------- /python_extras/data_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Python Data Model" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "subslide" 19 | } 20 | }, 21 | "source": [ 22 | "Most of the content of this book has been extracted from the book \"Fluent Python\" by Luciano Ramalho (O'Reilly, 2015)\n", 23 | "http://shop.oreilly.com/product/0636920032519.do" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": { 29 | "slideshow": { 30 | "slide_type": "subslide" 31 | } 32 | }, 33 | "source": [ 34 | ">One of the best 
qualities of Python is its consistency. \n", 35 | ">After working with Python for a while, you are able to start making informed, correct guesses about features that are new to you." 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "slideshow": { 42 | "slide_type": "fragment" 43 | } 44 | }, 45 | "source": [ 46 | "However, if you learned another object-oriented language before Python, you may have found it strange to use `len(collection)` instead of `collection.len()`. \n", 47 | "\n", 48 | "This apparent oddity is the tip of an iceberg that, when properly understood, is the key to everything we call **Pythonic**. \n", 49 | "\n", 50 | ">The iceberg is called the **Python data model**, and it describes the API that you can use to make your own objects play well with the most idiomatic language features." 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "slideshow": { 57 | "slide_type": "subslide" 58 | } 59 | }, 60 | "source": [ 61 | "> You can think of the data model as a description of Python as a framework. It formalises the interfaces of the building blocks of the language itself, such as sequences, iterators, functions, classes, context managers, and so on." 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": { 67 | "slideshow": { 68 | "slide_type": "subslide" 69 | } 70 | }, 71 | "source": [ 72 | "While coding with any framework, you spend a lot of time implementing methods that are called by the framework. The same happens when you leverage the Python data model. \n", 73 | "\n", 74 | "The Python interpreter invokes **special methods** to perform basic object operations, often triggered by **special syntax**. \n", 75 | "\n", 76 | "The special method names are always written with leading and trailing double underscores (i.e., `__getitem__`).\n", 77 | "\n", 78 | "For example, the syntax `obj[key]` is supported by the `__getitem__` special method. 
\n", 79 | "\n", 80 | "In order to evaluate `my_collection[key]`, the interpreter calls `my_collection.__getitem__(key)`.\n", 81 | "\n", 82 | "The special method names allow your objects to implement, support, and interact with basic language constructs such as:\n", 83 | "\n", 84 | "- Iteration\n", 85 | "- Collections\n", 86 | "- Attribute access\n", 87 | "- Operator overloading\n", 88 | "- Function and method invocation\n", 89 | "- Object creation and destruction\n", 90 | "- String representation and formatting\n", 91 | "- Managed contexts (i.e., with blocks)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": { 97 | "slideshow": { 98 | "slide_type": "slide" 99 | } 100 | }, 101 | "source": [ 102 | "## A Pythonic Card Deck" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 11, 108 | "metadata": { 109 | "slideshow": { 110 | "slide_type": "subslide" 111 | } 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "import collections\n", 116 | "\n", 117 | "Card = collections.namedtuple('Card', ['rank', 'suit'])\n", 118 | "\n", 119 | "class FrenchDeck:\n", 120 | " ranks = [str(n) for n in range(2, 11)] + list('JQKA')\n", 121 | " suits = 'spades diamonds clubs hearts'.split()\n", 122 | "\n", 123 | " def __init__(self):\n", 124 | " self._cards = [Card(rank, suit) for suit in self.suits\n", 125 | " for rank in self.ranks]\n", 126 | "\n", 127 | " def __len__(self):\n", 128 | " return len(self._cards)\n", 129 | "\n", 130 | " def __getitem__(self, position):\n", 131 | " return self._cards[position]" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": { 137 | "slideshow": { 138 | "slide_type": "fragment" 139 | } 140 | }, 141 | "source": [ 142 | "The first thing to note is the use of `collections.namedtuple` to construct a simple class to represent individual cards. 
\n", 143 | "\n", 144 | "Since Python 2.6, `namedtuple` can be used to build classes of objects that are just bundles of attributes with no custom methods, like a database record. \n", 145 | "\n", 146 | "In the example, we use it to provide a nice representation for the cards in the deck:" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 2, 152 | "metadata": { 153 | "slideshow": { 154 | "slide_type": "subslide" 155 | } 156 | }, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "Card(rank='7', suit='diamonds')" 162 | ] 163 | }, 164 | "execution_count": 2, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "beer_card = Card('7', 'diamonds')\n", 171 | "beer_card" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": { 177 | "slideshow": { 178 | "slide_type": "fragment" 179 | } 180 | }, 181 | "source": [ 182 | "But the point of this example is the `FrenchDeck` class. \n", 183 | "\n", 184 | "It’s short, but it packs a punch. 
\n", 185 | "\n", 186 | "##### Length of a Deck\n", 187 | "\n", 188 | "First, like any standard Python collection, a deck responds to the `len()` function by returning the number of cards in it:" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 4, 194 | "metadata": { 195 | "slideshow": { 196 | "slide_type": "fragment" 197 | } 198 | }, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/plain": [ 203 | "52" 204 | ] 205 | }, 206 | "execution_count": 4, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "deck = FrenchDeck()\n", 213 | "len(deck)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": { 219 | "slideshow": { 220 | "slide_type": "subslide" 221 | } 222 | }, 223 | "source": [ 224 | "##### Reading Specific Cards" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": { 230 | "slideshow": { 231 | "slide_type": "fragment" 232 | } 233 | }, 234 | "source": [ 235 | "Reading specific cards from the deck—say, the first or the last—should be as easy as `deck[0]` or `deck[-1]`, and this is what the `__getitem__` method provides:" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 5, 241 | "metadata": { 242 | "slideshow": { 243 | "slide_type": "fragment" 244 | } 245 | }, 246 | "outputs": [ 247 | { 248 | "data": { 249 | "text/plain": [ 250 | "Card(rank='2', suit='spades')" 251 | ] 252 | }, 253 | "execution_count": 5, 254 | "metadata": {}, 255 | "output_type": "execute_result" 256 | } 257 | ], 258 | "source": [ 259 | "deck[0]" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 6, 265 | "metadata": { 266 | "slideshow": { 267 | "slide_type": "fragment" 268 | } 269 | }, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "Card(rank='A', suit='hearts')" 275 | ] 276 | }, 277 | "execution_count": 6, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | 
"source": [ 283 | "deck[-1]" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": { 289 | "slideshow": { 290 | "slide_type": "subslide" 291 | } 292 | }, 293 | "source": [ 294 | "##### Picking Random Card\n", 295 | "\n", 296 | "Should we create a method to pick a random card? **No need**. \n", 297 | "\n", 298 | "Python already has a function to get a random item from a sequence: `random.choice`. We can just use it on a deck instance:" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 7, 304 | "metadata": { 305 | "slideshow": { 306 | "slide_type": "fragment" 307 | } 308 | }, 309 | "outputs": [ 310 | { 311 | "data": { 312 | "text/plain": [ 313 | "Card(rank='Q', suit='clubs')" 314 | ] 315 | }, 316 | "execution_count": 7, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "from random import choice\n", 323 | "choice(deck)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 8, 329 | "metadata": { 330 | "slideshow": { 331 | "slide_type": "fragment" 332 | } 333 | }, 334 | "outputs": [ 335 | { 336 | "data": { 337 | "text/plain": [ 338 | "Card(rank='A', suit='spades')" 339 | ] 340 | }, 341 | "execution_count": 8, 342 | "metadata": {}, 343 | "output_type": "execute_result" 344 | } 345 | ], 346 | "source": [ 347 | "choice(deck)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 9, 353 | "metadata": { 354 | "slideshow": { 355 | "slide_type": "fragment" 356 | } 357 | }, 358 | "outputs": [ 359 | { 360 | "data": { 361 | "text/plain": [ 362 | "Card(rank='K', suit='spades')" 363 | ] 364 | }, 365 | "execution_count": 9, 366 | "metadata": {}, 367 | "output_type": "execute_result" 368 | } 369 | ], 370 | "source": [ 371 | "choice(deck)" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": { 377 | "slideshow": { 378 | "slide_type": "subslide" 379 | } 380 | }, 381 | "source": [ 382 | "### First Impressions:\n", 383 | 
"\n", 384 | "We’ve just seen two advantages of using special methods to leverage the Python data model:\n", 385 | "\n", 386 | "- The users of your classes don’t have to memorize arbitrary method names for standard operations \n", 387 | "(“How to get the number of items? Is it .size(), .length(), or what?”).\n", 388 | "\n", 389 | "- It’s easier to benefit from the rich Python standard library and avoid reinventing the wheel, like the `random.choice` function." 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": { 395 | "slideshow": { 396 | "slide_type": "subslide" 397 | } 398 | }, 399 | "source": [ 400 | "**... but it gets better ...**\n", 401 | "\n", 402 | "Because our `__getitem__` delegates to the `[]` operator of `self._cards`, our deck automatically supports **slicing**. \n", 403 | "\n", 404 | "Here’s how we look at the top three cards from a brand new deck, and then pick just the aces by starting on index 12 and skipping 13 cards at a time:\n", 405 | "\n" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 10, 411 | "metadata": { 412 | "slideshow": { 413 | "slide_type": "fragment" 414 | } 415 | }, 416 | "outputs": [ 417 | { 418 | "data": { 419 | "text/plain": [ 420 | "[Card(rank='2', suit='spades'),\n", 421 | " Card(rank='3', suit='spades'),\n", 422 | " Card(rank='4', suit='spades')]" 423 | ] 424 | }, 425 | "execution_count": 10, 426 | "metadata": {}, 427 | "output_type": "execute_result" 428 | } 429 | ], 430 | "source": [ 431 | "deck[:3]" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 11, 437 | "metadata": { 438 | "slideshow": { 439 | "slide_type": "fragment" 440 | } 441 | }, 442 | "outputs": [ 443 | { 444 | "data": { 445 | "text/plain": [ 446 | "[Card(rank='A', suit='spades'),\n", 447 | " Card(rank='A', suit='diamonds'),\n", 448 | " Card(rank='A', suit='clubs'),\n", 449 | " Card(rank='A', suit='hearts')]" 450 | ] 451 | }, 452 | "execution_count": 11, 453 | "metadata": {}, 454 | 
"output_type": "execute_result" 455 | } 456 | ], 457 | "source": [ 458 | "deck[12::13]" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": { 464 | "slideshow": { 465 | "slide_type": "subslide" 466 | } 467 | }, 468 | "source": [ 469 | "Just by implementing the `__getitem__` special method, our deck is also **iterable**:\n" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 12, 475 | "metadata": { 476 | "slideshow": { 477 | "slide_type": "fragment" 478 | } 479 | }, 480 | "outputs": [ 481 | { 482 | "name": "stdout", 483 | "output_type": "stream", 484 | "text": [ 485 | "Card(rank='2', suit='spades')\n", 486 | "Card(rank='3', suit='spades')\n", 487 | "Card(rank='4', suit='spades')\n", 488 | "Card(rank='5', suit='spades')\n", 489 | "Card(rank='6', suit='spades')\n", 490 | "Card(rank='7', suit='spades')\n", 491 | "Card(rank='8', suit='spades')\n", 492 | "Card(rank='9', suit='spades')\n", 493 | "Card(rank='10', suit='spades')\n", 494 | "Card(rank='J', suit='spades')\n", 495 | "Card(rank='Q', suit='spades')\n", 496 | "Card(rank='K', suit='spades')\n", 497 | "Card(rank='A', suit='spades')\n", 498 | "Card(rank='2', suit='diamonds')\n", 499 | "Card(rank='3', suit='diamonds')\n", 500 | "Card(rank='4', suit='diamonds')\n", 501 | "Card(rank='5', suit='diamonds')\n", 502 | "Card(rank='6', suit='diamonds')\n", 503 | "Card(rank='7', suit='diamonds')\n", 504 | "Card(rank='8', suit='diamonds')\n", 505 | "Card(rank='9', suit='diamonds')\n", 506 | "Card(rank='10', suit='diamonds')\n", 507 | "Card(rank='J', suit='diamonds')\n", 508 | "Card(rank='Q', suit='diamonds')\n", 509 | "Card(rank='K', suit='diamonds')\n", 510 | "Card(rank='A', suit='diamonds')\n", 511 | "Card(rank='2', suit='clubs')\n", 512 | "Card(rank='3', suit='clubs')\n", 513 | "Card(rank='4', suit='clubs')\n", 514 | "Card(rank='5', suit='clubs')\n", 515 | "Card(rank='6', suit='clubs')\n", 516 | "Card(rank='7', suit='clubs')\n", 517 | "Card(rank='8', suit='clubs')\n", 518 | 
"Card(rank='9', suit='clubs')\n", 519 | "Card(rank='10', suit='clubs')\n", 520 | "Card(rank='J', suit='clubs')\n", 521 | "Card(rank='Q', suit='clubs')\n", 522 | "Card(rank='K', suit='clubs')\n", 523 | "Card(rank='A', suit='clubs')\n", 524 | "Card(rank='2', suit='hearts')\n", 525 | "Card(rank='3', suit='hearts')\n", 526 | "Card(rank='4', suit='hearts')\n", 527 | "Card(rank='5', suit='hearts')\n", 528 | "Card(rank='6', suit='hearts')\n", 529 | "Card(rank='7', suit='hearts')\n", 530 | "Card(rank='8', suit='hearts')\n", 531 | "Card(rank='9', suit='hearts')\n", 532 | "Card(rank='10', suit='hearts')\n", 533 | "Card(rank='J', suit='hearts')\n", 534 | "Card(rank='Q', suit='hearts')\n", 535 | "Card(rank='K', suit='hearts')\n", 536 | "Card(rank='A', suit='hearts')\n" 537 | ] 538 | } 539 | ], 540 | "source": [ 541 | "for card in deck:\n", 542 | " print(card)" 543 | ] 544 | }, 545 | { 546 | "cell_type": "markdown", 547 | "metadata": { 548 | "slideshow": { 549 | "slide_type": "subslide" 550 | } 551 | }, 552 | "source": [ 553 | "... also in **reverse order**:\n", 554 | "\n", 555 | "```python \n", 556 | "for card in reversed(deck):\n", 557 | " print(card)\n", 558 | " \n", 559 | "...\n", 560 | "```" 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": { 566 | "slideshow": { 567 | "slide_type": "subslide" 568 | } 569 | }, 570 | "source": [ 571 | "## To know more...\n", 572 | "\n", 573 | "To have a more complete overview of the Magic (_dunder_) methods, please have a look at the [Extra - Magic Methods and Operator Overloading](Extra - Magic Methods and Operator Overloading.ipynb) notebook." 
574 | ] 575 | }, 576 | { 577 | "cell_type": "markdown", 578 | "metadata": { 579 | "slideshow": { 580 | "slide_type": "slide" 581 | } 582 | }, 583 | "source": [ 584 | "---\n", 585 | "\n", 586 | "## Exercise: Emulating Numeric Types" 587 | ] 588 | }, 589 | { 590 | "cell_type": "markdown", 591 | "metadata": { 592 | "slideshow": { 593 | "slide_type": "subslide" 594 | } 595 | }, 596 | "source": [ 597 | "```python \n", 598 | "from math import hypot\n", 599 | "\n", 600 | "class Vector:\n", 601 | "\n", 602 | " def __init__(self, x=0, y=0):\n", 603 | " self.x = x\n", 604 | " self.y = y\n", 605 | "\n", 606 | " def __repr__(self):\n", 607 | " return 'Vector(%r, %r)' % (self.x, self.y)\n", 608 | "\n", 609 | " def __abs__(self):\n", 610 | " return hypot(self.x, self.y)\n", 611 | "\n", 612 | " def __bool__(self):\n", 613 | " return bool(abs(self))\n", 614 | "\n", 615 | " def __add__(self, other):\n", 616 | " x = self.x + other.x\n", 617 | " y = self.y + other.y\n", 618 | " return Vector(x, y)\n", 619 | "\n", 620 | " def __mul__(self, scalar):\n", 621 | " return Vector(self.x * scalar, self.y * scalar)\n", 622 | "```" 623 | ] 624 | } 625 | ], 626 | "metadata": { 627 | "kernelspec": { 628 | "display_name": "Python 3", 629 | "language": "python", 630 | "name": "python3" 631 | }, 632 | "language_info": { 633 | "codemirror_mode": { 634 | "name": "ipython", 635 | "version": 3 636 | }, 637 | "file_extension": ".py", 638 | "mimetype": "text/x-python", 639 | "name": "python", 640 | "nbconvert_exporter": "python", 641 | "pygments_lexer": "ipython3", 642 | "version": "3.7.6" 643 | } 644 | }, 645 | "nbformat": 4, 646 | "nbformat_minor": 2 647 | } 648 | --------------------------------------------------------------------------------