├── .gitignore ├── LICENSE ├── Pipfile ├── Pipfile.lock ├── README.md ├── lecture00-intro ├── README.md ├── getting_started.ipynb └── setup.md ├── lecture01-coding-basics ├── README.md └── coding_basics.ipynb ├── lecture02-basic-structures ├── README.md ├── basic_structures.ipynb └── sample_module.py ├── lecture03-data-IO ├── Linux_2k.log ├── README.md ├── city_names.txt ├── data_IO.ipynb └── message.txt ├── lecture04-pandas-basics ├── 01_pandas_basics.ipynb ├── 02_pandas_data_munging.ipynb ├── README.md └── hotel_vienna_restricted.csv ├── lecture05-graphs-basics ├── 01_plotnine_intro.ipynb ├── 02_matplotlib_intro.ipynb └── README.md ├── lecture06-conditionals ├── README.md └── conditionals_and_control_flows.ipynb ├── lecture07-data-exploration ├── README.md └── data_exploration.ipynb ├── lecture08-functions ├── README.md └── functions.ipynb ├── lecture09-exception-handling ├── README.md └── exception_handling.ipynb ├── lecture10-intro-to-regression ├── README.md └── intro_to_regression.ipynb ├── lecture11-feature-engineering ├── 01_feature_engineering_wms.ipynb ├── 02_feature_engineering_bisnode.ipynb └── README.md ├── lecture12-simple-linear-regression ├── 00_life_exp_get_data.ipynb ├── 01_life_exp_clean.ipynb ├── 02_life_exp_analysis.ipynb ├── README.md └── data │ └── WDI_lifeexp_raw.csv ├── lecture13-advanced-linear-regression ├── README.md └── hotels_advanced_regression.ipynb ├── lecture14-binary-models ├── README.md └── binary_models.ipynb ├── lecture15-datetime ├── 01_datetime_basics.ipynb ├── 02_datetime_manipulations.ipynb └── README.md ├── lecture16-timeseries-regression ├── README.md └── intro_time_series.ipynb ├── lecture17-basic-spatial-viz ├── 01_spatial_datavisualisation.ipynb ├── 02_spatial_datavisualisation.ipynb ├── 03_spatial_datavisualisation_plotly.ipynb ├── README.md ├── data_map │ ├── BEZIRKSGRENZEOGDPolygon.dbf │ ├── BEZIRKSGRENZEOGDPolygon.shp │ ├── BEZIRKSGRENZEOGDPolygon.shx │ ├── London_Borough_Excluding_MHW.dbf │ ├── London_Borough_Excluding_MHW.shp │ ├── London_Borough_Excluding_MHW.shx │ └── worldmap.csv └── output │ ├── heu_prices.png │ └── lifeexp.png ├── lecture18-cross-validation ├── README.md └── crossvalidation_usedcars.ipynb ├── lecture19-lasso ├── 01_lasso_airbnb_data_prep.ipynb ├── 02_lasso_airbnb_prediction.ipynb └── README.md ├── lecture20-regression-tree ├── 01_usedcars_cart_data_preparation.ipynb ├── 02_usedcars_cart_prediction.ipynb └── README.md ├── lecture21-random-forest ├── 00_download_model_fits.ipynb ├── 01_prepare_airbnb.ipynb ├── 02_random_forest_airbnb.ipynb └── README.md ├── lecture22-classification ├── README.md ├── data │ └── bisnode_firms_clean.csv ├── firm_exit_classification.ipynb └── helper_functions.py ├── lecture23-long-term-time-series ├── README.md └── long_term_swimming.ipynb ├── lecture24-short-term-time-series ├── README.md └── short_term_priceindex.ipynb └── lecture25-matplotlib-vs-plotnine ├── README.md ├── helper_functions.py ├── life_expectancy_gdp_matplotlib.ipynb └── life_expectancy_gdp_plotnine.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Data folder 10 | data/ 11 | model_fits/ 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | 
*.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Gabors Data Analysis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | arch = "==5.3.1" 8 | black = "==22.8.0" 9 | fredapi = "==0.5.0" 10 | geopandas = "==0.11.1" 11 | jupyter = "==1.0.0" 12 | jupyter-contrib-nbextensions = "==0.5.1" 13 | matplotlib = "==3.5.0" 14 | numpy = "==1.23.3" 15 | pandas = "==1.5.0" 16 | pandas-market-calendars = "==4.0" 17 | patchworklib = "==0.4.7" 18 | plotly = "==5.10.0" 19 | plotnine = "==0.9" 20 | pmdarima = "==2.0.1" 21 | prophet = "==1.0" 22 | pycountry-convert = "==0.7.2" 23 | pydotplus = "==2.0.2" 24 | pystan = "==2.19.1.1" 25 | pyzmq = "==19.0.2" 26 | scikit-learn = "==1.1.2" 27 | scikit-misc = "==0.1.4" 28 | seaborn = "==0.12.0" 29 | shap = "==0.41.0" 30 | skimpy = "==0.0.6" 31 | stargazer = "==0.0.5" 32 | statsmodels = "==0.13.2" 33 | wbdata = "==0.3.0" 34 | xgboost = "==1.6.2" 35 | 36 | [dev-packages] 37 | 38 | [requires] 39 | python_version = "3.8" 40 | -------------------------------------------------------------------------------- /lecture00-intro/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 00: Introduction to Python and Jupyter notebook 2 | 3 | ## Motivation 4 | 5 | In this course, we focus on Python and Jupyter Notebook. This means you won’t learn anything about R, Julia, or any other programming language useful for data science. They’re also excellent choices, and in practice, most data science teams use a mix of languages, often at least Python and R. 6 | 7 | ## This lecture 8 | 9 | This is the starting lecture, which introduces students to Python and Jupyter Notebook (download and install), installs the virtual environment, runs a pre-written script, and highlights the importance of version control. 10 | 11 | The aim of this class is not to teach coding, but to make sure that everybody has Python, Jupyter Notebook and the virtual environment installed on their laptop. The main aim of these steps is to reveal possible OS mismatches or other problems with Python, Jupyter Notebook, or the virtual environment. 12 | The material and steps are detailed in [`getting_started.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture00-intro/getting_started.ipynb) and [`setup.md`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture00-intro/setup.md). 13 | 14 | 15 | ## Learning outcomes 16 | After successfully teaching the material, students will have 17 | 18 | - Python and Jupyter Notebook on their laptops/computers 19 | 20 | and understand, 21 | 22 | - What Jupyter Notebook looks like, and which window is which. 23 | - How to run a command via the console using VS Code. 24 | - What packages are, and how to install and load them into the virtual environment. 25 | - Why version control is important and what the main possibilities are with Git and GitHub. 26 | 27 | These steps are extremely important, as fixing installation problems may take days to weeks. 28 | 29 | ## Datasets used 30 | * No dataset is used in this lecture 31 | 32 | ## Lecture Time 33 | 34 | Ideal overall time: **20-30 mins**. 35 | 36 | It can substantially differ from this if the teacher decides to do a live coding session with students and fixes the emerging problems during the class (up to ~90 mins). 37 | 38 | ## Homework 39 | 40 | No homework, apart from fixing possible issues.
41 | 42 | ## Further material 43 | - Jupyter notebook [guide](https://www.dataquest.io/blog/jupyter-notebook-tutorial/) 44 | - Git references: 45 | - [Technical foundations of informatics book](https://info201.github.io/git-basics.html) 46 | - [Software carpentry course](https://swcarpentry.github.io/git-novice/) (Strongly recommended) 47 | - [Github Learning Lab](https://lab.github.com/) 48 | - [If you are really committed](https://git-scm.com/book/en/v2) (pun intended) 49 | -------------------------------------------------------------------------------- /lecture00-intro/getting_started.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "b53567b6", 6 | "metadata": {}, 7 | "source": [ 8 | "### Lecture 0\n", 9 | "\n", 10 | "- Setting up the environment \n", 11 | "- Basic terminology \n", 12 | "- Using Jupyter notebook \n", 13 | "- Using VScode \n", 14 | "- Running script from VScode" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "id": "70a0337e", 20 | "metadata": {}, 21 | "source": [ 22 | "## Jupyter notebooks\n", 23 | "This file - a Jupyter notebook - does not follow the standard pattern with Python code in a text file. Instead, a Jupyter notebook is stored as a file in the [JSON](http://en.wikipedia.org/wiki/JSON) format. The advantage is that we can mix formatted text, Python code and code output. It requires the Jupyter notebook server to run it though, and therefore isn't a stand-alone Python program as described above. Other than that, there is no difference between the Python code that goes into a program file or a Jupyter notebook.\n", 24 | "We will return to JSON files later, when we will work with dictionaries, and advanced data structures." 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "c5616878", 30 | "metadata": {}, 31 | "source": [ 32 | "## Getting familiar with the interface" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "a2b8f8a3", 38 | "metadata": {}, 39 | "source": [ 40 | "There are two fairly prominent terms that you should notice, which are probably new to you: cells and kernels are key both to understanding Jupyter and to what makes it more than just a word processor. Fortunately, these concepts are not difficult to understand.\n", 41 | "\n", 42 | "- A kernel is a “computational engine” that executes the code contained in a notebook document.\n", 43 | "- A cell is a container for text to be displayed in the notebook or code to be executed by the notebook’s kernel." 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "id": "7008f1be", 49 | "metadata": {}, 50 | "source": [ 51 | "### Cells\n", 52 | "We’ll return to kernels a little later, but first let’s come to grips with cells.\n", 53 | "\n", 54 | "- A code cell contains code to be executed in the kernel. 
When the code is run, the notebook displays the output below the code cell that generated it.\n", 55 | "- A Markdown cell contains text formatted using Markdown and displays its output in-place when the Markdown cell is run.\n", 56 | "\n", 57 | "The first cell in a new notebook is always a code cell.\n", 58 | "\n", 59 | "Let’s test it out with a classic hello world example: Type `print(\"Hello World!\")` into the cell and click the Run button in the toolbar above or press Ctrl + Enter.\n", 60 | "\n", 61 | "The result should look like this:" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 1, 67 | "id": "ceb0aa58", 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | "Hello World!\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "print(\"Hello World!\")" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "id": "be2ac251", 85 | "metadata": {}, 86 | "source": [ 87 | "### Kernels\n", 88 | "Behind every notebook runs a kernel. When you run a code cell, that code is executed within the kernel. Any output is returned back to the cell to be displayed. The kernel’s state persists over time and between cells — it pertains to the document as a whole and not individual cells.\n", 89 | "\n" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "id": "7730964a", 95 | "metadata": {}, 96 | "source": [ 97 | "More on Jupyter Notebooks for beginners: https://www.dataquest.io/blog/jupyter-notebook-tutorial/" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "id": "5b1f1795", 103 | "metadata": {}, 104 | "source": [ 105 | "## Version control: Git and GitHub\n", 106 | "\n", 107 | "Version control is an essential part of coding. It ensures that you keep proper track of your progress when writing code for analysis or developing code/packages/materials etc. We strongly encourage you to create this good habit during this course. \n", 108 | "\n", 109 | "There are multiple ways to do version control, each of which has some benefits and drawbacks. Here we list the most commonly used ones:\n", 110 | "\n", 111 | "- [GitHub Desktop](https://desktop.github.com/) (and [friends](https://www.slant.co/options/13488/alternatives/~github-desktop-alternatives), eg. [VS Code](https://code.visualstudio.com/)):\n", 112 | " - Application to conveniently keep track of your modifications, commit, pull and push to GitHub\n", 113 | " - Pro: easy to use, flexible for all types of files, helps to avoid conflicts\n", 114 | " - Con: extra application, should create a habit and not forget about it :)\n", 115 | "\n", 116 | "- Shell/Terminal\n", 117 | " - Using shell or terminal for version control\n", 118 | " - Pro: flexible for all types of files, can do literally everything there\n", 119 | " - Con: hard to learn, can make mistakes, which are hard to correct\n", 120 | " \n", 121 | "It does not matter which one you use; the main thing is to use version control. It makes your life much easier, especially with complicated projects, where you have to test and try out different directions. [GitHub](https://github.com/) is a great platform to collaborate, however, there are others as well.\n", 122 | "\n", 123 | "In this course, we do not give an overview of how to do version control but assume the basics are known.
Some useful material (thanks to [Oliver Kiss](https://github.com/kiss-oliver)) can be found at:\n", 124 | "\n", 125 | " - Technical foundations of informatics book: https://info201.github.io/git-basics.html\n", 126 | " - Software carpentry course (Strongly recommended): https://swcarpentry.github.io/git-novice/\n", 127 | " - Github Learning Lab: https://lab.github.com/\n", 128 | " - If you are really committed (pun intended): https://git-scm.com/book/en/v2" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "id": "bce2a334", 134 | "metadata": {}, 135 | "source": [ 136 | "\n", 137 | "## VS Code: An IDE For Python Scripts\n", 138 | "\n", 139 | "While the course uses Jupyter notebooks, Python code in production environments is run from script files ending with a `.py` extension. Jupyter offers a simple text editor to write script files but most developers use some special software application, a so-called 'integrated development environment', or IDE, to write these scripts. \n", 140 | "\n", 141 | "From the many possible alternative IDEs we recommend Visual Studio Code, or VS Code, a free Microsoft tool for this purpose. It is a lightweight code editor with a myriad of possible extensions which enable VS Code to support basically *any* programming language. Beyond support for particular languages, VS Code also has solutions for things like version control, container management or cloud access. VS Code works equally well on Windows, macOS or Linux. \n", 142 | "\n", 143 | "VS Code can be downloaded from [here](https://code.visualstudio.com/) and tutorials can be accessed through the [documentation](https://code.visualstudio.com/docs).\n", 144 | "\n", 145 | "We recommend using the [Pylance](https://marketplace.visualstudio.com/items?itemName=ms-python.vscode-pylance) extension for Python projects but other options can be equally fine. The course does not cover the deployment of production-ready Python solutions, so VS Code is just an optional component of your toolkit for using Python later. \n", 146 | "\n", 147 | "\n", 148 | "\n", 149 | "## Appendix: A Primer On Virtual Environments\n", 150 | "\n", 151 | "A virtual environment is an isolated workspace for a particular project. In effect it is a directory structure which contains Python executable files and other files which tell Python the packages and their version numbers to use in that project. We set up this environment to make sure that all readers get exactly the same results when running the code snippets on the book's exercises. \n", 152 | "\n", 153 | "If you want to take a deep dive into Python's virtual environments, read [this](https://realpython.com/python-virtual-environments-a-primer/) detailed discussion of the topic. Beyond the documentation we refer to above, you can also get some more technical information about `pipenv` [here](https://pipenv-searchable.readthedocs.io/). \n", 154 | "\n", 155 | "Nevertheless, you don't need to be an expert on virtualenvs in order to be able to follow the course material.
\n" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "id": "032c4444", 161 | "metadata": {}, 162 | "source": [] 163 | } 164 | ], 165 | "metadata": { 166 | "kernelspec": { 167 | "display_name": "Python 3.10.0 64-bit", 168 | "language": "python", 169 | "name": "python3" 170 | }, 171 | "language_info": { 172 | "codemirror_mode": { 173 | "name": "ipython", 174 | "version": 3 175 | }, 176 | "file_extension": ".py", 177 | "mimetype": "text/x-python", 178 | "name": "python", 179 | "nbconvert_exporter": "python", 180 | "pygments_lexer": "ipython3", 181 | "version": "3.10.0" 182 | }, 183 | "vscode": { 184 | "interpreter": { 185 | "hash": "98590ff4fe04c8543246b2a01debd3de3c5ca9b666f43f1fa87d5110c692004c" 186 | } 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 5 191 | } 192 | -------------------------------------------------------------------------------- /lecture00-intro/setup.md: -------------------------------------------------------------------------------- 1 | # Set up environment 2 | ## Get Python 3 | 4 | 1. Install latest version of Python from the [official website](https://www.python.org/downloads/). **We used [version 3.8](https://www.python.org/downloads/release/python-3811/)** 5 | 6 | 2. We suggest to use [Jupyter Notebook](https://jupyter-notebook.readthedocs.io/en/stable/) to edit and run Python code. You can install it via `pip` by running `pip3 install jupyter` in your terminal/PowerShell. 7 | 8 | 9 | ## How to run case studies and coding class in Python 10 | 11 | 1. **Install `Pipenv`** 12 | 13 | We use [Pipenv](https://pipenv-fork.readthedocs.io/en/latest/index.html) for Python dependency management. First, install it via `pip` by running the following code in your terminal/PowerShell: 14 | 15 | ``` 16 | pip3 install pipenv 17 | ``` 18 | 19 | 2. **Create virtual environment and install required packages** 20 | 21 | Go to the `da-coding-python` folder to create a virtual environment and install packages by running the following code in your terminal/PowerShell: 22 | 23 | ``` 24 | pipenv sync 25 | ``` 26 | 27 | This installs the required Python version and packages stored in the `Pipfile.lock`. 28 | 29 | 30 | 31 | 3. **Run Jupyter Notebook** 32 | 33 | To start a Jupyter Notebook in this virtual environment, go to the `da-coding-python` folder and run the following code in your terminal/PowerShell: 34 | 35 | ``` 36 | pipenv run jupyter notebook 37 | ``` 38 | 39 | The jupyter environment should be opened on your default browser. You are good to go! 40 | 41 | **NOTE:** For Windows users, the above code chunks might result in an error, because the `pipenv` terminal shortcut sometimes does not install properly. In this case, run ```python3 -m pipenv sync``` and ```python3 -m pipenv run jupyter notebook```. -------------------------------------------------------------------------------- /lecture01-coding-basics/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 01: Coding basics 2 | 3 | ## Motivation 4 | 5 | In this lecture we jump into the very basics of Python. This is the beginning of a long journey, which, honestly, will never end. 6 | 7 | 8 | ## This lecture 9 | 10 | We start with general coding principles, how to name variables, why and how to comment scripts, and we give some formatting tips. Then we cover basic variable types, assignments and operators. We end with string manipulations, showing hands-on examples how to automate the composition of SQL query strings. 
By the time you go through [`coding_basics.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture01-coding-basics/coding_basics.ipynb) you will get a taste of things to come. 11 | 12 | 13 | ## Learning outcomes 14 | 15 | After successfully completing the learning material, students will be familiar with 16 | 17 | - general coding principles (for both Python and other languages) 18 | - the basic variable types: integers, floats and booleans 19 | - how to assign a value to a variable 20 | - how to do basic operations with these variables 21 | - how to manipulate strings, focusing on writing some simple SQL queries, the primary tool of any data professional. 22 | 23 | 24 | ## Datasets used 25 | * No dataset is used in this lecture 26 | 27 | 28 | ## Lecture Time 29 | 30 | Ideal overall time: **20-30 mins**. 31 | 32 | We tried to keep this part to the bare minimum as it is pretty straightforward. There will be many more examples of how to use variables of various types later in the course. 33 | 34 | 35 | ## Homework 36 | 37 | No homework for this lecture. 38 | 39 | 40 | ## Further material 41 | 42 | [The Zen of Python](https://peps.python.org/pep-0020/) 43 | -------------------------------------------------------------------------------- /lecture02-basic-structures/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 02: Basic structures 2 | 3 | ## Motivation 4 | 5 | Learning how to program is a long journey. The good news is that once you learn the very basics you can write your first programs. In these programs you may need some simple variables, but nothing more. As you move forward and want to write slightly more complex solutions, you will add more complexity to your data. Also, your solution may need other functionalities which need to be added to your workspace when your script runs. 6 | 7 | 8 | ## This lecture 9 | 10 | This lecture covers `collections` of data and their usage: 11 | 12 | - lists 13 | - tuples 14 | - sets 15 | - dictionaries. 16 | 17 | In addition, we introduce `JSON`, a lightweight format for storing and transferring data. 18 | 19 | Finally, we show how to import, use, and write `modules`. 20 | 21 | 22 | ## Learning outcomes 23 | 24 | After completing this [basic_structures.ipynb](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture02-basic-structures/basic_structures.ipynb) lecture, students will 25 | 26 | - understand the similarities and differences between the four Python collections 27 | - be able to inspect, search and modify lists 28 | - be able to iterate over these lists 29 | - be able to use lists in SQL script automation 30 | - understand how tuples work 31 | - be able to inspect dictionary items and select particular dictionary values by keys 32 | - be able to cast lists into sets 33 | - be able to do set operations 34 | - be able to cast JSON-format strings into dictionaries and vice versa 35 | - be able to import and use modules 36 | - write and import their own modules. 37 | 38 | 39 | ## Datasets used 40 | * No dataset is used in this lecture 41 | 42 | 43 | ## Lecture Time 44 | 45 | Ideal overall time: **30-40 mins**. 46 | 47 | 48 | ## Homework 49 | 50 | Create a dictionary. 51 | 52 | - Iterate through the keys and print the keys together with the values. 53 | - Try to print a value for an unknown key (which is not included in your dictionary) in two ways: 54 | 1. Your script throws an error when calling the unknown key. 55 | 2.
Your script prints a default value. 56 | - Cast your dictionary keys into lists. 57 | 58 | Create another dictionary whose keys overlap with the first one's but with different values. Try to merge the two. What do you see? 59 | 60 | 61 | ## Further material 62 | - [Official Python tutorial](https://docs.python.org/3/tutorial/datastructures.html) on lists, tuples, sets & dictionaries. 63 | 64 | -------------------------------------------------------------------------------- /lecture02-basic-structures/sample_module.py: -------------------------------------------------------------------------------- 1 | def print_hello(name): 2 | print(f'Hello {name}!') -------------------------------------------------------------------------------- /lecture03-data-IO/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 03: Data I/O 2 | 3 | ## Motivation 4 | 5 | Reading and writing files is a regular step in all software applications. All data containers (such as numpy arrays or Pandas data frames) have their own read and write operations, so we focus on reading text files in this lecture. 6 | 7 | 8 | ## This lecture 9 | 10 | This [short module](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture03-data-IO/data_IO.ipynb) shows you how to read text files from the file system, how to write and save files, and how to navigate in the file system using the `os` module. 11 | 12 | 13 | ## Learning outcomes 14 | 15 | After running the material's code chunks students will be able to 16 | 17 | - open a file for reading 18 | - read text files line by line 19 | - handle file encodings 20 | - write new files, and append to and overwrite existing files 21 | - get working directory information 22 | - list files in a directory 23 | - create operating system-specific file path strings 24 | - create a new directory with Python 25 | 26 | 27 | ## Datasets used 28 | * No dataset is used in this lecture 29 | 30 | 31 | ## Lecture Time 32 | 33 | Ideal overall time: **10 mins**. 34 | 35 | 36 | ## Homework 37 | 38 | Explore the directory of your Python environment (from which you are running this code) using the `os` module. List the content of your directory and create a new subfolder there. Write a short text into a new text file saved in this directory. Read the [docs](https://docs.python.org/3/library/os.html) on how to delete files and directories, and delete this new directory and the file you have just saved there.
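If you want to see the shape of the calls involved, here is a minimal, hedged sketch (the folder and file names, `practice_dir` and `note.txt`, are placeholders chosen only for illustration):

```python
import os

cwd = os.getcwd()                              # current working directory
print(os.listdir(cwd))                         # list its contents

new_dir = os.path.join(cwd, "practice_dir")    # OS-specific path
os.mkdir(new_dir)                              # create the subfolder

file_path = os.path.join(new_dir, "note.txt")
with open(file_path, "w", encoding="utf-8") as f:
    f.write("A short text saved from Python.")

os.remove(file_path)                           # delete the file first ...
os.rmdir(new_dir)                              # ... then the (now empty) directory
```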
39 | 40 | -------------------------------------------------------------------------------- /lecture03-data-IO/city_names.txt: -------------------------------------------------------------------------------- 1 | Český Krumlov, Pécs, Kraków -------------------------------------------------------------------------------- /lecture03-data-IO/data_IO.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "68ce080e-8ee3-4120-b63e-d65731319404", 6 | "metadata": {}, 7 | "source": [ 8 | "# Lecture 3\n", 9 | "\n", 10 | "## I/O (Reading From and Writing To Files) \n", 11 | "\n", 12 | "- [reading](#reading)\n", 13 | "- [writing](#writing)\n", 14 | "\n", 15 | "## Navigating The File System \n", 16 | "\n", 17 | "- using the [`os` module](#os)\n", 18 | "----" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "2c3950bf-f26d-40ba-903c-ffaf11d56988", 24 | "metadata": {}, 25 | "source": [ 26 | "## I/O" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "51631150-3331-4848-a397-e132a9878da6", 32 | "metadata": {}, 33 | "source": [ 34 | "### Reading " 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "8258ef5e-3455-4a0c-94ee-6f2f6574d2f4", 40 | "metadata": {}, 41 | "source": [ 42 | "Before any file operation we need to `open` the file." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "id": "51d8cfcb-366a-477d-beba-9a55ab7a2d04", 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "f = open('Linux_2k.log')" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "id": "6543d7be-ef7c-40ea-9607-32a24e03863b", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "print(f)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "id": "04eed29a-dc05-4dc6-b4ad-92d7014fbb85", 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "f.read()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "df58897a-973d-4f27-bb0e-d828241babe9", 78 | "metadata": {}, 79 | "source": [ 80 | "You also need to `close` the file, otherwise your program will not allow other programs to access it." 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "id": "f6bc5c6d-e35f-4152-8f0f-3db9533d67ee", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "f.close()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "4bfa0064-1ee7-49ec-8cef-2f255cc78a9f", 96 | "metadata": {}, 97 | "source": [ 98 | "Note: We are using a system log example from the [Loghub](https://github.com/logpai/loghub) repository. The relevant documentation can be found on [arxiv.org](https://arxiv.org/abs/2008.06448)." 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "id": "00fac8ee-7666-4ff7-9152-1b1dca73e71e", 104 | "metadata": {}, 105 | "source": [ 106 | "You can also add *encoding information* to the `open()` method to avoid the mess with funny characters. 
" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "id": "d3bc00aa-2a92-4fdc-8cb8-b4f86c87fabe", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "f = open('city_names.txt')\n", 117 | "f.read()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "52d4e448-4f04-4c94-9655-abfe3bc09d86", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "f.close()" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "5f0c3f04-b3bf-433d-993b-5cfe6e2c3c05", 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "f = open('city_names.txt', encoding = 'utf-8')\n", 138 | "f.read()" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "id": "4033b6e3-24a2-41a0-9d70-84db73c4d953", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "f.close()" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "id": "e9ff1cca-0bea-4137-bfb2-5526522a5f98", 154 | "metadata": {}, 155 | "source": [ 156 | "You can find encoding options for all languages and character sets in the documentation of the [codecs module](https://docs.python.org/3/library/codecs.html#standard-encodings). " 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "id": "a78f4683-5484-48ea-b425-2692e63f093a", 162 | "metadata": {}, 163 | "source": [ 164 | "Multiline text can be also be read sequentially." 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "id": "ef14caef-b018-49c6-9eb0-05372fb1a0e4", 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "f = open('Linux_2k.log')" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "id": "2d9d4ef2-74fd-4b8e-9d02-1d7451cd3d23", 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "f.readline()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "id": "e1114b95-fad7-453c-84ef-7021f427b500", 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "f.readline()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "id": "4736993f-2c12-42f7-a486-6a50784b0be6", 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "f.close()" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "id": "8947bdfa-ba99-45ab-98ba-3081ef54d9b4", 210 | "metadata": {}, 211 | "source": [ 212 | "The best way to read and write file is by using the `with` statement. This ensures that the file is closed when the block inside the with statement is exited. We don't need to explicitly call the `close()` method, it is done internally." 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "id": "4aa35b7a-f047-4d60-b604-37e9514a6ea8", 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "with open(\"Linux_2k.log\", encoding=\"utf-8\") as f:\n", 223 | " for line in f: # remember to indent!\n", 224 | " print(line)\n", 225 | "\n", 226 | "# After the operation the connection to the file is closed." 
227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "id": "8bf19b9c-2ecc-4940-8647-978653884b06", 232 | "metadata": {}, 233 | "source": [ 234 | "### Writing " 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "id": "127c154e-84d9-4938-acee-8ece7647f85e", 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "with open(file = 'message.txt', mode = 'w', encoding = 'utf-8') as write_text:\n", 245 | " write_text.write('Hello Monthy! \\nThis is Python class on file I/O.')" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "id": "5b747e36-1232-447b-9e75-1e47b3708675", 251 | "metadata": {}, 252 | "source": [ 253 | "There are four ways to open a file:\n", 254 | "- \"r\" - Read - Default value. Opens a file for reading, error if the file does not exist\n", 255 | "- \"a\" - Append - Opens a file for appending, creates the file if it does not exist\n", 256 | "- \"w\" - Write - Opens a file for writing, creates the file if it does not exist\n", 257 | "- \"x\" - Create - Creates the specified file, returns an error if the file exists" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "id": "bb263a87-dc6c-40e1-9933-5ee5e3e7bc7e", 263 | "metadata": {}, 264 | "source": [ 265 | "## Navigating The File System " 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "id": "d31e62c0-8375-4f92-99cf-7c984a183dce", 271 | "metadata": {}, 272 | "source": [ 273 | "One way to navigate in your file system is by using the `os` module. This module provides methods for getting directory info, creating and deleting folders, listing files, etc. " 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "id": "26233f46-e530-440a-8ffd-f8ab8f43d90e", 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "import os" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "id": "a91f70f6-b83a-443b-a85e-600391803ee1", 289 | "metadata": {}, 290 | "source": [ 291 | "`getcwd()` will give you your current working directory, and `listdir()` lists the file in the directory of your choice. (If you don't give the 'path' parameter as input it will list the files in your current working directory.)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "id": "dccb38e4-21eb-47fc-9519-31e83500b74a", 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "current_directory = os.getcwd()" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "id": "7d9b6be5-b656-4350-9733-56a805bb10ab", 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "files = os.listdir(current_directory)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "id": "e9d0aae3-8035-4e27-a738-0dff1e07164b", 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "print(files)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "id": "10f5c962-a32a-480b-8e14-1541a856f102", 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "type(files)" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "id": "69487636-8810-4b03-b0da-d94b045307e6", 337 | "metadata": {}, 338 | "source": [ 339 | "The `os`module uses Linux commands to interact with the file system. `mkdir()` will create a new directory, and `path.join()` is used to define new paths. 
Note that the `path.join()` method uses the appropriate directory separators, depending on your operating system. (Forward slashes for Linux and macOS, double backslashes for Windows.)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "id": "3f1187ef-16f1-49f9-a347-a823e05a55e9", 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "path = \"C:\\\\Users\\\\\" # win\n", 350 | "path = \"/Users/\" # mac os\n", 351 | "\n", 352 | "# Join various path components\n", 353 | "os.path.join(path, \"Documents\", \"Python_classes\", \"\")" 354 | ] 355 | } 356 | ], 357 | "metadata": { 358 | "kernelspec": { 359 | "display_name": "Python 3 (ipykernel)", 360 | "language": "python", 361 | "name": "python3" 362 | }, 363 | "language_info": { 364 | "codemirror_mode": { 365 | "name": "ipython", 366 | "version": 3 367 | }, 368 | "file_extension": ".py", 369 | "mimetype": "text/x-python", 370 | "name": "python", 371 | "nbconvert_exporter": "python", 372 | "pygments_lexer": "ipython3", 373 | "version": "3.8.10" 374 | } 375 | }, 376 | "nbformat": 4, 377 | "nbformat_minor": 5 378 | } 379 | -------------------------------------------------------------------------------- /lecture03-data-IO/message.txt: -------------------------------------------------------------------------------- 1 | Hello Monthy! 2 | This is Python class on file I/O. -------------------------------------------------------------------------------- /lecture04-pandas-basics/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 04: Data Munging with pandas 2 | 3 | ## Motivation 4 | 5 | Before analyzing the data, data analysts spend a lot of time organizing, managing, and cleaning it to prepare it for analysis. This is called data wrangling or data munging. It is often said that 80 percent of data analysis time is spent on these tasks. Data wrangling is an iterative process: we usually start by organizing and cleaning our data, then start doing the analysis, and then go back to the cleaning process as problems emerge during analysis. 6 | 7 | Here we introduce students to a (relatively) easy way of carrying out this task and use the case study of [finding a good deal among hotels](https://gabors-data-analysis.com/casestudies/#ch02a-finding-a-good-deal-among-hotels-data-preparation). After the initial data preparation, the case study continues to work towards finding hotels that are underpriced relative to their location and quality. In this lecture, we illustrate how to find problems with observations and variables and how to solve those problems. 8 | 9 | ## This lecture 10 | 11 | 12 | This lecture introduces `pandas` and its `DataFrame`, the main data structure for handling tabular data in Python. It shows multiple column and row manipulations on a single DataFrame, and also introduces students to manipulating raw data in various ways with `pandas`. 13 | 14 | This lecture is based on [Chapter 02, A: Finding a good deal among hotels: data preparation](https://gabors-data-analysis.com/casestudies/#ch02a-finding-a-good-deal-among-hotels-data-preparation).
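To give a flavour of the kind of manipulations covered, here is a small, hedged sketch on a toy DataFrame (the column names and values are invented and only stand in for the real hotels data):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "hotel_id": [1, 2, 2, 3],
        "price": ["120", "95", "95", "210"],   # stored as strings on purpose
        "city": ["Vienna", "Vienna", "Vienna", "London"],
    }
)

df["price"] = pd.to_numeric(df["price"])       # character to numeric
df = df.drop_duplicates()                      # handle duplicates
vienna = (
    df.loc[df["city"] == "Vienna"]             # filter observations
    .sort_values("price")                      # sort ascending by price
)
print(vienna)
```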
15 | 16 | 17 | ## Learning outcomes 18 | After successfully completing [`01_pandas_basics.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture04-pandas-basics/01_pandas_basics.ipynb), students should be able to: 19 | 20 | - create pandas `Series` 21 | - create pandas `DataFrames` from `Series`, dictionaries, lists 22 | - access data in a `DataFrame` with `loc` and `iloc` 23 | - reset index 24 | - rename columns 25 | - access metadata of `DataFrame`s 26 | 27 | After successfully completing [`02_pandas_data_munging.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture04-pandas-basics/02_pandas_data_munging.ipynb), students should be able to: 28 | 29 | - add variables 30 | - separate a character variable into two (or more) variables 31 | - convert different types of variables to specific types: 32 | - character to numeric 33 | - string manipulations in pandas Series 34 | - filter out different observations 35 | - select observations with specific values 36 | - tabulate different values of a variable 37 | - filter out missing values 38 | - replace specific values with others 39 | - handle duplicates 40 | - use pipes to do multiple manipulations at once 41 | - sort data ascending or descending according to a specific variable 42 | 43 | ## Datasets used 44 | * [Hotels Europe](https://gabors-data-analysis.com/datasets/#hotels-europe) 45 | 46 | 47 | ## Lecture Time 48 | 49 | Ideal overall time: **60 mins**. 50 | 51 | 52 | ## Homework 53 | 54 | *Type*: quick practice, approx 10 mins 55 | 56 | Use the same [hotel-europe data from OSF](https://osf.io/r6uqb/), but now 57 | - Download both `hotels-europe_price.csv` and `hotels-europe_features.csv` 58 | - `merge` them in this order by `hotel_id` 59 | - filter for: 60 | - time: 2018/01 and weekend == 1 61 | - city: Vienna or London. Hint: for multiple matches, use something like: 62 | ```python 63 | data.loc[data["city"].isin(['City_A','City_B'])] 64 | ``` 65 | - accommodation should be Apartment, 3-4 stars (only) with more than 10 reviews 66 | - price is less than $600 67 | - arrange the data in ascending order by price 68 | 69 | ## Further material 70 | 71 | - More materials on the case study can be found in Gabor's [da_case_studies repository](https://github.com/gabors-data-analysis/da_case_studies): [ch02-hotels-data-prep](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch02-hotels-data-prep/ch02-hotels-data-prep.R) 72 | - Arthur Turrell's Coding for Economics classes: [Data Analysis Quickstart](https://aeturrell.github.io/coding-for-economists/data-analysis-quickstart.html), [Working with Data](https://aeturrell.github.io/coding-for-economists/data-intro.html), [Data Transformation](https://aeturrell.github.io/coding-for-economists/data-transformation.html) -------------------------------------------------------------------------------- /lecture05-graphs-basics/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 05: graphs basics 2 | 3 | ## Motivation 4 | 5 | You should look at your data. Graphs and charts let you explore and learn about the structure of the information you collect. Good data visualizations also make it easier to communicate your ideas and findings to other people.
Beyond that, producing effective plots from your own data is the best way to develop a good eye for reading and understanding graphs — good and bad — made by others, whether presented in research articles, business slide decks, public policy advocacy, or media reports. 6 | 7 | To create a powerful graph, it is a good starting principle that all of our decisions should be guided by the *usage of the graph*: a summary concept to capture what we want to show and to whom. Its main elements are purpose, focus, and audience. Once usage is clear, the first set of decisions to make is about how we convey information: how to show what we want to show. For those decisions it is helpful to understand the entire graph as the 8 | overlay of three graphical objects: 9 | 10 | 1. Geometric object: the geometric visualization of the information we want to convey, such as a 11 | set of bars, a set of points, or a line; multiple geometric objects may be combined. 12 | 2. Scaffolding: elements that support understanding the geometric object, such as axes, labels, and 13 | legends. 14 | 3. Annotation: adding anything else to emphasize specific values or explain more detail. 15 | 16 | Keeping these in mind, this lecture introduces students to how to create graphs that take into account these principles. 17 | 18 | ## This lecture 19 | 20 | This lecture introduces the tools to create and manipulate plots with `plotnine` and `matplotlib`. `plotnine` is used throughout the [`case studies`](https://github.com/gabors-data-analysis/da_case_studies) for the textbook; it is based on `ggplot2` of the R language. 21 | 22 | `matplotlib` is the primary charting library of Python. It is a massive library, offering so much that it can easily become overwhelming. Creating a basic chart is fairly simple, but sometimes just a little customization already requires a deep dive into the API. 23 | 24 | One of the reasons we cover matplotlib here though is that many other libraries are also built on the matplotlib API, and plotting charts directly from Pandas dataframes is easier if we have a basic understanding of matplotlib's mechanics. There are other popular charting packages, such as `seaborn` or `Plotly`, but we think that a real Pythonista should be able to work with matplotlib objects. 25 | 26 | Case studies used in or related to this lecture: 27 | 28 | - [Chapter 03, B Comparing hotel prices in Europe: Vienna vs London](https://gabors-data-analysis.com/casestudies/#ch03b-comparing-hotel-prices-in-europe-vienna-vs-london) is the base for this lecture.
- Some tools are used in [Chapter 04, A Management quality and firm size: describing patterns of association](https://gabors-data-analysis.com/casestudies/#ch04a-management-quality-and-firm-size-describing-patterns-of-association) 30 | 31 | 32 | ## Learning outcomes 33 | After completing [`01_plotnine_intro.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture05-graphs-basics/01_plotnine_intro.ipynb) students should be able to: 34 | 35 | - create `ggplot` objects with different types of figures using `geoms` 36 | - manipulate axes with `scale_*_continuous` and `scale_*_discrete`, where `*` stands for `y` or `x` 37 | - set limits 38 | - set break points 39 | - add annotation to a plot 40 | - lines, dots and text 41 | - bar charts: 42 | - simple 43 | - stacked 44 | - stacked with percentages, using `scales` package 45 | - box plot 46 | - violin plot 47 | - use `color[x]` color values from a pre-defined list 48 | 49 | After completing [`02_matplotlib_intro.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture05-graphs-basics/02_matplotlib_intro.ipynb) students should be able to: 50 | 51 | - the two key objects in a `matplotlib` plot 52 | - `Figure` 53 | - `Axes` 54 | - set 55 | - y-axis limits 56 | - legends 57 | - log scale 58 | - using a second axis 59 | - spacing between the bars and horizontal grids 60 | - chart within a chart 61 | 62 | ## Datasets used 63 | * [Hotel Europe](https://gabors-data-analysis.com/datasets/#hotels-europe) 64 | 65 | ## Lecture Time 66 | 67 | Ideal overall time: **30-60 mins**. 68 | 69 | Showing [`plotnine_intro.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture05-graphs-basics/plotnine_intro.ipynb) takes around *30 minutes* while doing the tasks would take approx *10-15 minutes*. Showing [`matplotlib_intro.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture05-graphs-basics/matplotlib_intro.ipynb) takes around *15-20 minutes*. 70 | 71 | ## Further material 72 | 73 | - [Introduction to Matplotlib — Data Visualization in Python](https://heartbeat.comet.ml/introduction-to-matplotlib-data-visualization-in-python-d9143287ae39) in general focuses on visualization with matplotlib. 74 | - Arthur Turrell's Coding for Economics classes: [Intro to Data Visualisation](https://aeturrell.github.io/coding-for-economists/vis-intro.html), [Common Plots](https://aeturrell.github.io/coding-for-economists/vis-common-plots.html) 75 | - [Official webpage of `plotnine`](https://plotnine.readthedocs.io/en/stable/) 76 | - [Official webpage of `matplotlib`](https://matplotlib.org/) 77 | -------------------------------------------------------------------------------- /lecture06-conditionals/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 06: Conditional Programming 2 | 3 | ## Motivation 4 | 5 | Deciding what to do on a case-by-case basis is widely used in decision making and also in programming. Conditional programming enables writing code with this in mind. If a certain condition holds, execute a command; otherwise, do something different. Conditional programming is a basic programming technique that emerges in many situations. Adding this technique to the programming toolbox is a must for data scientists. 6 | 7 | ## This lecture 8 | 9 | This lecture introduces students to conditional programming with `if-else` statements.
It covers the essential control flow constructs: the `for` and `while` loops, and list comprehension. 10 | 11 | 12 | ## Learning outcomes 13 | After successfully live-coding the material (see: [`conditionals_and_control_flows.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture06-conditionals/conditionals_and_control_flows.ipynb)), students will have knowledge on 14 | 15 | - how a conditional statement works 16 | - what are the crucial elements of an `if-else` statement 17 | - what is a `for` loop 18 | - what is a `while` loop 19 | - how to create a list comprehension 20 | 21 | 22 | ## Lecture Time 23 | 24 | Ideal overall time: **10-20 mins**. 25 | 26 | This is a relatively short lecture, and it can be even shorter if logical operators with vectors are neglected, although a good understanding of the anatomy of an `if-else` statement is important. -------------------------------------------------------------------------------- /lecture06-conditionals/conditionals_and_control_flows.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "6bd65ab7-b72f-46b2-b0f1-c66d477e93a0", 6 | "metadata": {}, 7 | "source": [ 8 | "# Lecture 6\n", 9 | "\n", 10 | "## Conditional statements\n", 11 | "- the [if statement](#conditionals)\n", 12 | "\n", 13 | "## Control Flows\n", 14 | "- the [for loop](#for)\n", 15 | "- the [while loop](#while)\n", 16 | "- using for loops for [list comprehension](#comprehension)\n", 17 | "---" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "id": "386d4659-98e0-4764-b66c-145d57e04bc6", 23 | "metadata": {}, 24 | "source": [ 25 | "## Conditional Statements " 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "id": "0a62d10e-f954-4283-b2f7-be6ededd84dd", 31 | "metadata": {}, 32 | "source": [ 33 | "Conditional statements use the `if`-`else` structure. The program performs an operation (or more) if certain conditions are met, and - optionally - performs some other operation if those conditions are not fulfilled." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "id": "1c6be79a-9e10-4cec-b14b-7b583090c826", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "import random" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "id": "c61ec38f-229b-4411-8e13-b972af8fa3dc", 49 | "metadata": {}, 50 | "source": [ 51 | "Conditional statements are controlled by ***indentation***. Each new embedded condition needs to be shifted one tab right. (Other languages, like Java or JavaScript, use curly braces.)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "5ba3b5fd-acee-4e54-9512-038c8df28a05", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "r = random.randint(20,34)\n", 62 | "print(r)\n", 63 | "if r < 25:\n", 64 | " print('A small number!')\n", 65 | "elif r < 30:\n", 66 | " print('A moderately high number.')\n", 67 | "else:\n", 68 | " print('A large number!')" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "0653ae7a-a29a-49f7-889b-b182a31db17f", 74 | "metadata": {}, 75 | "source": [ 76 | "Conditional statements do not have to have an `else` branch. If the condition is not met, the program can also stay idle.
" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "id": "2e27f14d-1e50-45b9-ba48-d1112e4d0997", 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "a = random.randint(1,12)\n", 87 | "b = random.randint(1,16)\n", 88 | "\n", 89 | "print('a:', a)\n", 90 | "print('b:', b)\n", 91 | "\n", 92 | "if a > 6:\n", 93 | " print(\"'a' is large\")\n", 94 | " if b > a:\n", 95 | " print('Both numbers are large.')\n", 96 | " print('Result: b is larger than a.')" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "a14665d2-279f-43a6-8133-9e10c9c30e0e", 102 | "metadata": {}, 103 | "source": [ 104 | "## Control Flows" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "0255d90b-a436-4823-b370-649226a85aa9", 110 | "metadata": {}, 111 | "source": [ 112 | "### The _'for'_ Loop " 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "id": "f0018cea-cd2a-4add-a308-891592cd7769", 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "for i in range(20): # Remember: 20 is not included in the range! \n", 123 | " if i%2 == 0: # The 'modulo' operator returns the integer part left after an integer division.\n", 124 | " print('Number %d is even.'% i)\n", 125 | " else:\n", 126 | " print('Number %d is odd.'% i)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "id": "e1ba7e8a-e47f-467c-8ab4-4496215e8b42", 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "for word in ['Business', 'analytics', 'with', 'Python']:\n", 137 | " print(word, len(word)) # functions can also be print inputs" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "id": "0834b4d6-7041-495d-9c14-83fd5c5b2cfc", 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "list_capitals = []\n", 148 | "for i in range(65,91):\n", 149 | " list_capitals.append(chr(i))\n", 150 | "print(list_capitals)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "id": "2114f1f2-7cf1-45f2-b66e-c8300953123f", 156 | "metadata": {}, 157 | "source": [ 158 | "The `enumerate` function helps you get a counter. " 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "id": "ee9d72f3-9656-4e3e-8189-e5285a675ec0", 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "for k, v in enumerate(list_capitals):\n", 169 | " print(k, v)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "id": "4f01fb95-cf0b-462a-86b7-5b8aa63678b4", 175 | "metadata": {}, 176 | "source": [ 177 | "Add some simple formatting: right-adjust k, the counter. This is what the `.rjust()` function does. This, however, is a *string function*, so we need to *cast* our 'k' variable, which an integer, into string. For this we use the `str()` function." 
178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "id": "d54c539d-edd9-415a-8e17-a46edcc5ce67", 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "for k, v in enumerate(list_capitals):\n", 188 | " print(str(k).rjust(2)+': ', v)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "id": "c61e2222-0ef3-419c-b4cb-bbdbd31e7db4", 194 | "metadata": {}, 195 | "source": [ 196 | "### The *'while'* Loop " 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "id": "c9b5d1f0-b39c-48a5-bcb5-3b05cc74cc28", 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "i = 0 # the counter\n", 207 | "while i < 20:\n", 208 | " if i%2 == 0:\n", 209 | " print('Number %d is even.'% i)\n", 210 | " else:\n", 211 | " print('Number %d is odd.'% i)\n", 212 | " i += 1 # increment in Python (same as i++ in Java)\n", 213 | "print('\\nDone.') # Indented so that it will only print at the end." 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "id": "e6de1e59-4666-4b8e-ae85-9f697f2ec97d", 219 | "metadata": {}, 220 | "source": [ 221 | "**Caution!!!** If you don't increment the counter, the loop will never stop!" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "id": "28d1f43c-fba5-4080-afd2-4c866eb7ed73", 227 | "metadata": {}, 228 | "source": [ 229 | "If you use '*True*' in the `while` condition the script runs until manual interruption. " 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "id": "32ea677f-3873-407d-82e2-5048f7d146bb", 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "from IPython.display import clear_output\n", 240 | "import time\n", 241 | "\n", 242 | "i = 1\n", 243 | "while True: # This syntax makes it run forever, or untill manual interruption. \n", 244 | " print(i)\n", 245 | " i += 1\n", 246 | " time.sleep(1)\n", 247 | " clear_output()" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "id": "709db49c-ac70-4f7f-bf6a-316bdc74ca14", 253 | "metadata": {}, 254 | "source": [ 255 | "To interrupt the script in a code cell click in the cell and then click ■ (the black rectangle icon) on the notebook's menu bar. " 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "id": "fb2e7251-a6ea-428e-b258-a16f550b42d3", 261 | "metadata": {}, 262 | "source": [ 263 | "### List Comprehension " 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "id": "07617a0b-a3cd-4e98-9131-778999da6c47", 269 | "metadata": {}, 270 | "source": [ 271 | "List comprehension is a logical construct to create a list from another lists or from an iterable, or to modify an existing list *in place*. " 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "id": "8eb7bcef-9b0a-42a6-971c-b36f4b030289", 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "L = [x**2 for x in range(0,10)]\n", 282 | "L" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "id": "cef71af7-5a4c-4306-bda2-3eeaade310a4", 288 | "metadata": {}, 289 | "source": [ 290 | "You can also combine it with conditional statements. 
For example:" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "id": "357b3dc6-f4e9-467d-a039-7b9754741d60", 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "[x for x in L if x%2 == 1]" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "id": "9c7b8eab-2b45-4347-8d52-fcb18c713fca", 306 | "metadata": {}, 307 | "source": [ 308 | "You can also use an ``if else`` statement" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "id": "b0db5a8f-fe3b-48c3-992a-039fe1bb1325", 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "['even' if x%2 == 0 else 'odd' for x in L]" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "id": "251e6732", 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [] 328 | } 329 | ], 330 | "metadata": { 331 | "kernelspec": { 332 | "display_name": "Python 3 (ipykernel)", 333 | "language": "python", 334 | "name": "python3" 335 | }, 336 | "language_info": { 337 | "codemirror_mode": { 338 | "name": "ipython", 339 | "version": 3 340 | }, 341 | "file_extension": ".py", 342 | "mimetype": "text/x-python", 343 | "name": "python", 344 | "nbconvert_exporter": "python", 345 | "pygments_lexer": "ipython3", 346 | "version": "3.8.10" 347 | } 348 | }, 349 | "nbformat": 4, 350 | "nbformat_minor": 5 351 | } 352 | -------------------------------------------------------------------------------- /lecture07-data-exploration/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 07: Data Exploration 2 | 3 | ## Motivation 4 | 5 | You want to know whether online and offline prices differ in your country for products that are sold in both ways. You have access to data on a sample of products with their online and offline prices. How would you use this data to establish whether prices tend to be different or the same for all products? 6 | 7 | After collecting the data, assessing its quality, cleaning it, and structuring it, the next step is exploratory data analysis (EDA). Exploratory data analysis aims to describe variables in a dataset. EDA is important for understanding potential problems with the data and making analysts and their audiences familiar with the most important variables. The results of EDA help additional data cleaning, decisions for further steps of the analysis, and giving context to the results of the following hypothesis testing. 8 | 9 | The lecture discusses some basic concepts such as frequencies, probabilities, distributions, and extreme values. It includes guidelines 10 | for producing informative graphs and tables for presentation and describes the most important summary statistics. Furthermore, we cover the logic and practice of testing hypotheses. We describe the steps of hypothesis testing and discuss two alternative ways to carry it out: one with the help of a test statistic and a critical value, and another one with the help of a p-value. We focus on testing hypotheses about averages, but, as we show in one of our case studies, this focus is less restrictive than it may appear. 11 | 12 | 13 | ## This lecture 14 | 15 | This lecture introduces students to data exploration. `pandas` is used for data descriptive tables, `plotnine` for creating graphs, and `scipy.stats` for hypothesis testing. 16 | Descriptive statistics and descriptive graphs for one variable are concerned to decide on further data munging. 
17 | Moreover, simple hypothesis testing is covered as well as association graphs and statistics between two variables. 18 | 19 | Case studies connected to this lecture: 20 | - [Chapter 03, A: Finding a good deal among hotels: data exploration](https://gabors-data-analysis.com/casestudies/#ch03a-finding-a-good-deal-among-hotels-data-exploration) - emphasis on one variable descriptive analysis, different data 21 | - [Chapter 03, D: Distributions of body height and income](https://gabors-data-analysis.com/casestudies/#ch03d-distributions-of-body-height-and-income) and [Chapter 03, U1: Size distribution of Japanese cities](https://gabors-data-analysis.com/casestudies/#ch03u1-size-distribution-of-japanese-cities) connects theoretical and empirical distributions 22 | - [Chapter 04, A: Management quality and firm size: describing patterns of association](https://gabors-data-analysis.com/casestudies/#ch04a-management-quality-and-firm-size-describing-patterns-of-association) - focuses on the association between two variables, one variable descriptive is not emphasized, different data. 23 | - [Chapter 06, A: Comparing online and offline prices: testing the difference](https://gabors-data-analysis.com/casestudies/#ch06a-comparing-online-and-offline-prices-testing-the-difference) - focuses on hypothesis testing, association and one variable descriptive is not emphasized. 24 | 25 | This lecture uses [Chapter 06, A](https://gabors-data-analysis.com/casestudies/#ch06a-comparing-online-and-offline-prices-testing-the-difference) as the starting point, but stresses the one variable descriptives such as in [Chapter 03, A](https://gabors-data-analysis.com/casestudies/#ch03a-finding-a-good-deal-among-hotels-data-exploration) and adds the two variable pattern analysis such as in [Chapter 04, A](https://gabors-data-analysis.com/casestudies/#ch04a-management-quality-and-firm-size-describing-patterns-of-association). 26 | 27 | 28 | ## Learning outcomes 29 | After completing the codes in [`data_exploration.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture07-data-exploration/data_exploration.ipynb), students should be able to: 30 | 31 | - `describe` for a quick summary of all variables in the dataframe 32 | - `skim` from the `skimpy` package for a nicer looking descriptive table 33 | - specific variables with their descriptive statistics with `filter` such as 34 | - mean, median, standard deviation, minimum, maximum, percentiles, number of observations, number of missing observations 35 | - user-created functions added to `agg` such as range or mode 36 | - descriptives for specific groups 37 | - use of `plotnine`: 38 | - histogram to plot empirical density with count or relative frequency. Understanding the role of the number of bins and bins' width. 39 | - kernel density to plot a smooth function for the empirical density with an understanding of the role of bandwidth. 40 | - stack multiple geometry objects in one graph and control for opaqueness 41 | - manipulate labels with `labs` 42 | - set axis limits with `xlim` and `ylim` 43 | - use a factor variable to graph multiple groups in one ggplot and understand the differences between `fill`, `color`, and `group` arguments. 
44 | - create multiple plots in one graph with `facet_wrap` 45 | - carry out hypothesis test via t-test 46 | - two-sided, one-sided tests 47 | - multiple hypothesis test with `agg` and `groupby` functions 48 | - Association between two variables: 49 | - covariance with `cov` and correlation with `corr` 50 | - scatter plot 51 | - bin-scatter: equidistance bin-scatter with `stat_summary_bin` and an equal number of observations in each bin by hand 52 | - correlation for specific subgroups and how to plot them. Use of `reorder`. 53 | 54 | ## Datasets used 55 | 56 | * [billion-prices](https://gabors-data-analysis.com/datasets/#billion-prices) 57 | * [wms-management-survey](https://gabors-data-analysis.com/datasets/#wms-management-survey) as homework 58 | 59 | 60 | ## Lecture Time 61 | 62 | Ideal overall time: **70-100mins**. 63 | 64 | Showing [`data_exploration.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture07-data-exploration/data_exploration.ipynb) takes around *50 minutes* while doing the tasks would take the rest. 65 | 66 | I highly recommend doing all the tasks as this lecture involves many new functions. 67 | If you do not have the needed time for one lecture, you may take this into two parts. Good breakpoints are: 68 | 69 | - hypothesis-testing 70 | - association 71 | 72 | 73 | ## Homework 74 | 75 | *Type*: quick practice, approx 15 mins 76 | 77 | Use the [wms-management-survey](https://gabors-data-analysis.com/datasets/#wms-management-survey) data, ['wms_da_textbook.csv' file](https://osf.io/uzpce/). 78 | Use the following units: 79 | - United States firms, observed in wave 2004 and employment of the firms should be between 100 and 5000. 80 | - Create a descriptive statistic table for variables of `management`, `emp_firm`, and `firm_age` with mean, median, sd, min, max, range, and 5% and 95% percentiles. 81 | - Create descriptive statistics for `management` grouped by `ownership` types. Use mean, median, min, and max. 82 | - Create a plot with histogram and kernel density, with proper labeling for `management` variable. 83 | - Create a new factor variable `firm_size`, which takes the value of 'small and medium' if `emp_firm` is smaller than 1000 and otherwise it is 'large' Hint: use a simple logical operator in a factor function, specifying the label. 84 | - Test if the average `management` score is different in large vs small and medium firms 85 | - Create a bin-scatter with 10 bins, where on x-axis is the `emp_firm` and y-axis the `management` score. Use the same number of observations within each bin. 86 | 87 | ## Further material 88 | 89 | - Billion-Price-Project case study can be found in Gabor's da_case_studies repository: [ch06-online-offline-price-test](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch06-online-offline-price-test) This case study primarily focuses on hypothesis testing only. 90 | - Data exploration case studies in Gabor's da_case_studies repository are [ch03-hotels-vienna-explore](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch03-hotels-vienna-explore) and [ch03-hotels-europe-compare](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch03-hotels-europe-compare). It focuses on bars, histograms and basic descriptive statistics. 91 | - Association, scatter, and bin-scatter is used in the case study [ch04-management-firm-size](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch04-management-firm-size) in Gabor's book. 
92 | - Arthur Turrell's Coding for Economics classes: [Explanatory Data Analysis](https://aeturrell.github.io/coding-for-economists/data-exploratory-analysis.html) that introduces the `skimpy` and `pandas-profiling` packages. -------------------------------------------------------------------------------- /lecture08-functions/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 08: Writing Functions 2 | 3 | ## Motivation 4 | 5 | One of the best ways to improve your reach as a data scientist is to write functions. Functions allow automating common tasks in a more powerful and general way than copy-and-pasting. Writing a function has three big advantages over using copy-and-paste: 6 | 7 | 1. You can give a function an evocative name that makes your code easier to understand. 8 | 2. As requirements change, you only need to update code in one place, instead of many. 9 | 3. You eliminate the chance of making incidental mistakes when you copy and paste (i.e. updating a variable name in one place, but not in another). 10 | 11 | Writing good functions is a lifetime journey. Even after using Python for many years, one can still learn new techniques and better ways of approaching old problems. The goal is not to teach you every esoteric detail of functions but to get you started with some pragmatic advice that you can apply immediately. 12 | 13 | ## This lecture 14 | 15 | This lecture introduces functions, how they are structured, and how to write them. 16 | 17 | 18 | ## Learning outcomes 19 | After successfully live-coding the material (see: [`functions.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture08-functions/functions.ipynb)), students will know 20 | 21 | - how to create user-defined functions 22 | - what the structure of a function is 23 | - the use of `docstring`s to document functions 24 | - the use of the `help` function to retrieve function descriptions 25 | - the use of `lambda` functions 26 | 27 | ## Lecture Time 28 | 29 | Ideal overall time: **20-30 mins**. 30 | 31 | 32 | ## Homework 33 | 34 | *Type*: quick practice, approx 15 mins, together with [lecture06-conditionals](https://github.com/gabors-data-analysis/da-coding-python/edit/main/lecture06-conditionals). 35 | 36 | Bootstrapping - using the [`sp500`](https://gabors-data-analysis.com/datasets/#sp500) data 37 | 38 | - download the cleaned data for `sp500` from [OSF](https://osf.io/h64z2/) 39 | - write a function that calculates the bootstrap standard errors and confidence intervals based on these standard errors (see the sketch after this list) 40 | - the function should take as inputs a) a vector of prices, b) the number of bootstrap samples, c) the level of the confidence interval 41 | - create a new variable for `sp500`: `daily_return`, which is the difference in the prices from one day to the next day. 42 | - use this `daily_return` variable and calculate the 80% confidence interval based on bootstrap standard errors along with the mean. 
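
To make the homework concrete, here is a minimal, hedged sketch of one way such a bootstrap function could look. It is not the official solution: `bootstrap_mean_ci` is a hypothetical helper name, and the file and column names in the usage comments are assumptions (check the codebook of the file you download from OSF).

```python
import numpy as np
import pandas as pd
from scipy import stats

def bootstrap_mean_ci(values, n_boot=1000, level=0.8, seed=42):
    """Bootstrap standard error and SE-based confidence interval for the mean of `values`."""
    x = pd.Series(values).dropna().to_numpy()
    rng = np.random.default_rng(seed)
    # resample with replacement and store the mean of each bootstrap sample
    boot_means = np.array(
        [rng.choice(x, size=x.size, replace=True).mean() for _ in range(n_boot)]
    )
    se = boot_means.std(ddof=1)                 # bootstrap standard error
    crit = stats.norm.ppf(1 - (1 - level) / 2)  # two-sided critical value
    mean = x.mean()
    return mean, se, (mean - crit * se, mean + crit * se)

# Hypothetical usage -- the file and column names below are assumptions:
# sp500 = pd.read_csv("sp500_clean.csv")
# sp500["daily_return"] = sp500["VALUE"].diff()
# mean, se, ci_80 = bootstrap_mean_ci(sp500["daily_return"], n_boot=1000, level=0.8)
```

An SE-based interval uses a normal critical value; an alternative design is to take the empirical percentiles of `boot_means` directly, which avoids the normality assumption.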
43 | 44 | 45 | ## Further material 46 | 47 | - Case study materials from Gabor's da_case_studies repository on generalization (with bootstrapping) is: [ch05-stock-market-loss-generalize](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch05-stock-market-loss-generalize) on testing are: [ch06-online-offline-price-test](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch06-online-offline-price-test) and [ch06-stock-market-loss-test](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch06-stock-market-loss-test) 48 | -------------------------------------------------------------------------------- /lecture08-functions/functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "9909070d-84b3-4484-8a73-49666a8c7dcc", 6 | "metadata": {}, 7 | "source": [ 8 | "# Lecture 8\n", 9 | "\n", 10 | "## Writing Functions\n", 11 | "- User-Defined [Functions](#UDF) (UDFs)\n", 12 | "- [lambda](#lambda) functions\n", 13 | "\n", 14 | "---" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "id": "64cf5f6f-84ca-4224-96e1-ba0a4d158564", 20 | "metadata": {}, 21 | "source": [ 22 | "## User-defined Functions " 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "id": "6f6446a6-9255-477a-b928-3d4a273a2808", 28 | "metadata": {}, 29 | "source": [ 30 | "A function is a block of organized, reusable code that is used to perform a single, related action. Functions provide better modularity for your application and a high degree of code reusing.\n", 31 | "\n", 32 | "You can define functions to provide the required functionality. Here are simple rules to define a function in Python.\n", 33 | "\n", 34 | "* Function blocks begin with the keyword ```def``` followed by the function name and parentheses ```( )```.\n", 35 | "\n", 36 | "* Any input parameters or arguments should be placed within these parentheses. You can also define parameters inside these parentheses.\n", 37 | "\n", 38 | "* The first statement of a function can be an optional statement - the documentation string of the function or docstring.\n", 39 | "\n", 40 | "* The code block within every function starts with a colon (```:```) and is **indented**.\n", 41 | "\n", 42 | "* The statement ```return``` [expression] returns a value, or a serious of values, a list, a dictionary, .... A return statement with no arguments is the same as return None." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "id": "e264b08d-0fc3-4fdd-b8a7-3b660d353884", 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "def add_one(number):\n", 53 | " x = number + 1\n", 54 | " return x" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "id": "71379516-7976-488a-8073-ef572c8ad946", 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "add_one(20)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "6ef54f7a-6379-4f6b-831f-3a0c1bc964e1", 70 | "metadata": {}, 71 | "source": [ 72 | "You can return more than one object from a single function. 
" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "id": "dad4db6d-6289-43d4-8a42-f421294c8382", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "def add_one_and_return_both(number):\n", 83 | " x = number\n", 84 | " y = x + 1\n", 85 | " return x, y" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "id": "2c2690e9-3ff2-4b92-9f7e-b55f428a2b2c", 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "x, y = add_one_and_return_both(23)\n", 96 | "print(x)\n", 97 | "print(y)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "id": "e927412f-55ba-40c6-a1b1-661942b6fd2a", 103 | "metadata": {}, 104 | "source": [ 105 | "Function arguments can have default values." 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "id": "bd1def85-514c-482f-aa17-e3b301c03eda", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "def number_to_the_power(number, exponent = 2):\n", 116 | " return number ** exponent" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "id": "7d4dcb2a-aadc-46ce-b253-12d3e83b991f", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "number_to_the_power(5)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "id": "d1734ec5-1097-4d3a-8b56-3ee9f0fd5122", 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "number_to_the_power(5, 3)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "id": "02a7f46d-2fc1-427a-a7d7-a5aa60d1d94f", 142 | "metadata": {}, 143 | "source": [ 144 | "Return objects can be of any type. Also, `docstrings` help you document your function. More on docstrings [here](https://www.datacamp.com/community/tutorials/docstrings-python)." 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "id": "8dc06669-ca6b-4ad8-9493-74e3d7d360af", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "def cast_listitems_to_string(list):\n", 155 | " \"\"\"\n", 156 | " Casts list of various elements to string. \n", 157 | " \n", 158 | " The function cast elements in a list to string,\n", 159 | " whatever their original type is.\n", 160 | " \n", 161 | " Parameters\n", 162 | " ----------\n", 163 | " list: list \n", 164 | " A list of various data types.\n", 165 | " \n", 166 | " Returns\n", 167 | " -------\n", 168 | " list: list\n", 169 | " A list of strings, cast from the original elements.\n", 170 | " \"\"\"\n", 171 | " for i in range(len(list)):\n", 172 | " list[i] = str(list[i]) # remember: lists are mutable\n", 173 | " return list" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "id": "cd7e75b0-fa2d-4a1a-bb66-384a15e03099", 179 | "metadata": {}, 180 | "source": [ 181 | "Docstrings are returned when you call the `help()` function on your UDF. This is especially helpful when you import your function from a module in a complex solution. 
" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "id": "e568ac74-5c5a-4ad4-b3a1-3c7d23406e03", 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "help(cast_listitems_to_string)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "id": "4c4a7080-bbb8-4787-95b5-163eb3ce8719", 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "import math \n", 202 | "\n", 203 | "ls_convertable = [1,2, 'a', math.cos(math.pi / 3)]" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "id": "bee0d780-184d-418e-9dd9-da19f9624be7", 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "ls_convertable" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "id": "19f1dac0-1a56-4f2b-be45-b8c6c78e4fa3", 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "ls_converted = cast_listitems_to_string(ls_convertable)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "id": "d4e6dfe2-7cbc-41ea-ac4e-af8e51c33dc5", 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "ls_converted" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "id": "318ca855-a546-4a00-b694-6fdca2ae1bcd", 239 | "metadata": {}, 240 | "source": [ 241 | "## Lambda Functions " 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "id": "d974ef34-2a6b-4297-8b4b-f16ee634b9f2", 247 | "metadata": {}, 248 | "source": [ 249 | "A lambda function is a small anonymous function. A lambda function can take any number of arguments, but can only have one expression. It is created using the `lambda` keyword." 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "id": "b73341b4-ac29-4fe3-b424-f7197d0116ab", 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "square = lambda x: x ** 2" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "id": "f731ca49-31ca-47f8-9846-ed45c8254849", 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "square(2)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "id": "2ef41ded-502e-4699-91fb-ed316f69d510", 275 | "metadata": {}, 276 | "source": [ 277 | "We use lambda to simplify our code, to create temporary definitions, which are used only once. The same can be achieved with a normal definiton:" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "id": "bc234107-fac9-4c5a-987d-7429ace83fe1", 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "def square_def(x): \n", 288 | " return x ** 2" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "id": "c4648bd8-426d-4e96-9557-af69fcd04c56", 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "square_def(2)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "id": "3542957c-9b5f-4ca0-ba95-7c967e3e5028", 304 | "metadata": {}, 305 | "source": [ 306 | "You can combine `lambda` functions with *list comprehension*. 
" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "id": "3dafcdf7-9726-41b8-bd94-48e8b5086d65", 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "ls_numbers = list(range(10))" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "id": "a75a8ef2-986a-4cf4-a136-c2858721c524", 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "ls_numbers" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "id": "970486fc-a0e4-40c7-90a1-557e5c4280fb", 332 | "metadata": {}, 333 | "source": [ 334 | "Let's square all the values from the list and add 1 to each element" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "id": "668c2e5c-37c4-4c91-bce1-d86d9294bdd3", 341 | "metadata": {}, 342 | "outputs": [], 343 | "source": [ 344 | "f = lambda x: x**2 + 1\n", 345 | "[f(x) for x in ls_numbers]" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "id": "b3fb2d74-1ea1-4d88-933b-a3285dc4a6ff", 351 | "metadata": {}, 352 | "source": [ 353 | "Let's square and add one to each even number in the list" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "id": "c73d8160-6160-42cb-8cfe-452ddf92500e", 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "[f(x) for x in ls_numbers if x%2 == 0 ]" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "id": "8561c4c4-c82a-40cb-986f-50b92553335f", 369 | "metadata": {}, 370 | "source": [ 371 | "Square and add one to each even number in the list but return the odd numbers without transformation" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "id": "b6952703-dc5c-45b8-b985-16ebe95a27fa", 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [ 381 | "[f(x) if x%2 == 0 else x for x in ls_numbers]" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "id": "0136d6c4-f121-4d6a-a733-6b8ca8313930", 387 | "metadata": {}, 388 | "source": [ 389 | "You can also handle errors with lambda functions and conditional list comprehension." 
390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "id": "bce0f8e9-0975-497e-8faa-5f2aceeb07bd", 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "replace_comma = lambda x: x.replace(',', '.')" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "id": "576579d5-8402-496f-b97f-4cf2ea1c32fc", 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "replace_comma('4,5')" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "id": "177b5ba9-735b-49a1-8ac7-12808c73dc88", 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "ls_mixed_data = [1.2, '1,2', 5, 7, '4,5', 7]" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "id": "5205a3a3-f91d-4f47-9923-5d2612d83fc4", 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "[replace_comma(x) if isinstance(x, str) else x for x in ls_mixed_data]" 430 | ] 431 | } 432 | ], 433 | "metadata": { 434 | "kernelspec": { 435 | "display_name": "Python 3 (ipykernel)", 436 | "language": "python", 437 | "name": "python3" 438 | }, 439 | "language_info": { 440 | "codemirror_mode": { 441 | "name": "ipython", 442 | "version": 3 443 | }, 444 | "file_extension": ".py", 445 | "mimetype": "text/x-python", 446 | "name": "python", 447 | "nbconvert_exporter": "python", 448 | "pygments_lexer": "ipython3", 449 | "version": "3.8.10" 450 | } 451 | }, 452 | "nbformat": 4, 453 | "nbformat_minor": 5 454 | } 455 | -------------------------------------------------------------------------------- /lecture09-exception-handling/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 09: Exception Handling 2 | 3 | 4 | ## Motivation 5 | 6 | Our code can, and will, run into errors. Sometimes this is a consequence of incorrect coding, some times of improper input data, sometimes of some malfunction of the underlying infrastructure. Programming languages offer tools to handle these exceptions and to transfer control to another component of the codebase. Even basic solutions need to handle errors, so exception handling is also a part of the basic tools for a data scientist or analyst. 7 | 8 | 9 | ## This lecture 10 | 11 | We introduce `try` and `except`, and offer a few simple examples on how to '_catch_' these errors. By identifying exception types we show how to be selective on the treatment of the various types of errors. 12 | 13 | 14 | ## Learning outcomes 15 | 16 | After completing [exception_handling.ipynb](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture09-exception-handling/exception_handling.ipynb) students should be able to : 17 | 18 | - Test code chunks for potential errors 19 | - Control how to handle these exceptions 20 | - Identify exception (aka error) types 21 | - Selecting actions based on the types of the exception occurred 22 | 23 | 24 | ## Datasets used 25 | 26 | None. 27 | 28 | 29 | ## Lecture time 30 | 31 | Ideal overall time: **10 mins**. 32 | 33 | 34 | ## Homework 35 | 36 | Define the following function: 37 | 38 | ```python 39 | def divide(a, b): 40 | 41 | return a / b 42 | ``` 43 | 44 | The user needs to add both `a` and `b` as user input using the `input()` function. 45 | 46 | ```python 47 | a = input() 48 | b = input() 49 | ``` 50 | 51 | Define a complex `try` - `except` block which handles all false user input which otherwise crash the function. 
Make sure your code sends different instructions to the user depending on the nature of the error: 52 | 53 | - when dividing by zero 54 | - when using strings instead of numbers 55 | - in case of any other invalid input. 56 | 57 | -------------------------------------------------------------------------------- /lecture09-exception-handling/exception_handling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3037f2b2-e21d-4ccd-85e1-167998d299e2", 6 | "metadata": {}, 7 | "source": [ 8 | "# Lecture 9\n", 9 | "\n", 10 | "## Exception Handling\n", 11 | "\n", 12 | "---" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "id": "695bc9bd-629d-4610-b662-b96d870097db", 18 | "metadata": {}, 19 | "source": [ 20 | "## Exception Handling (Try Except)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "5f339c7c-535c-4761-86ff-cfc847e155dd", 26 | "metadata": {}, 27 | "source": [ 28 | "`Exceptions` handle errors in the code. They let you write constructs so that your program falls back to somewhere else if an error blocks the normal run of your code. " 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "c5a70f13-d4a8-4026-bc16-abdbf0476b5b", 34 | "metadata": {}, 35 | "source": [ 36 | "The `try` block lets you test a block of code for errors.
\n", 37 | "The `except` block lets you handle the error.
\n", 38 | "The `else` block is to be executed if no errors were raised.
\n", 39 | "The `finally` block lets you execute code, regardless of the result of the `try` and `except` blocks.
" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "a280322d-696d-48de-ab6d-df87f30114a3", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "try:\n", 50 | " print(\"test\")\n", 51 | " # generate an error: the variable test is not defined\n", 52 | " print(test)\n", 53 | " \n", 54 | "except:\n", 55 | " print(\"Caught an exception\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "id": "4d31a4e1-50c4-4940-ba85-d4fde4ce5d4d", 61 | "metadata": {}, 62 | "source": [ 63 | "To get information about the error, we can access the `Exception` class instance that describes the exception by using for example:\n", 64 | "\n", 65 | " except Exception as e:" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "3e108b0d-ca61-4fcf-955b-82885c3d6b74", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "try:\n", 76 | " print(\"test\")\n", 77 | " # generate an error: the variable test is not defined\n", 78 | " print(test)\n", 79 | " \n", 80 | "except Exception as e:\n", 81 | " print(\"The problem with our code is the following: \" + str(e))" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "id": "23d8cafd-1c9b-4b08-8d29-8d92b4203d40", 87 | "metadata": {}, 88 | "source": [ 89 | "
Let's define two functions! " 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "id": "fd868cbc-8766-46ac-95c6-dd4194a29328", 96 | "metadata": { 97 | "tags": [] 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "def add_two_numbers(a, b):\n", 102 | " return a + b" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "c1b96f43-a511-40d1-9ecf-19eabbcc4f8a", 109 | "metadata": { 110 | "tags": [] 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "def divide_two_numbers(a, b):\n", 115 | " \n", 116 | " try: \n", 117 | " result = a / b\n", 118 | " \n", 119 | " except Exception as e:\n", 120 | " pass\n", 121 | " \n", 122 | " else:\n", 123 | " return result" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "id": "282448b1-42eb-48d4-8735-bfdccd2c219d", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "add_two_numbers(3, 5)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "0b7657c7-6ceb-4ccb-b521-e6a8eac52e5b", 139 | "metadata": {}, 140 | "source": [ 141 | "If we call our function we run into an error and our script stops running. " 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "id": "65f49b77-dcb7-4239-aee9-9af8388e55be", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "add_two_numbers(3, 'b')" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "id": "57c3dc95-7e6a-4754-80de-8577c6505945", 157 | "metadata": {}, 158 | "source": [ 159 | "We can handle the error and - for instance - call our user to modify the inputs." 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "id": "f8636462-43cc-4fed-88d1-7410bea183e6", 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "try:\n", 170 | " add_two_numbers(3, 'b')\n", 171 | "except Exception as e:\n", 172 | " print('We ran into this error: ' + str(e) + '.', 'Try another input.')" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "id": "b09a9d9a-9fd0-41e7-8ad4-e745bf617ae9", 178 | "metadata": {}, 179 | "source": [ 180 | "And what happens here? " 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "id": "d68db218-956a-4b6c-94ae-4b7ed13d5723", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "try:\n", 191 | " divide_two_numbers(3, 'b') # This function already handles the error inside thus a string input does not crash the function!\n", 192 | "except Exception as e:\n", 193 | " print('We ran into this error: ' + str(e))\n", 194 | "else:\n", 195 | " print('Everything went fine.')" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "id": "1d7e201f-9070-4990-8e0c-73b5ff0aa76b", 201 | "metadata": {}, 202 | "source": [ 203 | "Our `try - except` block did not throw an error, since the function already handled it. Nevertheless, we did not get any result back. \n", 204 | "\n", 205 | "If we decide to handle the exceptions inside the function, but we do want to enter the `except` block in case of an inproprer input, we can `raise` the exception inside the function. This is a useful trick when we handle various exceptions inside the function but we want to throw an error in certain cases only. 
" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "id": "e7ffdd17-6af9-4020-9a93-9ce125157f2f", 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "def division(a,b):\n", 216 | " \n", 217 | " try:\n", 218 | " result = a / b\n", 219 | " \n", 220 | " except ZeroDivisionError:\n", 221 | " print('Division by zero. Use a non-zero denominator!')\n", 222 | " \n", 223 | " except Exception as e:\n", 224 | " print('Exited with error: ' + str(e) + '.')\n", 225 | " raise\n", 226 | " \n", 227 | " else: \n", 228 | " return result" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "id": "bcedd700-0dcc-46ea-be3c-8f7ece6fc127", 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "# Here the function will not throw an error, only tells the user about the false input. The code would continue running. \n", 239 | "division(30, 0)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "id": "eae74039-c304-4201-b9db-bc3f08d3703e", 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "# This is an unhandled error which stops the code running. \n", 250 | "division(30, 'a')" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "id": "044738df-28ac-4749-87b6-e958f4c4e6a3", 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "# In this case we enter the 'else' branch. \n", 261 | "division(30,7)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "id": "560e6037-b55b-4050-ba9a-c6b36efd3082", 267 | "metadata": {}, 268 | "source": [ 269 | "As you see, a `try - except` block can have multiple `except` branches so different errors can be handled in different ways. You can read about Python's various exception types in the documentation of [built-in exceptions](https://docs.python.org/3.8/library/exceptions.html). " 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "id": "ac41476b-9701-4d4e-a4fe-9c01fec46bca", 275 | "metadata": {}, 276 | "source": [ 277 | "Note: we used to following code for printing the exception itself:\n", 278 | "```\n", 279 | "print(str(e))\n", 280 | "```\n", 281 | "This is because the `e` is an `Exception` class object, and as such cannot be the input of the `print()` function. The `str()` method calls the *string representation* of this object which then can be printed. " 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "id": "db2221c8", 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [] 291 | } 292 | ], 293 | "metadata": { 294 | "kernelspec": { 295 | "display_name": "Python 3 (ipykernel)", 296 | "language": "python", 297 | "name": "python3" 298 | }, 299 | "language_info": { 300 | "codemirror_mode": { 301 | "name": "ipython", 302 | "version": 3 303 | }, 304 | "file_extension": ".py", 305 | "mimetype": "text/x-python", 306 | "name": "python", 307 | "nbconvert_exporter": "python", 308 | "pygments_lexer": "ipython3", 309 | "version": "3.8.10" 310 | } 311 | }, 312 | "nbformat": 4, 313 | "nbformat_minor": 5 314 | } 315 | -------------------------------------------------------------------------------- /lecture10-intro-to-regression/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 10: Introduction to regression 2 | 3 | ## Motivation 4 | 5 | You want to identify hotels in a city that are good deals: underpriced for their location and quality. 
You have scraped the web for data on all hotels in the city, and you have cleaned the data. You have carried out exploratory data analysis that revealed that hotels closer to the city center tend to be more expensive, but there is a lot of variation in prices between hotels at the same distance. How should you identify hotels that are underpriced relative to their distance to the city center? In particular, how should you capture the average price–distance relationship that would provide you a benchmark, to which you can compare actual prices to find good deals? 6 | 7 | The analysis of hotel prices and distance to the city center reveals that hotels further away from the center are less expensive by a certain amount, on average. Can you use this result to estimate how much more revenue a hotel developer could expect if it were to build a hotel closer to the center rather than farther away? Regression is a model for the conditional mean: the mean of y for different values of one or more x variables. Regression is used to uncover patterns of association. That, in turn, is used in the causal analysis, to uncover the effect of x on y, and in predictions, to arrive at a good guess of what the value of y is if we don’t know it, but we know the value of x. 8 | 9 | In this lecture, we introduce simple non-parametric regression and simple linear regression, and we show how to visualize their results. We then discuss simple linear regression in detail. We introduce the regression equation, how its coefficients are uncovered (estimated) in actual data, and we emphasize how to interpret the coefficients. We introduce the concepts of predicted value and residual and goodness of fit, and we discuss the relationship between regression and correlation. 10 | 11 | ## This lecture 12 | 13 | This lecture introduces regressions via [hotels-vienna dataset](https://gabors-data-analysis.com/datasets/#hotels-vienna). It overviews models based on simple binary means, binscatters, lowess nonparametric regression, and introduces simple linear regression techniques. The lecture illustrates the use of predicted values and regression residuals with linear regression, but as homework, the same exercise is repeated with a binscatter-based model. 14 | 15 | This lecture is based on [Chapter 07, A: *Finding a good deal among hotels with simple regression*](https://gabors-data-analysis.com/casestudies/#ch07a-finding-a-good-deal-among-hotels-with-simple-regression) 16 | 17 | ## Learning outcomes 18 | After successfully completing [`hotels_intro_to_regression.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture10-intro-to-regression/intro_to_regression.ipynb) students should be able: 19 | 20 | - Binary means: 21 | - Calculate prediction based on means of two categories and create an annotated graph 22 | - Binscatter: 23 | - Create means based on differently defined bins for the X variable 24 | - Show two different graphs: simple mean predictions for each bins as a dot and scatter with step functions 25 | - Lowess nonparametric regression: 26 | - How to create a lowess (loess) graph 27 | - What is an output of a loess model? What are the main advantages and disadvantages? 
28 | - Simple linear regression 29 | - How to create a simple linear regression line in a scatterplot 30 | - `statsmodels` package: estimate two models w and w/o heteroscedastic robust SE and compare the two model 31 | - How to get predicted values and errors of predictions 32 | - Get the best and worst deals: identify hotels with the smallest/largest errors 33 | - Visualize the errors via histogram and scatter plot with annotating the best and worst 5 deals. 34 | 35 | ## Dataset used 36 | 37 | - [hotels-vienna](https://gabors-data-analysis.com/datasets/#hotels-vienna) 38 | 39 | ## Lecture Time 40 | 41 | Ideal overall time: **60 mins**. 42 | 43 | Going through [`hotels_intro_to_regression.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture10-intro-to-regression/intro_to_regression.ipynb) takes around *45-50 minutes*, the rests are the tasks. 44 | 45 | 46 | ## Homework 47 | 48 | *Type*: quick practice, approx 15 mins 49 | 50 | Use the binscatter model with 7 bins and save the predicted values and errors (true price minus the predicted value). Find the best and worst 10 deals and visualize with a scatterplot, highlighting the under/overpriced hotels with these best/worst deals according to this model. Compare to the simple linear regression. Which model would you use? Argue! 51 | 52 | 53 | ## Further material 54 | 55 | - More materials on the case study can be found in Gabor's *da_case_studies* repository: [ch07-hotels-simple-reg](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch07-hotels-simple-reg) 56 | - Arthur Turrell's Coding for Economics classes: [Regression](https://aeturrell.github.io/coding-for-economists/econmt-regression.html) -------------------------------------------------------------------------------- /lecture11-feature-engineering/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 11: Feature Engineering 2 | 3 | ## Motivation 4 | 5 | Feature engineering is the part when we take the variables as recorded in the raw data and create (transform) the y and x variables that we’ll include in the model. In general x variables are called features (in predictive analytics) whereas y is often called labels (coming from categorization tasks). 6 | 7 | Specifying the functional form of the variables is a difficult aspect of feature engineering. That includes capturing nonlinear relationships with quantitative variables (quadratic, higher order polynomial, piecewise linear spline, etc), deciding on the number of categories for qualitative variables (joining 8 | rare categories into fewer ones), and deciding on interactions. The emphasis is on getting the best fit without overfitting the data. Domain knowledge is important: knowledge from previous analyses, and/or theory, about what tends to make the outcome different. Domain knowledge can help answer what variables are likely to be more important versus less important, what interactions are likely important, and where should we be most worried about nonlinearity. For instance, professional weather forecasts use computational models that use the laws of physics to relate many variables and feed in measured values of those variables from data. Or, many central banks complement purely data-driven inflation forecasts with predictions from general equilibrium models that are simplified representations of how the economy works. The other source of information is the data itself. 
Exploratory data analysis (EDA) is a key part of all predictive analytics. We do EDA to make sure we understand the content of each variable, to make sure they are as clean as possible, and to understand their distribution. Besides exploring the variables in themselves, we need to investigate the patterns of associations with the y variable. In addition, we may look at how the x variables are correlated with each other, to make sure that we don’t include variables together that are extremely closely related to each other (e.g., that have a correlation coefficient of 0.95) unless we have a very good reason (usually theoretical) to do so. 9 | 10 | This work is tedious and time-consuming. Some of it is unavoidable. We need to know our data: we should never build models with x variables whose content we don’t understand. That’s because we cannot assess, or even think about, the stability of the patterns of association between y and x if we don’t know what those variables are, what their content is, and how they are measured. And assessing stability is necessary for assessing external validity, which is a key aspect of a good model. Thus, we can play around with data and estimate models without knowing what’s in them, but that won’t necessarily help with the true goal of our analysis. 11 | 12 | ## This lecture 13 | 14 | This lecture introduces feature engineering practices and focuses on simple methods used in [Gabor's book](https://gabors-data-analysis.com/) and its [case studies]((https://github.com/gabors-data-analysis/da_case_studies)). It utilizes [wms-management-survey](https://gabors-data-analysis.com/datasets/#wms-management-survey) dataset for manipulation of (multiple) variable(s) into a new one and [bisnode-firms](https://gabors-data-analysis.com/datasets/#bisnode-firms) dataset to show more elaborate techniques such as imputing, nonlinear transformations and winsorizing. 15 | 16 | The lecture (partially) uses the following case studies: 17 | - [Chapter 01, C: Management quality: data collection](https://gabors-data-analysis.com/casestudies/#ch01c-management-quality-data-collection) 18 | - [Chapter 04, A: Management quality and firm size: describing patterns of association](https://gabors-data-analysis.com/casestudies/#ch04a-management-quality-and-firm-size-describing-patterns-of-association) 19 | - [Chapter 08, C: Measurement error in hotel ratings](https://gabors-data-analysis.com/casestudies/#ch08c-measurement-error-in-hotel-ratings) as homework 20 | - [Chapter 17, A: Predicting firm exit: probability and classification](https://gabors-data-analysis.com/casestudies/#ch17a-predicting-firm-exit-probability-and-classification) 21 | 22 | *Note: this is rather an introduction to feature engineering, emphasizing the importance of what kind of (basic) transformations are necessary with the variables. However, the literature rather thinks of feature engineering as a complex, usually machine learning-based method, to create new variables. 
Main applications are converting texts, pictures, videos, web-page content, etc into data-analysis-ready variables.* 23 | 24 | 25 | ## Learning outcomes 26 | After successfully completing [`01_feature_engineering_wms.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture11-feature-engineering/01_feature_engineering_wms.ipynb) and [`02_feature_engineering_bisnode.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture11-feature-engineering/02_feature_engineering_bisnode.ipynb), students should be able: 27 | 28 | - How to create a new variable from multiple already existing variables by calculating the mean or the sum 29 | - Create groups of a categorical variable 30 | - `pycountry_convert` package to get continents and regions 31 | - Create an ordered factor variable 32 | - convert an integer-valued variable to an ordered factor variable 33 | - `cut` to convert a continuous variable into an ordered factor variable 34 | - Create dummy variables from a factor variable with `get_dummies` 35 | - Extra: introduction to principal component analysis with `sklearn.decomposition`'s `PCA` 36 | - Imputing values 37 | - replacing with mean or median 38 | - using outside knowledge (or other variables) 39 | - creating a categorical variable with a specific value for missing 40 | - Adjusting log transformation (to avoid log(0)) 41 | - Using `shift` functions 42 | - Numeric vs factor representation with visualization 43 | - Random sampling with panel data for (faster) visualization 44 | - Winsorizing 45 | 46 | ## Datasets used 47 | 48 | - [wms-management-survey](https://gabors-data-analysis.com/datasets/#wms-management-survey) 49 | - [bisnode-firms](https://gabors-data-analysis.com/datasets/#bisnode-firms) 50 | - [hotels-vienna](https://gabors-data-analysis.com/datasets/#hotels-vienna) as homework. 51 | 52 | ## Lecture Time 53 | 54 | Ideal overall time: **30-50 mins**. 55 | 56 | This lecture is a collection of basic feature engineering techniques used throughout [this Python course](https://github.com/gabors-data-analysis/da-coding-python), [Gabor's book](https://gabors-data-analysis.com/) and its [case studies](https://github.com/gabors-data-analysis/da_case_studies). It can be skipped and one can spend more time in each lecture on the transformations/engineering. However, it is highly useful to see almost all the transformations in one place. 57 | 58 | ## Homework 59 | 60 | *Type*: quick practice, approx 15 mins 61 | 62 | This homework should make students think about other issues with variables, namely measurement error in the explanatory variable. 63 | 64 | Use [hotels-vienna](https://gabors-data-analysis.com/datasets/#hotels-vienna) data from [OSF](https://osf.io/y6jvb/). 65 | 66 | - Filter observations to Hotels with 3-4 stars in Vienna (`city_actual`) and with prices less than 600$ 67 | - Create a new variable: log-price 68 | - Create three sub-samples, where `rating_count` is: 69 | - less than 100 70 | - between 100 and 200 71 | - more than 200 72 | - Run simple linear regressions: `log-price ~ rating` on all of the abovementioned samples 73 | - Plot the three predicted log prices on one plot, with proper formatting and legends 74 | - Argue briefly why the slopes are different. 
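
For orientation, the following is a minimal sketch of how the homework workflow above could be coded. It is only an illustration under assumptions: the local file name `hotels-vienna.csv` is a placeholder for the file downloaded from OSF, and the column names (`city_actual`, `stars`, `price`, `rating`, `rating_count`) are taken from the hotels-vienna codebook.

```python
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

# placeholder path -- use the hotels-vienna file downloaded from OSF
hotels = pd.read_csv("hotels-vienna.csv")

# filter: 3-4 star hotels in Vienna with price below 600 USD
hotels = hotels.loc[
    hotels["city_actual"].eq("Vienna")
    & hotels["stars"].between(3, 4)
    & (hotels["price"] < 600)
].copy()
hotels["lnprice"] = np.log(hotels["price"])

# three sub-samples by the number of ratings
samples = {
    "rating_count < 100": hotels.loc[hotels["rating_count"] < 100],
    "rating_count 100-200": hotels.loc[hotels["rating_count"].between(100, 200)],
    "rating_count > 200": hotels.loc[hotels["rating_count"] > 200],
}

# regress log price on rating in each sub-sample and plot the fitted lines together
fig, ax = plt.subplots()
for label, d in samples.items():
    d = d.dropna(subset=["rating", "lnprice"])
    fit = smf.ols("lnprice ~ rating", data=d).fit(cov_type="HC1")
    grid = pd.DataFrame({"rating": np.linspace(d["rating"].min(), d["rating"].max(), 50)})
    ax.plot(grid["rating"], fit.predict(grid), label=f"{label} (slope = {fit.params['rating']:.2f})")
ax.set_xlabel("Average rating")
ax.set_ylabel("Predicted log price")
ax.legend()
plt.show()
```

Plotting the three fitted lines on one figure makes it straightforward to compare the slopes, which is what the last homework bullet asks you to interpret.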
75 | 76 | 77 | ## Further material 78 | - More materials on the **World-Management Survey case study** can be found in Gabor's *da_case_studies* repository: [ch04-management-firm-size](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch04-management-firm-size) 79 | - More materials on the **Predicting firm exit case study** can be found in Gabor's *da_case_studies* repository: [ch17-predicting-firm-exit](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch17-predicting-firm-exit), especially in the [data preparation file](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch17-predicting-firm-exit/ch17-firm-exit-data-prep.R) 80 | -------------------------------------------------------------------------------- /lecture12-simple-linear-regression/00_life_exp_get_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "0d78786d", 6 | "metadata": {}, 7 | "source": [ 8 | "# Lecture 12\n", 9 | "\n", 10 | "## Getting the data for analysis\n", 11 | " - practice with WDI package \n", 12 | " \n", 13 | "#### Case Study: \n", 14 | " - Life-expectancy and income \n", 15 | "---" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "3851750c", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import pandas as pd\n", 26 | "import numpy as np\n", 27 | "from datetime import date\n", 28 | "import warnings\n", 29 | "\n", 30 | "warnings.filterwarnings(\"ignore\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "id": "6f957733", 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import wbdata" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "209c0911", 46 | "metadata": {}, 47 | "source": [ 48 | "Reminder on how WDI works - it is an API\\\n", 49 | "Search for variables which contains GDP" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "5c9e7543", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "wbdata.search_indicators(\"gdp\")" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "id": "b2ea0e3c", 65 | "metadata": {}, 66 | "source": [ 67 | "Narrow down the serach for: GDP + something + capita + something + constant" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "id": "61e7255c", 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "wbdata.search_indicators(\"gdp.*capita.*constant\")" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "id": "784147fc", 83 | "metadata": {}, 84 | "source": [ 85 | "Get GDP data" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "id": "a0c077b0", 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "gdp_data = wbdata.get_data(\n", 96 | " indicator=\"NY.GDP.PCAP.PP.KD\", country=\"all\", data_date=date(2019, 1, 1)\n", 97 | ")\n", 98 | "gdp_data" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "id": "3c2fd41c", 104 | "metadata": {}, 105 | "source": [ 106 | "### Task: \n", 107 | "\n", 108 | "Get the GDP data, along with `population`, `total` and `life expectancy at birth`\n", 109 | "for year 2019 and save to your data folder!\\\n", 110 | "Note: We have pushed it to Github, we will use that later, just to be on the same page!" 
111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "7b8bc4c3", 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "wbdata.search_indicators('population, total')" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "id": "bd5a71ac", 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "wbdata.search_indicators(\"life expectancy at birth\")" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "id": "4ff5383f", 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "data_raw = wbdata.get_dataframe(\n", 141 | " indicators={\n", 142 | " \"NY.GDP.PCAP.PP.KD\": \"gdppc\",\n", 143 | " \"SP.DYN.LE00.IN\": \"lifeexp\",\n", 144 | " \"SP.POP.TOTL\": \"population\",\n", 145 | " },\n", 146 | " country=\"all\",\n", 147 | " data_date=date(2019, 1, 1),\n", 148 | ").reset_index()\n", 149 | "\n", 150 | "ISOcodes = pd.DataFrame(wbdata.get_country())[[\"iso2Code\", \"name\"]]" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "id": "5aff6a41", 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "data_raw = (\n", 161 | " data_raw.reset_index(drop=True)\n", 162 | " .merge(ISOcodes, left_on=\"country\", right_on=\"name\", how=\"left\")\n", 163 | " .drop(\"name\", axis=1)\n", 164 | ")" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "id": "4453a51a", 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "data_raw.to_csv(\"data/WDI_lifeexp_raw.csv\", index = False)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "id": "b3dddfba", 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [] 184 | } 185 | ], 186 | "metadata": { 187 | "kernelspec": { 188 | "display_name": "Python 3 (ipykernel)", 189 | "language": "python", 190 | "name": "python3" 191 | }, 192 | "language_info": { 193 | "codemirror_mode": { 194 | "name": "ipython", 195 | "version": 3 196 | }, 197 | "file_extension": ".py", 198 | "mimetype": "text/x-python", 199 | "name": "python", 200 | "nbconvert_exporter": "python", 201 | "pygments_lexer": "ipython3", 202 | "version": "3.8.10" 203 | } 204 | }, 205 | "nbformat": 4, 206 | "nbformat_minor": 5 207 | } 208 | -------------------------------------------------------------------------------- /lecture12-simple-linear-regression/01_life_exp_clean.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "c7da3efc", 6 | "metadata": {}, 7 | "source": [ 8 | "# Lecture 12 #\n", 9 | " \n", 10 | "## Auxiliary file to clean data \n", 11 | " - can practice, but not mandatory \n", 12 | " \n", 13 | "#### Case Study: \n", 14 | "- life-expectancy and income \n", 15 | "---" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "469640e2", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import pandas as pd\n", 26 | "import numpy as np\n", 27 | "import warnings\n", 28 | "\n", 29 | "warnings.filterwarnings(\"ignore\")" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "id": "0e7ed69e", 35 | "metadata": {}, 36 | "source": [ 37 | "Call the data from github" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "id": "172b3be5", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "my_url = 
\"https://raw.githubusercontent.com/gabors-data-analysis/da-coding-python/main/lecture12-simple-linear-regression/data/WDI_lifeexp_raw.csv\"\n", 48 | "#df = pd.read_csv(my_url)\n", 49 | "df = pd.read_csv(\"data/WDI_lifeexp_raw.csv\")\n", 50 | "df.loc[lambda x: x[\"country\"] == \"Namibia\", \"iso2Code\"] = \"NA\"\n", 51 | "df = df.dropna(subset=[\"iso2Code\"])" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "id": "49f14043", 57 | "metadata": {}, 58 | "source": [ 59 | "Check the observations:\n", 60 | "\n", 61 | "Lot of grouping observations usually contains a number" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "id": "3cc311ab", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "d1 = df.loc[~df[\"iso2Code\"].str.isalpha()]\n", 72 | "d1" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "93901b50", 78 | "metadata": {}, 79 | "source": [ 80 | "Filter these out" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "id": "ad8aba60", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "df = df.loc[df[\"iso2Code\"].str.isalpha()]" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "bf36bf49", 96 | "metadata": {}, 97 | "source": [ 98 | " Some grouping observations are still there, check each of them\\\n", 99 | " HK - Hong Kong, China\\\n", 100 | " OE - OECD members\\\n", 101 | " all with starting X, except XK which is Kosovo\\\n", 102 | " all with starting Z, except ZA-South Africa, ZM-Zambia and ZW-Zimbabwe\\" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "id": "c8295167", 108 | "metadata": {}, 109 | "source": [ 110 | "1st drop speficif values" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "70e6a09a", 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "df.loc[lambda x: x[\"iso2Code\"].isin([\"EU\",\"HK\",\"OE\"])]" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "id": "f2a58b7c", 126 | "metadata": {}, 127 | "source": [ 128 | "Save opposite" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "id": "7831ca31", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "df = df.loc[lambda x: ~x[\"iso2Code\"].isin([\"EU\",\"HK\",\"OE\"])]" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "id": "fdac841f", 144 | "metadata": {}, 145 | "source": [ 146 | "2nd drop values with certain starting character\\\n", 147 | "Get the first letter from iso2c" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "id": "b919919e", 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "df = df.loc[\n", 158 | " (~((df[\"iso2Code\"].str[0] == \"X\") | (df[\"iso2Code\"].str[0] == \"Z\")))\n", 159 | " | (df[\"iso2Code\"].isin([\"XK\", \"ZA\", \"ZM\", \"ZW\"]))\n", 160 | "]" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "id": "339c7488", 166 | "metadata": {}, 167 | "source": [ 168 | "Check for missing observations" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "id": "915a69e7", 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "df = df.dropna(subset=['gdppc', 'lifeexp', 'population'])" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "id": "143402d8", 184 | "metadata": {}, 185 | "source": [ 186 | "### Clean variables" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "id": "4cd7c273", 192 | 
"metadata": {}, 193 | "source": [ 194 | "Rename variables and scale them\\\n", 195 | "Drop all the others !! in this case write into readme it is referring to year 2018!!" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "id": "70459d65", 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "df[\"population\"] = df[\"population\"] / 10**6\n", 206 | "df[\"gdppc\"] = df[\"gdppc\"] / 10**3" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "id": "5e3754b9", 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "df.filter([ \"population\", \"gdppc\", \"lifeexp\"]).hist()" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "id": "afd3b4d0", 222 | "metadata": {}, 223 | "source": [ 224 | "It seems we have a large value(s) for population:" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "id": "fc064c21", 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "df.loc[df[\"population\"]>500]" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "id": "d4a01758", 240 | "metadata": {}, 241 | "source": [ 242 | "These are India and China... not an extreme value" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "id": "b325c8e0", 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "df.describe()" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "id": "5c027e67", 258 | "metadata": {}, 259 | "source": [ 260 | "Save the raw data file for your working directory" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "id": "189d5e97", 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "df.to_csv(\"data/WDI_lifeexp_clean.csv\",index=False)" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "id": "f8b21d77", 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [] 280 | } 281 | ], 282 | "metadata": { 283 | "kernelspec": { 284 | "display_name": "Python 3 (ipykernel)", 285 | "language": "python", 286 | "name": "python3" 287 | }, 288 | "language_info": { 289 | "codemirror_mode": { 290 | "name": "ipython", 291 | "version": 3 292 | }, 293 | "file_extension": ".py", 294 | "mimetype": "text/x-python", 295 | "name": "python", 296 | "nbconvert_exporter": "python", 297 | "pygments_lexer": "ipython3", 298 | "version": "3.8.10" 299 | } 300 | }, 301 | "nbformat": 4, 302 | "nbformat_minor": 5 303 | } 304 | -------------------------------------------------------------------------------- /lecture12-simple-linear-regression/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 12: Analysis of life expectancy and GDP 2 | 3 | ## Motivation 4 | 5 | Life expectancy at birth shows how long residents of a country live; it is a summary measure of their health. Residents of richer countries tend to live longer, but you want to know the strength of that pattern. You also want to identify countries where people live especially long for the income level of their country, to start thinking about what may cause their exceptional health. You download cross-country data from the World Bank database on life expectancy and GDP per capita, and you want to uncover the pattern of association between them. How would you do that in a way that accommodates potentially nonlinear patterns and, at the same time, produces results that you can interpret? 
6 | 7 | Linear regression gives a meaningful approximation to the patterns of association, but real-life data can be messy, and the patterns may be nonlinear. What those mean for regression analysis and what we can do about them is important to understand. There are several tools that we can apply to make linear regression approximate nonlinear patterns of association, but whether we want to do so depends on the goal of the analysis. The fact that real-life data tends to be messy, with errors and extreme values, poses other challenges for regression analysis. 8 | 9 | ## This lecture 10 | 11 | This lecture provides materials to analyze the association between life expectancy and GDP measures for various countries in 2019 (or later), inspired by the dataset [worldbank-lifeexpectancy](https://gabors-data-analysis.com/datasets/#worldbank-lifeexpectancy). During this exercise, students get familiar with creating simple linear regression-based models with different transformations, such as level-level, log-level, level-log, and log-log models, or using polynomial and piecewise linear spline transformations of the explanatory variable. 12 | 13 | This lecture is a practice (or close to live coding) lecture: it does not teach much new material, but it lets students deepen their understanding of simple regressions and the reasoning behind them. 14 | 15 | This lecture is based on [Chapter 08, B: How is life expectancy related to the average income of a country?](https://gabors-data-analysis.com/casestudies/#ch08b-how-is-life-expectancy-related-to-the-average-income-of-a-country) 16 | 17 | ## Learning outcomes 18 | After successfully completing the notebooks, students should have: 19 | 20 | [`life_exp_get_data.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture12-simple-linear-regression/life_exp_get_data.ipynb) 21 | - Solid ground for importing and exporting data from the World Bank's website via its API. 22 | 23 | [`life_exp_analysis.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture12-simple-linear-regression/life_exp_analysis.ipynb) 24 | - Create scatter plots for competing models. 25 | - Transform variables from level to log in a ggplot and scale the axis for proper interpretation. 26 | - Run and plot multiple single-variable regressions with: 27 | - log transformation, 28 | - higher-order polynomial, 29 | - piecewise linear spline, 30 | - or weighted OLS. 31 | - Be able to estimate heteroskedasticity-robust SEs and compare specific model results with `stargazer` in one output. 32 | - Create a graph which automatically annotates the observations with the *n* largest and smallest errors. 33 | 34 | 35 | ## Datasets used 36 | 37 | - [worldbank-lifeexpectancy](https://gabors-data-analysis.com/datasets/#worldbank-lifeexpectancy), but for a more recent year. 38 | 39 | ## Lecture Time 40 | 41 | Ideal overall time: approx 60 minutes. 42 | 43 | Solving [`00_life_exp_get_data.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture12-simple-linear-regression/00_life_exp_get_data.ipynb) takes around *5-10 minutes* as it builds on [lecture03-data-IO](https://github.com/gabors-data-analysis/da-coding-python/tree/main/lecture03-data-IO). In principle it should be a quick reminder and practice.
44 | 45 | Solving [`02_life_exp_analysis.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture12-simple-linear-regression/02_life_exp_analysis.ipynb) covers the main material, and takes *40-60 minutes* depending on the student's background. This lecture is mainly theory-based (practice via case study) and includes easy, but many new commands in a repetitive way. 46 | 47 | ## Homework 48 | 49 | *Type*: quick practice, approx 20 mins 50 | 51 | Use the [hotels-vienna dataset](https://gabors-data-analysis.com/datasets/#hotels-vienna), similarly as we used in [`hotels_intro_to_regression.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture10-intro-to-regression/intro_to_regression.ipynb). Create and compare different models, based on transformations of `y=price` or `x=distance` variables: 52 | 53 | - level-level 54 | - log-level 55 | - level-log 56 | - log-log 57 | - polinomials of distance with square and cube terms 58 | - piecewise-linear-spline model, with a cutoff at 2 miles 59 | 60 | Estimate these models with `statsmodels`, using robust SEs, and compare with `stargazer`. Decide which model would you use and why! Argue! 61 | 62 | ## Further material 63 | 64 | - More materials on the case study can be found in Gabor's *da_case_studies* repository: [ch08-life-expectancy-income](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch08-life-expectancy-income) 65 | - Arthur Turrell's Coding for Economics classes: [Regression](https://aeturrell.github.io/coding-for-economists/econmt-regression.html) -------------------------------------------------------------------------------- /lecture13-advanced-linear-regression/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 13: Advanced Linear Regression 2 | 3 | ## Motivation 4 | 5 | You have analyzed your data on hotel prices in a particular city to find hotels that are underpriced relative to how close they are to the city center. But you have also uncovered differences in terms of other features of the hotels that measure quality and are related to price. How would you use this data to find hotels that are underpriced relative to all of their features? And how can you visualize the distribution of hotel prices relative to what price you would expect for their features in a way that helps identify underpriced hotels? 6 | 7 | After understanding simple linear regression, we can turn to multiple linear regression, which has more than one explanatory variable. Multiple linear regression is the most used method to uncover patterns of associations between variables. There are multiple reasons to include more explanatory variables in a regression. We may be interested in uncovering patterns of association between y and other explanatory variables, which may help us understand differences in terms of the x variable we are interested in most. Or, we may be interested in the effect of an x variable, but we want to compare observations that are different in x but similar in other variables. Finally, we may want to predict y, and we want to use more x variables to arrive at better predictions. 8 | 9 | We discuss why and when we should estimate multiple regression, how to interpret its coefficients, and how to construct and interpret confidence intervals and test the coefficients. We discuss the relationship between multiple regression and simple regression. 
We explain that piecewise linear splines and polynomial regressions are technically multiple linear regressions without the same interpretation of the coefficients. We include an informal discussion on how to decide what explanatory variables to include and in what functional form. 10 | 11 | Finally, we want to generalize the results of a regression from the data we are analyzing to a decision situation we care about. We can use methods to quantify the uncertainty brought about by generalizing to the general pattern represented by the data (statistical inference), and we can have guidelines to assess whether the general pattern represented by the data is likely close to the general pattern behind the situation we care about (external validity). 12 | 13 | ## This lecture 14 | 15 | This lecture introduces multiple variable regressions via [hotels-europe](https://gabors-data-analysis.com/datasets/#hotels-europe) dataset. It introduces topics on 16 | 17 | - how to choose a model from many possible candidates based on R2, 18 | - how to evaluate prediction with multiple regressors: 19 | - different graphs prediction uncertainty, and 20 | - calculate the confidence and prediction intervals. 21 | 22 | Moreover, it covers external validity with robustness test: checking model results in different time/location/type of observations. Finally, as an extra part, it shows a simple example of using a training and test sample to better understand the process of model choice and the limitation of R2. 23 | 24 | This lecture is based on 25 | - [Chapter 09, B: How stable is the hotel price–distance to the center relationship?](https://gabors-data-analysis.com/casestudies/#ch09b-how-stable-is-the-hotel-pricedistance-to-center-relationship) 26 | - [Chapter 10, B: Finding a good deal among hotels with multiple regression](https://gabors-data-analysis.com/casestudies/#ch10b-finding-a-good-deal-among-hotels-with-multiple-regression) 27 | 28 | 29 | ## Learning outcomes 30 | After successfully completing [`hotels_advanced_regression.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture13-advanced-linear-regression/hotels_advanced_regression.ipynb), students should be able to: 31 | 32 | - Visualize multiple explanatory variables with the outcome: 33 | - With a scatter plot decide the functional form which is needed. 34 | - Multiple linear regression 35 | - Use `statsmodels` to estimate regressions with multiple explanatory variables 36 | - Use `stargazer` to compare multiple candidate models and report model statistics such as R2 to evaluate models. 37 | - Analysing model prediction 38 | - Get model predictions and residuals and in case of a log-transformed outcome, how to convert the predictions and residuals into level. 39 | - y-yhat scatter plot with 45-degree line to evaluate prediction uncertainty 40 | - residual-yhat or residual-explanatory variable scatter plot to evaluate model performance along different dimensions 41 | - Confidence and Prediction interval 42 | - Using `get_prediction` function to get confidence and prediction interval 43 | - Set the significance level for the intervals with `alpha` input argument 44 | - Convert log-transformed outcome confidence and/or prediction intervals into level. Limitations. 45 | - External Validity: robustness checks 46 | - Estimate a selected model with different data to assess model uncertainty 47 | - Using different time periods, locations, and types of hotels/apartments. 
48 | - Compare these models to the original and evaluate external validity 49 | - Extra: 50 | - Split the original sample into training and test samples 51 | - Use the training sample to estimate the model and the test sample to predict hotel prices 52 | - Evaluate which model performs better with the RMSE measure. 53 | 54 | ## Dataset used 55 | 56 | - [hotels-europe](https://gabors-data-analysis.com/datasets/#hotels-europe) 57 | 58 | ## Lecture Time 59 | 60 | Ideal overall time: **100 mins**. 61 | 62 | Going through [`hotels_advanced_regression.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture13-advanced-linear-regression/hotels_advanced_regression.ipynb) takes around *70-80 minutes*. There are many discussions and interpretations of the models, which are similarly important. Solving the tasks takes the remaining *20-30 minutes*. 63 | 64 | 65 | ## Homework 66 | 67 | *Type*: quick practice, approx 20 mins 68 | 69 | Choose a city other than Vienna and make sure you have **at least 100 observations after filtering**. Create at least 3 models with at least 3 explanatory variables (check whether they need transformation) and choose the best one. Imagine you can build a new hotel in your city and can specify its feature values as you wish. Predict the price and estimate confidence and prediction intervals at a 90% confidence level. Set the price of your hotel and argue why this is your choice. 70 | 71 | 72 | ## Further material 73 | 74 | - More materials on the case study can be found in Gabor's *da_case_studies* repository: [ch10-hotels-multiple-reg](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch10-hotels-multiple-reg) on multiple regressions, while [ch09-hotels-europe-stability](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch09-hotels-europe-stability) discusses external validity. 75 | - Arthur Turrell's Coding for Economics classes: [Regression](https://aeturrell.github.io/coding-for-economists/econmt-regression.html) -------------------------------------------------------------------------------- /lecture14-binary-models/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 14: Binary outcome - modeling probabilities 2 | 3 | ## Motivation 4 | 5 | Does smoking make you sick? And can smoking make you sick in late middle age even if you stopped years earlier? You have data on many healthy people in their fifties from various countries, and you know whether they stayed healthy four years later. You have variables on their smoking habits, age, income, and many other characteristics. How can you use this data to estimate how much more likely non-smokers are to stay healthy? How can you uncover if that depends on whether they never smoked or are former smokers? And how can you tell if that association is the result of smoking itself or, instead, underlying differences in smoking by education, income, and other factors? 6 | 7 | The lecture is related to the chapter that discusses probability models: regressions with binary y variables. In a sense, we can treat a binary y variable just like any other variable and use regression analysis as we would otherwise. However, with a binary y variable, we can estimate nonlinear probability models instead of the linear ones. Data analysts need to have a good understanding of when to use these different probability models, and how to interpret and evaluate their results.
8 | 9 | ## This lecture 10 | 11 | This lecture introduces binary outcome models with an analysis of health outcomes with multiple variables based on the [share-health](https://gabors-data-analysis.com/datasets/#share-health) dataset. First, we introduce saturated models (smoking on health) and linear probability models with multiple explanatory variables. We check the predicted outcome probabilities for certain groups. Then we focus on non-linear binary models: the logit and probit model. We estimate marginal effects to interpret the average (marginal) effects of variables on the outcome probabilities. We overview goodness-of-fit statistics (R2, Pseudo-R2, Brier score, and log-loss) along with visual and descriptive inspection of the predicted probabilities. Finally, we calculate the estimated bias and the calibration curve to better understand model performance. 12 | 13 | This lecture is based on [Chapter 11, A: Does smoking pose a health risk?](https://gabors-data-analysis.com/casestudies/#ch11a-does-smoking-pose-a-health-risk) 14 | 15 | ## Learning outcomes 16 | After successfully completing codes in [`binary_models.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture14-binary-models/binary_models.ipynb), students should be able to: 17 | 18 | 19 | - Calculate by hand or estimate saturated models 20 | - Visualize and understand binary outcome scatter plots 21 | - Estimate Linear Probability Models (LPM) 22 | - Use `statsmodels` to estimate regressions with multiple explanatory variables 23 | - Use `stargazer` to compare multiple candidate models and report model statistics such as R2 to evaluate models. 24 | - Understand the limitations of LPM 25 | - Carry out sub-group analysis based on predicted probabilities 26 | - Estimate Non-Linear Probability Models 27 | - Use `statsmodels` to estimate logit or probit models 28 | - Estimate marginal effects with the `get_margeff` method 29 | - Use `statsmodels` to compare logit and probit coefficients 30 | - Compare LPM, logit/probit, and logit/probit with marginal effects 31 | - Get relevant goodness-of-fit measures 32 | - Understand the usefulness of comparing the distribution of predicted probabilities for different models 33 | - Understand the usefulness of comparing descriptive statistics of the predicted probabilities for different models 34 | - Calculate the bias of the model along with the calibration curve 35 | 36 | ## Datasets used 37 | 38 | - [share-health](https://gabors-data-analysis.com/datasets/#share-health) 39 | 40 | ## Lecture Time 41 | 42 | Ideal overall time: **100 mins**. 43 | 44 | Going through [`binary_models.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture14-binary-models/binary_models.ipynb) takes around *80-90 minutes* as there are many discussions and interpretations of the models. Solving the tasks takes the remaining *10-20 minutes*. 45 | 46 | 47 | ## Homework 48 | 49 | *Type*: quick practice, approx 20 mins 50 | 51 | Use the same [share-health](https://gabors-data-analysis.com/datasets/#share-health) dataset, but now use `smoking` as your outcome variable, as this task asks you to predict whether a person is a smoker or not. Use similar variables, except `stayshealthy`, to explain `smoking`. Run LPM, logit, and probit models. Compare the coefficients of these models along with the average marginal effects. Compute the goodness-of-fit statistics (R2, Pseudo-R2, Brier score, log-loss) for all of the models.
Choose one, calculate the bias, and plot the calibration curve. 52 | 53 | 54 | ## Further material 55 | 56 | - More materials on the case study can be found in Gabor's *da_case_studies* repository: [ch11-smoking-health-risk](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch11-smoking-health-risk) 57 | 58 | -------------------------------------------------------------------------------- /lecture15-datetime/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 15: Date and time manipulations 2 | 3 | ## Motivation 4 | 5 | Time series data is often used to analyze business, economic, and policy questions. Time series data presents additional opportunities as well as additional challenges for regression analysis. Unlike cross-sectional data, it enables examining how y changes when x changes, and it also allows us to examine what happens to y right away or with a delay. However, variables in time series data come with some special features that affect how we should estimate regressions, and how we can interpret their coefficients. 6 | 7 | One of these differences is the frequency of the time series. It can vary from seconds to years. Time series with more frequent observations have higher frequency, e.g. monthly frequency is higher than yearly frequency, but it is lower than daily frequency. The frequency may also be irregular with gaps in-between. Gaps in time series data can be viewed as missing values of variables. But they tend to have specific causes. To run a regression of y on x in time series data, the two variables need to be at the same time series frequency. When the time series frequencies of y and x are different, we need to adjust one of them. Most often that means aggregating the variable at a higher frequency (e.g., from weekly to monthly). With flow variables, such as sales, aggregation means adding up; with stock variables and other kinds of variables, such as prices, it is often taking an average for the period or taking the last value, such as the closing price. 8 | 9 | Another fundamental feature of time series data is that variables evolve with time. They may hover around a stable average value, or they may drift upwards or downwards. A variable in time series data follows a trend if it tends to change in one direction; in other words, it has a tendency to increase or decrease. Another possible issue is seasonality. Seasonality means that the value of the variable is expected to follow a cyclical pattern, tracking the seasons of the year, days of the week, or hours of the day. Because of such systematic changes, later observations tend to be different from earlier observations. Understanding trends and seasonality is important because they make regression analysis challenging. They are examples of a broader concept, non-stationarity. Stationarity means stability; non-stationarity means the lack of stability. Stationary time series variables have the same expected 10 | value and the same distribution at all times. Trends and seasonality violate stationarity because the expected value is different at different times. 11 | 12 | ## This lecture 13 | 14 | This lecture introduces basic date and time-variable manipulations. The first part starts with the basics using `datetime` package by overviewing basic time-related functions and manipulations with time-related values and variables. 
The second part discusses time-series data aggregation from different frequencies along with visualization for time-series data and unit root tests. 15 | 16 | This lecture utilizes the case study of [Chapter 12, A: Returns on a company stock and market returns](https://gabors-data-analysis.com/casestudies/#ch12a-returns-on-a-company-stock-and-market-returns) as homework, and uses [`stocks-sp500`](https://gabors-data-analysis.com/datasets/#stocks-sp500) dataset. 17 | 18 | ## Learning outcomes 19 | After successfully completing [`01_datetime_basics.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture15-datetime/01_datetime_basics.ipynb) and [`02_datetime_manipulations.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture15-datetime/2_datetime_manipulations.ipynb), students should be: 20 | 21 | - Familiar with the `datetime` package, especially with 22 | - creating specific time variables, converting other types of variables into a date or datetime object 23 | - understand the importance of time zones 24 | - Get specific parts of a date object such as `year, quarter, month, day, hour etc.` 25 | - Understand the difference between duration and periods 26 | - Carry out time aggregation 27 | - Aggregate different time series objects to lower frequencies, using mean/median/max/end date, etc. 28 | - Adding `lag`-ged and differenced variables to data 29 | - Visualize time series with 30 | - handle time variable on x-axis with `scale_x_date()` 31 | - `facet_wrap` to stack multiple graphs 32 | - standardize variables and put multiple lines into one graph 33 | - Unit root tests using `arch` package's `PhillipsPerron` function 34 | - understanding the result of the Philip-Perron test and deciding if the variable needs to be differenced or not. 35 | 36 | ## Datasets used 37 | 38 | - [`stocks-sp500`](https://gabors-data-analysis.com/datasets/#stocks-sp500) 39 | 40 | ## Lecture Time 41 | 42 | Ideal overall time: **35-40 mins**. 43 | 44 | Going through the notebooks takes around *30 minutes*. There are some discussions and interpretations of the time series (e.g. stationarity). Solving the tasks takes the remaining *5-10 minutes*. The lecture can be shortened by only showing the methods. It will be partially repeated in [lecture16-timeseries-regression](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture16-timeseries-regression). 45 | 46 | 47 | ## Homework 48 | 49 | *Type*: quick practice, approx 10 mins 50 | 51 | Estimate the *beta* coefficient between quarterly SP500 log returns on Microsoft stocks log return. Use the [`stocks-sp500`](https://gabors-data-analysis.com/datasets/#stocks-sp500) dataset. Take care when aggregating the data to a) use the last day in the quarter and then take the logs and then difference the variable to get log returns. When estimating the regression use heteroskedastic robust standard error (next lecture we learn how to use Newey-West SE). 
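A minimal sketch of this homework is below, assuming a daily CSV with a `date` column and closing-price columns named `SP500` and `MSFT`; these names and the file path are placeholders, so adjust them to the actual layout of the stocks-sp500 data.

```python
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# placeholder file and column names -- adjust to the real stocks-sp500 layout
prices = pd.read_csv("stocks-sp500.csv", parse_dates=["date"]).set_index("date").sort_index()

# aggregate to quarterly frequency using the last closing price in each quarter
q = prices[["SP500", "MSFT"]].resample("Q").last()

# take logs first, then difference to get quarterly log returns
q["sp500_ret"] = np.log(q["SP500"]).diff()
q["msft_ret"] = np.log(q["MSFT"]).diff()
q = q.dropna()

# beta: slope of Microsoft log returns on market log returns,
# with heteroskedasticity-robust (HC1) standard errors
beta_reg = smf.ols("msft_ret ~ sp500_ret", data=q).fit(cov_type="HC1")
print(beta_reg.summary())
```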
52 | 53 | 54 | ## Further material 55 | 56 | - More materials on the case study can be found in Gabor's *da_case_studies* repository: [ch12-stock-returns-risk](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch12-stock-returns-risk/ch12-stock-returns-risk.R) 57 | - Arthur Turrell's Coding for Economics classes: [Time Series](https://aeturrell.github.io/coding-for-economists/time-series.html) 58 | -------------------------------------------------------------------------------- /lecture16-timeseries-regression/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 16: Introduction to time-series regression 2 | 3 | ## Motivation 4 | 5 | Heating and cooling are potentially important uses of electricity. To investigate how weather conditions affect electricity consumption, you have collected data on temperature and residential electricity consumption in a hot region. How should you estimate the association between temperature and electricity consumption? How should you define the variables of interest, and how should you prepare the data, which has daily observations on temperature and monthly observations on electricity consumption? Should you worry about the fact that both electricity consumption and temperature vary a lot across months within years, and if yes, what should you do about it? 6 | 7 | Time series data is often used to analyze business, economic, and policy questions. Time series data presents additional opportunities as well as additional challenges for regression analysis. Unlike cross-sectional data, it enables examining how y changes when x changes, and it also allows us to examine what happens to y right away or with a delay. However, variables in time series data come with some special features that affect how we should estimate regressions, and how we can interpret their coefficients. 8 | 9 | ## This lecture 10 | 11 | This lecture introduces time-series regression via the [arizona-electricity](https://gabors-data-analysis.com/datasets/#arizona-electricity) dataset. During this lecture, students manipulate time-series data along time dimensions, create multiple time-series related graphs and get familiar with (partial) autocorrelation. Differenced variables, lags of the outcome, and lags of the explanatory variables, (deterministic) seasonality are used during regression models. Estimating these models are via `statsmodels`'s `get_robustcov_results` with Newey-West standard errors. Model comparisons and estimating cumulative effects with valid SEs are shown. 
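As a minimal illustration of the Newey-West step mentioned above, the snippet below uses a tiny synthetic monthly dataset so that it runs on its own; in the lecture the same calls are applied to the transformed electricity and temperature series.

```python
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# synthetic stand-in data, only to keep the example self-contained
rng = np.random.default_rng(0)
df = pd.DataFrame({"d_temperature": rng.normal(size=120)})
df["d_consumption"] = 0.5 * df["d_temperature"] + rng.normal(size=120)

# plain OLS fit first ...
ols_fit = smf.ols("d_consumption ~ d_temperature", data=df).fit()

# ... then Newey-West (HAC) standard errors, which allow for serial correlation;
# maxlags sets how many lags of autocorrelation are accounted for
nw_fit = ols_fit.get_robustcov_results(cov_type="HAC", maxlags=12)
print(nw_fit.summary())

# equivalent shortcut: request HAC errors directly when fitting
nw_fit2 = smf.ols("d_consumption ~ d_temperature", data=df).fit(
    cov_type="HAC", cov_kwds={"maxlags": 12}
)
```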
12 | 13 | This lecture is based on [Chapter 12, B: Electricity consumption and temperature](https://gabors-data-analysis.com/casestudies/#ch12b-electricity-consumption-and-temperature) 14 | 15 | ## Learning outcomes 16 | After successfully completing [`intro_time_series.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture16-timeseries-regression/intro_time_series.ipynb), students should be able: 17 | 18 | - Merge different time-series data 19 | - Create time-series related descriptives and graphs 20 | - handle date as the axis with different formatting 21 | - create autocorrelation and partial autocorrelation graphs and interpret 22 | - Run time-series regression 23 | - Estimate Newey-West standard errors and understand the role of lags 24 | - Control for seasonality via dummies 25 | - Add lagged variables to the model (and possibly leads as well) 26 | - How and why to use the same time interval when comparing competing time-series models 27 | - Estimate the standard error(s) for the cumulative effect 28 | 29 | ## Datasets used 30 | 31 | - [arizona-electricity](https://gabors-data-analysis.com/datasets/#arizona-electricity) 32 | 33 | ## Lecture Time 34 | 35 | Ideal overall time: **60-80 mins**. 36 | 37 | Going through [`intro_time_series.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture16-timeseries-regression/intro_time_series.ipynb) takes around *50-70 minutes* as there are some discussions and interpretations of the time series (e.g. stationarity, a transformation of variables, etc). Solving the tasks takes the remaining *5-10 minutes*. 38 | 39 | 40 | ## Homework 41 | 42 | *Type*: quick practice, approx 20 mins 43 | 44 | You will use the [case-shiller-la](https://gabors-data-analysis.com/datasets/#case-shiller-la) dataset to build a model for unemployment based on the Shiller price index. Load the data and consider only `pn` (Shiller price index) and `un` (unemployment) as the variables of interest. Both are seasonally adjusted. Decide which transformation to use to make the variables stationary. Create models, where you predict unemployment based on the Shiller price index. At least you should have one model where you use only the contemporaneous effects and one when you use lagged variables for both variables as explanatory variables. 
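A minimal sketch of one way to set this homework up is below; it assumes a monthly CSV with columns `date`, `pn`, and `un`, and the file name is a placeholder.

```python
import pandas as pd
import statsmodels.formula.api as smf

# placeholder file name; 'pn' is the Case-Shiller price index, 'un' is unemployment
cs = pd.read_csv("case-shiller-la.csv", parse_dates=["date"]).sort_values("date")

# first-difference both series to move towards stationarity
cs["d_un"] = cs["un"].diff()
cs["d_pn"] = cs["pn"].diff()

# add a couple of lags of both differenced variables with shift()
for lag in (1, 2):
    cs[f"d_pn_l{lag}"] = cs["d_pn"].shift(lag)
    cs[f"d_un_l{lag}"] = cs["d_un"].shift(lag)
cs = cs.dropna()

# contemporaneous-only model vs a model with lagged terms, both with Newey-West SEs
m1 = smf.ols("d_un ~ d_pn", data=cs).fit(cov_type="HAC", cov_kwds={"maxlags": 12})
m2 = smf.ols(
    "d_un ~ d_pn + d_pn_l1 + d_pn_l2 + d_un_l1 + d_un_l2", data=cs
).fit(cov_type="HAC", cov_kwds={"maxlags": 12})
print(m1.summary())
print(m2.summary())
```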
45 | 46 | 47 | ## Further material 48 | 49 | - More materials on the case study can be found in Gabor's *da_case_studies* repository: [ch12-electricity-temperature](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch12-electricity-temperature) 50 | - Arthur Turrell's Coding for Economics classes: [Time Series](https://aeturrell.github.io/coding-for-economists/time-series.html), [Forecasting](https://aeturrell.github.io/coding-for-economists/time-fcasts-env.html) 51 | -------------------------------------------------------------------------------- /lecture17-basic-spatial-viz/03_spatial_datavisualisation_plotly.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "305165ae", 6 | "metadata": {}, 7 | "source": [ 8 | "# Lecture 20 – part II \n", 9 | " \n", 10 | "## Basic spatial data visualization \n", 11 | " - Hotels-Europe \n", 12 | " - Create maps with `plotly` \n", 13 | "\n", 14 | "\n", 15 | "Case-studies:\n", 16 | "\n", 17 | " - Ch03B Comparing hotel prices in Europe: Vienna vs London \n", 18 | " \n", 19 | "Data used:\n", 20 | "\n", 21 | " hotels-europe \n", 22 | "\n", 23 | "___" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "id": "d8106332", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import json\n", 34 | "import geopandas as gpd\n", 35 | "import numpy as np\n", 36 | "import warnings\n", 37 | "import pandas as pd\n", 38 | "import plotly.express as px\n", 39 | "\n", 40 | "%matplotlib inline\n", 41 | "warnings.filterwarnings(\"ignore\")" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "id": "6fe1f494", 47 | "metadata": {}, 48 | "source": [ 49 | "Read Vienna data" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "08a5b712", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "vienna_map = pd.read_csv(\"data_map/vienna.csv\")" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "id": "a23efa4e", 65 | "metadata": {}, 66 | "source": [ 67 | "Convert pandas dataframe to geopandas dataframe" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "id": "51ba68e8", 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "vienna_map = gpd.GeoDataFrame(\n", 78 | " vienna_map.loc[:, [c for c in vienna_map.columns if c != \"geometry\"]],\n", 79 | " geometry=gpd.GeoSeries.from_wkt(vienna_map[\"geometry\"]),\n", 80 | " crs=\"epsg:3005\",\n", 81 | ")" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "id": "1752aeb6", 87 | "metadata": {}, 88 | "source": [ 89 | "Create a geojson object" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "id": "6dd2e16d", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "vmap = json.loads(vienna_map.to_json())" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "id": "0b4bba31", 105 | "metadata": {}, 106 | "source": [ 107 | "Create figure with plotly" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "id": "f6d6cfe2", 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "fig = px.choropleth_mapbox(\n", 118 | " vienna_map,\n", 119 | " geojson=vmap,\n", 120 | " locations=\"district\",\n", 121 | " color=\"price\",\n", 122 | " title=\"Average hotel prices in Vienna ($,2017)\",\n", 123 | " color_continuous_scale=\"viridis\",\n", 124 | " featureidkey=\"properties.district\", # featureidkey 
connects the original geopandas dataframe (vienna_map) to the geojson object (vmap)\n", 125 | " mapbox_style=\"carto-positron\",\n", 126 | " zoom=10,\n", 127 | " center={\"lat\": 48.210033, \"lon\": 16.363449},\n", 128 | " opacity=0.5,\n", 129 | ")\n", 130 | "fig.update_layout(margin={\"r\": 0, \"l\": 0, \"b\": 0})\n", 131 | "fig.show()" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "id": "d0a350c6", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "def generateColorScale(colors, naColor):\n", 142 | " colorArray = []\n", 143 | " colorArray.append([0, naColor])\n", 144 | " for grenze, color in zip(np.linspace(0.7, 1, len(colors)), colors):\n", 145 | " colorArray.append([grenze, color])\n", 146 | " return colorArray" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "7b00b60d", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "fig = px.choropleth_mapbox(\n", 157 | " vienna_map.fillna(0),\n", 158 | " geojson=vmap,\n", 159 | " locations=\"district\",\n", 160 | " color=\"price\",\n", 161 | " title=\"Average hotel prices in Vienna ($,2017)\",\n", 162 | " color_continuous_scale=generateColorScale(colors=[\"red\", \"yellow\"], naColor=\"gray\"),\n", 163 | " featureidkey=\"properties.district\",\n", 164 | " mapbox_style=\"carto-positron\",\n", 165 | " zoom=10,\n", 166 | " center={\"lat\": 48.210033, \"lon\": 16.363449},\n", 167 | " opacity=0.5,\n", 168 | ")\n", 169 | "fig.update_layout(margin={\"r\": 0, \"l\": 0, \"b\": 0})\n", 170 | "fig.show()" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "id": "4e706c25", 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [] 180 | } 181 | ], 182 | "metadata": { 183 | "kernelspec": { 184 | "display_name": "Python 3 (ipykernel)", 185 | "language": "python", 186 | "name": "python3" 187 | }, 188 | "language_info": { 189 | "codemirror_mode": { 190 | "name": "ipython", 191 | "version": 3 192 | }, 193 | "file_extension": ".py", 194 | "mimetype": "text/x-python", 195 | "name": "python", 196 | "nbconvert_exporter": "python", 197 | "pygments_lexer": "ipython3", 198 | "version": "3.8.10" 199 | } 200 | }, 201 | "nbformat": 4, 202 | "nbformat_minor": 5 203 | } 204 | -------------------------------------------------------------------------------- /lecture17-basic-spatial-viz/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 17: Spatial data visualization 2 | 3 | ## Motivation 4 | 5 | Visualizing data spatially can allow us to make insights as to what is going on beyond our bubble. Aside from being great visuals that immediately engage audiences, map data visualizations provide a critical context for the metrics. Combining geospatial information with data creates a greater scope of understanding. Some benefits of using maps in your data visualization include: 6 | 7 | 1. A greater ability to more easily understand the distribution of your variable across the city, state, country, or world. 8 | 2. The ability to compare the activity across several locations at a glance 9 | 3. More intuitive decision making for company leaders 10 | 4. Contextualizing your data in the real world 11 | 12 | 13 | There is lots of room for creativity when making map dashboards because there are numerous ways to convey information with this kind of visualization. We map geographical regions colored, shaded, or graded according to some variable. 
They are visually striking, especially when the spatial units of the map are familiar entities. 14 | 15 | | Life expectancy map | Hotel prices in cities | 16 | |-------------------------|-------------------------| 17 | | ![alt text 1](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture17-basic-spatial-viz/output/lifeexp.png) | ![alt text 2](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture17-basic-spatial-viz/output/heu_prices.png) | 18 | 19 | 20 | ## This lecture 21 | 22 | This lecture introduces spatial data visualization using maps. During the lecture, students learn how to use the `maps` package which offers built-in maps with the [worldbank-lifeexpectancy](https://gabors-data-analysis.com/datasets/#worldbank-lifeexpectancy) data. Plotting the raw life expectancy at birth on a world map is already a powerful tool, but students will learn how to show deviance from the expected value given by the regression model. In the second part, students import raw `shp` files with auxiliary files, which contain the map of London boroughs and Vienna districts. With the [hotels-europe](https://gabors-data-analysis.com/datasets/#hotels-europe) dataset the average price for each unit on the map is shown. 23 | 24 | Case studies used during the lecture: 25 | - [Chapter 08, B: How is life expectancy related to the average income of a country?](https://gabors-data-analysis.com/casestudies/#ch08b-how-is-life-expectancy-related-to-the-average-income-of-a-country) 26 | - [Chapter 03, B: Comparing hotel prices in Europe: Vienna vs London](https://gabors-data-analysis.com/casestudies/#ch03b-comparing-hotel-prices-in-europe-vienna-vs-london) 27 | 28 | ## Learning outcomes 29 | After successfully completing [`01_spatial_datavisualisation.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture17-basic-spatial-viz/01_spatial_datavisualisation.ipynb) students should be able: 30 | 31 | - Understand how `geom_polygon` works 32 | - Shaping the outlook of the map with `coord_equal` or `coord_map` 33 | - Creating a `theme_map` theme 34 | - Use different coloring with `scale_fill_gradient` 35 | - How to match different data tables to be able to plot a map 36 | - Use custom values as a filler on the map based on life-expectancy case study 37 | 38 | After successfully completing [`02_spatial_datavisualisation.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture17-basic-spatial-viz/02_spatial_datavisualisation.ipynb) students should be able: 39 | 40 | - Use `geopandas` package to import 'shp' files and other needed auxiliary files as 'shx' and 'dbf' 41 | - `geom_path` to color the edges of the map 42 | - Map manipulations to show only inner-London boroughs 43 | - Add (borough or district) names to a map with `geom_text` 44 | - Control for limits of legend colors with `scale_fill_gradientn()` 45 | - Use nice color maps with unique palettes 46 | - Task for Vienna: replicate the same as for London 47 | 48 | After successfully completing [`02_spatial_datavisualisation_plotly.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture17-basic-spatial-viz/02_spatial_datavisualisation_plotly.ipynb) students should be able: 49 | 50 | - Use`plotly`'s 'choropleth_mapbox' function to create interactive maps. 51 | 52 | 53 | ## Lecture Time 54 | 55 | Ideal overall time: **40-60 mins**. 
56 | 57 | Going through [`01_spatial_datavisualisation.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture17-basic-spatial-viz/01_spatial_datavisualisation.ipynb) and [`02_spatial_datavisualisation.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture17-basic-spatial-viz/02_spatial_datavisualisation.ipynb) takes around 20-40 minutes. Solving the tasks takes the remaining 20-40 minutes as there are two long tasks. 58 | 59 | 60 | ## Homework 61 | 62 | *Type*: quick practice, approx 10 mins 63 | 64 | Get countries' GDP growth rates with the `WDI` package. Plot the values in a world map. 65 | 66 | 67 | ## Further material 68 | 69 | - Arthur Turrell's Coding for Economics classes: [Geo-Spatial Visualisation](https://aeturrell.github.io/coding-for-economists/geo-vis.html). 70 | - Create beautiful maps with [Plotly](https://plotly.com/python/maps/). 71 | - Maps with [Matplotlib](https://towardsdatascience.com/mapping-with-matplotlib-pandas-geopandas-and-basemap-in-python-d11b57ab5dac). 72 | -------------------------------------------------------------------------------- /lecture17-basic-spatial-viz/data_map/BEZIRKSGRENZEOGDPolygon.dbf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gabors-data-analysis/da-coding-python/ac4dcefcb3b6e54e36e21490948e2edb41a13485/lecture17-basic-spatial-viz/data_map/BEZIRKSGRENZEOGDPolygon.dbf -------------------------------------------------------------------------------- /lecture17-basic-spatial-viz/data_map/BEZIRKSGRENZEOGDPolygon.shp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gabors-data-analysis/da-coding-python/ac4dcefcb3b6e54e36e21490948e2edb41a13485/lecture17-basic-spatial-viz/data_map/BEZIRKSGRENZEOGDPolygon.shp -------------------------------------------------------------------------------- /lecture17-basic-spatial-viz/data_map/BEZIRKSGRENZEOGDPolygon.shx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gabors-data-analysis/da-coding-python/ac4dcefcb3b6e54e36e21490948e2edb41a13485/lecture17-basic-spatial-viz/data_map/BEZIRKSGRENZEOGDPolygon.shx -------------------------------------------------------------------------------- /lecture17-basic-spatial-viz/data_map/London_Borough_Excluding_MHW.dbf: -------------------------------------------------------------------------------- 1 | q 2 | !JNAMECGSS_CODEC HECTARESN NONLD_AREAN ONS_INNERCSUB_2009CSUB_2006C 3 | Kingston upon Thames E09000021 3726.117 0.000F Croydon E09000008 8649.441 0.000F Bromley E09000006 15013.487 0.000F Hounslow E09000018 5658.541 60.755F Ealing E09000009 5554.428 0.000F Havering E09000016 11445.735 210.763F Hillingdon E09000017 11570.063 0.000F Harrow E09000015 5046.330 0.000F Brent E09000005 4323.270 0.000F Barnet E09000003 8674.837 0.000F Lambeth E09000022 2724.940 43.927T Southwark E09000028 2991.340 105.139T Lewisham E09000023 3531.706 16.795T Greenwich E09000011 5044.190 310.785F Bexley E09000004 6428.649 370.619F Enfield E09000010 8220.025 0.000F Waltham Forest E09000031 3880.793 0.000F Redbridge E09000026 5644.225 2.300F Sutton E09000029 4384.698 0.000F Richmond upon Thames E09000027 5876.111 135.443F Merton E09000024 3762.466 0.000F Wandsworth E09000032 3522.022 95.600T Hammersmith and FulhamE09000013 1715.409 75.648T Kensington and ChelseaE09000020 1238.379 25.994T Westminster E09000033 2203.005 54.308T Camden E09000007 
2178.932 0.000T Tower Hamlets E09000030 2157.501 179.707T Islington E09000019 1485.664 0.000T Hackney E09000012 1904.902 0.000T Haringey E09000014 2959.837 0.000T Newham E09000025 3857.806 237.637T Barking and Dagenham E09000002 3779.934 169.150F City of London E09000001 314.942 24.546T  -------------------------------------------------------------------------------- /lecture17-basic-spatial-viz/data_map/London_Borough_Excluding_MHW.shp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gabors-data-analysis/da-coding-python/ac4dcefcb3b6e54e36e21490948e2edb41a13485/lecture17-basic-spatial-viz/data_map/London_Borough_Excluding_MHW.shp -------------------------------------------------------------------------------- /lecture17-basic-spatial-viz/data_map/London_Borough_Excluding_MHW.shx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gabors-data-analysis/da-coding-python/ac4dcefcb3b6e54e36e21490948e2edb41a13485/lecture17-basic-spatial-viz/data_map/London_Borough_Excluding_MHW.shx -------------------------------------------------------------------------------- /lecture17-basic-spatial-viz/output/heu_prices.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gabors-data-analysis/da-coding-python/ac4dcefcb3b6e54e36e21490948e2edb41a13485/lecture17-basic-spatial-viz/output/heu_prices.png -------------------------------------------------------------------------------- /lecture17-basic-spatial-viz/output/lifeexp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gabors-data-analysis/da-coding-python/ac4dcefcb3b6e54e36e21490948e2edb41a13485/lecture17-basic-spatial-viz/output/lifeexp.png -------------------------------------------------------------------------------- /lecture18-cross-validation/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 18: Cross-validating linear models 2 | 3 | ## Motivation 4 | 5 | You have a car that you want to sell in the near future. You want to know what price you can expect if you were to sell it. You may also want to know what you could expect if you were to wait one more year and sell your car then. You have data on used cars with their age and other features, and you can predict price with several kinds of regression models with different righthand-side variables in different functional forms. How should you select the regression model that would give the best prediction? 6 | 7 | We introduce point prediction versus interval prediction; we discuss the components of prediction error and how to find the best prediction model that will likely produce the best fit (smallest prediction error) in the live data, using observations in the original data. We introduce loss functions in general and mean squared error (MSE) and its square root (RMSE) in particular, to evaluate predictions. We discuss three ways of finding the best predictor model, using all data and the Bayesian Information Criterion (BIC) as the measure of fit, using training–test splitting of the data, and using k-fold cross-validation, which is an improvement on the training–test split. 8 | 9 | ## This lecture 10 | 11 | This lecture refreshes methods for data cleaning and refactoring data as well as some basic feature engineering practices. 
After data is set, multiple competing regressions are run and compared via BIC and k-fold cross validation. Cross validation is carried out by the `sklearn` package as well. After the best-performing model is chosen (by RMSE), prediction performance and risks associated are discussed. In the case, when log-transformed outcome is used as the model, transformation back to level and evaluation of the prediction performance is also covered. 12 | 13 | Case studies used: 14 | - [Chapter 13, A: Predicting used car value with linear regressions](https://gabors-data-analysis.com/casestudies/#ch13a-predicting-used-car-value-with-linear-regressions) 15 | - [Chapter 14, A: Predicting used car value: log prices](https://gabors-data-analysis.com/casestudies/#ch14a-predicting-used-car-value-log-prices) 16 | 17 | ## Learning outcomes 18 | After successfully completing [`crossvalidation_usedcars.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture18-cross-validation/crossvalidation_usedcars.ipynb), students should be able: 19 | 20 | - Clean and prepare data for modeling 21 | - Decide for functional forms and do meaningful variable transformations 22 | - Run multiple regressions and compare performance based on BIC 23 | - Carry out k-fold cross validation with `sklearn` package for different regression models 24 | - Compare the prediction performance of the models 25 | - Understand what happens if a log-transformed outcome is used 26 | - convert prediction back to level 27 | - compare prediction performance of other (non-log) models 28 | 29 | ## Dataset used 30 | 31 | - [`used-cars`](https://gabors-data-analysis.com/datasets/#used-cars) 32 | 33 | ## Lecture Time 34 | 35 | Ideal overall time: **100 mins**. 36 | 37 | 38 | ## Further material 39 | 40 | - This lecture is a modified and combined version of [`ch13_used-cars.ipynb`](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch13-used-cars-reg/ch13_used-cars.ipynb) and [`ch14-used-cars-log.ipynb`](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch14-used-cars-log/ch14-used-cars-log.ipynb) codes from [Gabor's case study repository](https://github.com/gabors-data-analysis/da_case_studies). 41 | 42 | -------------------------------------------------------------------------------- /lecture19-lasso/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 19: Prediction with LASSO 2 | 3 | ## Motivation 4 | 5 | You want to predict the rental prices of apartments in a big city using their location, size, amenities, and other features. You have access to data on many apartments with many variables. You know how to select the best regression model for prediction from several candidate models. But how should you specify those candidate models, to begin with? In particular, which of the many variables should they include, in what functional forms, and in what interactions? More generally, how can you make sure that the candidates include truly good predictive models? 6 | 7 | How should we specify the regression models? In particular, when we have many candidate predictor variables, how should we select from them, and how should we decide on their functional forms? 8 | 9 | ## This lecture 10 | 11 | This lecture discusses how to build regression models for prediction and how to evaluate the predictions they produce. We discuss how to select 12 | variables out of a large pool of candidate x variables, and how to decide on their functional forms. 
We introduce LASSO, an algorithm that can help with variable selection. With respect to evaluating predictions, we discuss why we need a holdout sample for evaluation that is separate from all of the rest of the data we use for model building and selection. 13 | 14 | Case study: 15 | - [Chapter 14, B: Predicting AirBnB apartment prices: selecting a regression model](https://gabors-data-analysis.com/casestudies/#ch14b-predicting-airbnb-apartment-prices-selecting-a-regression-model) 16 | 17 | ## Learning outcomes 18 | After successfully completing [`02_lasso_airbnb_prediction.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture19-lasso/02_lasso_airbnb_prediction.ipynb), students should be able to: 19 | 20 | - Carry out data cleaning and refactoring to prepare for LASSO-type modelling 21 | - Do basic feature engineering for LASSO 22 | - Understand the three-sample approach: 23 | - train and test sample to select the model (cross-validation for tuning parameters) 24 | - hold-out sample to evaluate model prediction performance 25 | - Carry out model selection with 26 | - (linear) regression models 27 | - LASSO, Ridge, and Elastic Net 28 | - Carry out model diagnostics 29 | - Performance measure(s) on the hold-out set to evaluate competing models 30 | - stability of the prediction 31 | - specific diagnostic figures for LASSO 32 | 33 | ## Dataset used 34 | 35 | - [`airbnb`](https://gabors-data-analysis.com/datasets/#airbnb) 36 | 37 | ## Lecture Time 38 | 39 | Ideal overall time: **100 mins**. 40 | 41 | 42 | ## Further material 43 | 44 | - This lecture is a modified version of [`ch14-airbnb-prediction.ipynb`](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch14-airbnb-reg/ch14-airbnb-prediction.ipynb) from [Gabor's case study repository](https://github.com/gabors-data-analysis/da_case_studies).
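For a quick orientation before opening the notebooks, a minimal sketch of this train/test plus hold-out workflow with `sklearn` might look like the following (illustrative only: synthetic data and default settings, not the Airbnb dataset or the exact specification used in the lecture):

```python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import ElasticNetCV, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Synthetic stand-in for a cleaned feature matrix and (log-)price outcome
X, y = make_regression(n_samples=1000, n_features=50, noise=10.0, random_state=42)

# Keep a separate hold-out set that is only touched once, at the very end
X_work, X_holdout, y_work, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)

# LASSO: 5-fold cross-validation on the work set picks the penalty (alpha)
lasso = LassoCV(cv=5, random_state=42).fit(X_work, y_work)

# Elastic Net: cross-validate both the penalty and the L1/L2 mix (l1_ratio)
enet = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.9, 1.0], cv=5, random_state=42).fit(X_work, y_work)

# Evaluate the selected models on the hold-out set with RMSE
for name, model in [("LASSO", lasso), ("Elastic Net", enet)]:
    rmse = np.sqrt(mean_squared_error(y_holdout, model.predict(X_holdout)))
    print(f"{name}: alpha = {model.alpha_:.3f}, hold-out RMSE = {rmse:.2f}")
```

In the notebook the feature matrix is built from the cleaned Airbnb data and the predictors are standardized before the penalized estimation; this sketch only shows the mechanics of cross-validated selection followed by hold-out evaluation.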
45 | 46 | -------------------------------------------------------------------------------- /lecture20-regression-tree/01_usedcars_cart_data_preparation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "a4b11ce4", 6 | "metadata": {}, 7 | "source": [ 8 | "## Prediction with CART – data preparation\n", 9 | "Case studies: \n", 10 | " - CH15A Predicting used car value with regression trees \n", 11 | " \n", 12 | "Dataset:\n", 13 | "\n", 14 | " used-cars" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "id": "ad62ffe3", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import os\n", 25 | "import warnings\n", 26 | "\n", 27 | "import numpy as np\n", 28 | "import pandas as pd\n", 29 | "from skimpy import skim\n", 30 | "\n", 31 | "warnings.filterwarnings(\"ignore\")" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "id": "70553db2", 37 | "metadata": {}, 38 | "source": [ 39 | "Import data" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "49876c5d", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "data = pd.read_csv(\"https://osf.io/7gvz9/download\")" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "f2e86a5e", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "data.head()" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "id": "17265243", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "data.shape" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "id": "566e72ba", 75 | "metadata": {}, 76 | "source": [ 77 | "### Sample design\n", 78 | "\n", 79 | "Manage missing" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "id": "30133e81", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "data[\"fuel\"] = data[\"fuel\"].fillna(\"Missing\")\n", 90 | "data[\"drive\"] = data[\"drive\"].fillna(\"Missing\")\n", 91 | "data[\"cylinders\"] = data[\"cylinders\"].fillna(\"Missing\")\n", 92 | "data[\"transmission\"] = data[\"transmission\"].fillna(\"Missing\")\n", 93 | "data[\"type\"] = data[\"type\"].fillna(\"Missing\")" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "id": "0b2e2f57", 99 | "metadata": {}, 100 | "source": [ 101 | "Missing changed to good not missing for condition" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "id": "6e5493d0", 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "data[\"condition\"].value_counts()\n" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "id": "7d82b952", 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "data[\"condition\"] = data[\"condition\"].fillna(\"good\")\n" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "id": "60ccd5cb", 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "data[\"condition\"].value_counts()\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "id": "afeff1e3", 137 | "metadata": {}, 138 | "source": [ 139 | "Drop hybrid models then drop column" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "id": "58ed2114", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "data = data.loc[lambda x: x[\"Hybrid\"] == 0].drop(\"Hybrid\", axis=1)" 150 | ] 151 
| }, 152 | { 153 | "cell_type": "markdown", 154 | "id": "70351995", 155 | "metadata": {}, 156 | "source": [ 157 | "Keep gas-fuelled vehicles" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "id": "fe6377ef", 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "data = data.loc[lambda x: x[\"fuel\"] == \"gas\"]" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "id": "2ce16b09", 173 | "metadata": {}, 174 | "source": [ 175 | "Drop vehicles in fair and new condition, trucks" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "id": "4fb0d2e9", 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "data = data.loc[lambda x: ~x[\"condition\"].isin([\"new\",\"fair\"])]" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "id": "f2765ba3", 191 | "metadata": {}, 192 | "source": [ 193 | "Drop unrealistic values for price and odometer reading\n" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "id": "3559a496", 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "data = data.loc[lambda x: (x[\"price\"].isin(range(500, 25001))) & (x[\"odometer\"] <= 100)]" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "id": "998ea36b", 209 | "metadata": {}, 210 | "source": [ 211 | "Drop if price is smaller than 1000 and condition is like new or age is less than 8\n" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "id": "f7ee1105", 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "data = data.loc[\n", 222 | " lambda x: ~((x[\"price\"] < 1000) & ((x[\"condition\"] == \"like new\") | (x[\"age\"] < 8)))\n", 223 | "]" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "id": "0703f361", 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "data = data.loc[lambda x: x[\"transmission\"] != \"manual\"]" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "id": "485cfca8", 239 | "metadata": {}, 240 | "source": [ 241 | "Drop if truck" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "id": "7ca71bc0", 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "data = data.loc[lambda x: ~x[\"type\"].isin([\"truck\", \"pickup\"])]" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "id": "910ce557", 257 | "metadata": {}, 258 | "source": [ 259 | "Drop price string" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "id": "2f0fa7b6", 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "data = data.drop(\"pricestr\",axis=1)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "id": "dabd73c5", 275 | "metadata": {}, 276 | "source": [ 277 | "To be on the safe side, drop NA prices" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "id": "e1fe2c0f", 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "data = data.loc[lambda x: x[\"price\"].notna()]" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "id": "4703b309", 293 | "metadata": {}, 294 | "source": [ 295 | "### Data generation & descriptives\n", 296 | "\n", 297 | "Variables we are interested in:\n", 298 | " \n", 299 | " price age odometer + condition cylinder dealer city LE" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "id": "85834185", 305 | "metadata": {}, 306 | 
"source": [ 307 | "Condition" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "id": "f6c5c271", 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "data[\"cond_excellent\"] = np.where(data[\"condition\"] == \"excellent\", 1, 0)\n", 318 | "data[\"cond_good\"] = np.where(data[\"condition\"] == \"good\", 1, 0)\n", 319 | "data[\"cond_likenew\"] = np.where(data[\"condition\"] == \"like new\", 1, 0)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "id": "4313885e", 325 | "metadata": {}, 326 | "source": [ 327 | "Cylinders" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "id": "23c4fb24", 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "data.cylinders.value_counts()" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "id": "b54f0843", 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "data[\"cylind6\"] = np.where(data[\"cylinders\"] == \"6 cylinders\", 1, 0)\n", 348 | "data.cylind6.value_counts()" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "id": "3b5a0120", 354 | "metadata": {}, 355 | "source": [ 356 | "Chicago\n" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "id": "a1d17f79", 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "data[\"chicago\"] = np.where(data[\"area\"] == \"chicago\", 1, 0)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "id": "cf1961a7", 372 | "metadata": {}, 373 | "source": [ 374 | "age: quadratic, cubic" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "id": "668ab112", 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "data[\"agesq\"] = data[\"age\"] ** 2\n", 385 | "data[\"agecu\"] = data[\"age\"] ** 3" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "id": "c8b04c5f", 391 | "metadata": {}, 392 | "source": [ 393 | "odometer quadratic" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "id": "a9e3cb6f", 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "data[\"odometersq\"] = data[\"odometer\"] ** 3" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "id": "a96e8b72", 409 | "metadata": {}, 410 | "source": [ 411 | "Take a look at descrpitives" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "id": "6aaa3ec6", 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "skim(data)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "id": "f175d53b", 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "data[\"price\"].describe()" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "id": "2ec30e19", 438 | "metadata": {}, 439 | "outputs": [], 440 | "source": [ 441 | "data[\"price\"].hist()" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "id": "9e6af8f1", 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "data[\"price\"].apply(np.log).hist()" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "id": "9cd3970b", 457 | "metadata": {}, 458 | "source": [ 459 | "Save data for prediction" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "id": "ac698017", 466 | "metadata": {}, 467 | "outputs": [], 468 | "source": [ 469 | 
"os.makedirs(\"data\", exist_ok=True)\n", 470 | "data.to_csv(\"data/usedcars_cart_work.csv\", index=False)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "id": "625c585b", 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [] 480 | } 481 | ], 482 | "metadata": { 483 | "kernelspec": { 484 | "display_name": "Python 3 (ipykernel)", 485 | "language": "python", 486 | "name": "python3" 487 | }, 488 | "language_info": { 489 | "codemirror_mode": { 490 | "name": "ipython", 491 | "version": 3 492 | }, 493 | "file_extension": ".py", 494 | "mimetype": "text/x-python", 495 | "name": "python", 496 | "nbconvert_exporter": "python", 497 | "pygments_lexer": "ipython3", 498 | "version": "3.8.10" 499 | } 500 | }, 501 | "nbformat": 4, 502 | "nbformat_minor": 5 503 | } 504 | -------------------------------------------------------------------------------- /lecture20-regression-tree/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 20: Prediction with regression trees (CART) 2 | 3 | ## Motivation 4 | 5 | You want to predict the price of used cars as a function of their age and other features. You want to specify a model that includes the most important interactions and nonlinearities of those features, but you don’t know how to start. In particular, you are worried that you can’t start with a very complex regression model and use LASSO or some other method to simplify it because there are way too many potential interactions. Is there an alternative approach to regression that includes the most important interactions without you having to specify them? 6 | 7 | To carry out the prediction of used car prices, we show how to use the regression tree, an alternative to linear regressions that are designed to build a model with the most important interactions and nonlinearities for a prediction. However, the regression tree you build appears to overfit your original data. How can you build a regression tree model that is less prone to overfitting the original data and can thus give a better prediction in the live data? 8 | 9 | 10 | ## This lecture 11 | 12 | This lecture introduces the regression tree, an alternative to linear regression for prediction purposes that can find the most important predictor variables and their interactions and can approximate any functional form automatically. Regression trees split the data into small bins (subsamples) by the value of the x variables. For a quantitative y, they use the average y value in those small sets to predict y. We introduce the regression tree model and the most widely used algorithm to build a regression tree model. Somewhat confusingly, both the model and the algorithm are called CART (for classification and regression trees), but we reserve this name for the algorithm. We show that a regression tree is an intuitively appealing method to model nonlinearities and interactions among the x variables, but it is rarely used for prediction in itself because it is prone to overfit the original data. Instead, the regression tree forms the basic element of very powerful prediction methods that we’ll cover in the next seminar. 
13 | 14 | Case study: 15 | - [Chapter 15, A: Predicting used car value with regression trees](https://gabors-data-analysis.com/casestudies/#ch15a-predicting-used-car-value-with-regression-trees) 16 | 17 | ## Learning outcomes 18 | After successfully completing [`02_usedcars_cart_prediction.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture20-regression-tree/02_usedcars_cart_prediction.ipynb), students should be able to: 19 | 20 | - Understand how the regression tree works 21 | - Estimate a regression tree 22 | - Visualize a regression tree 23 | - Set stopping criteria for CART 24 | - Depth or level of the tree 25 | - Number of leaves 26 | - minimum increase in the fit measure required by a split 27 | - Prune a large tree 28 | - find the optimal complexity parameter (also known as the pruning parameter) 29 | - Create variable importance plots 30 | - Simple 31 | - Permutation importance 32 | - Carry out prediction evaluation 33 | - comparing trees 34 | - comparing trees vs. linear regressions 35 | 36 | ## Dataset used 37 | 38 | - [used-cars](https://gabors-data-analysis.com/datasets/#used-cars) 39 | 40 | ## Lecture Time 41 | 42 | Ideal overall time: **100 mins**. 43 | 44 | 45 | ## Further material 46 | 47 | - This lecture is a modified version of [`ch15-used-cars-cart.ipynb`](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch15-used-cars-cart/ch15-used-cars-cart.ipynb) from [Gabor's case study repository](https://github.com/gabors-data-analysis/da_case_studies). 48 | 49 | -------------------------------------------------------------------------------- /lecture21-random-forest/00_download_model_fits.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "1f65e289", 6 | "metadata": {}, 7 | "source": [ 8 | "### This notebook downloads and unzips the fitted models used in `02_random_forest_airbnb.ipynb`.\n", 9 | "\n", 10 | "Since the cross-validation and model fits take a lot of time, we saved the results and uploaded them to an OSF repository which you can find [here](https://osf.io/mw4xj/?view_only=). You can also download the `model_fits.zip` folder from the repository and extract the zip file by hand, but the following code chunk will also do this for you."
11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "5ee46225", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import requests, zipfile\n", 21 | "from io import BytesIO\n", 22 | "\n", 23 | "print(\"Downloading started\")\n", 24 | "url = \"https://osf.io/nsa3q/download\"\n", 25 | "filename = \"model_fits.zip\"\n", 26 | "\n", 27 | "# Download the file by sending a request to the URL\n", 28 | "req = requests.get(url)\n", 29 | "print(\"Downloading completed\")\n", 30 | "\n", 31 | "# Extract the zip file contents (a separate name so the zipfile module is not shadowed)\n", 32 | "zf = zipfile.ZipFile(BytesIO(req.content))\n", 33 | "zf.extractall(\".\")\n", 34 | "print(\"Folder unzipped\")" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "3d219fa2", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [] 44 | } 45 | ], 46 | "metadata": { 47 | "kernelspec": { 48 | "display_name": "Python 3 (ipykernel)", 49 | "language": "python", 50 | "name": "python3" 51 | }, 52 | "language_info": { 53 | "codemirror_mode": { 54 | "name": "ipython", 55 | "version": 3 56 | }, 57 | "file_extension": ".py", 58 | "mimetype": "text/x-python", 59 | "name": "python", 60 | "nbconvert_exporter": "python", 61 | "pygments_lexer": "ipython3", 62 | "version": "3.8.10" 63 | } 64 | }, 65 | "nbformat": 4, 66 | "nbformat_minor": 5 67 | } 68 | -------------------------------------------------------------------------------- /lecture21-random-forest/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 21: Predicting with Random Forest and Boosting 2 | 3 | ## Motivation 4 | 5 | You need to predict rental prices of apartments using various features. You don’t know how the various features may interact with each other in determining price, so you would like to use a regression tree. But you want to build a model that gives the best possible prediction, better than a single tree. What methods are available that keep the advantage of regression trees but give a better prediction? How should you choose from among those methods? 6 | 7 | How can you grow a random forest, the most widely used tree-based method, to carry out the prediction of apartment rental prices? What details do you have to decide on, how should you decide on them, and how can you evaluate the results? 8 | 9 | A regression tree can capture complicated interactions and nonlinearities for predicting a quantitative y variable, but it is prone to overfit the original data, even after appropriate pruning. It turns out, however, that combining multiple regression trees grown on the same data can yield a much better prediction. Such methods are called ensemble methods. There are many ensemble methods based on regression trees, and some are known to produce very good predictions. But these methods are rather complex, and some of them are not straightforward to use. 10 | 11 | ## This lecture 12 | 13 | This lecture introduces two ensemble methods based on regression trees: random forest and boosting. We start by introducing the main idea of ensemble methods: combining results from many imperfect models can lead to a much better prediction than a single model that we try to build to perfection. The random forest is perhaps the most frequently used method to predict a quantitative y variable, both because of its excellent predictive performance and because it is relatively simple to use.
Even more than with a single tree, it is hard to understand the underlying patterns of association between y and x that drive the predictions of ensemble methods. We discuss some diagnostic tools that can help with that: variable importance plots, partial dependence plots, and examining the quality of predictions in subgroups. Finally, we show another method: boosting, an alternative approach to making predictions based on an ensemble of regression trees via `xgboost`. 14 | 15 | Note that some of the methods used take a considerable amount of time to run on a simple PC, thus pre-run model results are also uploaded to the repository to speed up the seminar. 16 | 17 | Case study: 18 | - [Chapter 16, A: Predicting apartment prices with random forest](https://gabors-data-analysis.com/casestudies/#ch16a-predicting-apartment-prices-with-random-forest) 19 | 20 | ## Learning outcomes 21 | 22 | Lecturer/students should be aware that there is a separate notebook: [`01_prepare_airbnb.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture21-random-forest/01_prepare_airbnb.ipynb) for this seminar, covering only the data cleaning and feature engineering process. It is extremely important to understand how to prepare the data for these methods, as without it data analysts do garbage-in, garbage-out analysis... Usually, due to time constraints, this part is not covered in the seminar; students are asked to cover it before the seminar. 23 | 24 | After successfully completing [`02_random_forest_airbnb.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture21-random-forest/02_random_forest_airbnb.ipynb), students should be able to: 25 | 26 | - Estimate random forest models via `sklearn` 27 | - understand the `max_features` and `min_samples_split` parameters 28 | - use GridSearchCV to search for hyperparameters 29 | - Understand a random forest's output 30 | - variable importance plots: all, top 10, and grouped variables (typically factors) 31 | - partial dependence plot 32 | - sub-sample analysis for understanding prediction performance across groups 33 | - SHAP values 34 | - Run a 'Horse-Race' prediction competition with: 35 | - Linear regression (OLS) 36 | - ElasticNet 37 | - Regression Tree with CART 38 | - Random Forest 39 | - XGBoost model 40 | 41 | ## Dataset used 42 | 43 | - [airbnb](https://gabors-data-analysis.com/datasets/#airbnb) 44 | 45 | ## Lecture Time 46 | 47 | Ideal overall time: **100 mins**. 48 | 49 | 50 | ## Further material 51 | 52 | - This lecture is a modified version of [ch16-airbnb-random-forest.ipynb](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch16-airbnb-random-forest/Ch16-airbnb-random-forest.ipynb) from [Gabor's case study repository](https://github.com/gabors-data-analysis/da_case_studies). 53 | 54 | -------------------------------------------------------------------------------- /lecture22-classification/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 22: Prediction and classification of a binary outcome with ML tools 2 | 3 | ## Motivation 4 | 5 | Predicting whether people will repay their loans or default on them is important to a bank that sells such loans. Should the bank predict the default probability for applicants? Or, rather, should it classify applicants into prospective defaulters and prospective repayers? And how are the two kinds of predictions related?
In particular, can the bank use probability predictions to classify applicants into defaulters and repayers, in a way that takes into account the bank’s costs when a default happens and its costs when it forgoes a good applicant? 6 | 7 | Many companies have relationships with other companies, as suppliers or clients. Whether those other companies stay in business in the future is an important question for them. You have rich data on many companies across the years that allows you to see which companies stayed in business and which companies exited, and relate that to various features of the companies. How should you use that data to predict the probability of exit for each company? How should you predict which companies will exit and which will stay in business in the future? 8 | 9 | In the previous seminars we covered the logic of predictive analytics and its most important steps, and we introduced specific methods to predict a quantitative y variable. But sometimes our y variable is not quantitative. The most important case is when y is binary: y = 1 or y = 0. How can we predict such a variable? 10 | 11 | ## This lecture 12 | 13 | This lecture introduces the framework and methods of probability prediction and classification analysis for binary y variables. Probability prediction means predicting the probability that y = 1, with the help of the predictor variables. Classification means predicting the binary y variable itself, with the help of the predictor variables: putting each observation in one of the y categories, also called classes. We build on what we know about probability models and the basics of probability prediction from [lecture14-binary-models](https://github.com/gabors-data-analysis/da-coding-python/tree/main/lecture14-binary-models). In this seminar, we put that into the framework of predictive analytics to arrive at the best probability model for prediction purposes and to evaluate its performance. We then discuss how we can turn probability predictions into classification with the help of a classification threshold and how we should use a loss function to find the optimal threshold. We discuss how to evaluate a classification by making use of a confusion table and expected loss. We introduce the ROC curve, which illustrates the trade-off of selecting different classification threshold values. We discuss how we can use random forests based on classification trees. 14 | 15 | Case study: 16 | - [Chapter 17, A: Predicting firm exit: probability and classification](https://gabors-data-analysis.com/casestudies/#ch17a-predicting-firm-exit-probability-and-classification) 17 | 18 | ## Learning outcomes 19 | 20 | Lecturer/students should be aware that there is a separate file at the official case studies repository: [`ch17-firm-exit-data-prep.ipynb`](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch17-predicting-firm-exit/ch17-firm-exit-data-prep.ipynb) for this seminar, covering only the data cleaning and feature engineering process for binary outcomes. It is extremely important to understand how to prepare the data for these methods, as without it data analysts do garbage-in, garbage-out analysis... Usually, due to time constraints, this part is not covered in the seminar; students are asked to cover it before the seminar.
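To make the threshold-selection logic concrete before opening the notebook, here is a minimal sketch (synthetic data and a made-up cost ratio; the actual case study uses the bisnode data, cross-validated models, and its own loss function):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

# Synthetic stand-in for cleaned firm-level features and the exit indicator
X, y = make_classification(n_samples=2000, n_features=10, weights=[0.8, 0.2], random_state=7)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)

# Probability prediction with a simple logit
logit = LogisticRegression(max_iter=1000).fit(X_train, y_train)
prob = logit.predict_proba(X_test)[:, 1]
print("AUC:", round(roc_auc_score(y_test, prob), 3))

# Classification: pick the threshold that minimizes expected loss, assuming
# (illustratively) that a false negative costs 3 times as much as a false positive
fn_cost, fp_cost = 3, 1
fpr, tpr, thresholds = roc_curve(y_test, prob)
n_pos, n_neg = y_test.sum(), len(y_test) - y_test.sum()
exp_loss = (fp_cost * fpr * n_neg + fn_cost * (1 - tpr) * n_pos) / len(y_test)
best = exp_loss.argmin()
print("optimal threshold:", round(thresholds[best], 3), "expected loss:", round(exp_loss[best], 3))

# Confusion table at the chosen threshold
print(confusion_matrix(y_test, (prob >= thresholds[best]).astype(int)))
```

The expected-loss formula mirrors the logic of `create_loss_plot` in this lecture's `helper_functions.py`: false positives and false negatives are weighted by their costs and divided by the number of observations.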
21 | 22 | After successfully completing [`firm_exit_classification.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture22-classification/firm_exit_classification.ipynb), students should be able to: 23 | 24 | - Understand what winsorizing is and how it helps 25 | - Use basic linear models for predicting probabilities 26 | - simple linear probability model (review) 27 | - simple logistic model (logit, review) 28 | - Carry out cross-validation with a logit model 29 | - Use LASSO with a logit model 30 | - Evaluate model predictions 31 | - Calibration curve (review) 32 | - Confusion matrix 33 | - ROC curve and AUC (Area Under Curve) 34 | - Model comparison based on RMSE and AUC 35 | - Work with a user-defined loss function 36 | - find the optimal threshold based on a self-defined loss function 37 | - show the ROC curve and the optimal point 38 | - show loss-function values for different points on the ROC curve 39 | - Use CART and Random Forest 40 | - modelling probabilities 41 | - Random Forest with majority voting as a misunderstood method, especially with a user-defined loss function 42 | 43 | ## Dataset used 44 | 45 | - [bisnode-firms](https://gabors-data-analysis.com/datasets/#bisnode-firms) 46 | 47 | ## Lecture Time 48 | 49 | Ideal overall time: **100 mins**. 50 | 51 | 52 | ## Further material 53 | 54 | - This lecture is a modified version of [`ch17-predicting-firm-exit.ipynb`](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch17-predicting-firm-exit/ch17-predicting-firm-exit.ipynb) from [Gabor's case study repository](https://github.com/gabors-data-analysis/da_case_studies). 55 | 56 | -------------------------------------------------------------------------------- /lecture22-classification/helper_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import sklearn.metrics as metrics 4 | from plotnine import * 5 | from sklearn.metrics import roc_curve 6 | 7 | 8 | def regression_results(y_true, y_pred): 9 | 10 | # Regression metrics 11 | explained_variance = metrics.explained_variance_score(y_true, y_pred) 12 | mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred) 13 | mse = metrics.mean_squared_error(y_true, y_pred) 14 | median_absolute_error = metrics.median_absolute_error(y_true, y_pred) 15 | r2 = metrics.r2_score(y_true, y_pred) 16 | 17 | print("explained_variance: ", round(explained_variance, 4)) 18 | print("r2: ", round(r2, 4)) 19 | print("MAE: ", round(mean_absolute_error, 4)) 20 | print("MSE: ", round(mse, 4)) 21 | print("RMSE: ", round(np.sqrt(mse), 4)) 22 | 23 | 24 | def coef_matrix(X, model): 25 | 26 | coef_matrix = pd.concat( 27 | [pd.DataFrame(X.columns), pd.DataFrame(np.transpose(model.coef_))], axis=1 28 | ) 29 | coef_matrix.columns = ["variable", "coefficient"] 30 | coef_matrix = coef_matrix.append( 31 | {"variable": "Intercept", "coefficient": model.intercept_}, 32 | ignore_index=True, 33 | ) 34 | return coef_matrix 35 | 36 | 37 | def cv_summary(lambdas, C_values, model): 38 | d = { 39 | "lambdas": lambdas, 40 | "C_values": C_values, 41 | "mean_cv_score": model.scores_[1].mean(axis=0), 42 | } 43 | return pd.DataFrame(data=d) 44 | 45 | 46 | def create_roc_plot(y_true, y_pred): 47 | fpr, tpr, thresholds = roc_curve(y_true, y_pred) 48 | all_coords = pd.DataFrame({"fpr": fpr, "tpr": tpr, "thresholds": thresholds}) 49 | 50 | plot = ( 51 | ggplot(all_coords, aes(x="fpr", y="tpr")) 52 | + geom_line(color="blue", size=0.7) 53 | + geom_area(position="identity", fill="mediumaquamarine", alpha=0.3)
+ xlab("False Positive Rate (1-Specifity)") 55 | + ylab("True Positive Rate (Sensitivity)") 56 | + geom_abline(intercept=0, slope=1, linetype="dotted", color="black") 57 | + scale_y_continuous( 58 | limits=(0, 1), breaks=np.arange(0, 1.1, 0.1), expand=(0, 0.01) 59 | ) 60 | + scale_x_continuous( 61 | limits=(0, 1), breaks=np.arange(0, 1.1, 0.1), expand=(0.01, 0) 62 | ) 63 | + theme_bw() 64 | ) 65 | return plot 66 | 67 | 68 | def sigmoid_array(x): 69 | return 1 / (1 + np.exp(-x)) 70 | 71 | 72 | def generate_fold_prediction(model, X, fold, param_index): 73 | fold_coef = model.coefs_paths_[1][fold, param_index, :] 74 | return sigmoid_array( 75 | np.dot(X, np.transpose(fold_coef)[:-1]) + np.transpose(fold_coef)[-1] 76 | ) 77 | 78 | 79 | def create_loss_plot(all_coords, optimal_threshold, curr_exp_loss, FP, FN): 80 | all_coords_copy = all_coords.copy() 81 | all_coords_copy["loss"] = ( 82 | all_coords_copy.false_pos * FP + all_coords_copy.false_neg * FN 83 | ) / all_coords_copy.n 84 | 85 | t = optimal_threshold 86 | l = curr_exp_loss 87 | 88 | return ( 89 | ggplot(all_coords_copy, aes(x="thresholds", y="loss")) 90 | + geom_line(color="blue", size=0.7) 91 | + scale_x_continuous(breaks=np.arange(0, 1.1, 0.1)) 92 | + coord_cartesian(xlim=(0, 1)) 93 | + geom_vline(xintercept=t, color="blue") 94 | + annotate( 95 | geom="text", 96 | x=t - 0.01, 97 | y=max(all_coords_copy.loss) - 0.4, 98 | label="best threshold: " + str(round(t, 2)), 99 | colour="red", 100 | angle=90, 101 | size=7, 102 | ) 103 | + annotate(geom="text", x=t + 0.06, y=l, label=str(round(l, 2)), size=7) 104 | + theme_bw() 105 | ) 106 | 107 | 108 | def create_roc_plot_with_optimal(all_coords, optimal_threshold): 109 | all_coords_copy = all_coords.copy() 110 | all_coords_copy["sp"] = 1 - all_coords_copy.true_neg / all_coords_copy.neg 111 | all_coords_copy["se"] = all_coords_copy.true_pos / all_coords_copy.pos 112 | 113 | best_coords = all_coords_copy[all_coords_copy.thresholds == optimal_threshold] 114 | sp = best_coords.sp.values[0] 115 | se = best_coords.se.values[0] 116 | 117 | return ( 118 | ggplot(all_coords_copy, aes(x="sp", y="se")) 119 | + geom_line(color="blue", size=0.7) 120 | + scale_y_continuous(breaks=np.arange(0, 1.1, 0.1)) 121 | + scale_x_continuous(breaks=np.arange(0, 1.1, 0.1)) 122 | + geom_point(data=pd.DataFrame({"sp": [sp], "se": [se]})) 123 | + annotate( 124 | geom="text", 125 | x=sp, 126 | y=se + 0.03, 127 | label=str(round(sp, 2)) + ", " + str(round(se, 2)), 128 | size=7, 129 | ) 130 | + geom_area(position="identity", fill="mediumaquamarine", alpha=0.3) 131 | + xlab("False Positive Rate (1-Specifity)") 132 | + ylab("True Positive Rate (Sensitivity)") 133 | + geom_abline(intercept=0, slope=1, linetype="dotted", color="black") 134 | + theme_bw() 135 | ) 136 | -------------------------------------------------------------------------------- /lecture23-long-term-time-series/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 23: Forecasting from Time Series Data I 2 | 3 | ## Motivation 4 | 5 | Your task is to predict the number of daily tickets sold for next year in a swimming pool in a large city. The swimming pool sells tickets through its sales terminal that records all transactions. You aggregate that data to daily frequency. How should you use the information on daily sales to produce your forecast? In particular, how should you model trends, and how should you model seasonality by months of the year and days of the week to produce the best prediction? 
6 | 7 | 8 | ## This lecture 9 | 10 | This lecture discusses forecasting: prediction from time series data for one or more time periods in the future. The focus of this chapter is forecasting future values of one variable, by making use of past values of the same variable, and possibly other variables, too. We build on what we learned about time series regressions in [lecture16-timeseries-regression](https://github.com/gabors-data-analysis/da-coding-python/tree/main/lecture16-timeseries-regression). We start with forecasts with a long horizon, which means many time periods into the future. Such forecasts use the information on trends, seasonality, and other long-term features of the time series. 11 | 12 | Case study: 13 | - [Chapter 18, A: Forecasting daily ticket sales for a swimming pool](https://gabors-data-analysis.com/casestudies/#ch18a-forecasting-daily-ticket-sales-for-a-swimming-pool) 14 | 15 | ## Learning outcomes 16 | After successfully completing [`long_term_swimming.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture23-long-term-time-series/long_term_swimming.ipynb), students should be able to: 17 | 18 | - Carry out data munging with time series (review) 19 | - Add deterministic variables such as trends and yearly/monthly/weekly seasonality 20 | - Add deterministic variables such as holidays, weekdays, etc. with the `pandas_market_calendars` package 21 | - Carry out sample splitting with time series 22 | - Estimate simple linear models: 23 | - deterministic trend/seasonality and/or other deterministic variables (holidays, etc.) 24 | - Carry out cross-validation with time series 25 | - Use the `prophet` package 26 | - Produce forecasts 27 | - Compare models based on forecasting performance (RMSE) 28 | - Create graphical representations of model fit and forecasts 29 | 30 | ## Dataset used 31 | 32 | - [swim-transactions](https://gabors-data-analysis.com/datasets/#swim-transactions) 33 | 34 | ## Lecture Time 35 | 36 | Ideal overall time: **50-60 mins**. 37 | 38 | 39 | ## Further material 40 | 41 | - This lecture is a modified version of [ch18-swimmingpool-predict.ipynb](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch18-swimmingpool/ch18-swimmingpool-predict.ipynb) from [Gabor's case study repository](https://github.com/gabors-data-analysis/da_case_studies). 42 | 43 | -------------------------------------------------------------------------------- /lecture24-short-term-time-series/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 24: Forecasting from Time Series Data II - ARIMA and VAR models 2 | 3 | ## Motivation 4 | 5 | Your task is to predict how house prices will move in a particular city in the next months. You have monthly data on the house price index of the city, and you can collect monthly data on other variables that may be correlated with how house prices move. How should you use that data to forecast changes in house prices for the next few months? In particular, how should you use those other variables to help that forecast even though you don’t know their future values? 6 | 7 | ## This lecture 8 | 9 | This lecture discusses forecasting: prediction from time series data for one or more time periods in the future. The focus of this chapter is forecasting future values of one variable, by making use of past values of the same variable, and possibly other variables, too.
We build on what we learned about time series regressions in [lecture16-timeseries-regression](https://github.com/gabors-data-analysis/da-coding-python/tree/main/lecture16-timeseries-regression). We then turn to short-horizon forecasts that forecast y for a few time periods ahead. These forecasts make use of the serial correlation of the time series of y besides those long-term features. We introduce autoregression (AR) and ARIMA models via the `statsmodels` package, which capture the patterns of serial correlation and can be used for short-horizon forecasting. We then turn to using other variables in forecasting and introduce vector autoregression (VAR) models that help in forecasting future values of those x variables that we can use to forecast y. We discuss how to carry out cross-validation in forecasting and the specific challenges and opportunities the time series nature of our data provides for assessing external validity. 10 | 11 | Case study: 12 | - [Chapter 18, B: Forecasting a house price index](https://gabors-data-analysis.com/casestudies/#ch18b-forecasting-a-house-price-index) 13 | 14 | ## Learning outcomes 15 | After successfully completing [`short_term_priceindex.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture24-short-term-time-series/short_term_priceindex.ipynb), students should be able to: 16 | 17 | - Decide if a conversion of the data to stationarity is needed 18 | - Estimate ARIMA models 19 | - self-specified lags for the AR, I, and MA components 20 | - automatic selection of the lags 21 | - handling trend and seasonality within ARIMA 22 | - understanding the 'S' in SARIMA and why we do not use it in this course 23 | - Carry out cross-validation with ARIMA models 24 | - Work with Vector AutoRegressive (VAR) models 25 | - estimation and cross-validation 26 | - Produce forecasts 27 | - comparing models based on forecast performance 28 | - external validity check on a longer horizon 29 | - Create fan charts for assessing risks 30 | 31 | ## Lecture Time 32 | 33 | Ideal overall time: **50-80 mins**. 34 | 35 | 36 | ## Further material 37 | 38 | - This lecture is a modified version of [`ch18-ts-pred-homeprices.ipynb`](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch18-case-shiller-la/ch18-ts-pred-homeprices.ipynb) from [Gabor's case study repository](https://github.com/gabors-data-analysis/da_case_studies). 39 | 40 | -------------------------------------------------------------------------------- /lecture25-matplotlib-vs-plotnine/README.md: -------------------------------------------------------------------------------- 1 | # Lecture 25: Matplotlib vs Plotnine on the GDP and Life Expectancy data 2 | 3 | ## Motivation 4 | 5 | People tend to live longer in richer countries. How long people live is usually measured by life expectancy; how rich a country is, is usually captured by its yearly income, measured by GDP. But should we use total GDP or GDP per capita? And what’s the shape of the patterns of association? Is the same percent difference in income related to the same difference in how long people live among richer countries and poorer countries? Finding the shape of the association helps benchmark life expectancy among countries with similar levels of income and identify countries where people tend to live especially long or especially short lives for their income.
6 | 7 | The lecture illustrates the choice between total and per capita measures (here GDP), regressions with variables in logs, and two ways to model nonlinear patterns in the framework of the linear regression: piecewise linear splines, and polynomials. It also illustrates whether and how to use weights in regression analysis, and what that choice implies for the correct interpretation of the results. The lecture also shows how to use informative visualization to present the results of regressions. 8 | 9 | 10 | ## This lecture 11 | 12 | This lecture covers the same graphs in two separate notebooks: [`life_expectancy_gdp_plotnine.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture25-matplotlib-vs-plotnine/life_expectancy_gdp_plotnine.ipynb) and [`life_expectancy_gdp_matplotlib.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture25-matplotlib-vs-plotnine/life_expectancy_gdp_matplotlib.ipynb). Our goal was to show that exactly the same graphs can be created with `matplotlib` (and with its high-level interface, `seaborn`) and with `plotnine`. 13 | 14 | Case study: 15 | - [CH08B How is life expectancy related to the average income of a country?](https://gabors-data-analysis.com/casestudies/#ch08b-how-is-life-expectancy-related-to-the-average-income-of-a-country) 16 | 17 | ## Learning outcomes 18 | After successfully completing [`life_expectancy_gdp_plotnine.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture25-matplotlib-vs-plotnine/life_expectancy_gdp_plotnine.ipynb) and/or [`life_expectancy_gdp_matplotlib.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture25-matplotlib-vs-plotnine/life_expectancy_gdp_matplotlib.ipynb) students should be able to: 19 | 20 | - Estimate simple regressions: 21 | - level-level regression 22 | - log-level regression 23 | - quadratic regression 24 | - spline regression 25 | - Visualise a regression line on a scatter plot 26 | - set axis ticks to percent format 27 | - scale an axis to log 28 | - use weights for point sizes 29 | 30 | 31 | ## Lecture Time 32 | 33 | Ideal overall time: **30-60 mins** depending on whether you go through only one or both notebooks. 34 | 35 | 36 | ## Further material 37 | 38 | - This lecture is a modified version of [`ch08-life-expectancy-income.ipynb`](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch08-life-expectancy-income/ch08-life-expectancy-income.ipynb) from [Gabor's case study repository](https://github.com/gabors-data-analysis/da_case_studies). 39 | - A tutorial for the `seaborn` library can be found [here](https://seaborn.pydata.org/tutorial.html). -------------------------------------------------------------------------------- /lecture25-matplotlib-vs-plotnine/helper_functions.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | def knot_ceil(vector: np.array, knot: float) -> np.array: 7 | vector_copy = copy.deepcopy(vector) 8 | vector_copy[vector_copy > knot] = knot 9 | return vector_copy 10 | 11 | 12 | def lspline(series: pd.Series, knots: list) -> np.array: 13 | """ 14 | Function to create the design matrix to estimate a piecewise 15 | linear spline regression. 16 | 17 | Parameters 18 | ---------- 19 | series : pd.Series 20 | Your variable in a pandas Series. 21 | knots : List[float] 22 | The knots, which result in n + 1 line segments.
23 | """ 24 | 25 | if type(knots) != list: 26 | knots = [knots] 27 | design_matrix = None 28 | vector = series.values 29 | 30 | for i in range(len(knots)): 31 | # print(i) 32 | # print(vector) 33 | if i == 0: 34 | column = knot_ceil(vector, knots[i]) 35 | else: 36 | column = knot_ceil(vector, knots[i] - knots[i - 1]) 37 | # print(column) 38 | if i == 0: 39 | design_matrix = column 40 | else: 41 | design_matrix = np.column_stack((design_matrix, column)) 42 | # print(design_matrix) 43 | vector = vector - column 44 | design_matrix = np.column_stack((design_matrix, vector)) 45 | # print(design_matrix) 46 | return design_matrix 47 | 48 | 49 | def poly(x: np.array, degree=1) -> pd.DataFrame: 50 | """ 51 | Fit polynomial. 52 | 53 | These are non orthogonal factors, but it may not matter if 54 | we only need this for predictions (without interpreting the 55 | coefficients) or visualisation. 56 | 57 | Parameters 58 | ---------- 59 | x : npt.ArrayLike 60 | Data array. 61 | degree : int, default=1 62 | Degree of the polynomial. 63 | """ 64 | d = {} 65 | for i in range(degree + 1): 66 | if i == 1: 67 | d["x"] = x 68 | else: 69 | d[f"x**{i}"] = np.power(x, i) 70 | return pd.DataFrame(d) 71 | 72 | 73 | def add_margin(ax, x=0.05, y=0.05): 74 | """ 75 | This will, by default, add 5% to the x and y margins. You 76 | can customise this using the x and y arguments when you call it. 77 | """ 78 | 79 | xlim = ax.get_xlim() 80 | ylim = ax.get_ylim() 81 | 82 | xmargin = (xlim[1] - xlim[0]) * x 83 | ymargin = (ylim[1] - ylim[0]) * y 84 | 85 | ax.set_xlim(xlim[0] - xmargin, xlim[1] + xmargin) 86 | ax.set_ylim(ylim[0] - ymargin, ylim[1] + ymargin) 87 | --------------------------------------------------------------------------------