├── .gitignore
├── LICENSE
├── Pipfile
├── Pipfile.lock
├── README.md
├── lecture00-intro
│   ├── README.md
│   ├── getting_started.ipynb
│   └── setup.md
├── lecture01-coding-basics
│   ├── README.md
│   └── coding_basics.ipynb
├── lecture02-basic-structures
│   ├── README.md
│   ├── basic_structures.ipynb
│   └── sample_module.py
├── lecture03-data-IO
│   ├── Linux_2k.log
│   ├── README.md
│   ├── city_names.txt
│   ├── data_IO.ipynb
│   └── message.txt
├── lecture04-pandas-basics
│   ├── 01_pandas_basics.ipynb
│   ├── 02_pandas_data_munging.ipynb
│   ├── README.md
│   └── hotel_vienna_restricted.csv
├── lecture05-graphs-basics
│   ├── 01_plotnine_intro.ipynb
│   ├── 02_matplotlib_intro.ipynb
│   └── README.md
├── lecture06-conditionals
│   ├── README.md
│   └── conditionals_and_control_flows.ipynb
├── lecture07-data-exploration
│   ├── README.md
│   └── data_exploration.ipynb
├── lecture08-functions
│   ├── README.md
│   └── functions.ipynb
├── lecture09-exception-handling
│   ├── README.md
│   └── exception_handling.ipynb
├── lecture10-intro-to-regression
│   ├── README.md
│   └── intro_to_regression.ipynb
├── lecture11-feature-engineering
│   ├── 01_feature_engineering_wms.ipynb
│   ├── 02_feature_engineering_bisnode.ipynb
│   └── README.md
├── lecture12-simple-linear-regression
│   ├── 00_life_exp_get_data.ipynb
│   ├── 01_life_exp_clean.ipynb
│   ├── 02_life_exp_analysis.ipynb
│   ├── README.md
│   └── data
│       └── WDI_lifeexp_raw.csv
├── lecture13-advanced-linear-regression
│   ├── README.md
│   └── hotels_advanced_regression.ipynb
├── lecture14-binary-models
│   ├── README.md
│   └── binary_models.ipynb
├── lecture15-datetime
│   ├── 01_datetime_basics.ipynb
│   ├── 02_datetime_manipulations.ipynb
│   └── README.md
├── lecture16-timeseries-regression
│   ├── README.md
│   └── intro_time_series.ipynb
├── lecture17-basic-spatial-viz
│   ├── 01_spatial_datavisualisation.ipynb
│   ├── 02_spatial_datavisualisation.ipynb
│   ├── 03_spatial_datavisualisation_plotly.ipynb
│   ├── README.md
│   ├── data_map
│   │   ├── BEZIRKSGRENZEOGDPolygon.dbf
│   │   ├── BEZIRKSGRENZEOGDPolygon.shp
│   │   ├── BEZIRKSGRENZEOGDPolygon.shx
│   │   ├── London_Borough_Excluding_MHW.dbf
│   │   ├── London_Borough_Excluding_MHW.shp
│   │   ├── London_Borough_Excluding_MHW.shx
│   │   └── worldmap.csv
│   └── output
│       ├── heu_prices.png
│       └── lifeexp.png
├── lecture18-cross-validation
│   ├── README.md
│   └── crossvalidation_usedcars.ipynb
├── lecture19-lasso
│   ├── 01_lasso_airbnb_data_prep.ipynb
│   ├── 02_lasso_airbnb_prediction.ipynb
│   └── README.md
├── lecture20-regression-tree
│   ├── 01_usedcars_cart_data_preparation.ipynb
│   ├── 02_usedcars_cart_prediction.ipynb
│   └── README.md
├── lecture21-random-forest
│   ├── 00_download_model_fits.ipynb
│   ├── 01_prepare_airbnb.ipynb
│   ├── 02_random_forest_airbnb.ipynb
│   └── README.md
├── lecture22-classification
│   ├── README.md
│   ├── data
│   │   └── bisnode_firms_clean.csv
│   ├── firm_exit_classification.ipynb
│   └── helper_functions.py
├── lecture23-long-term-time-series
│   ├── README.md
│   └── long_term_swimming.ipynb
├── lecture24-short-term-time-series
│   ├── README.md
│   └── short_term_priceindex.ipynb
└── lecture25-matplotlib-vs-plotnine
    ├── README.md
    ├── helper_functions.py
    ├── life_expectancy_gdp_matplotlib.ipynb
    └── life_expectancy_gdp_plotnine.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Data folder
10 | data/
11 | model_fits/
12 |
13 | # Distribution / packaging
14 | .Python
15 | build/
16 | develop-eggs/
17 | dist/
18 | downloads/
19 | eggs/
20 | .eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | pip-wheel-metadata/
28 | share/python-wheels/
29 | *.egg-info/
30 | .installed.cfg
31 | *.egg
32 | MANIFEST
33 |
34 | # PyInstaller
35 | # Usually these files are written by a python script from a template
36 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
37 | *.manifest
38 | *.spec
39 |
40 | # Installer logs
41 | pip-log.txt
42 | pip-delete-this-directory.txt
43 |
44 | # Unit test / coverage reports
45 | htmlcov/
46 | .tox/
47 | .nox/
48 | .coverage
49 | .coverage.*
50 | .cache
51 | nosetests.xml
52 | coverage.xml
53 | *.cover
54 | *.py,cover
55 | .hypothesis/
56 | .pytest_cache/
57 |
58 | # Translations
59 | *.mo
60 | *.pot
61 |
62 | # Django stuff:
63 | *.log
64 | local_settings.py
65 | db.sqlite3
66 | db.sqlite3-journal
67 |
68 | # Flask stuff:
69 | instance/
70 | .webassets-cache
71 |
72 | # Scrapy stuff:
73 | .scrapy
74 |
75 | # Sphinx documentation
76 | docs/_build/
77 |
78 | # PyBuilder
79 | target/
80 |
81 | # Jupyter Notebook
82 | .ipynb_checkpoints
83 |
84 | # IPython
85 | profile_default/
86 | ipython_config.py
87 |
88 | # pyenv
89 | .python-version
90 |
91 | # pipenv
92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
95 | # install all needed dependencies.
96 | #Pipfile.lock
97 |
98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
99 | __pypackages__/
100 |
101 | # Celery stuff
102 | celerybeat-schedule
103 | celerybeat.pid
104 |
105 | # SageMath parsed files
106 | *.sage.py
107 |
108 | # Environments
109 | .env
110 | .venv
111 | env/
112 | venv/
113 | ENV/
114 | env.bak/
115 | venv.bak/
116 |
117 | # Spyder project settings
118 | .spyderproject
119 | .spyproject
120 |
121 | # Rope project settings
122 | .ropeproject
123 |
124 | # mkdocs documentation
125 | /site
126 |
127 | # mypy
128 | .mypy_cache/
129 | .dmypy.json
130 | dmypy.json
131 |
132 | # Pyre type checker
133 | .pyre/
134 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Gabors Data Analysis
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | url = "https://pypi.org/simple"
3 | verify_ssl = true
4 | name = "pypi"
5 |
6 | [packages]
7 | arch = "==5.3.1"
8 | black = "==22.8.0"
9 | fredapi = "==0.5.0"
10 | geopandas = "==0.11.1"
11 | jupyter = "==1.0.0"
12 | jupyter-contrib-nbextensions = "==0.5.1"
13 | matplotlib = "==3.5.0"
14 | numpy = "==1.23.3"
15 | pandas = "==1.5.0"
16 | pandas-market-calendars = "==4.0"
17 | patchworklib = "==0.4.7"
18 | plotly = "==5.10.0"
19 | plotnine = "==0.9"
20 | pmdarima = "==2.0.1"
21 | prophet = "==1.0"
22 | pycountry-convert = "==0.7.2"
23 | pydotplus = "==2.0.2"
24 | pystan = "==2.19.1.1"
25 | pyzmq = "==19.0.2"
26 | scikit-learn = "==1.1.2"
27 | scikit-misc = "==0.1.4"
28 | seaborn = "==0.12.0"
29 | shap = "==0.41.0"
30 | skimpy = "==0.0.6"
31 | stargazer = "==0.0.5"
32 | statsmodels = "==0.13.2"
33 | wbdata = "==0.3.0"
34 | xgboost = "==1.6.2"
35 |
36 | [dev-packages]
37 |
38 | [requires]
39 | python_version = "3.8"
40 |
--------------------------------------------------------------------------------
/lecture00-intro/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 00: Introduction to Python and Jupyter notebook
2 |
3 | ## Motivation
4 |
5 | In this course, we focus on Python and Jupyter Notebook. This means you won’t learn anything about R, Julia, or any other programming language useful for data science. They’re also excellent choices, and in practice, most data science teams use a mix of languages, often at least Python and R.
6 |
7 | ## This lecture
8 |
9 | This is the starting lecture: it introduces students to Python and Jupyter Notebook (download and install), sets up the virtual environment, runs a pre-written script, and highlights the importance of version control.
10 |
11 | The aim of this class is not to teach coding, but to make sure that everybody has Python, Jupyter Notebook and the virtual environment installed on their laptop. These steps also reveal possible OS mismatches or other installation problems with Python and Jupyter Notebook.
12 | The material and steps are detailed in [`getting_started.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture00-intro/getting_started.ipynb) and [`setup.md`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture00-intro/setup.md).
13 |
14 |
15 | ## Learning outcomes
16 | After successfully completing the material, students will have
17 |
18 | - Python and Jupyter Notebook on their laptop/computers
19 |
20 | and understand,
21 |
22 | - What Jupyter Notebook looks like, and which window is which.
23 | - How to run a command via the console using VS Code.
24 | - What packages are, and how to install and load them into the virtual environment.
25 | - Why version control is important, and what the main options are with Git and GitHub.
26 |
27 | These steps are extremely important, as fixing installation problems may take days or even weeks.
28 |
29 | ## Datasets used
30 | * No dataset is used in this lecture
31 |
32 | ## Lecture Time
33 |
34 | Ideal overall time: **20-30 mins**.
35 |
36 | It can substantially differ from this if the teacher decides to do a live coding session with students and fixes the emerging problems during the class (up to ~90 mins).
37 |
38 | ## Homework
39 |
40 | No homework, apart from fixing possible issues.
41 |
42 | ## Further material
43 | - Jupyter notebook [guide](https://www.dataquest.io/blog/jupyter-notebook-tutorial/)
44 | - Git references:
45 | - [Technical foundations of informatics book](https://info201.github.io/git-basics.html)
46 | - [Software carpentry course](https://swcarpentry.github.io/git-novice/) (Strongly recommended)
47 | - [Github Learning Lab](https://lab.github.com/)
48 | - [If you are really committed](https://git-scm.com/book/en/v2) (pun intended)
49 |
--------------------------------------------------------------------------------
/lecture00-intro/getting_started.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "b53567b6",
6 | "metadata": {},
7 | "source": [
8 | "### Lecture 0\n",
9 | "\n",
10 | "- Setting up the environment \n",
11 | "- Basic terminology \n",
12 | "- Using Jupyter notebook \n",
13 | "- Using VScode \n",
14 | "- Running script from VScode"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "id": "70a0337e",
20 | "metadata": {},
21 | "source": [
22 | "## Jupyter notebooks\n",
23 | "This file - a Jupyter notebook - does not follow the standard pattern with Python code in a text file. Instead, a Jupyter notebook is stored as a file in the [JSON](http://en.wikipedia.org/wiki/JSON) format. The advantage is that we can mix formatted text, Python code and code output. It requires the Jupyter notebook server to run it though, and therefore isn't a stand-alone Python program as described above. Other than that, there is no difference between the Python code that goes into a program file or a Jupyter notebook.\n",
24 | "We will return to JSON files later, when we work with dictionaries and advanced data structures."
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "id": "c5616878",
30 | "metadata": {},
31 | "source": [
32 | "## Getting familiar with the interface"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "id": "a2b8f8a3",
38 | "metadata": {},
39 | "source": [
40 | "There are two fairly prominent terms that you should notice, which are probably new to you: cells and kernels are key both to understanding Jupyter and to what makes it more than just a word processor. Fortunately, these concepts are not difficult to understand.\n",
41 | "\n",
42 | "- A kernel is a “computational engine” that executes the code contained in a notebook document.\n",
43 | "- A cell is a container for text to be displayed in the notebook or code to be executed by the notebook’s kernel."
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "id": "7008f1be",
49 | "metadata": {},
50 | "source": [
51 | "### Cells\n",
52 | "We’ll return to kernels a little later, but first let’s come to grips with cells.\n",
53 | "\n",
54 | "- A code cell contains code to be executed in the kernel. When the code is run, the notebook displays the output below the code cell that generated it.\n",
55 | "- A Markdown cell contains text formatted using Markdown and displays its output in-place when the Markdown cell is run.\n",
56 | "\n",
57 | "The first cell in a new notebook is always a code cell.\n",
58 | "\n",
59 | "Let’s test it out with a classic hello world example: Type `print(\"Hello World!\")` into the cell and click the Run button in the toolbar above or press Ctrl + Enter.\n",
60 | "\n",
61 | "The result should look like this:"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 1,
67 | "id": "ceb0aa58",
68 | "metadata": {},
69 | "outputs": [
70 | {
71 | "name": "stdout",
72 | "output_type": "stream",
73 | "text": [
74 | "Hello World!\n"
75 | ]
76 | }
77 | ],
78 | "source": [
79 | "print(\"Hello World!\")"
80 | ]
81 | },
82 | {
83 | "cell_type": "markdown",
84 | "id": "be2ac251",
85 | "metadata": {},
86 | "source": [
87 | "### Kernels\n",
88 | "Behind every notebook runs a kernel. When you run a code cell, that code is executed within the kernel. Any output is returned back to the cell to be displayed. The kernel’s state persists over time and between cells — it pertains to the document as a whole and not individual cells.\n",
89 | "\n"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "id": "7730964a",
95 | "metadata": {},
96 | "source": [
97 | "More on Jupyter Notebooks for beginners: https://www.dataquest.io/blog/jupyter-notebook-tutorial/"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "id": "5b1f1795",
103 | "metadata": {},
104 | "source": [
105 | "## Version control: Git and GitHub\n",
106 | "\n",
107 | "Version control is an essential part of coding. It ensures that you keep proper track of your progress when writing a code for analysis or developing a code/package/material etc. We strongly encourage you to create this good habit during this course. \n",
108 | "\n",
109 | "There are multiple ways to do version control, each with its own benefits and drawbacks. Here we list the most commonly used ones:\n",
110 | "\n",
111 | "- [GitHub Desktop](https://desktop.github.com/) (and [friends](https://www.slant.co/options/13488/alternatives/~github-desktop-alternatives), eg. [VS Code](https://code.visualstudio.com/)):\n",
112 | " - Application to conveniently keep track of your modifications, commit, pull and push to GitHub\n",
113 | " - Pro: easy to use, flexible for all types of files, helps to avoid conflicts\n",
114 | " - Con: extra application, should create a habit and not forget about it :)\n",
115 | "\n",
116 | "- Shell/Terminal\n",
117 | " - Using shell or terminal for version control\n",
118 | " - Pro: flexible for all types of files, can do literally everything there\n",
119 | " - Con: hard to learn, can make mistakes, which is hard to correct\n",
120 | " \n",
121 | "It does not matter which one you use; the main point is to use version control. It makes your life much easier, especially with complicated projects, where you have to test and try out different directions. [GitHub](https://github.com/) is a great platform to collaborate on, but there are others as well.\n",
122 | "\n",
123 | "In this course, we do not overview how to do version control but assume the basics are known. Some useful material (thanks to [Oliver Kiss](https://github.com/kiss-oliver)) can be found at:\n",
124 | "\n",
125 | " - Technical foundations of informatics book: https://info201.github.io/git-basics.html\n",
126 | " - Software carpentry course (Strongly recommended): https://swcarpentry.github.io/git-novice/\n",
127 | " - Github Learning Lab: https://lab.github.com/\n",
128 | " - If you are really committed (pun intended): https://git-scm.com/book/en/v2"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "id": "bce2a334",
134 | "metadata": {},
135 | "source": [
136 | "\n",
137 | "## VS Code: An IDE For Python Scripts\n",
138 | "\n",
139 | "While the course uses Jupyter notebooks, Python code in production environments is run from script files ending with a `.py` extension. Jupyter offers a simple text editor to write script files, but most developers use a dedicated software application, a so-called 'integrated development environment', or IDE, to write these scripts. \n",
140 | "\n",
141 | "From the many possible alternative IDEs we recommend Visual Studio Code, or VS Code, a free Microsoft tool for these developments. It is a lightweight code editor with a myriad of extensions which enable VS Code to support basically *any* programming language. Beyond support for particular languages, VS Code also has solutions for things like version control, container management or cloud access. VS Code works equally well on Windows, macOS or Linux. \n",
142 | "\n",
143 | "VS Code can be downloaded from [here](https://code.visualstudio.com/) and tutorials can be accessed through the [documentation](https://code.visualstudio.com/docs).\n",
144 | "\n",
145 | "We recommend using the [Pylance](https://marketplace.visualstudio.com/items?itemName=ms-python.vscode-pylance) extension for Python projects but other options can equally be fine. The course does not cover the deployment of production-ready Python solutions, so VS Code is just an optional component of your toolkit for using Python later. \n",
146 | "\n",
147 | "\n",
148 | "\n",
149 | "## Appendix: A Primer On Virtual Environments\n",
150 | "\n",
151 | "A virtual environment is an isolated workspace for a particular project. In effect it is a directory structure which contains Python executable files and other files which tell Python the packages and their version numbers to use in that project. We set up this environment to make sure that all readers get exactly the same results when running the code snippets on the book's exercises. \n",
152 | "\n",
153 | "If you want to take a deep dive into Python's virtual environments, read [this](https://realpython.com/python-virtual-environments-a-primer/) detailed discussion of the topic. Beyond the documentation we refer to above, you can also get some more technical information about `pipenv` [here](https://pipenv-searchable.readthedocs.io/). \n",
154 | "\n",
155 | "Nevertheless, you don't need to be an expert on virtualenvs in order to be able to follow the course material. \n"
156 | ]
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "id": "032c4444",
161 | "metadata": {},
162 | "source": []
163 | }
164 | ],
165 | "metadata": {
166 | "kernelspec": {
167 | "display_name": "Python 3.10.0 64-bit",
168 | "language": "python",
169 | "name": "python3"
170 | },
171 | "language_info": {
172 | "codemirror_mode": {
173 | "name": "ipython",
174 | "version": 3
175 | },
176 | "file_extension": ".py",
177 | "mimetype": "text/x-python",
178 | "name": "python",
179 | "nbconvert_exporter": "python",
180 | "pygments_lexer": "ipython3",
181 | "version": "3.10.0"
182 | },
183 | "vscode": {
184 | "interpreter": {
185 | "hash": "98590ff4fe04c8543246b2a01debd3de3c5ca9b666f43f1fa87d5110c692004c"
186 | }
187 | }
188 | },
189 | "nbformat": 4,
190 | "nbformat_minor": 5
191 | }
192 |
--------------------------------------------------------------------------------
/lecture00-intro/setup.md:
--------------------------------------------------------------------------------
1 | # Set up environment
2 | ## Get Python
3 |
4 | 1. Install the latest version of Python from the [official website](https://www.python.org/downloads/). **We used [version 3.8](https://www.python.org/downloads/release/python-3811/)**
5 |
6 | 2. We suggest using [Jupyter Notebook](https://jupyter-notebook.readthedocs.io/en/stable/) to edit and run Python code. You can install it via `pip` by running `pip3 install jupyter` in your terminal/PowerShell.
7 |
8 |
9 | ## How to run case studies and coding class in Python
10 |
11 | 1. **Install `Pipenv`**
12 |
13 | We use [Pipenv](https://pipenv-fork.readthedocs.io/en/latest/index.html) for Python dependency management. First, install it via `pip` by running the following code in your terminal/PowerShell:
14 |
15 | ```
16 | pip3 install pipenv
17 | ```
18 |
19 | 2. **Create virtual environment and install required packages**
20 |
21 | Go to the `da-coding-python` folder to create a virtual environment and install packages by running the following code in your terminal/PowerShell:
22 |
23 | ```
24 | pipenv sync
25 | ```
26 |
27 | This installs the required Python version and packages stored in the `Pipfile.lock`.
28 |
29 |
30 |
31 | 3. **Run Jupyter Notebook**
32 |
33 | To start a Jupyter Notebook in this virtual environment, go to the `da-coding-python` folder and run the following code in your terminal/PowerShell:
34 |
35 | ```
36 | pipenv run jupyter notebook
37 | ```
38 |
39 | The jupyter environment should be opened on your default browser. You are good to go!
40 |
41 | **NOTE:** For Windows users, the above code chunks might result in an error, because the `pipenv` terminal shortcut sometimes does not install properly. In this case, run ```python3 -m pipenv sync``` and ```python3 -m pipenv run jupyter notebook```.
--------------------------------------------------------------------------------
/lecture01-coding-basics/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 01: Coding basics
2 |
3 | ## Motivation
4 |
5 | In this lecture we jump into the very basics of Python. This is the beginning of a long journey, which, honestly, will never end.
6 |
7 |
8 | ## This lecture
9 |
10 | We start with general coding principles, how to name variables, why and how to comment scripts, and we give some formatting tips. Then we cover basic variable types, assignments and operators. We end with string manipulations, showing hands-on examples of how to automate the composition of SQL query strings. By the time you go through [`coding_basics.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture01-coding-basics/coding_basics.ipynb) you will get a taste of things to come.
11 |
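As a small taste of the string-manipulation part, here is a minimal sketch of composing an SQL query string with an f-string (the table and column names below are made up for illustration; the notebook builds up its own examples):

```python
# Build an SQL query string from Python variables with an f-string
table = "hotels"      # hypothetical table name
city = "Vienna"       # hypothetical filter value
min_stars = 3

query = f"SELECT * FROM {table} WHERE city = '{city}' AND stars >= {min_stars};"
print(query)  # SELECT * FROM hotels WHERE city = 'Vienna' AND stars >= 3;
```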
12 |
13 | ## Learning outcomes
14 |
15 | After successfully completing the learning material, students will be familiar with
16 |
17 | - general coding principles (for both Python and other languages)
18 | - the basic variable types: integers, floats, and booleans
19 | - how to assign value to a variable
20 | - how to do basic operations with these variables
21 | - how to manipulate strings, focusing on writing some simple SQL queries, the primary tool of any data professional.
22 |
23 |
24 | ## Datasets used
25 | * No dataset is used in this lecture
26 |
27 |
28 | ## Lecture Time
29 |
30 | Ideal overall time: **20-30 mins**.
31 |
32 | We tried to keep this part to the bare minimum as it is pretty straightforward. There will be many more examples on how to use variables of various types later in the course.
33 |
34 |
35 | ## Homework
36 |
37 | No homework for this lecture.
38 |
39 |
40 | ## Further material
41 |
42 | [The Zen of Python](https://peps.python.org/pep-0020/)
43 |
--------------------------------------------------------------------------------
/lecture02-basic-structures/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 02: Basic structures
2 |
3 | ## Motivation
4 |
5 | Learning how to program is a long journey. The good news is that once you learn the very basics you can write your first programs. In these programs you may need some simple variables, but nothing more. As you move forward and want to write a little more complex solutions, you will add more complexity to your data. Your solution may also need other functionalities, which need to be added to your workspace when your script runs.
6 |
7 |
8 | ## This lecture
9 |
10 | This lecture covers `collections` of data and their usage:
11 |
12 | - lists
13 | - tuples
14 | - sets
15 | - dictionaries.
16 |
17 | In addition, we introduce `JSON`, a lightweight format for storing and transferring data.
18 |
19 | Finally, we show how to import, use, and write `modules`.
20 |
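As a quick preview, here is a minimal sketch of these structures (the values are made up for illustration; the notebook works through richer examples):

```python
import json

cities = ["Vienna", "London", "Vienna"]        # lists may contain duplicates
coords = (48.2, 16.4)                          # tuples are immutable
unique_cities = set(cities)                    # sets drop duplicates
hotel = {"name": "Hotel Example", "stars": 4}  # dictionaries map keys to values

# JSON round-trip: dictionary -> JSON string -> dictionary
as_json = json.dumps(hotel)
back = json.loads(as_json)
print(unique_cities, coords, back["stars"])
```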
21 |
22 | ## Learning outcomes
23 |
24 | After completing this [basic_structures.ipynb](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture02-basic-structures/basic_structures.ipynb) lecture students will
25 |
26 | - understand the similarities and differences between the four Python collections
27 | - be able to inspect, search and modify lists
28 | - be able to iterate over these lists
29 | - be able to use lists in SQL script automation
30 | - understand how tuples work
31 | - be able to inspect dictionary items and select particular dictionary values by keys
32 | - be able to cast lists into sets
33 | - be able to do set operations
34 | - be able to cast JSON-format strings into dictionaries and vice versa
35 | - be able to import and use modules
36 | - write and import their own modules.
37 |
38 |
39 | ## Datasets used
40 | * No dataset is used in this lecture
41 |
42 |
43 | ## Lecture Time
44 |
45 | Ideal overall time: **30-40 mins**.
46 |
47 |
48 | ## Homework
49 |
50 | Create a dictionary.
51 |
52 | - Iterate through the keys and print the keys together with the values.
53 | - Try to print a value for an unknown key (which is not included in your dictionary) in two ways:
54 | 1. Your script throws an error when calling the unknown key.
55 | 2. Your script prints a default value.
56 | - Cast your dictionary keys into lists.
57 |
58 | Create another dictionary with overlapping keys with the first one but different values. Try to merge the two. What do you see?
59 |
60 |
61 | ## Further material
62 | - [Official Python tutorial](https://docs.python.org/3/tutorial/datastructures.html) on lists, tuples, sets & dictionaries.
63 |
64 |
--------------------------------------------------------------------------------
/lecture02-basic-structures/sample_module.py:
--------------------------------------------------------------------------------
1 | def print_hello(name):
2 | print(f'Hello {name}!')
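3 | 
4 | # Example usage from a notebook or script in the same folder:
5 | #   import sample_module
6 | #   sample_module.print_hello('Monty')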
--------------------------------------------------------------------------------
/lecture03-data-IO/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 03: Data I/O
2 |
3 | ## Motivation
4 |
5 | Reading and writing files is a regular step in all software applications. All data containers (such as numpy arrays or Pandas data frames) have their own read and write operations, so we focus on reading text files in this lecture.
6 |
7 |
8 | ## This lecture
9 |
10 | This [short module](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture03-data-IO/data_IO.ipynb) shows you how to read text files from the file system, how to write and save files, and how to navigate in the file system using the `os` module.
11 |
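As a quick preview, here is a minimal sketch of the pattern used throughout the notebook (the file name is just an example):

```python
import os

# Write a small text file, then read it back line by line
with open("example.txt", mode="w", encoding="utf-8") as f:
    f.write("first line\nsecond line\n")

with open("example.txt", encoding="utf-8") as f:
    for line in f:
        print(line.strip())

# The os module tells us where we are and what is in the directory
print(os.getcwd())
print(os.listdir())
```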
12 |
13 | ## Learning outcomes
14 |
15 | After running the material's code chunks students will be able to
16 |
17 | - open a file for reading
18 | - read text files line by line
19 | - handle file encodings
20 | - write new files, and append to or overwrite existing files
21 | - get working directory information
22 | - list files in a directory
23 | - create operating-system-specific file path strings
24 | - create a new directory with Python
25 |
26 |
27 | ## Datasets used
28 | * No dataset is used in this lecture
29 |
30 |
31 | ## Lecture Time
32 |
33 | Ideal overall time: **10 mins**.
34 |
35 |
36 | ## Homework
37 |
38 | Explore the directory of your Python environment (from which you are running these codes) using the `os` module. List the contents of your directory and create a new subfolder there. Write a short text into a new text file saved in this directory. Read the [docs](https://docs.python.org/3/library/os.html) on how to delete files and directories, then delete this new directory and the file you have just saved there.
39 |
40 |
--------------------------------------------------------------------------------
/lecture03-data-IO/city_names.txt:
--------------------------------------------------------------------------------
1 | Český Krumlov, Pécs, Kraków
--------------------------------------------------------------------------------
/lecture03-data-IO/data_IO.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "68ce080e-8ee3-4120-b63e-d65731319404",
6 | "metadata": {},
7 | "source": [
8 | "# Lecture 3\n",
9 | "\n",
10 | "## I/O (Reading From and Writing To Files) \n",
11 | "\n",
12 | "- [reading](#reading)\n",
13 | "- [writing](#writing)\n",
14 | "\n",
15 | "## Navigating The File System \n",
16 | "\n",
17 | "- using the [`os` module](#os)\n",
18 | "----"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "id": "2c3950bf-f26d-40ba-903c-ffaf11d56988",
24 | "metadata": {},
25 | "source": [
26 | "## I/O"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "id": "51631150-3331-4848-a397-e132a9878da6",
32 | "metadata": {},
33 | "source": [
34 | "### Reading "
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "id": "8258ef5e-3455-4a0c-94ee-6f2f6574d2f4",
40 | "metadata": {},
41 | "source": [
42 | "Before any file operation we need to `open` the file."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "id": "51d8cfcb-366a-477d-beba-9a55ab7a2d04",
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "f = open('Linux_2k.log')"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "id": "6543d7be-ef7c-40ea-9607-32a24e03863b",
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "print(f)"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "id": "04eed29a-dc05-4dc6-b4ad-92d7014fbb85",
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "f.read()"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "id": "df58897a-973d-4f27-bb0e-d828241babe9",
78 | "metadata": {},
79 | "source": [
80 | "You also need to `close` the file, otherwise your program will not allow other programs to access it."
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "id": "f6bc5c6d-e35f-4152-8f0f-3db9533d67ee",
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "f.close()"
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "id": "4bfa0064-1ee7-49ec-8cef-2f255cc78a9f",
96 | "metadata": {},
97 | "source": [
98 | "Note: We are using a system log example from the [Loghub](https://github.com/logpai/loghub) repository. The relevant documentation can be found on [arxiv.org](https://arxiv.org/abs/2008.06448)."
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "id": "00fac8ee-7666-4ff7-9152-1b1dca73e71e",
104 | "metadata": {},
105 | "source": [
106 | "You can also pass *encoding information* to the `open()` function to avoid problems with special characters. "
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "id": "d3bc00aa-2a92-4fdc-8cb8-b4f86c87fabe",
113 | "metadata": {},
114 | "outputs": [],
115 | "source": [
116 | "f = open('city_names.txt')\n",
117 | "f.read()"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "id": "52d4e448-4f04-4c94-9655-abfe3bc09d86",
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "f.close()"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": null,
133 | "id": "5f0c3f04-b3bf-433d-993b-5cfe6e2c3c05",
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "f = open('city_names.txt', encoding = 'utf-8')\n",
138 | "f.read()"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": null,
144 | "id": "4033b6e3-24a2-41a0-9d70-84db73c4d953",
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "f.close()"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "id": "e9ff1cca-0bea-4137-bfb2-5526522a5f98",
154 | "metadata": {},
155 | "source": [
156 | "You can find encoding options for all languages and character sets in the documentation of the [codecs module](https://docs.python.org/3/library/codecs.html#standard-encodings). "
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "id": "a78f4683-5484-48ea-b425-2692e63f093a",
162 | "metadata": {},
163 | "source": [
164 | "Multiline text can also be read sequentially."
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": null,
170 | "id": "ef14caef-b018-49c6-9eb0-05372fb1a0e4",
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "f = open('Linux_2k.log')"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "id": "2d9d4ef2-74fd-4b8e-9d02-1d7451cd3d23",
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "f.readline()"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "id": "e1114b95-fad7-453c-84ef-7021f427b500",
191 | "metadata": {},
192 | "outputs": [],
193 | "source": [
194 | "f.readline()"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "id": "4736993f-2c12-42f7-a486-6a50784b0be6",
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "f.close()"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "id": "8947bdfa-ba99-45ab-98ba-3081ef54d9b4",
210 | "metadata": {},
211 | "source": [
212 | "The best way to read and write files is by using the `with` statement. This ensures that the file is closed when the block inside the `with` statement is exited. We don't need to explicitly call the `close()` method; it is done automatically."
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "id": "4aa35b7a-f047-4d60-b604-37e9514a6ea8",
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "with open(\"Linux_2k.log\", encoding=\"utf-8\") as f:\n",
223 | " for line in f: # remember to indent!\n",
224 | " print(line)\n",
225 | "\n",
226 | "# After the operation the connection to the file is closed."
227 | ]
228 | },
229 | {
230 | "cell_type": "markdown",
231 | "id": "8bf19b9c-2ecc-4940-8647-978653884b06",
232 | "metadata": {},
233 | "source": [
234 | "### Writing "
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": null,
240 | "id": "127c154e-84d9-4938-acee-8ece7647f85e",
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "with open(file = 'message.txt', mode = 'w', encoding = 'utf-8') as write_text:\n",
245 | " write_text.write('Hello Monthy! \\nThis is Python class on file I/O.')"
246 | ]
247 | },
248 | {
249 | "cell_type": "markdown",
250 | "id": "5b747e36-1232-447b-9e75-1e47b3708675",
251 | "metadata": {},
252 | "source": [
253 | "There are four ways to open a file:\n",
254 | "- \"r\" - Read - Default value. Opens a file for reading, error if the file does not exist\n",
255 | "- \"a\" - Append - Opens a file for appending, creates the file if it does not exist\n",
256 | "- \"w\" - Write - Opens a file for writing, creates the file if it does not exist\n",
257 | "- \"x\" - Create - Creates the specified file, returns an error if the file exists"
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "id": "bb263a87-dc6c-40e1-9933-5ee5e3e7bc7e",
263 | "metadata": {},
264 | "source": [
265 | "## Navigating The File System "
266 | ]
267 | },
268 | {
269 | "cell_type": "markdown",
270 | "id": "d31e62c0-8375-4f92-99cf-7c984a183dce",
271 | "metadata": {},
272 | "source": [
273 | "One way to navigate in your file system is by using the `os` module. This module provides methods for getting directory info, creating and deleting folders, listing files, etc. "
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "id": "26233f46-e530-440a-8ffd-f8ab8f43d90e",
280 | "metadata": {},
281 | "outputs": [],
282 | "source": [
283 | "import os"
284 | ]
285 | },
286 | {
287 | "cell_type": "markdown",
288 | "id": "a91f70f6-b83a-443b-a85e-600391803ee1",
289 | "metadata": {},
290 | "source": [
291 | "`getcwd()` will give you your current working directory, and `listdir()` lists the files in the directory of your choice. (If you don't pass the 'path' parameter as input it will list the files in your current working directory.)"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": null,
297 | "id": "dccb38e4-21eb-47fc-9519-31e83500b74a",
298 | "metadata": {},
299 | "outputs": [],
300 | "source": [
301 | "current_directory = os.getcwd()"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": null,
307 | "id": "7d9b6be5-b656-4350-9733-56a805bb10ab",
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "files = os.listdir(current_directory)"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": null,
317 | "id": "e9d0aae3-8035-4e27-a738-0dff1e07164b",
318 | "metadata": {},
319 | "outputs": [],
320 | "source": [
321 | "print(files)"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": null,
327 | "id": "10f5c962-a32a-480b-8e14-1541a856f102",
328 | "metadata": {},
329 | "outputs": [],
330 | "source": [
331 | "type(files)"
332 | ]
333 | },
334 | {
335 | "cell_type": "markdown",
336 | "id": "69487636-8810-4b03-b0da-d94b045307e6",
337 | "metadata": {},
338 | "source": [
339 | "The `os` module provides commands to interact with the file system. `mkdir()` will create a new directory, and `path.join()` is used to build new paths. Note that `os.path.join()` uses the appropriate directory separators, depending on your operating system (forward slashes for Linux and macOS, backslashes for Windows)."
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "id": "3f1187ef-16f1-49f9-a347-a823e05a55e9",
346 | "metadata": {},
347 | "outputs": [],
348 | "source": [
349 | "path = \"C:\\\\Users\\\\\" # win\n",
350 | "path = \"/Users/\" # mac os\n",
351 | "\n",
352 | "# Join various path components\n",
353 | "os.path.join(path, \"Documents\", \"Python_classes\", \"\")"
354 | ]
355 | }
356 | ],
357 | "metadata": {
358 | "kernelspec": {
359 | "display_name": "Python 3 (ipykernel)",
360 | "language": "python",
361 | "name": "python3"
362 | },
363 | "language_info": {
364 | "codemirror_mode": {
365 | "name": "ipython",
366 | "version": 3
367 | },
368 | "file_extension": ".py",
369 | "mimetype": "text/x-python",
370 | "name": "python",
371 | "nbconvert_exporter": "python",
372 | "pygments_lexer": "ipython3",
373 | "version": "3.8.10"
374 | }
375 | },
376 | "nbformat": 4,
377 | "nbformat_minor": 5
378 | }
379 |
--------------------------------------------------------------------------------
/lecture03-data-IO/message.txt:
--------------------------------------------------------------------------------
1 | Hello Monthy!
2 | This is Python class on file I/O.
--------------------------------------------------------------------------------
/lecture04-pandas-basics/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 04: Data Munging with pandas
2 |
3 | ## Motivation
4 |
5 | Before analyzing the data, data analysts spend a lot of time organizing, managing, and cleaning it to prepare it for analysis. This is called data wrangling or data munging. It is often said that 80 percent of data analysis time is spent on these tasks. Data wrangling is an iterative process: we usually start by organizing and cleaning our data, then start doing the analysis, and then go back to the cleaning process as problems emerge during analysis.
6 |
7 | Here we introduce students to a (relatively) easy way of carrying out this task, using the case study of [finding a good deal among hotels](https://gabors-data-analysis.com/casestudies/#ch02a-finding-a-good-deal-among-hotels-data-preparation). Starting with the initial data preparation, the case study works towards finding hotels that are underpriced relative to their location and quality. In this lecture, we illustrate how to find problems with observations and variables and how to solve those problems.
8 |
9 | ## This lecture
10 |
11 |
12 | This lecture introduces `pandas`, the fundamental data-handling package of Python. It shows multiple column and row manipulations on a DataFrame, and introduces students to manipulating raw data in various ways with `pandas`.
13 |
14 | This lecture is based on [Chapter 02, A: Finding a good deal among hotels: data preparation](https://gabors-data-analysis.com/casestudies/#ch02a-finding-a-good-deal-among-hotels-data-preparation).
15 |
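As a quick preview, here is a minimal sketch of the kind of manipulations covered (the tiny DataFrame and its column names are made up for illustration; the notebooks work with the hotels data):

```python
import pandas as pd

# A tiny, made-up DataFrame to illustrate the basic pandas workflow
df = pd.DataFrame(
    {"hotel": ["A", "B", "C"], "price": ["90", "120", "80"], "city": ["Vienna", "London", "Vienna"]}
)

df["price"] = pd.to_numeric(df["price"])  # convert character to numeric
vienna = df.loc[df["city"] == "Vienna"]   # filter observations
vienna = vienna.sort_values("price")      # sort ascending by price
print(vienna)
```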
16 |
17 | ## Learning outcomes
18 | After successfully completing [`01_pandas_basics.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture04-pandas-basics/01_pandas_basics.ipynb), students should be able to:
19 |
20 | - create pandas `Series`
21 | - create pandas `DataFrames` from `Series`, dictionaries, lists
22 | - access data in a `DataFrame` with `loc` and `iloc`
23 | - reset index
24 | - rename columns
25 | - access metadata of `DataFrame`s
26 |
27 | After successfully completing [`02_pandas_data_munging.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture04-pandas-basics/02_pandas_data_munging.ipynb), students should be able to:
28 |
29 | - add variables
30 | - separate a character variable into two (or more) variables
31 | - convert different types of variables to specific types:
32 | - character to numeric
33 | - string manipulations in pandas Series
34 | - filter out different observations
35 | - select observations with specific values
36 | - tabulate different values of a variable
37 | - filter out missing values
38 | - replace specific values with others
39 | - handle duplicates
40 | - use pipes to do multiple manipulations at once
41 | - sort data ascending or descending according to a specific variable
42 |
43 | ## Datasets used
44 | * [Hotels Europe](https://gabors-data-analysis.com/datasets/#hotels-europe)
45 |
46 |
47 | ## Lecture Time
48 |
49 | Ideal overall time: **60 mins**.
50 |
51 |
52 | ## Homework
53 |
54 | *Type*: quick practice, approx 10 mins
55 |
56 | Use the same [hotel-europe data from OSF](https://osf.io/r6uqb/), but now
57 | - Download both `hotels-europe_price.csv` and `hotels-europe_features.csv`
58 | - `merge` them in this order by `hotel_id`
59 | - filter for:
60 | - time: 2018/01 and weekend == 1
61 | - city: Vienna or London. Hint: for multiple matches, use something like:
62 | ```python
63 | data.loc[data["city"].isin(['City_A','City_B'])]
64 | ```
65 | - accommodation should be Apartment, 3-4 stars (only) with more than 10 reviews
66 | - price is less than 600$
67 | - arrange the data in ascending order by price
68 |
69 | ## Further material
70 |
71 | - More materials on the case study can be found in Gabor's [da_case_studies repository](https://github.com/gabors-data-analysis/da_case_studies): [ch02-hotels-data-prep](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch02-hotels-data-prep/ch02-hotels-data-prep.R)
72 | - Arthur Turrell's Coding for Economics classes: [Data Analysis Quickstart](https://aeturrell.github.io/coding-for-economists/data-analysis-quickstart.html), [Working with Data](https://aeturrell.github.io/coding-for-economists/data-intro.html), [Data Transformation](https://aeturrell.github.io/coding-for-economists/data-transformation.html)
--------------------------------------------------------------------------------
/lecture05-graphs-basics/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 05: graphs basics
2 |
3 | ## Motivation
4 |
5 | You should look at your data. Graphs and charts let you explore and learn about the structure of the information you collect. Good data visualizations also make it easier to communicate your ideas and findings to other people. Beyond that, producing effective plots from your own data is the best way to develop a good eye for reading and understanding graphs — good and bad — made by others, whether presented in research articles, business slide decks, public policy advocacy, or media reports.
6 |
7 | To create a powerful graph, it is a good starting principle that all of our decisions should be guided by the *usage of the graph*: a summary concept to capture what we want to show and to whom. Its main elements are purpose, focus, and audience. Once usage is clear, the first set of decisions to make is about how we convey information: how to show what we want to show. For those decisions it is helpful to understand the entire graph as the
8 | overlay of three graphical objects:
9 |
10 | 1. Geometric object; the geometric visualization of the information we want to convey, such as a
11 | set of bars, a set of points, or a line; multiple geometric objects may be combined.
12 | 2. Scaffolding: elements that support understanding the geometric object, such as axes, labels, and
13 | legends.
14 | 3. Annotation: adding anything else to emphasize specific values or explain more detail.
15 |
16 | Keeping these in mind this lecture introduces students to how to create graphs that take into account these principles.
17 |
18 | ## This lecture
19 |
20 | This lecture introduces the tools to create and manipulate plots with `plotnine` and `matplotlib`. `plotnine` is used throughout the [`case studies`](https://github.com/gabors-data-analysis/da_case_studies) of the textbook; it is based on `ggplot2` of the R language.
21 |
22 | `matplotlib` is the primary charting library of Python. It is a massive library, which offers so much, that it can easily become overwhelming. Creating a basic chart is fairly simple, but sometimes just a little customization already requires a deep dive into the API.
23 |
24 | One of the reasons we cover matplotlib here is that many other libraries are also built on the matplotlib API, and plotting charts directly from Pandas dataframes is easier if we have a basic understanding of matplotlib's mechanics. There are other popular charting packages, such as `seaborn` or `Plotly`, but we think that a real Pythonista should be able to work with matplotlib objects.
25 |
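To illustrate the `Figure`/`Axes` distinction the matplotlib notebook builds on, here is a minimal sketch (the data values are made up):

```python
import matplotlib.pyplot as plt

# A Figure is the whole canvas; an Axes is one plot living on that canvas
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(["Vienna", "London"], [110, 180])  # made-up average prices
ax.set_ylabel("Price (US dollars)")
ax.set_ylim(0, 200)                       # set y-axis limits
fig.suptitle("Hotel prices by city")
plt.show()
```
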
26 | Case studies used/related in/to this lecture:
27 |
28 | - [Chapter 03, B Comparing hotel prices in Europe: Vienna vs London](https://gabors-data-analysis.com/casestudies/#ch03b-comparing-hotel-prices-in-europe-vienna-vs-london) is the base for this lecture.
29 | - Some tools are used in [Chapter 04, A Management quality and firm size: describing patterns of association](https://gabors-data-analysis.com/casestudies/#ch04a-management-quality-and-firm-size-describing-patterns-of-association)
30 |
31 |
32 | ## Learning outcomes
33 | After completing [`01_plotnine_intro.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture05-graphs-basics/01_plotnine_intro.ipynb) students should be able to:
34 |
35 | - create `ggplot` objects with different types of figures using `geoms`
36 | - manipulate axes with `scale_*_continuous` and `scale_*_discrete`, where `*` stands for `y` or `x`
37 | - set limits
38 | - set break points
39 | - add annotation to a plot
40 | - lines, dots and text
41 | - bar charts:
42 | - simple
43 | - stacked
44 | - stacked with percentages, using `scales` package
45 | - box plot
46 | - violin plot
47 | - use `color[x]` color values from a pre-defined list
48 |
49 | After completing [`02_matplotlib_intro.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture05-graphs-basics/02_matplotlib_intro.ipynb) students should be able to:
50 |
51 | - identify the two key objects of a `matplotlib` plot
52 | - `Figure`
53 | - `Axes`
54 | - set
55 | - y-axis limits
56 | - legends
57 | - log scale
58 | - using a second axis
59 | - spacing between the bars and horizontal grids
60 | - chart within a chart
61 |
62 | ## Datasets used
63 | * [Hotels Europe](https://gabors-data-analysis.com/datasets/#hotels-europe)
64 |
65 | ## Lecture Time
66 |
67 | Ideal overall time: **30-60mins**.
68 |
69 | Showing [`01_plotnine_intro.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture05-graphs-basics/01_plotnine_intro.ipynb) takes around *30 minutes* while doing the tasks would take approx *10-15 minutes*. Showing [`02_matplotlib_intro.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture05-graphs-basics/02_matplotlib_intro.ipynb) takes around *15-20 minutes*.
70 |
71 | ## Further material
72 |
73 | - [Introduction to Matplotlib — Data Visualization in Python](https://heartbeat.comet.ml/introduction-to-matplotlib-data-visualization-in-python-d9143287ae39) in general focuses on visualization with matplotlib.
74 | - Arthur Turrell's Coding for Economics classes: [Intro to Data Visualisation](https://aeturrell.github.io/coding-for-economists/vis-intro.html), [Common Plots](https://aeturrell.github.io/coding-for-economists/vis-common-plots.html)
75 | - [Official webpage of `plotnine`](https://plotnine.readthedocs.io/en/stable/)
76 | - [Official webpage of `matplotlib`](https://matplotlib.org/)
77 |
--------------------------------------------------------------------------------
/lecture06-conditionals/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 06: Conditional Programming
2 |
3 | ## Motivation
4 |
5 | Deciding what to do on a case-by-case basis is widely used in decision making and in programming. Conditional programming enables writing code with this in mind: if a certain condition holds, execute a command; otherwise, do something different. Conditional programming is a basic programming technique that emerges in many situations. Adding this technique to the programming toolbox is a must for data scientists.
6 |
7 | ## This lecture
8 |
9 | This lecture introduces students to conditional programming with `if-else` statements. It covers the essential control flows, such as the `for` and `while` loops, and list comprehension.
10 |
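In a nutshell, these are the constructs covered (a minimal sketch; the notebook works through richer examples):

```python
x = 7
if x % 2 == 0:        # if-else: branch on a condition
    print("even")
else:
    print("odd")

for word in ["Business", "analytics"]:   # for loop: iterate over a list
    print(word, len(word))

i = 0
while i < 3:          # while loop: repeat until the condition fails
    i += 1

squares = [n ** 2 for n in range(5)]     # list comprehension: [0, 1, 4, 9, 16]
print(i, squares)
```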
11 |
12 | ## Learning outcomes
13 | After successfully live-coding the material (see: [`conditionals_and_control_flows.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture06-conditionals/conditionals_and_control_flows.ipynb)), students will have knowledge on
14 |
15 | - how a conditional statement works
16 | - what are the crucial elements of an `if-else` statement
17 | - what is a `for` loop
18 | - what is a `while` loop
19 | - how to create a list comprehension
20 |
21 |
22 | ## Lecture Time
23 |
24 | Ideal overall time: **10-20 mins**.
25 |
26 | This is a relatively short lecture, and it can be even shorter if logical operators with vectors are skipped, although a good understanding of the anatomy of an `if-else` statement is important.
--------------------------------------------------------------------------------
/lecture06-conditionals/conditionals_and_control_flows.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "6bd65ab7-b72f-46b2-b0f1-c66d477e93a0",
6 | "metadata": {},
7 | "source": [
8 | "# Lecture 6\n",
9 | "\n",
10 | "## Conditional statements\n",
11 | "- the [if statement](#conditionals)\n",
12 | "\n",
13 | "## Control Flows\n",
14 | "- the [for loop](#for)\n",
15 | "- the [while loop](#while)\n",
16 | "- using for loops for [list comprehension](#comprehension)\n",
17 | "---"
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "id": "386d4659-98e0-4764-b66c-145d57e04bc6",
23 | "metadata": {},
24 | "source": [
25 | "## Conditional Statements "
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "id": "0a62d10e-f954-4283-b2f7-be6ededd84dd",
31 | "metadata": {},
32 | "source": [
33 | "Conditional statements are built around the `if`-`else` structure. The program performs an operation (or more) if certain conditions are met, and - optionally - performs some other operation if those conditions are not fulfilled."
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "id": "1c6be79a-9e10-4cec-b14b-7b583090c826",
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "import random"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "id": "c61ec38f-229b-4411-8e13-b972af8fa3dc",
49 | "metadata": {},
50 | "source": [
51 | "Conditional statements are controlled by ***indentation***. Each new embedded condition needs to be shifted one tab to the right. (Other languages, like Java or JavaScript, use curly braces.)"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "id": "5ba3b5fd-acee-4e54-9512-038c8df28a05",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "r = random.randint(20,34)\n",
62 | "print(r)\n",
63 | "if r < 25:\n",
64 | " print('A small number!')\n",
65 | "elif r < 30:\n",
66 | " print('A moderately high number.')\n",
67 | "else:\n",
68 | " print('A large number!')"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "id": "0653ae7a-a29a-49f7-889b-b182a31db17f",
74 | "metadata": {},
75 | "source": [
76 | "Conditional statements do not have to have an `else` branch. If the condition is not met the program can also stay idle. "
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "id": "2e27f14d-1e50-45b9-ba48-d1112e4d0997",
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "a = random.randint(1,12)\n",
87 | "b = random.randint(1,16)\n",
88 | "\n",
89 | "print('a:', a)\n",
90 | "print('b:', b)\n",
91 | "\n",
92 | "if a > 6:\n",
93 | " print(\"'a' is large\")\n",
94 | " if b > a:\n",
95 | " print('Both numbers are large.')\n",
96 | " print('Result: b is larger than a.')"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "id": "a14665d2-279f-43a6-8133-9e10c9c30e0e",
102 | "metadata": {},
103 | "source": [
104 | "## Control Flows"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "id": "0255d90b-a436-4823-b370-649226a85aa9",
110 | "metadata": {},
111 | "source": [
112 | "### The _'for'_ Loop "
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "id": "f0018cea-cd2a-4add-a308-891592cd7769",
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "for i in range(20): # Remember: 20 is not included in the range! \n",
123 | "    if i%2 == 0: # The 'modulo' operator returns the remainder of an integer division.\n",
124 | " print('Number %d is even.'% i)\n",
125 | " else:\n",
126 | " print('Number %d is odd.'% i)"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "id": "e1ba7e8a-e47f-467c-8ab4-4496215e8b42",
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "for word in ['Business', 'analytics', 'with', 'Python']:\n",
137 | " print(word, len(word)) # functions can also be print inputs"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "id": "0834b4d6-7041-495d-9c14-83fd5c5b2cfc",
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "list_capitals = []\n",
148 | "for i in range(65,91):\n",
149 | " list_capitals.append(chr(i))\n",
150 | "print(list_capitals)"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "id": "2114f1f2-7cf1-45f2-b66e-c8300953123f",
156 | "metadata": {},
157 | "source": [
158 | "The `enumerate` function helps you get a counter. "
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": null,
164 | "id": "ee9d72f3-9656-4e3e-8189-e5285a675ec0",
165 | "metadata": {},
166 | "outputs": [],
167 | "source": [
168 | "for k, v in enumerate(list_capitals):\n",
169 | " print(k, v)"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "id": "4f01fb95-cf0b-462a-86b7-5b8aa63678b4",
175 | "metadata": {},
176 | "source": [
177 | "Add some simple formatting: right-adjust k, the counter. This is what the `.rjust()` function does. This, however, is a *string function*, so we need to *cast* our 'k' variable, which an integer, into string. For this we use the `str()` function."
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "id": "d54c539d-edd9-415a-8e17-a46edcc5ce67",
184 | "metadata": {},
185 | "outputs": [],
186 | "source": [
187 | "for k, v in enumerate(list_capitals):\n",
188 | " print(str(k).rjust(2)+': ', v)"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "id": "c61e2222-0ef3-419c-b4cb-bbdbd31e7db4",
194 | "metadata": {},
195 | "source": [
196 | "### The *'while'* Loop "
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "id": "c9b5d1f0-b39c-48a5-bcb5-3b05cc74cc28",
203 | "metadata": {},
204 | "outputs": [],
205 | "source": [
206 | "i = 0 # the counter\n",
207 | "while i < 20:\n",
208 | " if i%2 == 0:\n",
209 | " print('Number %d is even.'% i)\n",
210 | " else:\n",
211 | " print('Number %d is odd.'% i)\n",
212 | " i += 1 # increment in Python (same as i++ in Java)\n",
213 | "print('\\nDone.') # Indented so that it will only print at the end."
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "id": "e6de1e59-4666-4b8e-ae85-9f697f2ec97d",
219 | "metadata": {},
220 | "source": [
221 | "**Caution!!!** If you don't increment the counter, the loop will never stop!"
222 | ]
223 | },
224 | {
225 | "cell_type": "markdown",
226 | "id": "28d1f43c-fba5-4080-afd2-4c866eb7ed73",
227 | "metadata": {},
228 | "source": [
229 | "If you use '*True*' in the `while` condition the script runs until manual interruption. "
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": null,
235 | "id": "32ea677f-3873-407d-82e2-5048f7d146bb",
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "from IPython.display import clear_output\n",
240 | "import time\n",
241 | "\n",
242 | "i = 1\n",
243 | "while True: # This syntax makes it run forever, or untill manual interruption. \n",
244 | " print(i)\n",
245 | " i += 1\n",
246 | " time.sleep(1)\n",
247 | " clear_output()"
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "id": "709db49c-ac70-4f7f-bf6a-316bdc74ca14",
253 | "metadata": {},
254 | "source": [
255 | "To interrupt the script in a code cell click in the cell and then click ■ (the black rectangle icon) on the notebook's menu bar. "
256 | ]
257 | },
258 | {
259 | "cell_type": "markdown",
260 | "id": "fb2e7251-a6ea-428e-b258-a16f550b42d3",
261 | "metadata": {},
262 | "source": [
263 | "### List Comprehension "
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "id": "07617a0b-a3cd-4e98-9131-778999da6c47",
269 | "metadata": {},
270 | "source": [
271 | "List comprehension is a logical construct to create a list from another lists or from an iterable, or to modify an existing list *in place*. "
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": null,
277 | "id": "8eb7bcef-9b0a-42a6-971c-b36f4b030289",
278 | "metadata": {},
279 | "outputs": [],
280 | "source": [
281 | "L = [x**2 for x in range(0,10)]\n",
282 | "L"
283 | ]
284 | },
285 | {
286 | "cell_type": "markdown",
287 | "id": "cef71af7-5a4c-4306-bda2-3eeaade310a4",
288 | "metadata": {},
289 | "source": [
290 | "You can also combine it with conditional statements. For example:"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": null,
296 | "id": "357b3dc6-f4e9-467d-a039-7b9754741d60",
297 | "metadata": {},
298 | "outputs": [],
299 | "source": [
300 | "[x for x in L if x%2 == 1]"
301 | ]
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "id": "9c7b8eab-2b45-4347-8d52-fcb18c713fca",
306 | "metadata": {},
307 | "source": [
308 | "You can also use an ``if else`` statement"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": null,
314 | "id": "b0db5a8f-fe3b-48c3-992a-039fe1bb1325",
315 | "metadata": {},
316 | "outputs": [],
317 | "source": [
318 | "['even' if x%2 == 0 else 'odd' for x in L]"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": null,
324 | "id": "251e6732",
325 | "metadata": {},
326 | "outputs": [],
327 | "source": []
328 | }
329 | ],
330 | "metadata": {
331 | "kernelspec": {
332 | "display_name": "Python 3 (ipykernel)",
333 | "language": "python",
334 | "name": "python3"
335 | },
336 | "language_info": {
337 | "codemirror_mode": {
338 | "name": "ipython",
339 | "version": 3
340 | },
341 | "file_extension": ".py",
342 | "mimetype": "text/x-python",
343 | "name": "python",
344 | "nbconvert_exporter": "python",
345 | "pygments_lexer": "ipython3",
346 | "version": "3.8.10"
347 | }
348 | },
349 | "nbformat": 4,
350 | "nbformat_minor": 5
351 | }
352 |
--------------------------------------------------------------------------------
/lecture07-data-exploration/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 07: Data Exploration
2 |
3 | ## Motivation
4 |
5 | You want to know whether online and offline prices differ in your country for products that are sold in both ways. You have access to data on a sample of products with their online and offline prices. How would you use this data to establish whether prices tend to be different or the same for all products?
6 |
7 | After collecting the data, assessing its quality, cleaning it, and structuring it, the next step is exploratory data analysis (EDA). Exploratory data analysis aims to describe variables in a dataset. EDA is important for understanding potential problems with the data and making analysts and their audiences familiar with the most important variables. The results of EDA help additional data cleaning, decisions for further steps of the analysis, and giving context to the results of the following hypothesis testing.
8 |
9 | The lecture discusses some basic concepts such as frequencies, probabilities, distributions, and extreme values. It includes guidelines
10 | for producing informative graphs and tables for presentation and describes the most important summary statistics. Furthermore, we cover the logic and practice of testing hypotheses. We describe the steps of hypothesis testing and discuss two alternative ways to carry it out: one with the help of a test statistic and a critical value, and another one with the help of a p-value. We focus on testing hypotheses about averages, but, as we show in one of our case studies, this focus is less restrictive than it may appear.
11 |
12 |
13 | ## This lecture
14 |
15 | This lecture introduces students to data exploration. `pandas` is used for descriptive tables, `plotnine` for creating graphs, and `scipy.stats` for hypothesis testing.
16 | Descriptive statistics and descriptive graphs for one variable are covered to help decide on further data munging.
17 | Moreover, simple hypothesis testing is covered, as well as association graphs and statistics between two variables.
18 |
19 | Case studies connected to this lecture:
20 | - [Chapter 03, A: Finding a good deal among hotels: data exploration](https://gabors-data-analysis.com/casestudies/#ch03a-finding-a-good-deal-among-hotels-data-exploration) - emphasis on one variable descriptive analysis, different data
21 | - [Chapter 03, D: Distributions of body height and income](https://gabors-data-analysis.com/casestudies/#ch03d-distributions-of-body-height-and-income) and [Chapter 03, U1: Size distribution of Japanese cities](https://gabors-data-analysis.com/casestudies/#ch03u1-size-distribution-of-japanese-cities) connects theoretical and empirical distributions
22 | - [Chapter 04, A: Management quality and firm size: describing patterns of association](https://gabors-data-analysis.com/casestudies/#ch04a-management-quality-and-firm-size-describing-patterns-of-association) - focuses on the association between two variables, one variable descriptive is not emphasized, different data.
23 | - [Chapter 06, A: Comparing online and offline prices: testing the difference](https://gabors-data-analysis.com/casestudies/#ch06a-comparing-online-and-offline-prices-testing-the-difference) - focuses on hypothesis testing, association and one variable descriptive is not emphasized.
24 |
25 | This lecture uses [Chapter 06, A](https://gabors-data-analysis.com/casestudies/#ch06a-comparing-online-and-offline-prices-testing-the-difference) as the starting point, but stresses the one variable descriptives such as in [Chapter 03, A](https://gabors-data-analysis.com/casestudies/#ch03a-finding-a-good-deal-among-hotels-data-exploration) and adds the two variable pattern analysis such as in [Chapter 04, A](https://gabors-data-analysis.com/casestudies/#ch04a-management-quality-and-firm-size-describing-patterns-of-association).
26 |
27 |
28 | ## Learning outcomes
29 | After completing the codes in [`data_exploration.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture07-data-exploration/data_exploration.ipynb), students should be able to:
30 |
31 | - `describe` for a quick summary of all variables in the dataframe
32 | - `skim` from the `skimpy` package for a nicer looking descriptive table
33 | - descriptive statistics for specific variables selected with `filter` (a short sketch follows this list), such as
34 | - mean, median, standard deviation, minimum, maximum, percentiles, number of observations, number of missing observations
35 | - user-created functions added to `agg` such as range or mode
36 | - descriptives for specific groups
37 | - use of `plotnine`:
38 | - histogram to plot empirical density with count or relative frequency. Understanding the role of the number of bins and bins' width.
39 | - kernel density to plot a smooth function for the empirical density with an understanding of the role of bandwidth.
40 | - stack multiple geometry objects in one graph and control for opaqueness
41 | - manipulate labels with `labs`
42 | - set axis limits with `xlim` and `ylim`
43 | - use a factor variable to graph multiple groups in one ggplot and understand the differences between `fill`, `color`, and `group` arguments.
44 | - create multiple plots in one graph with `facet_wrap`
45 | - carry out hypothesis test via t-test
46 | - two-sided, one-sided tests
47 | - multiple hypothesis test with `agg` and `groupby` functions
48 | - Association between two variables:
49 | - covariance with `cov` and correlation with `corr`
50 | - scatter plot
51 |     - bin-scatter: equidistant bin-scatter with `stat_summary_bin` and an equal number of observations in each bin by hand
52 | - correlation for specific subgroups and how to plot them. Use of `reorder`.
53 |
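
As a rough illustration of the descriptive-table items above, here is a minimal sketch using a small made-up dataframe (the `price` and `stars` columns are placeholders, not the lecture's data). It shows `describe`, a user-created statistic passed to `agg`, and descriptives for specific groups.

```python
import pandas as pd

# Placeholder data, not the lecture's hotel dataset
df = pd.DataFrame(
    {"price": [90, 110, 130, 80, 200, 150], "stars": [3, 3, 4, 3, 4, 4]}
)

# Quick summary of all variables
print(df.describe())

# User-created statistic added to `agg`
def value_range(x):
    return x.max() - x.min()

print(df["price"].agg(["mean", "median", "std", value_range]))

# Descriptives for specific groups
print(df.groupby("stars")["price"].agg(["mean", "median", value_range]))
```
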
54 | ## Datasets used
55 |
56 | * [billion-prices](https://gabors-data-analysis.com/datasets/#billion-prices)
57 | * [wms-management-survey](https://gabors-data-analysis.com/datasets/#wms-management-survey) as homework
58 |
59 |
60 | ## Lecture Time
61 |
62 | Ideal overall time: **70-100mins**.
63 |
64 | Showing [`data_exploration.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture07-data-exploration/data_exploration.ipynb) takes around *50 minutes* while doing the tasks would take the rest.
65 |
66 | I highly recommend doing all the tasks as this lecture involves many new functions.
67 | If you do not have enough time for one lecture, you may split this into two parts. Good breakpoints are:
68 |
69 | - hypothesis-testing
70 | - association
71 |
72 |
73 | ## Homework
74 |
75 | *Type*: quick practice, approx 15 mins
76 |
77 | Use the [wms-management-survey](https://gabors-data-analysis.com/datasets/#wms-management-survey) data, ['wms_da_textbook.csv' file](https://osf.io/uzpce/).
78 | Use the following units:
79 | - United States firms, observed in wave 2004 and employment of the firms should be between 100 and 5000.
80 | - Create a descriptive statistic table for variables of `management`, `emp_firm`, and `firm_age` with mean, median, sd, min, max, range, and 5% and 95% percentiles.
81 | - Create descriptive statistics for `management` grouped by `ownership` types. Use mean, median, min, and max.
82 | - Create a plot with histogram and kernel density, with proper labeling for `management` variable.
83 | - Create a new factor variable `firm_size`, which takes the value 'small and medium' if `emp_firm` is smaller than 1000 and 'large' otherwise. Hint: use a simple logical operator in a factor function, specifying the label.
84 | - Test if the average `management` score is different in large vs small and medium firms (a short sketch follows this list)
85 | - Create a bin-scatter with 10 bins, where on x-axis is the `emp_firm` and y-axis the `management` score. Use the same number of observations within each bin.
86 |
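
For the hypothesis-testing step, a minimal sketch of a two-sided t-test with `scipy.stats` is shown below; the two arrays are synthetic stand-ins for the `management` scores of the two `firm_size` groups you create above.

```python
import numpy as np
from scipy import stats

# Synthetic stand-ins for the management scores of the two firm-size groups
rng = np.random.default_rng(0)
large = rng.normal(3.2, 0.6, size=200)
small_medium = rng.normal(3.0, 0.6, size=300)

# Two-sided t-test with unequal variances (Welch test)
test = stats.ttest_ind(large, small_medium, equal_var=False)
print(test.statistic, test.pvalue)
```
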
87 | ## Further material
88 |
89 | - The Billion-Price-Project case study can be found in Gabor's da_case_studies repository: [ch06-online-offline-price-test](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch06-online-offline-price-test). This case study focuses primarily on hypothesis testing.
90 | - Data exploration case studies in Gabor's da_case_studies repository are [ch03-hotels-vienna-explore](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch03-hotels-vienna-explore) and [ch03-hotels-europe-compare](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch03-hotels-europe-compare). These focus on bars, histograms, and basic descriptive statistics.
91 | - Association, scatter, and bin-scatter are used in the case study [ch04-management-firm-size](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch04-management-firm-size) in Gabor's book.
92 | - Arthur Turrell's Coding for Economics classes: [Exploratory Data Analysis](https://aeturrell.github.io/coding-for-economists/data-exploratory-analysis.html), which introduces the `skimpy` and `pandas-profiling` packages.
--------------------------------------------------------------------------------
/lecture08-functions/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 08: Writing Functions
2 |
3 | ## Motivation
4 |
5 | One of the best ways to improve your reach as a data scientist is to write functions. Functions allow automating common tasks in a more powerful and general way than copy-and-pasting. Writing a function has three big advantages over using copy-and-paste:
6 |
7 | 1. You can give a function an evocative name that makes your code easier to understand.
8 | 2. As requirements change, you only need to update code in one place, instead of many.
9 | 3. You eliminate the chance of making incidental mistakes when you copy and paste (i.e. updating a variable name in one place, but not in another).
10 |
11 | Writing good functions is a lifetime journey. Even after using Python for many years, one can still learn new techniques and better ways of approaching old problems. The goal is not to teach you every esoteric detail of functions but to get you started with some pragmatic advice that you can apply immediately.
12 |
13 | ## This lecture
14 |
15 | This lecture introduces functions, how they are structured and how to write them.
16 |
17 |
18 | ## Learning outcomes
19 | After successfully live-coding the material (see: [`functions.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture08-functions/functions.ipynb)), students will know
20 |
21 | - how to create user-defined functions
22 | - what the structure of a function is
23 | - the use of `docstring`s to document functions
24 | - the use of the `help` function to retrieve function descriptions
25 | - the use of `lambda` functions
26 |
27 | ## Lecture Time
28 |
29 | Ideal overall time: **20-30 mins**.
30 |
31 |
32 | ## Homework
33 |
34 | *Type*: quick practice, approx 15 mins, together with [lecture06-conditionals](https://github.com/gabors-data-analysis/da-coding-python/edit/main/lecture06-conditionals).
35 |
36 | Bootstrapping - using the [`sp500`](https://gabors-data-analysis.com/datasets/#sp500) data
37 |
38 | - download the cleaned data for `sp500` from [OSF](https://osf.io/h64z2/)
39 | - write a function that calculates the bootstrap standard errors and confidence intervals based on these standard errors (a rough sketch is given after this list).
40 | - function should have an input for a) vector of prices, b) number of bootstraps, c) level for the confidence interval
41 | - create a new variable for `sp500`: `daily_return`, which is the difference in the prices from one day to the next day.
42 | - use this `daily_return` variable and calculate the 80% confidence interval based on bootstrap standard errors along with the mean.
43 |
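
One possible shape for the bootstrap function is sketched below. This is not a reference solution: the function name, the SE-based interval construction, and the synthetic usage data are all just illustrative.

```python
import numpy as np
from scipy import stats

def bootstrap_ci(x, n_boot=1000, level=0.8, seed=42):
    """Bootstrap SE and an SE-based confidence interval for the mean (sketch)."""
    x = np.asarray(x, dtype=float)
    x = x[~np.isnan(x)]
    rng = np.random.default_rng(seed)
    # means of n_boot resamples drawn with replacement
    boot_means = np.array(
        [rng.choice(x, size=x.size, replace=True).mean() for _ in range(n_boot)]
    )
    se = boot_means.std(ddof=1)
    z = stats.norm.ppf(0.5 + level / 2)  # ~1.28 for an 80% interval
    mean = x.mean()
    return mean, se, (mean - z * se, mean + z * se)

# Illustrative call on synthetic daily returns, not the sp500 data
returns = np.random.default_rng(1).normal(0, 1, size=500)
print(bootstrap_ci(returns, n_boot=1000, level=0.8))
```
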
44 |
45 | ## Further material
46 |
47 | - Case study materials from Gabor's da_case_studies repository: on generalization (with bootstrapping), see [ch05-stock-market-loss-generalize](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch05-stock-market-loss-generalize); on testing, see [ch06-online-offline-price-test](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch06-online-offline-price-test) and [ch06-stock-market-loss-test](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch06-stock-market-loss-test)
48 |
--------------------------------------------------------------------------------
/lecture08-functions/functions.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "9909070d-84b3-4484-8a73-49666a8c7dcc",
6 | "metadata": {},
7 | "source": [
8 | "# Lecture 8\n",
9 | "\n",
10 | "## Writing Functions\n",
11 | "- User-Defined [Functions](#UDF) (UDFs)\n",
12 | "- [lambda](#lambda) functions\n",
13 | "\n",
14 | "---"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "id": "64cf5f6f-84ca-4224-96e1-ba0a4d158564",
20 | "metadata": {},
21 | "source": [
22 | "## User-defined Functions "
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "id": "6f6446a6-9255-477a-b928-3d4a273a2808",
28 | "metadata": {},
29 | "source": [
30 | "A function is a block of organized, reusable code that is used to perform a single, related action. Functions provide better modularity for your application and a high degree of code reusing.\n",
31 | "\n",
32 | "You can define functions to provide the required functionality. Here are simple rules to define a function in Python.\n",
33 | "\n",
34 | "* Function blocks begin with the keyword ```def``` followed by the function name and parentheses ```( )```.\n",
35 | "\n",
36 | "* Any input parameters or arguments should be placed within these parentheses. You can also define parameters inside these parentheses.\n",
37 | "\n",
38 | "* The first statement of a function can be an optional statement - the documentation string of the function or docstring.\n",
39 | "\n",
40 | "* The code block within every function starts with a colon (```:```) and is **indented**.\n",
41 | "\n",
42 | "* The statement ```return``` [expression] returns a value, or a serious of values, a list, a dictionary, .... A return statement with no arguments is the same as return None."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "id": "e264b08d-0fc3-4fdd-b8a7-3b660d353884",
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "def add_one(number):\n",
53 | " x = number + 1\n",
54 | " return x"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "id": "71379516-7976-488a-8073-ef572c8ad946",
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "add_one(20)"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "id": "6ef54f7a-6379-4f6b-831f-3a0c1bc964e1",
70 | "metadata": {},
71 | "source": [
72 | "You can return more than one object from a single function. "
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "id": "dad4db6d-6289-43d4-8a42-f421294c8382",
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "def add_one_and_return_both(number):\n",
83 | " x = number\n",
84 | " y = x + 1\n",
85 | " return x, y"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "id": "2c2690e9-3ff2-4b92-9f7e-b55f428a2b2c",
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "x, y = add_one_and_return_both(23)\n",
96 | "print(x)\n",
97 | "print(y)"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "id": "e927412f-55ba-40c6-a1b1-661942b6fd2a",
103 | "metadata": {},
104 | "source": [
105 | "Function arguments can have default values."
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "id": "bd1def85-514c-482f-aa17-e3b301c03eda",
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "def number_to_the_power(number, exponent = 2):\n",
116 | " return number ** exponent"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "id": "7d4dcb2a-aadc-46ce-b253-12d3e83b991f",
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "number_to_the_power(5)"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "id": "d1734ec5-1097-4d3a-8b56-3ee9f0fd5122",
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "number_to_the_power(5, 3)"
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "id": "02a7f46d-2fc1-427a-a7d7-a5aa60d1d94f",
142 | "metadata": {},
143 | "source": [
144 | "Return objects can be of any type. Also, `docstrings` help you document your function. More on docstrings [here](https://www.datacamp.com/community/tutorials/docstrings-python)."
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": null,
150 | "id": "8dc06669-ca6b-4ad8-9493-74e3d7d360af",
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "def cast_listitems_to_string(list):\n",
155 | " \"\"\"\n",
156 | " Casts list of various elements to string. \n",
157 | " \n",
158 | " The function cast elements in a list to string,\n",
159 | " whatever their original type is.\n",
160 | " \n",
161 | " Parameters\n",
162 | " ----------\n",
163 | " list: list \n",
164 | " A list of various data types.\n",
165 | " \n",
166 | " Returns\n",
167 | " -------\n",
168 | " list: list\n",
169 | " A list of strings, cast from the original elements.\n",
170 | " \"\"\"\n",
171 | " for i in range(len(list)):\n",
172 | " list[i] = str(list[i]) # remember: lists are mutable\n",
173 | " return list"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "id": "cd7e75b0-fa2d-4a1a-bb66-384a15e03099",
179 | "metadata": {},
180 | "source": [
181 | "Docstrings are returned when you call the `help()` function on your UDF. This is especially helpful when you import your function from a module in a complex solution. "
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": null,
187 | "id": "e568ac74-5c5a-4ad4-b3a1-3c7d23406e03",
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "help(cast_listitems_to_string)"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": null,
197 | "id": "4c4a7080-bbb8-4787-95b5-163eb3ce8719",
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "import math \n",
202 | "\n",
203 | "ls_convertable = [1,2, 'a', math.cos(math.pi / 3)]"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": null,
209 | "id": "bee0d780-184d-418e-9dd9-da19f9624be7",
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "ls_convertable"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "id": "19f1dac0-1a56-4f2b-be45-b8c6c78e4fa3",
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "ls_converted = cast_listitems_to_string(ls_convertable)"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "id": "d4e6dfe2-7cbc-41ea-ac4e-af8e51c33dc5",
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "ls_converted"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "id": "318ca855-a546-4a00-b694-6fdca2ae1bcd",
239 | "metadata": {},
240 | "source": [
241 | "## Lambda Functions "
242 | ]
243 | },
244 | {
245 | "cell_type": "markdown",
246 | "id": "d974ef34-2a6b-4297-8b4b-f16ee634b9f2",
247 | "metadata": {},
248 | "source": [
249 | "A lambda function is a small anonymous function. A lambda function can take any number of arguments, but can only have one expression. It is created using the `lambda` keyword."
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": null,
255 | "id": "b73341b4-ac29-4fe3-b424-f7197d0116ab",
256 | "metadata": {},
257 | "outputs": [],
258 | "source": [
259 | "square = lambda x: x ** 2"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": null,
265 | "id": "f731ca49-31ca-47f8-9846-ed45c8254849",
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "square(2)"
270 | ]
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "id": "2ef41ded-502e-4699-91fb-ed316f69d510",
275 | "metadata": {},
276 | "source": [
277 | "We use lambda to simplify our code, to create temporary definitions, which are used only once. The same can be achieved with a normal definiton:"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "id": "bc234107-fac9-4c5a-987d-7429ace83fe1",
284 | "metadata": {},
285 | "outputs": [],
286 | "source": [
287 | "def square_def(x): \n",
288 | " return x ** 2"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": null,
294 | "id": "c4648bd8-426d-4e96-9557-af69fcd04c56",
295 | "metadata": {},
296 | "outputs": [],
297 | "source": [
298 | "square_def(2)"
299 | ]
300 | },
301 | {
302 | "cell_type": "markdown",
303 | "id": "3542957c-9b5f-4ca0-ba95-7c967e3e5028",
304 | "metadata": {},
305 | "source": [
306 | "You can combine `lambda` functions with *list comprehension*. "
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": null,
312 | "id": "3dafcdf7-9726-41b8-bd94-48e8b5086d65",
313 | "metadata": {},
314 | "outputs": [],
315 | "source": [
316 | "ls_numbers = list(range(10))"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": null,
322 | "id": "a75a8ef2-986a-4cf4-a136-c2858721c524",
323 | "metadata": {},
324 | "outputs": [],
325 | "source": [
326 | "ls_numbers"
327 | ]
328 | },
329 | {
330 | "cell_type": "markdown",
331 | "id": "970486fc-a0e4-40c7-90a1-557e5c4280fb",
332 | "metadata": {},
333 | "source": [
334 | "Let's square all the values from the list and add 1 to each element"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": null,
340 | "id": "668c2e5c-37c4-4c91-bce1-d86d9294bdd3",
341 | "metadata": {},
342 | "outputs": [],
343 | "source": [
344 | "f = lambda x: x**2 + 1\n",
345 | "[f(x) for x in ls_numbers]"
346 | ]
347 | },
348 | {
349 | "cell_type": "markdown",
350 | "id": "b3fb2d74-1ea1-4d88-933b-a3285dc4a6ff",
351 | "metadata": {},
352 | "source": [
353 | "Let's square and add one to each even number in the list"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": null,
359 | "id": "c73d8160-6160-42cb-8cfe-452ddf92500e",
360 | "metadata": {},
361 | "outputs": [],
362 | "source": [
363 | "[f(x) for x in ls_numbers if x%2 == 0 ]"
364 | ]
365 | },
366 | {
367 | "cell_type": "markdown",
368 | "id": "8561c4c4-c82a-40cb-986f-50b92553335f",
369 | "metadata": {},
370 | "source": [
371 | "Square and add one to each even number in the list but return the odd numbers without transformation"
372 | ]
373 | },
374 | {
375 | "cell_type": "code",
376 | "execution_count": null,
377 | "id": "b6952703-dc5c-45b8-b985-16ebe95a27fa",
378 | "metadata": {},
379 | "outputs": [],
380 | "source": [
381 | "[f(x) if x%2 == 0 else x for x in ls_numbers]"
382 | ]
383 | },
384 | {
385 | "cell_type": "markdown",
386 | "id": "0136d6c4-f121-4d6a-a733-6b8ca8313930",
387 | "metadata": {},
388 | "source": [
389 | "You can also handle errors with lambda functions and conditional list comprehension."
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": null,
395 | "id": "bce0f8e9-0975-497e-8faa-5f2aceeb07bd",
396 | "metadata": {},
397 | "outputs": [],
398 | "source": [
399 | "replace_comma = lambda x: x.replace(',', '.')"
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": null,
405 | "id": "576579d5-8402-496f-b97f-4cf2ea1c32fc",
406 | "metadata": {},
407 | "outputs": [],
408 | "source": [
409 | "replace_comma('4,5')"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": null,
415 | "id": "177b5ba9-735b-49a1-8ac7-12808c73dc88",
416 | "metadata": {},
417 | "outputs": [],
418 | "source": [
419 | "ls_mixed_data = [1.2, '1,2', 5, 7, '4,5', 7]"
420 | ]
421 | },
422 | {
423 | "cell_type": "code",
424 | "execution_count": null,
425 | "id": "5205a3a3-f91d-4f47-9923-5d2612d83fc4",
426 | "metadata": {},
427 | "outputs": [],
428 | "source": [
429 | "[replace_comma(x) if isinstance(x, str) else x for x in ls_mixed_data]"
430 | ]
431 | }
432 | ],
433 | "metadata": {
434 | "kernelspec": {
435 | "display_name": "Python 3 (ipykernel)",
436 | "language": "python",
437 | "name": "python3"
438 | },
439 | "language_info": {
440 | "codemirror_mode": {
441 | "name": "ipython",
442 | "version": 3
443 | },
444 | "file_extension": ".py",
445 | "mimetype": "text/x-python",
446 | "name": "python",
447 | "nbconvert_exporter": "python",
448 | "pygments_lexer": "ipython3",
449 | "version": "3.8.10"
450 | }
451 | },
452 | "nbformat": 4,
453 | "nbformat_minor": 5
454 | }
455 |
--------------------------------------------------------------------------------
/lecture09-exception-handling/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 09: Exception Handling
2 |
3 |
4 | ## Motivation
5 |
6 | Our code can, and will, run into errors. Sometimes this is a consequence of incorrect coding, sometimes of improper input data, sometimes of some malfunction of the underlying infrastructure. Programming languages offer tools to handle these exceptions and to transfer control to another component of the codebase. Even basic solutions need to handle errors, so exception handling is also part of the basic toolkit of a data scientist or analyst.
7 |
8 |
9 | ## This lecture
10 |
11 | We introduce `try` and `except`, and offer a few simple examples on how to '_catch_' these errors. By identifying exception types we show how to be selective on the treatment of the various types of errors.
12 |
13 |
14 | ## Learning outcomes
15 |
16 | After completing [exception_handling.ipynb](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture09-exception-handling/exception_handling.ipynb) students should be able to:
17 |
18 | - Test code chunks for potential errors
19 | - Control how to handle these exceptions
20 | - Identify exception (aka error) types
21 | - Select actions based on the type of exception that occurred
22 |
23 |
24 | ## Datasets used
25 |
26 | None.
27 |
28 |
29 | ## Lecture time
30 |
31 | Ideal overall time: **10 mins**.
32 |
33 |
34 | ## Homework
35 |
36 | Define the following function:
37 |
38 | ```python
39 | def divide(a, b):
40 |
41 | return a / b
42 | ```
43 |
44 | The user needs to add both `a` and `b` as user input using the `input()` function.
45 |
46 | ```python
47 | a = input()
48 | b = input()
49 | ```
50 |
51 | Define a complex `try` - `except` block that handles all invalid user input which would otherwise crash the function (one possible structure is sketched after the list below). Make sure your code sends different instructions to the user depending on the nature of the error:
52 |
53 | - when dividing by zero
54 | - when using strings instead of numbers
55 | - in case of any other invalid input.
56 |
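One possible structure is sketched below; how you convert the inputs and word the messages is up to you, and the specific messages here are only placeholders.

```python
def divide(a, b):

    return a / b

a = input()
b = input()

try:
    result = divide(float(a), float(b))
except ZeroDivisionError:
    print('Division by zero. Please give a non-zero value for b!')
except ValueError:
    print('Please enter numbers, not strings or other characters!')
except Exception as e:
    print('Some other problem with the input: ' + str(e))
else:
    print(result)
```
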
57 |
--------------------------------------------------------------------------------
/lecture09-exception-handling/exception_handling.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "3037f2b2-e21d-4ccd-85e1-167998d299e2",
6 | "metadata": {},
7 | "source": [
8 | "# Lecture 9\n",
9 | "\n",
10 | "## Exception Handling\n",
11 | "\n",
12 | "---"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "id": "695bc9bd-629d-4610-b662-b96d870097db",
18 | "metadata": {},
19 | "source": [
20 | "## Exception Handling (Try Except)"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "id": "5f339c7c-535c-4761-86ff-cfc847e155dd",
26 | "metadata": {},
27 | "source": [
28 | "`Exceptions` handle errors in the code. They let you write contructs so that your program falls back to somewhere else if an error blocks the normal run of your code. "
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "id": "c5a70f13-d4a8-4026-bc16-abdbf0476b5b",
34 | "metadata": {},
35 | "source": [
36 | "The `try` block lets you test a block of code for errors.
\n",
37 | "The `except` block lets you handle the error.
\n",
38 | "The `else` block is to be executed if no errors were raised.
\n",
39 | "The `finally` block lets you execute code, regardless of the result of the try- and except blocks.
"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "id": "a280322d-696d-48de-ab6d-df87f30114a3",
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "try:\n",
50 | " print(\"test\")\n",
51 | " # generate an error: the variable test is not defined\n",
52 | " print(test)\n",
53 | " \n",
54 | "except:\n",
55 | " print(\"Caught an exception\")"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "id": "4d31a4e1-50c4-4940-ba85-d4fde4ce5d4d",
61 | "metadata": {},
62 | "source": [
63 | "To get information about the error, we can access the `Exception` class instance that describes the exception by using for example:\n",
64 | "\n",
65 | " except Exception as e:"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "id": "3e108b0d-ca61-4fcf-955b-82885c3d6b74",
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "try:\n",
76 | " print(\"test\")\n",
77 | " # generate an error: the variable test is not defined\n",
78 | " print(test)\n",
79 | " \n",
80 | "except Exception as e:\n",
81 | " print(\"The problem with our code is the following: \" + str(e))"
82 | ]
83 | },
84 | {
85 | "cell_type": "markdown",
86 | "id": "23d8cafd-1c9b-4b08-8d29-8d92b4203d40",
87 | "metadata": {},
88 | "source": [
89 | "
Let's define two functions! "
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "id": "fd868cbc-8766-46ac-95c6-dd4194a29328",
96 | "metadata": {
97 | "tags": []
98 | },
99 | "outputs": [],
100 | "source": [
101 | "def add_two_numbers(a, b):\n",
102 | " return a + b"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "id": "c1b96f43-a511-40d1-9ecf-19eabbcc4f8a",
109 | "metadata": {
110 | "tags": []
111 | },
112 | "outputs": [],
113 | "source": [
114 | "def divide_two_numbers(a, b):\n",
115 | " \n",
116 | " try: \n",
117 | " result = a / b\n",
118 | " \n",
119 | " except Exception as e:\n",
120 | " pass\n",
121 | " \n",
122 | " else:\n",
123 | " return result"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "id": "282448b1-42eb-48d4-8735-bfdccd2c219d",
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "add_two_numbers(3, 5)"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "id": "0b7657c7-6ceb-4ccb-b521-e6a8eac52e5b",
139 | "metadata": {},
140 | "source": [
141 | "If we call our function we run into an error and our script stops running. "
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "id": "65f49b77-dcb7-4239-aee9-9af8388e55be",
148 | "metadata": {},
149 | "outputs": [],
150 | "source": [
151 | "add_two_numbers(3, 'b')"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "id": "57c3dc95-7e6a-4754-80de-8577c6505945",
157 | "metadata": {},
158 | "source": [
159 | "We can handle the error and - for instance - call our user to modify the inputs."
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "id": "f8636462-43cc-4fed-88d1-7410bea183e6",
166 | "metadata": {},
167 | "outputs": [],
168 | "source": [
169 | "try:\n",
170 | " add_two_numbers(3, 'b')\n",
171 | "except Exception as e:\n",
172 | " print('We ran into this error: ' + str(e) + '.', 'Try another input.')"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "id": "b09a9d9a-9fd0-41e7-8ad4-e745bf617ae9",
178 | "metadata": {},
179 | "source": [
180 | "And what happens here? "
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "id": "d68db218-956a-4b6c-94ae-4b7ed13d5723",
187 | "metadata": {},
188 | "outputs": [],
189 | "source": [
190 | "try:\n",
191 | " divide_two_numbers(3, 'b') # This function already handles the error inside thus a string input does not crash the function!\n",
192 | "except Exception as e:\n",
193 | " print('We ran into this error: ' + str(e))\n",
194 | "else:\n",
195 | " print('Everything went fine.')"
196 | ]
197 | },
198 | {
199 | "cell_type": "markdown",
200 | "id": "1d7e201f-9070-4990-8e0c-73b5ff0aa76b",
201 | "metadata": {},
202 | "source": [
203 | "Our `try - except` block did not throw an error, since the function already handled it. Nevertheless, we did not get any result back. \n",
204 | "\n",
205 | "If we decide to handle the exceptions inside the function, but we do want to enter the `except` block in case of an inproprer input, we can `raise` the exception inside the function. This is a useful trick when we handle various exceptions inside the function but we want to throw an error in certain cases only. "
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "id": "e7ffdd17-6af9-4020-9a93-9ce125157f2f",
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "def division(a,b):\n",
216 | " \n",
217 | " try:\n",
218 | " result = a / b\n",
219 | " \n",
220 | " except ZeroDivisionError:\n",
221 | " print('Division by zero. Use a non-zero denominator!')\n",
222 | " \n",
223 | " except Exception as e:\n",
224 | " print('Exited with error: ' + str(e) + '.')\n",
225 | " raise\n",
226 | " \n",
227 | " else: \n",
228 | " return result"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "id": "bcedd700-0dcc-46ea-be3c-8f7ece6fc127",
235 | "metadata": {},
236 | "outputs": [],
237 | "source": [
238 | "# Here the function will not throw an error, only tells the user about the false input. The code would continue running. \n",
239 | "division(30, 0)"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": null,
245 | "id": "eae74039-c304-4201-b9db-bc3f08d3703e",
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "# This is an unhandled error which stops the code running. \n",
250 | "division(30, 'a')"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": null,
256 | "id": "044738df-28ac-4749-87b6-e958f4c4e6a3",
257 | "metadata": {},
258 | "outputs": [],
259 | "source": [
260 | "# In this case we enter the 'else' branch. \n",
261 | "division(30,7)"
262 | ]
263 | },
264 | {
265 | "cell_type": "markdown",
266 | "id": "560e6037-b55b-4050-ba9a-c6b36efd3082",
267 | "metadata": {},
268 | "source": [
269 | "As you see, a `try - except` block can have multiple `except` branches so different errors can be handled in different ways. You can read about Python's various exception types in the documentation of [built-in exceptions](https://docs.python.org/3.8/library/exceptions.html). "
270 | ]
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "id": "ac41476b-9701-4d4e-a4fe-9c01fec46bca",
275 | "metadata": {},
276 | "source": [
277 | "Note: we used to following code for printing the exception itself:\n",
278 | "```\n",
279 | "print(str(e))\n",
280 | "```\n",
281 | "This is because the `e` is an `Exception` class object, and as such cannot be the input of the `print()` function. The `str()` method calls the *string representation* of this object which then can be printed. "
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": null,
287 | "id": "db2221c8",
288 | "metadata": {},
289 | "outputs": [],
290 | "source": []
291 | }
292 | ],
293 | "metadata": {
294 | "kernelspec": {
295 | "display_name": "Python 3 (ipykernel)",
296 | "language": "python",
297 | "name": "python3"
298 | },
299 | "language_info": {
300 | "codemirror_mode": {
301 | "name": "ipython",
302 | "version": 3
303 | },
304 | "file_extension": ".py",
305 | "mimetype": "text/x-python",
306 | "name": "python",
307 | "nbconvert_exporter": "python",
308 | "pygments_lexer": "ipython3",
309 | "version": "3.8.10"
310 | }
311 | },
312 | "nbformat": 4,
313 | "nbformat_minor": 5
314 | }
315 |
--------------------------------------------------------------------------------
/lecture10-intro-to-regression/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 10: Introduction to regression
2 |
3 | ## Motivation
4 |
5 | You want to identify hotels in a city that are good deals: underpriced for their location and quality. You have scraped the web for data on all hotels in the city, and you have cleaned the data. You have carried out exploratory data analysis that revealed that hotels closer to the city center tend to be more expensive, but there is a lot of variation in prices between hotels at the same distance. How should you identify hotels that are underpriced relative to their distance to the city center? In particular, how should you capture the average price–distance relationship that would provide you a benchmark, to which you can compare actual prices to find good deals?
6 |
7 | The analysis of hotel prices and distance to the city center reveals that hotels further away from the center are less expensive by a certain amount, on average. Can you use this result to estimate how much more revenue a hotel developer could expect if it were to build a hotel closer to the center rather than farther away? Regression is a model for the conditional mean: the mean of y for different values of one or more x variables. Regression is used to uncover patterns of association. That, in turn, is used in the causal analysis, to uncover the effect of x on y, and in predictions, to arrive at a good guess of what the value of y is if we don’t know it, but we know the value of x.
8 |
9 | In this lecture, we introduce simple non-parametric regression and simple linear regression, and we show how to visualize their results. We then discuss simple linear regression in detail. We introduce the regression equation, how its coefficients are uncovered (estimated) in actual data, and we emphasize how to interpret the coefficients. We introduce the concepts of predicted value and residual and goodness of fit, and we discuss the relationship between regression and correlation.
10 |
11 | ## This lecture
12 |
13 | This lecture introduces regressions via [hotels-vienna dataset](https://gabors-data-analysis.com/datasets/#hotels-vienna). It overviews models based on simple binary means, binscatters, lowess nonparametric regression, and introduces simple linear regression techniques. The lecture illustrates the use of predicted values and regression residuals with linear regression, but as homework, the same exercise is repeated with a binscatter-based model.
14 |
15 | This lecture is based on [Chapter 07, A: *Finding a good deal among hotels with simple regression*](https://gabors-data-analysis.com/casestudies/#ch07a-finding-a-good-deal-among-hotels-with-simple-regression)
16 |
17 | ## Learning outcomes
18 | After successfully completing [`intro_to_regression.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture10-intro-to-regression/intro_to_regression.ipynb) students should be able to:
19 |
20 | - Binary means:
21 | - Calculate prediction based on means of two categories and create an annotated graph
22 | - Binscatter:
23 | - Create means based on differently defined bins for the X variable
24 |     - Show two different graphs: simple mean predictions for each bin as a dot, and a scatter with step functions
25 | - Lowess nonparametric regression:
26 | - How to create a lowess (loess) graph
27 | - What is an output of a loess model? What are the main advantages and disadvantages?
28 | - Simple linear regression
29 | - How to create a simple linear regression line in a scatterplot
30 |     - `statsmodels` package: estimate two models with and without heteroskedasticity-robust SE and compare the two models
31 | - How to get predicted values and errors of predictions
32 | - Get the best and worst deals: identify hotels with the smallest/largest errors
33 | - Visualize the errors via histogram and scatter plot with annotating the best and worst 5 deals.
34 |
35 | ## Dataset used
36 |
37 | - [hotels-vienna](https://gabors-data-analysis.com/datasets/#hotels-vienna)
38 |
39 | ## Lecture Time
40 |
41 | Ideal overall time: **60 mins**.
42 |
43 | Going through [`intro_to_regression.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture10-intro-to-regression/intro_to_regression.ipynb) takes around *45-50 minutes*; the rest is for the tasks.
44 |
45 |
46 | ## Homework
47 |
48 | *Type*: quick practice, approx 15 mins
49 |
50 | Use the binscatter model with 7 bins and save the predicted values and errors (true price minus the predicted value). Find the best and worst 10 deals and visualize with a scatterplot, highlighting the under/overpriced hotels with these best/worst deals according to this model. Compare to the simple linear regression. Which model would you use? Argue!
51 |
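A minimal sketch of one way to build such a binscatter-based prediction is shown below. The dataframe here is a synthetic stand-in for the hotels-vienna data (`price` and `distance` are the assumed column names), and it uses bins with an equal number of observations (`pd.qcut`); `pd.cut` would give equal-width bins instead.

```python
import numpy as np
import pandas as pd

# Synthetic stand-in for the hotels-vienna data (assumed columns: price, distance)
rng = np.random.default_rng(0)
df = pd.DataFrame({"distance": rng.uniform(0, 10, 200)})
df["price"] = 200 - 10 * df["distance"] + rng.normal(0, 30, 200)

df["dist_bin"] = pd.qcut(df["distance"], q=7)  # 7 bins, equal number of observations
df["pred_binscatter"] = df.groupby("dist_bin")["price"].transform("mean")
df["error"] = df["price"] - df["pred_binscatter"]

best10 = df.nsmallest(10, "error")   # most underpriced relative to the bin prediction
worst10 = df.nlargest(10, "error")
print(best10[["distance", "price", "pred_binscatter", "error"]])
```
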
52 |
53 | ## Further material
54 |
55 | - More materials on the case study can be found in Gabor's *da_case_studies* repository: [ch07-hotels-simple-reg](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch07-hotels-simple-reg)
56 | - Arthur Turrell's Coding for Economics classes: [Regression](https://aeturrell.github.io/coding-for-economists/econmt-regression.html)
--------------------------------------------------------------------------------
/lecture11-feature-engineering/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 11: Feature Engineering
2 |
3 | ## Motivation
4 |
5 | Feature engineering is the step in which we take the variables as recorded in the raw data and create (transform) the y and x variables that we'll include in the model. In general, x variables are called features (in predictive analytics), whereas y is often called the label (a term coming from categorization tasks).
6 |
7 | Specifying the functional form of the variables is a difficult aspect of feature engineering. That includes capturing nonlinear relationships with quantitative variables (quadratic, higher order polynomial, piecewise linear spline, etc), deciding on the number of categories for qualitative variables (joining
8 | rare categories into fewer ones), and deciding on interactions. The emphasis is on getting the best fit without overfitting the data. Domain knowledge is important: knowledge from previous analyses, and/or theory, about what tends to make the outcome different. Domain knowledge can help answer what variables are likely to be more important versus less important, what interactions are likely important, and where should we be most worried about nonlinearity. For instance, professional weather forecasts use computational models that use the laws of physics to relate many variables and feed in measured values of those variables from data. Or, many central banks complement purely data-driven inflation forecasts with predictions from general equilibrium models that are simplified representations of how the economy works. The other source of information is the data itself. Exploratory data analysis (EDA) is a key part of all predictive analytics. We do EDA to make sure we understand the content of each variable, to make sure they are as clean as possible, and to understand their distribution. Besides exploring the variables in themselves, we need to investigate the patterns of associations with the y variable. In addition, we may look at how the x variables are correlated with each other, to make sure that we don’t include variables together that are extremely closely related to each other (e.g., that have a correlation coefficient of 0.95) unless we have a very good reason (usually theoretical) to do so.
9 |
10 | This work is tedious and time-consuming. Some of it is unavoidable. We need to know our data: we should never build models with x variables whose content we don’t understand. That’s because we cannot assess, or even think about, the stability of the patterns of association between y and x if we don’t know what those variables are, what their content is, and how they are measured. And assessing stability is necessary for assessing external validity, which is a key aspect of a good model. Thus, we can play around with data and estimate models without knowing what’s in them, but that won’t necessarily help with the true goal of our analysis.
11 |
12 | ## This lecture
13 |
14 | This lecture introduces feature engineering practices and focuses on simple methods used in [Gabor's book](https://gabors-data-analysis.com/) and its [case studies](https://github.com/gabors-data-analysis/da_case_studies). It utilizes the [wms-management-survey](https://gabors-data-analysis.com/datasets/#wms-management-survey) dataset to show how to combine (multiple) variables into a new one and the [bisnode-firms](https://gabors-data-analysis.com/datasets/#bisnode-firms) dataset to show more elaborate techniques such as imputing, nonlinear transformations, and winsorizing.
15 |
16 | The lecture (partially) uses the following case studies:
17 | - [Chapter 01, C: Management quality: data collection](https://gabors-data-analysis.com/casestudies/#ch01c-management-quality-data-collection)
18 | - [Chapter 04, A: Management quality and firm size: describing patterns of association](https://gabors-data-analysis.com/casestudies/#ch04a-management-quality-and-firm-size-describing-patterns-of-association)
19 | - [Chapter 08, C: Measurement error in hotel ratings](https://gabors-data-analysis.com/casestudies/#ch08c-measurement-error-in-hotel-ratings) as homework
20 | - [Chapter 17, A: Predicting firm exit: probability and classification](https://gabors-data-analysis.com/casestudies/#ch17a-predicting-firm-exit-probability-and-classification)
21 |
22 | *Note: this is an introduction to feature engineering, emphasizing which kinds of (basic) transformations of the variables are necessary. The literature, however, often treats feature engineering as a complex, usually machine-learning-based method for creating new variables, with main applications such as converting texts, pictures, videos, web-page content, etc. into data-analysis-ready variables.*
23 |
24 |
25 | ## Learning outcomes
26 | After successfully completing [`01_feature_engineering_wms.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture11-feature-engineering/01_feature_engineering_wms.ipynb) and [`02_feature_engineering_bisnode.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture11-feature-engineering/02_feature_engineering_bisnode.ipynb), students should be able to:
27 |
28 | - How to create a new variable from multiple already existing variables by calculating the mean or the sum
29 | - Create groups of a categorical variable
30 | - `pycountry_convert` package to get continents and regions
31 | - Create an ordered factor variable
32 | - convert an integer-valued variable to an ordered factor variable
33 | - `cut` to convert a continuous variable into an ordered factor variable
34 | - Create dummy variables from a factor variable with `get_dummies`
35 | - Extra: introduction to principal component analysis with `sklearn.decomposition`'s `PCA`
36 | - Imputing values
37 | - replacing with mean or median
38 | - using outside knowledge (or other variables)
39 | - creating a categorical variable with a specific value for missing
40 | - Adjusting log transformation (to avoid log(0))
41 | - Using `shift` functions
42 | - Numeric vs factor representation with visualization
43 | - Random sampling with panel data for (faster) visualization
44 | - Winsorizing
45 |
46 | ## Datasets used
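As a rough illustration, the minimal sketch below shows a few of the listed transformations (`cut`, `get_dummies`, an adjusted log, and a simple winsorization); the sales series and the cut-off values are made up for illustration only.

```python
import numpy as np
import pandas as pd

# Made-up firm-level variable for illustration
sales = pd.Series([3.0, 12.0, 45.0, 150.0, 900.0, 12000.0], name="sales")

# Continuous variable -> ordered factor with `cut`
size_cat = pd.cut(sales, bins=[0, 10, 100, np.inf], labels=["small", "medium", "large"])

# Factor -> dummy variables
dummies = pd.get_dummies(size_cat, prefix="size")

# Adjusted log transformation to avoid log(0)
ln_sales = np.log(sales + 1)

# Simple winsorizing at the 95th percentile
sales_w = sales.clip(upper=sales.quantile(0.95))

print(pd.concat([sales, size_cat.rename("size"), ln_sales.rename("ln_sales"), sales_w.rename("sales_w")], axis=1))
print(dummies)
```
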
47 |
48 | - [wms-management-survey](https://gabors-data-analysis.com/datasets/#wms-management-survey)
49 | - [bisnode-firms](https://gabors-data-analysis.com/datasets/#bisnode-firms)
50 | - [hotels-vienna](https://gabors-data-analysis.com/datasets/#hotels-vienna) as homework.
51 |
52 | ## Lecture Time
53 |
54 | Ideal overall time: **30-50 mins**.
55 |
56 | This lecture is a collection of basic feature engineering techniques used throughout [this Python course](https://github.com/gabors-data-analysis/da-coding-python), [Gabor's book](https://gabors-data-analysis.com/) and its [case studies](https://github.com/gabors-data-analysis/da_case_studies). It can be skipped and one can spend more time in each lecture on the transformations/engineering. However, it is highly useful to see almost all the transformations in one place.
57 |
58 | ## Homework
59 |
60 | *Type*: quick practice, approx 15 mins
61 |
62 | This homework should make students think about other issues with variables, namely measurement error in the explanatory variable.
63 |
64 | Use [hotels-vienna](https://gabors-data-analysis.com/datasets/#hotels-vienna) data from [OSF](https://osf.io/y6jvb/).
65 |
66 | - Filter observations to Hotels with 3-4 stars in Vienna (`city_actual`) and with prices less than 600$
67 | - Create a new variable: log-price
68 | - Create three sub-samples, where `rating_count` is:
69 | - less than 100
70 | - between 100 and 200
71 | - more than 200
72 | - Run simple linear regressions: `log-price ~ rating` on all of the above-mentioned samples (see the sketch after this list)
73 | - Plot the three predicted log prices on one plot, with proper formatting and legends
74 | - Argue briefly why the slopes are different.
75 |
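A rough sketch of the filtering and subsample regressions is given below; it assumes the hotels-vienna dataframe is loaded as `df` with `stars`, `city_actual`, `price`, `rating`, and `rating_count` columns, and how you treat the boundary values of `rating_count` is up to you.

```python
import numpy as np
import statsmodels.formula.api as smf

# Assumes `df` is the hotels-vienna dataframe loaded from the OSF file above
df = df.loc[
    df["stars"].between(3, 4)
    & (df["city_actual"] == "Vienna")
    & (df["price"] < 600)
].copy()
df["lnprice"] = np.log(df["price"])

# Subsamples by rating_count (boundary handling is a choice you should make explicit)
subsamples = {
    "low": df[df["rating_count"] < 100],
    "mid": df[df["rating_count"].between(100, 200)],
    "high": df[df["rating_count"] > 200],
}
for name, sub in subsamples.items():
    res = smf.ols("lnprice ~ rating", data=sub).fit()
    print(name, round(res.params["rating"], 3))
```
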
76 |
77 | ## Further material
78 | - More materials on the **World-Management Survey case study** can be found in Gabor's *da_case_studies* repository: [ch04-management-firm-size](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch04-management-firm-size)
79 | - More materials on the **Predicting firm exit case study** can be found in Gabor's *da_case_studies* repository: [ch17-predicting-firm-exit](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch17-predicting-firm-exit), especially in the [data preparation file](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch17-predicting-firm-exit/ch17-firm-exit-data-prep.R)
80 |
--------------------------------------------------------------------------------
/lecture12-simple-linear-regression/00_life_exp_get_data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "0d78786d",
6 | "metadata": {},
7 | "source": [
8 | "# Lecture 12\n",
9 | "\n",
10 | "## Getting the data for analysis\n",
11 | " - practice with WDI package \n",
12 | " \n",
13 | "#### Case Study: \n",
14 | " - Life-expectancy and income \n",
15 | "---"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": null,
21 | "id": "3851750c",
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "import pandas as pd\n",
26 | "import numpy as np\n",
27 | "from datetime import date\n",
28 | "import warnings\n",
29 | "\n",
30 | "warnings.filterwarnings(\"ignore\")"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "id": "6f957733",
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "import wbdata"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "id": "209c0911",
46 | "metadata": {},
47 | "source": [
48 | "Reminder on how WDI works - it is an API\\\n",
49 | "Search for variables which contains GDP"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "id": "5c9e7543",
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "wbdata.search_indicators(\"gdp\")"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "id": "b2ea0e3c",
65 | "metadata": {},
66 | "source": [
67 | "Narrow down the search for: GDP + something + capita + something + constant"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "id": "61e7255c",
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "wbdata.search_indicators(\"gdp.*capita.*constant\")"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "id": "784147fc",
83 | "metadata": {},
84 | "source": [
85 | "Get GDP data"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "id": "a0c077b0",
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "gdp_data = wbdata.get_data(\n",
96 | " indicator=\"NY.GDP.PCAP.PP.KD\", country=\"all\", data_date=date(2019, 1, 1)\n",
97 | ")\n",
98 | "gdp_data"
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "id": "3c2fd41c",
104 | "metadata": {},
105 | "source": [
106 | "### Task: \n",
107 | "\n",
108 | "Get the GDP data, along with `population, total` and `life expectancy at birth`\n",
109 | "for the year 2019, and save it to your data folder!\\\n",
110 | "Note: we have already pushed the result to GitHub and will use that version later, just so everyone is on the same page!"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "id": "7b8bc4c3",
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "wbdata.search_indicators('population, total')"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "id": "bd5a71ac",
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "wbdata.search_indicators(\"life expectancy at birth\")"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "id": "4ff5383f",
137 | "metadata": {},
138 | "outputs": [],
139 | "source": [
140 | "data_raw = wbdata.get_dataframe(\n",
141 | " indicators={\n",
142 | " \"NY.GDP.PCAP.PP.KD\": \"gdppc\",\n",
143 | " \"SP.DYN.LE00.IN\": \"lifeexp\",\n",
144 | " \"SP.POP.TOTL\": \"population\",\n",
145 | " },\n",
146 | " country=\"all\",\n",
147 | " data_date=date(2019, 1, 1),\n",
148 | ").reset_index()\n",
149 | "\n",
150 | "ISOcodes = pd.DataFrame(wbdata.get_country())[[\"iso2Code\", \"name\"]]"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "id": "5aff6a41",
157 | "metadata": {},
158 | "outputs": [],
159 | "source": [
160 | "data_raw = (\n",
161 | " data_raw.reset_index(drop=True)\n",
162 | " .merge(ISOcodes, left_on=\"country\", right_on=\"name\", how=\"left\")\n",
163 | " .drop(\"name\", axis=1)\n",
164 | ")"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": null,
170 | "id": "4453a51a",
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "data_raw.to_csv(\"data/WDI_lifeexp_raw.csv\", index = False)"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "id": "b3dddfba",
181 | "metadata": {},
182 | "outputs": [],
183 | "source": []
184 | }
185 | ],
186 | "metadata": {
187 | "kernelspec": {
188 | "display_name": "Python 3 (ipykernel)",
189 | "language": "python",
190 | "name": "python3"
191 | },
192 | "language_info": {
193 | "codemirror_mode": {
194 | "name": "ipython",
195 | "version": 3
196 | },
197 | "file_extension": ".py",
198 | "mimetype": "text/x-python",
199 | "name": "python",
200 | "nbconvert_exporter": "python",
201 | "pygments_lexer": "ipython3",
202 | "version": "3.8.10"
203 | }
204 | },
205 | "nbformat": 4,
206 | "nbformat_minor": 5
207 | }
208 |
--------------------------------------------------------------------------------
/lecture12-simple-linear-regression/01_life_exp_clean.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "c7da3efc",
6 | "metadata": {},
7 | "source": [
8 | "# Lecture 12 #\n",
9 | " \n",
10 | "## Auxiliary file to clean data \n",
11 | " - can practice, but not mandatory \n",
12 | " \n",
13 | "#### Case Study: \n",
14 | "- life-expectancy and income \n",
15 | "---"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": null,
21 | "id": "469640e2",
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "import pandas as pd\n",
26 | "import numpy as np\n",
27 | "import warnings\n",
28 | "\n",
29 | "warnings.filterwarnings(\"ignore\")"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "id": "0e7ed69e",
35 | "metadata": {},
36 | "source": [
37 | "Load the data from GitHub (URL below) or from the local data folder"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "id": "172b3be5",
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "my_url = \"https://raw.githubusercontent.com/gabors-data-analysis/da-coding-python/main/lecture12-simple-linear-regression/data/WDI_lifeexp_raw.csv\"\n",
48 | "#df = pd.read_csv(my_url)\n",
49 | "df = pd.read_csv(\"data/WDI_lifeexp_raw.csv\")\n",
50 | "df.loc[lambda x: x[\"country\"] == \"Namibia\", \"iso2Code\"] = \"NA\"\n",
51 | "df = df.dropna(subset=[\"iso2Code\"])"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "id": "49f14043",
57 | "metadata": {},
58 | "source": [
59 | "Check the observations:\n",
60 | "\n",
61 | "Aggregate (grouping) observations usually have an iso2Code that contains a number"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "id": "3cc311ab",
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "d1 = df.loc[~df[\"iso2Code\"].str.isalpha()]\n",
72 | "d1"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "id": "93901b50",
78 | "metadata": {},
79 | "source": [
80 | "Filter these out"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "id": "ad8aba60",
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "df = df.loc[df[\"iso2Code\"].str.isalpha()]"
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "id": "bf36bf49",
96 | "metadata": {},
97 | "source": [
98 | "Some grouping observations are still there; check each of them:\\\n",
99 | "HK - Hong Kong, China; EU - European Union\\\n",
100 | "OE - OECD members\\\n",
101 | "all codes starting with X, except XK, which is Kosovo\\\n",
102 | "all codes starting with Z, except ZA (South Africa), ZM (Zambia) and ZW (Zimbabwe)"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "id": "c8295167",
108 | "metadata": {},
109 | "source": [
110 | "1st: drop specific values"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "id": "70e6a09a",
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "df.loc[lambda x: x[\"iso2Code\"].isin([\"EU\",\"HK\",\"OE\"])]"
121 | ]
122 | },
123 | {
124 | "cell_type": "markdown",
125 | "id": "f2a58b7c",
126 | "metadata": {},
127 | "source": [
128 | "Keep the complement (drop these grouping observations)"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "id": "7831ca31",
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "df = df.loc[lambda x: ~x[\"iso2Code\"].isin([\"EU\",\"HK\",\"OE\"])]"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "id": "fdac841f",
144 | "metadata": {},
145 | "source": [
146 | "2nd: drop values that start with certain characters\\\n",
147 | "Check the first letter of iso2Code"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "id": "b919919e",
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "df = df.loc[\n",
158 | " (~((df[\"iso2Code\"].str[0] == \"X\") | (df[\"iso2Code\"].str[0] == \"Z\")))\n",
159 | " | (df[\"iso2Code\"].isin([\"XK\", \"ZA\", \"ZM\", \"ZW\"]))\n",
160 | "]"
161 | ]
162 | },
163 | {
164 | "cell_type": "markdown",
165 | "id": "339c7488",
166 | "metadata": {},
167 | "source": [
168 | "Drop observations with missing gdppc, lifeexp or population values"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "id": "915a69e7",
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "df = df.dropna(subset=['gdppc', 'lifeexp', 'population'])"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "id": "143402d8",
184 | "metadata": {},
185 | "source": [
186 | "### Clean variables"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "id": "4cd7c273",
192 | "metadata": {},
193 | "source": [
194 | "Scale the variables: population in millions, GDP per capita in thousands of dollars\\\n",
195 | "Note: document in the README which year the data refers to (here 2019)!"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "id": "70459d65",
202 | "metadata": {},
203 | "outputs": [],
204 | "source": [
205 | "df[\"population\"] = df[\"population\"] / 10**6\n",
206 | "df[\"gdppc\"] = df[\"gdppc\"] / 10**3"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": null,
212 | "id": "5e3754b9",
213 | "metadata": {},
214 | "outputs": [],
215 | "source": [
216 | "df.filter([ \"population\", \"gdppc\", \"lifeexp\"]).hist()"
217 | ]
218 | },
219 | {
220 | "cell_type": "markdown",
221 | "id": "afd3b4d0",
222 | "metadata": {},
223 | "source": [
224 | "It seems we have some unusually large values for population:"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "id": "fc064c21",
231 | "metadata": {},
232 | "outputs": [],
233 | "source": [
234 | "df.loc[df[\"population\"]>500]"
235 | ]
236 | },
237 | {
238 | "cell_type": "markdown",
239 | "id": "d4a01758",
240 | "metadata": {},
241 | "source": [
242 | "These are China and India... large, but not extreme values, so we keep them"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": null,
248 | "id": "b325c8e0",
249 | "metadata": {},
250 | "outputs": [],
251 | "source": [
252 | "df.describe()"
253 | ]
254 | },
255 | {
256 | "cell_type": "markdown",
257 | "id": "5c027e67",
258 | "metadata": {},
259 | "source": [
260 | "Save the cleaned data file to your data folder"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": null,
266 | "id": "189d5e97",
267 | "metadata": {},
268 | "outputs": [],
269 | "source": [
270 | "df.to_csv(\"data/WDI_lifeexp_clean.csv\",index=False)"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "id": "f8b21d77",
277 | "metadata": {},
278 | "outputs": [],
279 | "source": []
280 | }
281 | ],
282 | "metadata": {
283 | "kernelspec": {
284 | "display_name": "Python 3 (ipykernel)",
285 | "language": "python",
286 | "name": "python3"
287 | },
288 | "language_info": {
289 | "codemirror_mode": {
290 | "name": "ipython",
291 | "version": 3
292 | },
293 | "file_extension": ".py",
294 | "mimetype": "text/x-python",
295 | "name": "python",
296 | "nbconvert_exporter": "python",
297 | "pygments_lexer": "ipython3",
298 | "version": "3.8.10"
299 | }
300 | },
301 | "nbformat": 4,
302 | "nbformat_minor": 5
303 | }
304 |
--------------------------------------------------------------------------------
/lecture12-simple-linear-regression/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 12: Analysis of life expectancy and GDP
2 |
3 | ## Motivation
4 |
5 | Life expectancy at birth shows how long residents of a country live; it is a summary measure of their health. Residents of richer countries tend to live longer, but you want to know the strength of that pattern. You also want to identify countries where people live especially long for the income level of their country, to start thinking about what may cause their exceptional health. You download cross-country data from the World Bank database on life expectancy and GDP per capita, and you want to uncover the pattern of association between them. How would you do that in a way that accommodates potentially nonlinear patterns and, at the same time, produces results that you can interpret?
6 |
7 | Linear regression gives a meaningful approximation to the patterns of association, but real-life data can be messy, and the patterns may be nonlinear. What those mean for regression analysis and what we can do about them is important to understand. There are several tools that we can apply to make linear regression approximate nonlinear patterns of association, but whether we want to do so depends on the goal of the analysis. The fact that real-life data tends to be messy, with errors and extreme values, poses other challenges for regression analysis.
8 |
9 | ## This lecture
10 |
11 | This lecture provides materials to analyze the association between life expectancy and GDP measures for various countries in 2019 (or later), inspired by the dataset [worldbank-lifeexpectancy](https://gabors-data-analysis.com/datasets/#worldbank-lifeexpectancy). During this exercise, students get familiar with creating simple linear regression-based models with different transformations, such as level-level, log-level, level-log, and log-log models, or using polynomials and piecewise linear splines transformation of the explanatory variable.
12 |
13 | This lecture is a practice (live-coding-style) lecture: it does not teach much new material, but helps students deepen their understanding of simple regressions and the reasoning behind them.
14 |
15 | This lecture is based on [Chapter 08, B: How is life expectancy related to the average income of a country?](https://gabors-data-analysis.com/casestudies/#ch08b-how-is-life-expectancy-related-to-the-average-income-of-a-country)
16 |
17 | ## Learning outcomes
18 | After successfully completing the notebooks, students should have:
19 |
20 | [`00_life_exp_get_data.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture12-simple-linear-regression/00_life_exp_get_data.ipynb)
21 | - Solid ground for importing and exporting data from World Bank's website via API.
22 |
23 | [`02_life_exp_analysis.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture12-simple-linear-regression/02_life_exp_analysis.ipynb)
24 | - Create scatter plots for competing models.
25 | - Transform variables from level to log in a ggplot and scale the axis for proper interpretation.
26 | - Run and plot multiple single-variable regressions with:
27 | - log transformation,
28 | - higher-order polynomial,
29 | - piecewise linear spline
30 | - or using weighted OLS.
31 | - Be able to estimate heteroskedasticity-robust SEs and compare specific model results with `stargazer` in one output.
32 | - Create a graph, which automatically annotates observations with the *n* largest and smallest errors.
33 |
34 |
35 | ## Datasets used
36 |
37 | - [worldbank-lifeexpectancy](https://gabors-data-analysis.com/datasets/#worldbank-lifeexpectancy), but for a more recent year.
38 |
39 | ## Lecture Time
40 |
41 | Ideal overall time: approx 60 minutes.
42 |
43 | Solving [`00_life_exp_get_data.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture12-simple-linear-regression/00_life_exp_get_data.ipynb) takes around *5-10 minutes* as it builds on [lecture03-data-IO](https://github.com/gabors-data-analysis/da-coding-python/tree/main/lecture03-data-IO). In principle, it should be a quick reminder and practice.
44 |
45 | Solving [`02_life_exp_analysis.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture12-simple-linear-regression/02_life_exp_analysis.ipynb) covers the main material and takes *40-60 minutes* depending on the student's background. This lecture is mainly theory-based (practiced via the case study) and introduces many new, but easy, commands in a repetitive way.
46 |
47 | ## Homework
48 |
49 | *Type*: quick practice, approx 20 mins
50 |
51 | Use the [hotels-vienna dataset](https://gabors-data-analysis.com/datasets/#hotels-vienna), similarly as we used in [`hotels_intro_to_regression.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture10-intro-to-regression/intro_to_regression.ipynb). Create and compare different models, based on transformations of `y=price` or `x=distance` variables:
52 |
53 | - level-level
54 | - log-level
55 | - level-log
56 | - log-log
57 | - polynomials of distance with square and cube terms
58 | - piecewise-linear-spline model, with a cutoff at 2 miles
59 |
60 | Estimate these models with `statsmodels`, using robust SEs, and compare them with `stargazer`. Decide which model you would use and argue why!
61 |
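A hedged starter sketch for the model comparison, assuming a local `hotels-vienna.csv` with `price` and `distance` columns; the piecewise linear spline specification is omitted here, so add it with the spline helper used in the lecture:

```python
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from stargazer.stargazer import Stargazer

hotels = pd.read_csv("hotels-vienna.csv")
hotels["lnprice"] = np.log(hotels["price"])
hotels["lndistance"] = np.log(hotels["distance"] + 0.05)  # small shift to avoid log(0)

specs = {
    "level-level": "price ~ distance",
    "log-level": "lnprice ~ distance",
    "level-log": "price ~ lndistance",
    "log-log": "lnprice ~ lndistance",
    "cubic": "price ~ distance + I(distance**2) + I(distance**3)",
}

# heteroskedasticity-robust (HC1) standard errors for every specification
models = [smf.ols(formula, data=hotels).fit(cov_type="HC1") for formula in specs.values()]

print(Stargazer(models).render_html())  # or .render_latex()
```

Keep in mind that R-squared values are only comparable across models with the same dependent variable transformation.
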
62 | ## Further material
63 |
64 | - More materials on the case study can be found in Gabor's *da_case_studies* repository: [ch08-life-expectancy-income](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch08-life-expectancy-income)
65 | - Arthur Turrell's Coding for Economics classes: [Regression](https://aeturrell.github.io/coding-for-economists/econmt-regression.html)
--------------------------------------------------------------------------------
/lecture13-advanced-linear-regression/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 13: Advanced Linear Regression
2 |
3 | ## Motivation
4 |
5 | You have analyzed your data on hotel prices in a particular city to find hotels that are underpriced relative to how close they are to the city center. But you have also uncovered differences in terms of other features of the hotels that measure quality and are related to price. How would you use this data to find hotels that are underpriced relative to all of their features? And how can you visualize the distribution of hotel prices relative to what price you would expect for their features in a way that helps identify underpriced hotels?
6 |
7 | After understanding simple linear regression, we can turn to multiple linear regression, which has more than one explanatory variable. Multiple linear regression is the most used method to uncover patterns of associations between variables. There are multiple reasons to include more explanatory variables in a regression. We may be interested in uncovering patterns of association between y and other explanatory variables, which may help us understand differences in terms of the x variable we are interested in most. Or, we may be interested in the effect of an x variable, but we want to compare observations that are different in x but similar in other variables. Finally, we may want to predict y, and we want to use more x variables to arrive at better predictions.
8 |
9 | We discuss why and when we should estimate multiple regression, how to interpret its coefficients, and how to construct and interpret confidence intervals and test the coefficients. We discuss the relationship between multiple regression and simple regression. We explain that piecewise linear splines and polynomial regressions are technically multiple linear regressions without the same interpretation of the coefficients. We include an informal discussion on how to decide what explanatory variables to include and in what functional form.
10 |
11 | Finally, we want to generalize the results of a regression from the data we are analyzing to a decision situation we care about. We can use methods to quantify the uncertainty brought about by generalizing to the general pattern represented by the data (statistical inference), and we can have guidelines to assess whether the general pattern represented by the data is likely close to the general pattern behind the situation we care about (external validity).
12 |
13 | ## This lecture
14 |
15 | This lecture introduces multiple variable regressions via [hotels-europe](https://gabors-data-analysis.com/datasets/#hotels-europe) dataset. It introduces topics on
16 |
17 | - how to choose a model from many possible candidates based on R2,
18 | - how to evaluate predictions with multiple regressors:
19 | - different graphs to assess prediction uncertainty, and
20 | - how to calculate confidence and prediction intervals.
21 |
22 | Moreover, it covers external validity via robustness tests: checking model results for different time periods, locations, and types of observations. Finally, as an extra part, it shows a simple example of using a training and a test sample to better understand the process of model choice and the limitations of R2.
23 |
24 | This lecture is based on
25 | - [Chapter 09, B: How stable is the hotel price–distance to the center relationship?](https://gabors-data-analysis.com/casestudies/#ch09b-how-stable-is-the-hotel-pricedistance-to-center-relationship)
26 | - [Chapter 10, B: Finding a good deal among hotels with multiple regression](https://gabors-data-analysis.com/casestudies/#ch10b-finding-a-good-deal-among-hotels-with-multiple-regression)
27 |
28 |
29 | ## Learning outcomes
30 | After successfully completing [`hotels_advanced_regression.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture13-advanced-linear-regression/hotels_advanced_regression.ipynb), students should be able to:
31 |
32 | - Visualize multiple explanatory variables with the outcome:
33 | - With a scatter plot, decide on the functional form that is needed.
34 | - Multiple linear regression
35 | - Use `statsmodels` to estimate regressions with multiple explanatory variables
36 | - Use `stargazer` to compare multiple candidate models and report model statistics such as R2 to evaluate models.
37 | - Analysing model prediction
38 | - Get model predictions and residuals and, in the case of a log-transformed outcome, convert the predictions and residuals back to level.
39 | - y-yhat scatter plot with 45-degree line to evaluate prediction uncertainty
40 | - residual-yhat or residual-explanatory variable scatter plot to evaluate model performance along different dimensions
41 | - Confidence and Prediction interval
42 | - Using `get_prediction` function to get confidence and prediction interval
43 | - Set the significance level for the intervals with `alpha` input argument
44 | - Convert log-transformed outcome confidence and/or prediction intervals into level. Limitations.
45 | - External Validity: robustness checks
46 | - Estimate a selected model with different data to assess model uncertainty
47 | - Using different time periods, locations, and types of hotels/apartments.
48 | - Compare these models to the original and evaluate external validity
49 | - Extra:
50 | - Split the original sample into training and test samples
51 | - Use the training sample to estimate the model and the test sample to predict hotel prices
52 | - Evaluate which model performs better with RMSE measure.
53 |
54 | ## Dataset used
55 |
56 | - [hotels-europe](https://gabors-data-analysis.com/datasets/#hotels-europe)
57 |
58 | ## Lecture Time
59 |
60 | Ideal overall time: **100 mins**.
61 |
62 | Going through [`hotels_advanced_regression.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture13-advanced-linear-regression/hotels_advanced_regression.ipynb) takes around *70-80 minutes*. There are many discussions and interpretations of the models, which are similarly important. Solving the tasks takes the remaining *20-30 minutes*.
63 |
64 |
65 | ## Homework
66 |
67 | *Type*: quick practice, approx 20 mins
68 |
69 | Choose a city other than Vienna and make sure you have **at least 100 observations after filtering**. Create at least 3 models with at least 3 explanatory variables (check for needed transformations) and choose the best one. Imagine you can build a new hotel in your city and can specify its feature values as you wish. Predict the price and estimate confidence and prediction intervals at the 90% confidence level. Set the price of your hotel and argue why that is your choice. A starter sketch of the interval step follows below.
70 |
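A hedged sketch of the confidence/prediction-interval step, assuming a filtered city extract with `price`, `distance`, `stars`, and `rating` columns; the file name and the new hotel's feature values are placeholders:

```python
import pandas as pd
import statsmodels.formula.api as smf

city = pd.read_csv("hotels_city.csv")  # hypothetical filtered extract with >= 100 rows

model = smf.ols("price ~ distance + stars + rating", data=city).fit(cov_type="HC1")

# feature values of the imagined new hotel (placeholders)
new_hotel = pd.DataFrame({"distance": [1.0], "stars": [4.0], "rating": [4.5]})

pred = model.get_prediction(new_hotel)
# alpha=0.1 gives 90% confidence and prediction intervals
intervals = pred.summary_frame(alpha=0.1)
print(intervals[["mean", "mean_ci_lower", "mean_ci_upper", "obs_ci_lower", "obs_ci_upper"]])
```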
71 |
72 | ## Further material
73 |
74 | - More materials on the case study can be found in Gabor's *da_case_studies* repository: [ch10-hotels-multiple-reg](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch10-hotels-multiple-reg) on multiple regressions and [ch09-hotels-europe-stability](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch09-hotels-europe-stability) discusses external validity.
75 | - Arthur Turrell's Coding for Economics classes: [Regression](https://aeturrell.github.io/coding-for-economists/econmt-regression.html)
--------------------------------------------------------------------------------
/lecture14-binary-models/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 14: Binary outcome - modeling probabilities
2 |
3 | ## Motivation
4 |
5 | Does smoking make you sick? And can smoking make you sick in late middle age even if you stopped years earlier? You have data on many healthy people in their fifties from various countries, and you know whether they stayed healthy four years later. You have variables on their smoking habits, age, income, and many other characteristics. How can you use this data to estimate how much more likely non-smokers are to stay healthy? How can you uncover if that depends on whether they never smoked or are former smokers? And how can you tell if that association is the result of smoking itself or, instead, underlying differences in smoking by education, income, and other factors?
6 |
7 | The lecture is related to the chapter that discusses probability models: regressions with binary y variables. In a sense, we can treat a binary y variable just like any other variable and use regression analysis as we would otherwise. But with a binary y variable, we can also estimate nonlinear probability models instead of linear ones. Data analysts need to have a good understanding of when to use these different probability models, and how to interpret and evaluate their results.
8 |
9 | ## This lecture
10 |
11 | This lecture introduces binary outcome models with an analysis of health outcomes with multiple variables based on the [share-health](https://gabors-data-analysis.com/datasets/#share-health) dataset. First, we introduce saturated models (smoking on health) and linear probability models with multiple explanatory variables. We check the predicted outcome probabilities for certain groups. Then we focus on non-linear binary models: the logit and probit models. We estimate marginal effects to interpret the average (marginal) effects of variables on the outcome probabilities. We overview goodness-of-fit statistics (R2, Pseudo-R2, Brier score, and log-loss) along with visual and descriptive inspection of the predicted probabilities. Finally, we calculate the estimated bias and the calibration curve to understand model performance better.
12 |
13 | This lecture is based on [Chapter 11, A: Does smoking pose a health risk?](https://gabors-data-analysis.com/casestudies/#ch11a-does-smoking-pose-a-health-risk)
14 |
15 | ## Learning outcomes
16 | After successfully completing codes in [`binary_models.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture14-binary-models/binary_models.ipynb), students should be able to:
17 |
18 |
19 | - Calculate by hand or estimate saturated models
20 | - Visualize and understand binary outcome scatter-plots
21 | - Estimate Linear Probability Models (LPM)
22 | - Use `statsmodels` to estimate regressions with multiple explanatory variables
23 | - Use `stargazer` to compare multiple candidate models and report model statistics such as R2 to evaluate models.
24 | - Understand the limitations of LPM
25 | - Carry out sub-group analysis based on predicted probabilities
26 | - Estimate Non-Linear Probability Models
27 | - Use `statsmodels` to estimate logit or probit models
28 | - Estimate marginal effects with the `get_margeff` method
29 | - Use `statsmodels` to compare logit and probit coefficients
30 | - Compare LPM, logit/probit and logit/probit with marginal effects
31 | - Get relevant goodness-of-fit measures
32 | - Understand the usefulness of comparing the distribution of predicted probabilities for different models
33 | - Understand the usefulness of comparing descriptive statistics of the predicted probabilities for different models
34 | - Calculate the bias of the model along with the calibration curve
35 |
36 | ## Datasets used
37 |
38 | - [share-health](https://gabors-data-analysis.com/datasets/#share-health)
39 |
40 | ## Lecture Time
41 |
42 | Ideal overall time: **100 mins**.
43 |
44 | Going through [`binary_models.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture14-binary-models/binary_models.ipynb) takes around *80-90 minutes* as there are many discussions and interpretations of the models. Solving the tasks takes the remaining *10-20 minutes*.
45 |
46 |
47 | ## Homework
48 |
49 | *Type*: quick practice, approx 20 mins
50 |
51 | Use the same [share-health](https://gabors-data-analysis.com/datasets/#share-health) dataset, but now use `smoking` as your outcome variable: the task is to predict whether a person is a smoker. Use similar explanatory variables (excluding `stayshealthy`) to explain `smoking`. Run an LPM, a logit, and a probit model. Compare the coefficients of these models along with the average marginal effects. Compute the goodness-of-fit statistics (R2, Pseudo-R2, Brier score, log-loss) for all of the models. Choose one, calculate its bias, and plot the calibration curve. A starter sketch follows below.
52 |
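A hedged starter sketch; the right-hand-side variables and the file name below are placeholders, so replace them with the variables used in the lecture notebook:

```python
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

share = pd.read_csv("share-health.csv")  # hypothetical local copy

rhs = "age + eduyears + income10"  # placeholder explanatory variables
lpm = smf.ols(f"smoking ~ {rhs}", data=share).fit(cov_type="HC1")
logit = smf.logit(f"smoking ~ {rhs}", data=share).fit()
probit = smf.probit(f"smoking ~ {rhs}", data=share).fit()

# average marginal effects for the nonlinear models
print(logit.get_margeff().summary())
print(probit.get_margeff().summary())

# Brier score: mean squared difference between predicted probability and the outcome
for name, model in {"LPM": lpm, "logit": logit, "probit": probit}.items():
    p = model.predict()    # in-sample predicted probabilities (fitted values for the LPM)
    y = model.model.endog  # the outcome values actually used in estimation
    print(name, "Brier score:", round(float(np.mean((p - y) ** 2)), 4))
```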
53 |
54 | ## Further material
55 |
56 | - More materials on the case study can be found in Gabor's *da_case_studies* repository: [ch11-smoking-health-risk](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch11-smoking-health-risk)
57 |
58 |
--------------------------------------------------------------------------------
/lecture15-datetime/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 15: Date and time manipulations
2 |
3 | ## Motivation
4 |
5 | Time series data is often used to analyze business, economic, and policy questions. Time series data presents additional opportunities as well as additional challenges for regression analysis. Unlike cross-sectional data, it enables examining how y changes when x changes, and it also allows us to examine what happens to y right away or with a delay. However, variables in time series data come with some special features that affect how we should estimate regressions, and how we can interpret their coefficients.
6 |
7 | One of these differences is the frequency of the time series. It can vary from seconds to years. Time series with more frequent observations have higher frequency, e.g. monthly frequency is higher than yearly frequency, but it is lower than daily frequency. The frequency may also be irregular with gaps in-between. Gaps in time series data can be viewed as missing values of variables. But they tend to have specific causes. To run a regression of y on x in time series data, the two variables need to be at the same time series frequency. When the time series frequencies of y and x are different, we need to adjust one of them. Most often that means aggregating the variable at a higher frequency (e.g., from weekly to monthly). With flow variables, such as sales, aggregation means adding up; with stock variables and other kinds of variables, such as prices, it is often taking an average for the period or taking the last value, such as the closing price.
8 |
9 | Another fundamental feature of time series data is that variables evolve with time. They may hover around a stable average value, or they may drift upwards or downwards. A variable in time series data follows a trend if it tends to change in one direction; in other words, it has a tendency to increase or decrease. Another possible issue is seasonality. Seasonality means that the value of the variable is expected to follow a cyclical pattern, tracking the seasons of the year, days of the week, or hours of the day. Because of such systematic changes, later observations tend to be different from earlier observations. Understanding trends and seasonality is important because they make regression analysis challenging. They are examples of a broader concept, non-stationarity. Stationarity means stability; non-stationarity means the lack of stability. Stationary time series variables have the same expected
10 | value and the same distribution at all times. Trends and seasonality violate stationarity because the expected value is different at different times.
11 |
12 | ## This lecture
13 |
14 | This lecture introduces basic date and time-variable manipulations. The first part starts with the basics of the `datetime` package, overviewing basic time-related functions and manipulations with time-related values and variables. The second part discusses time-series data aggregation across different frequencies, along with visualization for time-series data and unit root tests.
15 |
16 | This lecture utilizes the case study of [Chapter 12, A: Returns on a company stock and market returns](https://gabors-data-analysis.com/casestudies/#ch12a-returns-on-a-company-stock-and-market-returns) as homework, and uses [`stocks-sp500`](https://gabors-data-analysis.com/datasets/#stocks-sp500) dataset.
17 |
18 | ## Learning outcomes
19 | After successfully completing [`01_datetime_basics.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture15-datetime/01_datetime_basics.ipynb) and [`02_datetime_manipulations.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture15-datetime/02_datetime_manipulations.ipynb), students should be:
20 |
21 | - Familiar with the `datetime` package, especially with
22 | - creating specific time variables, converting other types of variables into a date or datetime object
23 | - understand the importance of time zones
24 | - Get specific parts of a date object such as `year, quarter, month, day, hour etc.`
25 | - Understand the difference between duration and periods
26 | - Carry out time aggregation
27 | - Aggregate different time series objects to lower frequencies, using mean/median/max/end date, etc.
28 | - Add lagged and differenced variables to the data
29 | - Visualize time series with
30 | - handle time variable on x-axis with `scale_x_date()`
31 | - `facet_wrap` to stack multiple graphs
32 | - standardize variables and put multiple lines into one graph
33 | - Unit root tests using `arch` package's `PhillipsPerron` function
34 | - understanding the result of the Phillips-Perron test and deciding whether the variable needs to be differenced or not.
35 |
36 | ## Datasets used
37 |
38 | - [`stocks-sp500`](https://gabors-data-analysis.com/datasets/#stocks-sp500)
39 |
40 | ## Lecture Time
41 |
42 | Ideal overall time: **35-40 mins**.
43 |
44 | Going through the notebooks takes around *30 minutes*. There are some discussions and interpretations of the time series (e.g. stationarity). Solving the tasks takes the remaining *5-10 minutes*. The lecture can be shortened by only showing the methods. It will be partially repeated in [lecture16-timeseries-regression](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture16-timeseries-regression).
45 |
46 |
47 | ## Homework
48 |
49 | *Type*: quick practice, approx 10 mins
50 |
51 | Estimate the *beta* coefficient by regressing quarterly Microsoft stock log returns on quarterly S&P 500 log returns. Use the [`stocks-sp500`](https://gabors-data-analysis.com/datasets/#stocks-sp500) dataset. Take care when aggregating the data: use the last day in each quarter, then take logs, then difference the variable to get log returns. When estimating the regression, use heteroskedasticity-robust standard errors (in the next lecture we learn how to use Newey-West SEs). A starter sketch follows below.
52 |
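A hedged sketch of the aggregation and regression steps, assuming a daily file with a `date` column and `SP500` and `MSFT` price columns; adjust the names to the actual stocks-sp500 file:

```python
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

daily = pd.read_csv("stocks-sp500.csv", parse_dates=["date"]).set_index("date")

# quarterly prices: take the last (closing) observation in each quarter
quarterly = daily[["SP500", "MSFT"]].resample("Q").last()

# log returns: first difference of log prices
returns = np.log(quarterly).diff().dropna()
returns.columns = ["sp500_ret", "msft_ret"]

# beta: regress the stock's log return on the market log return, with HC-robust SEs
beta_reg = smf.ols("msft_ret ~ sp500_ret", data=returns).fit(cov_type="HC1")
print(beta_reg.summary())
```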
53 |
54 | ## Further material
55 |
56 | - More materials on the case study can be found in Gabor's *da_case_studies* repository: [ch12-stock-returns-risk](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch12-stock-returns-risk/ch12-stock-returns-risk.R)
57 | - Arthur Turrell's Coding for Economics classes: [Time Series](https://aeturrell.github.io/coding-for-economists/time-series.html)
58 |
--------------------------------------------------------------------------------
/lecture16-timeseries-regression/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 16: Introduction to time-series regression
2 |
3 | ## Motivation
4 |
5 | Heating and cooling are potentially important uses of electricity. To investigate how weather conditions affect electricity consumption, you have collected data on temperature and residential electricity consumption in a hot region. How should you estimate the association between temperature and electricity consumption? How should you define the variables of interest, and how should you prepare the data, which has daily observations on temperature and monthly observations on electricity consumption? Should you worry about the fact that both electricity consumption and temperature vary a lot across months within years, and if yes, what should you do about it?
6 |
7 | Time series data is often used to analyze business, economic, and policy questions. Time series data presents additional opportunities as well as additional challenges for regression analysis. Unlike cross-sectional data, it enables examining how y changes when x changes, and it also allows us to examine what happens to y right away or with a delay. However, variables in time series data come with some special features that affect how we should estimate regressions, and how we can interpret their coefficients.
8 |
9 | ## This lecture
10 |
11 | This lecture introduces time-series regression via the [arizona-electricity](https://gabors-data-analysis.com/datasets/#arizona-electricity) dataset. During this lecture, students manipulate time-series data along the time dimension, create multiple time-series related graphs, and get familiar with (partial) autocorrelation. Differenced variables, lags of the outcome and of the explanatory variables, and (deterministic) seasonality are used in the regression models. These models are estimated via `statsmodels`' `get_robustcov_results` with Newey-West standard errors. Model comparison and the estimation of cumulative effects with valid SEs are also shown.
12 |
13 | This lecture is based on [Chapter 12, B: Electricity consumption and temperature](https://gabors-data-analysis.com/casestudies/#ch12b-electricity-consumption-and-temperature)
14 |
15 | ## Learning outcomes
16 | After successfully completing [`intro_time_series.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture16-timeseries-regression/intro_time_series.ipynb), students should be able to:
17 |
18 | - Merge different time-series data
19 | - Create time-series related descriptives and graphs
20 | - handle date as the axis with different formatting
21 | - create autocorrelation and partial autocorrelation graphs and interpret them
22 | - Run time-series regression
23 | - Estimate Newey-West standard errors and understand the role of lags
24 | - Control for seasonality via dummies
25 | - Add lagged variables to the model (and possibly leads as well)
26 | - How and why to use the same time interval when comparing competing time-series models
27 | - Estimate the standard error(s) for the cumulative effect
28 |
29 | ## Datasets used
30 |
31 | - [arizona-electricity](https://gabors-data-analysis.com/datasets/#arizona-electricity)
32 |
33 | ## Lecture Time
34 |
35 | Ideal overall time: **60-80 mins**.
36 |
37 | Going through [`intro_time_series.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture16-timeseries-regression/intro_time_series.ipynb) takes around *50-70 minutes* as there are some discussions and interpretations of the time series (e.g. stationarity, a transformation of variables, etc). Solving the tasks takes the remaining *5-10 minutes*.
38 |
39 |
40 | ## Homework
41 |
42 | *Type*: quick practice, approx 20 mins
43 |
44 | You will use the [case-shiller-la](https://gabors-data-analysis.com/datasets/#case-shiller-la) dataset to build a model for unemployment based on the Shiller price index. Load the data and consider only `pn` (Shiller price index) and `un` (unemployment) as the variables of interest. Both are seasonally adjusted. Decide which transformation to use to make the variables stationary. Create models in which you predict unemployment based on the Shiller price index. You should have at least one model that uses only contemporaneous effects and one that also includes lagged values of both variables as explanatory variables. A starter sketch follows below.
45 |
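A hedged starter sketch, assuming a monthly file with `pn` and `un` columns; first differences and a 12-lag Newey-West window are illustrative choices only (the lecture itself uses `get_robustcov_results`, while the equivalent `cov_type="HAC"` argument is used here):

```python
import pandas as pd
import statsmodels.formula.api as smf

cs = pd.read_csv("case-shiller-la.csv")

# first differences as one possible stationarity transformation
cs["dpn"] = cs["pn"].diff()
cs["dun"] = cs["un"].diff()
cs["dpn_lag1"] = cs["dpn"].shift(1)
cs["dun_lag1"] = cs["dun"].shift(1)

# contemporaneous model vs. model with lags, both with Newey-West SEs
m1 = smf.ols("dun ~ dpn", data=cs).fit(cov_type="HAC", cov_kwds={"maxlags": 12})
m2 = smf.ols("dun ~ dpn + dpn_lag1 + dun_lag1", data=cs).fit(
    cov_type="HAC", cov_kwds={"maxlags": 12}
)
print(m1.summary())
print(m2.summary())
```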
46 |
47 | ## Further material
48 |
49 | - More materials on the case study can be found in Gabor's *da_case_studies* repository: [ch12-electricity-temperature](https://github.com/gabors-data-analysis/da_case_studies/tree/master/ch12-electricity-temperature)
50 | - Arthur Turrell's Coding for Economics classes: [Time Series](https://aeturrell.github.io/coding-for-economists/time-series.html), [Forecasting](https://aeturrell.github.io/coding-for-economists/time-fcasts-env.html)
51 |
--------------------------------------------------------------------------------
/lecture17-basic-spatial-viz/03_spatial_datavisualisation_plotly.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "305165ae",
6 | "metadata": {},
7 | "source": [
8 | "# Lecture 17 – part III \n",
9 | " \n",
10 | "## Basic spatial data visualization \n",
11 | " - Hotels-Europe \n",
12 | " - Create maps with `plotly` \n",
13 | "\n",
14 | "\n",
15 | "Case-studies:\n",
16 | "\n",
17 | " - Ch03B Comparing hotel prices in Europe: Vienna vs London \n",
18 | " \n",
19 | "Data used:\n",
20 | "\n",
21 | " hotels-europe \n",
22 | "\n",
23 | "___"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "id": "d8106332",
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "import json\n",
34 | "import geopandas as gpd\n",
35 | "import numpy as np\n",
36 | "import warnings\n",
37 | "import pandas as pd\n",
38 | "import plotly.express as px\n",
39 | "\n",
40 | "%matplotlib inline\n",
41 | "warnings.filterwarnings(\"ignore\")"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "id": "6fe1f494",
47 | "metadata": {},
48 | "source": [
49 | "Read Vienna data"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "id": "08a5b712",
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "vienna_map = pd.read_csv(\"data_map/vienna.csv\")"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "id": "a23efa4e",
65 | "metadata": {},
66 | "source": [
67 | "Convert pandas dataframe to geopandas dataframe"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "id": "51ba68e8",
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "vienna_map = gpd.GeoDataFrame(\n",
78 | " vienna_map.loc[:, [c for c in vienna_map.columns if c != \"geometry\"]],\n",
79 | " geometry=gpd.GeoSeries.from_wkt(vienna_map[\"geometry\"]),\n",
80 | " crs=\"epsg:3005\",\n",
81 | ")"
82 | ]
83 | },
84 | {
85 | "cell_type": "markdown",
86 | "id": "1752aeb6",
87 | "metadata": {},
88 | "source": [
89 | "Create a geojson object"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "id": "6dd2e16d",
96 | "metadata": {},
97 | "outputs": [],
98 | "source": [
99 | "vmap = json.loads(vienna_map.to_json())"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "id": "0b4bba31",
105 | "metadata": {},
106 | "source": [
107 | "Create figure with plotly"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "id": "f6d6cfe2",
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "fig = px.choropleth_mapbox(\n",
118 | " vienna_map,\n",
119 | " geojson=vmap,\n",
120 | " locations=\"district\",\n",
121 | " color=\"price\",\n",
122 | " title=\"Average hotel prices in Vienna ($,2017)\",\n",
123 | " color_continuous_scale=\"viridis\",\n",
124 | " featureidkey=\"properties.district\", # featureidkey connects the original geopandas dataframe (vienna_map) to the geojson object (vmap)\n",
125 | " mapbox_style=\"carto-positron\",\n",
126 | " zoom=10,\n",
127 | " center={\"lat\": 48.210033, \"lon\": 16.363449},\n",
128 | " opacity=0.5,\n",
129 | ")\n",
130 | "fig.update_layout(margin={\"r\": 0, \"l\": 0, \"b\": 0})\n",
131 | "fig.show()"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": null,
137 | "id": "d0a350c6",
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "def generateColorScale(colors, naColor):\n",
142 | " colorArray = []\n",
143 | " colorArray.append([0, naColor])\n",
144 | " for grenze, color in zip(np.linspace(0.7, 1, len(colors)), colors):\n",
145 | " colorArray.append([grenze, color])\n",
146 | " return colorArray"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "id": "7b00b60d",
153 | "metadata": {},
154 | "outputs": [],
155 | "source": [
156 | "fig = px.choropleth_mapbox(\n",
157 | " vienna_map.fillna(0),\n",
158 | " geojson=vmap,\n",
159 | " locations=\"district\",\n",
160 | " color=\"price\",\n",
161 | " title=\"Average hotel prices in Vienna ($,2017)\",\n",
162 | " color_continuous_scale=generateColorScale(colors=[\"red\", \"yellow\"], naColor=\"gray\"),\n",
163 | " featureidkey=\"properties.district\",\n",
164 | " mapbox_style=\"carto-positron\",\n",
165 | " zoom=10,\n",
166 | " center={\"lat\": 48.210033, \"lon\": 16.363449},\n",
167 | " opacity=0.5,\n",
168 | ")\n",
169 | "fig.update_layout(margin={\"r\": 0, \"l\": 0, \"b\": 0})\n",
170 | "fig.show()"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "id": "4e706c25",
177 | "metadata": {},
178 | "outputs": [],
179 | "source": []
180 | }
181 | ],
182 | "metadata": {
183 | "kernelspec": {
184 | "display_name": "Python 3 (ipykernel)",
185 | "language": "python",
186 | "name": "python3"
187 | },
188 | "language_info": {
189 | "codemirror_mode": {
190 | "name": "ipython",
191 | "version": 3
192 | },
193 | "file_extension": ".py",
194 | "mimetype": "text/x-python",
195 | "name": "python",
196 | "nbconvert_exporter": "python",
197 | "pygments_lexer": "ipython3",
198 | "version": "3.8.10"
199 | }
200 | },
201 | "nbformat": 4,
202 | "nbformat_minor": 5
203 | }
204 |
--------------------------------------------------------------------------------
/lecture17-basic-spatial-viz/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 17: Spatial data visualization
2 |
3 | ## Motivation
4 |
5 | Visualizing data spatially allows us to gain insights into what is going on beyond our bubble. Aside from being great visuals that immediately engage audiences, map data visualizations provide critical context for the metrics. Combining geospatial information with data creates a greater scope of understanding. Some benefits of using maps in your data visualization include:
6 |
7 | 1. A greater ability to more easily understand the distribution of your variable across the city, state, country, or world.
8 | 2. The ability to compare the activity across several locations at a glance
9 | 3. More intuitive decision making for company leaders
10 | 4. Contextualizing your data in the real world
11 |
12 |
13 | There is a lot of room for creativity when making map dashboards because there are numerous ways to convey information with this kind of visualization. We map geographical regions colored, shaded, or graded according to some variable. Such maps are visually striking, especially when the spatial units are familiar entities.
14 |
15 | | Life expectancy map | Hotel prices in cities |
16 | |-------------------------|-------------------------|
17 | |  |  |
18 |
19 |
20 | ## This lecture
21 |
22 | This lecture introduces spatial data visualization using maps. During the lecture, students learn how to plot world maps using the included map data (`data_map/worldmap.csv`) with the [worldbank-lifeexpectancy](https://gabors-data-analysis.com/datasets/#worldbank-lifeexpectancy) data. Plotting the raw life expectancy at birth on a world map is already a powerful tool, but students will also learn how to show deviations from the expected value given by the regression model. In the second part, students import raw `shp` files with auxiliary files, which contain the map of London boroughs and Vienna districts. With the [hotels-europe](https://gabors-data-analysis.com/datasets/#hotels-europe) dataset, the average price for each unit on the map is shown.
23 |
24 | Case studies used during the lecture:
25 | - [Chapter 08, B: How is life expectancy related to the average income of a country?](https://gabors-data-analysis.com/casestudies/#ch08b-how-is-life-expectancy-related-to-the-average-income-of-a-country)
26 | - [Chapter 03, B: Comparing hotel prices in Europe: Vienna vs London](https://gabors-data-analysis.com/casestudies/#ch03b-comparing-hotel-prices-in-europe-vienna-vs-london)
27 |
28 | ## Learning outcomes
29 | After successfully completing [`01_spatial_datavisualisation.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture17-basic-spatial-viz/01_spatial_datavisualisation.ipynb) students should be able to:
30 |
31 | - Understand how `geom_polygon` works
32 | - Shaping the outlook of the map with `coord_equal` or `coord_map`
33 | - Creating a `theme_map` theme
34 | - Use different coloring with `scale_fill_gradient`
35 | - How to match different data tables to be able to plot a map
36 | - Use custom values as a filler on the map based on life-expectancy case study
37 |
38 | After successfully completing [`02_spatial_datavisualisation.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture17-basic-spatial-viz/02_spatial_datavisualisation.ipynb) students should be able to:
39 |
40 | - Use the `geopandas` package to import 'shp' files and the needed auxiliary files such as 'shx' and 'dbf' (a short sketch follows this list)
41 | - `geom_path` to color the edges of the map
42 | - Map manipulations to show only inner-London boroughs
43 | - Add (borough or district) names to a map with `geom_text`
44 | - Control for limits of legend colors with `scale_fill_gradientn()`
45 | - Use nice color maps with unique palettes
46 | - Task for Vienna: replicate the same as for London
47 |
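A hedged sketch of the shapefile import step mentioned above; the file names follow this lecture's `data_map` folder, while the merge with average hotel prices is only indicated as a placeholder:

```python
import geopandas as gpd
import matplotlib.pyplot as plt

# geopandas reads the .shp and picks up the .shx/.dbf companion files automatically
london = gpd.read_file("data_map/London_Borough_Excluding_MHW.shp")

# hypothetical next step: merge a dataframe of average hotel prices per borough on NAME
# london = london.merge(avg_prices, on="NAME", how="left")

# quick look at the polygons, shaded by an existing attribute (HECTARES) for illustration
london.plot(column="HECTARES", legend=True, cmap="viridis", edgecolor="white")
plt.axis("off")
plt.show()
```
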
48 | After successfully completing [`03_spatial_datavisualisation_plotly.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture17-basic-spatial-viz/03_spatial_datavisualisation_plotly.ipynb) students should be able to:
49 |
50 | - Use `plotly`'s `choropleth_mapbox` function to create interactive maps.
51 |
52 |
53 | ## Lecture Time
54 |
55 | Ideal overall time: **40-60 mins**.
56 |
57 | Going through [`01_spatial_datavisualisation.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture17-basic-spatial-viz/01_spatial_datavisualisation.ipynb) and [`02_spatial_datavisualisation.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture17-basic-spatial-viz/02_spatial_datavisualisation.ipynb) takes around 20-40 minutes. Solving the tasks takes the remaining 20-40 minutes as there are two long tasks.
58 |
59 |
60 | ## Homework
61 |
62 | *Type*: quick practice, approx 10 mins
63 |
64 | Get countries' GDP growth rates with the `wbdata` package. Plot the values on a world map. A starter sketch follows below.
65 |
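A hedged sketch, assuming the `wbdata` package (as in earlier lectures) and a `plotly` choropleth keyed on ISO-3 country codes; the indicator code `NY.GDP.MKTP.KD.ZG` (GDP growth, annual %) and the chosen year are assumptions:

```python
from datetime import date

import pandas as pd
import plotly.express as px
import wbdata

# GDP growth (annual %) for one year, all countries
growth = wbdata.get_dataframe(
    {"NY.GDP.MKTP.KD.ZG": "gdp_growth"}, country="all", data_date=date(2019, 1, 1)
).reset_index()

# attach ISO-3 codes so plotly can locate the countries
iso = pd.DataFrame(wbdata.get_country())[["id", "name"]]
growth = growth.merge(iso, left_on="country", right_on="name", how="inner")

fig = px.choropleth(
    growth,
    locations="id",  # ISO-3 country code
    color="gdp_growth",
    color_continuous_scale="viridis",
    title="GDP growth rate, 2019 (%)",
)
fig.show()
```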
66 |
67 | ## Further material
68 |
69 | - Arthur Turrell's Coding for Economics classes: [Geo-Spatial Visualisation](https://aeturrell.github.io/coding-for-economists/geo-vis.html).
70 | - Create beautiful maps with [Plotly](https://plotly.com/python/maps/).
71 | - Maps with [Matplotlib](https://towardsdatascience.com/mapping-with-matplotlib-pandas-geopandas-and-basemap-in-python-d11b57ab5dac).
72 |
--------------------------------------------------------------------------------
/lecture17-basic-spatial-viz/data_map/BEZIRKSGRENZEOGDPolygon.dbf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gabors-data-analysis/da-coding-python/ac4dcefcb3b6e54e36e21490948e2edb41a13485/lecture17-basic-spatial-viz/data_map/BEZIRKSGRENZEOGDPolygon.dbf
--------------------------------------------------------------------------------
/lecture17-basic-spatial-viz/data_map/BEZIRKSGRENZEOGDPolygon.shp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gabors-data-analysis/da-coding-python/ac4dcefcb3b6e54e36e21490948e2edb41a13485/lecture17-basic-spatial-viz/data_map/BEZIRKSGRENZEOGDPolygon.shp
--------------------------------------------------------------------------------
/lecture17-basic-spatial-viz/data_map/BEZIRKSGRENZEOGDPolygon.shx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gabors-data-analysis/da-coding-python/ac4dcefcb3b6e54e36e21490948e2edb41a13485/lecture17-basic-spatial-viz/data_map/BEZIRKSGRENZEOGDPolygon.shx
--------------------------------------------------------------------------------
/lecture17-basic-spatial-viz/data_map/London_Borough_Excluding_MHW.dbf:
--------------------------------------------------------------------------------
1 | q
2 | ! J NAME C GSS_CODE C HECTARES N NONLD_AREA N ONS_INNER C SUB_2009 C SUB_2006 C
3 |
Kingston upon Thames E09000021 3726.117 0.000F Croydon E09000008 8649.441 0.000F Bromley E09000006 15013.487 0.000F Hounslow E09000018 5658.541 60.755F Ealing E09000009 5554.428 0.000F Havering E09000016 11445.735 210.763F Hillingdon E09000017 11570.063 0.000F Harrow E09000015 5046.330 0.000F Brent E09000005 4323.270 0.000F Barnet E09000003 8674.837 0.000F Lambeth E09000022 2724.940 43.927T Southwark E09000028 2991.340 105.139T Lewisham E09000023 3531.706 16.795T Greenwich E09000011 5044.190 310.785F Bexley E09000004 6428.649 370.619F Enfield E09000010 8220.025 0.000F Waltham Forest E09000031 3880.793 0.000F Redbridge E09000026 5644.225 2.300F Sutton E09000029 4384.698 0.000F Richmond upon Thames E09000027 5876.111 135.443F Merton E09000024 3762.466 0.000F Wandsworth E09000032 3522.022 95.600T Hammersmith and FulhamE09000013 1715.409 75.648T Kensington and ChelseaE09000020 1238.379 25.994T Westminster E09000033 2203.005 54.308T Camden E09000007 2178.932 0.000T Tower Hamlets E09000030 2157.501 179.707T Islington E09000019 1485.664 0.000T Hackney E09000012 1904.902 0.000T Haringey E09000014 2959.837 0.000T Newham E09000025 3857.806 237.637T Barking and Dagenham E09000002 3779.934 169.150F City of London E09000001 314.942 24.546T
--------------------------------------------------------------------------------
/lecture17-basic-spatial-viz/data_map/London_Borough_Excluding_MHW.shp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gabors-data-analysis/da-coding-python/ac4dcefcb3b6e54e36e21490948e2edb41a13485/lecture17-basic-spatial-viz/data_map/London_Borough_Excluding_MHW.shp
--------------------------------------------------------------------------------
/lecture17-basic-spatial-viz/data_map/London_Borough_Excluding_MHW.shx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gabors-data-analysis/da-coding-python/ac4dcefcb3b6e54e36e21490948e2edb41a13485/lecture17-basic-spatial-viz/data_map/London_Borough_Excluding_MHW.shx
--------------------------------------------------------------------------------
/lecture17-basic-spatial-viz/output/heu_prices.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gabors-data-analysis/da-coding-python/ac4dcefcb3b6e54e36e21490948e2edb41a13485/lecture17-basic-spatial-viz/output/heu_prices.png
--------------------------------------------------------------------------------
/lecture17-basic-spatial-viz/output/lifeexp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gabors-data-analysis/da-coding-python/ac4dcefcb3b6e54e36e21490948e2edb41a13485/lecture17-basic-spatial-viz/output/lifeexp.png
--------------------------------------------------------------------------------
/lecture18-cross-validation/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 18: Cross-validating linear models
2 |
3 | ## Motivation
4 |
5 | You have a car that you want to sell in the near future. You want to know what price you can expect if you were to sell it. You may also want to know what you could expect if you were to wait one more year and sell your car then. You have data on used cars with their age and other features, and you can predict price with several kinds of regression models with different righthand-side variables in different functional forms. How should you select the regression model that would give the best prediction?
6 |
7 | We introduce point prediction versus interval prediction; we discuss the components of prediction error and how to find the best prediction model that will likely produce the best fit (smallest prediction error) in the live data, using observations in the original data. We introduce loss functions in general and mean squared error (MSE) and its square root (RMSE) in particular, to evaluate predictions. We discuss three ways of finding the best predictor model, using all data and the Bayesian Information Criterion (BIC) as the measure of fit, using training–test splitting of the data, and using k-fold cross-validation, which is an improvement on the training–test split.
8 |
9 | ## This lecture
10 |
11 | This lecture refreshes methods for data cleaning and refactoring as well as some basic feature engineering practices. After the data is set, multiple competing regressions are run and compared via BIC and k-fold cross-validation, the latter carried out with the `sklearn` package. After the best-performing model is chosen (by RMSE), prediction performance and the associated risks are discussed. For the case when a log-transformed outcome is used, transforming predictions back to level and evaluating their performance is also covered.
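
As a rough reference for the cross-validation step, a minimal sketch with `sklearn` could look like the following (a toy illustration on simulated data; the variables and feature sets are placeholders, not the notebook's actual code):

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score

# toy stand-in for the used-cars working data
rng = np.random.default_rng(42)
cars = pd.DataFrame({"age": rng.integers(1, 15, 300), "odometer": rng.uniform(0, 12, 300)})
cars["agesq"] = cars["age"] ** 2
cars["price"] = (
    18000 - 1500 * cars["age"] + 40 * cars["agesq"] - 400 * cars["odometer"]
    + rng.normal(0, 800, 300)
)

# competing specifications, compared by cross-validated RMSE
feature_sets = {"M1": ["age"], "M2": ["age", "agesq", "odometer"]}
kfold = KFold(n_splits=4, shuffle=True, random_state=42)
for name, cols in feature_sets.items():
    scores = cross_val_score(
        LinearRegression(), cars[cols], cars["price"],
        scoring="neg_root_mean_squared_error", cv=kfold,
    )
    print(f"{name}: CV RMSE = {-scores.mean():.1f}")
```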
12 |
13 | Case studies used:
14 | - [Chapter 13, A: Predicting used car value with linear regressions](https://gabors-data-analysis.com/casestudies/#ch13a-predicting-used-car-value-with-linear-regressions)
15 | - [Chapter 14, A: Predicting used car value: log prices](https://gabors-data-analysis.com/casestudies/#ch14a-predicting-used-car-value-log-prices)
16 |
17 | ## Learning outcomes
18 | After successfully completing [`crossvalidation_usedcars.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture18-cross-validation/crossvalidation_usedcars.ipynb), students should be able to:
19 |
20 | - Clean and prepare data for modeling
21 | - Decide on functional forms and carry out meaningful variable transformations
22 | - Run multiple regressions and compare performance based on BIC
23 | - Carry out k-fold cross validation with `sklearn` package for different regression models
24 | - Compare the prediction performance of the models
25 | - Understand what happens if a log-transformed outcome is used
26 | - convert prediction back to level
27 | - compare prediction performance of other (non-log) models
28 |
29 | ## Dataset used
30 |
31 | - [`used-cars`](https://gabors-data-analysis.com/datasets/#used-cars)
32 |
33 | ## Lecture Time
34 |
35 | Ideal overall time: **100 mins**.
36 |
37 |
38 | ## Further material
39 |
40 | - This lecture is a modified and combined version of [`ch13_used-cars.ipynb`](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch13-used-cars-reg/ch13_used-cars.ipynb) and [`ch14-used-cars-log.ipynb`](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch14-used-cars-log/ch14-used-cars-log.ipynb) codes from [Gabor's case study repository](https://github.com/gabors-data-analysis/da_case_studies).
41 |
42 |
--------------------------------------------------------------------------------
/lecture19-lasso/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 19: Prediction with LASSO
2 |
3 | ## Motivation
4 |
5 | You want to predict the rental prices of apartments in a big city using their location, size, amenities, and other features. You have access to data on many apartments with many variables. You know how to select the best regression model for prediction from several candidate models. But how should you specify those candidate models, to begin with? In particular, which of the many variables should they include, in what functional forms, and in what interactions? More generally, how can you make sure that the candidates include truly good predictive models?
6 |
7 | How should we specify the regression models? In particular, when we have many candidate predictor variables, how should we select from them, and how should we decide on their functional forms?
8 |
9 | ## This lecture
10 |
11 | This lecture discusses how to build regression models for prediction and how to evaluate the predictions they produce. We discuss how to select
variables out of a large pool of candidate x variables, and how to decide on their functional forms. We introduce LASSO, an algorithm that can help with variable selection. With respect to evaluating predictions, we discuss why we need a holdout sample for evaluation that is separate from all of the rest of the data we use for model building and selection.
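
To make the three-sample logic concrete, here is a minimal, self-contained sketch (on toy data, not the prepared Airbnb work file) of tuning the LASSO penalty by cross-validation on the train set and evaluating on an untouched holdout set:

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# toy data standing in for the prepared Airbnb features and log price
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(500, 20)), columns=[f"x{i}" for i in range(20)])
y = 2 * X["x0"] - X["x1"] + rng.normal(size=500)

# keep a holdout sample that is not used for model building or selection
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)

# LASSO with the penalty (alpha) chosen by 5-fold cross-validation on the train set
lasso = make_pipeline(StandardScaler(), LassoCV(alphas=np.logspace(-4, 0, 50), cv=5))
lasso.fit(X_train, y_train)

# evaluate the selected model on the holdout sample
rmse = np.sqrt(np.mean((lasso.predict(X_holdout) - y_holdout) ** 2))
print("holdout RMSE:", round(rmse, 3))
```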
13 |
14 | Case study:
15 | - [Chapter 14, B: Predicting AirBnB apartment prices: selecting a regression model](https://gabors-data-analysis.com/casestudies/#ch14b-predicting-airbnb-apartment-prices-selecting-a-regression-model)
16 |
17 | ## Learning outcomes
18 | After successfully completing [`02_lasso_airbnb_prediction.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture19-lasso/02_lasso_airbnb_prediction.ipynb), students should be able to:
19 |
20 | - Data cleaning and refactoring to prepare for LASSO type modelling
21 | - Basic feature engineering for LASSO
22 | - Understand the three sample approach:
23 | - train and test sample to select model (cross validation for tuning parameters)
24 | - hold-out sample to evaluate model prediction performance
25 | - Model selection with
26 | - (linear) regression models
27 | - LASSO, RIDGE and Elastic Net
28 | - Model diagnostics
29 |     - Performance measure(s) on hold-out set to evaluate competing models
30 | - stability of the prediction
31 | - specific diagnostic figures for LASSO
32 |
33 | ## Dataset used
34 |
35 | - [`airbnb`](https://gabors-data-analysis.com/datasets/#airbnb)
36 |
37 | ## Lecture Time
38 |
39 | Ideal overall time: **100 mins**.
40 |
41 |
42 | ## Further material
43 |
44 | - This lecture is a modified version of [`ch14-airbnb-prediction.ipynb`](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch14-airbnb-reg/ch14-airbnb-prediction.ipynb) from [Gabor's case study repository](https://github.com/gabors-data-analysis/da_case_studies).
45 |
46 |
--------------------------------------------------------------------------------
/lecture20-regression-tree/01_usedcars_cart_data_preparation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "a4b11ce4",
6 | "metadata": {},
7 | "source": [
8 | "## Prediction with CART – data preparation\n",
9 | "Case studies: \n",
10 | " - CH15A Predicting used car value with regression trees \n",
11 | " \n",
12 | "Dataset:\n",
13 | "\n",
14 | " used-cars"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "id": "ad62ffe3",
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "import os\n",
25 | "import warnings\n",
26 | "\n",
27 | "import numpy as np\n",
28 | "import pandas as pd\n",
29 | "from skimpy import skim\n",
30 | "\n",
31 | "warnings.filterwarnings(\"ignore\")"
32 | ]
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "id": "70553db2",
37 | "metadata": {},
38 | "source": [
39 | "Import data"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "id": "49876c5d",
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "data = pd.read_csv(\"https://osf.io/7gvz9/download\")"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "id": "f2e86a5e",
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "data.head()"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "id": "17265243",
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "data.shape"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "id": "566e72ba",
75 | "metadata": {},
76 | "source": [
77 | "### Sample design\n",
78 | "\n",
79 | "Manage missing"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "id": "30133e81",
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "data[\"fuel\"] = data[\"fuel\"].fillna(\"Missing\")\n",
90 | "data[\"drive\"] = data[\"drive\"].fillna(\"Missing\")\n",
91 | "data[\"cylinders\"] = data[\"cylinders\"].fillna(\"Missing\")\n",
92 | "data[\"transmission\"] = data[\"transmission\"].fillna(\"Missing\")\n",
93 | "data[\"type\"] = data[\"type\"].fillna(\"Missing\")"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "id": "0b2e2f57",
99 | "metadata": {},
100 | "source": [
101 | "Missing changed to good not missing for condition"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "id": "6e5493d0",
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "data[\"condition\"].value_counts()\n"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "id": "7d82b952",
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "data[\"condition\"] = data[\"condition\"].fillna(\"good\")\n"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "id": "60ccd5cb",
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "data[\"condition\"].value_counts()\n"
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "id": "afeff1e3",
137 | "metadata": {},
138 | "source": [
139 | "Drop hybrid models then drop column"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "id": "58ed2114",
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "data = data.loc[lambda x: x[\"Hybrid\"] == 0].drop(\"Hybrid\", axis=1)"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "id": "70351995",
155 | "metadata": {},
156 | "source": [
157 | "Keep gas-fuelled vehicles"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "id": "fe6377ef",
164 | "metadata": {},
165 | "outputs": [],
166 | "source": [
167 | "data = data.loc[lambda x: x[\"fuel\"] == \"gas\"]"
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "id": "2ce16b09",
173 | "metadata": {},
174 | "source": [
175 | "Drop vehicles in fair and new condition, trucks"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "id": "4fb0d2e9",
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "data = data.loc[lambda x: ~x[\"condition\"].isin([\"new\",\"fair\"])]"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "id": "f2765ba3",
191 | "metadata": {},
192 | "source": [
193 | "Drop unrealistic values for price and odometer reading\n"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "id": "3559a496",
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "data = data.loc[lambda x: (x[\"price\"].isin(range(500, 25001))) & (x[\"odometer\"] <= 100)]"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "id": "998ea36b",
209 | "metadata": {},
210 | "source": [
211 | "Drop if price is smaller than 1000 and condition is like new or age is less than 8\n"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": null,
217 | "id": "f7ee1105",
218 | "metadata": {},
219 | "outputs": [],
220 | "source": [
221 | "data = data.loc[\n",
222 | " lambda x: ~((x[\"price\"] < 1000) & ((x[\"condition\"] == \"like new\") | (x[\"age\"] < 8)))\n",
223 | "]"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "id": "0703f361",
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "data = data.loc[lambda x: x[\"transmission\"] != \"manual\"]"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "id": "485cfca8",
239 | "metadata": {},
240 | "source": [
241 | "Drop if truck"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": null,
247 | "id": "7ca71bc0",
248 | "metadata": {},
249 | "outputs": [],
250 | "source": [
251 | "data = data.loc[lambda x: ~x[\"type\"].isin([\"truck\", \"pickup\"])]"
252 | ]
253 | },
254 | {
255 | "cell_type": "markdown",
256 | "id": "910ce557",
257 | "metadata": {},
258 | "source": [
259 | "Drop price string"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": null,
265 | "id": "2f0fa7b6",
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "data = data.drop(\"pricestr\",axis=1)"
270 | ]
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "id": "dabd73c5",
275 | "metadata": {},
276 | "source": [
277 | "To be on the safe side, drop NA prices"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "id": "e1fe2c0f",
284 | "metadata": {},
285 | "outputs": [],
286 | "source": [
287 | "data = data.loc[lambda x: x[\"price\"].notna()]"
288 | ]
289 | },
290 | {
291 | "cell_type": "markdown",
292 | "id": "4703b309",
293 | "metadata": {},
294 | "source": [
295 | "### Data generation & descriptives\n",
296 | "\n",
297 | "Variables we are interested in:\n",
298 | " \n",
299 | " price age odometer + condition cylinder dealer city LE"
300 | ]
301 | },
302 | {
303 | "cell_type": "markdown",
304 | "id": "85834185",
305 | "metadata": {},
306 | "source": [
307 | "Condition"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "id": "f6c5c271",
314 | "metadata": {},
315 | "outputs": [],
316 | "source": [
317 | "data[\"cond_excellent\"] = np.where(data[\"condition\"] == \"excellent\", 1, 0)\n",
318 | "data[\"cond_good\"] = np.where(data[\"condition\"] == \"good\", 1, 0)\n",
319 | "data[\"cond_likenew\"] = np.where(data[\"condition\"] == \"like new\", 1, 0)"
320 | ]
321 | },
322 | {
323 | "cell_type": "markdown",
324 | "id": "4313885e",
325 | "metadata": {},
326 | "source": [
327 | "Cylinders"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "id": "23c4fb24",
334 | "metadata": {},
335 | "outputs": [],
336 | "source": [
337 | "data.cylinders.value_counts()"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": null,
343 | "id": "b54f0843",
344 | "metadata": {},
345 | "outputs": [],
346 | "source": [
347 | "data[\"cylind6\"] = np.where(data[\"cylinders\"] == \"6 cylinders\", 1, 0)\n",
348 | "data.cylind6.value_counts()"
349 | ]
350 | },
351 | {
352 | "cell_type": "markdown",
353 | "id": "3b5a0120",
354 | "metadata": {},
355 | "source": [
356 | "Chicago\n"
357 | ]
358 | },
359 | {
360 | "cell_type": "code",
361 | "execution_count": null,
362 | "id": "a1d17f79",
363 | "metadata": {},
364 | "outputs": [],
365 | "source": [
366 | "data[\"chicago\"] = np.where(data[\"area\"] == \"chicago\", 1, 0)"
367 | ]
368 | },
369 | {
370 | "cell_type": "markdown",
371 | "id": "cf1961a7",
372 | "metadata": {},
373 | "source": [
374 | "age: quadratic, cubic"
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": null,
380 | "id": "668ab112",
381 | "metadata": {},
382 | "outputs": [],
383 | "source": [
384 | "data[\"agesq\"] = data[\"age\"] ** 2\n",
385 | "data[\"agecu\"] = data[\"age\"] ** 3"
386 | ]
387 | },
388 | {
389 | "cell_type": "markdown",
390 | "id": "c8b04c5f",
391 | "metadata": {},
392 | "source": [
393 | "odometer quadratic"
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": null,
399 | "id": "a9e3cb6f",
400 | "metadata": {},
401 | "outputs": [],
402 | "source": [
403 | "data[\"odometersq\"] = data[\"odometer\"] ** 3"
404 | ]
405 | },
406 | {
407 | "cell_type": "markdown",
408 | "id": "a96e8b72",
409 | "metadata": {},
410 | "source": [
411 | "Take a look at descrpitives"
412 | ]
413 | },
414 | {
415 | "cell_type": "code",
416 | "execution_count": null,
417 | "id": "6aaa3ec6",
418 | "metadata": {},
419 | "outputs": [],
420 | "source": [
421 | "skim(data)"
422 | ]
423 | },
424 | {
425 | "cell_type": "code",
426 | "execution_count": null,
427 | "id": "f175d53b",
428 | "metadata": {},
429 | "outputs": [],
430 | "source": [
431 | "data[\"price\"].describe()"
432 | ]
433 | },
434 | {
435 | "cell_type": "code",
436 | "execution_count": null,
437 | "id": "2ec30e19",
438 | "metadata": {},
439 | "outputs": [],
440 | "source": [
441 | "data[\"price\"].hist()"
442 | ]
443 | },
444 | {
445 | "cell_type": "code",
446 | "execution_count": null,
447 | "id": "9e6af8f1",
448 | "metadata": {},
449 | "outputs": [],
450 | "source": [
451 | "data[\"price\"].apply(np.log).hist()"
452 | ]
453 | },
454 | {
455 | "cell_type": "markdown",
456 | "id": "9cd3970b",
457 | "metadata": {},
458 | "source": [
459 | "Save data for prediction"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": null,
465 | "id": "ac698017",
466 | "metadata": {},
467 | "outputs": [],
468 | "source": [
469 | "os.makedirs(\"data\", exist_ok=True)\n",
470 | "data.to_csv(\"data/usedcars_cart_work.csv\", index=False)"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": null,
476 | "id": "625c585b",
477 | "metadata": {},
478 | "outputs": [],
479 | "source": []
480 | }
481 | ],
482 | "metadata": {
483 | "kernelspec": {
484 | "display_name": "Python 3 (ipykernel)",
485 | "language": "python",
486 | "name": "python3"
487 | },
488 | "language_info": {
489 | "codemirror_mode": {
490 | "name": "ipython",
491 | "version": 3
492 | },
493 | "file_extension": ".py",
494 | "mimetype": "text/x-python",
495 | "name": "python",
496 | "nbconvert_exporter": "python",
497 | "pygments_lexer": "ipython3",
498 | "version": "3.8.10"
499 | }
500 | },
501 | "nbformat": 4,
502 | "nbformat_minor": 5
503 | }
504 |
--------------------------------------------------------------------------------
/lecture20-regression-tree/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 20: Prediction with regression trees (CART)
2 |
3 | ## Motivation
4 |
5 | You want to predict the price of used cars as a function of their age and other features. You want to specify a model that includes the most important interactions and nonlinearities of those features, but you don’t know how to start. In particular, you are worried that you can’t start with a very complex regression model and use LASSO or some other method to simplify it because there are way too many potential interactions. Is there an alternative approach to regression that includes the most important interactions without you having to specify them?
6 |
7 | To carry out the prediction of used car prices, we show how to use the regression tree, an alternative to linear regression that is designed to build a model with the most important interactions and nonlinearities for prediction. However, the regression tree you build appears to overfit your original data. How can you build a regression tree model that is less prone to overfitting the original data and can thus give a better prediction in the live data?
8 |
9 |
10 | ## This lecture
11 |
12 | This lecture introduces the regression tree, an alternative to linear regression for prediction purposes that can find the most important predictor variables and their interactions and can approximate any functional form automatically. Regression trees split the data into small bins (subsamples) by the value of the x variables. For a quantitative y, they use the average y value in those small sets to predict y. We introduce the regression tree model and the most widely used algorithm to build a regression tree model. Somewhat confusingly, both the model and the algorithm are called CART (for classification and regression trees), but we reserve this name for the algorithm. We show that a regression tree is an intuitively appealing method to model nonlinearities and interactions among the x variables, but it is rarely used for prediction in itself because it is prone to overfit the original data. Instead, the regression tree forms the basic element of very powerful prediction methods that we’ll cover in the next seminar.
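
For orientation, a minimal sketch of growing and inspecting a small regression tree with `sklearn` (on simulated data, not the notebook's prepared used-cars file) might look like this:

```python
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor, export_text

# toy stand-in for the used-cars working data
rng = np.random.default_rng(1)
cars = pd.DataFrame({"age": rng.integers(1, 15, 400), "odometer": rng.uniform(0, 12, 400)})
cars["price"] = 20000 - 1200 * cars["age"] - 500 * cars["odometer"] + rng.normal(0, 800, 400)

# stopping rules (depth, minimum split size) control how far the tree grows
tree = DecisionTreeRegressor(max_depth=3, min_samples_split=20, random_state=42)
tree.fit(cars[["age", "odometer"]], cars["price"])

# each leaf predicts the average price of the observations that fall into it
print(export_text(tree, feature_names=["age", "odometer"]))
```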
13 |
14 | Case study:
15 | - [Chapter 15, A: Predicting used car value with regression trees](https://gabors-data-analysis.com/casestudies/#ch15a-predicting-used-car-value-with-regression-trees)
16 |
17 | ## Learning outcomes
18 | After successfully completing [`02_usedcars_cart_prediction.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture20-regression-tree/02_usedcars_cart_prediction.ipynb), students should be able to:
19 |
20 | - Understand how the regression tree works
21 | - Estimate a regression tree
22 | - Visualize regression tree
23 | - Set stopping criteria for CART
24 | - Depth or level of the tree
25 | - Number of leaves
26 | - minimum fit measure increase by a split
27 | - Pruning a large tree
28 | - find optimal complexity parameter (also known as pruning parameter)
29 | - Variable importance plot
30 | - Simple
31 | - Permutation importance
32 | - Prediction evaluation
33 | - comparing trees
34 | - comparing tree vs linear regressions
35 |
36 | ## Dataset used
37 |
38 | - [used-cars](https://gabors-data-analysis.com/datasets/#used-cars)
39 |
40 | ## Lecture Time
41 |
42 | Ideal overall time: **100 mins**.
43 |
44 |
45 | ## Further material
46 |
47 | - This lecture is a modified version of [`ch15-used-cars-cart.ipynb`](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch15-used-cars-cart/ch15-used-cars-cart.ipynb) from [Gabor's case study repository](https://github.com/gabors-data-analysis/da_case_studies).
48 |
49 |
--------------------------------------------------------------------------------
/lecture21-random-forest/00_download_model_fits.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "1f65e289",
6 | "metadata": {},
7 | "source": [
8 | "### This notebook is to download and unzip the fitted models used in `02_random_forest_airbnb.ipynb`.\n",
9 | "\n",
10 | "Since the cross-validation and model fits take a lots of time, we saved the results and uploaded them to an OSF repository which you can find [here](https://osf.io/mw4xj/?view_only=). You can also download the `model_fits.zip` folder from the repository and extract the zip file by hand, but the following code chunk will also do this for you."
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "id": "5ee46225",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import requests, zipfile\n",
21 | "from io import BytesIO\n",
22 | "\n",
23 | "print(\"Downloading started\")\n",
24 | "url = \"https://osf.io/nsa3q/download\"\n",
25 | "filename = \"model_fits.zip\"\n",
26 | "\n",
27 | "# Downloading the file by sending the request to the URL\n",
28 | "req = requests.get(url)\n",
29 | "print(\"Downloading completed\")\n",
30 | "\n",
31 | "# extracting the zip file contents\n",
32 | "zipfile = zipfile.ZipFile(BytesIO(req.content))\n",
33 | "zipfile.extractall(\".\")\n",
34 | "print(\"Folder unzipped\")"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "id": "3d219fa2",
41 | "metadata": {},
42 | "outputs": [],
43 | "source": []
44 | }
45 | ],
46 | "metadata": {
47 | "kernelspec": {
48 | "display_name": "Python 3 (ipykernel)",
49 | "language": "python",
50 | "name": "python3"
51 | },
52 | "language_info": {
53 | "codemirror_mode": {
54 | "name": "ipython",
55 | "version": 3
56 | },
57 | "file_extension": ".py",
58 | "mimetype": "text/x-python",
59 | "name": "python",
60 | "nbconvert_exporter": "python",
61 | "pygments_lexer": "ipython3",
62 | "version": "3.8.10"
63 | }
64 | },
65 | "nbformat": 4,
66 | "nbformat_minor": 5
67 | }
68 |
--------------------------------------------------------------------------------
/lecture21-random-forest/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 21: Predicting with Random Forest and Boosting
2 |
3 | ## Motivation
4 |
5 | You need to predict rental prices of apartments using various features. You suspect that the various features may interact with each other in determining price, so you would like to use a regression tree. But you want to build a model that gives the best possible prediction, better than a single tree. What methods are available that keep the advantage of regression trees but give a better prediction? How should you choose from among those methods?
6 |
7 | How can you grow a random forest, the most widely used tree-based method, to carry out the prediction of apartment rental prices? What details do you have to decide on, how should you decide on them, and how can you evaluate the results?
8 |
9 | A regression tree can capture complicated interactions and nonlinearities for predicting a quantitative y variable, but it is prone to overfit the original data, even after appropriate pruning. It turns out, however, that combining multiple regression trees grown on the same data can yield a much better prediction. Such methods are called ensemble methods. There are many ensemble methods based on regression trees, and some are known to produce very good predictions. But these methods are rather complex, and some of them are not straightforward to use.
10 |
11 | ## This lecture
12 |
13 | This lecture introduces two ensemble methods based on regression trees: random forest and boosting. We start by introducing the main idea of ensemble methods: combining results from many imperfect models can lead to a much better prediction than a single model that we try to build to perfection. The random forest is perhaps the most frequently used method to predict a quantitative y variable, both because of its excellent predictive performance and because it is relatively simple to use. Even more than with a single tree, it is hard to understand the underlying patterns of association between y and x that drive the predictions of ensemble methods. We discuss some diagnostic tools that can help with that: variable importance plots, partial dependence plots, and examining the quality of predictions in subgroups. Finally, we show another method: boosting, an alternative approach to making predictions based on an ensemble of regression trees via `xgboost`.
14 |
15 | Note that some of the methods used take a considerable amount of time to run on a simple PC, so pre-run model results are also uploaded to the repository to speed up the seminar.
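
For orientation, the tuning step can be sketched roughly as follows (a toy illustration of `RandomForestRegressor` with `GridSearchCV`; the data and grid values are placeholders, not the notebook's actual setup):

```python
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# toy stand-in data; in the notebook the features and target come from the prepared Airbnb file
rng = np.random.default_rng(2)
X = pd.DataFrame(rng.normal(size=(300, 8)), columns=[f"x{i}" for i in range(8)])
y = X["x0"] - 2 * X["x1"] * X["x2"] + rng.normal(size=300)

# tune the two key hyperparameters with 5-fold cross-validation, scoring by RMSE
grid = GridSearchCV(
    RandomForestRegressor(n_estimators=200, random_state=42),
    param_grid={"max_features": [2, 4, 6], "min_samples_split": [5, 10]},
    scoring="neg_root_mean_squared_error",
    cv=5,
)
grid.fit(X, y)
print(grid.best_params_, round(-grid.best_score_, 3))
```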
16 |
17 | Case study:
18 | - [Chapter 16, A: Predicting apartment prices with random forest](https://gabors-data-analysis.com/casestudies/#ch16a-predicting-apartment-prices-with-random-forest)
19 |
20 | ## Learning outcomes
21 |
22 | Lecturer/students should be aware that there is a separate notebook, [`01_prepare_airbnb.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture21-random-forest/01_prepare_airbnb.ipynb), for this seminar that covers only the data cleaning and feature engineering process. Understanding how to prepare the data for these methods is extremely important; without it, data analysts do garbage-in, garbage-out analysis. Usually, due to time constraints, this part is not covered in the seminar; instead, students are asked to work through it before the seminar.
23 |
24 | After successfully completing [`02_random_forest_airbnb.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture21-random-forest/02_random_forest_airbnb.ipynb), students should be able to:
25 |
26 | - Estimate random forest models via `sklearn`
27 |     - understand `max_features` and `min_samples_split` parameters
28 | - GridSearchCV to search for hyperparameters
29 | - Understanding random forest's output
30 | - variable importance plots: all, top 10 and grouped variables (typically factors)
31 | - partial dependence plot
32 | - sub-sample analysis for understanding prediction performance across groups
33 | - SHAP values
34 | - Run a 'Horse-Race' prediction competition with:
35 | - Linear regression (OLS)
36 | - ElasticNet
37 | - Regression Tree with CART
38 | - Random Forest
39 | - XGBoost model
40 |
41 | ## Dataset used
42 |
43 | - [airbnb](https://gabors-data-analysis.com/datasets/#airbnb)
44 |
45 | ## Lecture Time
46 |
47 | Ideal overall time: **100 mins**.
48 |
49 |
50 | ## Further material
51 |
52 | - This lecture is a modified version of [ch16-airbnb-random-forest.ipynb](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch16-airbnb-random-forest/Ch16-airbnb-random-forest.ipynb) from [Gabor's case study repository](https://github.com/gabors-data-analysis/da_case_studies).
53 |
54 |
--------------------------------------------------------------------------------
/lecture22-classification/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 22: Prediction and classification of binary outcome with ML tools
2 |
3 | ## Motivation
4 |
5 | Predicting whether people will repay their loans or default on them is important to a bank that sells such loans. Should the bank predict the default probability for applicants? Or, rather, should it classify applicants into prospective defaulters and prospective repayers? And how are the two kinds of predictions related? In particular, can the bank use probability predictions to classify applicants into defaulters and repayers, in a way that takes into account the bank’s costs when a default happens and its costs when it forgoes a good applicant?
6 |
7 | Many companies have relationships with other companies, as suppliers or clients. Whether those other companies stay in business in the future is an important question for them. You have rich data on many companies across the years that allows you to see which companies stayed in business and which companies exited, and relate that to various features of the companies. How should you use that data to predict the probability of exit for each company? How should you predict which companies will exit and which will stay in business in the future?
8 |
9 | In the previous seminars we covered the logic of predictive analytics and its most important steps, and we introduced specific methods to predict a quantitative y variable. But sometimes our y variable is not quantitative. The most important case is when y is binary: y = 1 or y = 0. How can we predict such a variable?
10 |
11 | ## This lecture
12 |
13 | This lecture introduces the framework and methods of probability prediction and classification analysis for binary y variables. Probability prediction means predicting the probability that y = 1, with the help of the predictor variables. Classification means predicting the binary y variable itself, with the help of the predictor variables: putting each observation in one of the y categories, also called classes. We build on what we know about probability models and the basics of probability prediction from [lecture14-binary-models](https://github.com/gabors-data-analysis/da-coding-python/tree/main/lecture14-binary-models). In this seminar, we put that into the framework of predictive analytics to arrive at the best probability model for prediction purposes and to evaluate its performance. We then discuss how we can turn probability predictions into classification with the help of a classification threshold and how we should use a loss function to find the optimal threshold. We discuss how to evaluate a classification by making use of a confusion table and expected loss. We introduce the ROC curve, which illustrates the trade-off of selecting different classification threshold values. We discuss how we can use random forests based on classification trees.
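
To fix ideas, here is a minimal, self-contained sketch (on simulated data, with assumed error costs, not the bisnode workflow) of turning predicted probabilities into a classification threshold that minimizes expected loss:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve

# toy data; in the seminar the probabilities come from the cross-validated logit / LASSO models
X, y = make_classification(n_samples=2000, n_features=10, random_state=42)
prob = LogisticRegression(max_iter=1000).fit(X, y).predict_proba(X)[:, 1]
print("AUC:", round(roc_auc_score(y, prob), 3))

# expected loss at each candidate threshold, given assumed relative costs of the two error types
FP, FN = 1, 10
fpr, tpr, thresholds = roc_curve(y, prob)
pos, neg = y.sum(), (1 - y).sum()
loss = (fpr * neg * FP + (1 - tpr) * pos * FN) / len(y)
best = thresholds[np.argmin(loss)]
print("optimal threshold:", round(best, 3), "expected loss:", round(loss.min(), 3))
```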
14 |
15 | Case study:
16 | - [Chapter 17, A: Predicting firm exit: probability and classification](https://gabors-data-analysis.com/casestudies/#ch17a-predicting-firm-exit-probability-and-classification)
17 |
18 | ## Learning outcomes
19 |
20 | Lecturer/students should be aware that there is a separate file at the official case studies repository, [`ch17-firm-exit-data-prep.ipynb`](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch17-predicting-firm-exit/ch17-firm-exit-data-prep.ipynb), for this seminar that covers only the data cleaning and feature engineering process for binary outcomes. Understanding how to prepare the data for these methods is extremely important; without it, data analysts do garbage-in, garbage-out analysis. Usually, due to time constraints, this part is not covered in the seminar; instead, students are asked to work through it before the seminar.
21 |
22 | After successfully completing [`firm_exit_classification.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture22-classification/firm_exit_classification.ipynb), students should be able to:
23 |
24 | - Understand what winsorizing is and how it helps
25 | - Basic linear models for predicting probabilities
26 | - simple linear probability model (review)
27 | - simple logistic model (logit, review)
28 | - Cross-validation with logit model
29 | - LASSO with logit model
30 | - Evaluation of model prediction
31 | - Calibration curve (review)
32 | - Confusion matrix
33 | - ROC curve and AUC (Area Under Curve)
34 | - Model comparison based on RMSE and AUC
35 | - User-defined loss function
36 |     - find the optimal threshold based on a self-defined loss function
37 | - Show ROC curve and optimal point
38 | - Show loss-function values for different points on ROC
39 | - CART and Random Forest
40 |     - modelling probabilities
41 |     - Random Forest with majority voting as an often misunderstood method, especially with a user-defined loss function
42 |
43 | ## Dataset used
44 |
45 | - [bisnode-firms](https://gabors-data-analysis.com/datasets/#bisnode-firms)
46 |
47 | ## Lecture Time
48 |
49 | Ideal overall time: **100 mins**.
50 |
51 |
52 | ## Further material
53 |
54 | - This lecture is a modified version of [`ch17-predicting-firm-exit.ipynb`](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch17-predicting-firm-exit/ch17-predicting-firm-exit.ipynb) from [Gabor's case study repository](https://github.com/gabors-data-analysis/da_case_studies).
55 |
56 |
--------------------------------------------------------------------------------
/lecture22-classification/helper_functions.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import sklearn.metrics as metrics
4 | from plotnine import *
5 | from sklearn.metrics import roc_curve
6 |
7 |
8 | def regression_results(y_true, y_pred):
9 |
10 | # Regression metrics
11 | explained_variance = metrics.explained_variance_score(y_true, y_pred)
12 | mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred)
13 | mse = metrics.mean_squared_error(y_true, y_pred)
14 | median_absolute_error = metrics.median_absolute_error(y_true, y_pred)
15 | r2 = metrics.r2_score(y_true, y_pred)
16 |
17 | print("explained_variance: ", round(explained_variance, 4))
18 | print("r2: ", round(r2, 4))
19 | print("MAE: ", round(mean_absolute_error, 4))
20 | print("MSE: ", round(mse, 4))
21 | print("RMSE: ", round(np.sqrt(mse), 4))
22 |
23 |
24 | def coef_matrix(X, model):
25 |
26 | coef_matrix = pd.concat(
27 | [pd.DataFrame(X.columns), pd.DataFrame(np.transpose(model.coef_))], axis=1
28 | )
29 | coef_matrix.columns = ["variable", "coefficient"]
30 |     coef_matrix = pd.concat(
31 |         [coef_matrix, pd.DataFrame({"variable": ["Intercept"], "coefficient": [model.intercept_]})],
32 |         ignore_index=True,
33 |     )
34 | return coef_matrix
35 |
36 |
37 | def cv_summary(lambdas, C_values, model):
38 | d = {
39 | "lambdas": lambdas,
40 | "C_values": C_values,
41 | "mean_cv_score": model.scores_[1].mean(axis=0),
42 | }
43 | return pd.DataFrame(data=d)
44 |
45 |
46 | def create_roc_plot(y_true, y_pred):
47 | fpr, tpr, thresholds = roc_curve(y_true, y_pred)
48 | all_coords = pd.DataFrame({"fpr": fpr, "tpr": tpr, "thresholds": thresholds})
49 |
50 | plot = (
51 | ggplot(all_coords, aes(x="fpr", y="tpr"))
52 | + geom_line(color="blue", size=0.7)
53 | + geom_area(position="identity", fill="mediumaquamarine", alpha=0.3)
54 |         + xlab("False Positive Rate (1-Specificity)")
55 | + ylab("True Positive Rate (Sensitivity)")
56 | + geom_abline(intercept=0, slope=1, linetype="dotted", color="black")
57 | + scale_y_continuous(
58 | limits=(0, 1), breaks=np.arange(0, 1.1, 0.1), expand=(0, 0.01)
59 | )
60 | + scale_x_continuous(
61 | limits=(0, 1), breaks=np.arange(0, 1.1, 0.1), expand=(0.01, 0)
62 | )
63 | + theme_bw()
64 | )
65 | return plot
66 |
67 |
68 | def sigmoid_array(x):
69 | return 1 / (1 + np.exp(-x))
70 |
71 |
72 | def generate_fold_prediction(model, X, fold, param_index):
73 | fold_coef = model.coefs_paths_[1][fold, param_index, :]
74 | return sigmoid_array(
75 | np.dot(X, np.transpose(fold_coef)[:-1]) + np.transpose(fold_coef)[-1]
76 | )
77 |
78 |
79 | def create_loss_plot(all_coords, optimal_threshold, curr_exp_loss, FP, FN):
80 | all_coords_copy = all_coords.copy()
81 | all_coords_copy["loss"] = (
82 | all_coords_copy.false_pos * FP + all_coords_copy.false_neg * FN
83 | ) / all_coords_copy.n
84 |
85 | t = optimal_threshold
86 | l = curr_exp_loss
87 |
88 | return (
89 | ggplot(all_coords_copy, aes(x="thresholds", y="loss"))
90 | + geom_line(color="blue", size=0.7)
91 | + scale_x_continuous(breaks=np.arange(0, 1.1, 0.1))
92 | + coord_cartesian(xlim=(0, 1))
93 | + geom_vline(xintercept=t, color="blue")
94 | + annotate(
95 | geom="text",
96 | x=t - 0.01,
97 | y=max(all_coords_copy.loss) - 0.4,
98 | label="best threshold: " + str(round(t, 2)),
99 | colour="red",
100 | angle=90,
101 | size=7,
102 | )
103 | + annotate(geom="text", x=t + 0.06, y=l, label=str(round(l, 2)), size=7)
104 | + theme_bw()
105 | )
106 |
107 |
108 | def create_roc_plot_with_optimal(all_coords, optimal_threshold):
109 | all_coords_copy = all_coords.copy()
110 | all_coords_copy["sp"] = 1 - all_coords_copy.true_neg / all_coords_copy.neg
111 | all_coords_copy["se"] = all_coords_copy.true_pos / all_coords_copy.pos
112 |
113 | best_coords = all_coords_copy[all_coords_copy.thresholds == optimal_threshold]
114 | sp = best_coords.sp.values[0]
115 | se = best_coords.se.values[0]
116 |
117 | return (
118 | ggplot(all_coords_copy, aes(x="sp", y="se"))
119 | + geom_line(color="blue", size=0.7)
120 | + scale_y_continuous(breaks=np.arange(0, 1.1, 0.1))
121 | + scale_x_continuous(breaks=np.arange(0, 1.1, 0.1))
122 | + geom_point(data=pd.DataFrame({"sp": [sp], "se": [se]}))
123 | + annotate(
124 | geom="text",
125 | x=sp,
126 | y=se + 0.03,
127 | label=str(round(sp, 2)) + ", " + str(round(se, 2)),
128 | size=7,
129 | )
130 | + geom_area(position="identity", fill="mediumaquamarine", alpha=0.3)
131 |         + xlab("False Positive Rate (1-Specificity)")
132 | + ylab("True Positive Rate (Sensitivity)")
133 | + geom_abline(intercept=0, slope=1, linetype="dotted", color="black")
134 | + theme_bw()
135 | )
136 |
--------------------------------------------------------------------------------
/lecture23-long-term-time-series/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 23: Forecasting from Time Series Data I
2 |
3 | ## Motivation
4 |
5 | Your task is to predict the number of daily tickets sold for next year in a swimming pool in a large city. The swimming pool sells tickets through its sales terminal that records all transactions. You aggregate that data to daily frequency. How should you use the information on daily sales to produce your forecast? In particular, how should you model trends, and how should you model seasonality by months of the year and days of the week to produce the best prediction?
6 |
7 |
8 | ## This lecture
9 |
10 | This lecture discusses forecasting: prediction from time series data for one or more time periods in the future. The focus of this chapter is forecasting future values of one variable, by making use of past values of the same variable, and possibly other variables, too. We build on what we learned about time series regressions in [lecture16-timeseries-regression](https://github.com/gabors-data-analysis/da-coding-python/tree/main/lecture16-timeseries-regression). We start with forecasts with a long horizon, which means many time periods into the future. Such forecasts use the information on trends, seasonality, and other long-term features of the time series.
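
As a minimal illustration of the deterministic trend-plus-seasonality approach (on a simulated daily series, not the swimming-pool data), the core regression can be sketched as:

```python
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# toy daily series standing in for aggregated ticket sales
idx = pd.date_range("2014-01-01", "2015-12-31", freq="D")
rng = np.random.default_rng(3)
df = pd.DataFrame({"date": idx})
df["sales"] = (
    100
    + 0.05 * np.arange(len(df))                  # slow upward trend
    + 20 * df["date"].dt.month.isin([6, 7, 8])   # summer seasonality
    + rng.normal(0, 10, len(df))                 # noise
)

# deterministic trend plus monthly and day-of-week seasonality
df["trend"] = np.arange(len(df))
df["month"] = df["date"].dt.month
df["dow"] = df["date"].dt.dayofweek
model = smf.ols("sales ~ trend + C(month) + C(dow)", data=df).fit()
print(model.params.head())
```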
11 |
12 | Case study:
13 | - [Chapter 18, A: Forecasting daily ticket sales for a swimming pool](https://gabors-data-analysis.com/casestudies/#ch18a-forecasting-daily-ticket-sales-for-a-swimming-pool)
14 |
15 | ## Learning outcomes
16 | After successfully completing [`long_term_swimming.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture23-long-term-time-series/long_term_swimming.ipynb), students should be able to:
17 |
18 | - Data munging with time series (review)
19 | - Adding deterministic variables such as trends, yearly/monthly/weekly seasonality
20 | - Adding deterministic variables with `pandas_market_calendars` package such as holidays, weekdays, etc.
21 | - Sample splitting with time series
22 | - Simple linear models:
23 | - deterministic trend/seasonality and/or other deterministic variables (holidays, etc.)
24 | - Cross-validation with time series
25 | - `prophet` package
26 | - Forecasting
27 | - Comparing models based on forecasting performance (RMSE)
28 | - Graphical representation of model fit and forecasts
29 |
30 | ## Dataset used
31 |
32 | - [swim-transactions](https://gabors-data-analysis.com/datasets/#swim-transactions)
33 |
34 | ## Lecture Time
35 |
36 | Ideal overall time: **50-60 mins**.
37 |
38 |
39 | ## Further material
40 |
41 | - This lecture is a modified version of [ch18-swimmingpool-predict.ipynb](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch18-swimmingpool/ch18-swimmingpool-predict.ipynb) from [Gabor's case study repository](https://github.com/gabors-data-analysis/da_case_studies).
42 |
43 |
--------------------------------------------------------------------------------
/lecture24-short-term-time-series/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 24: Forecasting from Time Series Data II - ARIMA and VAR models
2 |
3 | ## Motivation
4 |
5 | Your task is to predict how house prices will move in a particular city in the next months. You have monthly data on the house price index of the city, and you can collect monthly data on other variables that may be correlated with how house prices move. How should you use that data to forecast changes in house prices for the next few months? In particular, how should you use those other variables to help that forecast even though you don’t know their future values?
6 |
7 | ## This lecture
8 |
9 | This lecture discusses forecasting: prediction from time series data for one or more time periods in the future. The focus of this chapter is forecasting future values of one variable, by making use of past values of the same variable, and possibly other variables, too. We build on what we learned about time series regressions in [lecture16-timeseries-regression](https://github.com/gabors-data-analysis/da-coding-python/tree/main/lecture16-timeseries-regression). Here we turn to short-horizon forecasts that predict y for a few time periods ahead. These forecasts make use of the serial correlation of the time series of y besides those long-term features. We introduce autoregression (AR) and ARIMA models via the `statsmodels` package, which capture the patterns of serial correlation and can be used for short-horizon forecasting. We then turn to using other variables in forecasting and introduce vector autoregression (VAR) models that help in forecasting future values of those x variables that we can use to forecast y. We discuss how to carry out cross-validation in forecasting and the specific challenges and opportunities the time series nature of our data provides for assessing external validity.
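
As a minimal sketch of the ARIMA workflow with `statsmodels` (on a simulated monthly series, not the house price index), the estimation and forecasting steps look roughly like this:

```python
import numpy as np
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

# toy monthly index standing in for the house price series
rng = np.random.default_rng(4)
idx = pd.date_range("2000-01-01", periods=200, freq="MS")
y = pd.Series(np.cumsum(rng.normal(0.3, 1.0, 200)), index=idx)

# ARIMA(1,1,1): AR(1) and MA(1) terms on the first-differenced series
fit = ARIMA(y, order=(1, 1, 1)).fit()
print(fit.params)

# forecast the next 6 months
print(fit.forecast(steps=6))
```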
10 |
11 | Case study:
12 | - [Chapter 18, B: Forecasting a house price index](https://gabors-data-analysis.com/casestudies/#ch18b-forecasting-a-house-price-index)
13 |
14 | ## Learning outcomes
15 | After successfully completing [`short_term_priceindex.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture24-short-term-time-series/short_term_priceindex.ipynb), students should be able to:
16 |
17 | - Decide if a conversion of data to stationarity is needed
18 | - ARIMA models
19 | - self specified lags for AR, I, and MA components
20 | - auto select the lags
21 | - handling trend and seasonality within ARIMA
22 | - understand 'S' from SARIMA and why we do not use it in this course
23 | - Cross-validation with ARIMA models
24 | - Vector AutoRegressive models (VAR)
25 | - estimation and cross-validation
26 | - Forecasting
27 | - comparing models based on forecast performance
28 | - external validity check on a longer horizon
29 | - Fan charts for assessing risks
30 |
31 | ## Lecture Time
32 |
33 | Ideal overall time: **50-80 mins**.
34 |
35 |
36 | ## Further material
37 |
38 | - This lecture is a modified version of [`ch18-ts-pred-homeprices.ipynb`](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch18-case-shiller-la/ch18-ts-pred-homeprices.ipynb) from [Gabor's case study repository](https://github.com/gabors-data-analysis/da_case_studies).
39 |
40 |
--------------------------------------------------------------------------------
/lecture25-matplotlib-vs-plotnine/README.md:
--------------------------------------------------------------------------------
1 | # Lecture 25: Matplotlib vs Plotnine on the GDP and Life Expectancy data
2 |
3 | ## Motivation
4 |
5 | People tend to live longer in richer countries. How long people live is usually measured by life expectancy; how rich a country is is usually captured by its yearly income, measured by GDP. But should we use total GDP or GDP per capita? And what’s the shape of the patterns of association? Is the same percent difference in income related to the same difference in how long people live among richer countries and poorer countries? Finding the shape of the association helps benchmark life expectancy among countries with similar levels of income and identify countries where people tend to live especially long or especially short lives for their income.
6 |
7 | The lecture illustrates the choice between total and per capita measures (here GDP), regressions with variables in logs, and two ways to model nonlinear patterns in the framework of the linear regression: piecewise linear splines, and polynomials. It also illustrates whether and how to use weights in regression analysis, and what that choice implies for the correct interpretation of the results. The lecture also shows how to use informative visualization to present the results of regressions.
8 |
9 |
10 | ## This lecture
11 |
12 | This lecture covers the same graphs in two separate notebooks: [`life_expectancy_gdp_plotnine.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture25-matplotlib-vs-plotnine/life_expectancy_gdp_plotnine.ipynb) and [`life_expectancy_gdp_matplotlib.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture25-matplotlib-vs-plotnine/life_expectancy_gdp_matplotlib.ipynb). Our goal was to show that exactly the same graphs can be created with `matplotlib` (and with its high-level interface, `seaborn`) and with `plotnine`.
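
As a small taste of the comparison (on simulated data, not the actual case-study file), the same log-scale scatter can be drawn with both toolkits roughly like this:

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import aes, geom_point, ggplot, labs, scale_x_log10

# toy data standing in for the life-expectancy / GDP per capita table
rng = np.random.default_rng(5)
df = pd.DataFrame({"gdppc": np.exp(rng.normal(9, 1, 150))})
df["lifeexp"] = 55 + 5 * np.log(df["gdppc"] / 1000) + rng.normal(0, 2, 150)

# matplotlib / seaborn version
ax = sns.scatterplot(data=df, x="gdppc", y="lifeexp")
ax.set_xscale("log")
ax.set(xlabel="GDP per capita (log scale)", ylabel="Life expectancy")
plt.show()

# plotnine version of the same figure
print(
    ggplot(df, aes("gdppc", "lifeexp"))
    + geom_point()
    + scale_x_log10()
    + labs(x="GDP per capita (log scale)", y="Life expectancy")
)
```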
13 |
14 | Case study:
15 | - [CH08B How is life expectancy related to the average income of a country?](https://gabors-data-analysis.com/casestudies/#ch08b-how-is-life-expectancy-related-to-the-average-income-of-a-country)
16 |
17 | ## Learning outcomes
18 | After successfully completing [`life_expectancy_gdp_plotnine.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture25-matplotlib-vs-plotnine/life_expectancy_gdp_plotnine.ipynb) and/or [`life_expectancy_gdp_matplotlib.ipynb`](https://github.com/gabors-data-analysis/da-coding-python/blob/main/lecture25-matplotlib-vs-plotnine/life_expectancy_gdp_matplotlib.ipynb) students should be able to:
19 |
20 | - Estimate simple
21 | - level-level regression
22 | - log-level regression
23 | - quadratic regression
24 | - spline regression
25 | - Visualise regression line on a scatter plot
26 | - set axis ticks to percent format
27 | - scale axis to log
28 | - use weights for point sizes
29 |
30 |
31 | ## Lecture Time
32 |
33 | Ideal overall time: **30-60 mins** depending on whether you go through only one, or both notebooks.
34 |
35 |
36 | ## Further material
37 |
38 | - This lecture is a modified version of [`ch08-life-expectancy-income.ipynb`](https://github.com/gabors-data-analysis/da_case_studies/blob/master/ch08-life-expectancy-income/ch08-life-expectancy-income.ipynb) from [Gabor's case study repository](https://github.com/gabors-data-analysis/da_case_studies).
39 | - Tutorial to the `seaborn` library can be found [here](https://seaborn.pydata.org/tutorial.html).
--------------------------------------------------------------------------------
/lecture25-matplotlib-vs-plotnine/helper_functions.py:
--------------------------------------------------------------------------------
1 | import copy
2 |
3 | import numpy as np
4 | import pandas as pd
5 |
6 | def knot_ceil(vector: np.array, knot: float) -> np.array:
7 | vector_copy = copy.deepcopy(vector)
8 | vector_copy[vector_copy > knot] = knot
9 | return vector_copy
10 |
11 |
12 | def lspline(series: pd.Series, knots: list) -> np.array:
13 | """
14 |     Function to create design matrix to estimate a piecewise
15 | linear spline regression.
16 |
17 | Parameters
18 | ----------
19 | series : pd.Series
20 | Your variable in a pandas Series.
21 | knots : List[float]
22 | The knots, that result in n + 1 line segments.
23 | """
24 |
25 |     if not isinstance(knots, list):
26 | knots = [knots]
27 | design_matrix = None
28 | vector = series.values
29 |
30 | for i in range(len(knots)):
31 | # print(i)
32 | # print(vector)
33 | if i == 0:
34 | column = knot_ceil(vector, knots[i])
35 | else:
36 | column = knot_ceil(vector, knots[i] - knots[i - 1])
37 | # print(column)
38 | if i == 0:
39 | design_matrix = column
40 | else:
41 | design_matrix = np.column_stack((design_matrix, column))
42 | # print(design_matrix)
43 | vector = vector - column
44 | design_matrix = np.column_stack((design_matrix, vector))
45 | # print(design_matrix)
46 | return design_matrix
47 |
48 |
49 | def poly(x: np.array, degree=1) -> pd.DataFrame:
50 | """
51 | Fit polynomial.
52 |
53 | These are non orthogonal factors, but it may not matter if
54 | we only need this for predictions (without interpreting the
55 | coefficients) or visualisation.
56 |
57 | Parameters
58 | ----------
59 | x : npt.ArrayLike
60 | Data array.
61 | degree : int, default=1
62 | Degree of the polynomial.
63 | """
64 | d = {}
65 | for i in range(degree + 1):
66 | if i == 1:
67 | d["x"] = x
68 | else:
69 | d[f"x**{i}"] = np.power(x, i)
70 | return pd.DataFrame(d)
71 |
72 |
73 | def add_margin(ax, x=0.05, y=0.05):
74 | """
75 | This will, by default, add 5% to the x and y margins. You
76 | can customise this using the x and y arguments when you call it.
77 | """
78 |
79 | xlim = ax.get_xlim()
80 | ylim = ax.get_ylim()
81 |
82 | xmargin = (xlim[1] - xlim[0]) * x
83 | ymargin = (ylim[1] - ylim[0]) * y
84 |
85 | ax.set_xlim(xlim[0] - xmargin, xlim[1] + xmargin)
86 | ax.set_ylim(ylim[0] - ymargin, ylim[1] + ymargin)
87 |
--------------------------------------------------------------------------------