├── .gitignore ├── LICENSE ├── Pipfile ├── Pipfile.lock ├── README.md ├── ch00-tech-prep ├── README.md ├── __init__.py ├── ch00_install_libraries.R ├── ch00_install_libraries.do ├── da_helper_functions.R ├── py_helper_functions.py ├── requirements.txt ├── requirements_pipenv.txt └── theme_bg.R ├── ch01-billion-prices-collect └── This is empty on purpose.txt ├── ch01-hotels-data-collect ├── ch01-hotels-data-collect.R ├── ch01-hotels-data-collect.do └── ch01-hotels-data-collect.ipynb ├── ch01-management-data-collect └── This is empty on purpose.txt ├── ch02-football-manager-success ├── ch02-football-manager-success.R ├── ch02-football-manager-success.do └── ch02-football-manager-success.ipynb ├── ch02-hotels-data-prep ├── ch02-hotels-data-prep.R ├── ch02-hotels-data-prep.do └── ch02-hotels-data-prep.ipynb ├── ch02-immunization-crosscountry ├── ch02-immunization-crosscountry.R ├── ch02-immunization-crosscountry.do └── ch02-immunization-crosscountry.ipynb ├── ch03-city-size-japan ├── ch03-city-size-Japan.R ├── ch03-city-size-Japan.do └── ch03-city-size-Japan.ipynb ├── ch03-distributions-height-income ├── ch03-height-income.R ├── ch03-height-income.do ├── ch03-height-income.ipynb └── stata library to edit.txt ├── ch03-football-home-advantage ├── ch03-football-home-advantage-describe.R ├── ch03-football-home-advantage-describe.do └── ch03-football-home-advantage-describe.ipynb ├── ch03-hotels-europe-compare ├── README ├── ch03-hotels-europe-compare.R ├── ch03-hotels-europe-compare.do └── ch03-hotels-europe-compare.ipynb ├── ch03-hotels-vienna-explore ├── README ├── ch03-hotels-vienna-explore.R ├── ch03-hotels-vienna-explore.do └── ch03-hotels-vienna-explore.ipynb ├── ch03-simulations ├── ch03-distributions.R ├── ch03-distributions.do └── ch03-distributions.ipynb ├── ch04-management-firm-size ├── ch04-wms-management-size-boxplot-violinplot.R ├── ch04-wms-management-size-boxplot-violinplot.ipynb ├── ch04-wms-management-size.R ├── ch04-wms-management-size.do └── 
ch04-wms-management-size.ipynb ├── ch05-stock-market-loss-generalize ├── ch05-stock-market-loss-generalize.R ├── ch05-stock-market-loss-generalize.do └── ch05-stock-market-loss-generalize.ipynb ├── ch06-online-offline-price-test ├── ch06-online-offline-price-test.R ├── ch06-online-offline-price-test.do └── ch06-online-offline-price-test.ipynb ├── ch06-stock-market-loss-test ├── ch06-stock-market-loss-test.R ├── ch06-stock-market-loss-test.do └── ch06-stock-market-loss-test.ipynb ├── ch07-hotels-simple-reg ├── ch07-hotels-simple-reg.R ├── ch07-hotels-simple-reg.do └── ch07-hotels-simple-reg.ipynb ├── ch07-ols-simulation ├── ch07-ols-simulation.R ├── ch07-ols-simulation.do └── ch07-ols-simulation.ipynb ├── ch08-hotels-measurement-error ├── README ├── ch08-hotels-measeerror.R ├── ch08-hotels-measeerror.do └── ch08-hotels-measeerror.ipynb ├── ch08-hotels-nonlinear ├── ch08-hotels-nonlinear-reg.R ├── ch08-hotels-nonlinear-reg.do └── ch08-hotels-nonlinear-reg.ipynb ├── ch08-life-expectancy-income ├── ch08-life-expectancy-income.R ├── ch08-life-expectancy-income.do └── ch08-life-expectancy-income.ipynb ├── ch09-gender-age-earnings ├── ch09-earnings-inference.R ├── ch09-earnings-inference.do └── ch09-earnings-inference.ipynb ├── ch09-hotels-europe-stability ├── ch09-hotels-externalvalid.R ├── ch09-hotels-externalvalid.do └── ch09-hotels-externalvalid.ipynb ├── ch10-gender-earnings-understand ├── ch10-gender-earnings-multireg.R ├── ch10-gender-earnings-multireg.do └── ch10-gender-earnings-multireg.ipynb ├── ch10-hotels-multiple-reg ├── ch10-hotels-multiple-reg.R ├── ch10-hotels-multiple-reg.do └── ch10-hotels-multiple-reg.ipynb ├── ch11-australia-rainfall-predict ├── ch11-australia-rainfall-predict.R ├── ch11-australia-rainfall-predict.do └── ch11-australia-rainfall-predict.ipynb ├── ch11-smoking-health-risk ├── ch11-smoking-health-risk-01-munging.R ├── ch11-smoking-health-risk-02-analysis.R ├── ch11-smoking-health-risk.R ├── ch11-smoking-health-risk.do └── 
ch11-smoking-health-risk.ipynb ├── ch12-electricity-temperature ├── ch12-arizona-electricity.R ├── ch12-arizona-electricity.do └── ch12-arizona-electricity.ipynb ├── ch12-stock-returns-risk ├── ch12-stock-returns-risk.R ├── ch12-stock-returns-risk.do └── ch12-stock-returns-risk.ipynb ├── ch12-time-series-simulations ├── ch12-randomwalk-serialcorr-simul.ipynb ├── ch12-randomwalk-serialcorr-simull.R ├── ch12-randomwalk-simul.do └── ch12-serialcorr-simul.do ├── ch13-used-cars-reg ├── ch13-used-cars.R ├── ch13-used-cars.do └── ch13-used-cars.ipynb ├── ch14-airbnb-reg ├── Ch14-airbnb-prepare.R ├── README.md ├── ch14-airbnb-prediction.R ├── ch14-airbnb-prediction.do ├── ch14-airbnb-prediction.ipynb ├── ch14-airbnb-prepare.do └── ch14-airbnb-prepare.ipynb ├── ch14-used-cars-log ├── ch14-used-cars-log.R ├── ch14-used-cars-log.do └── ch14-used-cars-log.ipynb ├── ch15-used-cars-cart ├── ch15-used-cars-cart.R └── ch15-used-cars-cart.ipynb ├── ch16-airbnb-random-forest ├── README.md ├── ch16-airbnb-prepare-london.R ├── ch16-airbnb-prepare-london.ipynb ├── ch16-airbnb-random-forest-shap.R ├── ch16-airbnb-random-forest-shap.ipynb ├── ch16-airbnb-random-forest.R └── ch16-airbnb-random-forest.ipynb ├── ch17-predicting-firm-exit ├── ch17-firm-exit-data-prep.R ├── ch17-firm-exit-data-prep.ipynb ├── ch17-predicting-firm-exit.R └── ch17-predicting-firm-exit.ipynb ├── ch18-case-shiller-la ├── ch18-pred-homeprices.do ├── ch18-ts-pred-homeprices.R └── ch18-ts-pred-homeprices.ipynb ├── ch18-swimmingpool ├── ch18-swimmingpool-predict.R ├── ch18-swimmingpool-predict.do └── ch18-swimmingpool-predict.ipynb ├── ch19-food-health ├── ch19-food-health.R ├── ch19-food-health.do ├── ch19-food-health.ipynb ├── ch19_food-health-maker.R ├── ch19_food-health-maker.do └── ch19_food-health-maker.ipynb ├── ch20-ab-test-social-media ├── ch20-ab-test-powercalc-pvalues.R ├── ch20-ab-test-powercalc-pvalues.do └── ch20-ab-test-powercalc-pvalues.ipynb ├── ch20-working-from-home ├── README_wfh_datawork.txt ├── 
background │ └── working-from-home-QJE.pdf ├── ch20-wfh.R ├── ch20-wfh.do └── ch20-wfh.ipynb ├── ch21-ownership-management-quality ├── background │ ├── Management-Practices-Across-Firms-and-Countries-Bloom-Genakos-Sadun-and-Van-Reenen.pdf │ ├── SIC-2dig-manuf-labels.txt │ └── the-ties-that-bind-lemos-and-scur.pdf ├── ch21-wms-01-dataprep.R ├── ch21-wms-01-dataprep.ipynb ├── ch21-wms-02-analysis.R ├── ch21-wms-02-analysis.ipynb └── ch21-wms.do ├── ch22-airline-merger-prices ├── ch22-airlines-01-dataprep.R ├── ch22-airlines-01-dataprep.ipynb ├── ch22-airlines-02-analysis.R ├── ch22-airlines-02-analysis.ipynb └── ch22-airlines.do ├── ch23-immunization-life ├── ch23-immunization-life.R ├── ch23-immunization-life.do └── ch23-immunization-life.ipynb ├── ch23-import-demand-and-production ├── ch23-asia-ip-imports.R ├── ch23-asia-ip-imports.do └── ch23-asia-ip-imports.ipynb ├── ch24-football-manager-replace ├── ch24-football-manager-replace.R ├── ch24-football-manager-replace.do └── ch24-football-manager-replace.ipynb ├── ch24-haiti-earthquake-gdp ├── README.md ├── haiti-earthquake-gdp.R ├── haiti-earthquake-gdp.do ├── haiti-earthquake-gdp.ipynb └── temp │ ├── gdp-1-temp.dta │ └── gdp-1.dta ├── da_illustrations ├── da_illustration_plots.R └── da_illustration_plots.ipynb ├── renv.lock ├── set-data-directory.R └── set-data-directory.do /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .Rprofile 4 | .RData 5 | .Ruserdata 6 | .Renviron 7 | */.DS_Store 8 | *.Rproj 9 | */output/ 10 | */temp/ 11 | *.csv 12 | *.dta 13 | *.rds 14 | *.pkl 15 | __pycache__/ 16 | .ipynb_checkpoints/ 17 | .ipynb_checkpoints 18 | set-data-directory.do 19 | set-data-directory.R 20 | renv/ 21 | .DS_Store 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Gabors 
Data Analysis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | arch = "==5.3.1" 8 | black = "==22.8.0" 9 | jupyter = "==1.0.0" 10 | jupyter-contrib-nbextensions = "==0.5.0" 11 | linearmodels = "==4.27" 12 | matplotlib = "==3.5.1" 13 | mizani = "==0.8.1" 14 | numpy = "==1.23.3" 15 | openpyxl = "==3.0.10" 16 | patchworklib = "==0.4.7" 17 | pandas = "==1.5.0" 18 | pandas-market-calendars = "==4.0" 19 | plotnine = "==0.9" 20 | pmdarima = "==2.0.1" 21 | prophet = "*" 22 | pydotplus = "==2.0.2" 23 | pystan = "==2.19.1.1" 24 | regex = "==2022.9.13" 25 | scikit-learn = "==1.1.2" 26 | scikit-misc = "==0.1.4" 27 | scipy = "==1.9.1" 28 | shap = "==0.41.0" 29 | stargazer = "==0.0.5" 30 | statsmodels = "==0.13.2" 31 | syntheticcontrolmethods = "==1.1.17" 32 | seaborn = "*" 33 | 34 | [dev-packages] 35 | 36 | [requires] 37 | python_version = "3.8" 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Analysis Case Study codebase for R, Python and Stata 2 | 3 | **R, Python and Stata code for** 4 | **Data Analysis for Business, Economics, and Policy** 5 | by Gábor Békés (CEU) and Gábor Kézdi (U. Michigan) 6 | Published on 6 May 2021 by Cambridge University Press 7 | [**gabors-data-analysis.com**](https://gabors-data-analysis.com/) 8 | 9 | 10 | ## How to use 11 | On the [textbook's website]((https://gabors-data-analysis.com/)), we have detailed discussion of how to set up libraries, get data and code: [Overview of data and code](https://gabors-data-analysis.com/data-and-code/) 12 | 13 | To see options for various languages, check out: 14 | 1. **R** -- [How to run code in R ](https://gabors-data-analysis.com/howto-r/) 15 | 2. 
**Stata** -- [How to run code in Stata ](https://gabors-data-analysis.com/howto-stata/) 16 | 3. **Python** -- [How to run code in Python ](https://gabors-data-analysis.com/howto-python/) 17 | 18 | 19 | ## Status (25 November, 2022) 20 | 21 | The [Latest release, 0.8.3 "Ethics Gradient"](https://github.com/gabors-data-analysis/da_case_studies/releases/tag/v0.8.3) was released on 25 November. 22 | 23 | In the latest release we did some refactoring re Python and R codes. We continuously monitor bugs and do regular, if mostly minor updates. 24 | 25 | ## Organization 26 | 1. Each case study has a separate folder. 27 | 2. Within case study folders, codes in different languages are simply stored together. 28 | 3. Data should be downloaded and stored in a separate folder. 29 | 30 | ## Code language versions 31 | 1. **R** -- We used R 4.0.2. 32 | 2. **Stata** -- We used version 15, allmost all code should work in version 13 up. 33 | 3. **Python** -- We used Python 3.8.0. 34 | 35 | ## Get data 36 | Data is hosted on OSF.io 37 | 38 | [Get data by datasets](https://osf.io/7epdj/) 39 | 40 | 41 | 42 | ## Found an error or have a suggestion? 43 | Awesome, we know there are errors and bugs. Or just much better ways to do a procedure. 44 | 45 | To make a suggestion, please open a `github issue` here with a title containing the case study name. You may also contact [us directctly](https://gabors-data-analysis.com/contact-us/). Cheers! 46 | -------------------------------------------------------------------------------- /ch00-tech-prep/README.md: -------------------------------------------------------------------------------- 1 | # Tech README 2 | 3 | This includes technical files for R, Stata and Python codes used in case studies. 
4 | 5 | ## Python 6 | We have both an environment file and the requirements to help users with various technical settings 7 | 8 | 9 | ## R: ch00_install_libraries.R 10 | * This file was written to install libraries Using R 4.0.2, version 0.8 2020-09-07 11 | * We suggest using renv instead 12 | * kept it for continuity 13 | 14 | ## Stata: ch00_install_libraries 15 | You just need to run this once. 16 | -------------------------------------------------------------------------------- /ch00-tech-prep/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gabors-data-analysis/da_case_studies/08e55b021c295653e1b546faeeb4550926c9a7d4/ch00-tech-prep/__init__.py -------------------------------------------------------------------------------- /ch00-tech-prep/ch00_install_libraries.R: -------------------------------------------------------------------------------- 1 | ######################################################################################### 2 | # Prepared for Gabor's Data Analysis 3 | # 4 | # Data Analysis for Business, Economics, and Policy 5 | # by Gabor Bekes and Gabor Kezdi 6 | # Cambridge University Press 2021 7 | # 8 | # gabors-data-analysis.com 9 | # 10 | # License: Free to share, modify and use for educational purposes. 11 | # Not to be used for commercial purposes. 
12 | 13 | # This file was written to install libraries 14 | # Using R 4.0.2, version 0.8 2020-09-07 15 | # We suggest using renv instead 16 | # kept it for continuity 17 | 18 | ######################################################################################### 19 | 20 | 21 | 22 | # basix 23 | install.packages("tidyverse") 24 | install.packages("pacman") 25 | library(tidyverse) 26 | library(pacman) 27 | 28 | #ch00 29 | pacman::p_load(urca, sandwich, stargazer,stringr) 30 | pacman::p_load(scales, data.table, knitr ) 31 | pacman::p_load(devtools, remotes) 32 | 33 | #Part I 34 | pacman::p_load(lspline, cowplot, arm, pastecs, DataCombine, janitor) 35 | pacman::p_load(haven, Hmisc, xtable, binsreg, estimatr, modelsummary, pscl) 36 | # make sure it's estimatr 0.26.0 37 | 38 | # need this till cran update 39 | library(remotes) 40 | install_github('vincentarelbundock/modelsummary') 41 | 42 | 43 | # Part II 44 | pacman::p_load(viridis, grid, gridExtra, dyn, estimatr, huxtable, segmented, rms, fixest) 45 | pacman::p_load(mfx, margins, psych) 46 | 47 | # temp fix 48 | install.packages(‘remotes’) 49 | remotes::install_github(‘lrberge/fixest’) 50 | 51 | #Part III 52 | pacman::p_load(lmtest, caret, glmnet, skimr, directlabels, 53 | prophet, timeDate, fpp3, aTSA, plotly) 54 | pacman::p_load(ggthemes, rpart, rpart.plot, rattle, pdp, ranger, partykit, e1071) 55 | pacman::p_load(glue, vctrs) # needed for FredR 56 | devtools::install_github("sboysel/fredr") 57 | 58 | 59 | # Part IV 60 | pacman::p_load(MatchIt, Matching, gmodels) 61 | pacman::p_load(reshape, car, plm, readstata13, Synth) 62 | -------------------------------------------------------------------------------- /ch00-tech-prep/ch00_install_libraries.do: -------------------------------------------------------------------------------- 1 | ********************************************** 2 | * Data Analysis textbook 3 | ********************************************** 4 | 5 | 6 | * before running stata code, do this 7 | 8 | * 
user written 9 | ssc install egenmore, replace 10 | ssc install unique, replace 11 | ssc install regsave, replace 12 | ssc install matchit, replace 13 | ssc install binscatter, replace 14 | ssc install vioplot, replace 15 | ssc install estout, replace 16 | *net install tabmiss, replace 17 | ssc install texsave, replace 18 | ssc install outreg2, replace 19 | ssc install listtex, replace 20 | ssc install listtab, replace 21 | 22 | ssc install elasticregress, replace 23 | ssc install lassopack, replace 24 | ssc install Rsource, replace 25 | ssc install ftools, replace 26 | ssc install matmap, replace 27 | ssc install crossfold, replace 28 | 29 | * color issues 30 | ssc install palettes, replace 31 | ssc install colrspace, replace 32 | net install scheme_virdis, from(https://raw.github.com/vikjam/stata-scheme-virdis/master/) replace 33 | 34 | colorpalette, vertical n(20): viridis 35 | 36 | * stata 37 | ssc install wbopendata 38 | * datahelpdesk.worldbank.org/knowledgebase/articles/889464-wbopendata-stata-module-to-access-world-bank-data 39 | 40 | *fred 41 | * https://blog.stata.com/2017/08/08/importing-data-with-import-fred/ 42 | *key is a valid API key, which is provided by the St. Louis Federal Reserve and may be obtained from 43 | * https://research.stlouisfed.org/docs/api/api key.html. 
44 | 45 | *set fredkey key, permanently 46 | 47 | * faster tools, collapse 48 | * https://github.com/mcaceresb/stata-gtools 49 | local github "https://raw.githubusercontent.com" 50 | net install gtools, from(`github'/mcaceresb/stata-gtools/master/build/) 51 | 52 | * settings 53 | set matsize 2000 54 | 55 | ************************************- 56 | * not yet used 57 | ************************************- 58 | 59 | 60 | * trimming 61 | *net install st0313.pkg, from(http://www.stata-journal.com/software/sj13-3/) 62 | *net get st0313.pkg, from(http://www.stata-journal.com/software/sj13-3/) 63 | 64 | 65 | 66 | * missing ibs 67 | *net install dm91.pkg, from(http://www.stata.com/stb/stb61/) 68 | *net installl spost9_ado.pkg, from( http://www.indiana.edu/~jslsoc/stata/) 69 | -------------------------------------------------------------------------------- /ch00-tech-prep/requirements.txt: -------------------------------------------------------------------------------- 1 | patsy ==0.5.1 2 | matplotlib ==3.4.2 3 | pandas ==1.2.1 4 | plotnine ==0.7.1 5 | statsmodels ==0.12.2 6 | mizani ==0.7.1 7 | seaborn ==0.11.0 8 | scipy ==1.6.2 9 | linearmodels ==4.23 10 | ipython ==7.31.1 11 | scikit-learn ==0.24.2 12 | notebook == 6.4.1 13 | jupyter-nbextensions-configurator ==0.4.1 14 | stargazer ==0.0.5 15 | regex ==2021.4.4 16 | pmdarima ==1.8.2 17 | pydotplus ==2.0.2 18 | scikit-misc ==0.1.3 19 | black ==20.8b1 20 | skranger ==0.4.1 21 | arch ==4.19 22 | pandas-market-calendars ==2.0 23 | prophet ==1.0.1 24 | ipywidgets ==7.6.3 25 | xlrd ==1.2.0 -------------------------------------------------------------------------------- /ch00-tech-prep/requirements_pipenv.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.3 2 | arch==5.3.1 3 | argon2-cffi==21.3.0 4 | argon2-cffi-bindings==21.2.0 5 | astor==0.8.1 6 | asttokens==2.0.8 7 | attrs==22.1.0 8 | backcall==0.2.0 9 | backports.zoneinfo==0.2.1 10 | beautifulsoup4==4.11.1 11 | 
black==22.8.0 12 | bleach==5.0.1 13 | cffi==1.15.1 14 | click==8.1.3 15 | cmdstanpy==0.9.68 16 | contourpy==1.0.5 17 | convertdate==2.4.0 18 | cycler==0.11.0 19 | Cython==0.29.32 20 | debugpy==1.6.3 21 | decorator==5.1.1 22 | defusedxml==0.7.1 23 | entrypoints==0.4 24 | ephem==4.1.3 25 | exchange-calendars==4.2.3 26 | executing==1.1.0 27 | fastjsonschema==2.16.2 28 | fonttools==4.37.4 29 | formulaic==0.3.4 30 | hijri-converter==2.2.4 31 | holidays==0.16 32 | importlib-metadata==5.0.0 33 | importlib-resources==5.9.0 34 | interface-meta==1.3.0 35 | ipykernel==6.16.0 36 | ipython==8.5.0 37 | ipython-genutils==0.2.0 38 | ipywidgets==8.0.2 39 | jedi==0.18.1 40 | Jinja2==3.1.2 41 | joblib==1.2.0 42 | jsonschema==4.16.0 43 | jupyter==1.0.0 44 | jupyter-console==6.4.4 45 | jupyter-core==4.11.2 46 | jupyter_client==7.3.5 47 | jupyterlab-pygments==0.2.2 48 | jupyterlab-widgets==3.0.3 49 | kiwisolver==1.4.4 50 | korean-lunar-calendar==0.3.1 51 | linearmodels==4.27 52 | LunarCalendar==0.0.9 53 | lxml==4.9.1 54 | MarkupSafe==2.1.1 55 | matplotlib==3.6.0 56 | matplotlib-inline==0.1.6 57 | mistune==2.0.4 58 | mizani==0.8.1 59 | mypy-extensions==0.4.3 60 | nbclient==0.6.8 61 | nbconvert==7.0.0 62 | nbformat==5.6.1 63 | nest-asyncio==1.5.6 64 | notebook==6.4.12 65 | numpy==1.23.3 66 | packaging==21.3 67 | palettable==3.3.0 68 | pandas==1.5.0 69 | pandas-market-calendars==4.0 70 | pandocfilters==1.5.0 71 | parso==0.8.3 72 | pathspec==0.10.1 73 | patsy==0.5.2 74 | pexpect==4.8.0 75 | pickleshare==0.7.5 76 | Pillow==9.3.0 77 | pkgutil_resolve_name==1.3.10 78 | platformdirs==2.5.2 79 | plotnine==0.10.1 80 | pmdarima==2.0.1 81 | prometheus-client==0.14.1 82 | prompt-toolkit==3.0.31 83 | property-cached==1.6.4 84 | prophet==1.0 85 | psutil==5.9.2 86 | ptyprocess==0.7.0 87 | pure-eval==0.2.2 88 | pycparser==2.21 89 | pydotplus==2.0.2 90 | Pygments==2.13.0 91 | pyhdfe==0.1.0 92 | pyluach==2.0.1 93 | PyMeeus==0.5.11 94 | pyparsing==3.0.9 95 | pyrsistent==0.18.1 96 | pystan==2.19.1.1 97 | 
python-dateutil==2.8.2 98 | pytz==2022.4 99 | pyzmq==24.0.1 100 | qtconsole==5.3.2 101 | QtPy==2.2.0 102 | regex==2022.9.13 103 | scikit-learn==1.1.2 104 | scikit-misc==0.1.4 105 | scipy==1.9.1 106 | Send2Trash==1.8.0 107 | setuptools-git==1.2 108 | setuptools-scm==6.4.2 109 | six==1.16.0 110 | sklearn==0.0 111 | soupsieve==2.3.2.post1 112 | stack-data==0.5.1 113 | stargazer==0.0.5 114 | statsmodels==0.13.2 115 | terminado==0.16.0 116 | threadpoolctl==3.1.0 117 | tinycss2==1.1.1 118 | tomli==2.0.1 119 | toolz==0.12.0 120 | tornado==6.2 121 | tqdm==4.64.1 122 | traitlets==5.4.0 123 | typing_extensions==4.3.0 124 | ujson==5.5.0 125 | urllib3==1.26.12 126 | wcwidth==0.2.5 127 | webencodings==0.5.1 128 | widgetsnbextension==4.0.3 129 | wrapt==1.14.1 130 | zipp==3.8.1 -------------------------------------------------------------------------------- /ch01-billion-prices-collect/This is empty on purpose.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gabors-data-analysis/da_case_studies/08e55b021c295653e1b546faeeb4550926c9a7d4/ch01-billion-prices-collect/This is empty on purpose.txt -------------------------------------------------------------------------------- /ch01-hotels-data-collect/ch01-hotels-data-collect.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | # Prepared for the textbook: 3 | # Data Analysis for Business, Economics, and Policy 4 | # by Gabor BEKES and Gabor KEZDI 5 | # Cambridge University Press 2021 6 | # 7 | # License: Free to share, modify and use for educational purposes. Not to be used for business purposes. 
8 | # 9 | ###############################################################################################x 10 | 11 | # CHAPTER 02 12 | # CH02B Identifying successful football managers 13 | 14 | # football dataset 15 | # version 0.9 2020-08-28 16 | 17 | ########### 18 | 19 | # CLEAR MEMORY 20 | rm(list=ls()) 21 | 22 | # install.packages("tidyverse") 23 | library(tidyverse) 24 | #---------------------------------------------------------------------------------------------------- 25 | 26 | 27 | # set working directory 28 | # option A: open material as project 29 | # option B: set working directory for da_case_studies 30 | # example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/") 31 | 32 | # set data dir, load theme and functions 33 | 34 | source("ch00-tech-prep/theme_bg.R") 35 | source("ch00-tech-prep/da_helper_functions.R") 36 | # If source code does not run, install the following packages: 37 | # install.packages("urca") 38 | # install.packages("stargazer") 39 | 40 | # data used 41 | source("set-data-directory.R") #data_dir must be first defined # 42 | data_in <- paste(data_dir,"hotels-vienna","clean/", sep = "/") 43 | 44 | use_case_dir <- "ch01-hotels-data-collect/" 45 | data_out <- use_case_dir 46 | output <- paste0(use_case_dir,"output/") 47 | create_output_if_doesnt_exist(output) 48 | 49 | 50 | 51 | # load in clean and tidy data and create workfile 52 | df <- read.csv(paste0(data_in,"hotels-vienna.csv")) 53 | # or from the website 54 | # df <- read_csv("https://osf.io/y6jvb/download") 55 | 56 | ############################################ 57 | # First look 58 | ############################################ 59 | df <- df%>% 60 | select(hotel_id, accommodation_type, country, city, city_actual, neighbourhood, center1label, distance, 61 | center2label, distance_alter, stars, rating, rating_count, ratingta, ratingta_count, year, month, 62 | weekend, holiday, nnights, price, scarce_room, offer, offer_cat) 63 | 64 | summary(df) 65 | glimpse(df) 66 | 67 
| # export list 68 | df <- subset(df, select = c(hotel_id, accommodation_type, country, city, city_actual, center1label, distance, stars, rating, price)) 69 | write.csv(df[1:5,], paste0(output, "hotel_listobs.csv"), row.names = F) 70 | 71 | -------------------------------------------------------------------------------- /ch01-hotels-data-collect/ch01-hotels-data-collect.do: -------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 01 14 | * CH01A Finding a good deal among hotels: data collection 15 | * using the hotels-vienna dataset 16 | * version 0.9 2020-09-06 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 
23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | * STEP 2: * Directory for data 27 | * Option 1: run directory-setting do file 28 | do set-data-directory.do 29 | /* this is a one-line do file that should sit in 30 | the working directory you have just set up 31 | this do file has a global definition of your working directory 32 | more details: gabors-data-analysis.com/howto-stata/ */ 33 | 34 | * Option 2: set directory directly here 35 | * for example: 36 | * global data_dir "C:/Users/xy/gabors_data_analysis/da_data_repo" 37 | 38 | global data_in "$data_dir/hotels-vienna/clean" 39 | global work "ch01-hotels-data-collect" 40 | 41 | cap mkdir "$work/output" 42 | global output "$work/output" 43 | 44 | 45 | * load in clean and tidy data and create workfile 46 | use "$data_in/hotels-vienna.dta", clear 47 | 48 | * Or download directly from OSF: 49 | 50 | /* 51 | copy "https://osf.io/download/dn8je/" "workfile.dta" 52 | use "workfile.dta", clear 53 | erase "workfile.dta" 54 | */ 55 | 56 | order hotel_id accommodation_type country city city_actual neighbourhood center1label distance center2label distance_alter stars rating rating_count ratingta ratingta_count year month weekend holiday nnights price scarce_room offer offer_cat 57 | sum 58 | 59 | * export list 60 | export excel hotel_id accommodation_type country city city_actual center1label distance stars rating price using "$output\hotel_listobs.xls" in 1/5, firstrow(variables) replace 61 | 62 | -------------------------------------------------------------------------------- /ch01-management-data-collect/This is empty on purpose.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gabors-data-analysis/da_case_studies/08e55b021c295653e1b546faeeb4550926c9a7d4/ch01-management-data-collect/This is empty on purpose.txt -------------------------------------------------------------------------------- 
/ch02-football-manager-success/ch02-football-manager-success.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | # Prepared for the textbook: 3 | # Data Analysis for Business, Economics, and Policy 4 | # by Gabor BEKES and Gabor KEZDI 5 | # Cambridge University Press 2021 6 | # 7 | # License: Free to share, modify and use for educational purposes. Not to be used for business purposes. 8 | # 9 | ###############################################################################################x 10 | 11 | # CHAPTER 02 12 | # CH02B Identifying successful football managers 13 | 14 | # football dataset 15 | # version 0.9 2020-08-28 16 | 17 | ########### 18 | 19 | # CLEAR MEMORY 20 | rm(list=ls()) 21 | 22 | library(tidyverse) 23 | library(haven) 24 | library(lspline) 25 | library(grid) 26 | library(cowplot) 27 | #---------------------------------------------------------------------------------------------------- 28 | 29 | 30 | # set working directory 31 | # option A: open material as project 32 | # option B: set working directory for da_case_studies 33 | # example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/") 34 | 35 | # set data dir, load theme and functions 36 | source("ch00-tech-prep/theme_bg.R") 37 | source("ch00-tech-prep/da_helper_functions.R") 38 | 39 | # data used 40 | source("set-data-directory.R") #data_dir must be first defined # 41 | data_in <- paste(data_dir,"football","clean/", sep = "/") 42 | 43 | use_case_dir <- "ch02-football-manager-success/" 44 | data_out <- use_case_dir 45 | output <- paste0(use_case_dir,"output/") 46 | create_output_if_doesnt_exist(output) 47 | 48 | 49 | # look at basic data 50 | epl_games <- read_csv(paste0(data_in,"epl_games.csv")) 51 | # Import directly from web 52 | # epl_games <- read_csv("https://osf.io/bdjt5/download") 53 | 54 | 
#--------------------------------------------------------------------------------------------------------- 55 | 56 | epl_games <- epl_games %>% 57 | arrange(.,team_home) 58 | View(epl_games) 59 | 60 | epl_games <- epl_games %>% 61 | arrange(.,season, team_home) 62 | View(epl_games) 63 | 64 | epl_games <- epl_games %>% 65 | filter(season == 2016) 66 | View(epl_games) 67 | #--------------------------------------------------------------------------------------------------------- 68 | # look at data for team-game level 69 | # TODO: change to csv when we have it 70 | epl_teams_games <- read_csv(paste0(data_in, "epl-teams-games.csv")) 71 | # From the web 72 | # epl_teams_games <- read_csv("https://osf.io/bck6q/download") 73 | 74 | epl_teams_games <- epl_teams_games %>% 75 | arrange(.,team) 76 | View(epl_games) 77 | 78 | epl_teams_games <- epl_teams_games %>% 79 | arrange(.,season, team) 80 | 81 | epl_teams_games <- epl_teams_games %>% 82 | filter(season == 2016) %>% 83 | arrange(., date) 84 | View(epl_games) 85 | 86 | football_managers <- read_csv(paste0(data_in, "football_managers.csv")) 87 | # From the web 88 | # football_managers <- read_csv("https://osf.io/pcu6s/download") 89 | View(football_managers) 90 | Hmisc:: describe(football_managers$manager_id) 91 | 92 | 93 | #-------------------------------------------------------------------------------------------------------- 94 | 95 | # finally the merged file 96 | 97 | # read.csv accent problem, using read_csv 98 | football_managers_merged <- read_csv(paste0(data_in,"football_managers_workfile.csv")) 99 | # From the web 100 | # football_managers_merged <- read_csv("https://osf.io/t6dgh/download") 101 | 102 | football_managers_merged <- football_managers_merged %>% 103 | arrange(.,season, team) 104 | 105 | games <- football_managers_merged %>% 106 | group_by(team, manager_id, manager_name) %>% 107 | summarise(manager_games=n()) 108 | 109 | points <- football_managers_merged %>% 110 | group_by(team, manager_id, 
manager_name) %>% 111 | summarise(manager_points=sum(points)) 112 | 113 | avg_points <- merge(games, points, by = c('manager_id', 'team', 'manager_name')) %>% 114 | group_by(team, manager_id, manager_name) %>% 115 | mutate(manager_avg_points = (manager_points/manager_games)) %>% 116 | arrange(manager_avg_points) 117 | 118 | avg_points <- avg_points %>% 119 | arrange(-manager_avg_points) 120 | avg_points 121 | 122 | top_managers <- avg_points %>% 123 | filter(manager_avg_points >= 2) 124 | top_managers 125 | 126 | # denote caretakers 127 | top_managers <- top_managers %>% 128 | mutate(manager_avg_points0 = ifelse(manager_games < 18, manager_avg_points, NA), 129 | manager_avg_points1 = ifelse(manager_games > 18, manager_avg_points, NA)) 130 | 131 | # -------------------------------------------------------------------------------------------- 132 | # visualize 133 | 134 | # denote caretakers 135 | top_managers <- top_managers %>% 136 | mutate(fill= case_when (manager_games < 18 ~ "1", 137 | manager_games > 18 ~ "0" )) 138 | 139 | top_managers_graph <- top_managers %>% 140 | ggplot(., aes( x= reorder(manager_name, manager_avg_points), y = manager_avg_points, fill = fill, alpha = fill)) + 141 | geom_col(show.legend=F) + 142 | ylab("Average points per game") + 143 | xlab("Manager name") + 144 | scale_fill_manual(values = c(color[1], color[4])) + 145 | scale_alpha_manual(values =c(0.8,0.3)) + 146 | scale_y_continuous(expand = c(0.01,0.01), limits=c(0, 3), breaks=seq(0, 3, 0.3)) + 147 | coord_flip() + 148 | theme_bg() + 149 | cowplot::background_grid(major="x", minor="none") 150 | top_managers_graph 151 | #save_fig("03_top_managers_R", output, "small") 152 | save_fig("ch02-figures1-top-managers", output, "small") 153 | 154 | 155 | -------------------------------------------------------------------------------- /ch02-football-manager-success/ch02-football-manager-success.do: -------------------------------------------------------------------------------- 1 | 
******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 02 14 | * CH02B Identifying successful football managers 15 | * using the football dataset 16 | * version 0.9 2020-09-06 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | * STEP 2: * Directory for data 27 | * Option 1: run directory-setting do file 28 | do set-data-directory.do 29 | /* this is a one-line do file that should sit in 30 | the working directory you have just set up 31 | this do file has a global definition of your working directory 32 | more details: gabors-data-analysis.com/howto-stata/ */ 33 | 34 | * Option 2: set directory directly here 35 | * for example: 36 | * global data_dir "C:/Users/xy/gabors_data_analysis/da_data_repo" 37 | 38 | global data_in "$data_dir/football/clean" 39 | global work "ch02-football-manager-success" 40 | 41 | cap mkdir "$work/output" 42 | global output "$work/output" 43 | 44 | 45 | ********************************************************************* 46 | * look at entire clean data table on games 47 | use "$data_in/epl_games.dta",clear 48 | 49 | * Or download directly from OSF: 50 | /* 51 | copy "https://osf.io/download/tyjp8/" "workfile.dta" 52 | use "workfile.dta", clear 53 | erase "workfile.dta" 54 | */ 55 | 56 | sort team_home 57 | sort season team_home 58 | keep if season == 2016 59 | browse 60 | 61 | * look at data for team-game level 62 | use 
"$data_in/epl_teams_games.dta",clear 63 | 64 | * Or download directly from OSF: 65 | /* 66 | copy "https://osf.io/download/qzvx7/" "workfile.dta" 67 | use "workfile.dta", clear 68 | erase "workfile.dta" 69 | */ 70 | 71 | sort team 72 | sort season team 73 | keep if season == 2016 74 | sort date 75 | browse 76 | 77 | * look at data table on managers 78 | use "$data_in/football-managers.dta", clear 79 | 80 | * Or download directly from OSF: 81 | /* 82 | copy "https://osf.io/download/w6uph/" "workfile.dta" 83 | use "workfile.dta", clear 84 | erase "workfile.dta" 85 | */ 86 | 87 | browse 88 | 89 | * look at the clean merged file 90 | use "$data_in/football-managers-workfile.dta", clear 91 | 92 | * Or download directly from OSF: 93 | /* 94 | copy "https://osf.io/download/hycmg/" "workfile.dta" 95 | use "workfile.dta", clear 96 | erase "workfile.dta" 97 | */ 98 | 99 | sort season team 100 | browse 101 | 102 | gen a = 1 103 | bys team manager_id: egen manager_games = sum(a) 104 | bys team manager_id: egen manager_points = sum(points) 105 | gen manager_win_ratio = manager_points /manager_games 106 | 107 | collapse (mean) manager_games manager_points manager_win_ratio, by(manager_name team) 108 | sort manager_win_ratio 109 | 110 | gsort -manager_win_ratio 111 | 112 | list if manager_win_ratio >= 2 113 | 114 | 115 | * denote caretakers 116 | separate manager_win_ratio, by(manager_games<18) 117 | 118 | colorpalette viridis, n(4) select(2) nograph 119 | graph hbar (mean) manager_win_ratio0 manager_win_ratio1 if manager_win_ratio>=2, /// 120 | nofill over(manager_name, sort(manager_win_ratio) descending) /// 121 | scheme(virdis) /// 122 | legend(off) yscale(r(1.6(0.2)3)) exclude0 ylabel(1.6(0.2)3, grid) /// 123 | yline(1.6(0.2)3) /// 124 | graphregion(fcolor(white) ifcolor(none)) /// 125 | plotregion(fcolor(white) ifcolor(white)) 126 | graph export "$output/ch02-figure1-top-managers-Stata.png", replace 127 | 128 | 
-------------------------------------------------------------------------------- /ch02-hotels-data-prep/ch02-hotels-data-prep.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | # Prepared for the textbook: 3 | # Data Analysis for Business, Economics, and Policy 4 | # by Gabor BEKES and Gabor KEZDI 5 | # Cambridge University Press 2021 6 | # 7 | # License: Free to share, modify and use for educational purposes. Not to be used for business purposes. 8 | # 9 | ###############################################################################################x 10 | 11 | # CHAPTER 02 12 | 13 | # CH02A Finding a good deal among hotels: data preparation 14 | # using the hotels-vienna dataset 15 | # version 0.9 2020-09-06 16 | 17 | ########### 18 | 19 | 20 | # CLEAR MEMORY 21 | rm(list=ls()) 22 | 23 | library(tidyverse) 24 | library(haven) 25 | library(Hmisc) 26 | library(desc) 27 | library(reshape2) 28 | library(modelsummary) 29 | 30 | # set working directory 31 | # option A: open material as project 32 | # option B: set working directory for da_case_studies 33 | # example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/") 34 | 35 | # set data dir, load theme and functions 36 | source("ch00-tech-prep/theme_bg.R") 37 | source("ch00-tech-prep/da_helper_functions.R") 38 | 39 | # data used 40 | source("set-data-directory.R") #data_dir must be first defined # 41 | data_in_clean <- paste(data_dir,"hotels-vienna","clean/", sep = "/") 42 | data_in_raw <- paste(data_dir,"hotels-vienna","raw", sep = "/") 43 | 44 | use_case_dir <- "ch02-hotels-data-prep/" 45 | data_out <- use_case_dir 46 | output <- paste0(use_case_dir,"output/") 47 | create_output_if_doesnt_exist(output) 48 | 49 | # load in clean and tidy data and create workfile 50 | data <- read_csv(paste0(data_in_clean,"/hotels-vienna.csv",sep="")) 51 | # Can load from website as 
well 52 | # data <- read_csv("https://osf.io/y6jvb/download") 53 | data <- data %>% select(hotel_id, accommodation_type ,distance, stars,rating,rating_count,price) 54 | 55 | # look at accomodation types 56 | table(data$accommodation_type) 57 | 58 | #********************************************** 59 | # Table 1.1 60 | #********************************************** 61 | 62 | head(data,n=5) 63 | 64 | 65 | #********************************************** 66 | # Table 2.2 67 | #********************************************** 68 | data[2,] 69 | 70 | #********************************************** 71 | # Table 2.3 72 | #********************************************** 73 | 74 | data <- data %>% filter(accommodation_type == "Hotel") 75 | 76 | nrow(data) 77 | 78 | data %>% select(hotel_id,price,distance) %>% slice(1:3) 79 | 80 | 81 | ## PART B: repeat part of the cleaning code 82 | #using the raw csv data file 83 | #includes some additional output 84 | #********************************************************* 85 | 86 | # *IMPORT AND PREPARE DATA* 87 | 88 | # variables downoaded as string, often in form that is not helpful 89 | # need to transform then to numbers that we can use 90 | 91 | data <- read_csv(paste0(data_in_raw,"/hotelbookingdata-vienna.csv",sep="")) 92 | # Can load from website as well 93 | # data <- read_csv( "https://osf.io/g5dmw/download" ) 94 | 95 | # distance to center entered as string in miles with one decimal 96 | # generate numerical variable of rating variable from string variable 97 | 98 | data <- data %>% separate(center1distance,c("distance",NA),sep = " ") %>% 99 | separate(center2distance,c("distance_alter",NA),sep = " ") %>% 100 | separate(accommodationtype,c(NA,"accommodation_type"),sep = "@") %>% 101 | separate(price_night,c(NA,NA,"nnight",NA),sep = " ") %>% 102 | separate(guestreviewsrating,c("rating",NA),sep = " ") 103 | 104 | 105 | # check: frequency table of all values incl. 
missing values 106 | 107 | tab_rating <- data %>% 108 | group_by(rating) %>% 109 | summarise(n = n()) %>% 110 | mutate(percent = round((n / sum(n)), 3), 111 | cumpercent = round(cumsum(freq = n / sum(n)),3)) 112 | 113 | View(tab_rating) 114 | 115 | # check: frequency table of all values incl. missing varlues 116 | 117 | tab_rating_reviewcount <- data %>% 118 | group_by(rating_reviewcount) %>% 119 | summarise(n = n()) %>% 120 | mutate(percent = round((n / sum(n)), 3), 121 | cumpercent = round(cumsum(freq = n / sum(n)),3)) 122 | 123 | View(tab_rating_reviewcount) 124 | 125 | data <- data%>% mutate(rating_count = as.numeric(rating_reviewcount)) 126 | 127 | describe(data$rating_count) 128 | 129 | # *RENAME VARIABLES* 130 | 131 | data <- data %>% rename(ratingta = rating2_ta,ratingta_count = rating2_ta_reviewcount, 132 | country=addresscountryname,city=s_city,stars=starrating) 133 | 134 | # look at key variables 135 | 136 | tab_stars <- data %>% 137 | group_by(stars) %>% 138 | summarise(n = n()) %>% 139 | mutate(percent = round((n / sum(n)), 3), 140 | cumpercent = round(cumsum(freq = n / sum(n)),3)) 141 | 142 | View(tab_stars) 143 | 144 | tab_rating <- data %>% 145 | group_by(rating) %>% 146 | summarise(n = n()) %>% 147 | mutate(percent = round((n / sum(n)), 3), 148 | cumpercent = round(cumsum(freq = n / sum(n)),3)) 149 | 150 | View(tab_rating) 151 | 152 | #********************************************** 153 | # Table 2.10 154 | #********************************************** 155 | 156 | # Look for perfect duplicates 157 | 158 | data <- data %>% arrange(hotel_id) 159 | 160 | data %>% group_by(hotel_id) %>% filter(n()>1) %>% 161 | select(c(hotel_id,accommodation_type,price,distance,stars,rating,rating_count)) 162 | 163 | data <- data %>% distinct() 164 | 165 | 166 | #********************************************** 167 | # Missing values in text 168 | #********************************************** 169 | 170 | datasummary_skim(data=data,histogram=F) 171 | 172 | data <- data 
%>% mutate(misrating = ifelse(is.na(rating),1,0)) 173 | 174 | table(data$misrating) 175 | 176 | 177 | addmargins(table(data$accommodation_type,data$misrating)) 178 | 179 | data %>% group_by(accommodation_type,misrating) %>% summarise(mean(price)) 180 | 181 | data %>% filter((misrating == 1)&(accommodation_type == "Hotel")) %>% 182 | select(hotel_id, accommodation_type,price,distance, stars,rating,rating_count) %>% 183 | slice(1) 184 | 185 | 186 | -------------------------------------------------------------------------------- /ch02-immunization-crosscountry/ch02-immunization-crosscountry.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | # Prepared for the textbook: 3 | # Data Analysis for Business, Economics, and Policy 4 | # by Gabor BEKES and Gabor KEZDI 5 | # Cambridge University Press 2021 6 | # 7 | # License: Free to share, modify and use for educational purposes. Not to be used for business purposes. 
8 | # 9 | ###############################################################################################x 10 | 11 | ## Chapter 02 12 | 13 | ### CH02C Displaying immunization rates across countries 14 | # using the world-bank-immunizationdataset 15 | # version 0.9 2020-09-06 16 | 17 | # CLEAR MEMORY 18 | rm(list=ls()) 19 | library(tidyverse) 20 | library(haven) 21 | library(Hmisc) 22 | 23 | # set working directory 24 | # option A: open material as project 25 | # option B: set working directory for da_case_studies 26 | # example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/") 27 | 28 | # set data dir, load theme and functions 29 | source("ch00-tech-prep/theme_bg.R") 30 | source("ch00-tech-prep/da_helper_functions.R") 31 | 32 | # data used 33 | source("set-data-directory.R") #data_dir must be first defined # 34 | 35 | data_in_clean <- paste(data_dir,"worldbank-immunization","clean/", sep = "/") 36 | data_in_raw <- paste(data_dir,"worldbank-immunization","raw", sep = "/") 37 | 38 | use_case_dir <- "ch02-immunization-crosscountry/" 39 | data_out <- use_case_dir 40 | output <- paste0(use_case_dir,"output/") 41 | create_output_if_doesnt_exist(output) 42 | 43 | 44 | # load in clean and tidy data and create workfile 45 | data <- read_csv(paste0(data_in_clean,"worldbank-immunization-panel.csv",sep="")) 46 | #data <- read_csv("https://osf.io/download/gk5cn/") 47 | # cleaning 48 | 49 | data <- data %>% select(c(countryname,year,imm,gdppc)) %>% 50 | filter((imm != 0) & (year >= 2015) & (countryname=='Pakistan'|countryname=='India')) 51 | 52 | summary(data) 53 | 54 | # Table 2.5 55 | data %>% arrange(countryname, year) 56 | 57 | #Table 2.4 58 | data %>% pivot_wider(names_from = c(year),values_from=c(imm,gdppc)) 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /ch02-immunization-crosscountry/ch02-immunization-crosscountry.do: -------------------------------------------------------------------------------- 
1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 02 14 | * CH02C Displaying immunization rates across countries 15 | * using the world-bank-immunizationdataset 16 | * version 0.9 2020-09-06 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | 27 | * STEP 2: * Directory for data 28 | * Option 1: run directory-setting do file 29 | do set-data-directory.do 30 | /* this is a one-line do file that should sit in 31 | the working directory you have just set up 32 | this do file has a global definition of your working directory 33 | more details: gabors-data-analysis.com/howto-stata/ */ 34 | 35 | * Option 2: set directory directly here 36 | * for example: 37 | * global data_dir "C:/Users/xy/gabors_data_analysis/da_data_repo" 38 | 39 | global data_in "$data_dir/worldbank-immunization/clean" 40 | global work "ch02-immunization-crosscountry" 41 | 42 | cap mkdir "$work/output" 43 | global output "$work/output" 44 | 45 | 46 | * load in clean and tidy data and create workfile 47 | use "$data_in/worldbank-immunization-panel.dta", clear 48 | 49 | * Or download directly from OSF: 50 | 51 | /* 52 | copy "https://osf.io/download/ku4fd/" "workfile.dta" 53 | use "workfile.dta", clear 54 | erase "workfile.dta" 55 | */ 56 | 57 | keep countryname year imm gdppc 58 | keep if imm!=. 
59 | keep if year>=2015 60 | keep if countryname=="Pakistan" | countryname=="India" 61 | sum 62 | 63 | sort countryname year 64 | lis 65 | 66 | ***************************** 67 | * Table 2.5 68 | ***************************** 69 | 70 | listtab countryname year imm gdppc /// 71 | using "$output/xt_immun_long.tex", replace /// 72 | rstyle(tabular) /// 73 | head("\begin{tabular}{lrrr}" /// 74 | `"Country & Year & imm & gdppc \\"') /// 75 | foot("\end{tabular}") 76 | 77 | reshape wide imm gdppc, i(countryname) j(year) 78 | order countryname imm* gdp* 79 | lis 80 | 81 | 82 | ***************************** 83 | * Table 2.4 84 | ***************************** 85 | 86 | 87 | listtab countryname imm* gdppc* /// 88 | using "$output/xt_immun_wide.tex", replace /// 89 | rstyle(tabular) /// 90 | head("\begin{tabular}{lrrrrrr}" /// 91 | `"Country & imm2015 & imm2016 & imm2017 & gdppc2015 & gdppc2016 & gdppc2017 \\"') /// 92 | foot("\end{tabular}") 93 | -------------------------------------------------------------------------------- /ch03-city-size-japan/ch03-city-size-Japan.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | # Prepared for the textbook: 3 | # Data Analysis for Business, Economics, and Policy 4 | # by Gabor BEKES and Gabor KEZDI 5 | # Cambridge University Press 2021 6 | # 7 | # License: Free to share, modify and use for educational purposes. Not to be used for business purposes. 
8 | # 9 | ###############################################################################################x 10 | 11 | # CHAPTER 03 12 | # CH03 UNDER THE HOOD: MORE ON THEORETICAL DISTRIBUTIONS --- City size distribution in Japan 13 | # city-size-japan dataset 14 | # version 0.9 2020-08-28 15 | 16 | 17 | 18 | # It is advised to start a new session for every case study 19 | # CLEAR MEMORY 20 | rm(list=ls()) 21 | 22 | # Import libraries 23 | library(tidyverse) 24 | library(scales) 25 | 26 | 27 | # set working directory 28 | # option A: open material as project 29 | # option B: set working directory for da_case_studies 30 | # example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/") 31 | 32 | # set data dir, data used 33 | source("set-data-directory.R") # data_dir must be first defined 34 | # alternative: give full path here, 35 | # example data_dir="C:/Users/bekes.gabor/Dropbox (MTA KRTK)/bekes_kezdi_textbook/da_data_repo" 36 | 37 | # load theme and functions 38 | source("ch00-tech-prep/theme_bg.R") 39 | source("ch00-tech-prep/da_helper_functions.R") 40 | 41 | data_in <- paste(data_dir,"city-size-japan","clean/", sep = "/") 42 | 43 | use_case_dir <- "ch03-city-size-japan/" 44 | data_out <- use_case_dir 45 | output <- paste0(use_case_dir,"output/") 46 | create_output_if_doesnt_exist(output) 47 | 48 | #----------------------------------------------------------------------------------------- 49 | # import data 50 | city_size <- read_csv(paste0(data_in, "city-size-japan.csv")) 51 | # city_size <- read_csv("https://osf.io/download/9mgep/") 52 | Hmisc::describe(city_size) 53 | # create variables 54 | 55 | city_size <- city_size %>% 56 | mutate( 57 | pop = (pop_2015/1000), 58 | lnpop = log(pop)) %>% 59 | arrange(-pop) 60 | 61 | 62 | city_size <- city_size %>% 63 | mutate (rank = seq( from = 1, to = nrow(.), by = 1)) 64 | 65 | #------------------------------------------------------------ 66 | # ln(rank) vs ln(x) 67 | 68 | city_size <- city_size %>% 69 | 
mutate(lnrank = log(rank)) 70 | 71 | 72 | R_03_lnrank <- ggplot(data = city_size, aes(x=lnpop, y=lnrank)) + 73 | geom_smooth_da(method="lm")+ 74 | geom_point_da()+ 75 | labs(x="ln(population)",y="ln(rank)")+ 76 | scale_y_continuous(expand = c(0.01,0.01),limits = c(0, 6), breaks = seq(0, 6, by = 1)) + 77 | scale_x_continuous(expand = c(0.01,0.01),limits = c(5, 9.5), breaks = seq(5, 9.5, by = 0.5)) + 78 | theme_bg() 79 | R_03_lnrank 80 | #save_fig("ch03_citysize-japan-logrank", output, "small") 81 | save_fig("ch03-figure-12-city-logrank", output, "small") 82 | 83 | 84 | #------------------------------------------------------------ 85 | ## ln P(X>x) vs ln(x) figure 86 | ## should be the same s with ln(rank) except for constant shift 87 | 88 | 89 | # city_size <- city_size %>% 90 | # mutate(P = (rank / max(nrow(.))), 91 | # lnP = log(P)) 92 | # 93 | # R_03_lnP <- ggplot(data = city_size, aes(x=lnpop, y=lnP)) + 94 | # geom_point_da()+ 95 | # labs(x="ln(population)",y="")+ 96 | # geom_smooth_da(method="lm")+ 97 | # theme_bg() 98 | # R_03_lnP 99 | 100 | #--------------------------------------------------------------- 101 | # scale invariance 102 | 103 | x1 <- 200 104 | x2 <- 300 105 | bound <- 0.2 106 | 107 | print(paste0(x1, " ", x2)) 108 | 109 | city_size %>% 110 | filter(pop >= x1*(1-bound) & pop <= x1*(1+bound)) %>% 111 | count() 112 | 113 | city_size %>% 114 | filter(pop >= x2*(1-bound) & pop <= x2*(1+bound)) %>% 115 | count() 116 | 117 | shift <- 3 118 | x3 <- x1*shift 119 | x4 <- x2*shift 120 | 121 | print(paste0(x3, " ", x4)) 122 | 123 | city_size %>% 124 | filter(pop >= x3*(1-bound) & pop <= x3*(1+bound)) %>% 125 | count() 126 | 127 | city_size %>% 128 | filter(pop >= x4*(1-bound) & pop <= x4*(1+bound)) %>% 129 | count() 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | -------------------------------------------------------------------------------- /ch03-city-size-japan/ch03-city-size-Japan.do: 
-------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 03 14 | * Also good to know section, power loaw distribution 15 | * using the city-size-japan dataset 16 | * version 0.9 2020-09-06 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | * STEP 2: * Directory for data 27 | * Option 1: run directory-setting do file 28 | do set-data-directory.do 29 | /* this is a one-line do file that should sit in 30 | the working directory you have just set up 31 | this do file has a global definition of your working directory 32 | more details: gabors-data-analysis.com/howto-stata/ */ 33 | 34 | * Option 2: set directory directly here 35 | * for example: 36 | * global data_dir "C:/Users/xy/gabors_data_analysis/da_data_repo" 37 | 38 | global data_in "$data_dir/city-size-japan/clean" 39 | global work "ch03-city-size-japan" 40 | 41 | cap mkdir "$work/output" 42 | global output "$work/output" 43 | 44 | 45 | 46 | clear 47 | insheet using "$data_in\city-size-japan.csv" 48 | 49 | * OSF: 50 | /* 51 | copy "https://osf.io/download/9mgep/" "workfile.csv" 52 | import delimited "workfile.csv", clear 53 | erase "workfile.csv" 54 | */ 55 | 56 | 57 | sum 58 | 59 | gen pop=pop_2015/1000 60 | gen lnpop=ln(pop) 61 | gsort -pop 62 | gen rank = _n 63 | 64 | ******************************* 65 | ** Figure: ln(rank) vs ln(x) 66 | 
gen lnrank = ln(rank) 67 | 68 | * Figure 3.12 69 | colorpalette viridis, n(4) select(2) nograph 70 | scatter lnrank lnpop|| lfit lnrank lnpop, /// 71 | color(`r(p)') /// 72 | legend(off) ytitle("ln(rank)") xtitle("ln(population in thousand)") /// 73 | ylab(, grid) xlab(,grid) 74 | graph export "$output/ch03-figure-12-logrank-Stata.png", replace 75 | 76 | 77 | 78 | ******************************* 79 | ** SCALE INVARIANCE 80 | 81 | local x1=200 82 | local x2=300 83 | local bound = 0.2 84 | 85 | dis `x1' " " `x2' 86 | count if pop >= `x1'*(1-`bound') & pop <= `x1'*(1+`bound') 87 | count if pop >= `x2'*(1-`bound') & pop <= `x2'*(1+`bound') 88 | 89 | local shift = 3 90 | local x3 = `x1'*`shift' 91 | local x4 = `x2'*`shift' 92 | 93 | dis `x3' " " `x4' 94 | count if pop >= `x3'*(1-`bound') & pop <= `x3'*(1+`bound') 95 | count if pop >= `x4'*(1-`bound') & pop <= `x4'*(1+`bound') 96 | 97 | -------------------------------------------------------------------------------- /ch03-distributions-height-income/ch03-height-income.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | # Prepared for the textbook: 3 | # Data Analysis for Business, Economics, and Policy 4 | # by Gabor BEKES and Gabor KEZDI 5 | # Cambridge University Press 2021 6 | # 7 | # License: Free to share, modify and use for educational purposes. Not to be used for business purposes. 
8 | # 9 | ###############################################################################################x 10 | 11 | # CHAPTER 03 12 | # CH03D Distributions of Body Height and Income 13 | # height-income-distributions dataset 14 | # version 0.9 2020-08-28 15 | 16 | 17 | # ------------------------------------------------------------------------------------------------------ 18 | #### SET UP 19 | # It is advised to start a new session for every case study 20 | # CLEAR MEMORY 21 | rm(list=ls()) 22 | 23 | # Import libraries 24 | library(tidyverse) 25 | library(scales) 26 | #!library(modelsummary) 27 | 28 | 29 | # set working directory 30 | # option A: open material as project 31 | # option B: set working directory for da_case_studies 32 | # example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/") 33 | 34 | # set data dir, data used 35 | source("set-data-directory.R") # data_dir must be first defined 36 | # alternative: give full path here, 37 | # example data_dir="C:/Users/bekes.gabor/Dropbox (MTA KRTK)/bekes_kezdi_textbook/da_data_repo" 38 | 39 | # load theme and functions 40 | source("ch00-tech-prep/theme_bg.R") 41 | source("ch00-tech-prep/da_helper_functions.R") 42 | 43 | data_in <- paste(data_dir,"height-income-distributions","clean/", sep = "/") 44 | 45 | use_case_dir <- "ch03-distributions-height-income/" 46 | data_out <- use_case_dir 47 | output <- paste0(use_case_dir,"output/") 48 | create_output_if_doesnt_exist(output) 49 | 50 | 51 | #----------------------------------------------------------------------------------------- 52 | # load in clean and tidy data and create workfile 53 | hrs <- read.csv(paste(data_in,"hrs_height_income.csv", sep = "/")) 54 | # or load from the web 55 | # hrs <- read_csv("https://osf.io/rnuh2/download") 56 | 57 | #------------------------------------------------------------------------------------------------------ 58 | 59 | hrs$height <- as.numeric(as.character(hrs$height)) 60 | 61 | # NORMAL: height of women age 
55-59 62 | Hmisc::describe(hrs$height) 63 | #! datasummary_skim(hrs$height) 64 | filtered_women <- hrs %>% 65 | filter(age >= 55 & age < 60 & female == 1) 66 | Hmisc::describe(hrs$height) 67 | #! datasummary_skim(filtered_women$height) 68 | filtered_women_height <- hrs %>% 69 | filter(age >= 55 & age < 60 & female == 1 & height > 1.3 & height < 2.1) 70 | Hmisc::describe(filtered_women_height$height) 71 | #! datasummary_skim(filtered_women_height$height) 72 | 73 | # graph --height 74 | ch03_normal_height <- ggplot(filtered_women_height, aes(x = height)) + 75 | geom_histogram(aes(y = ..density..), binwidth = 0.025, boundary = min(filtered_women_height$height), 76 | fill = color[1], color = color.outline, alpha = 0.8, closed='left') + 77 | stat_function(fun = dnorm, colour= color[2], 78 | args = with(filtered_women_height, c(mean = mean(height), sd = sd(height)))) + 79 | scale_y_continuous("Density", position = "right", expand=c(0,0), limits = c(0, 6), 80 | sec.axis = sec_axis(~ . *0.025, name = "Percent",breaks =seq(0,0.15, by=0.025),labels = percent_format(accuracy = 0.1))) + 81 | theme_bg() + 82 | xlab("Height (meters)") 83 | ch03_normal_height 84 | save_fig("ch03-figure-10-hist-height", output, "small") 85 | 86 | #------------------------------------------------------------------------------------------- 87 | # LOGNORMAL: family income of women age 55-59 88 | 89 | # income variable 90 | 91 | # filter dataset 92 | filtered_women_income <- hrs %>% 93 | filter(age >= 55 & age < 60 & female == 1 & hhincome > 1 & hhincome < 1000) 94 | Hmisc::describe(filtered_women_income$hhincome) 95 | 96 | 97 | # graph --income 98 | ch03_lognormal_income <- ggplot(filtered_women_income, aes(x = hhincome)) + 99 | geom_histogram(aes(y = (..count..)/sum(..count..)), binwidth = 20, boundary=0, closed='left', 100 | fill = color[1], color = color.outline, alpha = 0.8) + 101 | ylab("Percent") + xlab("Household income (thousand USD)") + 102 | expand_limits(x = 0.01, y = 0.01) + 103 | 
scale_x_continuous(expand = c(0.01,0.01),limits = c(0, 1000), breaks = seq(0, 1000, by = 100)) + 104 | scale_y_continuous(expand = c(0.00,0.00),limits = c(0, 0.3), breaks = seq(0, 0.3, by = 0.05), labels = scales::percent_format(accuracy = 1)) + 105 | theme_bg() 106 | ch03_lognormal_income 107 | save_fig("ch03-figure-11a-hist-income", output, "small") 108 | 109 | 110 | 111 | 112 | 113 | # ln income 114 | filtered_women_income <- filtered_women_income %>% 115 | mutate(lnincome = log(hhincome)) 116 | 117 | 118 | # graph --ln income 119 | ch03_lognormal_lnincome <- ggplot(filtered_women_income, aes(x = lnincome)) + 120 | geom_histogram(aes(y = ..density..), binwidth = 0.25, boundary = 0, closed='left', 121 | fill = color[1], color = color.outline, alpha = 0.8) + 122 | stat_function(fun = dnorm, colour= color[2], 123 | args = with(filtered_women_income, c(mean = mean(lnincome), sd = sd(lnincome)))) + 124 | scale_x_continuous(expand = c(0.01,0.01), limits = c(0, 8), breaks = seq(0, 8, by = 1)) + 125 | scale_y_continuous("Density", position = "right", expand=c(0,0), limits = c(0, 0.4), 126 | sec.axis = sec_axis(~ . 
*0.25, name = "Percent",breaks =seq(0,0.1, by=0.025), 127 | labels = percent_format(accuracy = 0.1))) + 128 | theme_bg() + 129 | ylab("Percent") + xlab("ln(household income, thousand USD)") 130 | ch03_lognormal_lnincome 131 | save_fig("ch03-figure-11b-hist-income-log", output, "small") 132 | 133 | -------------------------------------------------------------------------------- /ch03-distributions-height-income/ch03-height-income.do: -------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 03 14 | * CH03D Distributions of body height and income 15 | * using the height-indome-distribution dataset 16 | * version 0.9 2020-09-06 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 
23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | 27 | * STEP 2: * Directory for data 28 | * Option 1: run directory-setting do file 29 | do set-data-directory.do 30 | /* this is a one-line do file that should sit in 31 | the working directory you have just set up 32 | this do file has a global definition of your working directory 33 | more details: gabors-data-analysis.com/howto-stata/ */ 34 | 35 | * Option 2: set directory directly here 36 | * for example: 37 | * global data_dir "C:/Users/xy/gabors_data_analysis/da_data_repo" 38 | 39 | global data_in "$data_dir/height-income-distributions/clean" 40 | global work "ch03-distributions-height-income" 41 | 42 | cap mkdir "$work/output" 43 | global output "$work/output" 44 | 45 | 46 | 47 | 48 | 49 | use "$data_in/hrs_height_income.dta",clear 50 | 51 | * Or download directly from OSF: 52 | 53 | /* 54 | copy "https://osf.io/download/2bqsg/" "workfile.dta" 55 | use "workfile.dta", clear 56 | erase "workfile.dta" 57 | */ 58 | 59 | * Normal distribution: height of women age 55-59 60 | sum height if age>=55 & age<60 & female==1 61 | tab height if height>1.80 & age>=55 & age<60 & female==1 ,mis 62 | 63 | count if age>=55 & age<60 & female==1 & height>1.3 64 | sum height if age>=55 & age<60 & female==1 & height>1.3 65 | 66 | * Histogram with normal density overlayed 67 | * Figure 3.10 68 | colorpalette viridis, n(4) select(2) nograph 69 | hist height if age>=55 & age<60 & female==1 & height>1.3 , /// 70 | percent width(0.025) /// 71 | color(`r(p)') lcol(white) /// 72 | normal /// 73 | ylabel(,grid) xlabel(1.4(0.1)1.9, grid) /// 74 | xtitle("Body height") 75 | graph export "$output/ch03-figure-10-hist-height-Stata.png", replace 76 | 77 | 78 | * Lognormal distribution: family income of women age 55-59 79 | 80 | * Histogram of income and ln income with normal density overlayed 81 | * Figure 3.11a 82 | count if age>=55 & age<60 & female==1 83 | count if age>=55 & age<60 & female==1 
& hhincome<1 84 | count if age>=55 & age<60 & female==1 & hhincome>1000 & hhincome!=. 85 | count if age>=55 & age<60 & female==1 & hhincome==. 86 | count if age>=55 & age<60 & female==1 & hhincome>1 & hhincome<1000 87 | 88 | colorpalette viridis, n(4) select(2) nograph 89 | hist hhincome if age>=55 & age<60 & female==1 & hhincome>1 & hhincome<1000, /// 90 | percent width(20) /// 91 | color(`r(p)') lcol(white) lw(vvthin) /// 92 | ylabel(0(5)25, grid) xlabel(0(200)1000, grid) /// 93 | xtitle("Household income (thousand US dollars)") 94 | graph export "$output/ch03-figure-11a-hist-inc-Stata.png", replace 95 | 96 | * Figure 3.11b 97 | gen lnincome=ln(hhincome) 98 | lab var lnincome "ln(houehold income, thousand US dollars)" 99 | colorpalette viridis, n(4) select(2) nograph 100 | hist lnincome if age>=55 & age<60 & female==1 & lnincome>0 & hhincome<1000, /// 101 | percent width(0.25) start(0) /// 102 | color(`r(p)') lcol(white) /// 103 | ylabel(0(2.5)10, grid) xlabel(0(1)8, grid) /// 104 | normal 105 | graph export "$output/ch03-figure-11b-hist-lninc-Stata.png", replace 106 | -------------------------------------------------------------------------------- /ch03-distributions-height-income/stata library to edit.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gabors-data-analysis/da_case_studies/08e55b021c295653e1b546faeeb4550926c9a7d4/ch03-distributions-height-income/stata library to edit.txt -------------------------------------------------------------------------------- /ch03-football-home-advantage/ch03-football-home-advantage-describe.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | # Prepared for the textbook: 3 | # Data Analysis for Business, Economics, and Policy 4 | # by Gabor BEKES and Gabor KEZDI 5 | # Cambridge University Press 2021 6 | # 7 | # 
License: Free to share, modify and use for educational purposes. Not to be used for business purposes. 8 | # 9 | ###############################################################################################x 10 | 11 | # CHAPTER 03 12 | # CH03C Measuring Home Team Advantage in Football 13 | # football dataset 14 | # version 0.9 2020-08-28 15 | 16 | 17 | # ------------------------------------------------------------------------------------------------------ 18 | #### SET UP 19 | # It is advised to start a new session for every case study 20 | # CLEAR MEMORY 21 | rm(list=ls()) 22 | 23 | # Import libraries 24 | library(tidyverse) 25 | library(haven) 26 | library(cowplot) 27 | library(grid) 28 | library(scales) 29 | library(Hmisc) 30 | 31 | 32 | # set working directory 33 | # option A: open material as project 34 | # option B: set working directory for da_case_studies 35 | # example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/") 36 | 37 | # set data dir, data used 38 | source("set-data-directory.R") # data_dir must be first defined 39 | # alternative: give full path here, 40 | # example data_dir="C:/Users/bekes.gabor/Dropbox (MTA KRTK)/bekes_kezdi_textbook/da_data_repo" 41 | 42 | # load theme and functions 43 | source("ch00-tech-prep/theme_bg.R") 44 | source("ch00-tech-prep/da_helper_functions.R") 45 | 46 | data_in <- paste(data_dir,"football","clean/", sep = "/") 47 | 48 | use_case_dir <- "ch03-football-home-advantage/" 49 | data_out <- use_case_dir 50 | output <- paste0(use_case_dir,"output/") 51 | create_output_if_doesnt_exist(output) 52 | 53 | 54 | #----------------------------------------------------------------------------------------- 55 | 56 | # Import dataset 57 | df <- read.csv(paste0(data_in,"epl_games.csv"), 58 | stringsAsFactors = F) 59 | # Or can load from web 60 | #! 
df <- read_csv( "https://osf.io/bdjt5/download" ) 61 | 62 | # look at 2016/17 season only 63 | df <- subset(df, season==2016) 64 | glimpse(df) 65 | 66 | df <- df %>% 67 | mutate(home_goaladv = goals_home- goals_away) 68 | 69 | 70 | # Summary statistics 71 | summary(df$home_goaladv) 72 | describe(df$home_goaladv) 73 | 74 | # Histogram 75 | p1<-ggplot(data = df, aes (x = home_goaladv, y = (..count..)/sum(..count..))) + 76 | geom_histogram(color = color.outline, fill = theme_colors[1], 77 | size = 0.2, alpha = 0.8, show.legend=F, na.rm=TRUE, 78 | binwidth = 1) + 79 | geom_text(stat='count', aes(label=round((..count..)/sum(..count..)*100, 1)), hjust=0.5, vjust = -0.5, size = 2) + 80 | labs(x = "Goal difference", y = "Share of games (percent)") + 81 | scale_x_continuous(expand = c(0.05,0.05),limits = c(-6, 6), breaks = seq(-6, 6, by = 1)) + 82 | scale_y_continuous(expand = c(0,0), limits = c(0, 0.25), breaks = seq(0,0.25, by = 0.05), labels = scales::percent_format(accuracy = 5L)) + 83 | theme_bg() 84 | p1 85 | save_fig("ch03-figure-9-hist-homeadv", output, "small") 86 | 87 | 88 | 89 | # look at goal advantage by team 90 | # table *not* used in book, but interesting 91 | df %>% 92 | filter(team_home %in% c("Chelsea", "Arsenal", "Leicester", "Stoke", "West Ham") ) %>% 93 | group_by(team_home) %>% 94 | dplyr::summarize(Count = n(), 95 | Mean = mean(home_goaladv, na.rm=TRUE), 96 | Median = median(home_goaladv, na.rm=TRUE), 97 | Std = sd(home_goaladv, na.rm=TRUE), 98 | Min = min(home_goaladv, na.rm=TRUE)) 99 | df %>% 100 | filter(team_home %in% c("Chelsea", "Arsenal", "Leicester", "Stoke", "West Ham") ) %>% 101 | dplyr::summarize(Count = n(), 102 | Mean = mean(home_goaladv, na.rm=TRUE), 103 | Median = median(home_goaladv, na.rm=TRUE), 104 | Std = sd(home_goaladv, na.rm=TRUE), 105 | Min = min(home_goaladv, na.rm=TRUE)) 106 | 107 | 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- 
/ch03-football-home-advantage/ch03-football-home-advantage-describe.do: -------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 03 14 | * CH03C Measuring home team advantage in football 15 | * using the football dataset 16 | * version 0.9 2020-09-06 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | * STEP 2: * Directory for data 27 | * Option 1: run directory-setting do file 28 | do set-data-directory.do 29 | /* this is a one-line do file that should sit in 30 | the working directory you have just set up 31 | this do file has a global definition of your working directory 32 | more details: gabors-data-analysis.com/howto-stata/ */ 33 | 34 | * Option 2: set directory directly here 35 | * for example: 36 | * global data_dir "C:/Users/xy/gabors_data_analysis/da_data_repo" 37 | 38 | global data_in "$data_dir/football/clean" 39 | global work "ch03-football-home-advantage" 40 | 41 | cap mkdir "$work/output" 42 | global output "$work/output" 43 | 44 | 45 | 46 | * use full 11 season data table on games 47 | use "$data_in\epl_games.dta", clear 48 | 49 | * Or download directly from OSF: 50 | /* 51 | copy "https://osf.io/download/tyjp8/" "workfile.dta" 52 | use "workfile.dta", clear 53 | erase "workfile.dta" 54 | */ 55 | 56 | * sample design: keep one season 57 | keep if season == 2016 58 | 59 | * 
generate home goal advantage 60 | gen home_goaladv = goals_home- goals_away 61 | * generate direction of home goal advantage (negative - zero - positive) 62 | gen home_goaldir = -1 if home_goaladv<0 63 | replace home_goaldir = 0 if home_goaladv==0 64 | replace home_goaldir = 1 if home_goaladv>0 65 | 66 | 67 | 68 | * histogram 69 | * Figure 3.9 70 | colorpalette viridis, n(4) select(2) nograph 71 | histogram home_goaladv, /// 72 | discrete percent /// 73 | ylabel(0(5)25, grid) xlabel(-6(1)6, grid) /// 74 | color(`r(p)') lcol(white) lw(vthin) /// 75 | ytitle(Share of games (percent)) xtitle(Goal difference) /// 76 | addlabel addlabopts(yvarformat(%4.1f)) /// 77 | graphregion(fcolor(white) ifcolor(none)) /// 78 | plotregion(fcolor(white) ifcolor(white)) 79 | graph export "$output/ch03-figure-9-hist-goaldiff-Stata.png", replace 80 | 81 | * statistics 82 | * Table 3.7 83 | * same as Table 3.9 84 | tabstat home_goaladv, s(mean sd n) format(%4.1f) 85 | tab home_goaldir 86 | 87 | -------------------------------------------------------------------------------- /ch03-hotels-europe-compare/README: -------------------------------------------------------------------------------- 1 | Chapter 03 2 | hotels-europe 3 | Vienna vs London compare 4 | -------------------------------------------------------------------------------- /ch03-hotels-europe-compare/ch03-hotels-europe-compare.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | # Prepared for the textbook: 3 | # Data Analysis for Business, Economics, and Policy 4 | # by Gabor BEKES and Gabor KEZDI 5 | # Cambridge University Press 2021 6 | # 7 | # License: Free to share, modify and use for educational purposes. Not to be used for business purposes. 
8 | # 9 | ###############################################################################################x 10 | 11 | # CHAPTER 03 12 | # CH03B Comparing hotel prices in Europe: Vienna vs. London 13 | # hotels-europe dataset 14 | # version 0.9 2020-08-28 15 | 16 | 17 | # ------------------------------------------------------------------------------------------------------ 18 | #### SET UP 19 | # It is advised to start a new session for every case study 20 | # CLEAR MEMORY 21 | rm(list=ls()) 22 | 23 | # Import libraries 24 | library(tidyverse) 25 | library(xtable) 26 | 27 | 28 | # set working directory 29 | # option A: open material as project 30 | # option B: set working directory for da_case_studies 31 | # example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/") 32 | 33 | # set data dir, data used 34 | source("set-data-directory.R") # data_dir must be first defined 35 | # alternative: give full path here, 36 | # example data_dir="C:/Users/bekes.gabor/Dropbox (MTA KRTK)/bekes_kezdi_textbook/da_data_repo" 37 | 38 | # load theme and functions 39 | source("ch00-tech-prep/theme_bg.R") 40 | source("ch00-tech-prep/da_helper_functions.R") 41 | 42 | data_in <- paste(data_dir,"hotels-europe","clean/", sep = "/") 43 | 44 | use_case_dir <- "ch03-hotels-europe-compare/" 45 | data_out <- use_case_dir 46 | output <- paste0(use_case_dir,"output/") 47 | create_output_if_doesnt_exist(output) 48 | 49 | 50 | #----------------------------------------------------------------------------------------- 51 | 52 | # load in clean and tidy data and create workfile 53 | hotels_europe_price <- read_csv(paste0(data_in,"hotels-europe_price.csv")) 54 | hotels_europe_features <- read_csv(paste0(data_in,"hotels-europe_features.csv")) 55 | 56 | 57 | # hotels_europe_price <- read_csv("https://osf.io/download/p6tyr/") 58 | # hotels_europe_features <- read_csv("https://osf.io/download/utwjs/") 59 | 60 | hotels_europe <- left_join(hotels_europe_price, hotels_europe_features, by = 
"hotel_id") 61 | rm(hotels_europe_price) 62 | rm(hotels_europe_features) 63 | 64 | # filter for same Vienna data we used + London same date 65 | hotels_europe_cut <- hotels_europe %>% filter(year == 2017 & month == 11 & weekend == 0) %>% 66 | filter(city %in% c("Vienna", "London")) %>% 67 | filter(accommodation_type == "Hotel") %>% 68 | filter(stars>=3 & stars<=4) %>% 69 | filter(!is.na(stars)) %>% 70 | filter(city_actual %in% c("Vienna", "London")) %>% 71 | filter(price <=600) 72 | 73 | 74 | hotels_europe_cut %>% 75 | group_by(city) %>% 76 | summarise(mean_price = mean(price), max=max(price), n = n()) 77 | 78 | write_csv(hotels_europe_cut, paste0(data_out,"hotels-vienna-london.csv")) 79 | 80 | 81 | 82 | # Vienna vs London 83 | # have same range on x axis 84 | 85 | histprice_Vienna5_R<- ggplot(data = filter(hotels_europe_cut, city=="Vienna"), aes (x = price)) + 86 | geom_histogram_da(type="percent", binwidth = 20 )+ 87 | labs(x = "Price (US dollars)", y = "Percent") + 88 | scale_x_continuous(expand = c(0.01,0.01),limits = c(0, 500), breaks = seq(0, 500, by = 100)) + 89 | scale_y_continuous(expand = c(0.00,0.00),limits = c(0, 0.3), breaks =seq(0,0.3, by=0.1), labels = scales::percent_format()) + 90 | theme_bg() 91 | histprice_Vienna5_R 92 | save_fig("ch03-figure-6a-hist-price-vienna", output, "small") 93 | 94 | 95 | # histprice_London_R<-ggplot(data = filter(hotels_europe_cut, city=="London"), aes (x = price, y = (..count..)/sum(..count..))) + 96 | #geom_histogram(binwidth = 20, color = color.outline, fill = color[1], size = 0.25, alpha = 0.8, show.legend=F, na.rm =TRUE) + 97 | 98 | histprice_London_R<-ggplot(data = filter(hotels_europe_cut, city=="London"), aes (x = price)) + 99 | geom_histogram_da(type="percent", binwidth = 20 )+ 100 | labs(x = "Price (US dollars)", y = "Percent") + 101 | scale_x_continuous(expand = c(0.01,0.01),limits = c(0, 500), breaks = seq(0, 500, by = 100)) + 102 | scale_y_continuous(expand = c(0.00,0.00),limits = c(0, 0.3), breaks 
=seq(0,0.3, by=0.1), labels = scales::percent_format()) + 103 | theme_bg() 104 | histprice_London_R 105 | save_fig("ch03-figure-6b-hist-price-london", output, "small") 106 | 107 | 108 | 109 | # kernel density plots 110 | 111 | kdens_ViennaLondon_R<-ggplot(data = hotels_europe_cut, aes(x=price, y = stat(density), color = city)) + 112 | geom_line(stat="density", show.legend=F, na.rm =TRUE) + 113 | labs(x="Price (US dollars)", y="Density", color = "") + 114 | scale_color_manual(name="", 115 | values=c(color[2],color[1]), 116 | labels=c("London","Vienna")) + 117 | scale_y_continuous(expand = c(0.0,0.0), limits = c(0, 0.015), breaks = seq(0, 0.015, by = 0.003)) + 118 | scale_x_continuous(expand = c(0.01,0.01),limits = c(0, 500), breaks = seq(0, 500, by = 100)) + 119 | geom_text(aes(x = 340, y = 0.0026, label = "London"), color = color[2], size=2.5) + 120 | geom_text(aes(x = 170, y = 0.008, label = "Vienna"), color = color[1], size=2.5) + 121 | theme_bg() 122 | kdens_ViennaLondon_R 123 | save_fig("ch03-figure-7-kdens_ViennaLondon", output, "small") 124 | 125 | 126 | 127 | table_3_6 <- 128 | hotels_europe_cut %>% 129 | group_by(city) %>% 130 | summarise(n = length(price), mean=mean(price), median=median(price), min = min(price), max = max(price), 131 | sd = sd(price), skew= ((mean(price)-median(price))/sd(price))) 132 | table_3_6 133 | # print out nicely 134 | xt<-xtable(table_3_6,align='llccccccc', digits = c(0, 0,0,2,0,0,0,2,3)) 135 | names(xt) <- c('City','Observations','Mean','Median','Min','Max','Std.dev.','Skewness' ) 136 | print(xt, type = "latex", file = paste0(output,"ch03-table-6-vienna-london-compare.tex",include.rownames = FALSE)) 137 | 138 | 139 | -------------------------------------------------------------------------------- /ch03-hotels-europe-compare/ch03-hotels-europe-compare.do: -------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for 
Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 03 14 | * CH03B Comparing hotel prices in Europe: Vienna vs. London 15 | * using the hotels-europe dataset 16 | * version 0.9 2020-09-06 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | * STEP 2: * Directory for data 27 | * Option 1: run directory-setting do file 28 | do set-data-directory.do 29 | /* this is a one-line do file that should sit in 30 | the working directory you have just set up 31 | this do file has a global definition of your working directory 32 | more details: gabors-data-analysis.com/howto-stata/ */ 33 | 34 | * Option 2: set directory directly here 35 | * for example: 36 | * global data_dir "C:/Users/xy/gabors_data_analysis/da_data_repo" 37 | 38 | global data_in "$data_dir/hotels-europe/clean" 39 | global work "ch03-hotels-europe-compare" 40 | 41 | cap mkdir "$work/output" 42 | global output "$work/output" 43 | 44 | cap mkdir "$work/temp" 45 | global temp "$work/temp" 46 | 47 | 48 | 49 | * Vienna vs London 50 | 51 | * load in clean and tidy data and create workfile 52 | use "$data_in/hotels-europe_price", clear 53 | 54 | merge m:m hotel_id using "$data_in/hotels-europe_features.dta", nogen 55 | 56 | * Or download directly from OSF: 57 | /* 58 | copy "https://osf.io/download/hz4gw/" "workfile.dta" 59 | use "workfile.dta", clear 60 | erase "workfile.dta" 61 | preserve 62 | copy "https://osf.io/download/j9mkf/" "workfile.dta" 63 | use "workfile.dta", clear 64 | erase "workfile.dta" 65 | 
tempfile hotels_features 66 | save `hotels_features' 67 | restore 68 | merge m:m hotel_id using `hotels_features', nogen 69 | */ 70 | 71 | 72 | 73 | 74 | * sample design 75 | * KEEP NOV 2017 weekend, 3-4 star hotels 76 | keep if year==2017 & month==11 & weekend==0 77 | keep if city== "Vienna" | city=="London" 78 | keep if accommodation_type== "Hotel" 79 | keep if stars>=3 & stars<=4 80 | 81 | * in actual city (takes care of extreme distances, too) 82 | keep if city_actual== "Vienna" | city_actual=="London" 83 | * drop Vienna hotel with erroneous price 84 | keep if price<1000 85 | 86 | save "$temp/hotels-vienna-london.dta", replace 87 | tab city 88 | 89 | * distribution of price 90 | 91 | * Figure 3.6a Vienna 92 | colorpalette viridis, n(4) select(2) nograph 93 | return list 94 | hist price if city=="Vienna", /// 95 | width(20) percent /// 96 | xtitle(Price (US dollars)) /// 97 | color(`r(p)') lcol(white) lw(vthin) /// 98 | xlabel(0(50)500 , grid) ylabel(0(10)30, grid ) /// 99 | graphregion(fcolor(white) ifcolor(none)) /// 100 | plotregion(fcolor(white) ifcolor(white)) 101 | graph export "$output/ch03-figure-6a-hist-price-Vienna-Stata.png", replace 102 | 103 | * Figure 3.6b London 104 | colorpalette viridis, n(4) select(2) nograph 105 | return list 106 | hist price if city=="London", /// 107 | width(20) percent /// 108 | xtitle(Price (US dollars)) /// 109 | color(`r(p)') lcol(white) lw(vthin) /// 110 | xlabel(0(50)500 , grid) ylabel(0(10)30, grid ) /// 111 | graphregion(fcolor(white) ifcolor(none)) /// 112 | plotregion(fcolor(white) ifcolor(white)) 113 | graph export "$output/ch03-figure-6b-hist-price-London-Stata.png", replace 114 | 115 | * two density plots overlayed 116 | * Figure 3.7 117 | kdensity price if city=="Vienna", gen (xV yV) nograph 118 | kdensity price if city=="London", gen (xL yL) nograph 119 | colorpalette viridis, n(4) select(2) nograph 120 | return list 121 | line yV yL xL, lc(`r(p)') lw(thick thick) /// 122 | xtitle(Price (US dollars)) /// 123 | 
xlabel(0(100)500 , grid) ylabel(, grid ) /// 124 | legend(off) /// 125 | text(0.007 180 "Vienna" 0.0025 340 "London") /// 126 | graphregion(fcolor(white) ifcolor(none)) /// 127 | plotregion(fcolor(white) ifcolor(white)) 128 | graph export "$output/ch03-figure-7-densities-price-ViennaLondon-Stata.png", replace 129 | 130 | 131 | 132 | * Table 3.6 133 | tabstat price, s(n mean median min max sd ) by(city) format(%4.2f) save 134 | * calculate mean-median skewness statistic 135 | * London 136 | qui sum price if city=="London",d 137 | dis (r(mean) - r(p50)) / r(sd) 138 | * Vienna 139 | qui sum price if city=="Vienna",d 140 | dis (r(mean) - r(p50)) / r(sd) 141 | 142 | * into .tex format 143 | * replace the built-in skewness measure with our measure computed above: (mean-median)/sd 144 | tabout city using "$output/ch03-table-6-summary-ViennaLondon-Stata.tex" /// 145 | , replace style(tex) sum /// 146 | c(count price mean price median price min price max price sd price skewness price) 147 | -------------------------------------------------------------------------------- /ch03-hotels-vienna-explore/README: -------------------------------------------------------------------------------- 1 | Chapter 02 2 | Hotels data, descriptive stats, dataviz 3 | -------------------------------------------------------------------------------- /ch03-hotels-vienna-explore/ch03-hotels-vienna-explore.do: -------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 
12 | * 13 | * Chapter 03 14 | * CH03A Finding a good deal among hotels: data exploration 15 | * using the hotels-vienna dataset 16 | * version 0.9 2020-09-06 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | * STEP 2: * Directory for data 27 | * Option 1: run directory-setting do file 28 | do set-data-directory.do 29 | /* this is a one-line do file that should sit in 30 | the working directory you have just set up 31 | this do file has a global definition of your working directory 32 | more details: gabors-data-analysis.com/howto-stata/ */ 33 | 34 | * Option 2: set directory directly here 35 | * for example: 36 | * global data_dir "C:/Users/xy/gabors_data_analysis/da_data_repo" 37 | 38 | global data_in "$data_dir/hotels-vienna/clean" 39 | global work "ch03-hotels-vienna-explore" 40 | 41 | cap mkdir "$work/output" 42 | global output "$work/output" 43 | 44 | 45 | 46 | * load in clean and tidy data and create workfile 47 | use "$data_in/hotels-vienna.dta", clear 48 | * Or download directly from OSF: 49 | 50 | /* 51 | copy "https://osf.io/download/dn8je/" "workfile.dta" 52 | use "workfile.dta", clear 53 | erase "workfile.dta" 54 | */ 55 | 56 | * DISTRIBUTIONS 57 | 58 | * sample design 59 | * KEEP hotels in Vienna 60 | keep if city== "Vienna" 61 | keep if accommodation_type== "Hotel" 62 | 63 | 64 | * Stars 65 | 66 | * Figure 3.1a 67 | colorpalette viridis, n(4) select(2) nograph 68 | return list 69 | histogram stars, /// 70 | discrete percent /// 71 | xtitle(Star rating (number of stars)) /// 72 | xlabel(1(0.5)5, grid format(%3.1f)) ylabel(0(10)50, grid) /// 73 | fcolor(`r(p)') gap(5) lcolor(white) lwidth(vthin) /// 74 | addlabel addlabopts(mlabsize(medium) yvarformat(%3.1f)) /// 75 | graphregion(fcolor(white) ifcolor(none)) /// 76 | 
plotregion(fcolor(white) ifcolor(white)) 77 | graph export "$output/ch03-figure-1a-hist-stars-Stata.png", replace 78 | 79 | * Figure 3.1b 80 | colorpalette viridis, n(4) select(2) nograph 81 | hist stars , /// 82 | discrete frequency /// 83 | xtitle(Star rating (number of stars)) /// 84 | fcolor(`r(p)') gap(5) lcolor(white) lwidth(vthin) /// 85 | xlabel(1(0.5)5 , grid) ylabel(0(20)140, grid ) /// 86 | addlabel addlabopts(mlabsize(medium) yvarformat(%3.0f)) /// 87 | graphregion(fcolor(white) ifcolor(none)) /// 88 | plotregion(fcolor(white) ifcolor(white)) 89 | graph export "$output/ch03-figure-1b-hist-stars-Stata.png", replace 90 | 91 | 92 | * Price 93 | 94 | use "$data_in/hotels-vienna.dta", clear 95 | * Or download directly from OSF: 96 | 97 | /* 98 | copy "https://osf.io/download/dn8je/" "workfile.dta" 99 | use "workfile.dta", clear 100 | erase "workfile.dta" 101 | */ 102 | 103 | * sample design 104 | * KEEP hotels in Vienna, stars 3 to 4 105 | * DROP observation with erroneous price variable (price>1000) 106 | keep if accommodation_type== "Hotel" 107 | keep if stars>=3 & stars<=4 108 | keep if price<1000 109 | 110 | * brief look at data 111 | tab city 112 | tab stars 113 | 114 | * Figure 3.2a 115 | colorpalette viridis, n(4) select(2) nograph 116 | hist price , /// 117 | discrete frequency /// 118 | xtitle(Price (US dollars)) /// 119 | color(`r(p)') /// 120 | xlabel(0(50)500 , grid) ylabel(0(2)8, grid ) /// 121 | graphregion(fcolor(white) ifcolor(none)) /// 122 | plotregion(fcolor(white) ifcolor(white)) 123 | graph export "$output/ch03-figure-2a-hist-price-Stata.png", replace 124 | 125 | * Figure 3.2b 126 | colorpalette viridis, n(4) select(2) nograph 127 | hist price , /// 128 | width(10) frequency /// 129 | xtitle(Price (US dollars)) /// 130 | color(`r(p)') lcol(white) lw(vthin) /// 131 | xlabel(0(50)500 , grid) ylabel(0(5)40, grid ) /// 132 | graphregion(fcolor(white) ifcolor(none)) /// 133 | plotregion(fcolor(white) ifcolor(white)) 134 | graph export 
"$output/ch03-figure-2b-hist-price-Stata.png", replace 135 | 136 | 137 | * Figure 3.3a 138 | colorpalette viridis, n(4) select(2) nograph 139 | hist price , /// 140 | width(40) start(40) frequency /// 141 | xtitle(Price (US dollars)) /// 142 | color(`r(p)') lcol(white) lw(vthin) /// 143 | xlabel(0(80)500 , grid) ylabel(0(20)120, grid ) /// 144 | graphregion(fcolor(white) ifcolor(none)) /// 145 | plotregion(fcolor(white) ifcolor(white)) 146 | graph export "$output/ch03-figure-3a-hist-price-Stata.png", replace 147 | 148 | * Figure 3.3b 149 | colorpalette viridis, n(4) select(2) nograph 150 | hist price , /// 151 | width(80) start(0) frequency /// 152 | xtitle(Price (US dollars)) /// 153 | color(`r(p)') lcol(white) lw(vthin) /// 154 | xlabel(0(80)500 , grid) ylabel(0(50)150, grid ) /// 155 | graphregion(fcolor(white) ifcolor(none)) /// 156 | plotregion(fcolor(white) ifcolor(white)) 157 | graph export "$output/ch03-figure-3b-hist-price-Stata.png", replace 158 | 159 | 160 | * Distance to city center 161 | 162 | * Figure 3.4 163 | * also Figure 3.5 (it's the same with some annotation) 164 | colorpalette viridis, n(4) select(2) nograph 165 | hist distance, /// 166 | width(0.5) frequency /// 167 | xtitle(Distance to city center (miles)) /// 168 | color(`r(p)') lcol(white) lw(vthin) /// 169 | xlabel(0(2)14 , grid) ylabel(0(10)60, grid ) /// 170 | graphregion(fcolor(white) ifcolor(none)) /// 171 | plotregion(fcolor(white) ifcolor(white)) 172 | graph export "$output/ch03-figure-4-hist-dist-Stata.png", replace 173 | 174 | count if distance>8 175 | drop if distance>8 176 | 177 | tab city_actual 178 | keep if city_actual=="Vienna" 179 | count 180 | 181 | -------------------------------------------------------------------------------- /ch03-simulations/ch03-distributions.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | # Prepared for the 
textbook: 3 | # Data Analysis for Business, Economics, and Policy 4 | # by Gabor BEKES and Gabor KEZDI 5 | # Cambridge University Press 2021 6 | # 7 | # License: Free to share, modify and use for educational purposes. Not to be used for business purposes. 8 | # 9 | ###############################################################################################x 10 | 11 | # CHAPTER 03 12 | # CH03C Measuring Home Team Advantage in Football 13 | # football dataset 14 | # version 0.9 2020-08-28 15 | 16 | 17 | # ------------------------------------------------------------------------------------------------------ 18 | #### SET UP 19 | # It is advised to start a new session for every case study 20 | # CLEAR MEMORY 21 | rm(list=ls()) 22 | 23 | # Import libraries 24 | library(tidyverse) 25 | 26 | 27 | # set working directory 28 | # option A: open material as project 29 | # option B: set working directory for da_case_studies 30 | # example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/") 31 | 32 | # set data dir, data used 33 | source("set-data-directory.R") # data_dir must be first defined 34 | # alternative: give full path here, 35 | # example data_dir="C:/Users/bekes.gabor/Dropbox (MTA KRTK)/bekes_kezdi_textbook/da_data_repo" 36 | 37 | # load theme and functions 38 | source("ch00-tech-prep/theme_bg.R") 39 | source("ch00-tech-prep/da_helper_functions.R") 40 | 41 | data_in <- paste(data_dir,"football","clean/", sep = "/") 42 | 43 | use_case_dir <- "ch03-simulations/" 44 | data_out <- use_case_dir 45 | output <- paste0(use_case_dir,"output/") 46 | create_output_if_doesnt_exist(output) 47 | 48 | 49 | #------------------------------------ 50 | # set the seed 51 | set.seed(16460) 52 | 53 | # sample size 54 | N <- 100000 55 | obs <- N 56 | 57 | # Bernoulli 58 | bernoulli <- as.data.frame(rbinom(obs, 1, 0.7)) 59 | colnames(bernoulli) <- "bernoulli" 60 | 61 | g_bernoulli<- ggplot(data = bernoulli, aes (x = bernoulli, y = (..count..)/sum(..count..))) + 62 | 
geom_histogram(binwidth = 0.1, color = color.outline, fill = color[1], size = 0.25, alpha = 0.8, show.legend=F, na.rm =TRUE) + 63 | labs(y = "Percent") + 64 | expand_limits(x = 0.01, y = 0.01) + 65 | coord_cartesian(clip = "off") + 66 | scale_x_continuous(limits = c(-0.1, 1.1), breaks = seq(0, 1, by = 1)) + 67 | scale_y_continuous(labels = scales::percent_format(accuracy = 5L)) + 68 | theme_bg() 69 | g_bernoulli 70 | save_fig("ch03-figure-rb7-1a-bernoulli", output, size = "small") 71 | 72 | 73 | # Binomial 74 | # with smaller sample 75 | Nbinom <- 20 76 | binomial <- as.data.frame(rbinom(obs,Nbinom,0.4)) 77 | colnames(binomial) <- "binomial" 78 | 79 | g_binom<-ggplot(data = binomial, aes (x = binomial, y = (..count..)/sum(..count..))) + 80 | geom_histogram_da(binwidth = 0.5, type="percent") + 81 | labs(y = "Percent") + 82 | coord_cartesian(clip = "off") + 83 | scale_y_continuous(expand = c(0.01,0.01), labels = scales::percent_format(accuracy = 1)) + 84 | scale_x_continuous(expand = c(0.01,0.01), limits=c(0, 16), breaks=seq(0, 16, by=2)) + 85 | theme_bg() 86 | g_binom 87 | save_fig("ch03-figure-rb7-1b-binomial", output, size = "small") 88 | 89 | 90 | # uniform [0,1] 91 | 92 | uniform <- as.data.frame(runif(obs, 0, 1)) 93 | colnames(uniform) <- "uniform" 94 | 95 | g_uniform<-ggplot(data = uniform, aes (x = uniform, y = (..count..)/sum(..count..))) + 96 | geom_histogram(bins =50, center=1, 97 | color = color.outline, fill = color[1], size = 0.25, alpha = 0.8, show.legend=F, na.rm =TRUE) + 98 | labs(y = "Percent") + 99 | scale_x_continuous(expand = c(0.01,0.01), limits=c(0, 1), breaks=seq(0, 1, by=0.1)) + 100 | scale_y_continuous(expand = c(0.001,0.001), labels = scales::percent_format(accuracy = .1)) + 101 | coord_cartesian(clip = "off") + 102 | theme_bg() 103 | g_uniform 104 | save_fig("ch03-figure-rb7-1c-uniform", output, size = "small") 105 | 106 | # normal 107 | normal <- as.data.frame(rnorm(obs, 0,1)) 108 | colnames(normal) <- "normal" 109 | 110 | 
g_normal<-ggplot(data = normal, aes (x = normal, y = (..count..)/sum(..count..))) + 111 | geom_histogram(binwidth = 0.2, color = color.outline, fill = color[1], size = 0.25, alpha = 0.8, show.legend=F, na.rm =TRUE) + 112 | labs(y = "Percent") + 113 | scale_y_continuous(labels = scales::percent_format(accuracy = 1), limits=c(0, 0.1), breaks=seq(0, 0.1, by=0.02)) + 114 | scale_x_continuous(expand = c(0.01,0.01), limits=c(-5, 5), breaks=seq(-4, 4, by=1)) + 115 | expand_limits(x = 0.01, y = 0.01) + 116 | coord_cartesian(clip = "off") + 117 | theme_bg() 118 | g_normal 119 | save_fig("ch03-figure-rb7-2a-normal", output, size = "small") 120 | 121 | # lognoromal 122 | # take the exponential of the randomly generated normal above 123 | lognormal <- as.data.frame(exp(normal)) 124 | colnames(lognormal) <- "lognormal" 125 | 126 | g_lognormal<-ggplot(data = subset(lognormal, lognormal <10), aes (x = lognormal, y = (..count..)/sum(..count..))) + 127 | geom_histogram(binwidth = 0.1, boundary=0.0, 128 | color = color.outline, fill = color[1], size = 0.25, alpha = 0.8, show.legend=F, na.rm =TRUE) + 129 | labs(y = "Percent") + 130 | scale_y_continuous(labels = scales::percent_format(accuracy = 1), limits=c(0, 0.08), breaks=seq(0, 0.08, by=0.02)) + 131 | scale_x_continuous(expand = c(0.01,0.01), limits=c(0, 8), breaks=seq(0, 8, by=1)) + 132 | expand_limits(x = 0, y = 0) + 133 | coord_cartesian(clip = "off") + 134 | theme_bg() 135 | g_lognormal 136 | save_fig("ch03-figure-rb7-2b-lognormal", output, size = "small") 137 | 138 | 139 | # power-law 140 | alpha <- 3 141 | xmin <- 1 142 | x <- seq(1, obs, 1) 143 | powerlaw <- xmin * (x ^ (-alpha)) 144 | histrange <- quantile(powerlaw, .75) 145 | powerlaw <- powerlaw / sum(powerlaw) 146 | 147 | powerlaw <- as.data.frame(powerlaw) 148 | 149 | 150 | g_power<-ggplot(data = subset(powerlaw, powerlaw < histrange), aes (x = powerlaw, y = (..count..)/sum(..count..))) + 151 | geom_histogram(bins=50, boundary=0.5, 152 | color = color.outline, fill = 
color[1], size = 0.25, alpha = 0.8, show.legend=F, na.rm =TRUE) + 153 | labs(y = "Percent") + 154 | scale_y_continuous(labels = scales::percent_format(accuracy = 5L)) + 155 | scale_x_continuous(labels = fancy_scientific) + 156 | theme_bg() 157 | g_power 158 | save_fig("ch03-figure-rb7-2c-powerlaw", output, size = "small") 159 | 160 | 161 | -------------------------------------------------------------------------------- /ch03-simulations/ch03-distributions.do: -------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 03 14 | * Simulating the density function (histograms) of theoretical distributions 15 | * no actual data used here 16 | * version 0.9 2020-09-06 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * set working directory for da_case_studies.
23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | 27 | global work "ch03-simulations" 28 | 29 | cap mkdir "$work/output" 30 | global output "$work/output" 31 | 32 | 33 | *clear environment 34 | clear 35 | 36 | *set the seed 37 | set seed 16460 38 | 39 | *sample size 40 | global N=100000 41 | set obs $N 42 | 43 | 44 | 45 | 46 | * Bernoulli 47 | gen bernoulli=rbinomial(1,0.7) 48 | hist bernoulli, /// 49 | xtitle("") ytitle("Percent") xlab(0 1) color(navy*0.8) percent 50 | graph export "$output/dist-Bernoulli-Stata.png", replace 51 | more 52 | 53 | * Binomial 54 | * with smaller sample 55 | global Nbinom = 20 56 | gen rbinomial=rbinomial($Nbinom,.4) 57 | hist rbinomial, disc width(0.5) /// 58 | xtitle("") ytitle("Percent") xlab("") color(navy*0.8) percent 59 | graph export "$output/dist-binomial-Stata.png", replace 60 | more 61 | 62 | * uniform [0,1] 63 | gen runif=runiform(0,1) 64 | hist runif, /// 65 | xtitle("") ytitle("Percent") fcolor(navy*0.8) lcolor(white) percent 66 | graph export "$output/dist-uniform-Stata.png", replace 67 | more 68 | 69 | * normal 70 | gen rnormal=rnormal(0,1) 71 | hist rnormal, /// 72 | xtitle("") ytitle("Percent") xlab("") fcolor(navy*0.8) lcolor(white) percent 73 | graph export "$output/dist-normal-Stata.png", replace 74 | more 75 | 76 | * lognormal 77 | * take the exponential of the randomly generated normal above 78 | generate lognormal = exp(rnormal) 79 | hist lognormal if lognormal <10 , /// 80 | xtitle("") ytitle("Percent") xlab("") fcolor(navy*0.8) lcolor(white) percent 81 | graph export "$output/dist-lognormal-Stata.png", replace 82 | more 83 | 84 | 85 | * power-law 86 | global alpha = 6 87 | global xmin = 1 88 | cap gen x = _n 89 | cap drop powerlaw 90 | generate powerlaw = $xmin*x^(-$alpha) 91 | sum powerlaw, d 92 | replace powerlaw = powerlaw/r(sum) 93 | local histrange = r(p75) 94 | hist powerlaw if powerlaw < `histrange', /// 95 | xtitle("") ytitle("Percent") xlab("") 
fcolor(navy*0.8) lcolor(white) percent 96 | graph export "$output/dist-powerlaw-Stata.png", replace 97 | 98 | sum powerlaw,d 99 | 100 | 101 | -------------------------------------------------------------------------------- /ch04-management-firm-size/ch04-wms-management-size-boxplot-violinplot.R: -------------------------------------------------------------------------------- 1 | ######################################################################################### 2 | # Prepared for Gabor's Data Analysis 3 | # 4 | # Data Analysis for Business, Economics, and Policy 5 | # by Gabor Bekes and Gabor Kezdi 6 | # Cambridge University Press 2021 7 | # 8 | # gabors-data-analysis.com 9 | # 10 | # License: Free to share, modify and use for educational purposes. 11 | # Not to be used for commercial purposes. 12 | 13 | # CHAPTER 03 two illustrative plots 14 | # WMS dataset 15 | # version 0.9 2020-08-28 16 | ######################################################################################### 17 | 18 | 19 | ###################################################################### 20 | 21 | # Clear memory 22 | rm(list=ls()) 23 | 24 | # Import libraries 25 | library(tidyverse) 26 | library(gridExtra) 27 | library(cowplot) 28 | library(viridis) 29 | library(haven) 30 | #library(Hmisc) 31 | library(binsreg) 32 | 33 | # set working directory 34 | # option A: open material as project 35 | # option B: set working directory for da_case_studies 36 | # example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/") 37 | 38 | # set data dir, load theme and functions 39 | source("ch00-tech-prep/theme_bg.R") 40 | source("ch00-tech-prep/da_helper_functions.R") 41 | 42 | # data used 43 | source("set-data-directory.R") #data_dir must be first defined # 44 | data_in <- paste(data_dir,"wms-management-survey","clean/", sep = "/") 45 | 46 | use_case_dir <- "ch04-management-firm-size/" 47 | data_out <- use_case_dir 48 | output <- paste0(use_case_dir,"output/") 49 | 
create_output_if_doesnt_exist(output) 50 | 51 | 52 | 53 | ######################################################################## 54 | 55 | # Import data 56 | 57 | 58 | ######################################################################## 59 | 60 | # Import data 61 | df <- read_csv(paste0(data_in,"wms_da_textbook.csv")) 62 | # From the web 63 | # df <- read_csv( "https://osf.io/uzpce/download" ) 64 | 65 | # Sample selection 66 | df <- df %>% 67 | filter(country=="Mexico" & wave==2013 & emp_firm>=100 & emp_firm<=5000) 68 | 69 | 70 | 71 | 72 | # --------------------------------------------------------------- 73 | # Figure 3.6 74 | 75 | # Code employee bins 76 | df$emp3bins <- ifelse(df$emp_firm<200, 1, 77 | ifelse(df$emp_firm>=200 & df$emp_firm<1000, 2, 78 | ifelse(df$emp_firm>=1000, 3,100) 79 | ) ) 80 | # Box plots by emp bins 81 | df$emp3bins <- as.factor(df$emp3bins) 82 | levels(df$emp3bins) <- c('Small','Medium', 'Large') 83 | 84 | # Boxplot 85 | df2 <- df %>% 86 | select(emp3bins, management) %>% 87 | filter(emp3bins=="Small") 88 | 89 | q1 <- quantile(df2$management, 0.25) 90 | q3 <- quantile(df2$management, 0.75) 91 | q2 <- quantile(df2$management, 0.50) 92 | iqr <- q3 - q1 93 | ub <- max(df2[df2$management<=(q3+iqr*1.5), ]$management) 94 | lb <- min(df2[df2$management>=(q1-iqr*1.5), ]$management) 95 | out_lb <- min(df2$management) 96 | min <- min(df$management) 97 | max <- max(df$management) 98 | 99 | 100 | ggplot(data = df2, aes(x = emp3bins, y = management)) + 101 | geom_boxplot(color = "blue", fill = color[1], size = 0.5, width = 0.1, alpha = 0.5, na.rm=T) + 102 | stat_boxplot(geom = "errorbar", width = 0.05, color = "blue", size = 0.5, na.rm=T) + 103 | scale_y_continuous(limits = c(min,max)) + 104 | annotate("text", x = 1.1, y = ub, label = "← Upper adjacent value", hjust=0) + 105 | annotate("text", x = 1.1, y = q3, label = "← 75th percentile (upper hinge)", hjust=0) + 106 | annotate("text", x = 1.1, y = q2, label = "← Median", hjust=0) + 107 | annotate("text", x = 1.1, y = q1, label = "← 25th percentile (lower 
hinge)", hjust=0) + 108 | annotate("text", x = 1.1, y = lb, label = "← Lower adjacent value", hjust=0) + 109 | annotate("text", x = 1.1, y = out_lb, label = "← Outside values", hjust=0) + 110 | 111 | annotate("text", x = 0.63, y = ub, label = "Adjacent line", hjust=0) + 112 | annotate("text", x = 0.63, y = q3, label = "Whiskers", hjust=0) + 113 | annotate("text", x = 0.63, y = q2, label = "Median", hjust=0) + 114 | annotate("text", x = 0.63, y = q1, label = "Whiskers", hjust=0) + 115 | annotate("text", x = 0.63, y = lb, label = "Adjacent line", hjust=0) + 116 | 117 | geom_segment(aes(x = 0.9, y = lb, xend = 0.9, yend = ub)) + 118 | geom_segment(aes(x = 0.88, y = lb, xend = 0.9, yend = lb)) + 119 | geom_segment(aes(x = 0.88, y = q1, xend = 0.9, yend = q1)) + 120 | geom_segment(aes(x = 0.88, y = q2, xend = 0.9, yend = q2)) + 121 | geom_segment(aes(x = 0.88, y = q3, xend = 0.9, yend = q3)) + 122 | geom_segment(aes(x = 0.88, y = ub, xend = 0.9, yend = ub)) + 123 | 124 | theme( axis.title.x=element_blank(), 125 | axis.line.x=element_blank(), 126 | axis.ticks.x=element_blank(), 127 | axis.text.x = element_blank(), 128 | axis.title.y=element_blank(), 129 | axis.line.y=element_blank(), 130 | axis.ticks.y=element_blank(), 131 | axis.text.y = element_blank(), 132 | panel.grid = element_blank(), panel.border = element_blank()) 133 | ggsave(paste0(output, "boxlot_R.png"), width=14, height=8, units = "cm", dpi = 1200) 134 | 135 | 136 | #Violin 137 | ggplot(data = df2, aes(x = emp3bins, y = management)) + 138 | geom_violin(size=0.2, width = 0.3, trim = F, show.legend=F, na.rm =TRUE, color = "blue", fill = "blue", alpha = 0.3) + 139 | geom_boxplot(color = "blue", fill = color[1], size = 0.6, width = 0.01, alpha = 0.5, na.rm=T, outlier.shape = NA) + 140 | annotate("text", x = 1.05, y = ub, label = "← 95% Confidence Interval", hjust=0) + 141 | annotate("text", x = 1.18, y = q3, label = "← Interquartile range", hjust=0) + 142 | annotate("text", x = 1.18, y = q2, label = "← Median", 
hjust=0) + 143 | 144 | 145 | theme( axis.title.x=element_blank(), 146 | axis.line.x=element_blank(), 147 | axis.ticks.x=element_blank(), 148 | axis.text.x = element_blank(), 149 | axis.title.y=element_blank(), 150 | axis.line.y=element_blank(), 151 | axis.ticks.y=element_blank(), 152 | axis.text.y = element_blank(), 153 | panel.grid = element_blank(), panel.border = element_blank()) 154 | ggsave(paste0(output, "violin_R.png"), width=12, height=8, units = "cm", dpi = 1200) 155 | 156 | 157 | 158 | # --------------------------------------------------------------- 159 | 160 | -------------------------------------------------------------------------------- /ch06-online-offline-price-test/ch06-online-offline-price-test.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | # Prepared for the textbook: 3 | # Data Analysis for Business, Economics, and Policy 4 | # by Gabor BEKES and Gabor KEZDI 5 | # Cambridge University Press 2021 6 | # 7 | # License: Free to share, modify and use for educational purposes. Not to be used for business purposes. 
8 | # 9 | ###############################################################################################x 10 | 11 | # CHAPTER 06 12 | # CH06A 13 | # billion-prices dataset 14 | # version 0.9 2020-08-28 15 | 16 | 17 | # ------------------------------------------------------------------------------------------------------ 18 | #### SET UP 19 | # It is advised to start a new session for every case study 20 | # CLEAR MEMORY 21 | rm(list=ls()) 22 | 23 | # Import libraries 24 | library(tidyverse) 25 | library(xtable) 26 | library(broom) 27 | library(modelsummary) 28 | 29 | # set working directory 30 | # option A: open material as project 31 | # option B: set working directory for da_case_studies 32 | # example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/") 33 | 34 | # set data dir, data used 35 | source("set-data-directory.R") # data_dir must be first defined 36 | # alternative: give full path here, 37 | # example data_dir="C:/Users/bekes.gabor/Dropbox (MTA KRTK)/bekes_kezdi_textbook/da_data_repo" 38 | 39 | # load theme and functions 40 | source("ch00-tech-prep/theme_bg.R") 41 | source("ch00-tech-prep/da_helper_functions.R") 42 | 43 | data_in <- paste(data_dir,"billion-prices","clean/", sep = "/") 44 | 45 | use_case_dir <- "ch06-online-offline-price-test/" 46 | data_out <- use_case_dir 47 | output <- paste0(use_case_dir,"output/") 48 | create_output_if_doesnt_exist(output) 49 | 50 | 51 | #----------------------------------------------------------------------------------------- 52 | 53 | # load data 54 | pd <- read.csv(paste0(data_in,"online_offline_ALL_clean.csv")) 55 | # Load from the web 56 | # pd <- read_csv( "https://osf.io/yhbr5/download" ) 57 | 58 | 59 | # FILTER DATA 60 | pd <- pd %>% filter(COUNTRY=="USA") %>% 61 | filter(PRICETYPE == "Regular Price") %>% 62 | filter(is.na(sale_online)) %>% 63 | filter(!is.na(price)) %>% 64 | filter(!is.na(price_online)) 65 | 66 | 67 | # Drop obvious errors 68 | pd <- pd %>% filter(price<1000) 69 | 70 | # 
Compare variables 71 | pd<-pd %>% mutate(diff = price_online-price) 72 | 73 | # Check the main descriptives 74 | datasummary( diff ~ Mean + SD + Min + Max + Median + Max , data = pd ) 75 | 76 | hist1<- ggplot(data=pd, aes(diff))+ 77 | geom_histogram(binwidth = 5, boundary=0, closed="left", 78 | fill = color[1], size = 0.25, alpha = 0.8, show.legend=F, na.rm=TRUE) + 79 | labs(x = "Online - offline price difference (US dollars)", y = "Frequency") + 80 | theme_bg()+ 81 | scale_x_continuous(limits = c(-420, 420), breaks = seq(-400, 420, by = 100)) + 82 | scale_y_continuous(limits=c(0,6000), breaks = seq(0, 6000, by = 1000), expand = c(0.01,0.01))+ 83 | geom_segment(aes(x = 300, y = 500, xend = 415, yend = 20), arrow = arrow(length = unit(0.1, "cm")))+ 84 | annotate("text", x = 300, y = 700, label = "max value= 415", size=2.5) + 85 | geom_segment(aes(x = -280, y = 500, xend = -380, yend = 20), arrow = arrow(length = unit(0.1, "cm")))+ 86 | annotate("text", x = -300, y = 700, label = "min value= -380", size=2.5) 87 | hist1 88 | #save_fig("R_F06_1", output, "small") 89 | save_fig("ch06-figure-1a-pricediff", output, "small") 90 | 91 | # 4.99999 not 5 -- needed because of data imported from stata may be stored wierdly. 
92 | pd1<-subset(pd,abs(pd$diff)<4.999999) 93 | Hmisc::describe(pd1$diff) 94 | 95 | hist2<- ggplot(data=pd, aes(diff))+ 96 | geom_histogram(binwidth = 0.5, boundary=-0, closed="left", 97 | color = color.outline, fill = color[1], size = 0.25, alpha = 0.8, show.legend=F, na.rm=TRUE) + 98 | labs(x = "Online - offline price difference (US dollars)", y = "Frequency") + 99 | theme_bg()+ 100 | expand_limits(x = 0.01, y = 0.01) + 101 | scale_x_continuous(limits = c(-5, 5), breaks = seq(-5, 5, by = 1)) + 102 | scale_y_continuous(expand = c(0.00,0.00), limits=c(0,5000), breaks = seq(0, 5000, by = 1000)) 103 | hist2 104 | #save_fig("R_F06_2", output, "small") 105 | save_fig("ch06-figure-1b-pricediff2", output, "small") 106 | 107 | # HYPOTHESIS 108 | t.test(pd$diff,mu=0) 109 | 110 | # MULTIPLE HYPOTHESES 111 | spd <- split(pd, pd$retailer,drop=FALSE) 112 | out <- vector("list", length = length(spd)) 113 | out <- lapply(1:length(spd),function (x) out[[x]]<- t.test(spd[[x]]$diff,mu=0)) 114 | out 115 | 116 | # create table 117 | table_out <- pd %>% group_by(retailer) %>% group_modify(~ tidy(t.test(.x$diff))) 118 | table_out<-table_out %>% 119 | dplyr::select(retailer,estimate,p.value) 120 | 121 | xt<-xtable(table_out,align='llcc', digits = c(0,0,2,2)) 122 | names(xt) <- c('Retailer ID','Diff','p-value') 123 | #print(xt, type = "latex",include.rownames = FALSE, file = paste0(output,"ch06.tex")) 124 | print(xt, type = "latex",include.rownames = FALSE, 125 | file = paste0(output,"ch06-table-2-test.tex")) 126 | 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /ch06-online-offline-price-test/ch06-online-offline-price-test.do: -------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * 
Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 06 14 | * CH06A Comparing online and offline prices: testing the differences 15 | * using the billion-prices dataset 16 | * version 0.9 2020-09-06 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | 27 | * STEP 2: * Directory for data 28 | * Option 1: run directory-setting do file 29 | do set-data-directory.do 30 | /* this is a one-line do file that should sit in 31 | the working directory you have just set up 32 | this do file has a global definition of your working directory 33 | more details: gabors-data-analysis.com/howto-stata/ */ 34 | 35 | * Option 2: set directory directly here 36 | * for example: 37 | * global data_dir "C:/Users/xy/gabors_data_analysis/da_data_repo" 38 | 39 | 40 | global data_in "$data_dir/billion-prices/clean" 41 | global work "ch06-online-offline-price-test" 42 | 43 | cap mkdir "$work/output" 44 | global output "$work/output" 45 | 46 | 47 | 48 | 49 | 50 | clear 51 | 52 | 53 | use "$data_in/online_offline_ALL_clean.dta",replace 54 | 55 | * Or download directly from OSF: 56 | /* 57 | copy "https://osf.io/download/wm6ge/" "workfile.dta" 58 | use "workfile.dta", clear 59 | erase "workfile.dta" 60 | */ 61 | 62 | 63 | 64 | * filter dataset 65 | tab country 66 | keep if country == "USA" 67 | keep if PRICETYPE == "Regular Price" 68 | drop if sale_online == 1 69 | drop if price==. 70 | drop if price_online==. 
71 | 72 | sum price*,d 73 | drop if price>1000 /* 3 observations with obvious error */ 74 | sum price* 75 | 76 | * PRICE 77 | 78 | gen pd = price_online - price 79 | lab var pd "Online - offline price difference (USD)" 80 | 81 | tabstat pd, s(mean sd min median max n) 82 | count 83 | count if pd==0 84 | count if pd>0 85 | count if pd<0 86 | count if pd>=-1 & pd<=1 87 | 88 | * Figure 6.1 (a) 89 | histogram pd , freq start(-400) width(5) /// 90 | col(navy*0.8) xlab(-400(100)400, grid) ylab(, grid) 91 | graph export "$output/ch06-figure-1a-pricediff-Stata.png",replace 92 | 93 | * Figure 6.1 (b) 94 | histogram pd if pd>-5 & pd<5, freq start(-5) width(0.5) /// 95 | col(navy*0.8) lcol(white) xlab(-5(1)5, grid) ylab(0(1000)5000, grid) 96 | graph export "$output/ch06-figure-1b-pricediff-Stata.png",replace 97 | 98 | 99 | ** HYPOTHESIS 100 | ttest pd = 0 101 | 102 | ** MULTIPLE HYPOTHESES 103 | sort retailer 104 | by retailer: ttest pd = 0 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /ch06-stock-market-loss-test/ch06-stock-market-loss-test.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | # Prepared for the textbook: 3 | # Data Analysis for Business, Economics, and Policy 4 | # by Gabor BEKES and Gabor KEZDI 5 | # Cambridge University Press 2021 6 | # 7 | # License: Free to share, modify and use for educational purposes. Not to be used for business purposes. 8 | # 9 | ###############################################################################################x 10 | 11 | # CHAPTER 06 12 | # CH06B CH06B Testing the likelihood of loss on a stock portfolio? 
13 | # using the sp500 dataset 14 | # version 0.91 2021-10-21 15 | 16 | 17 | # ------------------------------------------------------------------------------------------------------ 18 | #### SET UP 19 | # It is advised to start a new session for every case study 20 | # CLEAR MEMORY 21 | rm(list=ls()) 22 | 23 | # Import libraries 24 | library(tidyverse) 25 | #install.packages("arm") 26 | library(arm) 27 | #install.packages("pastecs") 28 | library(pastecs) 29 | #install.packages("DataCombine") 30 | library(DataCombine) 31 | library(janitor) 32 | 33 | 34 | # set working directory 35 | # option A: open material as project 36 | # option B: set working directory for da_case_studies 37 | # example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/") 38 | 39 | # set data dir, data used 40 | source("set-data-directory.R") # data_dir must be first defined 41 | # alternative: give full path here, 42 | # example data_dir="C:/Users/bekes.gabor/Dropbox (MTA KRTK)/bekes_kezdi_textbook/da_data_repo" 43 | 44 | # load theme and functions 45 | source("ch00-tech-prep/theme_bg.R") 46 | source("ch00-tech-prep/da_helper_functions.R") 47 | 48 | data_in <- paste(data_dir,"sp500","clean/", sep = "/") 49 | 50 | use_case_dir <- "ch05-stock-market-loss-generalize/" 51 | data_out <- use_case_dir 52 | output <- paste0(use_case_dir,"output/") 53 | create_output_if_doesnt_exist(output) 54 | 55 | 56 | #----------------------------------------------------------------------------------------- 57 | # LOAD DATA 58 | sp500 <- read_csv(paste0(data_in,"SP500_2006_16_data.csv"),na = c("", "#N/A")) 59 | # From web 60 | # sp500 <- read_csv("https://osf.io/h64z2/download" , na = c("", "#N/A") ) 61 | sp500 <- subset(sp500, VALUE != "NA") 62 | 63 | 64 | # CREATE PERCENT RETURN 65 | sp500<- sp500 %>% 66 | mutate(pct_return = (VALUE - lag(VALUE)) / lag(VALUE) * 100) 67 | 68 | 69 | # remove first row as it has NA in pct_return 70 | pct_return <- sp500 %>% filter(!is.na(pct_return)) %>% 
pull(pct_return) 71 | 72 | sp500 <- sp500 %>% 73 | mutate(loss5=ifelse((pct_return < -5),1,0)) 74 | 75 | options(digits = 6) 76 | 77 | # t-test to show p-value of two sided. One sided p-value is p/2 78 | t.test(sp500$loss5,mu=0.01) 79 | 80 | -------------------------------------------------------------------------------- /ch06-stock-market-loss-test/ch06-stock-market-loss-test.do: -------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 06 14 | * CH06B Testing the likelihood of loss on a stock portfolio? 15 | * using the sp500 dataset 16 | * version 0.9 2021-10-20 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 
23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | * STEP 2: * Directory for data 27 | * Option 1: run directory-setting do file 28 | do set-data-directory.do 29 | /* this is a one-line do file that should sit in 30 | the working directory you have just set up 31 | this do file has a global definition of your working directory 32 | more details: gabors-data-analysis.com/howto-stata/ */ 33 | 34 | * Option 2: set directory directly here 35 | * for example: 36 | * global data_dir "C:/Users/xy/gabors_data_analysis/da_data_repo" 37 | 38 | 39 | global data_in "$data_dir/sp500/clean" 40 | global work "ch06-stock-market-loss-test" 41 | 42 | cap mkdir "$work/output" 43 | global output "$work/output" 44 | 45 | cap mkdir "$work/temp" 46 | global temp "$work/temp" 47 | 48 | 49 | 50 | * set level of loss to inquire (in percent) 51 | global loss 5 52 | 53 | *************************************************************** 54 | * load data 55 | use "$data_in/SP500_2006_16_data.dta", clear 56 | 57 | * Or download directly from OSF: 58 | /* 59 | copy "https://osf.io/download/wm6ge/" "workfile.dta" 60 | use "workfile.dta", clear 61 | erase "workfile.dta" 62 | */ 63 | 64 | * create gap variable 65 | gen gap=date-date[_n-1]-1 66 | 67 | * label variables 68 | lab var value "Value of the S&P500" 69 | lab var datestr "Date, in string format (YMD)" 70 | lab var date "Date" 71 | lab var gap "Gap between observations, in days" 72 | 73 | * create variable for each year and each month 74 | * for later use 75 | gen year =year(date) 76 | gen month=month(date) 77 | 78 | 79 | * create percent daily returns 80 | sort date 81 | gen pct_return=(value-value[_n-1])/value[_n-1]*100 82 | lab var pct_return "Percent daily return" 83 | 84 | gen loss_$loss=100*(pct_return<-$loss) 85 | sum loss 86 | 87 | * t-test 88 | ttest loss_5=1 89 | di `r(p)' 90 | di `r(p_l)' 91 | 92 | -------------------------------------------------------------------------------- 
/ch06-stock-market-loss-test/ch06-stock-market-loss-test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "f0ecf2f6", 6 | "metadata": {}, 7 | "source": [ 8 | "#### Prepared for Gabor's Data Analysis\n", 9 | "\n", 10 | "### Data Analysis for Business, Economics, and Policy\n", 11 | "by Gabor Bekes and Gabor Kezdi\n", 12 | " \n", 13 | "Cambridge University Press 2021\n", 14 | "\n", 15 | "**[gabors-data-analysis.com ](https://gabors-data-analysis.com/)**\n", 16 | "\n", 17 | " License: Free to share, modify and use for educational purposes. \n", 18 | " Not to be used for commercial purposes.\n", 19 | "\n", 20 | "### Chapter 05\n", 21 | "**CH05A What Likelihood of Loss to Expect on a Stock Portfolio?**\n", 22 | "\n", 23 | "using the sp500 dataset\n", 24 | "\n", 25 | "version 0.91 2021-10-21" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "id": "0adb08a8", 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "import os\n", 36 | "import sys\n", 37 | "import warnings\n", 38 | "\n", 39 | "import numpy as np\n", 40 | "import pandas as pd\n", 41 | "from scipy import stats\n", 42 | "\n", 43 | "warnings.filterwarnings(\"ignore\")\n" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "id": "04062aae", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# Current script folder\n", 54 | "current_path = os.getcwd()\n", 55 | "dirname = current_path.split(\"da_case_studies\")[0]\n", 56 | "\n", 57 | "# location folders\n", 58 | "data_in = dirname + \"da_data_repo/sp500/clean/\"\n", 59 | "data_out = dirname + \"da_case_studies/ch05-stock-market-loss-generalize/\"\n", 60 | "output = dirname + \"da_case_studies/ch05-stock-market-loss-generalize/output/\"\n", 61 | "func = dirname + \"da_case_studies/ch00-tech-prep/\"\n", 62 | "sys.path.append(func)\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | 
"execution_count": 3, 68 | "id": "96fd9fd9", 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "# Import the prewritten helper functions\n", 73 | "from py_helper_functions import *\n" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 6, 79 | "id": "995509fe", 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "sp500 = pd.read_csv(data_in + \"SP500_2006_16_data.csv\")\n", 84 | "# sp500 = pd.read_csv(\"https://osf.io/h64z2/download\")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "id": "a7377b5f", 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "sp500 = sp500.replace(\"\", np.nan).replace(\"#N/A\", np.nan).dropna().reset_index(drop=True)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "id": "da6e95c8", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "sp500[\"pct_return\"] = sp500[\"VALUE\"].pct_change() * 100\n" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 8, 110 | "id": "bd72bad0", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "sp500.loc[1:, \"loss5\"] = np.where(sp500[\"pct_return\"].dropna() < -5, 1, 0)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 9, 120 | "id": "6a1e1c93", 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "data": { 125 | "text/plain": [ 126 | "Ttest_1sampResult(statistic=-3.3862058433914672, pvalue=0.0007195412199398962)" 127 | ] 128 | }, 129 | "execution_count": 9, 130 | "metadata": {}, 131 | "output_type": "execute_result" 132 | } 133 | ], 134 | "source": [ 135 | "# t-test to show p-value of two sided. 
One sided p-value is p/2\n", 136 | "\n", 137 | "stats.ttest_1samp(sp500[\"loss5\"], popmean=0.01, nan_policy=\"omit\")\n" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 11, 143 | "id": "67f1310e", 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/plain": [ 149 | "0.005163" 150 | ] 151 | }, 152 | "execution_count": 11, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "sp500[\"loss5\"].mean().round(6)\n" 159 | ] 160 | } 161 | ], 162 | "metadata": { 163 | "kernelspec": { 164 | "display_name": "da_case_studies-4lQ4EmNL", 165 | "language": "python", 166 | "name": "python3" 167 | }, 168 | "language_info": { 169 | "codemirror_mode": { 170 | "name": "ipython", 171 | "version": 3 172 | }, 173 | "file_extension": ".py", 174 | "mimetype": "text/x-python", 175 | "name": "python", 176 | "nbconvert_exporter": "python", 177 | "pygments_lexer": "ipython3", 178 | "version": "3.8.10" 179 | } 180 | }, 181 | "nbformat": 4, 182 | "nbformat_minor": 5 183 | } 184 | -------------------------------------------------------------------------------- /ch07-ols-simulation/ch07-ols-simulation.R: -------------------------------------------------------------------------------- 1 | ######################################################################################### 2 | # Prepared for Gabor's Data Analysis 3 | # 4 | # Data Analysis for Business, Economics, and Policy 5 | # by Gabor Bekes and Gabor Kezdi 6 | # Cambridge University Press 2021 7 | # 8 | # gabors-data-analysis.com 9 | # 10 | # License: Free to share, modify and use for educational purposes. 11 | # Not to be used for commercial purposes. 
12 | 13 | # Chapter 07 14 | # CH07 OLS fit simulation 15 | # version 0.9 2020-09-07 16 | ######################################################################################### 17 | 18 | 19 | 20 | # ------------------------------------------------------------------------------------------------------ 21 | #### SET UP 22 | # It is advised to start a new session for every case study 23 | # CLEAR MEMORY 24 | rm(list=ls()) 25 | 26 | 27 | # Import libraries 28 | library(tidyverse) 29 | 30 | 31 | # set working directory 32 | # option A: open material as project 33 | # option B: set working directory for da_case_studies 34 | # example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/") 35 | 36 | # set data dir, data used 37 | source("set-data-directory.R") # data_dir must be first defined 38 | # alternative: give full path here, 39 | # example data_dir="C:/Users/bekes.gabor/Dropbox (MTA KRTK)/bekes_kezdi_textbook/da_data_repo" 40 | 41 | # load theme and functions 42 | source("ch00-tech-prep/theme_bg.R") 43 | source("ch00-tech-prep/da_helper_functions.R") 44 | 45 | use_case_dir <- "ch07-ols-simulation/" 46 | 47 | data_out <- use_case_dir 48 | output <- paste0(use_case_dir,"output/") 49 | create_output_if_doesnt_exist(output) 50 | 51 | 52 | #----------------------------------------------------------------------------------------- 53 | 54 | # set the seed 55 | set.seed(1458) 56 | 57 | # sample size 58 | n <- 100 59 | 60 | # uniformly distributed x, [0,4] 61 | xvar <- runif(n,0,4) 62 | 63 | # y = a + bx + u (u normally distributed) 64 | a <- 2 65 | b <- 0.5 66 | sigmau <- 0.7 67 | yvar <- a+b*xvar+rnorm(n,0,sigmau) 68 | 69 | 70 | reg <- lm(yvar~xvar) 71 | summary(reg) 72 | 73 | # save coefficients 74 | coeffs = coefficients(reg) 75 | 76 | # scatterplot and OLS regression line 77 | # average y and average x shown 78 | ols <- data.frame(xvar,yvar) 79 | 80 | 81 | F07_sim <- ggplot(data = ols, aes(x = xvar, y = yvar)) + 82 | geom_point_da() + 83 | 
geom_smooth_da(method = "lm") + 84 | #geom_abline(intercept=coeffs[1], slope=coeffs[2], size=1.2, color=color[3]) + # alternative 85 | expand_limits(x = 0.01, y = 0.01) + 86 | scale_x_continuous(expand = c(0.01,0.01), limits=c(0, 4), breaks=seq(0, 4, by=1)) + 87 | scale_y_continuous(expand = c(0.01,0.01), limits = c(0, 5), breaks = seq(0, 5, by = 1)) + 88 | labs(x = "Simulated x variable",y = "Simulated y variable")+ 89 | theme_bg() + 90 | geom_vline(xintercept = mean(xvar), color=color[3], linetype="dashed", size=0.4) + 91 | geom_hline(yintercept = mean(yvar), color=color[3], linetype="dashed", size=0.4) + 92 | geom_segment(aes(x = 0.5, y = 3.5, xend = 0.5, yend = 2.9), arrow = arrow(length = unit(0.01, "cm")))+ 93 | annotate("text", x = 0.3, y = 3.6, label = "Average y", size=2) + 94 | geom_segment(aes(x = 1.2, y = 4, xend = 1.9, yend = 4), arrow = arrow(length = unit(0.01, "cm")))+ 95 | annotate("text", x = 0.9, y = 4, label = "Average x", size=2) 96 | F07_sim 97 | save_fig("ch07-figure-4-olsfit", output, "small") 98 | 99 | -------------------------------------------------------------------------------- /ch07-ols-simulation/ch07-ols-simulation.do: -------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 07 14 | * simulated simple linear regression estimated by OLS 15 | * no actual dataset used 16 | * version 0.9 2020-09-06 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 
23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | global work "ch07-ols-simulation" 27 | 28 | cap mkdir "$work/output" 29 | global output "$work/output" 30 | 31 | 32 | 33 | * No real data is used 34 | 35 | 36 | * clear environment 37 | clear 38 | 39 | * set the seed 40 | set seed 1458 41 | 42 | * sample size 43 | global N=100 44 | set obs $N 45 | 46 | * uniformly distributed x, [0,4] 47 | gen x = runiform(0,4) 48 | 49 | * y = a + bx + u (u normally distributed) 50 | local a = 2 51 | local b = 0.5 52 | local sigmau = 0.7 53 | 54 | gen y = `a' + `b'*x + rnormal(0,`sigmau') 55 | 56 | summarize y 57 | local meany = r(mean) 58 | summarize x 59 | local meanx = r(mean) 60 | 61 | * scatterplot and OLS regression line 62 | * average y and average x shown 63 | 64 | scatter y x, mc(navy*0.6) ms(O) msize(small) mlw(thick) /// 65 | || lfit y x, legend(off) lc(green) lw(thick) /// 66 | ylabel(0(0.5)6, grid) xlabel(0(0.5)4, grid) /// 67 | ytitle("Simulated y variable") xtitle("Simulated x variable") /// 68 | yline(`meany', lc(black) lp(dash)) xline(`meanx', lc(black) lp(dash)) /// 69 | text(5 1.6 "Average x") text(3.2 0.4 "Average y") /// 70 | graphregion(fcolor(white) ifcolor(none)) /// 71 | plotregion(fcolor(white) ifcolor(white)) 72 | graph export "$output\ch07-figure-4-olsfit-Stata.png", replace 73 | -------------------------------------------------------------------------------- /ch08-hotels-measurement-error/README: -------------------------------------------------------------------------------- 1 | Chapter 08 2 | hotels 3 | measurement error based on review count 4 | -------------------------------------------------------------------------------- /ch08-hotels-measurement-error/ch08-hotels-measeerror.R: -------------------------------------------------------------------------------- 1 | ######################################################################################### 2 | # Prepared for Gabor's Data Analysis 3 | # 4 | 
# Data Analysis for Business, Economics, and Policy 5 | # by Gabor Bekes and Gabor Kezdi 6 | # Cambridge University Press 2021 7 | # 8 | # gabors-data-analysis.com 9 | # 10 | # License: Free to share, modify and use for educational purposes. 11 | # Not to be used for commercial purposes. 12 | 13 | # Chapter 08 14 | # CH08C Measurement error in hotel ratings 15 | # using the hotels-vienna dataset 16 | # version 0.9 2020-09-07 17 | ######################################################################################### 18 | 19 | 20 | 21 | # ------------------------------------------------------------------------------------------------------ 22 | #### SET UP 23 | # It is advised to start a new session for every case study 24 | # CLEAR MEMORY 25 | rm(list=ls()) 26 | 27 | 28 | # Import libraries 29 | library(tidyverse) 30 | library(haven) 31 | library(lspline) 32 | library(grid) 33 | library(cowplot) 34 | library(scales) 35 | 36 | 37 | # set working directory 38 | # option A: open material as project 39 | # option B: set working directory for da_case_studies 40 | # example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/") 41 | 42 | # set data dir, data used 43 | source("set-data-directory.R") # data_dir must be first defined 44 | # alternative: give full path here, 45 | # example data_dir="C:/Users/bekes.gabor/Dropbox (MTA KRTK)/bekes_kezdi_textbook/da_data_repo" 46 | 47 | # load theme and functions 48 | source("ch00-tech-prep/theme_bg.R") 49 | source("ch00-tech-prep/da_helper_functions.R") 50 | 51 | data_in <- paste(data_dir,"hotels-vienna","clean/", sep = "/") 52 | use_case_dir <- "ch08-hotels-measurement-error/" 53 | 54 | data_out <- use_case_dir 55 | output <- paste0(use_case_dir,"output/") 56 | create_output_if_doesnt_exist(output) 57 | 58 | 59 | #----------------------------------------------------------------------------------------- 60 | # load vienna 61 | hotels <- read_csv(paste0(data_in,"hotels-vienna.csv")) 62 | # df <- 
read_csv("https://osf.io/y6jvb/download") 63 | 64 | # ------------------------------------------------------------------------------------------------------ 65 | ####SAMPLE SELECTION 66 | hotels <- hotels %>% filter(accommodation_type=="Hotel") %>% 67 | filter(city_actual=="Vienna") %>% 68 | filter(stars>=3 & stars<=4) %>% filter(!is.na(stars)) %>% 69 | filter(price<=600) 70 | 71 | 72 | 73 | 74 | ####################################### 75 | # Look at measurement error by rating count 76 | ####################################### 77 | 78 | 79 | # TAKE LOG PRICE 80 | hotels$lnprice <- log(hotels$price) 81 | 82 | # define cutoffs 83 | k1=100 84 | k2=200 85 | 86 | # FIGURE 87 | reg_me <- lm(lnprice ~ rating, data=subset(hotels, rating_count<k1)) 88 | summary(reg_me) 89 | hotels$yhat<-predict(reg_me,hotels) 90 | 91 | reg_me2 <- lm(lnprice ~ rating, data=subset(hotels, rating_count>=k1 & rating_count<k2)) 92 | summary(reg_me2) 93 | hotels$yhat2<-predict(reg_me2,hotels) 94 | 95 | reg_me3 <- lm(lnprice ~ rating, data=subset(hotels, rating_count>=k2)) 96 | summary(reg_me3) 97 | hotels$yhat3<-predict(reg_me3,hotels) 98 | 99 | 100 | F08_noise1<- ggplot(data = hotels) + 101 | geom_line(aes(x = rating, y = yhat, color = color[2]), size = 1)+ 102 | geom_line(aes(x = rating, y = yhat3, color = color[1]), size = 1)+ 103 | scale_color_manual(name = "", values=c(color[2], color[1]), labels=NULL, guide = 'none') + 104 | coord_cartesian(xlim = c(2, 5), ylim = c(3.5, 5)) + 105 | expand_limits(x = 0.01, y = 0.01) + 106 | scale_y_continuous(expand = c(0.01,0.01)) + 107 | scale_x_continuous(expand = c(0.01,0.01), limits=c(2,5), breaks=seq(2,5, by=0.5)) + 108 | labs(x = "Average rating",y = "ln(Hotel price, US dollars)")+ 109 | theme_bg() + 110 | annotate("text", x = 2.6, y = 4.4, label = "More noisy: # of ratings<100", size=2, color=color[2])+ 111 | annotate("text", x = 3.1, y = 3.6, label = "Less noisy: # of ratings>200", size=2, color=color[1]) 112 | F08_noise1 113 | #save_fig("F08_noise1_R", output, "small") 114 | save_fig("ch08-figure-8-hotels-measerror", output, "small") 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /ch08-hotels-measurement-error/ch08-hotels-measeerror.do: 
-------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 08 14 | * CH08C Measurement error in hotel ratings 15 | * using the hotels-vienna dataset 16 | * version 0.9 2020-09-06 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | * STEP 2: * Directory for data 27 | * Option 1: run directory-setting do file 28 | do set-data-directory.do 29 | /* this is a one-line do file that should sit in 30 | the working directory you have just set up 31 | this do file has a global definition of your working directory 32 | more details: gabors-data-analysis.com/howto-stata/ */ 33 | 34 | * Option 2: set directory directly here 35 | * for example: 36 | * global data_dir "C:/Users/xy/gabors_data_analysis/da_data_repo" 37 | 38 | 39 | global data_in "$data_dir/hotels-vienna/clean" 40 | global work "ch08-hotels-measurement-error" 41 | 42 | cap mkdir "$work/output" 43 | global output "$work/output" 44 | 45 | 46 | 47 | 48 | * load in clean and tidy data and create workfile 49 | use "$data_in/hotels-vienna.dta", clear 50 | 51 | * Or download directly from OSF: 52 | 53 | /* 54 | copy "https://osf.io/download/dn8je/" "workfile.dta" 55 | use "workfile.dta", clear 56 | erase "workfile.dta" 57 | */ 58 | 59 | *** SAMPLE SELECTION 60 | 61 | *** 3 to 4-star hotels (incl 3.5 stars) 62 | keep if stars>=3 & stars<=4 63 | keep if 
accommodation_type=="Hotel" 64 | label var distance "Distance to city center, miles" 65 | drop if price>600 /* likely error */ 66 | 67 | 68 | *** drop hotels not really in Vienna 69 | tab city_actual 70 | keep if city_actual=="Vienna" 71 | 72 | 73 | gen lnprice=ln(price) 74 | lab var lnprice "ln(Price)" 75 | 76 | sum rating_count ,d 77 | 78 | * define cutoffs 79 | local k1 100 80 | local k2 200 81 | 82 | sum rating_count rating if rating_count <`k1' 83 | sum rating_count rating if rating_count >=`k1' & rating_count <`k2' 84 | sum rating_count rating if rating_count >=`k2' 85 | 86 | * FIGURE 87 | reg lnprice rating if rating_count <`k1' 88 | predict yhat1 89 | lab var yhat1 "more noisy x: # ratings <`k1'" 90 | reg lnprice rating if rating_count >=`k1' & rating_count <`k2' 91 | cap predict yhat2 92 | cap lab var yhat2 "`k1' <= # ratings <`k2' " 93 | reg lnprice rating if rating_count >=`k2' 94 | predict yhat3 95 | lab var yhat3 "less noisy x: # ratings >`k2'" 96 | 97 | line yhat1 yhat3 rating, lw(vthick vthick) lc(green*0.8 navy*0.8) lp(solid solid) /// 98 | xtitle("Average rating") ytitle("ln(Hotel price, US dollars)") /// 99 | ylab(3.5(0.5)5.0, grid) xlab(2.0(0.5)5, grid) /// 100 | legend(off) /// 101 | text(4.3 2.55 "More noisy: n. of ratings<100", col(green*0.8)) /// 102 | text(3.7 3.1 "Less noisy: n. 
of ratings>200", col(navy*0.8)) /// 103 | graphregion(fcolor(white) ifcolor(none)) /// 104 | plotregion(fcolor(white) ifcolor(white)) 105 | graph export "$output/ch08-figure-8-hotels-measerror-Stata.png", as(png) replace 106 | 107 | -------------------------------------------------------------------------------- /ch08-hotels-nonlinear/ch08-hotels-nonlinear-reg.R: -------------------------------------------------------------------------------- 1 | ######################################################################################### 2 | # Prepared for Gabor's Data Analysis 3 | # 4 | # Data Analysis for Business, Economics, and Policy 5 | # by Gabor Bekes and Gabor Kezdi 6 | # Cambridge University Press 2021 7 | # 8 | # gabors-data-analysis.com 9 | # 10 | # License: Free to share, modify and use for educational purposes. 11 | # Not to be used for commercial purposes. 12 | 13 | # Chapter 08 14 | # CH08A Finding a good deal among hotels with nonlinear function 15 | # using the hotels-vienna dataset 16 | # version 0.9 2020-09-07 17 | ######################################################################################### 18 | 19 | # ------------------------------------------------------------------------------------------------------ 20 | #### SET UP 21 | # It is advised to start a new session for every case study 22 | # CLEAR MEMORY 23 | rm(list=ls()) 24 | 25 | 26 | # Import libraries 27 | library(haven) 28 | library(lspline) 29 | library(gridExtra) 30 | library(cowplot) 31 | library(scales) 32 | library(tidyverse) 33 | 34 | # set working directory 35 | # option A: open material as project 36 | # option B: set working directory for da_case_studies 37 | # example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/") 38 | 39 | # set data dir, data used 40 | source("set-data-directory.R") # data_dir must be first defined 41 | # alternative: give full path here, 42 | # example data_dir="C:/Users/bekes.gabor/Dropbox (MTA 
KRTK)/bekes_kezdi_textbook/da_data_repo" 43 | 44 | # load theme and functions 45 | source("ch00-tech-prep/theme_bg.R") 46 | source("ch00-tech-prep/da_helper_functions.R") 47 | 48 | data_in <- paste(data_dir,"hotels-vienna","clean/", sep = "/") 49 | use_case_dir <- "ch08-hotels-nonlinear/" 50 | 51 | data_out <- use_case_dir 52 | output <- paste0(use_case_dir,"output/") 53 | create_output_if_doesnt_exist(output) 54 | 55 | 56 | # ------------------------------------------------------------------------------------------------------ 57 | 58 | # load vienna 59 | hotels <- read_csv(paste0(data_in,"hotels-vienna.csv")) 60 | # df <- read_csv("https://osf.io/y6jvb/download") 61 | # ------------------------------------------------------------------------------------------------------ 62 | ####SAMPLE SELECTION 63 | # Apply filters: 3-4 stars, Vienna actual, without extreme value 64 | hotels <- hotels %>% filter(accommodation_type=="Hotel") %>% 65 | filter(city_actual=="Vienna") %>% 66 | filter(stars>=3 & stars<=4) %>% filter(!is.na(stars)) %>% 67 | filter(price<=600) 68 | 69 | 70 | ############################# 71 | #LOG MODELS 72 | ############################ 73 | 74 | # TAKE LOG PRICE 75 | hotels$lnprice <- log(hotels$price) 76 | 77 | hotels$distance2<-hotels$distance 78 | hotels$distance2[hotels$distance2<0.05] <- 0.05 79 | 80 | hotels$lndistance<-log(hotels$distance2) 81 | 82 | 83 | # describe price and ln price 84 | summary(hotels$price) 85 | summary(hotels$lnprice) 86 | 87 | 88 | # REGRESSION 89 | reg1 <- lm(price ~ distance, data=hotels) 90 | summary(reg1) 91 | reg2 <- lm(price ~ lndistance, data=hotels) 92 | summary(reg2) 93 | reg3 <- lm(lnprice ~ distance, data=hotels) 94 | summary(reg3) 95 | reg4 <- lm(lnprice ~ lndistance, data=hotels) 96 | summary(reg4) 97 | 98 | 99 | ############### 100 | # FIGURES 8.1 101 | 102 | # LEVEL-LEVEL LINEAR REGRESSION 103 | F08_1a <- ggplot(data = hotels, aes(x = distance, y = price)) + 104 | geom_point_da() + 105 | 
geom_smooth_da(method = "lm")+ 106 | expand_limits(x = 0.01, y = 0.01) + 107 | scale_x_continuous(expand = c(0.01,0.01), limits=c(0, 7), breaks=seq(0, 7, by=1)) + 108 | scale_y_continuous(expand = c(0.01,0.01), limits = c(0, 400), breaks = seq(0, 400, by = 50)) + 109 | labs(x = "Distance to city center (miles)",y = "Price (US dollars)")+ 110 | theme_bg() 111 | F08_1a 112 | save_fig("ch08-figure-1a-hotel-levlev", output, "small") 113 | 114 | 115 | # LOG-LEVEL LINEAR REGRESSION 116 | F08_1b <- ggplot(data = hotels, aes(x = distance, y = lnprice)) + 117 | geom_point_da() + 118 | geom_smooth_da(method = "lm")+ 119 | expand_limits(x = 0.01, y = 0.01) + 120 | scale_x_continuous(expand = c(0.01,0.01), limits=c(0, 7), breaks=seq(0, 7, by=1)) + 121 | scale_y_continuous(expand = c(0.01,0.01), limits = c(3.5, 6), breaks = seq(3.5, 6, by = 0.50)) + 122 | labs(x = "Distance to city center (miles)",y = "ln(price, US dollars)")+ 123 | theme_bg() 124 | F08_1b 125 | save_fig("ch08-figure-1b-hotel-loglev", output, "small") 126 | 127 | 128 | 129 | # LEVEL-LOG LINEAR REGRESSION 130 | F08_1c <- ggplot(data = hotels, aes(x = lndistance, y = price)) + 131 | geom_point_da() + 132 | geom_smooth_da(method = "lm")+ 133 | expand_limits(x = 0.01, y = 0.01) + 134 | scale_y_continuous(expand = c(0.01,0.01), limits = c(0, 400), breaks = seq(0, 400, by = 50)) + 135 | labs(x = "ln(distance to city center, miles)",y = "Price (US dollars)")+ 136 | theme_bg() 137 | F08_1c 138 | save_fig("ch08-figure-2a-hotel-levlog", output, "small") 139 | 140 | # LOG-LOG LINEAR REGRESSION 141 | F08_1d <- ggplot(data = hotels, aes(x = lndistance, y = lnprice)) + 142 | geom_point_da() + 143 | geom_smooth_da(method = "lm")+ 144 | #scale_x_continuous(limits=c(-2.5, 2), breaks=seq(-2.5, 2, by=0.5)) + 145 | expand_limits(x = 0.01, y = 0.01) + 146 | scale_y_continuous(expand = c(0.01,0.01), limits = c(3.5, 6), breaks = seq(3.5, 6, by = 0.50)) + 147 | labs(x = "ln(distance to city center, miles)",y = "ln(price, US 
dollars)")+ 148 | theme_bg() 149 | F08_1d 150 | save_fig("ch08-figure-2b-hotel-loglog", output, "small") 151 | -------------------------------------------------------------------------------- /ch08-hotels-nonlinear/ch08-hotels-nonlinear-reg.do: -------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 08 14 | * CH0bA Finding a good deal among hotels with nonlinear function 15 | * using the hotels-vienna dataset 16 | * version 0.9 2020-09-06 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 
23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | 27 | * STEP 2: * Directory for data 28 | * Option 1: run directory-setting do file 29 | do set-data-directory.do 30 | /* this is a one-line do file that should sit in 31 | the working directory you have just set up 32 | this do file has a global definition of your working directory 33 | more details: gabors-data-analysis.com/howto-stata/ */ 34 | 35 | * Option 2: set directory directly here 36 | * for example: 37 | * global data_dir "C:/Users/xy/gabors_data_analysis/da_data_repo" 38 | 39 | 40 | global data_in "$data_dir/hotels-vienna/clean" 41 | global work "ch08-hotels-nonlinear" 42 | 43 | cap mkdir "$work/output" 44 | global output "$work/output" 45 | 46 | 47 | 48 | 49 | 50 | * load in clean and tidy data and create workfile 51 | use "$data_in/hotels-vienna.dta", clear 52 | 53 | * Or download directly from OSF: 54 | 55 | /* 56 | copy "https://osf.io/download/dn8je/" "workfile.dta" 57 | use "workfile.dta", clear 58 | erase "workfile.dta" 59 | */ 60 | 61 | *** SAMPLE SELECTION 62 | 63 | *** 3 to 4-star hotels (incl 3.5 stars) 64 | keep if stars>=3 & stars<=4 65 | keep if accommodation_type=="Hotel" 66 | label var distance "Distance to city center, miles" 67 | drop if price>600 /* likely error */ 68 | 69 | 70 | *** drop hotels not really in Vienna 71 | tab city_actual 72 | keep if city_actual=="Vienna" 73 | 74 | * save work file 75 | save "$work/hotels_work.dta", replace 76 | 77 | 78 | 79 | 80 | * Fig 8.1 81 | *** SCATTERPLOT + REGRESSION LINE 82 | 83 | 84 | gen lnprice=ln(price) 85 | lab var lnprice "ln(price)" 86 | count 87 | count if distance==0 88 | gen lndistance=ln(distance) 89 | replace lndistance = ln(distance+0.05) if distance==0 90 | lab var lndistance "ln(distance to city center)" 91 | 92 | 93 | * run and compare regressions 94 | reg price distance, r 95 | outreg2 using "$output/T08_reg1.tex", label bdec(2) tex(frag) nose noaster replace 96 | reg lnprice
distance, r 97 | outreg2 using "$output/T08_reg1.tex", label bdec(2) tex(frag) nose noaster append 98 | reg price lndistance, r 99 | outreg2 using "$output/T08_reg1.tex", label bdec(2) tex(frag) nose noaster append 100 | reg lnprice lndistance 101 | outreg2 using "$output/T08_reg1.tex", label bdec(2) tex(frag) nose noaster append 102 | 103 | * create graphs 104 | scatter price distance , /// 105 | ms(O) msize(small) mlw(thick) mcolor(navy*0.6) /// 106 | xlab(0(1)7, grid) ylab(000(50)400, grid) /// 107 | xtitle("Distance to city center (miles)") /// 108 | ytitle("Hotel price(US dollars)") /// 109 | || lfit price distance, lw(thick) lc(green) legend(off) /// 110 | graphregion(fcolor(white) ifcolor(none)) /// 111 | plotregion(fcolor(white) ifcolor(white)) 112 | graph export "$output/ch08-figure-1a-hotel-levlev-Stata", as(png) replace 113 | 114 | 115 | scatter lnprice distance , /// 116 | ms(O) msize(small) mlw(thick) mcolor(navy*0.6) /// 117 | xlab(0(1)7, grid) ylab(3.5(0.50)6, grid) /// 118 | xtitle("Distance to city center (miles)") /// 119 | ytitle("ln(hotel price in US dollars)") /// 120 | || lfit lnprice distance, lw(thick) lc(dkgreen) legend(off) /// 121 | graphregion(fcolor(white) ifcolor(none)) /// 122 | plotregion(fcolor(white) ifcolor(white)) 123 | graph export "$output/ch08-figure-1b-hotel-loglev-Stata", as(png) replace 124 | 125 | scatter price lndistance , /// 126 | ms(O) msize(small) mlw(thick) mcolor(navy*0.6) /// 127 | xlab(-2.5(0.5)2, grid) ylab(000(50)400, grid) /// 128 | xtitle("ln(distance to city center, miles)") /// 129 | ytitle("Hotel price (US dollars) ") /// 130 | || lfit price lndistance, lw(thick) lc(dkgreen) legend(off) /// 131 | graphregion(fcolor(white) ifcolor(none)) /// 132 | plotregion(fcolor(white) ifcolor(white)) 133 | graph export "$output/ch08-figure-1c-hotel-levlog-Stata", as(png) replace 134 | 135 | 136 | scatter lnprice lndistance , /// 137 | ms(O) msize(small) mlw(thick) mcolor(navy*0.6) /// 138 | xlab(-3(0.5)2, grid) 
ylab(3.5(0.50)6, grid) /// 139 | xtitle("ln(distance to city center, miles)") /// 140 | ytitle("ln(hotel price in US dollars)") /// 141 | || lfit lnprice lndistance, lw(thick) lc(dkgreen) legend(off) /// 142 | graphregion(fcolor(white) ifcolor(none)) /// 143 | plotregion(fcolor(white) ifcolor(white)) 144 | graph export "$output/ch08-figure-1d-hotel-loglog-Stata", as(png) replace 145 | -------------------------------------------------------------------------------- /ch09-hotels-europe-stability/ch09-hotels-externalvalid.do: -------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 09 14 | * CH09B How stable is the hotel price - distance to city center relathionship? 15 | * using the hotels-europe dataset 16 | * version 0.9 2020-09-06 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 
23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | * STEP 2: * Directory for data 27 | * Option 1: run directory-setting do file 28 | do set-data-directory.do 29 | /* this is a one-line do file that should sit in 30 | the working directory you have just set up 31 | this do file has a global definition of your working directory 32 | more details: gabors-data-analysis.com/howto-stata/ */ 33 | 34 | * Option 2: set directory directly here 35 | * for example: 36 | * global data_dir "C:/Users/xy/gabors_data_analysis/da_data_repo" 37 | 38 | 39 | global data_in "$data_dir/hotels-europe/clean" 40 | global work "ch09-hotels-europe-stability" 41 | 42 | cap mkdir "$work/output" 43 | global output "$work/output" 44 | 45 | 46 | 47 | 48 | * Vienna vs London 49 | 50 | * load in clean and tidy data and create workfile 51 | use "$data_in/hotels-europe_price", clear 52 | merge m:m hotel_id using "$data_in/hotels-europe_features.dta" 53 | 54 | * Or download directly from OSF: 55 | /* 56 | copy "https://osf.io/download/hz4gw/" "workfile.dta" 57 | use "workfile.dta", clear 58 | erase "workfile.dta" 59 | preserve 60 | copy "https://osf.io/download/j9mkf/" "workfile.dta" 61 | use "workfile.dta", clear 62 | erase "workfile.dta" 63 | tempfile hotels_features 64 | save `hotels_features' 65 | restore 66 | merge m:m hotel_id using `hotels_features', nogen 67 | */ 68 | 69 | drop _m 70 | label var distance "Distance to city center, miles" 71 | 72 | * filter a few cities 73 | * keep if city_actual==city 74 | keep if inlist(city_actual, "Vienna", "Amsterdam", "Barcelona") 75 | keep if inlist(accommodation_type, "Hotel", "Apartment") 76 | 77 | * drop long stay , 1000E+ 78 | drop if nnights==4 79 | drop if price>1000 80 | 81 | * check for duplicates 82 | duplicates report 83 | duplicates drop 84 | 85 | 86 | gen date="" 87 | replace date = "2017-NOV-weekday" if month==11 & weekend==0 88 | replace date = "2017-NOV-weekend" if month==11 & weekend==1 89 | 
replace date = "2017-DEC-holiday" if month==12 & holiday==1 90 | replace date = "2018-JUNE-weekend" if month==6 & weekend==1 91 | drop if date=="" 92 | 93 | count 94 | 95 | tab city 96 | tab accommodation_type city 97 | tab date 98 | 99 | 100 | gen lnprice=ln(price) 101 | lab var lnprice "ln(Price)" 102 | 103 | keep hotel_id date city accommodation_type stars rating distance price lnprice 104 | 105 | * save work file 106 | saveold "$work/hotels_work.dta", replace 107 | 108 | 109 | ********************************************************** 110 | * External validity by time 111 | use "$work/hotels_work.dta",replace 112 | keep if stars>=3 & stars<=4 113 | keep if accommodation_type=="Hotel" 114 | keep if city=="Vienna" 115 | tab date 116 | 117 | tabstat distance , s(min max p50 mean n) by(date) 118 | tabstat price, s(min max p50 mean n) by(date) format(%4.1f) 119 | tabstat lnprice, s(min max p50 mean n) by(city) format(%4.1f) 120 | 121 | mkspline dist_0_2 2 dist_2_7 = distance 122 | 123 | 124 | *** Regressions with three dates for textbook 125 | * original regression 126 | reg lnprice dist_0_2 dist_2_7 if date=="2017-NOV-weekday", robust 127 | outreg2 using "$output/hotels_extval_time1", se 2aster bdec(2) ctitle("2017-NOV-weekday") tex(frag) nonotes replace 128 | * other dates 129 | foreach d in "2017-NOV-weekend" "2017-DEC-holiday" "2018-JUNE-weekend" { 130 | reg lnprice dist_0_2 dist_2_7 if date=="`d'", robust 131 | outreg2 using "$output/hotels_extval_time1", se 2aster bdec(2) ctitle("`d'") tex(frag) nonotes append 132 | } 133 | 134 | 135 | ** same with hotels restricted to be the same 136 | * first create variable that counts the number of times a hotel is in the data 137 | egen hotelcount = count(price), by(hotel_id) 138 | tab hotelcount 139 | keep if hotelcount==4 140 | * original regression 141 | reg lnprice dist_0_2 dist_2_7 if date=="2017-NOV-weekday", robust 142 | outreg2 using "$output/hotels_extval_time2", se 2aster bdec(2) ctitle("2017-NOV-weekday") 
tex(frag) nonotes replace 143 | * other dates 144 | foreach d in "2017-NOV-weekend" "2017-DEC-holiday" "2018-JUNE-weekend" { 145 | reg lnprice dist_0_2 dist_2_7 if date=="`d'", robust 146 | outreg2 using "$output/hotels_extval_time2", se 2aster bdec(2) ctitle("`d'") tex(frag) nonotes append 147 | } 148 | 149 | * check interaction term p value 150 | keep if date=="2017-NOV-weekday" | date=="2017-NOV-weekend" 151 | gen we= date=="2017-NOV-weekend" 152 | tab we 153 | reg lnprice we c.dist_0_2##we c.dist_2_7##we , robust 154 | 155 | 156 | ********************************************************** 157 | * External validity by city 158 | use "$work/hotels_work.dta",replace 159 | keep if stars>=3 & stars<=4 160 | keep if accommodation_type=="Hotel" 161 | keep if date=="2017-NOV-weekday" 162 | 163 | tabstat distance , s(min max p50 mean n) by(city) 164 | tabstat price, s(min max p50 mean n) by(city) format(%4.1f) 165 | tabstat lnprice, s(min max p50 mean n) by(city) format(%4.1f) 166 | 167 | mkspline dist_0_2 2 dist_2_7 = distance 168 | 169 | 170 | *** Regressions for three cities 171 | * original regression: Vienna 172 | reg lnprice dist_0_2 dist_2_7 if city=="Vienna" , r 173 | outreg2 using "$output/hotels_extval_city", se 2aster bdec(2) ctitle("Vienna") tex(frag) nonotes replace 174 | * two other cities 175 | foreach c in Amsterdam Barcelona{ 176 | reg lnprice dist_0_2 dist_2_7 if city=="`c'" , r 177 | outreg2 using "$output/hotels_extval_city", se 2aster bdec(2) ctitle("`c'") tex(frag) nonotes append 178 | } 179 | 180 | 181 | 182 | ********************************************************** 183 | * External validity by accommodation type: hotels vs apartments 184 | use "$work/hotels_work.dta",replace 185 | keep if city=="Vienna" 186 | keep if date=="2017-NOV-weekday" 187 | keep if stars>=3 & stars<=4 188 | tab accommodation_type stars 189 | 190 | 191 | 192 | tabstat distance , s(min max p50 mean n) by(stars) 193 | tabstat price, s(min max p50 mean n) by(stars) 
format(%4.1f) 194 | tabstat lnprice, s(min max p50 mean n) by(stars) format(%4.1f) 195 | 196 | mkspline dist_0_2 2 dist_2_7 = distance 197 | 198 | 199 | reg lnprice dist_0_2 dist_2_7 if accommodation=="Hotel" 200 | outreg2 using "$output/hotels_extval_type", se 2aster bdec(2) ctitle("Hotels") tex(frag) nonotes replace 201 | reg lnprice dist_0_2 dist_2_7 if accommodation=="Apartment" 202 | outreg2 using "$output/hotels_extval_type", se 2aster bdec(2) ctitle("Apartments") tex(frag) nonotes append 203 | 204 | 205 | 206 | -------------------------------------------------------------------------------- /ch10-hotels-multiple-reg/ch10-hotels-multiple-reg.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | # Prepared for the textbook: 3 | # Data Analysis for Business, Economics, and Policy 4 | # by Gabor BEKES and Gabor KEZDI 5 | # Cambridge University Press 2021 6 | # 7 | # License: Free to share, modify and use for educational purposes. Not to be used for business purposes. 
8 | # 9 | ###############################################################################################x 10 | 11 | # CHAPTER 10 12 | # CH10B Finding a good deal among hotels with multiple regression 13 | # version 0.9 2020-08-31 14 | 15 | 16 | # ------------------------------------------------------------------------------------------------------ 17 | #### SET UP 18 | # It is advised to start a new session for every case study 19 | # CLEAR MEMORY 20 | rm(list=ls()) 21 | 22 | # Import libraries 23 | library(tidyverse) 24 | library(stargazer) 25 | library(haven) 26 | library(scales) 27 | library(lspline) 28 | 29 | 30 | 31 | # set working directory 32 | # option A: open material as project 33 | # option B: set working directory for da_case_studies 34 | # example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/") 35 | 36 | # set data dir, data used 37 | source("set-data-directory.R") # data_dir must be first defined 38 | # alternative: give full path here, 39 | # example data_dir="C:/Users/bekes.gabor/Dropbox (MTA KRTK)/bekes_kezdi_textbook/da_data_repo" 40 | 41 | # load theme and functions 42 | source("ch00-tech-prep/theme_bg.R") 43 | source("ch00-tech-prep/da_helper_functions.R") 44 | options(digits = 3) 45 | 46 | data_in <- paste(data_dir,"hotels-vienna","clean/", sep = "/") 47 | use_case_dir <- "ch10-hotels-multiple-reg/" 48 | 49 | data_out <- use_case_dir 50 | output <- paste0(use_case_dir,"output/") 51 | create_output_if_doesnt_exist(output) 52 | 53 | 54 | ##################################################################### 55 | 56 | 57 | # load vienna 58 | hotels <- read_csv(paste0(data_in,"hotels-vienna.csv")) 59 | # From web 60 | # hotels <- read_csv( "https://osf.io/y6jvb/download" ) 61 | 62 | # ------------------------------------------------------------------------------------------------------ 63 | ####SAMPLE SELECTION 64 | # Apply filters: 3-4 stars, Vienna actual, without extreme value 65 | hotels <- hotels %>% 
filter(accommodation_type=="Hotel") %>% 66 | filter(city_actual=="Vienna") %>% 67 | filter(stars>=3 & stars<=4) %>% filter(!is.na(stars)) %>% 68 | filter(price<=600) 69 | 70 | 71 | 72 | 73 | 74 | ##################################################################### 75 | # TAKE LOG 76 | hotels$lnprice <- log(hotels$price) 77 | 78 | hotels$distance2<-hotels$distance 79 | hotels$distance2[hotels$distance2<0.05] <- 0.05 80 | 81 | hotels$lndistance<-log(hotels$distance2) 82 | 83 | 84 | 85 | # Stars: binary indicators 86 | hotels$star35 = ifelse(hotels$stars==3.5, 1, 0) 87 | hotels$star4 = ifelse(hotels$stars==4, 1, 0) 88 | 89 | 90 | summary(hotels$price) 91 | summary(hotels$distance) 92 | summary(hotels$lnprice) 93 | 94 | 95 | ##################################################################### 96 | # Regressions 97 | ##################################################################### 98 | # Basic 99 | reg0 <- lm(lnprice ~ rating, data=hotels) 100 | 101 | reg1 <- lm(lnprice ~ distance, data=hotels) 102 | 103 | reg2 <- lm(lnprice ~ distance + rating, data=hotels) 104 | 105 | # _r robust errors in stargazer 106 | stargazer_r(list(reg0, reg1, reg2), se = 'robust', digits=3, out=paste(output,"T10_hotels_R.html",sep="")) 107 | 108 | # more complex models 109 | # Predicted values 110 | reg3 <- lm(lnprice ~ lspline(distance, c(1,4)) + lspline(rating, 3.5) + star35 + star4, data=hotels) 111 | summary(reg3, vcov=sandwich) 112 | hotels$lnprice_hat <- predict(reg3) 113 | hotels$lnprice_resid <- hotels$lnprice - hotels$lnprice_hat 114 | hotels$bestdeals <- ifelse(hotels$lnprice_resid %in% tail(sort(hotels$lnprice_resid, decreasing=TRUE),5),TRUE,FALSE) 115 | 116 | # Compare R-sqared with distance only 117 | reg4 <- lm(lnprice ~ lspline(distance, c(1,4)), data=hotels) 118 | summary(reg4) 119 | 120 | stargazer_r(list(reg1, reg2, reg3, reg4), se = 'robust', digits=3, out=paste(output,"T10_hotels2_R.tex",sep="")) 121 | 122 | 123 | # List of 5 best deals 124 | hotels %>% 125 | 
dplyr::select(hotel_id, price, lnprice_resid, distance, stars, rating) %>% 126 | arrange(lnprice_resid) %>% 127 | .[1:5,] %>% 128 | as.data.frame() %>% 129 | stargazer(summary= FALSE, digits = 1, out = paste(output,"T10_hotels_best_deals.tex",sep="")) 130 | 131 | # y - yhat graph 132 | y_yhat_hotels<- ggplot(data = hotels, aes(x = lnprice_hat, y = lnprice)) + 133 | geom_point(aes(color=bestdeals,shape=bestdeals), size = 1.2, fill=color[3], alpha = 0.8, show.legend=F, na.rm = TRUE) + 134 | #geom_smooth_da(method="lm") + 135 | geom_segment(aes(x = 3.8, y = 3.8, xend = 6, yend =6), size=0.8, color=color[2], linetype=2) + 136 | labs(x = "ln(predicted price, US dollars) ",y = "ln(price, US dollars)")+ 137 | coord_cartesian(xlim = c(3.8, 6), ylim = c(3.8, 6)) + 138 | scale_colour_manual(name='',values=c(color[1],'black')) + 139 | scale_shape_manual(name='',values=c(16,21)) + 140 | geom_segment(aes(x = 4.8, y = 3.9, xend = 4.68, yend = 4.05), arrow = arrow(length = unit(0.1, "cm")))+ 141 | annotate("text", x = 4.93, y = 3.9, label = "Best deal", size=2.5)+ 142 | theme_bg() + 143 | theme(axis.text.x=element_text(size=9)) + 144 | theme(axis.text.y=element_text(size=9)) + 145 | theme(axis.title.x=element_text(size=9)) + 146 | theme(axis.title.y=element_text(size=9)) 147 | y_yhat_hotels 148 | save_fig("ch10-figure-3-hotels-yhat-y", output, "large") 149 | 150 | 151 | # residual - yhat graph (not in book) 152 | ggplot(data = hotels, aes(x = lnprice_hat, y = lnprice_resid)) + 153 | geom_point(color = color[1], size = 1, shape = 16, alpha = 0.6, show.legend=F, na.rm = TRUE) + 154 | geom_smooth(method="lm", colour=color[4], se=F, size=1) + 155 | labs(x = "ln(Predicted hotel price, US dollars)",y = "Residuals")+ 156 | coord_cartesian(xlim = c(4, 5.5)) + 157 | theme_bg() + 158 | background_grid(major = "xy", minor="xy", size.major = 0.2) 159 | 160 | 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- 
/ch10-hotels-multiple-reg/ch10-hotels-multiple-reg.do: -------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 10 14 | * CH10A Finding a good deal among hotels with multiple regression 15 | * using the hotels-vienna dataset 16 | * version 0.9 2020-09-06 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | * STEP 2: * Directory for data 27 | * Option 1: run directory-setting do file 28 | do set-data-directory.do 29 | /* this is a one-line do file that should sit in 30 | the working directory you have just set up 31 | this do file has a global definition of your working directory 32 | more details: gabors-data-analysis.com/howto-stata/ */ 33 | 34 | * Option 2: set directory directly here 35 | * for example: 36 | * global data_dir "C:/Users/xy/gabors_data_analysis/da_data_repo" 37 | 38 | 39 | global data_in "$data_dir/hotels-vienna/clean" 40 | global work "ch10-hotels-multiple-reg" 41 | 42 | cap mkdir "$work/output" 43 | global output "$work/output" 44 | 45 | 46 | 47 | 48 | * add user written library 49 | ssc install listtex 50 | 51 | ********************************************************************* 52 | *** LOAD and PREP DATA 53 | 54 | 55 | 56 | * load in clean and tidy data and create workfile 57 | use "$data_in/hotels-vienna.dta", clear 58 | 59 | * Or download directly from OSF: 60 | /* 61 | 
copy "https://osf.io/download/dn8je/" "workfile.dta" 62 | use "workfile.dta", clear 63 | erase "workfile.dta" 64 | */ 65 | 66 | 67 | 68 | *** SAMPLE SELECTION 69 | 70 | *** 3 to 4-star hotels (incl 3.5 stars) 71 | keep if stars>=3 & stars<=4 72 | keep if accommodation_type=="Hotel" 73 | label var distance "Distance to city center, miles" 74 | drop if price>600 /* likely error */ 75 | 76 | 77 | *** drop hotels not really in Vienna 78 | tab city_actual 79 | keep if city_actual=="Vienna" 80 | 81 | *** take log price 82 | gen lnprice=ln(price) 83 | lab var lnprice "ln(Price)" 84 | 85 | *** piecewise linear spline of distance 86 | mkspline distsp1 1 distsp2 4 distsp3 = distance 87 | 88 | *** piecewise linear spline of rating 89 | mkspline ratingsp1 3.5 ratingsp2 = rating 90 | 91 | *** stars: binary indicators 92 | gen star35 = stars==3.5 93 | gen star4 = stars==4 94 | 95 | tabstat price distance lnprice, s(mean sd min p25 p50 p75 max n) col(s) 96 | 97 | 98 | ********************************************************************* 99 | *regressions 100 | ********************************************************************* 101 | 102 | reg lnprice distance, robust 103 | outreg2 using "$output/T10_hotels.tex", tex(frag) excel bdec(3) replace 104 | reg lnprice rating, robust 105 | outreg2 using "$output/T10_hotels.tex", tex(frag) excel bdec(3) append 106 | reg lnprice distance rating, robust 107 | outreg2 using "$output/T10_hotels.tex", tex(frag) excel bdec(2) append 108 | reg distance rating, robust 109 | outreg2 using "$output/T10_hotels.tex", tex(frag) excel bdec(2) append 110 | 111 | 112 | 113 | 114 | * basic 115 | reg lnprice distance rating, robust 116 | 117 | 118 | * predicted values 119 | reg lnprice distsp1 distsp2 distsp3 star35 star4 ratingsp1 ratingsp2, robust 120 | predict lnprice_hat 121 | predict lnprice_resid, resid 122 | 123 | * compare R-squared with distance only 124 | reg lnprice distsp1 distsp2 distsp3,r 125 | 126 | * list of 5 best deals 127 | sort lnprice_resid 
128 | 129 | format lnprice_resid %5.3f 130 | format distance %3.1f 131 | list hotel_id price lnprice_resid distance stars rating if _n<=5 132 | * outputing the list in a LaTex format 133 | listtex hotel_id price lnprice_resid distance stars rating /// 134 | using "$output\ch10-table-6-hotels-good-deals-Stata.tex" if _n<=5, replace /// 135 | headlines( "\begin{tabular}{l c c c c c})" /// 136 | \hline "Hotel name & price & residual in ln(price) & distance & stars & rating \\" \hline) /// 137 | footlines(\hline \end{tabular}) rstyle(tabular) 138 | /* the tabular style makes the table LaTex friendly */ 139 | /* headlines defines the way the table layed out */ 140 | 141 | * yhat - y graph 142 | * two scatterplot commants, one for best 5 deals, one for rest 143 | * Figure 10.3 144 | sort lnprice_resid 145 | scatter lnprice lnprice_hat if _n>5, ms(O) mc(navy*0.6) /// 146 | || scatter lnprice lnprice_hat if _n<=5, ms(O) mc(black) /// 147 | || line lnprice lnprice, sort lw(thick) lc(green*0.6) lp(dash) /// 148 | graphregion(fcolor(white) ifcolor(none)) /// 149 | plotregion(fcolor(white) ifcolor(white)) /// 150 | xlab(3.75(0.25)6, grid) ylab(3.75(0.25)6, grid) legend(off) /// 151 | ytitle("ln(price, US dollars)") xtitle("predicted ln(price, US dollars)") /// 152 | text(4.1 4.84 "Best deal") 153 | graph export "$output\ch10-figure-3-hitels-yhat-y.png", replace 154 | 155 | -------------------------------------------------------------------------------- /ch11-australia-rainfall-predict/ch11-australia-rainfall-predict.R: -------------------------------------------------------------------------------- 1 | ######################################################################################### 2 | # Prepared for Gabor's Data Analysis 3 | # 4 | # Data Analysis for Business, Economics, and Policy 5 | # by Gabor Bekes and Gabor Kezdi 6 | # Cambridge University Press 2021 7 | # 8 | # gabors-data-analysis.com 9 | # 10 | # License: Free to share, modify and use for educational 
purposes. 11 | # Not to be used for commercial purposes. 12 | 13 | # Chapter 11 14 | # CH11B Are Australian weather forecasts well calibrated? 15 | # using the australia-weather-forecasts dataset 16 | # version 0.9 2020-09-08 17 | ######################################################################################### 18 | 19 | 20 | 21 | # ------------------------------------------------------------------------------------------------------ 22 | #### SET UP 23 | # It is advised to start a new session for every case study 24 | # CLEAR MEMORY 25 | rm(list=ls()) 26 | 27 | 28 | # Import libraries 29 | 30 | 31 | 32 | # set working directory 33 | # option A: open material as project 34 | # option B: set working directory for da_case_studies 35 | # example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/") 36 | 37 | # set data dir, data used 38 | source("set-data-directory.R") # data_dir must be first defined 39 | # alternative: give full path here, 40 | # example data_dir="C:/Users/bekes.gabor/Dropbox (MTA KRTK)/bekes_kezdi_textbook/da_data_repo" 41 | 42 | # load theme and functions 43 | source("ch00-tech-prep/theme_bg.R") 44 | source("ch00-tech-prep/da_helper_functions.R") 45 | 46 | data_in <- paste(data_dir,"australia-weather-forecasts","clean/", sep = "/") 47 | use_case_dir <- "ch11-australia-rainfall-predict/" 48 | 49 | data_out <- use_case_dir 50 | output <- paste0(use_case_dir,"output/") 51 | create_output_if_doesnt_exist(output) 52 | 53 | 54 | #----------------------------------------------------------------------------------------- 55 | 56 | 57 | data <- read.csv(paste(data_in, "rainfall_australia.csv", sep = "/")) %>% 58 | filter(station_name=="DARWIN AIRPORT") 59 | 60 | #data <- read.csv("https://osf.io/download/kdva8/") %>% 61 | # filter(station_name=="DARWIN AIRPORT") 62 | 63 | 64 | data <- data %>% 65 | filter(bd_FC_Before_Start == 39) %>% 66 | mutate( 67 | rain_prob_fc=prob/100, 68 | ) 69 | 70 | # replace bin = bin+0.05 71 | 72 | 
create_calibration_plot(data, 73 | file_name = "ch11-figure-6-weather-calib", 74 | prob_var = "rain_prob_fc", 75 | actual_var = "daily_sum", 76 | breaks=c(0, 0.005, 0.1,0.2,0.3,0.4,0.5,0.6, 0.7, 0.8, 0.9)) 77 | 78 | Hmisc::describe(data$rain_prob_fc) 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /ch11-australia-rainfall-predict/ch11-australia-rainfall-predict.do: -------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 11 14 | * CH11B Are Australian weather forecasts well calibrated? 15 | * using the australia-weather-forecasts dataset 16 | * version 0.9 2020-09-06 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 
23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | * STEP 2: * Directory for data 27 | * Option 1: run directory-setting do file 28 | do set-data-directory.do 29 | /* this is a one-line do file that should sit in 30 | the working directory you have just set up 31 | this do file has a global definition of your working directory 32 | more details: gabors-data-analysis.com/howto-stata/ */ 33 | 34 | * Option 2: set directory directly here 35 | * for example: 36 | * global data_dir "C:/Users/xy/gabors_data_analysis/da_data_repo" 37 | 38 | 39 | global data_in "$data_dir/australia-weather-forecasts/clean" 40 | global work "ch11-australia-rainfall-predict" 41 | 42 | cap mkdir "$work/output" 43 | global output "$work/output" 44 | 45 | 46 | 47 | 48 | clear 49 | import delimited "$data_in/rainfall_australia.csv" 50 | 51 | * Or download directly from OSF: 52 | /* 53 | copy "https://osf.io/download/kdva8/" "workfile.csv" 54 | import delimited "workfile.csv" 55 | erase "workfile.csv" 56 | */ 57 | 58 | *drop if station_name =="MARREE AERO" 59 | keep if station_name=="DARWIN AIRPORT" 60 | compress 61 | 62 | tab bd_fc_before_start 63 | keep if bd_fc_before_start == 39 64 | gen pred_time="2-day forecast" 65 | 66 | * Darwin well calibrated overall 67 | su prob daily_sum 68 | gen rain_prob_fc=prob/100 69 | gen rain=daily_sum 70 | 71 | egen bin = cut(rain_prob_fc), at(0(0.1)1) 72 | replace bin = bin+0.05 73 | format bin %3.2f 74 | replace bin=0 if rain_prob_fc==0 75 | tabstat rain_prob_fc, by(bin) s(min max n) 76 | 77 | format bin %3.1f 78 | collapse rain rain_prob_fc, by(bin) 79 | 80 | scatter rain bin bin, connect(l l) lw(thick medium) lc(navy*0.8 green*0.6) /// 81 | mcolor(navy*0.8) msize(medium) msymbol(O none) /// 82 | xla(0(0.1)1, grid) yla(0(0.1)1, grid) /// 83 | xtitle("Bins of predicted probaiblities") ytitle("Proportion rainy days") /// 84 | legend(off) /// 85 | graphregion(fcolor(white) ifcolor(none)) /// 86 | 
plotregion(fcolor(white) ifcolor(white)) 87 | graph export "$output\ch11-figure-6-weather-calib-Stata.png", replace 88 | 89 | -------------------------------------------------------------------------------- /ch12-time-series-simulations/ch12-randomwalk-serialcorr-simull.R: -------------------------------------------------------------------------------- 1 | ######################################################################################### 2 | # Prepared for Gabor's Data Analysis 3 | # 4 | # Data Analysis for Business, Economics, and Policy 5 | # by Gabor Bekes and Gabor Kezdi 6 | # Cambridge University Press 2021 7 | # 8 | # gabors-data-analysis.com 9 | # 10 | # License: Free to share, modify and use for educational purposes. 11 | # Not to be used for commercial purposes. 12 | 13 | # Chapter 12 14 | # CH12 Time series simulation 15 | # version 0.9 2020-09-09 16 | ######################################################################################### 17 | 18 | 19 | 20 | # ------------------------------------------------------------------------------------------------------ 21 | #### SET UP 22 | # It is advised to start a new session for every case study 23 | # CLEAR MEMORY 24 | rm(list=ls()) 25 | 26 | 27 | # Import libraries 28 | library(tidyverse) 29 | 30 | # set working directory 31 | # option A: open material as project 32 | # option B: set working directory for da_case_studies 33 | # example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/") 34 | 35 | # set data dir, data used 36 | source("set-data-directory.R") # data_dir must be first defined 37 | # alternative: give full path here, 38 | # example data_dir="C:/Users/bekes.gabor/Dropbox (MTA KRTK)/bekes_kezdi_textbook/da_data_repo" 39 | 40 | # load theme and functions 41 | source("ch00-tech-prep/theme_bg.R") 42 | source("ch00-tech-prep/da_helper_functions.R") 43 | 44 | use_case_dir <- "ch12-time-series-simulations/" 45 | 46 | data_out <- use_case_dir 47 | output <- 
paste0(use_case_dir,"output/") 48 | create_output_if_doesnt_exist(output) 49 | 50 | 51 | #----------------------------------------------------------------------------------------- 52 | # PART 1 53 | # Random walk simulation 54 | # Generate k random walks across time {0, 1, ... , T} 55 | 56 | 57 | # set parameters 58 | set.seed (10) 59 | T <- 100 # number of obs 60 | k <- 5 # nr of random walks generated 61 | initial.value <- 0 62 | 63 | # create a function 64 | GetRandomWalk <- function() { 65 | # Add a standard normal at each step 66 | initial.value + c(0, cumsum(rnorm(T))) 67 | } 68 | 69 | # Matrix of random walks 70 | values <- replicate(k, GetRandomWalk()) 71 | 72 | # visualize 73 | rws <- as.data.frame(values) 74 | rws <- rws %>% 75 | mutate(time=as.numeric(rownames(.))) 76 | 77 | rws <- rws %>% 78 | gather(var, value, V1:V5) 79 | 80 | rws_plot <- ggplot(rws,aes(time, value, color=var)) + 81 | geom_line (show.legend = FALSE, size =0.8) + 82 | theme_bg() + 83 | scale_color_manual(values = c(color[1], color[2], color[3], color[4], color[5])) + 84 | labs(x = "Time period", 85 | y = "Value of the simulated variable") + 86 | scale_x_continuous(expand = c(0.01,0.01), limits = c(0,100), breaks=seq(0,100,10)) 87 | rws_plot 88 | save_fig("ch12-figure-1-randomwalks", output, "small") 89 | 90 | 91 | #----------------------------------------------------------------------------------------- 92 | # PART 2 93 | # Serially correlated vs serially uncorrelated series 94 | # simulation exercises 95 | 96 | 97 | 98 | # rnorm(n, mean = 0, sd = 1) 99 | 100 | # serially uncorrelated series/white noise 101 | set.seed(2016) 102 | uncorr <- as.data.frame(ts(rnorm(100, mean=0, sd=1)) ) 103 | 104 | uncorr <- uncorr %>% 105 | mutate(t=as.numeric(rownames(.))) 106 | uncorr 107 | 108 | whitenoise <- ggplot(uncorr,aes(t, x)) + 109 | geom_line (show.legend = FALSE, size =0.6, color=color[1]) + 110 | geom_hline(yintercept=0, 111 | color = color[2], size=1)+ 112 | labs(x = "Time period", 113 | y 
= "Value of the simulated variable") + 114 | theme_bg() + 115 | scale_y_continuous(expand = c(0.01,0.01)) + 116 | scale_x_continuous(expand = c(0.01,0.01),breaks=seq(0,100,10)) 117 | whitenoise 118 | save_fig("ch12-figure-9a-serialcorr-whitenoise", output, "small") 119 | 120 | 121 | # serially correlated series, pho=0.8 122 | set.seed(2016) 123 | rho=0.8 124 | E <- rnorm(100, 0, 1) 125 | x <- numeric() 126 | x[1] <- E[1] 127 | for(i in 2:100) x[i] <- rho*x[i-1] + E[i] 128 | 129 | E <- as.data.frame(E) 130 | 131 | corr08 <- E %>% 132 | mutate(t=as.numeric(rownames(.))) 133 | 134 | corr08_graph <- ggplot(corr08,aes(t, x)) + 135 | geom_line (show.legend = FALSE, size =0.6, color=color[1]) + 136 | geom_hline(yintercept=0, 137 | color = color[2], size=1)+ 138 | labs(x = "Time period", 139 | y = "Value of the simulated variable") + 140 | theme_bg() + 141 | scale_y_continuous(expand = c(0.01,0.01)) + 142 | scale_x_continuous(expand = c(0.01,0.01),breaks=seq(0,100,10)) 143 | corr08_graph 144 | save_fig("ch12-figure-9b-serialcorr-corr08", output, "small") 145 | 146 | 147 | 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /ch12-time-series-simulations/ch12-randomwalk-simul.do: -------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 
12 | * 13 | * Chapter 12 14 | * Simulating random walk time series 15 | * no actual data used 16 | * version 0.9 2020-09-12 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * set working directory for da_case_studies. 23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | global data_in "$data_dir/hotels-vienna/clean" 27 | global work "ch12-time-series-simulations" 28 | 29 | cap mkdir "$work/output" 30 | global output "$work/output" 31 | 32 | 33 | clear 34 | set seed 201806 35 | set obs 100 36 | global sde=1 37 | global ystart=0 38 | 39 | gen t=_n 40 | tsset t 41 | 42 | forvalue i=1/5 { 43 | gen y`i' = $ystart if t==1 44 | replace y`i' = y`i'[_n-1] + rnormal(0,$sde) if t>1 45 | } 46 | 47 | tsline y1 y2 y3 y4 y5, xlab(, grid) ylab(,grid) legend(off) /// 48 | lw(thick thick thick thick thick) lp(solid dash shortdash longdash dash) /// 49 | lc(olive dkgreen blue red gs10) /// 50 | graphregion(fcolor(white) ifcolor(none)) 51 | graph export "$output\ch12-figure-1-randomwalks-Stata.png", replace 52 | 53 | -------------------------------------------------------------------------------- /ch12-time-series-simulations/ch12-serialcorr-simul.do: -------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 
12 | * 13 | * Chapter 12 14 | * Simulating time series with various levels of serial correlation 15 | * no actual data used 16 | * version 0.9 2020-09-12 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * set working directory for da_case_studies. 23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | global data_in "$data_dir/hotels-vienna/clean" 27 | global work "ch12-time-series-simulations" 28 | 29 | cap mkdir "$work/output" 30 | global output "$work/output" 31 | 32 | clear 33 | set seed 2016 34 | set obs 100 35 | global rho=0.8 36 | global sde=0.5 37 | 38 | gen t=_n 39 | tsset t 40 | 41 | ** serially uncorrelated series 42 | gen y1=rnormal(0,$sde) 43 | lab var y1 "simulated time series, no serial correlation" 44 | 45 | tsline y1, lw(thick) lc(navy*0.8) /// 46 | yline(0) xlabel(, grid) ylabel(, grid) xtitle("Time period") /// 47 | graphregion(fcolor(white) ifcolor(none)) 48 | graph export "$output\ch12-figure-9a-serialcorr-Stata.png", replace 49 | 50 | 51 | ** serially correlated series 52 | gen y2=0 if t==1 53 | lab var y2 "simulated time series, serial correlation = 0$rho" 54 | replace y2=$rho*y2[_n-1] + rnormal(0,$sde) if t>1 55 | 56 | tsline y2, lw(thick) lc(navy*0.8) /// 57 | yline(0) xlabel(, grid) ylabel(, grid) xtitle("Time period") /// 58 | graphregion(fcolor(white) ifcolor(none)) 59 | graph export "$output\ch12-figure-9b-serialcorr-Stata.png", replace 60 | 61 | -------------------------------------------------------------------------------- /ch14-airbnb-reg/README.md: -------------------------------------------------------------------------------- 1 | Key information re using these scripts 2 | 3 | ## Data 4 | 5 | There are two versions of the clean data after we discovered some bugs in the raw to clean cleaning files (hosted at the data repo on osf.io.) We hope to sort it out later. 
6 | 7 | The scripts use `airbnb/clean/airbnb_london_cleaned_book.csv`. -------------------------------------------------------------------------------- /ch14-used-cars-log/ch14-used-cars-log.do: -------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 14 14 | * CH14A Predicting used car value: log prices 15 | * using the used-car dataset 16 | * version 0.9 2020-09-12 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 
23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | 27 | 28 | * STEP 2: * Directory for data 29 | * Option 1: run directory-setting do file 30 | do set-data-directory.do 31 | /* this is a one-line do file that should sit in 32 | the working directory you have just set up 33 | this do file has a global definition of your working directory 34 | more details: gabors-data-analysis.com/howto-stata/ */ 35 | 36 | * Option 2: set directory directly here 37 | * for example: 38 | * global data_dir "C:/Users/xy/gabors_data_analysis/da_data_repo" 39 | 40 | 41 | global data_in "$data_dir/used-cars/clean" 42 | global work "ch14-used-cars-log" 43 | 44 | cap mkdir "$work/output" 45 | global output "$work/output" 46 | 47 | 48 | use "$data_in/used-cars_2cities_prep.dta", clear 49 | * Or download directly from OSF: 50 | /* 51 | copy "https://osf.io/download/3zf8e/" "workfile.dta" 52 | use "workfile.dta", clear 53 | erase "workfile.dta" 54 | */ 55 | 56 | 57 | * Sample design 58 | 59 | drop if Hybrid==1 60 | drop Hybrid 61 | 62 | tab fuel 63 | keep if fuel=="gas" 64 | 65 | tab condition 66 | drop if condition=="new" 67 | drop if condition=="fair" 68 | 69 | * drop very small prices, likely error 70 | drop if price<500 71 | drop if price>25000 72 | drop if odometer>100 73 | drop if price<1000 & (condition=="like new" | age<8) 74 | drop if price==. 
75 | 76 | tab transmission 77 | drop if transmission =="manual" 78 | drop pricestr 79 | 80 | tab type 81 | drop if type=="truck" 82 | 83 | tab area 84 | gen chicago=area=="chicago" 85 | keep if chicago==1 86 | 87 | 88 | * Some feature engineering 89 | * condition 90 | gen cond_excellent = condition=="excellent" 91 | gen cond_good = condition=="good" 92 | gen cond_likenew = condition=="like new" 93 | 94 | * cylinders 95 | gen cylind6 = cylinders=="6 cylinders" 96 | 97 | * price: quadratic 98 | gen agesq=age^2 99 | gen agecu=age^3 100 | gen odometersq=odometer^2 101 | 102 | save "$work/usedcars_work.dta", replace 103 | 104 | 105 | 106 | 107 | * lowess: price, lnprice 108 | lowess price age, mc(navy*0.6) lineopts( lc(green*0.8) lw(vthick)) /// 109 | ylab(, grid) xlab(, grid) ytitle("Price (US dollars)") xtitle("Age (years)") /// 110 | title("") note("") /// 111 | graphregion(fcolor(white) ifcolor(none)) /// 112 | plotregion(fcolor(white) ifcolor(white)) 113 | graph export "$output/ch14-figure-2a-p-age-lowess-Stata.png",replace 114 | 115 | lowess lnprice age, mc(navy*0.6) lineopts( lc(green*0.8) lw(vthick)) /// 116 | ylab(, grid) xlab(, grid) ytitle("ln(price, US dollars)") xtitle("Age (years)") /// 117 | title("") note("") /// 118 | graphregion(fcolor(white) ifcolor(none)) /// 119 | plotregion(fcolor(white) ifcolor(white)) 120 | graph export "$output/ch14-figure-2b-lnp-age-lowess-Stata.png",replace 121 | 122 | 123 | 124 | * PREDICTION 125 | * repeat what M3 in ch 13, no in logs 126 | * generate new observation with features our car 127 | 128 | global M3 age agesq odometer odometersq LE cond_excellent cond_good dealer 129 | 130 | * generate new observation with features our car 131 | local nplus1=_N+1 132 | set obs `nplus1' 133 | replace age=10 if _n==`nplus1' 134 | replace agesq=age^2 if _n==`nplus1' 135 | replace odometer=12 if _n==`nplus1' 136 | replace odometersq=odometer^2 if _n==`nplus1' 137 | replace LE=1 if _n==`nplus1' 138 | replace XLE=0 if _n==`nplus1' 139 | 
replace SE=0 if _n==`nplus1' 140 | replace cond_likenew=0 if _n==`nplus1' 141 | replace cond_excellent=1 if _n==`nplus1' 142 | replace cond_good=0 if _n==`nplus1' 143 | replace cylind6=0 if _n==`nplus1' 144 | replace dealer=0 if _n==`nplus1' 145 | lis if _n==`nplus1' 146 | 147 | 148 | * ln y 149 | * M3 150 | reg lnprice $M3 151 | predict lnpM3 if _n==`nplus1' 152 | gen lnpM3_sig = e(rmse) if _n==`nplus1' 153 | predict lnpM3_spe if _n==`nplus1', stdf 154 | gen lnpM3_80PIlow = lnpM3 - 1.28*lnpM3_spe 155 | gen lnpM3_80PIhigh = lnpM3 + 1.28*lnpM3_spe 156 | * log correction 157 | gen pM3_log = exp(lnpM3) * exp(lnpM3_sig^2/2) 158 | gen pM3_log_80PIlow = exp(lnpM3_80PIlow ) * exp(lnpM3_sig^2/2) 159 | gen pM3_log_80PIhigh = exp(lnpM3_80PIhigh) * exp(lnpM3_sig^2/2) 160 | 161 | * level y 162 | * M3 163 | reg price $M3 164 | predict pM3_level if _n==`nplus1' 165 | predict pM3_level_spe if _n==`nplus1', stdf 166 | gen pM3_level_80PIlow = pM3_level - 1.28*pM3_level_spe 167 | gen pM3_level_80PIhigh = pM3_level + 1.28*pM3_level_spe 168 | 169 | 170 | * Table 14.1 171 | * numbers are slightly different from textbook 172 | * due to differences in degrees-of-freedom corrections in how 173 | * sigma is calculated in R vs Stata 174 | tabstat lnpM3*, c(s) 175 | tabstat pM3_log*, c(s) 176 | tabstat pM3_level*, c(s) 177 | 178 | 179 | 180 | -------------------------------------------------------------------------------- /ch16-airbnb-random-forest/README.md: -------------------------------------------------------------------------------- 1 | # Information about *-shap* versions 2 | 3 | 4 | For this chapter we added additional code versions including preliminary analysis with [TreeSHAP](https://shap.readthedocs.io/en/latest/index.html). These are not part of the 1st edition of the textbook, but might be in the second edition. 5 | 6 | They are in development, comments and suggestions (as issues) are welcome. 
7 | -------------------------------------------------------------------------------- /ch19-food-health/ch19-food-health.do: -------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 19 14 | * CH19A Food and health 15 | * using the food-health dataset 16 | * version 0.9 2020-09-13 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | 27 | 28 | * STEP 2: * Directory for data 29 | * Option 1: run directory-setting do file 30 | do set-data-directory.do 31 | /* this is a one-line do file that should sit in 32 | the working directory you have just set up 33 | this do file has a global definition of your working directory 34 | more details: gabors-data-analysis.com/howto-stata/ */ 35 | 36 | * Option 2: set directory directly here 37 | * for example: 38 | * global data_dir "C:/Users/xy/gabors_data_analysis/da_data_repo" 39 | 40 | 41 | global data_in "$data_dir/food-health/clean" 42 | global work "ch19-food-health" 43 | 44 | cap mkdir "$work/output" 45 | global output "$work/output" 46 | 47 | * load clean data 48 | use "$data_in/food-health.dta", clear 49 | * Or download directly from OSF: 50 | /* 51 | copy "https://osf.io/download/wtybg/" "workfile.dta" 52 | use "workfile.dta", clear 53 | erase "workfile.dta" 54 | */ 55 | 56 | * simpler variable names 57 | * fruit and vegetables 58 | gen fv= 
veggies_n_fruits_gr 59 | lab var fv "Fruit and vegetables per day (grams)" 60 | 61 | * blood pressure 62 | gen bp = blood_p 63 | lab var bp "Blood pressure (systolic+diastolic)" 64 | sum fv,d 65 | 66 | * exercising 67 | gen exerc = paq655 if paq655 <=7 68 | replace exerc =0 if paq650==2 69 | lab var exerc "Days per week exercising" 70 | tab exerc,mis 71 | lab var hh_income_percap "Household income per capita (US dollars)" 72 | 73 | * potato chips 74 | gen pchips = gr_potato_chips 75 | lab var pchips "Potato chips per day (grams)" 76 | 77 | * sample design 78 | keep if age>=30 & age<60 79 | drop if fv>3200 80 | drop if bp==. 81 | count 82 | 83 | 84 | **************************** 85 | * Descriptive table 86 | eststo clear 87 | estpost sum bp fv ,d 88 | esttab using "$output/ch19-table-1-des-Stata.tex", replace /// 89 | cells("mean(fmt(0)) p50(fmt(0)) sd(fmt(0)) min(fmt(0)) max(fmt(0)) count(fmt(0))") /// 90 | label noobs nonum /// 91 | collabels(Mean Median Std.Dev. Minimum Maximum Observations) 92 | 93 | 94 | ********************************* 95 | * Scatterplot and regression line: 96 | * blood pressure on fruit and vegetables 97 | 98 | scatter bp fv, ms(o) mc(navy*0.6) msize(small) /// 99 | || lfit bp fv, lw(thick) lc(green) /// 100 | xlab(0(500)3000, grid) ylab(140(20)280, grid) legend(off) /// 101 | ytitle("Blood pressure (systolic+diastolic)") 102 | graph export "$output/ch19-figure-8a-fv-bp-Stata.png",replace 103 | 104 | twoway lfit bp fv, lw(thick) lc(green) /// 105 | xlab(0(500)3000, grid) ylab(180(2)200, grid) legend(off) /// 106 | ytitle("Blood pressure (systolic+diastolic)") 107 | graph export "$output/ch19-figure-8a-fv-bp-reg-Stata.png",replace 108 | 109 | 110 | *lowess bp fv 111 | *lowess bp fv if fv<1000 112 | 113 | ********************************* 114 | * Scatterplot and regression line: 115 | * fruit and vegetables on ln income, exercising, potato chips 116 | 117 | gen lninc=ln(hh_income_per) 118 | lab var lninc "Log household income per capita" 119 
| 120 | scatter fv lninc, ms(o) mc(navy*0.6) msize(small) /// 121 | || lfit fv lninc, lw(thick) lc(green) /// 122 | xlab(6(1)12, grid) ylab(0(500)2000, grid) legend(off) /// 123 | ytitle("Fruit and vegetables per day (grams)") 124 | graph export "$output/ch19-figure-9a-fv-inc-Stata.png",replace 125 | 126 | scatter fv exerc, ms(o) mc(navy*0.6) msize(small) /// 127 | || lfit fv exerc, lw(thick) lc(green) /// 128 | xlab(0(1)7, grid) ylab(0(500)3000, grid) legend(off) /// 129 | ytitle("Fruit and vegetables per day (grams)") 130 | graph export "$output/ch19-figure-9a-fv-exerc-Stata.png",replace 131 | 132 | scatter fv pchips , ms(o) mc(navy*0.6) msize(small) /// 133 | || lfit fv pchips , lw(thick) lc(green) /// 134 | xlab(, grid) ylab(0(500)3000, grid) legend(off) /// 135 | ytitle("Fruit and vegetables per day (grams)") 136 | graph export "$output/ch19-figure-10-fv-pchips-Stata.png",replace 137 | 138 | -------------------------------------------------------------------------------- /ch20-ab-test-social-media/ch20-ab-test-powercalc-pvalues.R: -------------------------------------------------------------------------------- 1 | #******************************************************************** 2 | # * Prepared for Gabor's Data Analysis 3 | # 4 | # Data Analysis for Business, Economics, and Policy 5 | # by Gabor Bekes and Gabor Kezdi 6 | # Cambridge University Press 2021 7 | # 8 | # gabors-data-analysis.com 9 | # 10 | # License: Free to share, modify and use for educational purposes. 11 | # Not to be used for commercial purposes. 
12 | # 13 | # Chapter 20 14 | # CH20B Fine tuning social media advertising 15 | # using the ab-test-social-media dataset 16 | # version 0.9 2020-09-13 17 | #******************************************************************** 18 | 19 | 20 | 21 | rm(list=ls()) 22 | 23 | # packages 24 | library(tidyverse) 25 | library(pwr) 26 | library(readxl) 27 | 28 | # set working directory 29 | # option A: open material as project 30 | # option B: set working directory for da_case_studies 31 | # example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/") 32 | 33 | # set data dir, load theme and functions 34 | source("ch00-tech-prep/theme_bg.R") 35 | source("ch00-tech-prep/da_helper_functions.R") 36 | 37 | # data used 38 | source("set-data-directory.R") #data_dir must be first defined # 39 | data_in <- paste(data_dir,"ab-test-social-media","clean", sep = "/") 40 | 41 | use_case_dir <- "ch20-ab-test-social-media/" 42 | data_out <- use_case_dir 43 | output <- paste0(use_case_dir,"output/") 44 | 45 | #*************************************************************** 46 | # * PART I 47 | #* sample size calculations 48 | # 49 | #* sample size calculation with planned rates 50 | 51 | clickthrough <- 0.01 52 | conversion <- 0.05 53 | 54 | proportionA = clickthrough * conversion 55 | 56 | proportionB = proportionA * 1.2 57 | 58 | h = 2 * asin(sqrt(proportionA)) - 2 * asin(sqrt(proportionB)) 59 | 60 | pwr.2p.test(h=h, sig.level=0.05, power = 0.8) 61 | 62 | 63 | 64 | clickthrough <- 0.0032 65 | conversion <- 0.0082 66 | 67 | proportionA = clickthrough * conversion 68 | 69 | proportionB = proportionA * 1.2 70 | 71 | h = 2 * asin(sqrt(proportionA)) - 2 * asin(sqrt(proportionB)) 72 | 73 | pwr.2p.test(h=h, sig.level=0.05, power = 0.8) 74 | 75 | 76 | # Part II 77 | 78 | # p-value of tests 79 | 80 | data_summary <- read_excel(paste(data_in, "/ab-test-summary.xlsx",sep="")) 81 | #data_summary <- read_excel("https://osf.io/download/mhybr/") 82 | 83 | 84 | type_b <- 0 85 | clicks <- 
c(rep(1, data_summary$clicks[1]), rep(0, data_summary[1,2]-data_summary$clicks[1])) 86 | action <- c(rep(1, data_summary$action[1]), rep(0, data_summary[1,2]-data_summary$action[1])) 87 | data_a <- data.frame(type_b,clicks,action) 88 | 89 | type_b <- 1 90 | clicks <- c(rep(1, data_summary$clicks[2]), rep(0, data_summary[1,2]-data_summary$clicks[2])) 91 | action <- c(rep(1, data_summary$action[2]), rep(0, data_summary[1,2]-data_summary$action[2])) 92 | data_b <- data.frame(type_b,clicks,action) 93 | 94 | data = rbind(data_a,data_b) 95 | 96 | 97 | table(data$type_b,data$clicks) 98 | 99 | table(data$type_b,data$action) 100 | 101 | table(data$action,data$clicks) 102 | 103 | 104 | reg1 = lm(clicks ~ type_b,data=data) 105 | 106 | summary(reg1) 107 | 108 | reg2 = lm(action ~ type_b,data=data) 109 | 110 | summary(reg2) 111 | -------------------------------------------------------------------------------- /ch20-ab-test-social-media/ch20-ab-test-powercalc-pvalues.do: -------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 20 14 | * CH20B Fine tuning social media advertising 15 | * using the ab-test-social-media dataset 16 | * version 0.91 2020-05-25 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 
23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | 27 | 28 | * STEP 2: * Directory for data 29 | * Option 1: run directory-setting do file 30 | do set-data-directory.do 31 | /* this is a one-line do file that should sit in 32 | the working directory you have just set up 33 | this do file has a global definition of your working directory 34 | more details: gabors-data-analysis.com/howto-stata/ */ 35 | 36 | * Option 2: set directory directly here 37 | * for example: 38 | * global data_dir "C:/Users/xy/gabors_data_analysis/da_data_repo" 39 | 40 | 41 | global data_in "$data_dir/ab-test-social-media/clean" 42 | global work "ch20-ab-test-social-media" 43 | 44 | cap mkdir "$work/output" 45 | global output "$work/output" 46 | 47 | 48 | 49 | *************************************************************** 50 | * PART I 51 | * sample size calculations 52 | 53 | * sample size calculation with planned rates 54 | 55 | local clickthrough = 0.01 56 | local conversion = 0.05 57 | 58 | local proportionA = `clickthrough' * `conversion' 59 | dis `proportionA' 60 | 61 | local proportionB = `proportionA' * 1.2 62 | dis `proportionB' 63 | 64 | power twoproportions `proportionA' `proportionB' 65 | 66 | 67 | 68 | 69 | * sample size calculation with rates closer to actual 70 | 71 | local clickthrough = 0.0032 72 | local conversion = 0.0082 73 | 74 | local proportionA = `clickthrough' * `conversion' 75 | dis `proportionA' 76 | 77 | local proportionB = `proportionA' * 1.2 78 | dis `proportionB' 79 | 80 | power twoproportions `proportionA' `proportionB' 81 | 82 | 83 | *******************************- 84 | * PART II 85 | * p-value of tests 86 | 87 | clear 88 | import excel "$data_in/ab-test-summary.xlsx", sheet("Sheet1") firstrow 89 | 90 | * Or download directly from OSF: 91 | /* 92 | copy "https://osf.io/download/mhybr/" "workfile.xlsx" 93 | import excel "workfile.xlsx", sheet('Sheet1') firstrow 94 | erase "workfile.xlsx" 95 | */ 96 | 97 | 
foreach ab in A B{ 98 | foreach var in show clicks action { 99 | su `var' if action_type =="Action `ab'", meanonly 100 | local `var'_`ab' = r(mean) 101 | dis ``var'_`ab'' 102 | } 103 | } 104 | 105 | clear 106 | local obs = `show_A' + `show_B' 107 | set obs `obs' 108 | gen type_b=_n>`show_A' 109 | tab type_b 110 | gen clicks = _n<=`clicks_A' 111 | local sc= `show_B'+`clicks_B' 112 | replace clicks = 1 if _n>`show_B'& _n<=`sc' 113 | tab type_b clicks 114 | gen action = _n<=`action_A' 115 | 116 | local sa=`show_B'+`action_B' 117 | replace action = 1 if _n>`show_B' & _n<=`sa' 118 | tab type_b action 119 | 120 | reg clicks type_b, nohead robust 121 | reg action type_b, nohead robust 122 | -------------------------------------------------------------------------------- /ch20-working-from-home/README_wfh_datawork.txt: -------------------------------------------------------------------------------- 1 | Working from Home 2 | Raw data, Tidy data, Workfile 3 | 4 | 5 | Raw data 6 | 7 | many data tables 8 | In some, observations are person level 9 | In most, observations are person X week level 10 | There are many variables that we won't use in the analysis 11 | We'll work with the raw data tables that have variables that we'll use 12 | 13 | quit_data 14 | observations: person-level (n=249) 15 | variables: ID: personid 16 | x: treatment group, 17 | y: whether quit job by end of experiment, performance z-score during experiment, 18 | z: performance z-scores pre-experiment, few background variables 19 | 20 | tc_comparison.dta 21 | observations: person-level (n=249) 22 | variables: ID: personid 23 | z: all background variables used to check balance except pre-experiment performance z-score 24 | 25 | 26 | performance_during_exper.dta 27 | observations: person x week level (n=112,279, 1,934 persons, 86 weeks, unbalanced) 28 | variables: ID: personid, year_week 29 | y: phonecallraw 30 | +: other potential outcome variables, not used in our case study 31 | 32 | 33 | 34 | Tidy data 35 
| Two data files 36 | 37 | One at person level, wfh_tidy_person 38 | From raw: Two person-level raw files merged 39 | + merged aggregated from one person x week level file, aggregated to person level 40 | (with one variable, sum phonecallraw before and during experiment 41 | 42 | One at person X week level, wfh_tidy_personweek 43 | From raw: One person x week level file, with only the phoencallraw (renamed phonecalls) variable 44 | (plus IDs plus treatment vars) 45 | (kept only 249 persons in experiment from total 1,943 persons; expgroup=0 or 1) 46 | 47 | 48 | 49 | Workfile 50 | Same as wfh_tidy_person 51 | (In Stata: variables ordered so that order conforms table 20.1 on balance) 52 | -------------------------------------------------------------------------------- /ch20-working-from-home/background/working-from-home-QJE.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gabors-data-analysis/da_case_studies/08e55b021c295653e1b546faeeb4550926c9a7d4/ch20-working-from-home/background/working-from-home-QJE.pdf -------------------------------------------------------------------------------- /ch20-working-from-home/ch20-wfh.R: -------------------------------------------------------------------------------- 1 | ######################################################################################### 2 | # Prepared for Gabor's Data Analysis 3 | # 4 | # Data Analysis for Business, Economics, and Policy 5 | # by Gabor Bekes and Gabor Kezdi 6 | # Cambridge University Press 2021 7 | # 8 | # gabors-data-analysis.com 9 | # 10 | # License: Free to share, modify and use for educational purposes. 11 | # Not to be used for commercial purposes. 

# CHAPTER 20
# CH20A Working from home and employee performance
# using the wfh dataset
# version 0.91 2021-02-27
#########################################################################################


###########

# Clear memory
rm(list = ls())

# Descriptive statistics and regressions
library(tidyverse)
library(haven)
library(fixest)
library(reshape)
library(cowplot)

# set data dir, data used
source("set-data-directory.R") # data_dir must be first defined

# option A: open material as project
# option B: set working directory for da_case_studies
# example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/")

# load theme and functions
source("ch00-tech-prep/theme_bg.R")
source("ch00-tech-prep/da_helper_functions.R")
options(digits = 3)

data_in <- paste(data_dir, "working-from-home", "clean/", sep = "/")
use_case_dir <- "ch20-working-from-home/"

data_out <- use_case_dir
output <- paste0(use_case_dir, "output/")
create_output_if_doesnt_exist(output)


# Load in data -------------------------------------------------------
# person-level tidy file; OSF download alternative kept for reference
#data <- read_csv('https://osf.io/5c3rf/download')
data <- read_csv(paste0(data_in, "wfh_tidy_person.csv"))

# keep only the variables used in the balance table and the analysis
data <- data %>%
  dplyr::select(personid:perform11, age, male, second_technical, high_school, tertiary_technical, university,
                prior_experience, tenure, married, children, ageyoungestchild, rental,
                costofcommute, internet, bedroom, basewage, bonus, grosswage, phonecalls1)


# Balance ------------------------------------------------------------

# Age of youngest child is meaningful only for employees with children;
# set it to missing otherwise so it does not distort the balance table.
data$ageyoungestchild <- ifelse(data$children == 0, NA, data$ageyoungestchild)


# Table of averages in control and treatment

data_temp <- data %>%
  dplyr::select(perform10, age:grosswage, ordertaker)

vars <- colnames(data_temp)
rm(data_temp)

# Accumulators for the balance table, filled variable by variable.
# Named sds (not sd) so we do not shadow stats::sd, which is called below.
mean_t <- c()
mean_c <- c()
sds <- c()
p_value <- c()

for (i in vars) {
  # Regress the covariate on the treatment dummy: the regression F-test
  # p-value is the p-value of the test of equal means across groups.
  model <- lm(paste(i, "~treatment"), data = data)

  # Mean control
  mean_c[i] <- mean(data[data$treatment == 0, ][[paste(i)]], na.rm = T)
  # mean_c[i] <- model$coefficients[1] # or get it directly from regression

  # Mean treated
  mean_t[i] <- mean(data[data$treatment == 1, ][[paste(i)]], na.rm = T)
  # mean_t[i] <- model$coefficients[1] + model$coefficients[2] # or get it directly from regression

  # p-value from regression
  p_value[i] <- anova(model)$'Pr(>F)'[1]

  # Standard deviation (pooled over both groups)
  sds[i] <- sd(data[[paste(i)]], na.rm = T)
}

# Put together (ingredients of Table 20.1)
table <- data.frame(round(mean_t, 2), round(mean_c, 2), round(sds, 2), round(p_value, 2))
col.names <- c("Treatment mean", "Control mean", "Std.dev.", "p-value of test of equal means")
names(table) <- col.names
print(table)

############################
# outcomes:
# quit firm during 8 months of experiment
# phone calls worked, for order takers
#

quitjob <- data %>%
  group_by(treatment) %>%
  dplyr::summarise(mean = mean(quitjob),
                   sd = sd(quitjob),
                   N = n())
total_quitjob <- data %>%
  dplyr::summarise(mean_total = mean(quitjob),
                   sd_total = sd(quitjob),
                   N_total = n())

quitjob
total_quitjob

# phone calls are a meaningful outcome for order takers only
phonecalls1 <- data %>%
  group_by(treatment) %>%
  filter(ordertaker == 1) %>%
  dplyr::summarise(mean = mean(phonecalls1),
                   sd = sd(phonecalls1),
                   N = n())
total_phonecalls <- data %>%
  filter(ordertaker == 1) %>%
  dplyr::summarise(mean_total = mean(phonecalls1),
                   sd_total = sd(phonecalls1),
                   N_total = n())
phonecalls1
total_phonecalls

# Bar chart for quit rates

data <- data %>%
  mutate(quit_pct = quitjob*100,
         stayed_pct = (1-quitjob)*100)

barchart_data <- data %>%
  select(treatment, quit_pct, stayed_pct) %>%
  group_by(treatment) %>%
  summarise(quit_pct = mean(quit_pct),
            stayed_pct = mean(stayed_pct)) %>%
  gather(employees, pct, quit_pct:stayed_pct)

quitrates_barchart <- ggplot(barchart_data, aes(fill = employees, y = pct, x = factor(treatment))) +
  geom_bar(stat = "identity") +
  theme_bg() +
  labs(y = "Share of employees (percent)",
       x = "") +
  scale_x_discrete(labels = c("Non-treatment group", "Treatment group")) +
  scale_fill_manual(labels = c("Quit", "Stayed"), name = "", values = c(color[2], color[1])) +
  theme(legend.position = "right",
        legend.background = element_rect(size = 0.1, linetype = "solid", colour = color.background),
        plot.margin = unit(x = c(0.1, 0.1, 0.1, 0.1), units = "mm")) +
  background_grid(major = "y", minor = "none")
quitrates_barchart
save_fig("ch20-figure-1-wfh-quitrates-barchart", output, "small")

# --------------------------------------------------------------------
# Regression analysis
# Outcome variables: 1) quit firm during 8 months of experiment , 2) phone calls worked, for ordertakers
# --------------------------------------------------------------------

# Outcomes by treatment

# NOTE: dplyr::funs() has been removed from dplyr; a named list of
# purrr-style lambdas produces the same N / Mean / Sd columns.

# 1) Quit firm
data %>%
  group_by(treatment) %>%
  summarise_at(vars(quitjob), list(N = ~n(), Mean = ~mean(., na.rm = TRUE), Sd = ~sd(., na.rm = TRUE)))

# 2) Phonecalls (ordertakers only)
data %>%
  group_by(treatment) %>%
  filter(ordertaker == 1) %>%
  summarise_at(vars(phonecalls1), list(N = ~n(), Mean = ~mean(., na.rm = TRUE), Sd = ~sd(., na.rm = TRUE)))



# Regression 1: ATE estimates, no covariates -------------------------

reg1 <- feols(quitjob ~ treatment, data = data, vcov = "HC1")
reg2 <- feols(phonecalls1 ~ treatment, data=data[data$ordertaker==1, ], vcov = "HC1") 196 | 197 | 198 | etable(reg1,reg2,fitstat = c('n','r2','rmse')) 199 | 200 | 201 | 202 | # Regression 2: ATE estimates, with covariates of some unbalance ----- 203 | reg3 <- feols(quitjob ~ treatment + married + children + internet, data=data, vcov = "HC1") 204 | reg4 <- feols(phonecalls1 ~ treatment + married + children, data=data[data$ordertaker==1, ], vcov = "HC1") 205 | 206 | etable(reg3,reg4,fitstat = c('n','r2','rmse')) 207 | 208 | 209 | 210 | 211 | 212 | -------------------------------------------------------------------------------- /ch20-working-from-home/ch20-wfh.do: -------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 20 14 | * CH20A Working from home and employee performance 15 | * using the working-from-home dataset 16 | * version 0.9 2020-09-14 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 
23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | 27 | 28 | * STEP 2: * Directory for data 29 | * Option 1: run directory-setting do file 30 | do set-data-directory.do 31 | /* this is a one-line do file that should sit in 32 | the working directory you have just set up 33 | this do file has a global definition of your working directory 34 | more details: gabors-data-analysis.com/howto-stata/ */ 35 | 36 | * Option 2: set directory directly here 37 | * for example: 38 | * global data_dir "C:/Users/xy/gabors_data_analysis/da_data_repo" 39 | 40 | 41 | global data_in "$data_dir/working-from-home/clean" 42 | global work "ch20-working-from-home" 43 | 44 | cap mkdir "$work/output" 45 | global output "$work/output" 46 | 47 | 48 | 49 | 50 | ******************************************************* 51 | * Create workfile from clean tidy data 52 | * same as tidy person-level data but variables ordered 53 | * so balance table is easier to make 54 | 55 | use "$data_in/wfh_tidy_person", clear 56 | * Or download directly from OSF: 57 | /* 58 | copy "https://osf.io/download/jrydb/" "workfile.dta" 59 | use "workfile.dta", clear 60 | erase "workfile.dta" 61 | */ 62 | 63 | order personid treatment ordertaker type quitjob phonecalls0 phonecalls1 /// 64 | perform10 perform11 age male second_techn high_school tertiary_tec university /// 65 | prior_experi tenure married children ageyoungestc rental costofcommut /// 66 | bedroom internet basewage bonus grosswage 67 | 68 | 69 | save "$work/ch20-wfh-workfile", replace 70 | 71 | 72 | ******************************************************* 73 | * Analysis 74 | 75 | * Balance 76 | 77 | use "$work/ch20-wfh-workfile", replace 78 | 79 | *des perform10 age-grosswage 80 | 81 | replace ageyoungest = . 
if children==0 82 | 83 | 84 | * Table 20.1 85 | * here produced from bits and pieces 86 | 87 | * First part: table with means and sd 88 | * tabstat in Stata, copied to Excel, 89 | * to laTex used https://www.latex-tables.com/ 90 | tabstat perform10 age-grosswage ordertaker if treatment==1, c(s) format(%5.2f) 91 | tabstat perform10 age-grosswage ordertaker if treatment==0, c(s) format(%5.2f) 92 | tabstat perform10 age-grosswage ordertaker, s(sd) c(s) format(%5.2f) 93 | 94 | * Second part: t-tests for equal means (we do them by regression for simplicity) 95 | * need to enter p-values one by one to LaTex or Excel table 96 | foreach z of varlist perform10 age-grosswage ordertaker { 97 | reg treatment `z', robust nohead 98 | } 99 | 100 | 101 | 102 | * outcomes: 103 | * quit firm during 8 months of experiment 104 | * # phone calls worked, for order takers 105 | 106 | des quit phonecalls1 107 | 108 | 109 | tabstat quit , by(treatment) s(mean sd n) 110 | tabstat phonecalls1 if ordertaker==1, by(treatment) s(mean sd n) 111 | 112 | * Bar chart for quit rates 113 | gen quit_pct = quitjob*100 114 | gen stayed_pct = (1-quitjob)*100 115 | lab def treatment 0 "Working from office" 1 "Working from home" 116 | lab val treatment treatment 117 | graph bar (mean) stayed_pct quit_pct, over(treatment) stack /// 118 | bar(1, col(navy*0.8)) bar(2, col(green*0.8)) /// 119 | ytitle("Percent of employees") ylabel(0(25)100) /// 120 | legend(label(1 "stayed") label(2 "quit")) 121 | graph export "$output/ch20-figure-1-wfh-quitrates-Stata.png", replace 122 | 123 | 124 | 125 | * Regression 1: ATE estimates, no covariates 126 | lab var treatment "Treatment group" 127 | lab var quitjob "Quit job " 128 | lab var phonecalls1 "Phone calls (thousand)" 129 | la var married "Married" 130 | lab var children "Children" 131 | lab var internet "Internet at home" 132 | 133 | 134 | reg quitjob treatment , robust 135 | outreg2 using "$output/ch20-table-3-wfh-reg1-Stata", bdec(2) sdec(3) 2aster tex(frag) nonotes 
label replace 136 | 137 | reg phonecalls1 treatment if ordertaker==1, robust 138 | outreg2 using "$output/ch20-table-3-wfh-reg1-Stata", bdec(2) sdec(2) 2aster tex(frag) nonotes label append 139 | 140 | 141 | * Regression 2: ATE estimates, with covariates of some unbalance 142 | 143 | reg quitjob treatment married children internet, robust 144 | outreg2 using "$output/ch20-table-4-wfh-reg2-Stata", bdec(2) sdec(3) 2aster tex(frag) nonotes label replace 145 | 146 | reg phonecalls1 treatment married children if ordertaker==1, robust 147 | outreg2 using "$output/ch20-table-4-wfh-reg2-Stata", bdec(2) sdec(2) 2aster tex(frag) nonotes label append 148 | 149 | -------------------------------------------------------------------------------- /ch21-ownership-management-quality/background/Management-Practices-Across-Firms-and-Countries-Bloom-Genakos-Sadun-and-Van-Reenen.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gabors-data-analysis/da_case_studies/08e55b021c295653e1b546faeeb4550926c9a7d4/ch21-ownership-management-quality/background/Management-Practices-Across-Firms-and-Countries-Bloom-Genakos-Sadun-and-Van-Reenen.pdf -------------------------------------------------------------------------------- /ch21-ownership-management-quality/background/SIC-2dig-manuf-labels.txt: -------------------------------------------------------------------------------- 1 | 20 Food & Kindred Products 2 | 21 Tobacco Products 3 | 22 Textile Mill Products 4 | 23 Apparel & Other Textile Products 5 | 24 Lumber & Wood Products 6 | 25 Furniture & Fixtures 7 | 26 Paper & Allied Products 8 | 27 Printing & Publishing 9 | 28 Chemical & Allied Products 10 | 29 Petroleum & Coal Products 11 | 30 Rubber & Miscellaneous Plastics Products 12 | 31 Leather & Leather Products 13 | 32 Stone, Clay, & Glass Products 14 | 33 Primary Metal Industries 15 | 34 Fabricated Metal Products 16 | 35 Industrial Machinery & Equipment 17 | 36 Electronic & 
Other Electric Equipment 18 | 37 Transportation Equipment 19 | 38 Instruments & Related Products 20 | 39 Miscellaneous Manufacturing Industries -------------------------------------------------------------------------------- /ch21-ownership-management-quality/background/the-ties-that-bind-lemos-and-scur.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gabors-data-analysis/da_case_studies/08e55b021c295653e1b546faeeb4550926c9a7d4/ch21-ownership-management-quality/background/the-ties-that-bind-lemos-and-scur.pdf -------------------------------------------------------------------------------- /ch21-ownership-management-quality/ch21-wms-01-dataprep.R: -------------------------------------------------------------------------------- 1 | ######################################################################################### 2 | # Prepared for Gabor's Data Analysis 3 | # 4 | # Data Analysis for Business, Economics, and Policy 5 | # by Gabor Bekes and Gabor Kezdi 6 | # Cambridge University Press 2021 7 | # 8 | # gabors-data-analysis.com 9 | # 10 | # License: Free to share, modify and use for educational purposes. 11 | # Not to be used for commercial purposes. 
12 | 13 | # CHAPTER 21 14 | # CH20A Founder/family ownership and quality of management 15 | # using the wms-management dataset 16 | # version 0.9 2020-09-11 17 | ######################################################################################### 18 | 19 | # Clear memory ------------------------------------------------------- 20 | rm(list=ls()) 21 | 22 | # Import libraries --------------------------------------------------- 23 | 24 | library(tidyverse) 25 | library(purrr) 26 | library(haven) 27 | 28 | 29 | # set working directory 30 | # option A: open material as project 31 | # option B: set working directory for da_case_studies 32 | # example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/") 33 | 34 | # set data dir, load theme and functions 35 | source("ch00-tech-prep/theme_bg.R") 36 | source("ch00-tech-prep/da_helper_functions.R") 37 | 38 | # data used 39 | source("set-data-directory.R") #data_dir must be first defined # 40 | data_in <- paste(data_dir,"wms-management-survey","clean/", sep = "/") 41 | 42 | use_case_dir <- file.path("ch21-ownership-management-quality/") 43 | data_out <- use_case_dir 44 | output <- paste0(use_case_dir,"output/") 45 | create_output_if_doesnt_exist(output) 46 | 47 | 48 | 49 | 50 | # *************************************************************** 51 | # * 52 | # * PART I 53 | # * 54 | # * Data prep 55 | # *************************************************************** 56 | 57 | 58 | # Load in data ------------------------------------------------------- 59 | #data <- read_csv("https://osf.io/zy9j8/download") 60 | data <- read_csv(paste(data_in,"wms_da_textbook-xsec.csv",sep="")) 61 | 62 | # Ownership: define founder/family owned and drop ownership that's missing or not relevant 63 | # Ownership 64 | data %>% 65 | group_by(ownership) %>% 66 | summarise(Freq = n()) %>% 67 | mutate(Percent = Freq / sum(Freq)*100, Cum = cumsum(Percent)) 68 | 69 | # Define foundfam owned 70 | data$foundfam_owned <- ifelse( 71 | 
data$ownership== "Family owned, external CEO" | 72 | data$ownership== "Family owned, family CEO" | 73 | data$ownership== "Family owned, CEO unknown" | 74 | data$ownership== "Founder owned, external CEO" | 75 | data$ownership== "Founder owned, CEO unknown" | 76 | data$ownership== "Founder owned, founder CEO" , 1, 0) 77 | 78 | # Foundfam owned 79 | data %>% 80 | group_by(foundfam_owned) %>% 81 | summarise (Freq = n()) %>% 82 | mutate(Percent = Freq / sum(Freq)*100, Cum = cumsum(Percent)) 83 | 84 | data %>% 85 | count(ownership, foundfam_owned) %>% 86 | spread(foundfam_owned, n, fill = 0) 87 | 88 | # Proportion of managers/non-managers with a college degree 89 | # need correction: -44 means do not know, -99 means missing 90 | data <- data %>% 91 | mutate( 92 | degree_m = data$degree_m/100, 93 | degree_nm = data$degree_nm/100) %>% 94 | mutate( 95 | degree_m = ifelse(degree_m<0, NA, degree_m), 96 | degree_nm = ifelse(degree_nm<0, NA, degree_nm) 97 | ) 98 | 99 | # Generate bins from degree_nm 100 | quantile(data$degree_nm, na.rm = TRUE, c(0, 0.10, 0.25, 0.50, 0.75, 0.90, 1)) 101 | data$degree_nm_bins <- cut(data$degree_nm, c(0,0.001,0.05,0.20,1.01), right= FALSE) 102 | 103 | # Generate degree_nm_sq 104 | data$degree_nm_sq <- data$degree_nm^2 105 | 106 | data %>% 107 | group_by(degree_nm_bins) %>% 108 | summarise(min = min(degree_nm), 109 | max = max(degree_nm), n = n()) 110 | 111 | # 3. Take log of employment 112 | data$lnemp <- log(data$emp_firm) 113 | 114 | # 4. 
Competition 115 | table(data$competition) 116 | 117 | #itt van valam gond 1 obs 118 | data <- data %>% 119 | mutate( 120 | compet_weak = factor(competition == "0 competitors" | competition == "1-4 competitors"), 121 | compet_moder = factor(competition == "5-9 competitors"), 122 | compet_strong = factor( competition == "10+ competitors") 123 | ) 124 | 125 | data %>% 126 | group_by(competition) %>% 127 | summarise(weak = max(compet_weak == TRUE), 128 | moder = max(compet_moder == TRUE), 129 | strong = max(compet_strong == TRUE)) 130 | 131 | #data$competition <- 132 | # ifelse(data$compet_weak == TRUE, "0-4 competitors", 133 | # ifelse(data$compet_moder == TRUE, "5-9 competitors", "10+ competitors")) 134 | 135 | # 5. Industry in 2 digits 136 | 137 | industry_names <- 138 | c("food", "tobacco", "textile", "apparel", "lumber", "furniture", 139 | "paper", "printing", "chemical", "petrol", "rubber", "leather", "glass", 140 | "primary_metal", "fabricated_metal", "ind_machinery", "electronic", 141 | "transport", "instrument", "misc_manuf") 142 | 143 | data$industry <- factor(data$sic, 144 | levels = sort(unique(data$sic)), 145 | labels = industry_names) 146 | 147 | # 6. 
Country as factor 148 | data$countrycode <- factor(data$cty) 149 | 150 | # age 151 | data <- data %>% 152 | mutate(age_young = factor(firmage<30 & !is.na(firmage)), 153 | age_old = factor(firmage>80 & !is.na(firmage)), 154 | age_unknown = factor(is.na(firmage)), 155 | age_mid = factor(age_young == FALSE & age_old == FALSE & age_unknown == FALSE)) 156 | 157 | # ********************************************************** 158 | # ***** SAMPLE SELECTION 159 | # Keep observations with: 160 | # Non-employee/Research/Gov/Other type of ownership 161 | # non-missing variables 162 | data <- data %>% 163 | filter(!ownership %in% c("Government", "Other" ), 164 | !is.na(ownership), 165 | ) 166 | 167 | 168 | data <- data %>% 169 | filter( !is.na(management), 170 | !is.na(foundfam_owned), 171 | !is.na(degree_nm), 172 | !is.na(competition), 173 | !is.na(industry), 174 | !is.na(countrycode), 175 | !is.na(lnemp) 176 | ) 177 | 178 | 179 | 180 | 181 | # Summary of num. of employment 182 | data %>% 183 | dplyr::select(emp_firm) %>% 184 | summarise(min = min(emp_firm , na.rm=T), 185 | max = max(emp_firm , na.rm=T), 186 | p1 = quantile(emp_firm , probs = 0.01, na.rm=T), 187 | p50 = quantile(emp_firm , probs = 0.50, na.rm=T), 188 | q99 = quantile(emp_firm , probs = 0.99, na.rm=T), 189 | n = n()) 190 | 191 | 192 | # Drop tiny and large firms 193 | data %>% 194 | filter(emp_firm<50) %>% 195 | summarise(n = n()) 196 | 197 | data %>% 198 | filter(emp_firm>5000) %>% 199 | summarise(n = n()) 200 | 201 | data <- data %>% 202 | filter (!(emp_firm<50 | emp_firm>5000)) 203 | 204 | 205 | # Save workfile ------------------------------------------------------ 206 | write_csv(data, paste0(data_out, "wms_da_textbook-work.csv")) 207 | # N=8439 208 | -------------------------------------------------------------------------------- /ch24-haiti-earthquake-gdp/README.md: -------------------------------------------------------------------------------- 1 | ## Status 2 | 3 | * Stata is okay, works as expected 4 | 
* R is not ready; it uses a file produced by Stata. R gives rather different results; we are not sure why. 5 | * Python works, slightly different results than Stata 6 | -------------------------------------------------------------------------------- /ch24-haiti-earthquake-gdp/haiti-earthquake-gdp.do: -------------------------------------------------------------------------------- 1 | ******************************************************************** 2 | * Prepared for Gabor's Data Analysis 3 | * 4 | * Data Analysis for Business, Economics, and Policy 5 | * by Gabor Bekes and Gabor Kezdi 6 | * Cambridge University Press 2021 7 | * 8 | * gabors-data-analysis.com 9 | * 10 | * License: Free to share, modify and use for educational purposes. 11 | * Not to be used for commercial purposes. 12 | * 13 | * Chapter 24 14 | * CH24A Estimating the effect of the 2010 Haiti earthquake on GDP 15 | * using the haiti-earthquake dataset 16 | * version 0.9 2020-09-06 17 | ******************************************************************** 18 | 19 | 20 | * SETTING UP DIRECTORIES 21 | 22 | * STEP 1: set working directory for da_case_studies. 
23 | * for example: 24 | * cd "C:/Users/xy/Dropbox/gabors_data_analysis/da_case_studies" 25 | 26 | 27 | 28 | * STEP 2: * Directory for data 29 | * Option 1: run directory-setting do file 30 | do set-data-directory.do 31 | /* this is a one-line do file that should sit in 32 | the working directory you have just set up 33 | this do file has a global definition of your working directory 34 | more details: gabors-data-analysis.com/howto-stata/ */ 35 | 36 | * Option 2: set directory directly here 37 | * for example: 38 | * global data_dir "C:/Users/xy/gabors_data_analysis/da_data_repo" 39 | 40 | 41 | global data_in "$data_dir/haiti-earthquake/clean" 42 | global work "ch24-haiti-earthquake-gdp" 43 | 44 | cap mkdir "$work/output" 45 | global output "$work/output" 46 | 47 | cap mkdir "$work/temp" 48 | global temp "$work/temp" 49 | 50 | 51 | use "$data_in/haiti-earthquake-mod.dta", clear 52 | * Or download directly from OSF: 53 | /* 54 | copy "https://osf.io/download/h5yjm/" "workfile.dta" 55 | use "workfile.dta", clear 56 | erase "workfile.dta" 57 | */ 58 | 59 | 60 | *donor pool based on threshold calculations below: it is those countries with incomethreshold=1, and a balanced panel for all variables 61 | gen dp=0 62 | replace dp=1 if inlist(country, "Benin","Burkina Faso","Burundi" ,"Bangladesh" ,"Cambodia","Cameroon" ) 63 | replace dp=1 if inlist(country, "Kenya" ,"Kyrgyz Republic" ,"Liberia","Madagascar" ,"Mali","Moldova","Mozambique" ) 64 | replace dp=1 if inlist(country, "Nicaragua" ,"Nepal" ,"Rwanda","Senegal","Sierra Leone","Sudan","Tanzania","Togo","Uganda" ) 65 | replace dp=1 if country=="Haiti" 66 | lab var dp "Country in donor pool" 67 | keep if dp==1 68 | 69 | sort country year 70 | egen ccode = group(countrycode) if country!="Haiti" 71 | replace ccode = ccode+1 72 | replace ccode = 1 if country=="Haiti" 73 | 74 | xtset ccode year 75 | xtdes 76 | 77 | 78 | compress 79 | clear matrix 80 | save "$work/haiti-earthquake-workfile.dta",replace 81 | 82 | 83 | * time 
series in Haiti 84 | line gdptb_us year if ccode==1, lw(thick) lc(navy*0.8) /// 85 | xla(2004(2)2015, grid) yla(6(0.5)9, grid) /// 86 | xline(2010, lp(dash) lc(gray)) /// 87 | text(8 2009 "Earthquake") /// 88 | graphregion(fcolor(white) ifcolor(none)) /// 89 | plotregion(fcolor(white) ifcolor(white)) 90 | graph export "$output/ch24-figure-1-haiti-gdp-Stata.png", replace 91 | 92 | 93 | * Haiti and synthetic control 94 | 95 | synth gdptb_us cons exp imp gcf land pop inf gdppc_w /// 96 | gdptb_us(2005) gdptb_us(2007) gdptb_us(2009) , /// 97 | trunit(1) trperiod(2010) xperiod(2004(1)2009) nested /// 98 | unitnames(country) keep("$temp/gdp-1")replace 99 | 100 | use "$temp/gdp-1",replace 101 | lab var _time "Year" 102 | 103 | 104 | * total GDP in Haiti and synthetic control 105 | * figure 24.2a 106 | line _Y_treated _Y_synth _time, lw(vthick vthick) lc(navy*0.8 green*0.6) /// 107 | xla(2004(2)2015, grid) yla(6(0.5)9, grid) /// 108 | xline(2010, lp(dash) lc(gray)) /// 109 | text(8 2009 "Earthquake") /// 110 | text(9.1 2013 "Synthetic Haiti") text(7 2012 "Haiti") legend(off) /// 111 | graphregion(fcolor(white) ifcolor(none)) /// 112 | plotregion(fcolor(white) ifcolor(white)) /// 113 | ytitle("Total GDP, constant USD, billion") 114 | graph export "$output/ch24-figure-2a-haiti-gdp-synth-Stata.png", replace 115 | 116 | 117 | * difference in log total GDP 118 | * figure 24.2b 119 | gen lndiffY = ln(_Y_treated) - ln(_Y_synth) 120 | line lndiffY _time, lw(vthick ) lc(navy*0.8) /// 121 | xla(2004(2)2015, grid) yla(-0.2(0.05)0.05, grid) yline(0) /// 122 | xline(2010, lp(dash) lc(gray)) /// 123 | text(-0.17 2009 "Earthquake") /// 124 | graphregion(fcolor(white) ifcolor(none)) /// 125 | plotregion(fcolor(white) ifcolor(white)) /// 126 | ytitle("Effect estimate, log of total GDP") 127 | graph export "$output/ch24-figure-2b-haiti-gdp-synth-Stata.png", replace 128 | 129 | 130 | **************************************************** 131 | * temporary stuff for textbook development 132 | 
* for R - temp 133 | use "$temp/gdp-1",replace 134 | rename _Y_treated Ytreated 135 | rename _Y_synthetic Ysynthetic 136 | rename _time year 137 | drop _W_Weight _Co_Number 138 | keep if year<. 139 | save "$temp\gdp-1-temp.dta",replace 140 | -------------------------------------------------------------------------------- /ch24-haiti-earthquake-gdp/temp/gdp-1-temp.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gabors-data-analysis/da_case_studies/08e55b021c295653e1b546faeeb4550926c9a7d4/ch24-haiti-earthquake-gdp/temp/gdp-1-temp.dta -------------------------------------------------------------------------------- /ch24-haiti-earthquake-gdp/temp/gdp-1.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gabors-data-analysis/da_case_studies/08e55b021c295653e1b546faeeb4550926c9a7d4/ch24-haiti-earthquake-gdp/temp/gdp-1.dta -------------------------------------------------------------------------------- /set-data-directory.R: -------------------------------------------------------------------------------- 1 | # set-data-directory for da_data_repo!!! 2 | data_dir="/Users/yourname/Dropbox/work/data_book/da_data_repo" 3 | -------------------------------------------------------------------------------- /set-data-directory.do: -------------------------------------------------------------------------------- 1 | * Set your path to the data directory here 2 | global data_dir "/Users/yourname/Dropbox/work/data_book/da_data_repo" 3 | 4 | --------------------------------------------------------------------------------