├── 06 Getting a Little Jumpy ├── Figure 6-1-2-R.png ├── Figure 6-2-1-R.png ├── Figure 6-1-1-Stata.png ├── Figure 6-1-2-Stata.png ├── Figure 6-2-1-Julia.png ├── 06 Getting a Little Jumpy.md ├── Table 6-2-1.do ├── Figure 6-2-1.py ├── Figure 6-2-1.jl ├── Figure 6-1-1.py ├── Figure 6-1-1.r ├── Figure 6-2-1.r ├── Figure 6-2-1.do ├── Figure 6-1-1.jl ├── Figure 6-1-1.do ├── Figure 6-1-2.py ├── Figure 6-1-2.r ├── Figure 6-1-2.do └── Figure 6-1-2.jl ├── 03 Making Regression Make Sense ├── Figure 3-1-2-Julia.png ├── Table 3-3-3.do ├── Figure 3-1-3.do ├── Figure 3-1-2.jl ├── Figure 3-1-2.do ├── Figure 3-1-2.r ├── Figure 3-1-2.py ├── Figure 3-1-3.py ├── Figure 3-1-3.r ├── Table 3-3-2.do ├── Table 3-3-2.py ├── Table 3-3-2.jl ├── Table 3-3-3.r ├── Table 3-3-2.r └── 03 Making Regression Make Sense.md ├── 04 Instrumental Variables in Action ├── Figure 4-1-1-R.png ├── Figure 4-6-1-R.png ├── Table 4-4-1.do ├── Figure 4-1-1.jl ├── Table 4-1-2.do ├── Table 4-1-2.py ├── Table 4-1-1.do ├── Figure 4-6-1.py ├── 04 Instrumental Variables in Action.md ├── Figure 4-6-1.jl ├── Table 4-1-1.r ├── Figure 4-1-1.py ├── Table 4-1-2.r ├── Figure 4-1-1.r ├── Figure 4-6-1.do ├── Figure 4-6-1.r ├── Figure 4-1-1.do └── Table 4-6-2.do ├── 05 Fixed Effects, DD and Panel Data ├── Figure 5-2-4-Stata.png ├── Table 5-2-2.do ├── 05 Fixed Effects, DD and Panel Data.md ├── Table 5-2-1.do ├── Table 5-2-1.r ├── Figure 5-2-4.do ├── Figure 5-2-4.r ├── Figure 5-2-4.jl ├── Figure 5-2-4.py └── Table 5-2-3.do ├── .gitignore ├── 02 The Experimental Ideal └── Table 2-2-1.do ├── 07 Quantile Regression ├── 07 Quantile Regression.md ├── Table 7-1-1.jl ├── Table 7-1-1.do ├── Table 7-1-1.r └── Table 7-1-1.py ├── 08 Nonstandard Standard Error Issues ├── Table 8-1-1-alt.r ├── 08 Nonstanard Standard Error Issues.md ├── Table 8-1-1.jl ├── Table 8-1-1.py ├── Table 8-1-1.r ├── Table 8-1-1.do └── Table-8-1-1.do └── README.md /06 Getting a Little Jumpy/Figure 6-1-2-R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vikjam/mostly-harmless-replication/HEAD/06 Getting a Little Jumpy/Figure 6-1-2-R.png -------------------------------------------------------------------------------- /06 Getting a Little Jumpy/Figure 6-2-1-R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vikjam/mostly-harmless-replication/HEAD/06 Getting a Little Jumpy/Figure 6-2-1-R.png -------------------------------------------------------------------------------- /06 Getting a Little Jumpy/Figure 6-1-1-Stata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vikjam/mostly-harmless-replication/HEAD/06 Getting a Little Jumpy/Figure 6-1-1-Stata.png -------------------------------------------------------------------------------- /06 Getting a Little Jumpy/Figure 6-1-2-Stata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vikjam/mostly-harmless-replication/HEAD/06 Getting a Little Jumpy/Figure 6-1-2-Stata.png -------------------------------------------------------------------------------- /06 Getting a Little Jumpy/Figure 6-2-1-Julia.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vikjam/mostly-harmless-replication/HEAD/06 Getting a Little Jumpy/Figure 6-2-1-Julia.png -------------------------------------------------------------------------------- 
/03 Making Regression Make Sense/Figure 3-1-2-Julia.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vikjam/mostly-harmless-replication/HEAD/03 Making Regression Make Sense/Figure 3-1-2-Julia.png -------------------------------------------------------------------------------- /04 Instrumental Variables in Action/Figure 4-1-1-R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vikjam/mostly-harmless-replication/HEAD/04 Instrumental Variables in Action/Figure 4-1-1-R.png -------------------------------------------------------------------------------- /04 Instrumental Variables in Action/Figure 4-6-1-R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vikjam/mostly-harmless-replication/HEAD/04 Instrumental Variables in Action/Figure 4-6-1-R.png -------------------------------------------------------------------------------- /05 Fixed Effects, DD and Panel Data/Figure 5-2-4-Stata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vikjam/mostly-harmless-replication/HEAD/05 Fixed Effects, DD and Panel Data/Figure 5-2-4-Stata.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # System files 2 | *.DS_Store 3 | 4 | # Data and logs 5 | *.dat 6 | *.nj 7 | *.zip 8 | *.txt 9 | 10 | # Output 11 | *.pdf 12 | 13 | # R 14 | *.Rhistory 15 | *.Rdata 16 | 17 | # Stata 18 | *.dta 19 | 20 | -------------------------------------------------------------------------------- /05 Fixed Effects, DD and Panel Data/Table 5-2-2.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | eststo clear 4 | capture version 13 5 | 6 | /* Pull data from the 'Mostly Harmless' website */ 7 | /* http://economics.mit.edu/faculty/angrist/data1/mhe/card */ 8 | shell curl -o njmin.zip http://economics.mit.edu/files/3845 9 | shell unzip -j njmin.zip 10 | 11 | /* End of script*/ 12 | exit 13 | -------------------------------------------------------------------------------- /04 Instrumental Variables in Action/Table 4-4-1.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | 4 | /* Stata code for Table 4.4.1*/ 5 | * shell curl -o jtpa.raw http://economics.mit.edu/files/614 6 | 7 | /* Import data */ 8 | infile ym zm dm sex xm6 xm7 xm8 xm9 xm10 /// 9 | xm17 xm18 xm12 xm13 xm14 xm15 xm16 xm19 using jtpa.raw, clear 10 | 11 | reg sex xm6 12 | 13 | 14 | /* End of file */ 15 | exit 16 | -------------------------------------------------------------------------------- /05 Fixed Effects, DD and Panel Data/05 Fixed Effects, DD and Panel Data.md: -------------------------------------------------------------------------------- 1 | # 05 Fixed Effects, DD and Panel Data 2 | ## 5.2 Differences-in-differences 3 | 4 | ### Figure 5-2-4 5 | 6 | Completed in [Stata](Figure%205-2-4.do), [R](Figure%205-2-4.r), [Python](Figure%205-2-4.py) and [Julia](Figure%205-2-4.jl). 
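The underlying differences-in-differences estimate is just a four-means contrast: the change in average full-time-equivalent employment in New Jersey minus the change in Pennsylvania. A minimal sketch in Python, assuming a `stores` table holding the Card-Krueger `public.dat` variables constructed in [Table 5-2-1.do](Table%205-2-1.do) (`STATE`, `FTE`, `FTE2`); the toy rows below are illustrative only, not the actual data:

```python
import pandas as pd

def dd_fte(stores: pd.DataFrame) -> float:
    """Four-means DD: (NJ after - NJ before) - (PA after - PA before)."""
    m = stores.groupby("STATE")[["FTE", "FTE2"]].mean()
    return (m.loc[1, "FTE2"] - m.loc[1, "FTE"]) - (m.loc[0, "FTE2"] - m.loc[0, "FTE"])

# Toy rows only (not Card-Krueger data); STATE: 0 = PA, 1 = NJ;
# FTE/FTE2 = full-time-equivalent employment before/after the NJ minimum-wage increase
toy = pd.DataFrame({"STATE": [0, 0, 1, 1],
                    "FTE":   [23.3, 21.2, 20.4, 19.8],
                    "FTE2":  [21.2, 20.3, 21.0, 20.9]})
print(round(dd_fte(toy), 2))
```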
7 | 8 | ![Figure 5-2-4 in Stata](https://github.com/vikjam/mostly-harmless-replication/blob/master/05%20Fixed%20Effects,%20DD%20and%20Panel%20Data/Figure%205-2-4-Stata.png?raw=true) 9 | -------------------------------------------------------------------------------- /03 Making Regression Make Sense/Table 3-3-3.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | eststo clear 4 | capture version 13 5 | /* Required programs */ 6 | /* - estout: output table */ 7 | 8 | /* Stata code for Table 3.3.3 */ 9 | 10 | /* Download data */ 11 | shell curl -o nswre74.dta http://economics.mit.edu/files/3828 12 | shell curl -o cps1re74.dta http://economics.mit.edu/files/3824 13 | shell curl -o cps3re74.dta http://economics.mit.edu/files/3825 14 | 15 | /* End of script */ 16 | -------------------------------------------------------------------------------- /02 The Experimental Ideal/Table 2-2-1.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | eststo clear 4 | capture version 13 5 | 6 | /* Pull data from the 'Mostly Harmless' website */ 7 | /* http://economics.mit.edu/faculty/angrist/data1/mhe/krueger */ 8 | shell curl -o webstar.dta http://economics.mit.edu/files/3827 9 | 10 | /* Load downloaded data */ 11 | use webstar.dta, clear 12 | 13 | /* Create variables in table */ 14 | gen white_asian = (inlist(srace, 1, 3)) if !missing(srace) 15 | label var white_asian "White/Asian" 16 | 17 | /* Calculate percentiles of test scores */ 18 | local testscores "treadssk tmathssk treadss1 tmathss1 treadss2 tmathss2 treadss3 tmathss3" 19 | foreach var of varlist `testscores' { 20 | xtile pct_`var' = `var', nq(100) 21 | } 22 | egen avg_pct = rowmean(pct_*) 23 | label var avg_pct "Percentile score in kindergarten" 24 | 25 | /* End of file */ 26 | exit 27 | -------------------------------------------------------------------------------- /04 Instrumental Variables in Action/Figure 4-1-1.jl: -------------------------------------------------------------------------------- 1 | # Load packages 2 | using DataFrames 3 | using Gadfly 4 | using GLM 5 | 6 | # Download the data and unzip it 7 | download("http://economics.mit.edu/files/397", "asciiqob.zip") 8 | run(`unzip -o asciiqob.zip`) 9 | 10 | # Import data 11 | pums = readtable("asciiqob.txt", 12 | header = false, 13 | separator = ' ') 14 | names!(pums, [:lwklywge, :educ, :yob, :qob, :pob]) 15 | 16 | # Aggregate into means for figure 17 | means = aggregate(pums, [:yob, :qob], [mean]) 18 | 19 | # Create dates 20 | means[:date] = [Date(1900 + y, m * 3, 1) for (y, m) in zip(means[:yob], means[:qob])] 21 | 22 | # Plot each panel and export the stacked figure 23 | p_educ = plot(means, 24 | layer(x = "date", y = "educ_mean", Geom.point, Geom.line)) 25 | p_lwklywge = plot(means, 26 | layer(x = "date", y = "lwklywge_mean", Geom.point, Geom.line)) 27 | draw(PNG("Figure 4-1-1-Julia.png", 8inch, 12inch), vstack(p_educ, p_lwklywge)) 28 | # End of file 29 | -------------------------------------------------------------------------------- /07 Quantile Regression/07 Quantile Regression.md: -------------------------------------------------------------------------------- 1 | # 07 Quantile Regression 2 | ## 7.1 The Quantile Regression Model 3 | 4 | ### Table 7.1.1 5 | Completed in [Stata](Table%207-1-1.do), [R](Table%207-1-1.r) and [Python](Table%207-1-1.py) 6 | 7 | | |Obs |Mean |Std Dev |0.1 |0.25 |0.5 |0.75 |0.9 |OLS |RMSE | 8 | |:----|:------|:----|:-------|:-----|:-----|:-----|:-----|:-----|:-----|:----| 9 | |1980 |65,023 |6.4 |0.671 |0.073 |0.073 |0.068 |0.07 |0.079 |0.072 |0.63 | 10 | | |
| | |0.002 |0.001 |0.001 |0.001 |0.002 |0.001 | | 11 | |1990 |86,785 |6.46 |0.694 |0.112 |0.11 |0.106 |0.111 |0.137 |0.114 |0.64 | 12 | | | | | |0.003 |0.001 |0.001 |0.001 |0.002 |0.001 | | 13 | |2000 |97,397 |6.47 |0.746 |0.092 |0.105 |0.111 |0.119 |0.157 |0.114 |0.69 | 14 | | | | | |0.003 |0.001 |0.001 |0.001 |0.002 |0.001 | | 15 | -------------------------------------------------------------------------------- /04 Instrumental Variables in Action/Table 4-1-2.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | /* Stata code for Table 4-1-2 */ 4 | /* Required additional packages */ 5 | /* - estout: output results */ 6 | 7 | * /* Download data */ 8 | * shell curl -o asciiqob.zip http://economics.mit.edu/files/397 9 | * unzipfile asciiqob.zip, replace 10 | 11 | /* Import data */ 12 | infile lwklywge educ yob qob pob using asciiqob.txt, clear 13 | 14 | /* Create binary instrument */ 15 | recode qob (1/2 = 0 "Born in the 1st or 2nd quarter of year") /// 16 | (3/4 = 1 "Born in the 3rd or 4th quarter of year") /// 17 | (else = .), gen(z) 18 | 19 | /* Compare means (and differences) */ 20 | ttest lwklywge, by(z) 21 | ttest educ, by(z) 22 | 23 | /* Compute Wald estimate */ 24 | sureg (educ z) (lwklywge z) if !missing(z) 25 | nlcom [lwklywge]_b[z] / [educ]_b[z] 26 | 27 | /* OLS estimate */ 28 | regress lwklywge educ if !missing(z) 29 | 30 | /* End of script */ 31 | -------------------------------------------------------------------------------- /03 Making Regression Make Sense/Figure 3-1-3.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | eststo clear 4 | capture log close _all 5 | capture version 13 6 | 7 | /* Stata code for Figure 3.1.3 */ 8 | /* !! Can't find right data !!
*/ 9 | 10 | /* Download data */ 11 | * shell curl -o asciiqob.zip http://economics.mit.edu/files/397 12 | * unzipfile asciiqob.zip, replace 13 | 14 | /* Import data */ 15 | infile lwklywge educ yob qob pob using asciiqob.txt, clear 16 | 17 | /* Panel A */ 18 | /* Old-fashioned standard errors */ 19 | regress lwklywge educ 20 | /* Robust standard errors */ 21 | regress lwklywge educ, robust 22 | 23 | /* Collapse data for Panel B (counting only if in sample) */ 24 | gen count = 1 if e(sample) 25 | collapse (sum) count (mean) lwklywge, by(educ) 26 | 27 | /* Old-fashioned standard errors */ 28 | regress lwklywge educ [aweight = count] 29 | /* Robust standard errors */ 30 | regress lwklywge educ [aweight = count], robust 31 | 32 | /* End of file */ 33 | exit 34 | -------------------------------------------------------------------------------- /06 Getting a Little Jumpy/06 Getting a Little Jumpy.md: -------------------------------------------------------------------------------- 1 | # 06 Getting a Little Jumpy 2 | ## 6.1 Sharp RD 3 | 4 | ### Figure 6-1-1 5 | 6 | Completed in [Stata](Figure%206-1-1.do), [R](Figure%206-1-1.r), [Python](Figure%206-1-1.py) and [Julia](Figure%206-1-1.jl) 7 | 8 | ![Figure 6-1-1 in Stata](https://github.com/vikjam/mostly-harmless-replication/blob/master/06%20Getting%20a%20Little%20Jumpy/Figure%206-1-1-Stata.png?raw=true) 9 | 10 | ### Figure 6-1-2 11 | 12 | Completed in [Stata](Figure%206-1-2.do), [R](Figure%206-1-2.r), [Python](Figure%206-1-2.py) and [Julia](Figure%206-1-2.jl) 13 | 14 | ![Figure 6-1-2 in R](https://github.com/vikjam/mostly-harmless-replication/blob/master/06%20Getting%20a%20Little%20Jumpy/Figure%206-1-2-R.png?raw=true) 15 | ## 6.2 Fuzzy RD is IV 16 | ### Figure 6-2-1 17 | 18 | Completed in [Stata](Figure%206-2-1.do), [R](Figure%206-2-1.r), [Python](Figure%206-2-1.py) and [Julia](Figure%206-2-1.jl) 19 | 20 | ![Figure 6-2-1 in Julia](https://github.com/vikjam/mostly-harmless-replication/blob/master/06%20Getting%20a%20Little%20Jumpy/Figure%206-2-1-Julia.png?raw=true) 21 | -------------------------------------------------------------------------------- /07 Quantile Regression/Table 7-1-1.jl: -------------------------------------------------------------------------------- 1 | # Julia code for Table 7-1-1 # 2 | # Required packages # 3 | # - DataRead: import Stata datasets # 4 | # - DataFrames: data manipulation / storage # 5 | # - QuantileRegression: quantile regression # 6 | # - GLM: OLS regression # 7 | using DataRead 8 | using DataFrames 9 | using QuantileRegression 10 | using GLM 11 | 12 | # Download the data and unzip it 13 | download("http://economics.mit.edu/files/384", "angcherfer06.zip") 14 | run(`unzip angcherfer06.zip`) 15 | 16 | # Load the data (assumes the census .dta extracts were converted to CSV) 17 | csv_path = string("Data/census", "80", ".csv") 18 | df = readtable(csv_path) 19 | 20 | # Summary statistics 21 | obs = size(df[:logwk], 1) 22 | μ = mean(df[:logwk]) 23 | σ = std(df[:logwk]) 24 | 25 | # Run OLS 26 | wls = glm(logwk ~ educ + black + exper + exper2, df, 27 | Normal(), IdentityLink(), 28 | wts = convert(Array, (df[:perwt]))) 29 | wls_coef = coef(wls)[2] 30 | wls_se = stderr(wls)[2] 31 | wls_rmse = sqrt(sum((df[:logwk] - predict(wls)).^2) / df_residual(wls)) 32 | 33 | # Print results 34 | print(obs, μ, σ, wls_coef, wls_se, wls_rmse) 35 | 36 | # End of script 37 | -------------------------------------------------------------------------------- /03 Making Regression Make Sense/Figure 3-1-2.jl: -------------------------------------------------------------------------------- 1 | # Load packages 2 | using CSV 3 |
using DataFrames 4 | using GLM 5 | using Statistics 6 | using Gadfly 7 | using Cairo 8 | 9 | # Download the data and unzip it 10 | download("http://economics.mit.edu/files/397", "asciiqob.zip") 11 | run(`unzip asciiqob.zip`) 12 | 13 | # Import data 14 | pums = DataFrame(CSV.File("asciiqob.txt", header = false, delim = " ", ignorerepeated = true)) 15 | rename!(pums, [:lwklywge, :educ, :yob, :qob, :pob]) 16 | 17 | # Run OLS and save predicted values 18 | OLS = lm(@formula(lwklywge ~ educ), pums) 19 | pums.predicted = predict(OLS) 20 | 21 | # Aggregate into means for figure 22 | means = combine(groupby(pums, :educ), [:lwklywge, :predicted] .=> mean) 23 | 24 | # Plot figure and export figure using Gadfly 25 | figure = plot(means, 26 | layer(x = "educ", y = "predicted_mean", Geom.line, Theme(default_color = colorant"green")), 27 | layer(x = "educ", y = "lwklywge_mean", Geom.line, Geom.point), 28 | Guide.xlabel("Years of completed education"), 29 | Guide.ylabel("Log weekly earnings, \$2003")) 30 | 31 | draw(PNG("Figure 3-1-2-Julia.png", 7inch, 6inch), figure) 32 | 33 | # End of script 34 | -------------------------------------------------------------------------------- /05 Fixed Effects, DD and Panel Data/Table 5-2-1.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | eststo clear 4 | capture version 13 5 | 6 | /* Stata code for Table 5.2.1*/ 7 | shell curl -o njmin.zip http://economics.mit.edu/files/3845 8 | unzipfile njmin.zip, replace 9 | 10 | /* Import data */ 11 | infile SHEET CHAIN CO_OWNED STATE SOUTHJ CENTRALJ NORTHJ PA1 PA2 /// 12 | SHORE NCALLS EMPFT EMPPT NMGRS WAGE_ST INCTIME FIRSTINC BONUS /// 13 | PCTAFF MEALS OPEN HRSOPEN PSODA PFRY PENTREE NREGS NREGS11 /// 14 | TYPE2 STATUS2 DATE2 NCALLS2 EMPFT2 EMPPT2 NMGRS2 WAGE_ST2 /// 15 | INCTIME2 FIRSTIN2 SPECIAL2 MEALS2 OPEN2R HRSOPEN2 PSODA2 PFRY2 /// 16 | PENTREE2 NREGS2 NREGS112 using "public.dat", clear 17 | 18 | /* Label the state variables and values */ 19 | label var STATE "State" 20 | label define state_labels 0 "PA" 1 "NJ" 21 | label values STATE state_labels 22 | 23 | /* Calculate FTE employment */ 24 | gen FTE = EMPFT + 0.5 * EMPPT + NMGRS 25 | label var FTE "FTE employment before" 26 | gen FTE2 = EMPFT2 + 0.5 * EMPPT2 + NMGRS2 27 | label var FTE2 "FTE employment after" 28 | 29 | /* Calculate means */ 30 | tabstat FTE FTE2, by(STATE) stat(mean semean) 31 | 32 | /* End of script */ 33 | -------------------------------------------------------------------------------- /06 Getting a Little Jumpy/Table 6-2-1.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | /* Stata code for Table 6.2.1 */ 4 | /* Required programs: mmoulton */ 5 | /* Download data */ 6 | shell curl -o final4.dta http://economics.mit.edu/files/1359 7 | shell curl -o final5.dta http://economics.mit.edu/files/1358 8 | 9 | /* Import data */ 10 | use "final5.dta", clear 11 | 12 | replace avgverb= avgverb-100 if avgverb>100 13 | replace avgmath= avgmath-100 if avgmath>100 14 | 15 | gen func1 = c_size / (floor((c_size - 1) / 40) + 1) 16 | gen func2 = cohsize / (floor(cohsize / 40) + 1) 17 | 18 | replace avgverb = . if verbsize == 0 19 | replace passverb = . if verbsize == 0 20 | 21 | replace avgmath = . if mathsize == 0 22 | replace passmath = .
if mathsize == 0 23 | 24 | /* Sample restrictions */ 25 | keep if 1 < classize & classize < 45 & c_size > 5 26 | keep if c_leom == 1 & c_pik < 3 27 | 28 | sum avgverb 29 | sum avgmath 30 | 31 | mmoulton avgverb classize, cluvar(schlcode) 32 | mmoulton avgverb classize tipuach, cluvar(schlcode) 33 | mmoulton avgverb classize tipuach c_size, clu(schlcode) 34 | mmoulton avgmath classize, cluvar(schlcode) 35 | mmoulton avgmath classize tipuach, cluvar(schlcode) 36 | mmoulton avgmath classize tipuach c_size, clu(schlcode) 37 | 38 | /* End of script */ 39 | exit 40 | -------------------------------------------------------------------------------- /03 Making Regression Make Sense/Figure 3-1-2.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | eststo clear 4 | capture log close _all 5 | capture version 13 6 | 7 | /* Stata code for Figure 3.1.2 */ 8 | /* Required additional packages */ 9 | log using "Figure 3-1-2-Stata.txt", name(figure030102) text replace 10 | 11 | /* Download data */ 12 | shell curl -o asciiqob.zip http://economics.mit.edu/files/397 13 | unzipfile asciiqob.zip, replace 14 | 15 | /* Import data */ 16 | infile lwklywge educ yob qob pob using asciiqob.txt, clear 17 | 18 | /* Get fitted line */ 19 | regress lwklywge educ 20 | predict yhat, xb 21 | 22 | /* Calculate means by collapsing the data */ 23 | collapse lwklywge yhat, by(educ) 24 | 25 | /* Graph the figures */ 26 | graph twoway (connected lwklywge educ, lcolor(black) mcolor(black)) /// 27 | (line yhat educ, lcolor(black) lpattern("-")), /// 28 | ylabel(4.8(0.2)6.6) ymtick(4.9(0.2)6.5) /// 29 | xlabel(0(2)20) xmtick(1(2)19) /// 30 | ytitle("Log weekly earnings, $2003") /// 31 | xtitle("Years of completed education") /// 32 | legend(off) /// 33 | scheme(s1mono) 34 | 35 | graph export "Figure 3-1-2-Stata.pdf", replace 36 | 37 | log close figure030102 38 | /* End of file */ 39 | -------------------------------------------------------------------------------- /03 Making Regression Make Sense/Figure 3-1-2.r: -------------------------------------------------------------------------------- 1 | # R code for Figure 3.1.2 # 2 | # Required packages # 3 | # - ggplot2: making pretty graphs # 4 | # - data.table: simple way to aggregate # 5 | library(ggplot2) 6 | library(data.table) 7 | 8 | # Download data and unzip the data 9 | download.file('http://economics.mit.edu/files/397', 'asciiqob.zip') 10 | unzip('asciiqob.zip') 11 | 12 | # Read the data into a dataframe 13 | pums <- read.table('asciiqob.txt', 14 | header = FALSE, 15 | stringsAsFactors = FALSE) 16 | names(pums) <- c('lwklywge', 'educ', 'yob', 'qob', 'pob') 17 | 18 | # Estimate OLS regression 19 | reg.model <- lm(lwklywge ~ educ, data = pums) 20 | 21 | # Calculate means by educ attainment and predicted values 22 | pums.data.table <- data.table(pums) 23 | educ.means <- pums.data.table[ , list(mean = mean(lwklywge)), by = educ] 24 | educ.means$yhat <- predict(reg.model, educ.means) 25 | 26 | # Create plot 27 | p <- ggplot(data = educ.means, aes(x = educ)) + 28 | geom_point(aes(y = mean)) + 29 | geom_line(aes(y = mean)) + 30 | geom_line(aes(y = yhat)) + 31 | ylab("Log weekly earnings, $2003") + 32 | xlab("Years of completed education") 33 | 34 | ggsave(filename = "Figure 3-1-2-R.pdf", plot = p) 35 | 36 | 37 | # End of file 38 | -------------------------------------------------------------------------------- /04 Instrumental Variables in Action/Table 4-1-2.py:
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Create Table 4-1-2 in MHE 4 | Tested on Python 3.4 5 | """ 6 | import numpy as np 7 | import zipfile 8 | import urllib.request 9 | import pandas as pd 10 | import scipy.stats 11 | import statsmodels.api as sm 12 | 13 | # Download data and unzip the data 14 | urllib.request.urlretrieve('http://economics.mit.edu/files/397', 'asciiqob.zip') 15 | with zipfile.ZipFile('asciiqob.zip', "r") as z: 16 | z.extractall() 17 | 18 | # Read the data into a pandas dataframe 19 | pums = pd.read_csv('asciiqob.txt', 20 | header = None, 21 | delim_whitespace = True) 22 | pums.columns = ['lwklywge', 'educ', 'yob', 'qob', 'pob'] 23 | 24 | # Create binary instrument (born in 3rd or 4th quarter of year) 25 | pums['z'] = ((pums.qob == 3) | (pums.qob == 4)) * 1 26 | 27 | # Compare means (and differences) 28 | ttest_lwklywge = scipy.stats.ttest_ind(pums.lwklywge[pums.z == 1], pums.lwklywge[pums.z == 0]) 29 | ttest_educ = scipy.stats.ttest_ind(pums.educ[pums.z == 1], pums.educ[pums.z == 0]) 30 | 31 | # Compute Wald estimate (reduced-form difference over first-stage difference) 32 | wald_estimate = (np.mean(pums.lwklywge[pums.z == 1]) - np.mean(pums.lwklywge[pums.z == 0])) / \ 33 | (np.mean(pums.educ[pums.z == 1]) - np.mean(pums.educ[pums.z == 0])) 34 | 35 | # OLS estimate 36 | ols = sm.OLS(pums.lwklywge, sm.add_constant(pums.educ)).fit() 37 | 38 | # End of script 39 | -------------------------------------------------------------------------------- /04 Instrumental Variables in Action/Table 4-1-1.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | /* Stata code for Table 4-1-1 */ 4 | 5 | /* Download data */ 6 | shell curl -o asciiqob.zip http://economics.mit.edu/files/397 7 | unzipfile asciiqob.zip, replace 8 | 9 | /* Import data */ 10 | infile lwklywge educ yob qob pob using asciiqob.txt, clear 11 | 12 | /* Column 1: OLS */ 13 | regress lwklywge educ, robust 14 | 15 | /* Column 2: OLS with YOB, POB dummies */ 16 | regress lwklywge educ i.yob i.pob, robust 17 | 18 | /* Column 3: 2SLS with instrument QOB = 1 */ 19 | tabulate qob, gen(qob) 20 | ivregress 2sls lwklywge (educ = qob1), robust 21 | 22 | /* Column 4: 2SLS with YOB, POB dummies and instrument QOB = 1 */ 23 | ivregress 2sls lwklywge i.yob i.pob (educ = qob1), robust 24 | 25 | /* Column 5: 2SLS with YOB, POB dummies and instrument (QOB = 1 | QOB = 2) */ 26 | gen qob1or2 = (inlist(qob, 1, 2)) if !missing(qob) 27 | ivregress 2sls lwklywge i.yob i.pob (educ = qob1or2), robust 28 | 29 | /* Column 6: 2SLS with YOB, POB dummies and full QOB dummies */ 30 | ivregress 2sls lwklywge i.yob i.pob (educ = i.qob), robust 31 | 32 | /* Column 7: 2SLS with YOB, POB dummies and full QOB dummies interacted with YOB */ 33 | ivregress 2sls lwklywge i.yob i.pob (educ = i.qob#i.yob), robust 34 | 35 | /* Column 8: 2SLS with age, YOB, POB dummies and with full QOB dummies interacted with YOB */ 36 | ivregress 2sls lwklywge i.yob i.pob (educ = i.qob#i.yob), robust 37 | 38 | /* End of script */ 39 | -------------------------------------------------------------------------------- /03 Making Regression Make Sense/Figure 3-1-2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Tested on Python 3.4 4 | """ 5 | 6 | import urllib 7 | import zipfile 8 | import urllib.request 9 | import pandas as pd 10 | import statsmodels.api as sm 11 | import matplotlib.pyplot as plt 12 | 13 | # Download data and unzip the data 14 |
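# The zip from the MHE data archive extracts to asciiqob.txt, whose five unlabeled columns are named just below.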
urllib.request.urlretrieve('http://economics.mit.edu/files/397', 'asciiqob.zip') 15 | with zipfile.ZipFile('asciiqob.zip', "r") as z: 16 | z.extractall() 17 | 18 | # Read the data into a pandas dataframe 19 | pums = pd.read_csv("asciiqob.txt", header=None, delim_whitespace=True) 20 | pums.columns = ["lwklywge", "educ", "yob", "qob", "pob"] 21 | 22 | # Set up the model 23 | y = pums.lwklywge 24 | X = pums.educ 25 | X = sm.add_constant(X) 26 | 27 | # Save coefficient on education 28 | model = sm.OLS(y, X) 29 | results = model.fit() 30 | educ_coef = results.params[1] 31 | intercept = results.params[0] 32 | 33 | # Calculate means by educ attainment and predicted values 34 | groupbyeduc = pums.groupby("educ") 35 | educ_means = groupbyeduc["lwklywge"].mean().reset_index() 36 | yhat = pd.Series( 37 | intercept + educ_coef * educ_means["educ"].values, index=educ_means["educ"].values 38 | ) 39 | 40 | # Create plot 41 | plt.figure() 42 | educ_means.plot(kind="line", x="educ", y="lwklywge", style="-o") 43 | yhat.plot() 44 | plt.xlabel("Years of completed education") 45 | plt.ylabel("Log weekly earnings, \\$2003") 46 | plt.legend().set_visible(False) 47 | plt.savefig("Figure 3-1-2-Python.pdf") 48 | 49 | # End of script 50 | -------------------------------------------------------------------------------- /08 Nonstandard Standard Error Issues/Table 8-1-1-alt.r: -------------------------------------------------------------------------------- 1 | # R code for Table 8-1-1 # 2 | # Required packages # 3 | # - sandwich: robust standard error # 4 | library(sandwich) 5 | library(data.table) 6 | library(knitr) 7 | 8 | # Set seed for replication 9 | set.seed(1984, "L'Ecuyer") 10 | 11 | # Set parameters 12 | NSIMS = 25000 13 | N = 30 14 | r = 0.9 15 | N1 = r * N 16 | sigma = 1 17 | 18 | # Generate random data 19 | dvec <- c(rep(0, N1), rep(1, N - N1)) 20 | simulated.data <- data.table(sim = rep(1:NSIMS, each = N), 21 | y = NA, 22 | d = rep(dvec, NSIMS), 23 | epsilon = NA) 24 | simulated.data[ , epsilon := ifelse(d == 1, 25 | rnorm(.N), 26 | rnorm(.N, sd = sigma))] 27 | simulated.data[ , y := 0 * d + epsilon] 28 | 29 | # Store a list of the standard error types 30 | se.types <- c("const", paste0("HC", 0:3)) 31 | 32 | # Create a function to extract standard errors 33 | calculate.se <- function(lm.obj, type) { 34 | sqrt(vcovHC(lm.obj, type = type)[2, 2]) 35 | } 36 | 37 | # Function to calculate results 38 | calculateBias <- function(formula) { 39 | lm.sim <- lm(formula) 40 | b1 <- coef(lm.sim)[2] 41 | se.sim <- sapply(se.types, calculate.se, lm.obj = lm.sim) 42 | c(b1, se.sim) 43 | } 44 | simulated.results <- simulated.data[ , as.list(calculateBias(y ~ d)), by = sim] 45 | 46 | # End of script 47 | -------------------------------------------------------------------------------- /07 Quantile Regression/Table 7-1-1.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | 4 | /* Stata code for Table 7.1.1 */ 5 | 6 | /* Download data */ 7 | shell curl -o angcherfer06.zip http://economics.mit.edu/files/384 8 | unzipfile angcherfer06.zip, replace 9 | 10 | /* Create matrix to store all the results */ 11 | matrix R = J(6, 10, .)
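/* Rows: coefficient and SE entries for 1980, 1990 and 2000; columns: summary statistics, the five quantile coefficients, and the OLS coefficient and RMSE */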
12 | matrix rownames R = 80 80se 90 90se 00 00se 13 | matrix colnames R = Obs Mean SD 10 25 50 75 90 Coef MSE 14 | 15 | /* Loop through the years to get the results */ 16 | foreach year in "80" "90" "00" { 17 | /* Load data */ 18 | use "Data/census`year'.dta", clear 19 | 20 | /* Summary statistics */ 21 | summ logwk 22 | matrix R[rownumb(R, "`year'"), colnumb(R, "Obs")] = r(N) 23 | matrix R[rownumb(R, "`year'"), colnumb(R, "Mean")] = r(mean) 24 | matrix R[rownumb(R, "`year'"), colnumb(R, "SD")] = r(sd) 25 | 26 | /* Define education variables */ 27 | gen highschool = 1 if (educ == 12) 28 | gen college = 1 if (educ == 16) 29 | 30 | /* Run quantile regressions */ 31 | foreach tau of numlist 10 25 50 75 90 { 32 | qreg logwk educ black exper exper2 [pweight = perwt], q(`tau') 33 | matrix R[rownumb(R, "`year'"), colnumb(R, "`tau'")] = _b[educ] 34 | matrix R[rownumb(R, "`year'se"), colnumb(R, "`tau'")] = _se[educ] 35 | } 36 | 37 | /* Run OLS */ 38 | regress logwk educ black exper exper2 [pweight = perwt] 39 | matrix R[rownumb(R, "`year'"), colnumb(R, "Coef")] = _b[educ] 40 | matrix R[rownumb(R, "`year'se"), colnumb(R, "Coef")] = _se[educ] 41 | matrix R[rownumb(R, "`year'"), colnumb(R, "MSE")] = e(rmse) 42 | } 43 | 44 | /* List results */ 45 | matlist R 46 | 47 | /* End of file */ 48 | -------------------------------------------------------------------------------- /04 Instrumental Variables in Action/Figure 4-6-1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Estimators for Figure 4-6-1 in MHE: OLS, 2SLS and LIML with one weak 4 | instrument among 20 (single draw; see the R and Julia scripts for the 5 | full simulation). Tested on Python 3.4 6 | """ 7 | 8 | import numpy as np 9 | import statsmodels.api as sm 10 | from statsmodels.sandbox.regression import gmm 11 | from scipy.linalg import eigh 12 | 13 | # Set seed 14 | np.random.seed(461) 15 | 16 | # Set parameters: correlated structural and first-stage errors 17 | Sigma = [[1.0, 0.8], 18 | [0.8, 1.0]] 19 | mu = [0, 0] 20 | errors = np.random.multivariate_normal(mu, Sigma, 1000) 21 | eta = errors[:, 0] 22 | xi = errors[:, 1] 23 | 24 | # Create Z, x, y: only the first of the 20 instruments is relevant 25 | Z = np.random.multivariate_normal([0] * 20, np.identity(20), 1000) 26 | x = 0.1 * Z[:, 0] + xi 27 | y = x + eta 28 | x = sm.add_constant(x) 29 | Z = sm.add_constant(Z) 30 | 31 | ols = sm.OLS(y, x).fit().params[1] 32 | tsls = gmm.IV2SLS(y, x, Z).fit().params[1] 33 | 34 | def LIML(y, x, Z): 35 | """k-class LIML slope estimate; x and Z both include a constant.""" 36 | n = y.shape[0] 37 | I = np.eye(n) 38 | Mz = I - Z.dot(np.linalg.inv(Z.T.dot(Z))).dot(Z.T) 39 | # Residual-maker for the included exogenous regressors (the constant) 40 | X1 = x[:, [0]] 41 | Mx1 = I - X1.dot(np.linalg.inv(X1.T.dot(X1))).dot(X1.T) 42 | W = np.column_stack((y, x[:, 1])) 43 | # kappa = smallest eigenvalue of (W' Mz W)^-1 (W' Mx1 W) 44 | kappa = eigh(W.T.dot(Mx1).dot(W), W.T.dot(Mz).dot(W), eigvals_only=True).min() 45 | # k-class estimator with k = kappa 46 | beta = np.linalg.solve(x.T.dot(I - kappa * Mz).dot(x), 47 | x.T.dot(I - kappa * Mz).dot(y)) 48 | return beta[1] 49 | 50 | liml = LIML(y, x, Z) 51 | print(ols, tsls, liml) 52 | 53 | # End of script 54 | -------------------------------------------------------------------------------- /06 Getting a Little Jumpy/Figure 6-2-1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Create Figure 6.2.1 in MHE 4 | Tested on Python 3.4 5 | numpy: math and stat functions; matplotlib: plot figures 6 | """ 7 | import urllib.request 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import pandas as pd 11 | 12 | # Download data 13
| urllib.request.urlretrieve('http://economics.mit.edu/files/1359', 'final4.dta') 14 | urllib.request.urlretrieve('http://economics.mit.edu/files/1358', 'final5.dta') 15 | 16 | # Read the data into a pandas dataframe (the downloaded .dta files are assumed converted to CSV first) 17 | grade4 = pd.read_csv('final4.csv', encoding = 'iso8859_8') 18 | grade5 = pd.read_csv('final5.csv', encoding = 'iso8859_8') 19 | 20 | # Find means class size by grade size 21 | grade4means = grade4.groupby('c_size')['classize'].mean() 22 | grade5means = grade5.groupby('c_size')['classize'].mean() 23 | 24 | # Create grid and function for Maimonides Rule 25 | def maimonides_rule(x): 26 | return x / (np.floor((x - 1)/40) + 1) 27 | 28 | x = np.arange(1, 220, 1) 29 | 30 | # Plot figures (fifth grade on top, matching the Julia version) 31 | fig = plt.figure() 32 | 33 | ax1 = fig.add_subplot(211) 34 | ax1.plot(grade5means) 35 | ax1.plot(x, maimonides_rule(x), '--') 36 | ax1.set_xticks(range(0, 221, 20)) 37 | ax1.set_xlabel("Enrollment count") 38 | ax1.set_ylabel("Class size") 39 | ax1.set_title('A. Fifth grade') 40 | 41 | ax2 = fig.add_subplot(212) 42 | ax2.plot(grade4means) 43 | ax2.plot(x, maimonides_rule(x), '--') 44 | ax2.set_xticks(range(0, 221, 20)) 45 | ax2.set_xlabel("Enrollment count") 46 | ax2.set_ylabel("Class size") 47 | ax2.set_title('B. Fourth grade') 48 | 49 | plt.tight_layout() 50 | plt.savefig('Figure 6-2-1-Python.png', dpi = 300) 51 | 52 | # End of script 53 | -------------------------------------------------------------------------------- /06 Getting a Little Jumpy/Figure 6-2-1.jl: -------------------------------------------------------------------------------- 1 | # Load packages 2 | using DataFrames 3 | using Gadfly 4 | 5 | # Download the data 6 | download("http://economics.mit.edu/files/1359", "final4.dta") 7 | download("http://economics.mit.edu/files/1358", "final5.dta") 8 | 9 | # Load the data (the downloaded .dta files are assumed converted to CSV first) 10 | grade4 = readtable("final4.csv"); 11 | grade5 = readtable("final5.csv"); 12 | 13 | # Find means class size by grade size 14 | grade4 = grade4[[:c_size, :classize]]; 15 | grade4means = aggregate(grade4, :c_size, [mean]) 16 | 17 | grade5 = grade5[[:c_size, :classize]]; 18 | grade5means = aggregate(grade5, :c_size, [mean]) 19 | 20 | # Create function for Maimonides Rule 21 | function maimonides_rule(x) 22 | x / (floor((x - 1)/40) + 1) 23 | end 24 | 25 | ticks = collect(0:20:220) 26 | p_grade4 = plot(layer(x = grade4means[:c_size], y = grade4means[:classize_mean], Geom.line), 27 | layer(maimonides_rule, 1, 220, Theme(line_style = Gadfly.get_stroke_vector(:dot))), 28 | Guide.xticks(ticks = ticks), 29 | Guide.xlabel("Enrollment count"), 30 | Guide.ylabel("Class size"), 31 | Guide.title("B. Fourth grade")) 32 | 33 | p_grade5 = plot(layer(x = grade5means[:c_size], y = grade5means[:classize_mean], Geom.line), 34 | layer(maimonides_rule, 1, 220, Theme(line_style = Gadfly.get_stroke_vector(:dot))), 35 | Guide.xticks(ticks = ticks), 36 | Guide.xlabel("Enrollment count"), 37 | Guide.ylabel("Class size"), 38 | Guide.title("A.
Fifth grade")) 39 | 40 | draw(PNG("Figure 6-2-1-Julia.png", 6inch, 8inch), vstack(p_grade5, p_grade4)) 41 | 42 | # End of script 43 | -------------------------------------------------------------------------------- /04 Instrumental Variables in Action/04 Instrumental Variables in Action.md: -------------------------------------------------------------------------------- 1 | # 04 Instrumental Variables in Action 2 | ## 4.1 IV and causality 3 | 4 | ### Figure 4-1-1 5 | 6 | Completed in [Stata](Figure%204-1-1.do), [R](Figure%204-1-1.r) and [Python](Figure%204-1-1.py) 7 | 8 | ![Figure 4-1-1 in R](https://raw.githubusercontent.com/vikjam/mostly-harmless-replication/master/04%20Instrumental%20Variables%20in%20Action/Figure%204-1-1-R.png) 9 | 10 | ### Table 4-1-2 11 | 12 | Completed in [Stata](Table%204-1-2.do) and [R](Table%204-1-2.r) 13 | 14 | | | Born in the 1st or 2nd quarter of year| Born in the 3rd or 4th quarter of year| Difference| 15 | |:------------------|--------------------------------------:|--------------------------------------:|----------:| 16 | |ln(weekly wage) | 5.893844| 5.905829| 0.0119847| 17 | |Years of education | 12.716122| 12.821813| 0.1056907| 18 | |Wald estimate | NA| NA| 0.1133937| 19 | |Wald std error | NA| NA| 0.0215257| 20 | |OLS estimate | NA| NA| 0.0708510| 21 | |OLS std error | NA| NA| 0.0003386| 22 | 23 | ### Figure 4-6-1 24 | 25 | Completed in [Stata](Figure%204-6-1.do) and [R](Figure%204-6-1.r) 26 | 27 | ![Figure 4-6-1 in R](https://github.com/vikjam/mostly-harmless-replication/blob/master/04%20Instrumental%20Variables%20in%20Action/Figure%204-6-1-R.png?raw=true) 28 | -------------------------------------------------------------------------------- /03 Making Regression Make Sense/Figure 3-1-3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Tested on Python 3.11.5 4 | """ 5 | 6 | import urllib 7 | import zipfile 8 | import urllib.request 9 | import pandas as pd 10 | import statsmodels.api as sm 11 | import statsmodels.formula.api as smf 12 | 13 | # Read the data into a pandas.DataFrame 14 | angrist_archive_url = ( 15 | 'https://economics.mit.edu/sites/' 16 | 'default/files/publications/asciiqob.zip' 17 | ) 18 | pums = pd.read_csv( 19 | angrist_archive_url, 20 | compression = 'zip', 21 | header = None, 22 | sep = '\s+' 23 | ) 24 | pums.columns = ['lwklywge', 'educ', 'yob', 'qob', 'pob'] 25 | 26 | # Panel A 27 | # Set up the model and fit it 28 | mod_a = smf.ols( 29 | formula = 'lwklywge ~ educ', 30 | data = pums 31 | ) 32 | res_a = mod_a.fit() 33 | # Old-fashioned standard errors 34 | print(res_a.summary(title='Old-fashioned standard errors')) 35 | # Robust standard errors 36 | res_a_robust = res_a.get_robustcov_results(cov_type='HC1') 37 | print( 38 | res_a_robust.summary(title='Robust standard errors') 39 | ) 40 | # Panel B 41 | # Calculate means and count by educ attainment 42 | pums_agg = pums.groupby('educ').agg( 43 | lwklywge = ('lwklywge', 'mean'), 44 | count = ('lwklywge', 'count') 45 | ).reset_index() 46 | # Set up the model and fit it 47 | mod_b = smf.wls( 48 | formula = 'lwklywge ~ educ', 49 | weights = pums_agg['count'], 50 | data = pums_agg 51 | ) 52 | res_b = mod_b.fit() 53 | # Old-fashioned standard errors 54 | print(res_b.summary(title='Old-fashioned standard errors')) 55 | # Robust standard errors 56 | res_b_robust = res_b.get_robustcov_results(cov_type='HC1') 57 | print( 58 | res_b_robust.summary(title='Robust standard errors') 59 | ) 60 | 61 | # End of script 62 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mostly Harmless Replication 2 | 3 | 4 | ## Synopsis 5 | 6 | A bold attempt to replicate the tables and figures from the book [_Mostly Harmless Econometrics_](http://www.mostlyharmlesseconometrics.com/) in the following languages: 7 | * Stata 8 | * R 9 | * Python 10 | * Julia 11 | 12 | Why undertake this madness? My primary motivation was to see if I could replace Stata with either R, Python, or Julia in my workflow, so I tried to replicate _Mostly Harmless Econometrics_ in each of these languages. 13 | 14 | ## Chapters 15 | 1. Questions about _Questions_ 16 | 2. The Experimental Ideal 17 | 3. [Making Regression Make Sense](03%20Making%20Regression%20Make%20Sense/03%20Making%20Regression%20Make%20Sense.md) 18 | 4. [Instrumental Variables in Action](04%20Instrumental%20Variables%20in%20Action/04%20Instrumental%20Variables%20in%20Action.md) 19 | 5. [Parallel Worlds](05%20Fixed%20Effects%2C%20DD%20and%20Panel%20Data/05%20Fixed%20Effects%2C%20DD%20and%20Panel%20Data.md) 20 | 6. [Getting a Little Jumpy](06%20Getting%20a%20Little%20Jumpy/06%20Getting%20a%20Little%20Jumpy.md) 21 | 7. [Quantile Regression](07%20Quantile%20Regression/07%20Quantile%20Regression.md) 22 | 8. [Nonstandard Standard Error Issues](08%20Nonstandard%20Standard%20Error%20Issues/08%20Nonstanard%20Standard%20Error%20Issues.md) 23 | 24 | ## Getting started 25 | Check out [Getting Started](https://github.com/vikjam/mostly-harmless-replication/wiki/Getting-started) in the Wiki for tips on setting up your machine with each of these languages. 26 | 27 | ## Contributions 28 | Feel free to submit [pull requests](https://github.com/blog/1943-how-to-write-the-perfect-pull-request)! 
29 | 30 | -------------------------------------------------------------------------------- /03 Making Regression Make Sense/Figure 3-1-3.r: -------------------------------------------------------------------------------- 1 | # R code for Figure 3.1.3 # 2 | # Required packages # 3 | # - sandwich: robust standard errors # 4 | # - lmtest: print table with robust standard errors # 5 | # - data.table: aggregate function # 6 | library(sandwich) 7 | library(lmtest) 8 | library(data.table) 9 | # Download data and unzip the data 10 | download.file('http://economics.mit.edu/files/397', 'asciiqob.zip') 11 | unzip('asciiqob.zip') 12 | 13 | # Read the data into a dataframe 14 | pums <- read.table('asciiqob.txt', 15 | header = FALSE, 16 | stringsAsFactors = FALSE) 17 | names(pums) <- c('lwklywge', 'educ', 'yob', 'qob', 'pob') 18 | 19 | # Panel A 20 | # Estimate OLS regression 21 | reg.model <- lm(lwklywge ~ educ, data = pums) 22 | # Robust standard errors 23 | robust.reg.vcov <- vcovHC(reg.model, "HC1") 24 | # Print results 25 | print(summary(reg.model)) 26 | print(coeftest(reg.model, vcov = robust.reg.vcov)) 27 | 28 | # Panel B 29 | # Figure out which observations appear in the regression 30 | sample <- !is.na(predict(reg.model, newdata = pums)) 31 | pums.data.table <- data.table(pums[sample, ]) 32 | # Aggregate 33 | educ.means <- pums.data.table[ , list(mean = mean(lwklywge), 34 | count = length(lwklywge)), 35 | by = educ] 36 | # Estimate weighted OLS regression on the educ-cell means 37 | wgt.reg.model <- lm(mean ~ educ, 38 | weights = educ.means$count, 39 | data = educ.means) 40 | # Robust standard errors with weighted OLS regression 41 | wgt.robust.reg.vcov <- vcovHC(wgt.reg.model, "HC1") 42 | # Print results 43 | print(summary(wgt.reg.model)) 44 | print(coeftest(wgt.reg.model, vcov = wgt.robust.reg.vcov)) 45 | 46 | # End of file 47 | -------------------------------------------------------------------------------- /03 Making Regression Make Sense/Table 3-3-2.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | eststo clear 4 | /* Required programs */ 5 | /* - estout: output table */ 6 | 7 | /* Stata code for Table 3.3.2 */ 8 | 9 | * Store URL to MHE Data Archive in local 10 | local base_url = "https://economics.mit.edu/sites/default/files/inline-files" 11 | 12 | /* Store variable list in local */ 13 | local summary_var "age ed black hisp nodeg married re74 re75" 14 | local pscore_var "age age2 ed black hisp married nodeg re74 re75" 15 | 16 | /* Columns 1 and 2 */ 17 | use "`base_url'/nswre74.dta", clear 18 | eststo column_1, title("NSW Treat"): estpost summarize `summary_var' if treat == 1 19 | eststo column_2, title("NSW Control"): estpost summarize `summary_var' if treat == 0 20 | 21 | /* Column 3 */ 22 | use "`base_url'/cps1re74.dta", clear 23 | eststo column_3, title("Full CPS-1"): estpost summarize `summary_var' if treat == 0 24 | 25 | /* Column 5 */ 26 | probit treat `pscore_var' 27 | predict p_score, pr 28 | keep if p_score > 0.1 & p_score < 0.9 29 | 30 | eststo column_5, title("P-score CPS-1"): estpost summarize `summary_var' if treat == 0 31 | 32 | /* Column 4 */ 33 | use "`base_url'/cps3re74.dta", clear 34 | eststo column_4, title("Full CPS-3"): estpost summarize `summary_var' if treat == 0 35 | 36 | /* Column 6 */ 37 | probit treat `pscore_var' 38 | predict p_score, pr 39 | keep if p_score > 0.1 & p_score < 0.9 40 | 41 | eststo column_6, title("P-score CPS-3"): estpost summarize `summary_var' if treat == 0 42 | 43 | /* Label variables */ 44 | label var age
"Age" 45 | label var ed "Years of Schooling" 46 | label var black "Black" 47 | label var hisp "Hispanic" 48 | label var nodeg "Dropout" 49 | label var married "Married" 50 | label var re74 "1974 earnings" 51 | label var re75 "1975 earnings" 52 | 53 | /* Output Table */ 54 | esttab column_1 column_2 column_3 column_4 column_5 column_6, /// 55 | label mtitle /// 56 | cells(mean(label(Mean) fmt(2 2 2 2 2 2 0 0))) 57 | 58 | /* End of script */ 59 | exit 60 | -------------------------------------------------------------------------------- /04 Instrumental Variables in Action/Figure 4-6-1.jl: -------------------------------------------------------------------------------- 1 | # Julia code for Table 4-6-1 # 2 | # Required packages # 3 | # - DataFrames: data manipulation / storage # 4 | # - Distributions: extended stats functions # 5 | # - FixedEffectModels: IV regression # 6 | using DataFrames 7 | using Distributions 8 | using FixedEffectModels 9 | using GLM 10 | using Gadfly 11 | 12 | # Number of simulations 13 | nsims = 1000 14 | 15 | # Set seed 16 | srand(113643) 17 | 18 | # Set parameters 19 | Sigma = [1.0 0.8; 20 | 0.8 1.0] 21 | N = 1000 22 | 23 | function irrelevantInstrMC() 24 | # Create Z, xi and eta 25 | Z = DataFrame(transpose(rand(MvNormal(eye(20)), N))) 26 | errors = DataFrame(transpose(rand(MvNormal(Sigma), N))) 27 | 28 | # Rename columns of Z and errors 29 | names!(Z, [Symbol("z$i") for i in 1:20]) 30 | names!(errors, [:eta, :xi]) 31 | 32 | # Create y and x 33 | df = hcat(Z, errors); 34 | df[:x] = 0.1 .* df[:z1] .+ df[:xi] 35 | df[:y] = df[:x] .+ df[:eta] 36 | 37 | # Run regressions 38 | ols = coef(lm(@formula(y ~ x), df))[2] 39 | tsls = coef(reg(df, @model(y ~ z1 + z2 + z3 + z4 + z5 + z6 + z7 + z8 + z9 + z10 + 40 | z11 + z12 + z13 + z14 + z15 + z16 + z17 + z18 + z19 + z20)))[2] 41 | return([ols tsls]) 42 | end 43 | 44 | # Simulate IV regressions 45 | simulation_results = zeros(nsims, 2); 46 | for i = 1:nsims 47 | simulation_results[i, :] = irrelevantInstrMC() 48 | end 49 | 50 | # Create empirical CDFs from simulated results 51 | ols_ecdf = ecdf(simulation_results[:, 1]) 52 | tsls_ecdf = ecdf(simulation_results[:, 2]) 53 | 54 | # Plot the empirical CDFs of each estimator 55 | p = plot(layer(ols_ecdf, 0, 2.5, Theme(default_color = colorant"red")), 56 | layer(tsls_ecdf, 0, 2.5, Theme(line_style = :dot)), 57 | layer(xintercept = [0.5], Geom.vline, 58 | Theme(default_color = colorant"black", line_style = :dot)), 59 | layer(yintercept = [0.5], Geom.hline, 60 | Theme(default_color = colorant"black", line_style = :dot)), 61 | Guide.xlabel("Estimated β"), 62 | Guide.ylabel("Fn(Estimated β)")) 63 | 64 | # Export figure as .png 65 | draw(PNG("Figure 4-6-1-Julia.png", 7inch, 6inch), p) 66 | 67 | # End of script 68 | -------------------------------------------------------------------------------- /06 Getting a Little Jumpy/Figure 6-1-1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Create Figure 4-6-1 in MHE 4 | Tested on Python 3.4 5 | numpy: math and stat functions, array 6 | matplotlib: plot figures 7 | """ 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | # Set seed 13 | np.random.seed(10633) 14 | 15 | # Set number of simulations 16 | nobs = 100 17 | 18 | # Generate series 19 | x = np.random.uniform(0, 1, nobs) 20 | x = np.sort(x) 21 | y_linear = x + (x > 0.5) * 0.25 + np.random.normal(0, 0.1, nobs) 22 | y_nonlin = 0.5 * np.sin(6 * (x - 0.5)) + 0.5 + (x > 0.5) * 0.25 + np.random.normal(0, 
0.1, nobs) 23 | y_mistake = 1 / (1 + np.exp(-25 * (x - 0.5))) + np.random.normal(0, 0.1, nobs) 24 | 25 | # Fit lines using user-created function 26 | def rdfit(x, y, cutoff, degree): 27 | coef_0 = np.polyfit(x[cutoff >= x], y[cutoff >= x], degree) 28 | fit_0 = np.polyval(coef_0, x[cutoff >= x]) 29 | 30 | coef_1 = np.polyfit(x[x > cutoff], y[x > cutoff], degree) 31 | fit_1 = np.polyval(coef_1, x[x > cutoff]) 32 | 33 | return coef_0, fit_0, coef_1, fit_1 34 | 35 | coef_y_linear_0 , fit_y_linear_0 , coef_y_linear_1 , fit_y_linear_1 = rdfit(x, y_linear, 0.5, 1) 36 | coef_y_nonlin_0 , fit_y_nonlin_0 , coef_y_nonlin_1 , fit_y_nonlin_1 = rdfit(x, y_nonlin, 0.5, 2) 37 | coef_y_mistake_0, fit_y_mistake_0, coef_y_mistake_1, fit_y_mistake_1 = rdfit(x, y_mistake, 0.5, 1) 38 | 39 | # Plot figures 40 | fig = plt.figure() 41 | 42 | ax1 = fig.add_subplot(311) 43 | ax1.scatter(x, y_linear, edgecolors = 'none') 44 | ax1.plot(x[0.5 >= x], fit_y_linear_0) 45 | ax1.plot(x[x > 0.5], fit_y_linear_1) 46 | ax1.axvline(0.5) 47 | ax1.set_title(r'A. Linear $E[Y_{0i} | X_i]$') 48 | 49 | ax2 = fig.add_subplot(312) 50 | ax2.scatter(x, y_nonlin, edgecolors = 'none') 51 | ax2.plot(x[0.5 >= x], fit_y_nonlin_0) 52 | ax2.plot(x[x > 0.5], fit_y_nonlin_1) 53 | ax2.axvline(0.5) 54 | ax2.set_title(r'B. Nonlinear $E[Y_{0i} | X_i]$') 55 | 56 | ax3 = fig.add_subplot(313) 57 | ax3.scatter(x, y_mistake, edgecolors = 'none') 58 | ax3.plot(x[0.5 >= x], fit_y_mistake_0) 59 | ax3.plot(x[x > 0.5], fit_y_mistake_1) 60 | ax3.plot(x, 1 / (1 + np.exp(-25 * (x - 0.5))), '--') 61 | ax3.axvline(0.5) 62 | ax3.set_title('C. Nonlinearity mistaken for discontinuity') 63 | 64 | plt.tight_layout() 65 | plt.savefig('Figure 6-1-1-Python.png', dpi = 300) 66 | 67 | # End of script 68 | -------------------------------------------------------------------------------- /04 Instrumental Variables in Action/Table 4-1-1.r: -------------------------------------------------------------------------------- 1 | # R code for Table 4-1-1 # 2 | # Required packages # 3 | # - data.table: data management # 4 | # - sandwich: standard errors # 5 | # - AER: running IV regressions # 6 | 7 | library(data.table) 8 | library(sandwich) 9 | library(AER) 10 | 11 | # Download data and unzip the data 12 | download.file('http://economics.mit.edu/files/397', 'asciiqob.zip') 13 | unzip('asciiqob.zip') 14 | 15 | # Read the data into a data.table 16 | pums <- fread('asciiqob.txt', 17 | header = FALSE, 18 | stringsAsFactors = FALSE) 19 | names(pums) <- c('lwklywge', 'educ', 'yob', 'qob', 'pob') 20 | 21 | # Column 1: OLS 22 | col1 <- lm(lwklywge ~ educ, pums) 23 | 24 | # Column 2: OLS with YOB, POB dummies 25 | col2 <- lm(lwklywge ~ educ + factor(yob) + factor(pob), pums) 26 | 27 | # Create dummies for quarter of birth 28 | qobs <- unique(pums$qob) 29 | qobs.vars <- sapply(qobs, function(x) paste0('qob', x)) 30 | pums[, (qobs.vars) := lapply(qobs, function(x) qob == x)] 31 | 32 | # Column 3: 2SLS with instrument QOB = 1 33 | col3 <- ivreg(lwklywge ~ educ, ~ qob1, pums) 34 | 35 | # Column 4: 2SLS with YOB, POB dummies and instrument QOB = 1 36 | col4 <- ivreg(lwklywge ~ factor(yob) + factor(pob) + educ, 37 | ~ factor(yob) + factor(pob) + qob1, 38 | pums) 39 | 40 | # Create dummy for quarter 1 or 2 41 | pums[, qob1or2 := qob == 1 | qob == 2] 42 | 43 | # Column 5: 2SLS with YOB, POB dummies and instrument (QOB = 1 | QOB = 2) 44 | col5 <- ivreg(lwklywge ~ factor(yob) + factor(pob) + educ, 45 | ~ factor(yob) + factor(pob) + qob1or2, 46 | pums) 47 | 48 | # Column 6: 2SLS with YOB, POB dummies and 
full QOB dummies 49 | col6 <- ivreg(lwklywge ~ factor(yob) + factor(pob) + educ, 50 | ~ factor(yob) + factor(pob) + factor(qob), 51 | pums) 52 | 53 | # Column 7: 2SLS with YOB, POB dummies and full QOB dummies interacted with YOB 54 | col7 <- ivreg(lwklywge ~ factor(yob) + factor(pob) + educ, 55 | ~ factor(pob) + factor(qob) * factor(yob), 56 | pums) 57 | 58 | # Column 8: 2SLS with age, YOB, POB dummies and with full QOB dummies interacted with YOB 59 | col8 <- ivreg(lwklywge ~ factor(yob) + factor(pob) + educ, 60 | ~ factor(pob) + factor(qob) * factor(yob), 61 | pums) 62 | 63 | # End of script 64 | -------------------------------------------------------------------------------- /04 Instrumental Variables in Action/Figure 4-1-1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Create Figure 4-1-1 in MHE 4 | Tested on Python 3.4 5 | """ 6 | 7 | import zipfile 8 | import urllib.request 9 | import pandas as pd 10 | import statsmodels.api as sm 11 | import matplotlib.pyplot as plt 12 | from matplotlib.ticker import FormatStrFormatter 13 | 14 | # Download data and unzip the data 15 | urllib.request.urlretrieve('http://economics.mit.edu/files/397', 'asciiqob.zip') 16 | with zipfile.ZipFile('asciiqob.zip', "r") as z: 17 | z.extractall() 18 | 19 | # Read the data into a pandas dataframe 20 | pums = pd.read_csv('asciiqob.txt', 21 | header = None, 22 | delim_whitespace = True) 23 | pums.columns = ['lwklywge', 'educ', 'yob', 'qob', 'pob'] 24 | 25 | # Calculate means by educ and lwklywge 26 | groupbybirth = pums.groupby(['yob', 'qob']) 27 | birth_means = groupbybirth[['lwklywge', 'educ']].mean() 28 | 29 | # Create function to plot figures 30 | def plot_qob(yvar, ax, title, ylabel): 31 | values = yvar.values 32 | ax.plot(values, color = 'k') 33 | 34 | for i, y in enumerate(yvar): 35 | qob = yvar.index.get_level_values('qob')[i] 36 | ax.annotate(qob, 37 | (i, y), 38 | xytext = (-5, 5), 39 | textcoords = 'offset points') 40 | if qob == 1: 41 | ax.scatter(i, y, marker = 's', facecolors = 'none', edgecolors = 'k') 42 | else: 43 | ax.scatter(i, y, marker = 's', color = 'k') 44 | 45 | ax.set_xticks(range(0, len(yvar), 4)) 46 | ax.set_xticklabels(yvar.index.get_level_values('yob')[1::4]) 47 | ax.set_title(title) 48 | ax.set_ylabel(ylabel) 49 | ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f')) 50 | ax.set_xlabel("Year of birth") 51 | ax.margins(0.1) 52 | 53 | fig, (ax1, ax2) = plt.subplots(2, sharex = True) 54 | 55 | plot_qob(yvar = birth_means['educ'], 56 | ax = ax1, 57 | title = 'A. Average education by quarter of birth (first stage)', 58 | ylabel = 'Years of education') 59 | 60 | plot_qob(yvar = birth_means['lwklywge'], 61 | ax = ax2, 62 | title = 'B.
Average weekly wage by quarter of birth (reduced form)', 63 | ylabel = 'Log weekly earnings') 64 | 65 | fig.tight_layout() 66 | fig.savefig('Figure 4-1-1-Python.pdf', format = 'pdf') 67 | 68 | # End of file 69 | -------------------------------------------------------------------------------- /06 Getting a Little Jumpy/Figure 6-1-1.r: -------------------------------------------------------------------------------- 1 | # R code for Figure 6-1-1 # 2 | # Required packages # 3 | # - ggplot2: making pretty graphs # 4 | # - gridExtra: combine graphs # 5 | library(ggplot2) 6 | library(gridExtra) 7 | set.seed(10633) 8 | # Generate series 9 | nobs = 100 10 | x <- runif(nobs) 11 | y.linear <- x + (x > 0.5) * 0.25 + rnorm(n = nobs, mean = 0, sd = 0.1) 12 | y.nonlin <- 0.5 * sin(6 * (x - 0.5)) + 0.5 + (x > 0.5) * 0.25 + rnorm(n = nobs, mean = 0, sd = 0.1) 13 | y.mistake <- 1 / (1 + exp(-25 * (x - 0.5))) + rnorm(n = nobs, mean = 0, sd = 0.1) 14 | rd.series <- data.frame(x, y.linear, y.nonlin, y.mistake) 15 | 16 | # Make graph with ggplot2 17 | g.data <- ggplot(rd.series, aes(x = x, group = x > 0.5)) 18 | 19 | p.linear <- g.data + geom_point(aes(y = y.linear)) + 20 | stat_smooth(aes(y = y.linear), 21 | method = "lm", 22 | se = FALSE) + 23 | geom_vline(xintercept = 0.5) + 24 | ylab("Outcome") + 25 | ggtitle(bquote('A. Linear E[' * Y["0i"] * '|' * X[i] * ']')) 26 | 27 | p.nonlin <- g.data + geom_point(aes(y = y.nonlin)) + 28 | stat_smooth(aes(y = y.nonlin), 29 | method = "lm", 30 | formula = y ~ poly(x, 2), 31 | se = FALSE) + 32 | geom_vline(xintercept = 0.5) + 33 | ylab("Outcome") + 34 | ggtitle(bquote('B. Nonlinear E[' * Y["0i"] * '|' * X[i] * ']')) 35 | 36 | f.mistake <- function(x) {1 / (1 + exp(-25 * (x - 0.5)))} 37 | p.mistake <- g.data + geom_point(aes(y = y.mistake)) + 38 | stat_smooth(aes(y = y.mistake), 39 | method = "lm", 40 | se = FALSE) + 41 | stat_function(fun = f.mistake, 42 | linetype = "dashed") + 43 | geom_vline(xintercept = 0.5) + 44 | ylab("Outcome") + 45 | ggtitle('C.
Nonlinearity mistaken for discontinuity') 46 | 47 | p.rd.examples <- arrangeGrob(p.linear, p.nonlin, p.mistake, ncol = 1) 48 | 49 | ggsave(p.rd.examples, file = "Figure 6-1-1-R.pdf", width = 5, height = 9) 50 | 51 | # End of script 52 | -------------------------------------------------------------------------------- /04 Instrumental Variables in Action/Table 4-1-2.r: -------------------------------------------------------------------------------- 1 | # R code for Table 4-1-2 # 2 | # Required packages # 3 | # - data.table: data management # 4 | # - systemfit: SUR; car: delta method # 5 | library(data.table) 6 | library(systemfit) 7 | library(car) 8 | # Download data and unzip the data 9 | download.file('http://economics.mit.edu/files/397', 'asciiqob.zip') 10 | unzip('asciiqob.zip') 11 | 12 | # Read the data into a data.table 13 | pums <- fread('asciiqob.txt', 14 | header = FALSE, 15 | stringsAsFactors = FALSE) 16 | names(pums) <- c('lwklywge', 'educ', 'yob', 'qob', 'pob') 17 | 18 | # Create binary variable 19 | pums$z <- (pums$qob == 3 | pums$qob == 4) * 1 20 | 21 | # Compare means (and differences) 22 | ttest.lwklywge <- t.test(lwklywge ~ z, pums) 23 | ttest.educ <- t.test(educ ~ z, pums) 24 | 25 | # Compute Wald estimate 26 | sur <- systemfit(list(first = educ ~ z, 27 | second = lwklywge ~ z), 28 | data = pums, 29 | method = "SUR") 30 | wald <- deltaMethod(sur, "second_z / first_z") 31 | 32 | wald.estimate <- (mean(pums$lwklywge[pums$z == 1]) - mean(pums$lwklywge[pums$z == 0])) / 33 | (mean(pums$educ[pums$z == 1]) - mean(pums$educ[pums$z == 0])) 34 | # (this manual ratio should match wald$Estimate; its SE comes from the delta method above) 35 | 36 | # OLS estimate 37 | ols <- lm(lwklywge ~ educ, pums) 38 | 39 | # Construct table 40 | lwklywge.row <- c(ttest.lwklywge$estimate[1], 41 | ttest.lwklywge$estimate[2], 42 | ttest.lwklywge$estimate[2] - ttest.lwklywge$estimate[1]) 43 | educ.row <- c(ttest.educ$estimate[1], 44 | ttest.educ$estimate[2], 45 | ttest.educ$estimate[2] - ttest.educ$estimate[1]) 46 | wald.row.est <- c(NA, NA, wald$Estimate) 47 | wald.row.se <- c(NA, NA, wald$SE) 48 | 49 | ols.row.est <- c(NA, NA, summary(ols)$coef['educ' , 'Estimate']) 50 | ols.row.se <- c(NA, NA, summary(ols)$coef['educ' , 'Std.
Error']) 51 | 52 | table <- rbind(lwklywge.row, educ.row, 53 | wald.row.est, wald.row.se, 54 | ols.row.est, ols.row.se) 55 | colnames(table) <- c("Born in the 1st or 2nd quarter of year", 56 | "Born in the 3rd or 4th quarter of year", 57 | "Difference") 58 | rownames(table) <- c("ln(weekly wage)", 59 | "Years of education", 60 | "Wald estimate", 61 | "Wald std error", 62 | "OLS estimate", 63 | "OLS std error") 64 | 65 | # End of script 66 | -------------------------------------------------------------------------------- /08 Nonstandard Standard Error Issues/08 Nonstanard Standard Error Issues.md: -------------------------------------------------------------------------------- 1 | # 08 Nonstandard Standard Error Issues 2 | ## 8.1 The Bias of Robust Standard Errors 3 | 4 | ### Table 8.1.1 5 | Completed in [Stata](Table%208-1-1.do), [R](Table%208-1-1.r), [Python](Table%208-1-1.py) and [Julia](Table%208-1-1.jl) 6 | 7 | _Panel A: Lots of Heteroskedasticity_ 8 | 9 | |Estimate | Mean| Std| Normal| t| 10 | |:----------------------|------:|-----:|------:|-----:| 11 | |Beta_1 | -0.006| 0.581| NA| NA| 12 | |Conventional | 0.331| 0.052| 0.269| 0.249| 13 | |HC0 | 0.433| 0.210| 0.227| 0.212| 14 | |HC1 | 0.448| 0.218| 0.216| 0.201| 15 | |HC2 | 0.525| 0.260| 0.171| 0.159| 16 | |HC3 | 0.638| 0.321| 0.124| 0.114| 17 | |max(Conventional, HC0) | 0.461| 0.182| 0.174| 0.159| 18 | |max(Conventional, HC1) | 0.474| 0.191| 0.167| 0.152| 19 | |max(Conventional, HC2) | 0.543| 0.239| 0.136| 0.123| 20 | |max(Conventional, HC3) | 0.650| 0.305| 0.101| 0.091| 21 | 22 | _Panel B: Little Heteroskedasticity_ 23 | 24 | |Estimate | Mean| Std| Normal| t| 25 | |:----------------------|------:|-----:|------:|-----:| 26 | |Beta_1 | -0.006| 0.595| NA| NA| 27 | |Conventional | 0.519| 0.070| 0.097| 0.084| 28 | |HC0 | 0.456| 0.200| 0.204| 0.188| 29 | |HC1 | 0.472| 0.207| 0.191| 0.175| 30 | |HC2 | 0.546| 0.251| 0.153| 0.140| 31 | |HC3 | 0.656| 0.312| 0.112| 0.102| 32 | |max(Conventional, HC0) | 0.569| 0.130| 0.081| 0.070| 33 | |max(Conventional, HC1) | 0.577| 0.139| 0.079| 0.067| 34 | |max(Conventional, HC2) | 0.625| 0.187| 0.068| 0.058| 35 | |max(Conventional, HC3) | 0.712| 0.260| 0.054| 0.045| 36 | 37 | _Panel C: No Heteroskedasticity_ 38 | 39 | |Estimate | Mean| Std| Normal| t| 40 | |:----------------------|------:|-----:|------:|-----:| 41 | |Beta_1 | -0.006| 0.604| NA| NA| 42 | |Conventional | 0.603| 0.081| 0.059| 0.049| 43 | |HC0 | 0.469| 0.196| 0.193| 0.177| 44 | |HC1 | 0.485| 0.203| 0.180| 0.165| 45 | |HC2 | 0.557| 0.246| 0.145| 0.131| 46 | |HC3 | 0.667| 0.308| 0.106| 0.097| 47 | |max(Conventional, HC0) | 0.633| 0.116| 0.052| 0.043| 48 | |max(Conventional, HC1) | 0.639| 0.123| 0.051| 0.042| 49 | |max(Conventional, HC2) | 0.678| 0.166| 0.045| 0.036| 50 | |max(Conventional, HC3) | 0.752| 0.237| 0.036| 0.030| 51 | -------------------------------------------------------------------------------- /04 Instrumental Variables in Action/Figure 4-1-1.r: -------------------------------------------------------------------------------- 1 | # R code for Figure 4-1-1 # 2 | # Required packages # 3 | # - dplyr: easy data manipulation # 4 | # - lubridate: data management # 5 | # - ggplot2: making pretty graphs # 6 | # - gridExtra: combine graphs # 7 | library(lubridate) 8 | library(dplyr) 9 | library(ggplot2) 10 | library(gridExtra) 11 | 12 | # Download data and unzip the data 13 | download.file('http://economics.mit.edu/files/397', 'asciiqob.zip') 14 | unzip('asciiqob.zip') 15 | 16 | # Read the data into a dataframe 17 | pums <- 
read.table('asciiqob.txt', 18 | header = FALSE, 19 | stringsAsFactors = FALSE) 20 | names(pums) <- c('lwklywge', 'educ', 'yob', 'qob', 'pob') 21 | 22 | # Collapse to means by quarter of birth 23 | pums.qob.means <- pums %>% group_by(yob, qob) %>% summarise_all(mean) 24 | 25 | # Add dates 26 | pums.qob.means$yqob <- ymd(paste0("19", 27 | pums.qob.means$yob, 28 | pums.qob.means$qob * 3), 29 | truncated = 2) 30 | 31 | # Function for plotting data 32 | plot.qob <- function(ggplot.obj, ggtitle, ylab) { 33 | gg.colours <- c("firebrick", rep("black", 3), "white") 34 | ggplot.obj + geom_line() + 35 | geom_point(aes(colour = factor(qob)), 36 | size = 5) + 37 | geom_text(aes(label = qob, colour = "white"), 38 | size = 3, 39 | hjust = 0.5, vjust = 0.5, 40 | show.legend = FALSE) + 41 | scale_colour_manual(values = gg.colours, guide = "none") + 42 | ggtitle(ggtitle) + 43 | xlab("Year of birth") + 44 | ylab(ylab) + 45 | theme_gray(base_size = 10) 46 | } 47 | 48 | # Plot 49 | p.educ <- plot.qob(ggplot(pums.qob.means, aes(x = yqob, y = educ)), 50 | "A. Average education by quarter of birth (first stage)", 51 | "Years of education") 52 | p.lwklywge <- plot.qob(ggplot(pums.qob.means, aes(x = yqob, y = lwklywge)), 53 | "B. Average weekly wage by quarter of birth (reduced form)", 54 | "Log weekly earnings") 55 | 56 | p.ivgraph <- arrangeGrob(p.educ, p.lwklywge) 57 | 58 | ggsave(p.ivgraph, file = "Figure 4-1-1-R.png", height = 12, width = 8, dpi = 300) 59 | 60 | # End of script 61 | -------------------------------------------------------------------------------- /06 Getting a Little Jumpy/Figure 6-2-1.r: -------------------------------------------------------------------------------- 1 | # R code for Figure 6-2-1 # 2 | # Required packages # 3 | # - haven: read Stata .dta files # 4 | # - ggplot2: making pretty graphs # 5 | # - gridExtra: combine graphs # 6 | library(haven) 7 | library(ggplot2) 8 | library(gridExtra) 9 | 10 | # Download the data 11 | download.file("http://economics.mit.edu/files/1359", "final4.dta") 12 | download.file("http://economics.mit.edu/files/1358", "final5.dta") 13 | 14 | # Load the data 15 | grade4 <- read_dta("final4.dta") 16 | grade5 <- read_dta("final5.dta") 17 | 18 | # Restrict sample to 1 < classize < 45 and c_size > 5, as in the Stata version 19 | grade4 <- grade4[which(grade4$classize > 1 & grade4$classize < 45 & grade4$c_size > 5), ] 20 | grade5 <- grade5[which(grade5$classize > 1 & grade5$classize < 45 & grade5$c_size > 5), ] 21 | 22 | # Find mean class size by grade size 23 | grade4cmeans <- aggregate(grade4$classize, 24 | by = list(grade4$c_size), 25 | FUN = mean, 26 | na.rm = TRUE) 27 | grade5cmeans <- aggregate(grade5$classize, 28 | by = list(grade5$c_size), 29 | FUN = mean, 30 | na.rm = TRUE) 31 | 32 | # Rename aggregated columns 33 | colnames(grade4cmeans) <- c("c_size", "classize.mean") 34 | colnames(grade5cmeans) <- c("c_size", "classize.mean") 35 | 36 | # Create function for Maimonides Rule 37 | maimonides.rule <- function(x) {x / (floor((x - 1)/40) + 1)} 38 | 39 | # Plot each grade 40 | g4 <- ggplot(data = grade4cmeans, aes(x = c_size)) 41 | p4 <- g4 + geom_line(aes(y = classize.mean)) + 42 | stat_function(fun = maimonides.rule, 43 | linetype = "dashed") + 44 | expand_limits(y = 0) + 45 | scale_x_continuous(breaks = seq(0, 220, 20)) + 46 | ylab("Class size") + 47 | xlab("Enrollment count") + 48 | ggtitle("B. 
Fourth grade") 49 | 50 | g5 <- ggplot(data = grade5cmeans, aes(x = c_size)) 51 | p5 <- g5 + geom_line(aes(y = classize.mean)) + 52 | stat_function(fun = maimonides.rule, 53 | linetype = "dashed") + 54 | expand_limits(y = 0) + 55 | scale_x_continuous(breaks = seq(0, 220, 20)) + 56 | ylab("Class size") + 57 | xlab("Enrollment count") + 58 | ggtitle("A. Fifth grade") 59 | 60 | first.stage <- arrangeGrob(p5, p4, ncol = 1) 61 | ggsave(first.stage, file = "Figure 6-2-1-R.png", height = 8, width = 5, dpi = 300) 62 | 63 | # End of script 64 | -------------------------------------------------------------------------------- /07 Quantile Regression/Table 7-1-1.r: -------------------------------------------------------------------------------- 1 | # R code for Table 7.1.1 # 2 | # Required packages # 3 | # - haven: read in .dta files # 4 | # - quantreg: quantile regressions # 5 | # - knitr: markdown tables # 6 | library(haven) 7 | library(quantreg) 8 | library(knitr) 9 | 10 | # Download data and unzip the data 11 | download.file('http://economics.mit.edu/files/384', 'angcherfer06.zip') 12 | unzip('angcherfer06.zip') 13 | 14 | # Create a function to run the quantile/OLS regressions so we can use a loop 15 | quant.mincer <- function(tau, data) { 16 | r <- rq(logwk ~ educ + black + exper + exper2, 17 | weights = perwt, 18 | data = data, 19 | tau = tau) 20 | return(rbind(summary(r)$coefficients["educ", "Value"], 21 | summary(r)$coefficients["educ", "Std. Error"])) 22 | } 23 | 24 | # Create function for producing the results 25 | calculate.qr <- function(year) { 26 | 27 | # Create file path 28 | dta.path <- paste('Data/census', year, '.dta', sep = "") 29 | 30 | # Load year into the census 31 | df <- read_dta(dta.path) 32 | 33 | # Run quantile regressions 34 | taus <- c(0.1, 0.25, 0.5, 0.75, 0.9) 35 | qr <- sapply(taus, quant.mincer, data = df) 36 | 37 | # Run OLS regressions and get RMSE 38 | ols <- lm(logwk ~ educ + black + exper + exper2, 39 | weights = perwt, 40 | data = df) 41 | coef.se <- rbind(summary(ols)$coefficients["educ", "Estimate"], 42 | summary(ols)$coefficients["educ", "Std. 
Error"]) 43 | rmse <- sqrt(sum(summary(ols)$residuals^2) / ols$df.residual) 44 | 45 | # Summary statistics 46 | obs <- length(na.omit(df$educ)) 47 | mean <- mean(df$logwk, na.rm = TRUE) 48 | sd <- sd(df$logwk, na.rm = TRUE) 49 | 50 | return(cbind(rbind(obs, NA), 51 | rbind(mean, NA), 52 | rbind(sd, NA), 53 | qr, 54 | coef.se, 55 | rbind(rmse, NA))) 56 | 57 | } 58 | 59 | # Generate results 60 | results <- rbind(calculate.qr("80"), 61 | calculate.qr("90"), 62 | calculate.qr("00")) 63 | 64 | # Name rows and columns 65 | row.names(results) <- c("1980", "", "1990", "", "2000", "") 66 | colnames(results) <- c("Obs", "Mean", "Std Dev", 67 | "0.1", "0.25", "0.5", "0.75", "0.9", 68 | "OLS", "RMSE") 69 | 70 | # Format decimals 71 | results <- round(results, 3) 72 | results[ , c(2, 10)] <- round(results[ , c(2, 10)], 2) 73 | results[ , 1] <- formatC(results[ , 1], format = "d", big.mark = ",") 74 | 75 | # Export table 76 | print(kable(results)) 77 | 78 | # End of file 79 | -------------------------------------------------------------------------------- /03 Making Regression Make Sense/Table 3-3-2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Tested on Python 3.11.13 4 | """ 5 | 6 | import urllib.request 7 | import pandas as pd 8 | import statsmodels.api as sm 9 | import numpy as np 10 | import patsy 11 | from tabulate import tabulate 12 | 13 | # Read the Stata files into Python directly from website 14 | base_url = 'https://economics.mit.edu/sites/default/files/inline-files' 15 | nswre74 = pd.read_stata(f"{base_url}/nswre74.dta") 16 | cps1re74 = pd.read_stata(f"{base_url}/cps1re74.dta") 17 | cps3re74 = pd.read_stata(f"{base_url}/cps3re74.dta") 18 | 19 | # Store list of variables for summary 20 | summary_vars = ['age', 'ed', 'black', 'hisp', 'nodeg', 'married', 're74', 're75'] 21 | 22 | # Calculate propensity scores 23 | # Create formula for probit 24 | f = 'treat ~ ' + ' + '.join(['age', 'age2', 'ed', 'black', 'hisp', \ 25 | 'nodeg', 'married', 're74', 're75']) 26 | 27 | # Run probit with CPS-1 28 | y, X = patsy.dmatrices(f, cps1re74, return_type = 'dataframe') 29 | model = sm.Probit(y, X).fit() 30 | cps1re74['pscore'] = model.predict(X) 31 | 32 | # Run probit with CPS-3 33 | y, X = patsy.dmatrices(f, cps3re74, return_type = 'dataframe') 34 | model = sm.Probit(y, X).fit() 35 | cps3re74['pscore'] = model.predict(X) 36 | 37 | # Create function to summarize data 38 | def summarize(dataset, conditions): 39 | stats = dataset[summary_vars][conditions].mean() 40 | stats['count'] = sum(conditions) 41 | return stats 42 | 43 | # Summarize data 44 | nswre74_treat_stats = summarize(nswre74, nswre74.treat == 1) 45 | nswre74_control_stats = summarize(nswre74, nswre74.treat == 0) 46 | cps1re74_control_stats = summarize(cps1re74, cps1re74.treat == 0) 47 | cps3re74_control_stats = summarize(cps3re74, cps3re74.treat == 0) 48 | cps1re74_ptrim_stats = summarize(cps1re74, (cps1re74.treat == 0) & \ 49 | (cps1re74.pscore > 0.1) & \ 50 | (cps1re74.pscore < 0.9)) 51 | cps3re74_ptrim_stats = summarize(cps3re74, (cps3re74.treat == 0) & \ 52 | (cps3re74.pscore > 0.1) & \ 53 | (cps3re74.pscore < 0.9)) 54 | 55 | # Combine summary stats, add header and print to markdown 56 | frames = [nswre74_treat_stats, 57 | nswre74_control_stats, 58 | cps1re74_control_stats, 59 | cps3re74_control_stats, 60 | cps1re74_ptrim_stats, 61 | cps3re74_ptrim_stats] 62 | 63 | summary_stats = pd.concat(frames, axis = 1) 64 | header = ["NSW Treat", "NSW Control", \ 65 | "Full 
CPS-1", "Full CPS-3", \ 66 | "P-score CPS-1", "P-score CPS-3"] 67 | 68 | print(tabulate(summary_stats, header, tablefmt = "pipe")) 69 | 70 | # End of script 71 | -------------------------------------------------------------------------------- /04 Instrumental Variables in Action/Figure 4-6-1.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | capture log close _all 4 | capture version 13.1 // Note this script has only been tested in Stata 13.1 5 | 6 | /* Stata code for Figure 4.6.1 */ 7 | 8 | /* Log output*/ 9 | log using "Figure 4-6-1-Stata.txt", name(figure040601) text replace 10 | 11 | /* Set random seed for replication */ 12 | set seed 42 13 | 14 | /* Define program for use with -simulate- command */ 15 | capture program drop weakinstr 16 | program define weakinstr, rclass 17 | version 13.1 18 | 19 | /* Draw from random normal with correlation of 0.8 and variance of 1 */ 20 | matrix C = (1, 0.8 \ 0.8, 1) 21 | quietly drawnorm eta xi, n(1000) corr(C) clear 22 | 23 | /* Create a random instruments */ 24 | forvalues i = 1/20 { 25 | quietly gen z`i' = rnormal() 26 | } 27 | 28 | /* Endogenous x only based on z1 while z2-z20 irrelevant */ 29 | quietly gen x = 0.1*z1 + xi 30 | quietly gen y = x + eta 31 | 32 | /* OLS */ 33 | quietly: regress y x 34 | matrix OLS = e(b) 35 | 36 | /* 2SLS */ 37 | quietly: ivregress 2sls y (x = z*) 38 | matrix TSLS = e(b) 39 | 40 | /* LIML */ 41 | quietly: ivregress liml y (x = z*) 42 | matrix LIML = e(b) 43 | 44 | /* Return results from program */ 45 | return scalar ols = OLS[1, 1] 46 | return scalar tsls = TSLS[1, 1] 47 | return scalar liml = LIML[1, 1] 48 | 49 | end 50 | 51 | /* Run simulation */ 52 | simulate coefols = r(ols) coeftsls = r(tsls) coefliml = r(liml), reps(10000): weakinstr 53 | 54 | /* Create empirical CDFs */ 55 | cumul coefols, gen(cols) 56 | cumul coeftsls, gen(ctsls) 57 | cumul coefliml, gen(climl) 58 | stack cols coefols ctsls coeftsls climl coefliml, into(c coef) wide clear 59 | label var coef "beta" 60 | label var cols "OLS" 61 | label var ctsls "2SLS" 62 | label var climl "LIML" 63 | 64 | /* Graph results */ 65 | graph set window fontface "Palatino" 66 | line cols ctsls climl coef if inrange(coef, 0, 2.5), /// 67 | sort /// 68 | lpattern(solid dash longdash_dot) /// 69 | lwidth(medthick medthick medthick) /// 70 | lcolor("228 26 28" "55 126 184" "77 175 74") /// 71 | scheme(s1color) /// 72 | legend(rows(1) region(lwidth(none))) /// 73 | xline(1, lcolor("189 189 189") lpattern(shortdash) lwidth(medthick)) /// 74 | yline(0.5, lcolor("189 189 189") lpattern(shortdash) lwidth(medthick)) /// 75 | xtitle("estimated {&beta}") /// 76 | ytitle("F{subscript:n}") 77 | graph export "Figure 4-6-1-Stata.eps", replace 78 | 79 | log close figure040601 80 | /* End of script */ 81 | -------------------------------------------------------------------------------- /04 Instrumental Variables in Action/Figure 4-6-1.r: -------------------------------------------------------------------------------- 1 | # R code for Figure 4.6.1 # 2 | # Required packages # 3 | # - MASS: multivariate normal draws # 4 | # - ivmodel: IV regressions # 5 | # - parallel: Parallel process simulation # 6 | # - ggplot2: making pretty graphs # 7 | # - RColorBrewer: pleasing color schemes # 8 | # - reshape: manipulate data # 9 | library(MASS) 10 | library(ivmodel) 11 | library(parallel) 12 | library(ggplot2) 13 | library(RColorBrewer) 14 | library(reshape) 15 | 16 | nsims = 100000 17 | set.seed(1984, "L'Ecuyer") 18 | 19 | 
irrelevantInstrMC <- function(...) { 20 | # Store coefficients 21 | COEFS <- rep(NA, 3) 22 | names(COEFS) <- c("ols", "tsls", "liml") 23 | 24 | # Set parameters 25 | Sigma = matrix(c(1, 0.8, 0.8, 1), 2, 2) 26 | errors = mvrnorm(n = 1000, rep(0, 2), Sigma) 27 | eta = errors[ , 1] 28 | xi = errors[ , 2] 29 | 30 | # Create Z, x, y 31 | Z = sapply(1:20, function(x) rnorm(1000)) 32 | x = 0.1 * Z[ , 1] + xi 33 | y = x + eta 34 | 35 | # OLS 36 | OLS <- lm(y ~ x) 37 | COEFS["ols"] <- summary(OLS)$coefficients[2, 1] 38 | 39 | # Run IV regressions 40 | ivregressions <- ivmodel(Y = y, D = x, Z = Z) 41 | COEFS["tsls"] <- coef.ivmodel(ivregressions)["TSLS", "Estimate"] 42 | COEFS["liml"] <- coef.ivmodel(ivregressions)["LIML", "Estimate"] 43 | 44 | # Return results 45 | return(COEFS) 46 | } 47 | 48 | # Run simulations 49 | SIMBETAS <- data.frame(t(simplify2array(mclapply(1:nsims, irrelevantInstrMC)))) 50 | 51 | df <- melt(SIMBETAS[ , 1:3]) 52 | names(df) <- c("Estimator", "beta") 53 | df$Estimator <- factor(df$Estimator, 54 | levels = c("ols", "tsls", "liml"), 55 | labels = c("OLS", "2SLS", "LIML")) 56 | 57 | g <- ggplot(df, aes(x = beta, colour = Estimator, linetype = Estimator)) + 58 | stat_ecdf(geom = "step") + 59 | xlab(expression(widehat(beta))) + ylab(expression(F[n](widehat(beta)))) + 60 | xlim(0, 2.5) + 61 | scale_linetype_manual(values = c("solid", "longdash", "twodash")) + 62 | scale_color_manual(values = brewer.pal(3, "Set1"), 63 | labels = c("OLS", "2SLS", "LIML")) + 64 | geom_vline(xintercept = 1.0, linetype = "longdash") + 65 | geom_hline(yintercept = 0.5, linetype = "longdash") + 66 | theme_gray(base_size = 24) + 67 | theme(axis.title.y = element_text(angle = 0)) 68 | ggsave(g, file = "Figure 4-6-1-R.png", height = 8, width = 12, dpi = 300) # pass the plot explicitly so the save works in non-interactive sessions 69 | 70 | write.csv(df, "Figure 4-6-1.csv") 71 | # End of script 72 | -------------------------------------------------------------------------------- /04 Instrumental Variables in Action/Figure 4-1-1.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | /* Stata code for Figure 4-1-1 */ 4 | 5 | /* Download data */ 6 | shell curl -o asciiqob.zip http://economics.mit.edu/files/397 7 | unzipfile asciiqob.zip, replace 8 | 9 | /* Import data */ 10 | infile lwklywge educ yob qob pob using asciiqob.txt, clear 11 | 12 | /* Use Stata date formats */ 13 | gen yqob = yq(1900 + yob, qob) 14 | format yqob %tq 15 | 16 | /* Collapse by quarter of birth */ 17 | collapse (mean) educ (mean) lwklywge (mean) qob, by(yqob) 18 | 19 | /* Plot data */ 20 | graph twoway (line educ yqob, lcolor(black)) /// 21 | (scatter educ yqob if qob == 1, /// 22 | mlabel(qob) msize(small) msymbol(S) mcolor(black)) /// 23 | (scatter educ yqob if qob != 1, /// 24 | mlabel(qob) msize(small) msymbol(Sh) mcolor(black)), /// 25 | xlabel(, format(%tqY)) /// 26 | title("A. Average education by quarter of birth (first stage)") /// 27 | ytitle("Years of education") /// 28 | xtitle("Year of birth") /// 29 | legend(off) /// 30 | name(educ) /// 31 | scheme(s1mono) 32 | 33 | graph twoway (line lwklywge yqob, lcolor(black)) /// 34 | (scatter lwklywge yqob if qob == 1, /// 35 | mlabel(qob) msize(small) msymbol(S) mcolor(black)) /// 36 | (scatter lwklywge yqob if qob != 1, /// 37 | mlabel(qob) msize(small) msymbol(Sh) mcolor(black)), /// 38 | xlabel(, format(%tqY)) /// 39 | title("B. 
Average weekly wage by quarter of birth (reduced form)") /// 40 | ytitle("Log weekly earnings") /// 41 | xtitle("Year of birth") /// 42 | legend(off) /// 43 | name(lwklywge) /// 44 | scheme(s1mono) 45 | 46 | /* Compare graphs */ 47 | graph combine educ lwklywge, /// 48 | col(1) /// 49 | xsize(4) ysize(6) /// 50 | graphregion(margin(zero)) /// 51 | scheme(s1mono) 52 | 53 | graph export "Figure 4-1-1-Stata.pdf", replace 54 | 55 | /* End of file */ 56 | -------------------------------------------------------------------------------- /06 Getting a Little Jumpy/Figure 6-2-1.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | 4 | /* Download data */ 5 | shell curl -o final4.dta http://economics.mit.edu/files/1359 6 | shell curl -o final5.dta http://economics.mit.edu/files/1358 7 | 8 | /*---------*/ 9 | /* Grade 4 */ 10 | /*---------*/ 11 | /* Import data */ 12 | use "final4.dta", clear 13 | 14 | /* Restrict sample */ 15 | keep if 1 < classize & classize < 45 & c_size > 5 16 | 17 | /* Find means class size by grade size */ 18 | collapse classize, by(c_size) 19 | 20 | /* Plot the actual and predicted class size based on grade size */ 21 | graph twoway (line classize c_size, lcolor(black)) /// 22 | (function y = x / (floor((x - 1)/40) + 1), /// 23 | range(1 220) lpattern(dash) lcolor(black)), /// 24 | xlabel(20(20)220) /// 25 | title("B. Fourth grade") /// 26 | ytitle("Class size") /// 27 | xtitle("Enrollment count") /// 28 | legend(label(1 "Actual class size") label(2 "Maimonides Rule")) /// 29 | scheme(s1mono) /// 30 | saving(fourthgrade.gph, replace) 31 | 32 | /*---------*/ 33 | /* Grade 5 */ 34 | /*---------*/ 35 | /* Import data */ 36 | use "final5.dta", clear 37 | 38 | /* Restrict sample */ 39 | keep if 1 < classize & classize < 45 & c_size > 5 40 | 41 | /* Find means class size by grade size */ 42 | collapse classize, by(c_size) 43 | 44 | /* Plot the actual and predicted class size based on grade size */ 45 | graph twoway (line classize c_size, lcolor(black)) /// 46 | (function y = x / (floor((x - 1)/40) + 1), /// 47 | range(1 220) lpattern(dash) lcolor(black)), /// 48 | xlabel(20(20)220) /// 49 | title("A. 
Fifth grade") /// 50 | ytitle("Class size") /// 51 | xtitle("Enrollment count") /// 52 | legend(label(1 "Actual class size") label(2 "Maimonides Rule")) /// 53 | scheme(s1mono) /// 54 | saving(fifthgrade.gph, replace) 55 | 56 | /* Combine graphs */ 57 | graph combine fifthgrade.gph fourthgrade.gph, /// 58 | col(1) /// 59 | xsize(4) ysize(6) /// 60 | graphregion(margin(zero)) /// 61 | scheme(s1mono) 62 | graph export "Figure 6-2-1-Stata.png", replace 63 | 64 | /* End of file */ 65 | exit 66 | -------------------------------------------------------------------------------- /03 Making Regression Make Sense/Table 3-3-2.jl: -------------------------------------------------------------------------------- 1 | # Load packages 2 | using DataFrames 3 | using FileIO, StatFiles 4 | using Statistics 5 | using GLM 6 | 7 | # Download the data from the MHE Data Archive 8 | base_url = "https://economics.mit.edu/sites/default/files/inline-files" 9 | download("$(base_url)/nswre74.dta", "nswre74.dta") 10 | download("$(base_url)/cps1re74.dta", "cps1re74.dta") 11 | download("$(base_url)/cps3re74.dta", "cps3re74.dta") 12 | 13 | # Read the Stata files into Julia 14 | nswre74 = DataFrame(load("nswre74.dta")) 15 | cps1re74 = DataFrame(load("cps1re74.dta")) 16 | cps3re74 = DataFrame(load("cps3re74.dta")) 17 | 18 | summary_vars = [:age, :ed, :black, :hisp, :nodeg, :married, :re74, :re75] 19 | nswre74_stat = combine(nswre74, summary_vars .=> mean) 20 | 21 | # Calculate propensity scores 22 | probit = glm(@formula(treat ~ age + age2 + ed + black + hisp + 23 | nodeg + married + re74 + re75), 24 | cps1re74, 25 | Binomial(), 26 | ProbitLink()) 27 | cps1re74.pscore = predict(probit) 28 | 29 | probit = glm(@formula(treat ~ age + age2 + ed + black + hisp + 30 | nodeg + married + re74 + re75), 31 | cps3re74, 32 | Binomial(), 33 | ProbitLink()) 34 | cps3re74.pscore = predict(probit) 35 | 36 | # Create function to summarize data 37 | function summarize(data, condition) 38 | stats = combine(data[condition, :], summary_vars .=> mean) 39 | stats.count = [size(data[condition, summary_vars])[1]] 40 | return(stats) 41 | end 42 | 43 | # Summarize data 44 | nswre74_treat_stats = summarize(nswre74, nswre74.treat .== 1) 45 | nswre74_control_stats = summarize(nswre74, nswre74.treat .== 0) 46 | cps1re74_control_stats = summarize(cps1re74, cps1re74.treat .== 0) 47 | cps3re74_control_stats = summarize(cps3re74, cps3re74.treat .== 0) 48 | cps1re74_ptrim_stats = summarize(cps1re74, broadcast(&, cps1re74.treat .== 0, 49 | cps1re74.pscore .> 0.1, 50 | cps1re74.pscore .< 0.9)) 51 | cps3re74_ptrim_stats = summarize(cps3re74, broadcast(&, cps3re74.treat .== 0, 52 | cps3re74.pscore .> 0.1, 53 | cps3re74.pscore .< 0.9)) 54 | 55 | # Combine summary stats, add header and print to markdown 56 | table = vcat(nswre74_treat_stats, 57 | nswre74_control_stats, 58 | cps1re74_control_stats, 59 | cps3re74_control_stats, 60 | cps1re74_ptrim_stats, 61 | cps3re74_ptrim_stats) 62 | table.id = 1:size(table, 1) 63 | table = stack(table, [:age_mean, :ed_mean, :black_mean, :hisp_mean, 64 | :nodeg_mean, :married_mean, :re74_mean, :re75_mean, :count]) 65 | table = unstack(table, :variable, :id, :value) 66 | 67 | rename!(table, [:Variable, :NSWTreat, :NSWControl, 68 | :FullCPS1, :FullCPS3, :PscoreCPS1, :PscoreCPS3]) 69 | 70 | println(table) 71 | 72 | # End of script 73 | -------------------------------------------------------------------------------- /06 Getting a Little Jumpy/Figure 6-1-1.jl: 
-------------------------------------------------------------------------------- 1 | # Load packages 2 | using Random 3 | using DataFrames 4 | using Gadfly 5 | using Cairo 6 | using Fontconfig 7 | using Distributions 8 | using CurveFit 9 | using Colors 10 | 11 | # Set seed 12 | Random.seed!(08421); 13 | 14 | # Set number of simulations 15 | nsims = 100 16 | 17 | # Set distributions for random draws 18 | uniform = Uniform(0, 1) 19 | normal = Normal(0, 0.1) 20 | 21 | # Generate series 22 | x = rand(uniform, nsims) 23 | y_linear = x .+ (x .> 0.5) .* 0.25 .+ rand(normal, nsims) 24 | y_nonlin = 0.5 .* sin.(6 .* (x .- 0.5)) .+ 0.5 .+ (x .> 0.5) .* 0.25 .+ rand(normal, nsims) 25 | y_mistake = 1 ./ (1 .+ exp.(-25 .* (x .- 0.5))) .+ rand(normal, nsims) 26 | 27 | # Fit lines using user-created function 28 | function rdfit(xvar, yvar, cutoff, degree) 29 | coef_0 = curve_fit(Poly, xvar[cutoff .>= xvar], yvar[cutoff .>= xvar], degree) 30 | fit_0 = coef_0.(xvar[cutoff .>= xvar]) 31 | 32 | coef_1 = curve_fit(Poly, xvar[xvar .> cutoff], yvar[xvar .> cutoff], degree) 33 | fit_1 = coef_1.(xvar[xvar .> cutoff]) 34 | 35 | # (each side of the cutoff is fitted separately and returned as its own frame) 36 | 37 | df_0 = DataFrame(x_0 = xvar[cutoff .>= xvar], fit_0 = fit_0) 38 | df_1 = DataFrame(x_1 = xvar[xvar .> cutoff], fit_1 = fit_1) 39 | 40 | return df_0, df_1 41 | end 42 | 43 | data_linear_0, data_linear_1 = rdfit(x, y_linear, 0.5, 1) 44 | data_nonlin_0, data_nonlin_1 = rdfit(x, y_nonlin, 0.5, 2) 45 | data_mistake_0, data_mistake_1 = rdfit(x, y_mistake, 0.5, 1) 46 | 47 | p_linear = plot(layer(x = x, y = y_linear, Geom.point), 48 | layer(x = data_linear_0.x_0, y = data_linear_0.fit_0, Geom.line), 49 | layer(x = data_linear_1.x_1, y = data_linear_1.fit_1, Geom.line), 50 | layer(xintercept = [0.5], Geom.vline), 51 | Guide.xlabel("x"), 52 | Guide.ylabel("Outcome"), 53 | Guide.title("A. Linear E[Y0i | Xi]")) 54 | 55 | p_nonlin = plot(layer(x = x, y = y_nonlin, Geom.point), 56 | layer(x = data_nonlin_0.x_0, y = data_nonlin_0.fit_0, Geom.line), 57 | layer(x = data_nonlin_1.x_1, y = data_nonlin_1.fit_1, Geom.line), 58 | layer(xintercept = [0.5], Geom.vline), 59 | Guide.xlabel("x"), 60 | Guide.ylabel("Outcome"), 61 | Guide.title("B. Nonlinear E[Y0i | Xi]")) 62 | 63 | function rd_mistake(x) 64 | 1 / (1 + exp(-25 * (x - 0.5))) 65 | end 66 | 67 | p_mistake = plot(layer(x = x, y = y_mistake, Geom.point), 68 | layer(x = data_mistake_0.x_0, y = data_mistake_0.fit_0, Geom.line), 69 | layer(x = data_mistake_1.x_1, y = data_mistake_1.fit_1, Geom.line), 70 | layer(rd_mistake, 0, 1), 71 | layer(xintercept = [0.5], Geom.vline), 72 | Guide.xlabel("x"), 73 | Guide.ylabel("Outcome"), 74 | Guide.title("C. 
Nonlinearity mistaken for discontinuity")) 75 | 76 | draw(PNG("Figure 6-1-1-Julia.png", 6inch, 8inch), vstack(p_linear, p_nonlin, p_mistake)) 77 | 78 | # End of script 79 | -------------------------------------------------------------------------------- /03 Making Regression Make Sense/Table 3-3-3.r: -------------------------------------------------------------------------------- 1 | # R code for Table 3-3-3 # 2 | # Required packages # 3 | library(haven) # haven: read .dta files 4 | # Download the files 5 | download.file("http://economics.mit.edu/files/3828", "nswre74.dta") 6 | download.file("http://economics.mit.edu/files/3824", "cps1re74.dta") 7 | download.file("http://economics.mit.edu/files/3825", "cps3re74.dta") 8 | 9 | # Read the Stata files into R 10 | nswre74 <- read_dta("nswre74.dta") 11 | cps1re74 <- read_dta("cps1re74.dta") 12 | cps3re74 <- read_dta("cps3re74.dta") 13 | 14 | # Function to create propensity trimmed data 15 | propensity.trim <- function(dataset) { 16 | # Specify control formulas 17 | controls <- c("age", "age2", "ed", "black", "hisp", "nodeg", "married", "re74", "re75") 18 | # Paste together probit specification 19 | spec <- paste("treat", paste(controls, collapse = " + "), sep = " ~ ") 20 | # Run probit 21 | probit <- glm(as.formula(spec), family = binomial(link = "probit"), data = dataset) 22 | # Predict probability of treatment 23 | pscore <- predict(probit, type = "response") 24 | # Return data set within range 25 | dataset[which(pscore > 0.1 & pscore < 0.9), ] 26 | } 27 | 28 | # Propensity trim data 29 | cps1re74.ptrim <- propensity.trim(cps1re74) 30 | cps3re74.ptrim <- propensity.trim(cps3re74) 31 | 32 | estimateTrainingFX <- function(dataset) { 33 | # Raw difference 34 | spec_raw <- as.formula("re78 ~ treat") 35 | coef_raw <- lm(spec_raw, data = dataset)$coefficients["treat"] 36 | 37 | # Demographics 38 | demos <- c("age", "age2", "ed", "black", "hisp", "nodeg", "married") 39 | spec_demo <- paste("re78", 40 | paste(c("treat", demos), 41 | collapse = " + "), 42 | sep = " ~ ") 43 | coef_demo <- lm(spec_demo, data = dataset)$coefficients["treat"] 44 | 45 | # 1975 Earnings 46 | spec_re75 <- paste("re78 ~ treat + re75") 47 | coef_re75 <- lm(spec_re75, data = dataset)$coefficients["treat"] 48 | 49 | # Demographics, 1975 Earnings 50 | spec_demo_re75 <- paste("re78", 51 | paste(c("treat", demos, "re75"), 52 | collapse = " + "), 53 | sep = " ~ ") 54 | coef_demo_re75 <- lm(spec_demo_re75, data = dataset)$coefficients["treat"] 55 | 56 | # Demographics, 1974 and 1975 Earnings 57 | spec_demo_re74_re75 <- paste("re78", 58 | paste(c("treat", demos, "re74", "re75"), 59 | collapse = " + "), 60 | sep = " ~ ") 61 | coef_demo_re74_re75 <- lm(spec_demo_re74_re75, data = dataset)$coefficients["treat"] 62 | 63 | c(raw = coef_raw, 64 | demo = coef_demo, 65 | re75 = coef_re75, 66 | demo_re75 = coef_demo_re75, 67 | demo_re74_re75 = coef_demo_re74_re75) 68 | 69 | } 70 | 71 | nswre74.ols <- estimateTrainingFX(nswre74) 72 | cps1re74.ols <- estimateTrainingFX(cps1re74) 73 | cps3re74.ols <- estimateTrainingFX(cps3re74) 74 | # The propensity-trimmed samples built above are used here as well 75 | cps1re74.ptrim.ols <- estimateTrainingFX(cps1re74.ptrim) 76 | cps3re74.ptrim.ols <- estimateTrainingFX(cps3re74.ptrim) 77 | 78 | # End of script 79 | -------------------------------------------------------------------------------- /05 Fixed Effects, DD and Panel Data/Table 5-2-1.r: -------------------------------------------------------------------------------- 1 | # R code for Table 5-2-1 # 2 | # Required packages # 3 | # - dplyr: easy data manipulation; reshape2: melt() # 4 | library(dplyr); library(reshape2) 5 | 6 | # Download data 7 | download.file("http://economics.mit.edu/files/3845", "njmin.zip") 8 | 
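# The archive holds the Card and Krueger (1994) New Jersey/Pennsylvania
# fast-food survey; the comparison below contrasts full-time-equivalent
# employment in the two states before and after New Jersey's 1992 minimum
# wage increase.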
unzip("njmin.zip") 9 | 10 | # Import data 11 | njmin <- read.table('public.dat', 12 | header = FALSE, 13 | stringsAsFactors = FALSE, 14 | na.strings = c("", ".", "NA")) 15 | names(njmin) <- c('SHEET', 'CHAIN', 'CO_OWNED', 'STATE', 'SOUTHJ', 'CENTRALJ', 16 | 'NORTHJ', 'PA1', 'PA2', 'SHORE', 'NCALLS', 'EMPFT', 'EMPPT', 17 | 'NMGRS', 'WAGE_ST', 'INCTIME', 'FIRSTINC', 'BONUS', 'PCTAFF', 18 | 'MEALS', 'OPEN', 'HRSOPEN', 'PSODA', 'PFRY', 'PENTREE', 'NREGS', 19 | 'NREGS11', 'TYPE2', 'STATUS2', 'DATE2', 'NCALLS2', 'EMPFT2', 20 | 'EMPPT2', 'NMGRS2', 'WAGE_ST2', 'INCTIME2', 'FIRSTIN2', 'SPECIAL2', 21 | 'MEALS2', 'OPEN2R', 'HRSOPEN2', 'PSODA2', 'PFRY2', 'PENTREE2', 22 | 'NREGS2', 'NREGS112') 23 | 24 | # Calculate FTE employement 25 | njmin$FTE <- njmin$EMPFT + 0.5 * njmin$EMPPT + njmin$NMGRS 26 | njmin$FTE2 <- njmin$EMPFT2 + 0.5 * njmin$EMPPT2 + njmin$NMGRS2 27 | 28 | # Create function for calculating standard errors of mean 29 | semean <- function(x, na.rm = FALSE) { 30 | n <- ifelse(na.rm, sum(!is.na(x)), length(x)) 31 | sqrt(var(x, na.rm = na.rm) / n) 32 | } 33 | 34 | # Calucate means 35 | summary.means <- njmin[ , c("FTE", "FTE2", "STATE")] %>% 36 | group_by(STATE) %>% 37 | summarise_each(funs(mean(., na.rm = TRUE))) 38 | summary.means <- as.data.frame(t(summary.means[ , -1])) 39 | 40 | colnames(summary.means) <- c("PA", "NJ") 41 | summary.means$dSTATE <- summary.means$NJ - summary.means$PA 42 | summary.means <- rbind(summary.means, 43 | summary.means[2, ] - summary.means[1, ]) 44 | row.names(summary.means) <- c("FTE employment before, all available observations", 45 | "FTE employment after, all available observations", 46 | "Change in mean FTE employment") 47 | 48 | # Calucate 49 | summary.semeans <- njmin[ , c("FTE", "FTE2", "STATE")] %>% 50 | group_by(STATE) %>% 51 | summarise_each(funs(semean(., na.rm = TRUE))) 52 | summary.semeans <- as.data.frame(t(summary.semeans[ , -1])) 53 | 54 | colnames(summary.semeans) <- c("PA", "NJ") 55 | summary.semeans$dSTATE <- sqrt(summary.semeans$NJ + summary.semeans$PA) / length 56 | 57 | njmin <- njmin[ , c("FTE", "FTE2", "STATE")] 58 | njmin <- melt(njmin, 59 | id.vars = c("STATE"), 60 | variable.name = "Period", 61 | value.name = "FTE") 62 | summary.means <- njmin %>% 63 | group_by(STATE, Period) %>% 64 | summarise_each(funs(mean(., na.rm = TRUE), semean(., na.rm = TRUE))) 65 | 66 | # End of script 67 | -------------------------------------------------------------------------------- /04 Instrumental Variables in Action/Table 4-6-2.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | eststo clear 4 | capture log close _all 5 | capture version 13 6 | 7 | /* Stata code for Table 4.6.2 */ 8 | /* Required additional packages */ 9 | /* ivreg2: running IV regressions */ 10 | /* estout: for exporting tables */ 11 | 12 | log using "Table 4-6-2-Stata.txt", name(table040602) text replace 13 | 14 | /* Download data */ 15 | shell curl -o asciiqob.zip http://economics.mit.edu/files/397 16 | unzipfile asciiqob.zip, replace 17 | 18 | /* Import data */ 19 | infile lwklywge educ yob qob pob using asciiqob.txt, clear 20 | 21 | /*Creat variables */ 22 | gen num_qob = yq(1900 + yob, qob) // Quarter of birth 23 | gen survey_qtr = yq(1980, 3) // Survey quarter 24 | gen age = survey_qtr - num_qob // Age in quarter 25 | gen agesq = age^2 // Age^2 26 | xi i.yob i.pob i.qob*i.yob i.qob*i.pob // Create all the dummies 27 | 28 | /* Create locals for controls */ 29 | local col1_controls "_Iyob_31 - _Iyob_39" 30 | local 
col1_excl_instr "_Iqob_2 - _Iqob_4" 31 | 32 | local col2_controls "_Iyob_31 - _Iyob_39 age agesq" 33 | local col2_excl_instr "_Iqob_2 - _Iqob_3" // colinear age qob: drop _Iqob_4 34 | 35 | local col3_controls "_Iyob_31 - _Iyob_39" 36 | local col3_excl_instr "_Iqob_2 - _Iqob_4 _IqobXyob_2_31 - _IqobXyob_4_39" 37 | 38 | local col4_controls "_Iyob_31 - _Iyob_39 age agesq" 39 | local col4_excl_instr "_Iqob_2 - _Iqob_3 _IqobXyob_2_31 - _IqobXyob_4_38" // colinear age qob: drop _Iqob_4, _IqobXyob_4_39 40 | 41 | local col5_controls "_Iyob_31 - _Iyob_39 _Ipob_2 - _Ipob_56" 42 | local col5_excl_instr "_Iqob_2 - _Iqob_4 _IqobXyob_2_31 - _IqobXyob_4_39 _IqobXpob_2_2 - _IqobXpob_4_56" 43 | 44 | local col6_controls "_Iyob_31 - _Iyob_39 _Ipob_2 - _Ipob_56 age agesq" 45 | local col6_excl_instr "_Iqob_2 - _Iqob_3 _IqobXyob_2_31 - _IqobXyob_4_38 _IqobXpob_2_2 - _IqobXpob_4_56" // colinear age qob: drop _Iqob_4, _IqobXyob_4_39 46 | 47 | foreach model in "2sls" "liml" { 48 | if "`model'" == "2sls" { 49 | local ivreg2_mod "" 50 | } 51 | else { 52 | local ivreg2_mod "`model'" 53 | } 54 | foreach col in "col1" "col2" "col3" "col4" "col5" "col6" { 55 | display "Time for `col', `model'" 56 | display "Running ivreg2 lwklywge ``col'_controls' (educ = ``col'_excl_instr'), `ivreg2_mod'" 57 | eststo `col'_`model': ivreg2 lwklywge ``col'_controls' (educ = ``col'_excl_instr'), `ivreg2_mod' 58 | local num_instr = wordcount("`e(exexog)'") 59 | estadd local num_instr `num_instr' 60 | local fstat = round(`e(widstat)', 0.01) 61 | estadd local fstat `fstat' 62 | } 63 | } 64 | 65 | /* OLS for comparison */ 66 | eststo col1_ols: regress lwklywge educ i.yob 67 | eststo col2_ols: regress lwklywge educ i.yob age agesq 68 | eststo col5_ols: regress lwklywge educ i.yob i.pob 69 | eststo col6_ols: regress lwklywge educ i.yob i.pob age agesq 70 | 71 | /* Export results */ 72 | esttab, keep(educ) /// 73 | b(3) se(3) /// 74 | nostar se noobs mtitles /// 75 | scalars(fstat num_instr) /// 76 | plain replace 77 | eststo clear 78 | 79 | log close table040602 80 | /* End of file */ 81 | -------------------------------------------------------------------------------- /07 Quantile Regression/Table 7-1-1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Tested on Python 3.4 4 | """ 5 | import urllib 6 | import zipfile 7 | import urllib.request 8 | import pandas as pd 9 | import numpy as np 10 | import statsmodels.api as sm 11 | import statsmodels.formula.api as smf 12 | import matplotlib.pyplot as plt 13 | from statsmodels.regression.quantile_regression import QuantReg 14 | from collections import defaultdict 15 | from tabulate import tabulate 16 | 17 | # Download data and unzip file 18 | urllib.request.urlretrieve('http://economics.mit.edu/files/384', 'angcherfer06.zip') 19 | with zipfile.ZipFile('angcherfer06.zip', 'r') as z: 20 | z.extractall() 21 | 22 | # Function to run the quantile regressions 23 | def quant_mincer(q, data): 24 | r = smf.quantreg('logwk ~ educ + black + exper + exper2 + wt - 1', data) 25 | result = r.fit(q = q) 26 | coef = result.params['educ'] 27 | se = result.bse['educ'] 28 | return [coef, se] 29 | 30 | # Create dictionary to store the results 31 | results = defaultdict(list) 32 | 33 | # Loop over years and quantiles 34 | years = ['80', '90', '00'] 35 | taus = [0.1, 0.25, 0.5, 0.75, 0.9] 36 | 37 | for year in years: 38 | # Load data 39 | dta_path = 'Data/census%s.dta' % year 40 | df = pd.read_stata(dta_path) 41 | # Weight the data by perwt 42 | 
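# Scaling y and the regressors by sqrt(perwt) and fitting through the
# origin is the standard WLS transformation: it minimizes
# sum_i perwt_i * (y_i - x_i'b)^2, with the 'wt' column acting as the
# transformed intercept (hence the '- 1' in the formulas below). Because
# the quantile-regression check function is homogeneous of degree one, the
# same transformation effectively weights the quantile fits by sqrt(perwt)
# rather than perwt, so there it is only an approximation to weighted
# quantile regression.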
df['wt'] = np.sqrt(df['perwt']) 43 | wdf = df[['logwk', 'educ', 'black', 'exper', 'exper2']]. \ 44 | multiply(df['wt'], axis = 'index') 45 | wdf['wt'] = df['wt'] 46 | # Summary statistics 47 | results['Obs'] += [df['logwk'].count(), None] 48 | results['Mean'] += [np.mean(df['logwk']), None] 49 | results['Std'] += [np.std(df['logwk']), None] 50 | # Quantile regressions 51 | for tau in taus: 52 | results[tau] += quant_mincer(tau, wdf) 53 | # Run OLS with weights to get OLS parameters and MSE 54 | wls_model = smf.ols('logwk ~ educ + black + exper + exper2 + wt - 1', wdf) 55 | wls_result = wls_model.fit() 56 | results['OLS'] += [wls_result.params['educ'], wls_result.bse['educ']] 57 | results['RMSE'] += [np.sqrt(wls_result.mse_resid), None] 58 | 59 | # Export table (round the results and place them in a DataFrame to tabulate) 60 | def format_results(the_list, the_format): 61 | return([the_format.format(x) if x else x for x in the_list]) 62 | 63 | table = pd.DataFrame(columns = ['Year', 'Obs', 'Mean', 'Std', 64 | '0.1', '0.25', '0.5', '0.75', '0.9', 65 | 'OLS', 'RMSE']) 66 | 67 | table['Year'] = ['1980', None, '1990', None, '2000', None] 68 | table['Obs'] = format_results(results['Obs'], '{:,}') 69 | table['Mean'] = format_results(results['Mean'], '{:.2f}') 70 | table['Std'] = format_results(results['Std'], '{:.3f}') 71 | table['0.1'] = format_results(results[0.1], '{:.3f}') 72 | table['0.25'] = format_results(results[0.25], '{:.3f}') 73 | table['0.5'] = format_results(results[0.5], '{:.3f}') 74 | table['0.75'] = format_results(results[0.75], '{:.3f}') 75 | table['0.9'] = format_results(results[0.9], '{:.3f}') 76 | table['OLS'] = format_results(results['OLS'], '{:.3f}') 77 | table['RMSE'] = format_results(results['RMSE'], '{:.2f}') 78 | 79 | print(tabulate(table, headers = 'keys')) 80 | 81 | # End of script 82 | -------------------------------------------------------------------------------- /08 Nonstandard Standard Error Issues/Table 8-1-1.jl: -------------------------------------------------------------------------------- 1 | # Julia code for Table 8-1-1 # 2 | # Required packages # 3 | # - DataFrames: data manipulation / storage # 4 | # - Distributions: extended stats functions # 5 | # - GLM: regression # 6 | using DataFrames 7 | using Distributions 8 | using GLM 9 | using Random, LinearAlgebra, Statistics # seeding, diag/Diagonal, mean/std 10 | # Set seed 11 | Random.seed!(08421) 12 | 13 | nsims = 25000 14 | 15 | function generateHC(sigma) 16 | # Set parameters of the simulation 17 | n = 30 18 | r = 0.9 19 | n_1 = round(Int, r * n) 20 | 21 | # Generate simulation data: d = 1 only for the last n - n_1 observations 22 | d = ones(n) 23 | d[1:n_1] .= 0 24 | 25 | r0 = Normal(0, sigma) 26 | r1 = Normal(0, 1) 27 | epsilon = [rand(r0, n_1); rand(r1, n - n_1)] 28 | 29 | y = 0 * d + epsilon 30 | 31 | simulated = DataFrame(y = y, d = d, epsilon = epsilon) 32 | 33 | # Run regression, grab coef., conventional std error, and residuals 34 | regression = lm(@formula(y ~ d), simulated) 35 | b1 = coef(regression)[2] 36 | conv = stderror(regression)[2] 37 | ehat = simulated.y - predict(regression) 38 | 39 | # Calculate robust standard errors 40 | X = [ones(n) simulated.d] 41 | vcovHC0 = inv(transpose(X) * X) * (transpose(X) * Diagonal(ehat.^2) * X) * inv(transpose(X) * X) 42 | hc0 = sqrt(vcovHC0[2, 2]) 43 | vcovHC1 = (n / (n - 2)) * vcovHC0 44 | hc1 = sqrt(vcovHC1[2, 2]) 45 | h = diag(X * inv(transpose(X) * X) * transpose(X)) 46 | meat2 = Diagonal(ehat.^2 ./ (1 .- h)) 47 | vcovHC2 = inv(transpose(X) * X) * (transpose(X) * meat2 * X) * inv(transpose(X) * X) 48 | hc2 = sqrt(vcovHC2[2, 2]) 49 | meat3 = Diagonal(ehat.^2 ./ (1 .- h).^2) 50 | 
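# For reference, the "meat" matrices used in this block are
#   HC0: diag(e_i^2)              HC1: HC0 scaled by n/(n - k)
#   HC2: diag(e_i^2 / (1 - h_i))  HC3: diag(e_i^2 / (1 - h_i)^2)
# where h_i is the leverage of observation i (MHE chapter 8).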
vcovHC3 = inv(transpose(X) * X) * (transpose(X) * meat3 * X) * inv(transpose(X) * X) 51 | hc3 = sqrt(vcovHC3[2, 2]) 52 | 53 | return [b1, conv, hc0, hc1, hc2, hc3, max(conv, hc0), max(conv, hc1), max(conv, hc2), max(conv, hc3)] 54 | end 55 | 56 | # Function to run simulation 57 | function simulateHC(nsims, sigma) 58 | # Run simulation 59 | simulation_results = zeros(nsims, 10) 60 | 61 | for i = 1:nsims 62 | simulation_results[i, :] = generateHC(sigma) 63 | end 64 | 65 | # Calculate mean and standard deviation 66 | mean_est = mean(simulation_results, dims = 1) 67 | std_est = std(simulation_results, dims = 1) 68 | 69 | # Calculate rejection rates of H0: beta_1 = 0 (the truth) at the 5% level 70 | test_stats = simulation_results[:, 1] ./ simulation_results[:, 2:10] 71 | reject_z = mean(2 .* cdf.(Normal(0, 1), -abs.(test_stats)) .<= 0.05, dims = 1) # two-sided p-value uses the cdf, not the pdf 72 | reject_t = mean(2 .* cdf.(TDist(30 - 2), -abs.(test_stats)) .<= 0.05, dims = 1) 73 | 74 | # Combine columns 75 | value_labs = ["Beta_1" "conv" "HC0" "HC1" "HC2" "HC3" "max(conv, HC0)" "max(conv, HC1)" "max(conv, HC2)" "max(conv, HC3)"] 76 | summ_stats = [mean_est; std_est] 77 | reject_stats = [0 reject_z; 0 reject_t] 78 | 79 | all_stats = DataFrame(permutedims([value_labs; summ_stats; reject_stats]), :auto) # permutedims, not transpose, for a matrix holding strings 80 | rename!(all_stats, [:estimate, :mean, :std, :reject_z, :reject_t]) 81 | all_stats[1, 4:5] .= missing 82 | 83 | return(all_stats) 84 | end 85 | 86 | println("Panel A") 87 | println(simulateHC(nsims, 0.5)) 88 | 89 | println("Panel B") 90 | println(simulateHC(nsims, 0.85)) 91 | 92 | println("Panel C") 93 | println(simulateHC(nsims, 1)) 94 | 95 | # End of script 96 | 97 | -------------------------------------------------------------------------------- /06 Getting a Little Jumpy/Figure 6-1-1.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | 4 | /* Set random seed for replication */ 5 | set seed 1149 6 | 7 | /* Number of random variables */ 8 | local nobs = 100 9 | 10 | set obs `nobs' 11 | 12 | gen x = runiform() 13 | gen y_linear = x + (x > 0.5) * 0.25 + rnormal(0, 0.1) 14 | gen y_nonlin = 0.5 * sin(6 * (x - 0.5)) + 0.5 + (x > 0.5) * 0.25 + rnormal(0, 0.1) 15 | gen y_mistake = 1 / (1 + exp(-25 * (x - 0.5))) + rnormal(0, 0.1) 16 | 17 | graph twoway (lfit y_linear x if x < 0.5, lcolor(black)) /// 18 | (lfit y_linear x if x > 0.5, lcolor(black)) /// 19 | (scatter y_linear x, msize(vsmall) msymbol(circle) mcolor(black)), /// 20 | title("A. Linear E[Y{sub:0i}|X{sub:i}]") /// 21 | ytitle("Outcome") /// 22 | xtitle("x") /// 23 | xline(0.5, lpattern(dash)) /// 24 | scheme(s1mono) /// 25 | legend(off) /// 26 | saving(y_linear, replace) 27 | 28 | graph twoway (qfit y_nonlin x if x < 0.5, lcolor(black)) /// 29 | (qfit y_nonlin x if x > 0.5, lcolor(black)) /// 30 | (scatter y_nonlin x, msize(vsmall) msymbol(circle) mcolor(black)), /// 31 | title("B. Nonlinear E[Y{sub:0i}|X{sub:i}]") /// 32 | ytitle("Outcome") /// 33 | xtitle("x") /// 34 | xline(0.5, lpattern(dash)) /// 35 | scheme(s1mono) /// 36 | legend(off) /// 37 | saving(y_nonlin, replace) 38 | 39 | graph twoway (lfit y_mistake x if x < 0.5, lcolor(black)) /// 40 | (lfit y_mistake x if x > 0.5, lcolor(black)) /// 41 | (function y = 1 / (1 + exp(-25 * (x - 0.5))), lpattern(dash)) /// 42 | (scatter y_mistake x, msize(vsmall) msymbol(circle) mcolor(black)), /// 43 | title("C. 
Nonlinearity mistaken for discontinuity") /// 44 | ytitle("Outcome") /// 45 | xtitle("x") /// 46 | xline(0.5, lpattern(dash)) /// 47 | scheme(s1mono) /// 48 | legend(off) /// 49 | saving(y_mistake, replace) 50 | 51 | graph combine y_linear.gph y_nonlin.gph y_mistake.gph, /// 52 | col(1) /// 53 | xsize(4) ysize(6) /// 54 | graphregion(margin(zero)) /// 55 | scheme(s1mono) 56 | graph export "Figure 6-1-1-Stata.png", replace 57 | 58 | /* End of file */ 59 | exit 60 | -------------------------------------------------------------------------------- /05 Fixed Effects, DD and Panel Data/Figure 5-2-4.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | eststo clear 4 | capture version 14 5 | 6 | /* Stata code for Figure 5.2.4 */ 7 | 8 | /* Download the data and unzip it */ 9 | shell curl -o outsourcingatwill_table7.zip "http://economics.mit.edu/~dautor/outsourcingatwill_table7.zip" 10 | unzipfile outsourcingatwill_table7.zip 11 | 12 | /*-------------*/ 13 | /* Import data */ 14 | /*-------------*/ 15 | use "table7/autor-jole-2003.dta", clear 16 | 17 | /* Log total employment: from BLS employment & earnings */ 18 | gen lnemp = log(annemp) 19 | 20 | /* Non-business-service sector employment from CBP */ 21 | gen nonemp = stateemp - svcemp 22 | gen lnnon = log(nonemp) 23 | gen svcfrac = svcemp / nonemp 24 | 25 | /* Total business services employment from CBP */ 26 | gen bizemp = svcemp + peremp 27 | gen lnbiz = log(bizemp) 28 | 29 | /* Time trends */ 30 | gen t = year - 78 // Linear time trend 31 | gen t2 = t^2 // Quadratic time trend 32 | 33 | /* Restrict sample */ 34 | keep if inrange(year, 79, 95) & state != 98 35 | 36 | /* Generate more aggregate demographics */ 37 | gen clp = clg + gtc 38 | gen a1624 = m1619 + m2024 + f1619 + f2024 39 | gen a2554 = m2554 + f2554 40 | gen a55up = m5564 + m65up + f5564 + f65up 41 | gen fem = f1619 + f2024 + f2554 + f5564 + f65up 42 | gen white = rs_wm + rs_wf 43 | gen black = rs_bm + rs_bf 44 | gen other = rs_om + rs_of 45 | gen married = marfem + marmale 46 | 47 | /* Modify union variable */ 48 | replace unmem = . 
if inlist(year, 79, 81) // Don't interpolate 1979, 1981 49 | replace unmem = unmem * 100 // Rescale into percentage 50 | 51 | /* Diff-in-diff regression */ 52 | reg lnths lnemp admico_2 admico_1 admico0 admico1 admico2 admico3 mico4 admppa_2 admppa_1 /// 53 | admppa0 admppa1 admppa2 admppa3 mppa4 admgfa_2 admgfa_1 admgfa0 admgfa1 admgfa2 admgfa3 /// 54 | mgfa4 i.year i.state i.state#c.t, cluster(state) 55 | 56 | coefplot, keep(admico_2 admico_1 admico0 admico1 admico2 admico3 mico4) /// 57 | coeflabels(admico_2 = "2 yr prior" /// 58 | admico_1 = "1 yr prior" /// 59 | admico0 = "Yr of adopt" /// 60 | admico1 = "1 yr after" /// 61 | admico2 = "2 yr after" /// 62 | admico3 = "3 yr after" /// 63 | mico4 = "4+ yr after") /// 64 | vertical /// 65 | yline(0) /// 66 | ytitle("Log points") /// 67 | xtitle("Time passage relative to year of adoption of implied contract exception") /// 68 | addplot(line @b @at) /// 69 | ciopts(recast(rcap)) /// 70 | rescale(100) /// 71 | scheme(s1mono) 72 | graph export "Figure 5-2-4-Stata.png", replace 73 | 74 | /* End of script */ 75 | -------------------------------------------------------------------------------- /05 Fixed Effects, DD and Panel Data/Figure 5-2-4.r: -------------------------------------------------------------------------------- 1 | # R code for Figure 5-2-4 # 2 | # Required packages # 3 | # foreign: read Stata .dta files # 4 | # lfe: run fixed effect regressions # 5 | # ggplot2: plot results # 6 | library(foreign) 7 | library(lfe) 8 | library(ggplot2) 9 | 10 | # Download the data and unzip it 11 | download.file( 12 | "https://www.dropbox.com/s/m6o0704ohzwep4s/outsourcingatwill_table7.zip?dl=1", 13 | "outsourcingatwill_table7.zip" 14 | ) 15 | unzip("outsourcingatwill_table7.zip") 16 | 17 | # Load the data 18 | autor <- read.dta("table7/autor-jole-2003.dta") 19 | 20 | # Log total employment: from BLS employment & earnings 21 | autor$lnemp <- log(autor$annemp) 22 | 23 | # Non-business-service sector employment from CBP 24 | autor$nonemp <- autor$stateemp - autor$svcemp 25 | autor$lnnon <- log(autor$nonemp) 26 | autor$svcfrac <- autor$svcemp / autor$nonemp 27 | 28 | # Total business services employment from CBP 29 | autor$bizemp <- autor$svcemp + autor$peremp 30 | autor$lnbiz <- log(autor$bizemp) 31 | 32 | # Restrict sample 33 | autor <- autor[which(autor$year >= 79 & autor$year <= 95), ] 34 | autor <- autor[which(autor$state != 98), ] 35 | 36 | # State dummies, year dummies, and state*time trends 37 | autor$t <- autor$year - 78 38 | autor$t2 <- autor$t^2 39 | 40 | # Generate more aggregate demographics 41 | autor$clp <- autor$clg + autor$gtc 42 | autor$a1624 <- autor$m1619 + autor$m2024 + autor$f1619 + autor$f2024 43 | autor$a2554 <- autor$m2554 + autor$f2554 44 | autor$a55up <- autor$m5564 + autor$m65up + autor$f5564 + autor$f65up 45 | autor$fem <- autor$f1619 + autor$f2024 + autor$f2554 + autor$f5564 + autor$f65up 46 | autor$white <- autor$rs_wm + autor$rs_wf 47 | autor$black <- autor$rs_bm + autor$rs_bf 48 | autor$other <- autor$rs_om + autor$rs_of 49 | autor$married <- autor$marfem + autor$marmale 50 | 51 | # Modify union variable (1. Don't interpolate 1979, 1981; 2. 
Rescale into percentage) 52 | autor$unmem[79 == autor$year | autor$year == 81] <- NA 53 | autor$unmem <- autor$unmem * 100 54 | 55 | # Create state and year factors 56 | autor$state <- factor(autor$state) 57 | autor$year <- factor(autor$year) 58 | 59 | # Diff-in-diff regression 60 | did <- felm(lnths ~ lnemp + admico_2 + admico_1 + admico0 + admico1 + admico2 + 61 | admico3 + mico4 + admppa_2 + admppa_1 + admppa0 + admppa1 + 62 | admppa2 + admppa3 + mppa4 + admgfa_2 + admgfa_1 + admgfa0 + 63 | admgfa1 + admgfa2 + admgfa3 + mgfa4 64 | | state + year + state:t | 0 | state, data = autor) 65 | 66 | # Plot results 67 | lags_leads <- c( 68 | "admico_2", "admico_1", "admico0", 69 | "admico1", "admico2", "admico3", 70 | "mico4" 71 | ) 72 | labels <- c( 73 | "2 yr prior", "1 yr prior", "Yr of adopt", 74 | "1 yr after", "2 yr after", "3 yr after", 75 | "4+ yr after" 76 | ) 77 | results.did <- data.frame( 78 | label = factor(labels, levels = labels), 79 | coef = summary(did)$coef[lags_leads, "Estimate"] * 100, 80 | se = summary(did)$coef[lags_leads, "Cluster s.e."] * 100 81 | ) 82 | g <- ggplot(results.did, aes(label, coef, group = 1)) 83 | p <- g + geom_point() + 84 | geom_line(linetype = "dotted") + 85 | geom_pointrange(aes( 86 | ymax = coef + 1.96 * se, 87 | ymin = coef - 1.96 * se 88 | )) + 89 | geom_hline(yintercept = 0) + 90 | ylab("Log points") + 91 | xlab(paste( 92 | "Time passage relative to year of", 93 | "adoption of implied contract exception" 94 | )) 95 | 96 | ggsave(p, file = "Figure 5-2-4-R.png", height = 6, width = 8, dpi = 300) 97 | 98 | # End of script 99 | -------------------------------------------------------------------------------- /03 Making Regression Make Sense/Table 3-3-2.r: -------------------------------------------------------------------------------- 1 | # R code for Table 3-3-2 # 2 | # Required packages # 3 | # - haven: read .dta files # 4 | # - knitr: print markdown # 5 | library(haven) 6 | library(knitr) 7 | 8 | # Read the Stata files into R directly from MHE Data Archive 9 | base_url = "https://economics.mit.edu/sites/default/files/inline-files" 10 | nswre74 <- read_dta(paste(base_url, "nswre74.dta", sep = "/")) 11 | cps1re74 <- read_dta(paste(base_url, "cps1re74.dta", sep = "/")) 12 | cps3re74 <- read_dta(paste(base_url, "cps3re74.dta", sep = "/")) 13 | 14 | # Function to create propensity trimmed data 15 | propensity.trim <- function(dataset) { 16 | # Specify control formulas 17 | controls <- c("age", "age2", "ed", "black", "hisp", "nodeg", "married", "re74", "re75") 18 | # Paste together probit specification 19 | spec <- paste("treat", paste(controls, collapse = " + "), sep = " ~ ") 20 | # Run probit 21 | probit <- glm(as.formula(spec), family = binomial(link = "probit"), data = dataset) 22 | # Predict probability of treatment 23 | pscore <- predict(probit, type = "response") 24 | # Return data set within range 25 | dataset[which(pscore > 0.1 & pscore < 0.9), ] 26 | } 27 | 28 | # Propensity trim data 29 | cps1re74.ptrim <- propensity.trim(cps1re74) 30 | cps3re74.ptrim <- propensity.trim(cps3re74) 31 | 32 | # Create function for summary statistics 33 | summarize <- function(dataset, treat) { 34 | # Variables to summarize 35 | summary.variables <- c("age", "ed", "black", "hisp", "nodeg", "married", "re74", "re75") 36 | # Calculate mean, removing missing 37 | summary.means <- sapply(dataset[treat, summary.variables], mean, na.rm = TRUE) 38 | summary.count <- sum(treat) 39 | c(summary.means, count = summary.count) 40 | } 41 | 42 | # Summarize data 43 | 
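# Each call below passes a logical vector that both subsets the sample and,
# via sum(treat), counts observations; the P-score columns use the samples
# screened to 0.1 < p(X) < 0.9 above.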
nswre74.treat.stats <- summarize(nswre74, nswre74$treat == 1) 44 | nswre74.control.stats <- summarize(nswre74, nswre74$treat == 0) 45 | cps1re74.stats <- summarize(cps1re74, cps1re74$treat == 0) 46 | cps3re74.stats <- summarize(cps3re74, cps3re74$treat == 0) 47 | cps1re74.ptrim.stats <- summarize(cps1re74.ptrim, cps1re74.ptrim$treat == 0) 48 | cps3re74.ptrim.stats <- summarize(cps3re74.ptrim, cps3re74.ptrim$treat == 0) 49 | 50 | # Combine the summary statistics 51 | summary.stats <- rbind(nswre74.treat.stats, 52 | nswre74.control.stats, 53 | cps1re74.stats, 54 | cps3re74.stats, 55 | cps1re74.ptrim.stats, 56 | cps3re74.ptrim.stats) 57 | 58 | # Round the digits and transpose table 59 | summary.stats <- cbind(round(summary.stats[ , 1:6], 2), 60 | formatC(round(summary.stats[ , 7:9], 0), 61 | format = "d", 62 | big.mark = ",")) 63 | summary.stats <- t(summary.stats) 64 | 65 | # Format table with row and column names 66 | row.names(summary.stats) <- c("Age", 67 | "Years of schooling", 68 | "Black", 69 | "Hispanic", 70 | "Dropout", 71 | "Married", 72 | "1974 earnings", 73 | "1975 earnings", 74 | "Number of Obs.") 75 | 76 | colnames(summary.stats) <- c("NSW Treat", "NSW Control", 77 | "Full CPS-1", "Full CPS-3", 78 | "P-score CPS-1", "P-score CPS-3") 79 | 80 | # Print table in markdown 81 | print(kable(summary.stats)) 82 | 83 | # End of script 84 | -------------------------------------------------------------------------------- /05 Fixed Effects, DD and Panel Data/Figure 5-2-4.jl: -------------------------------------------------------------------------------- 1 | # Load packages 2 | using FileIO, StatFiles, DataFrames, CategoricalArrays 3 | using FixedEffectModels 4 | using Gadfly 5 | using Cairo 6 | 7 | # Download the data and unzip it 8 | download( 9 | "https://www.dropbox.com/s/m6o0704ohzwep4s/outsourcingatwill_table7.zip?dl=1", 10 | "outsourcingatwill_table7.zip", 11 | ) 12 | run(`unzip -o outsourcingatwill_table7.zip`) 13 | 14 | # Import data 15 | autor = DataFrame(load("table7/autor-jole-2003.dta")); 16 | 17 | # Log total employment: from BLS employment & earnings 18 | autor.lnemp = log.(autor.annemp); 19 | 20 | # Non-business-service sector employment from CBP 21 | autor.nonemp = autor.stateemp .- autor.svcemp; 22 | autor.lnnon = log.(autor.nonemp); 23 | autor.svcfrac = autor.svcemp ./ autor.nonemp; 24 | 25 | # Total business services employment from CBP 26 | autor.bizemp = autor.svcemp .+ autor.peremp 27 | autor.lnbiz = log.(autor.bizemp) 28 | 29 | # Restrict sample 30 | autor = autor[autor.year.>=79, :]; 31 | autor = autor[autor.year.<=95, :]; 32 | autor = autor[autor.state.!=98, :]; 33 | 34 | # State dummies, year dummies, and state*time trends 35 | autor.t = autor.year .- 78; 36 | autor.t2 = autor.t .^ 2; 37 | 38 | # Generate more aggregate demographics 39 | autor.clp = autor.clg .+ autor.gtc; 40 | autor.a1624 = autor.m1619 .+ autor.m2024 .+ autor.f1619 .+ autor.f2024; 41 | autor.a2554 = autor.m2554 .+ autor.f2554; 42 | autor.a55up = autor.m5564 .+ autor.m65up .+ autor.f5564 .+ autor.f65up; 43 | autor.fem = autor.f1619 .+ autor.f2024 .+ autor.f2554 .+ autor.f5564 .+ autor.f65up; 44 | autor.white = autor.rs_wm .+ autor.rs_wf; 45 | autor.black = autor.rs_bm .+ autor.rs_bf; 46 | autor.other = autor.rs_om .+ autor.rs_of; 47 | autor.married = autor.marfem .+ autor.marmale; 48 | 49 | # Create categorical variable for state and year 50 | autor.state_c = categorical(autor.state); 51 | autor.year_c = categorical(autor.year); 52 | 53 | # Diff-in-diff regression 54 | did = reg( 55 | autor, 56 | 
@formula( 57 | lnths ~ 58 | lnemp + 59 | admico_2 + admico_1 + admico0 + admico1 + admico2 + admico3 + mico4 + 60 | admppa_2 + admppa_1 + admppa0 + admppa1 + admppa2 + admppa3 + mppa4 + 61 | admgfa_2 + admgfa_1 + admgfa0 + admgfa1 + admgfa2 + admgfa3 + mgfa4 + 62 | fe(state_c) + fe(year_c) + fe(state_c)&t 63 | ), 64 | Vcov.cluster(:state_c), 65 | ) 66 | 67 | # Store results in a DataFrame for a plot 68 | results_did = DataFrame( 69 | label = coefnames(did), 70 | coef = coef(did) .* 100, 71 | se = stderror(did) .* 100 72 | ); 73 | 74 | # Keep only the relevant coefficients 75 | results_did = filter(r -> any(occursin.(r"admico|mico", r.label)), results_did); 76 | 77 | # Define labels for coefficients 78 | results_did.label .= [ 79 | "2 yr prior", 80 | "1 yr prior", 81 | "Yr of adopt", 82 | "1 yr after", 83 | "2 yr after", 84 | "3 yr after", 85 | "4+ yr after", 86 | ]; 87 | 88 | # Make plot 89 | figure = plot( 90 | results_did, 91 | x = "label", 92 | y = "coef", 93 | ymin = results_did.coef .- 1.96 .* results_did.se, 94 | ymax = results_did.coef .+ 1.96 .* results_did.se, 95 | Geom.point, 96 | Geom.line, 97 | Geom.errorbar, 98 | Guide.xlabel( 99 | "Time passage relative to year of adoption " * 100 | "of implied contract exception", 101 | ), 102 | Guide.ylabel("Log points"), 103 | ); 104 | 105 | # Export figure 106 | draw(PNG("Figure 5-2-4-Julia.png", 7inch, 6inch), figure); 107 | -------------------------------------------------------------------------------- /06 Getting a Little Jumpy/Figure 6-1-2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Create Figure 6.1.2 in MHE 4 | Tested on Python 3.4 5 | pandas: import .dta and manipulate data 6 | matplotlib: plot figures; patsy + scikit-learn: fit the logit 7 | """ 8 | import urllib 9 | import zipfile 10 | import urllib.request 11 | import pandas 12 | import matplotlib.pyplot as plt 13 | import seaborn as sns 14 | import numpy 15 | from patsy import dmatrices 16 | from sklearn.linear_model import LogisticRegression 17 | 18 | # Download data and unzip the data 19 | urllib.request.urlretrieve('http://economics.mit.edu/faculty/angrist/data1/mhe/lee', 'Lee2008.zip') 20 | with zipfile.ZipFile('Lee2008.zip', 'r') as z: 21 | z.extractall() 22 | 23 | # Load the data 24 | lee = pandas.read_stata('Lee2008/individ_final.dta') 25 | 26 | # Subset by non-missing in the outcome and running variable for panel (a) 27 | panel_a = lee[['myoutcomenext', 'difshare']].dropna(axis = 0) 28 | 29 | # Create indicator when crossing the cut-off 30 | panel_a['d'] = (panel_a['difshare'] >= 0) * 1.0 31 | 32 | # Create matrices for logistic regression 33 | y, X = dmatrices('myoutcomenext ~ d*(difshare + numpy.power(difshare, 2) + numpy.power(difshare, 3) + numpy.power(difshare, 4))', panel_a) 34 | 35 | # Flatten y into a 1-D array for the sklearn LogisticRegression 36 | y = numpy.ravel(y) 37 | 38 | # Run the logistic regression (a large C effectively disables sklearn's default L2 penalty, approximating an unpenalized logit) 39 | logit = LogisticRegression(C = 1e9).fit(X, y) 40 | 41 | # Produce predicted probabilities 42 | panel_a['predict'] = logit.predict_proba(X)[:, 1] 43 | 44 | # Create 0.005 intervals of the running variable 45 | breaks = numpy.arange(-1.0, 1.005, 0.005) 46 | panel_a['i005'] = pandas.cut(panel_a['difshare'], breaks) 47 | 48 | # Calculate means by interval 49 | mean_panel_a = panel_a.groupby('i005').mean().dropna(axis = 0) 50 | restriction_a = (mean_panel_a['difshare'] > -0.251) & (mean_panel_a['difshare'] < 0.251) 51 | mean_panel_a = mean_panel_a[restriction_a] 52 | 53 | # Calculate means for panel (b) 54 | panel_b = 
lee[['difshare', 'mofficeexp', 'mpofficeexp']].dropna(axis = 0) 55 | panel_b['i005'] = pandas.cut(panel_b['difshare'], breaks) 56 | mean_panel_b = panel_b.groupby('i005').mean().dropna(axis = 0) 57 | restriction_b = (mean_panel_b['difshare'] > -0.251) & (mean_panel_b['difshare'] < 0.251) 58 | mean_panel_b = mean_panel_b[restriction_b] 59 | 60 | # Plot figures 61 | fig = plt.figure(figsize = (7, 7)) 62 | 63 | # Panel (a) 64 | ax_a = fig.add_subplot(211) 65 | ax_a.scatter(mean_panel_a['difshare'], 66 | mean_panel_a['myoutcomenext'], 67 | edgecolors = 'none', color = 'black') 68 | ax_a.plot(mean_panel_a['difshare'][mean_panel_a['difshare'] >= 0], 69 | mean_panel_a['predict'][mean_panel_a['difshare'] >= 0], 70 | color = 'black') 71 | ax_a.plot(mean_panel_a['difshare'][mean_panel_a['difshare'] < 0], 72 | mean_panel_a['predict'][mean_panel_a['difshare'] < 0], 73 | color = 'black') 74 | ax_a.axvline(0, linestyle = '--', color = 'black') 75 | ax_a.set_title('a') 76 | 77 | # Panel (b) 78 | ax_b = fig.add_subplot(212) 79 | ax_b.scatter(mean_panel_b['difshare'], 80 | mean_panel_b['mofficeexp'], 81 | edgecolors = 'none', color = 'black') 82 | ax_b.plot(mean_panel_b['difshare'][mean_panel_b['difshare'] >= 0], 83 | mean_panel_b['mpofficeexp'][mean_panel_b['difshare'] >= 0], 84 | color = 'black') 85 | ax_b.plot(mean_panel_b['difshare'][mean_panel_b['difshare'] < 0], 86 | mean_panel_b['mpofficeexp'][mean_panel_b['difshare'] < 0], 87 | color = 'black') 88 | ax_b.axvline(0, linestyle = '--', color = 'black') 89 | ax_b.set_title('b') 90 | 91 | plt.tight_layout() 92 | plt.savefig('Figure 6-1-2-Python.png', dpi = 300) 93 | 94 | # End of script 95 | -------------------------------------------------------------------------------- /06 Getting a Little Jumpy/Figure 6-1-2.r: -------------------------------------------------------------------------------- 1 | # R code for Figure 6.1.2 # 2 | # Required packages # 3 | # - haven: read .dta files # 4 | # - data.table: alternative to data.frame # 5 | # - ggplot2: making pretty graphs # 6 | # - gridExtra: combine graphs # 7 | library(haven) 8 | library(data.table) 9 | library(ggplot2) 10 | library(gridExtra) 11 | 12 | # Download data and unzip the data 13 | # download.file('http://economics.mit.edu/faculty/angrist/data1/mhe/lee', 'Lee2008.zip') 14 | # unzip('Lee2008.zip') 15 | 16 | # Load the .dta file as data.table 17 | lee <- data.table(read_dta('Lee2008/individ_final.dta')) 18 | 19 | # Subset by non-missing in the outcome and running variable for panel (a) 20 | panel.a <- na.omit(lee[, c("myoutcomenext", "difshare"), with = FALSE]) 21 | 22 | # Create indicator when crossing the cut-off 23 | panel.a <- panel.a[ , d := (difshare >= 0) * 1.0] 24 | 25 | # Predict with local polynomial logit of degree 4 26 | logit <- glm(formula = myoutcomenext ~ poly(difshare, degree = 4) + 27 | poly(difshare, degree = 4) * d, 28 | family = binomial(link = "logit"), 29 | data = panel.a) 30 | panel.a <- panel.a[ , pmyoutcomenext := predict(logit, panel.a, type = "response")] 31 | 32 | # Create local average by 0.005 interval of the running variable 33 | breaks <- round(seq(-1, 1, by = 0.005), 3) 34 | panel.a <- panel.a[ , i005 := as.numeric(as.character(cut(difshare, 35 | breaks = breaks, 36 | labels = head(breaks, -1), 37 | right = TRUE))), ] 38 | 39 | panel.a <- panel.a[ , list(m_next = mean(myoutcomenext), 40 | mp_next = mean(pmyoutcomenext)), 41 | by = i005] 42 | 43 | # Plot panel (a) 44 | panel.a <- panel.a[which(panel.a$i005 > -0.251 & panel.a$i005 < 0.251), ] 45 | plot.a <- 
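# The group aesthetic (i005 >= 0) in geom_line below fits separate segments on each side of the cutoff, so the logit fit is never drawn across the discontinuity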
ggplot(data = panel.a, aes(x = i005)) + 46 | geom_point(aes(y = m_next)) + 47 | geom_line(aes(y = mp_next, group = i005 >= 0)) + 48 | geom_vline(xintercept = 0, linetype = 'longdash') + 49 | xlab('Democratic Vote Share Margin of Victory, Election t') + 50 | ylab('Probability of Victory, Election t+1') + 51 | ggtitle('a') 52 | 53 | # Subset the outcome for panel (b) 54 | panel.b <- lee[ , i005 := as.numeric(as.character(cut(difshare, 55 | breaks = breaks, 56 | labels = head(breaks, -1), 57 | right = TRUE))), ] 58 | 59 | panel.b <- panel.b[ , list(m_vic = mean(mofficeexp, na.rm = TRUE), 60 | mp_vic = mean(mpofficeexp, na.rm = TRUE)), 61 | by = i005] 62 | 63 | panel.b <- panel.b[which(panel.b$i005 > -0.251 & panel.b$i005 < 0.251), ] 64 | plot.b <- ggplot(data = panel.b, aes(x = i005)) + 65 | geom_point(aes(y = m_vic)) + 66 | geom_line(aes(y = mp_vic, group = i005 >= 0)) + 67 | geom_vline(xintercept = 0, linetype = 'longdash') + 68 | xlab('Democratic Vote Share Margin of Victory, Election t') + 69 | ylab('No. of Past Victories as of Election t') + 70 | ggtitle('b') 71 | 72 | lee.p <- arrangeGrob(plot.a, plot.b) 73 | ggsave(lee.p, file = "Figure 6-1-2-R.png", height = 12, width = 8, dpi = 300) 74 | 75 | # End of script 76 | -------------------------------------------------------------------------------- /05 Fixed Effects, DD and Panel Data/Figure 5-2-4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Tested on Python 3.8.5 4 | """ 5 | import urllib.request 6 | import zipfile 7 | import pandas as pd 8 | import numpy as np 9 | from linearmodels.panel import PanelOLS 10 | import seaborn as sns 11 | import matplotlib.pyplot as plt 12 | 13 | # Download data and unzip the data 14 | urllib.request.urlretrieve('https://www.dropbox.com/s/m6o0704ohzwep4s/outsourcingatwill_table7.zip?dl=1', 'outsourcingatwill_table7.zip') 15 | with zipfile.ZipFile('outsourcingatwill_table7.zip', 'r') as z: 16 | z.extractall() 17 | 18 | # Import data 19 | autor = pd.read_stata("table7/autor-jole-2003.dta") 20 | 21 | # Log total employment: from BLS employment & earnings 22 | autor["lnemp"] = np.log(autor["annemp"]) 23 | 24 | # Non-business-service sector employment from CBP 25 | autor["nonemp"] = autor["stateemp"] - autor["svcemp"] 26 | autor["lnnon"] = np.log(autor["nonemp"]) 27 | autor["svcfrac"] = autor["svcemp"] / autor["nonemp"] 28 | 29 | # Total business services employment from CBP 30 | autor["bizemp"] = autor["svcemp"] + autor["peremp"] 31 | autor["lnbiz"] = np.log(autor["bizemp"]) 32 | 33 | # Restrict sample 34 | autor = autor[autor["year"] >= 79] 35 | autor = autor[autor["year"] <= 95] 36 | autor = autor[autor["state"] != 98] 37 | 38 | # State dummies, year dummies, and state*time trends 39 | autor["t"] = autor["year"] - 78 40 | autor["t2"] = autor["t"] ** 2 41 | 42 | # Generate more aggregate demographics 43 | autor["clp"] = autor["clg"] + autor["gtc"] 44 | autor["a1624"] = autor["m1619"] + autor["m2024"] + autor["f1619"] + autor["f2024"] 45 | autor["a2554"] = autor["m2554"] + autor["f2554"] 46 | autor["a55up"] = autor["m5564"] + autor["m65up"] + autor["f5564"] + autor["f65up"] 47 | autor["fem"] = ( 48 | autor["f1619"] + autor["f2024"] + autor["f2554"] + autor["f5564"] + autor["f65up"] 49 | ) 50 | autor["white"] = autor["rs_wm"] + autor["rs_wf"] 51 | autor["black"] = autor["rs_bm"] + autor["rs_bf"] 52 | autor["other"] = autor["rs_om"] + autor["rs_of"] 53 | autor["married"] = autor["marfem"] + autor["marmale"] 54 | 55 | # Create categorical 
for state 56 | autor["state_c"] = pd.Categorical(autor["state"]) 57 | 58 | # Set index for use with linearmodels 59 | autor = autor.set_index(["state", "year"], drop=False) 60 | 61 | # Diff-in-diff regression 62 | did = PanelOLS.from_formula( 63 | ( 64 | "lnths ~" 65 | "1 +" 66 | "lnemp +" 67 | "admico_2 + admico_1 + admico0 + admico1 + admico2 + admico3 + mico4 +" 68 | "admppa_2 + admppa_1 + admppa0 + admppa1 + admppa2 + admppa3 + mppa4 +" 69 | "admgfa_2 + admgfa_1 + admgfa0 + admgfa1 + admgfa2 + admgfa3 + mgfa4 +" 70 | "state_c:t +" 71 | "EntityEffects + TimeEffects" 72 | ), 73 | data=autor, 74 | drop_absorbed=True 75 | ).fit(cov_type='clustered', cluster_entity=True) 76 | 77 | # Store results in a DataFrame for a plot 78 | results_did = pd.DataFrame( 79 | {"coef": did.params * 100, "ci": 1.96 * did.std_errors * 100} 80 | ) 81 | 82 | # Keep only the relevant coefficients 83 | results_did = results_did.filter(regex="admico|mico", axis=0).reset_index() 84 | 85 | # Define labels for coefficients 86 | results_did_labels = [ 87 | "2 yr prior", 88 | "1 yr prior", 89 | "Yr of adopt", 90 | "1 yr after", 91 | "2 yr after", 92 | "3 yr after", 93 | "4+ yr after", 94 | ] 95 | 96 | # Make plot 97 | fig, ax = plt.subplots() 98 | 99 | ax.errorbar(x="index", y="coef", yerr="ci", marker=".", data=results_did) 100 | ax.axhline(y=0) 101 | ax.set_xticklabels(results_did_labels) 102 | ax.set_xlabel( 103 | ("Time passage relative to year of adoption of " "implied contract exception") 104 | ) 105 | ax.set_ylabel("Log points") 106 | 107 | plt.tight_layout() 108 | plt.savefig("Figure 5-2-4-Python.png", format="png")  # save before show(), which clears the active figure 109 | plt.show() 110 | 111 | # End of script 112 | -------------------------------------------------------------------------------- /06 Getting a Little Jumpy/Figure 6-1-2.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | 4 | * Download data and unzip the data 5 | shell curl -o Lee2008.zip http://economics.mit.edu/faculty/angrist/data1/mhe/lee 6 | unzipfile Lee2008.zip, replace 7 | 8 | * Load the data 9 | use "Lee2008/individ_final.dta", clear 10 | 11 | * Create 0.005 intervals of democratic share of votes 12 | egen i005 = cut(difshare), at(-1(0.005)1.005) 13 | 14 | * Take the mean within each interval 15 | egen m_next = mean(myoutcomenext), by(i005) 16 | 17 | * Predict with polynomial logit of degree 4 18 | foreach poly of numlist 1(1)4 { 19 | gen poly_`poly' = difshare^`poly' 20 | } 21 | 22 | gen d = (difshare >= 0) 23 | logit myoutcomenext c.poly_*##d 24 | predict next_pr, pr 25 | egen mp_next = mean(next_pr), by(i005) 26 | 27 | * Create the variables for office experience (taken as given from Lee, 2008) 28 | egen mp_vic = mean(mpofficeexp), by(i005) 29 | egen m_vic = mean(mofficeexp), by(i005) 30 | 31 | * Tag each interval once for the plot 32 | egen tag_i005 = tag(i005) 33 | 34 | * Plot panel (a) 35 | graph twoway (scatter m_next i005, msize(small)) /// 36 | (line mp_next i005 if i005 >= 0, sort) /// 37 | (line mp_next i005 if i005 < 0, sort) /// 38 | if i005 > -0.251 & i005 < 0.251 & tag_i005 == 1, /// 39 | xline(0, lpattern(dash)) /// 40 | title("a") /// 41 | xtitle("Democratic Vote Share Margin of Victory, Election t") /// 42 | ytitle("Probability of Victory, Election t+1") /// 43 | yscale(r(0 1)) ylabel(0(.1)1) /// 44 | xscale(r(-0.25 0.25)) xlabel(-0.25(.05)0.25) /// 45 | legend(order(1 2) cols(1) /// 46 | ring(0) bplacement(nwest) /// 47 | label(1 "Local Average") label(2 "Logit Fit")) /// 48 | scheme(s1mono) /// 49 | 
saving(panel_a.gph, replace) 50 | 51 | * Plot panel (b) 52 | graph twoway (scatter m_vic i005, msize(small)) /// 53 | (line mp_vic i005 if i005 >= 0, sort) /// 54 | (line mp_vic i005 if i005 < 0, sort) /// 55 | if i005 > -0.251 & i005 < 0.251 & tag_i005 == 1, /// 56 | xline(0, lpattern(dash)) /// 57 | title("b") /// 58 | xtitle("Democratic Vote Share Margin of Victory, Election t") /// 59 | ytitle("No. of Past Victories as of Election t") /// 60 | yscale(r(0 5)) ylabel(0(.5)5) /// 61 | xscale(r(-0.25 0.25)) xlabel(-0.25(.05)0.25) /// 62 | legend(order(1 2) cols(1) /// 63 | ring(0) bplacement(nwest) /// 64 | label(1 "Local Average") label(2 "Logit Fit")) /// 65 | scheme(s1mono) /// 66 | saving(panel_b.gph, replace) 67 | 68 | * Combine plots 69 | graph combine panel_a.gph panel_b.gph, /// 70 | col(1) /// 71 | xsize(4) ysize(6) /// 72 | graphregion(margin(zero)) /// 73 | scheme(s1mono) 74 | 75 | * Export figures 76 | graph export "Figure 6-1-2-Stata.png", replace 77 | 78 | /* End of file */ 79 | exit 80 | -------------------------------------------------------------------------------- /06 Getting a Little Jumpy/Figure 6-1-2.jl: -------------------------------------------------------------------------------- 1 | # Load packages 2 | using DataFrames 3 | using Gadfly 4 | using Compose 5 | using GLM 6 | 7 | # Download the data and unzip it 8 | # download("http://economics.mit.edu/faculty/angrist/data1/mhe/lee", "Lee2008.zip") 9 | # run(`unzip Lee2008.zip`) 10 | 11 | # Read the data 12 | lee = readtable("Lee2008/individ_final.csv") 13 | 14 | # Subset by non-missing in the outcome and running variable for panel (a) 15 | panel_a = lee[!isna(lee[:, Symbol("difshare")]) & !isna(lee[:, Symbol("myoutcomenext")]), :] 16 | 17 | # Create indicator when crossing the cut-off 18 | panel_a[:d] = (panel_a[:difshare] .>= 0) .* 1.0 19 | 20 | # Predict with local polynomial logit of degree 4 21 | panel_a[:difshare2] = panel_a[:difshare].^2 22 | panel_a[:difshare3] = panel_a[:difshare].^3 23 | panel_a[:difshare4] = panel_a[:difshare].^4 24 | 25 | logit = glm(myoutcomenext ~ difshare + difshare2 + difshare3 + difshare4 + d + 26 | d*difshare + d*difshare2 + d*difshare3 + d*difshare4, 27 | panel_a, 28 | Binomial(), 29 | LogitLink()) 30 | panel_a[:mmyoutcomenext] = predict(logit) 31 | 32 | # Create local average by 0.005 interval of the running variable 33 | panel_a[:i005] = cut(panel_a[:difshare], collect(-1:0.005:1)) 34 | mean_panel_a = aggregate(panel_a, :i005, [mean]) 35 | 36 | # Restrict within bandwidth of +/- 0.251 37 | restriction_a = (mean_panel_a[:difshare_mean] .> -0.251) & (mean_panel_a[:difshare_mean] .< 0.251) 38 | mean_panel_a = mean_panel_a[restriction_a, :] 39 | 40 | # Plot panel (a) 41 | plot_a = plot(layer(x = mean_panel_a[:difshare_mean], 42 | y = mean_panel_a[:myoutcomenext_mean], 43 | Geom.point), 44 | layer(x = mean_panel_a[mean_panel_a[:difshare_mean] .< 0, :difshare_mean], 45 | y = mean_panel_a[mean_panel_a[:difshare_mean] .< 0, :mmyoutcomenext_mean], 46 | Geom.line), 47 | layer(x = mean_panel_a[mean_panel_a[:difshare_mean] .>= 0, :difshare_mean], 48 | y = mean_panel_a[mean_panel_a[:difshare_mean] .>= 0, :mmyoutcomenext_mean], 49 | Geom.line), 50 | layer(xintercept = [0], 51 | Geom.vline, 52 | Theme(line_style = Gadfly.get_stroke_vector(:dot))), 53 | Guide.xlabel("Democratic Vote Share Margin of Victory, Election t"), 54 | Guide.ylabel("Probability of Victory, Election t+1"), 55 | Guide.title("a")) 56 | 57 | # Create local average by 0.005 interval of the running variable 58 | panel_b = 
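# Panel (b) plots a pre-determined outcome: past victories are fixed before election t, so they should vary smoothly through the cutoff; a jump here would point to sorting around the threshold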
lee[!isna(lee[:, Symbol("difshare")]) & !isna(lee[:, Symbol("mofficeexp")]), :] 59 | panel_b[:i005] = cut(panel_b[:difshare], collect(-1:0.005:1)) 60 | mean_panel_b = aggregate(panel_b, :i005, [mean]) 61 | 62 | # Restrict within bandwidth of +/- 0.251 63 | restriction_b = (mean_panel_b[:difshare_mean] .> -0.251) & (mean_panel_b[:difshare_mean] .< 0.251) 64 | mean_panel_b = mean_panel_b[restriction_b, :] 65 | 66 | # Plot panel (b) 67 | plot_b = plot(layer(x = mean_panel_b[:difshare_mean], 68 | y = mean_panel_b[:mofficeexp_mean], 69 | Geom.point), 70 | layer(x = mean_panel_b[mean_panel_b[:difshare_mean] .< 0, :difshare_mean], 71 | y = mean_panel_b[mean_panel_b[:difshare_mean] .< 0, :mpofficeexp_mean], 72 | Geom.line), 73 | layer(x = mean_panel_b[mean_panel_b[:difshare_mean] .>= 0, :difshare_mean], 74 | y = mean_panel_b[mean_panel_b[:difshare_mean] .>= 0, :mpofficeexp_mean], 75 | Geom.line), 76 | layer(xintercept = [0], 77 | Geom.vline, 78 | Theme(line_style = Gadfly.get_stroke_vector(:dot))), 79 | Guide.xlabel("Democratic Vote Share Margin of Victory, Election t"), 80 | Guide.ylabel("No. of Past Victories as of Election t"), 81 | Guide.title("b")) 82 | 83 | # Combine plots 84 | draw(PNG("Figure 6-1-2-Julia.png", 6inch, 8inch), vstack(plot_a, plot_b)) 85 | 86 | # End of script 87 | -------------------------------------------------------------------------------- /08 Nonstandard Standard Error Issues/Table 8-1-1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Tested on Python 3.4 4 | numpy: generate random data, manipulate arrays 5 | statsmodels.api: estimate OLS and robust errors 6 | tabulate: pretty print to markdown 7 | scipy.stats: calculate distributions 8 | """ 9 | 10 | import numpy as np 11 | import statsmodels.api as sm 12 | from tabulate import tabulate 13 | import scipy.stats 14 | 15 | # Set seed 16 | np.random.seed(1025) 17 | 18 | # Set number of simulations 19 | nsims = 25000 20 | 21 | # Create function to create data for each run 22 | def generateHC(sigma): 23 | # Set parameters of the simulation 24 | N = 30 25 | r = 0.9 26 | N_1 = int(r * 30) 27 | 28 | # Generate simulation data 29 | d = np.ones(N); d[0:N_1] = 0; 30 | 31 | epsilon = np.empty(N) 32 | epsilon[d == 1] = np.random.normal(0, 1, N - N_1) 33 | epsilon[d == 0] = np.random.normal(0, sigma, N_1) 34 | 35 | # Run regression 36 | y = 0 * d + epsilon 37 | X = sm.add_constant(d) 38 | model = sm.OLS(y, X) 39 | results = model.fit() 40 | b1 = results.params[1] 41 | 42 | # Calculate standard errors 43 | conventional = results.bse[1] 44 | hc0 = results.get_robustcov_results(cov_type = 'HC0').bse[1] 45 | hc1 = results.get_robustcov_results(cov_type = 'HC1').bse[1] 46 | hc2 = results.get_robustcov_results(cov_type = 'HC2').bse[1] 47 | hc3 = results.get_robustcov_results(cov_type = 'HC3').bse[1] 48 | return([b1, conventional, hc0, hc1, hc2, hc3]) 49 | 50 | # Create function to report simulations 51 | def simulateHC(nsims, sigma): 52 | # Initialize array to save results 53 | simulation_results = np.empty(shape = [nsims, 6]) 54 | 55 | # Run simulation, passing through the sigma for this panel 56 | for i in range(0, nsims): 57 | simulation_results[i, :] = generateHC(sigma) 58 | 59 | # Take maximum of conventional versus HC's, and combine with simulation results 60 | compare_errors = np.maximum(simulation_results[:, 1].transpose(), 61 | simulation_results[:, 2:6].transpose()).transpose() 62 | simulation_results = np.concatenate((simulation_results, compare_errors), axis = 1) 63 | 64 | # Calculate rejection
rates (note backslash = explicit line continuation) 65 | test_stats = np.tile(simulation_results[:, 0], (9, 1)).transpose() / \ 66 | simulation_results[:, 1:10] 67 | summary_reject_z = np.mean(2 * scipy.stats.norm.cdf(-abs(test_stats)) <= 0.05, 68 | axis = 0).transpose() 69 | summary_reject_t = np.mean(2 * scipy.stats.t.cdf(-abs(test_stats), df = 30 - 2) <= 0.05, 70 | axis = 0).transpose() 71 | summary_reject_z = np.concatenate([[np.nan], summary_reject_z]).transpose() 72 | summary_reject_t = np.concatenate([[np.nan], summary_reject_t]).transpose() 73 | 74 | # Calculate mean and standard errors 75 | summary_mean = np.mean(simulation_results, axis = 0).transpose() 76 | summary_std = np.std(simulation_results, axis = 0).transpose() 77 | 78 | # Create labels 79 | summary_labs = np.array(["Beta_1", "Conventional","HC0", "HC1", "HC2", "HC3", 80 | "max(Conventional, HC0)", "max(Conventional, HC1)", 81 | "max(Conventional, HC2)", "max(Conventional, HC3)"]) 82 | 83 | # Combine all the results and labels 84 | summary_stats = np.column_stack((summary_labs, 85 | summary_mean, 86 | summary_std, 87 | summary_reject_z, 88 | summary_reject_t)) 89 | 90 | # Create header for table 91 | header = ["Mean", "Std", "z rate", "t rate"] 92 | return(tabulate(summary_stats, header, tablefmt = "pipe")) 93 | 94 | print("Panel A") 95 | print(simulateHC(nsims, 0.5)) 96 | 97 | print("Panel B") 98 | print(simulateHC(nsims, 0.85)) 99 | 100 | print("Panel C") 101 | print(simulateHC(nsims, 1)) 102 | # End of script 103 | -------------------------------------------------------------------------------- /08 Nonstandard Standard Error Issues/Table 8-1-1.r: -------------------------------------------------------------------------------- 1 | # R code for Table 8-1-1 # 2 | # Required packages # 3 | # - sandwich: robust standard error # 4 | # - parallel: parallelize simulation # 5 | # - plyr: apply functions # 6 | # - lmtest: simplifies testing # 7 | # - reshape2: reshapin' data # 8 | # - knitr: print markdown tables # 9 | library(sandwich) 10 | library(parallel) 11 | library(plyr) 12 | library(lmtest) 13 | library(reshape2) 14 | library(knitr) 15 | 16 | # Set seed for replication 17 | set.seed(1984, "L'Ecuyer") 18 | 19 | # Set number of simulations 20 | nsims = 25000 21 | 22 | # Set parameters of the simulation 23 | N = 30 24 | r = 0.9 25 | N_1 = r * 30 26 | 27 | # Store a list of the standard error types 28 | se.types <- c("const", paste0("HC", 0:3)) 29 | 30 | # Create a function to extract standard errors 31 | calculate.se <- function(lm.obj, type) { 32 | sqrt(vcovHC(lm.obj, type = type)[2, 2]) 33 | } 34 | 35 | # Create function to calculate max of conventional versus robust, returning max 36 | compare.conv <- function(conventional, x) { 37 | pmax(conventional, x) 38 | } 39 | 40 | # Create function for rejection rate 41 | reject.rate <- function(x) { 42 | mean(ifelse(x <= 0.05, 1, 0)) 43 | } 44 | 45 | # Create function for simulation 46 | clusterBiasSim <- function(sigma = 1,...) 
{ 47 | # Generate data 48 | d <- c(rep(0, N_1), rep(1, N - N_1)) 49 | epsilon <- rnorm(n = N, sd = sigma) * (d == 0) + rnorm(n = N) * (d == 1) 50 | y <- 0 * d + epsilon 51 | simulated.data <- data.frame(y = y, d = d) 52 | 53 | # Run regression 54 | lm.sim <- lm(y ~ d, data = simulated.data) 55 | b1 <- coef(lm.sim)[2] 56 | 57 | # Calculate standard errors 58 | se.sim <- sapply(se.types, calculate.se, lm.obj = lm.sim) 59 | 60 | # Return the results of a simulation 61 | data.frame(b1, t(se.sim)) 62 | } 63 | 64 | # Function for running simulations and returning table of results 65 | summarizeBias <- function(nsims = 25000, sigma = 1) { 66 | # Run simulation 67 | simulated.results <- do.call(rbind, 68 | mclapply(1:nsims, 69 | clusterBiasSim, 70 | sigma = sigma)) 71 | 72 | # Calculate maximums 73 | se.compare <- sapply(simulated.results[ , se.types[-1]], 74 | compare.conv, 75 | conventional = simulated.results$const) 76 | colnames(se.compare) <- paste0("max.const.", colnames(se.compare)) 77 | simulated.results <- data.frame(simulated.results, se.compare) 78 | 79 | # Calculate rejections 80 | melted.sims <- melt(simulated.results, measure = 2:10) 81 | melted.sims$z.p <- 2 * pnorm(abs(melted.sims$b1 / melted.sims$value), 82 | lower.tail = FALSE) 83 | melted.sims$t.p <- 2 * pt(abs(melted.sims$b1 / melted.sims$value), 84 | df = 30 - 2, 85 | lower.tail = FALSE) 86 | 87 | rejections <- aggregate(melted.sims[ , c("z.p", "t.p")], 88 | by = list(melted.sims$variable), 89 | FUN = reject.rate) 90 | rownames(rejections) <- rejections$Group.1 91 | 92 | # Get means and standard deviations 93 | summarize.table <- sapply(simulated.results, 94 | each(mean, sd), 95 | na.rm = TRUE) 96 | summarize.table <- t(summarize.table) 97 | 98 | # Return all the results as one data.frame 99 | merge(summarize.table, rejections[-1], by = "row.names", all.x = TRUE) 100 | } 101 | 102 | # Function for printing results to markdown 103 | printBias <- function(obj.df) { 104 | colnames(obj.df) <- c("Estimate", "Mean", "Std", "Normal", "t") 105 | obj.df$Estimate <- c("Beta_1", "Conventional", 106 | paste0("HC", 0:3), 107 | paste0("max(Conventional, HC", 0:3, ")")) 108 | print(kable(obj.df, digits = 3)) 109 | } 110 | 111 | # Panel A 112 | panel.a <- summarizeBias(nsims = nsims, sigma = 0.5) 113 | printBias(panel.a) 114 | # Panel B 115 | panel.b <- summarizeBias(nsims = nsims, sigma = 0.85) 116 | printBias(panel.b) 117 | # Panel C 118 | panel.c <- summarizeBias(nsims = nsims, sigma = 1) 119 | printBias(panel.c) 120 | 121 | # End of file 122 | -------------------------------------------------------------------------------- /08 Nonstandard Standard Error Issues/Table 8-1-1.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | eststo clear 4 | capture log close _all 5 | capture version 13 6 | 7 | /* Set random seed for replication */ 8 | set seed 42 9 | 10 | /* Number of simulations */ 11 | local reps = 25000 12 | 13 | /* Define program for use with -simulate- command */ 14 | capture program drop clusterbias 15 | program define clusterbias, rclass 16 | syntax, [sigma(real 1)] 17 | 18 | /* Set parameters of the simulation */ 19 | local N = 30 20 | local r = 0.9 21 | local N_1 = `r' * 30 22 | 23 | clear 24 | set obs `N' 25 | gen D = (`N_1' < _n) 26 | gen epsilon = rnormal(0, `sigma') if D == 0 27 | replace epsilon = rnormal(0, 1) if D == 1 28 | gen Y = 0 * D + epsilon 29 | 30 | /* Conventional */ 31 | regress Y D 32 | matrix B = e(b) 33 | local b1 = B[1, 1] 34 | matrix C = e(V) 
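/* e(V) holds the estimated variance-covariance matrix of the coefficients; its (1,1) entry is the sampling variance of the coefficient on D, so its square root is the conventional standard error */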
35 | local conventional = sqrt(C[1, 1]) 36 | 37 | /* HC0 and HC1 */ 38 | regress Y D, vce(robust) 39 | matrix C = e(V) 40 | local hc0 = sqrt(((`N' - 2) / `N') * C[1, 1]) // Stata doesn't have hc0 41 | local hc1 = sqrt(C[1, 1]) 42 | 43 | /* HC2 */ 44 | regress Y D, vce(hc2) 45 | matrix C = e(V) 46 | local hc2 = sqrt(C[1, 1]) 47 | 48 | /* HC3 */ 49 | regress Y D, vce(hc3) 50 | matrix C = e(V) 51 | local hc3 = sqrt(C[1, 1]) 52 | 53 | /* Return results from program */ 54 | return scalar b1 = `b1' 55 | return scalar conventional = `conventional' 56 | return scalar hc0 = `hc0' 57 | return scalar hc1 = `hc1' 58 | return scalar hc2 = `hc2' 59 | return scalar hc3 = `hc3' 60 | end 61 | 62 | /* Run simulations */ 63 | 64 | /*----------------------*/ 65 | /* Panel A: sigma = 0.5 */ 66 | /*----------------------*/ 67 | simulate b1 = r(b1) /// 68 | conventional = r(conventional) /// 69 | hc0 = r(hc0) /// 70 | hc1 = r(hc1) /// 71 | hc2 = r(hc2) /// 72 | hc3 = r(hc3), reps(`reps'): clusterbias, sigma(0.50) 73 | 74 | gen max_conv_hc0 = max(conventional, hc0) 75 | gen max_conv_hc1 = max(conventional, hc1) 76 | gen max_conv_hc2 = max(conventional, hc2) 77 | gen max_conv_hc3 = max(conventional, hc3) 78 | 79 | /* Mean and standard deviations of simulation results */ 80 | tabstat *, stat(mean sd) column(stat) format(%9.3f) 81 | 82 | /* Rejection rates */ 83 | foreach stderr of varlist conventional hc* max_*_hc* { 84 | gen z_`stderr'_reject = (2 * normal(-abs(b1 / `stderr')) <= 0.05) 85 | gen t_`stderr'_reject = (2 * ttail(30 - 2, abs(b1 / `stderr')) <= 0.05) 86 | } 87 | /* Normal */ 88 | tabstat z_*_reject, stat(mean) column(stat) format(%9.3f) 89 | /* t-distribution */ 90 | tabstat t_*_reject, stat(mean) column(stat) format(%9.3f) 91 | 92 | /*-----------------------*/ 93 | /* Panel B: sigma = 0.85 */ 94 | /*-----------------------*/ 95 | simulate b1 = r(b1) /// 96 | conventional = r(conventional) /// 97 | hc0 = r(hc0) /// 98 | hc1 = r(hc1) /// 99 | hc2 = r(hc2) /// 100 | hc3 = r(hc3), reps(`reps'): clusterbias, sigma(0.85) 101 | 102 | gen max_conv_hc0 = max(conventional, hc0) 103 | gen max_conv_hc1 = max(conventional, hc1) 104 | gen max_conv_hc2 = max(conventional, hc2) 105 | gen max_conv_hc3 = max(conventional, hc3) 106 | 107 | /* Mean and standard deviations of simulation results */ 108 | tabstat *, stat(mean sd) column(stat) 109 | 110 | /* Rejection rates */ 111 | foreach stderr of varlist conventional hc* max_*_hc* { 112 | gen z_`stderr'_reject = (2 * normal(-abs(b1 / `stderr')) <= 0.05) 113 | gen t_`stderr'_reject = (2 * ttail(30 - 2, abs(b1 / `stderr')) <= 0.05) 114 | } 115 | /* Normal */ 116 | tabstat z_*_reject, stat(mean) column(stat) format(%9.3f) 117 | /* t-distribution */ 118 | tabstat t_*_reject, stat(mean) column(stat) format(%9.3f) 119 | 120 | /*--------------------*/ 121 | /* Panel C: sigma = 1 */ 122 | /*--------------------*/ 123 | simulate b1 = r(b1) /// 124 | conventional = r(conventional) /// 125 | hc0 = r(hc0) /// 126 | hc1 = r(hc1) /// 127 | hc2 = r(hc2) /// 128 | hc3 = r(hc3), reps(`reps'): clusterbias 129 | 130 | gen max_conv_hc0 = max(conventional, hc0) 131 | gen max_conv_hc1 = max(conventional, hc1) 132 | gen max_conv_hc2 = max(conventional, hc2) 133 | gen max_conv_hc3 = max(conventional, hc3) 134 | 135 | /* Mean and standard deviations of simulation results */ 136 | tabstat *, stat(mean sd) column(stat) 137 | 138 | /* Rejection rates */ 139 | foreach stderr of varlist conventional hc* max_*_hc* { 140 | gen z_`stderr'_reject = (2 * normal(-abs(b1 / `stderr')) <= 0.05) 141 | gen 
t_`stderr'_reject = (2 * ttail(30 - 2, abs(b1 / `stderr')) <= 0.05) 142 | } 143 | /* Normal */ 144 | tabstat z_*_reject, stat(mean) column(stat) format(%9.3f) 145 | /* t-distribution */ 146 | tabstat t_*_reject, stat(mean) column(stat) format(%9.3f) 147 | 148 | /* End of file */ 149 | exit 150 | -------------------------------------------------------------------------------- /05 Fixed Effects, DD and Panel Data/Table 5-2-3.do: -------------------------------------------------------------------------------- 1 | clear all 2 | set more off 3 | eststo clear 4 | capture version 14 5 | 6 | /* Stata code for Table 5.2.3 */ 7 | 8 | /* Download the data and unzip it */ 9 | 10 | * /* Industry */ 11 | * shell curl -o industry.zip http://sticerd.lse.ac.uk/eopp/_new/data/indian_data/industry.zip 12 | * unzipfile industry.zip, replace 13 | 14 | * /* Socioeconomics */ 15 | * shell curl -o socioeconomics.zip "http://sticerd.lse.ac.uk/eopp/_new/data/indian_data/socioeconomics.zip" 16 | * unzipfile socioeconomics.zip, replace 17 | 18 | * /* Poverty and inequality */ 19 | * shell curl -o Poverty_Inequality.zip "http://sticerd.lse.ac.uk/eopp/_new/data/indian_data/Poverty_Inequality.zip" 20 | * unzipfile Poverty_Inequality.zip, replace 21 | 22 | * /* Public finance */ 23 | * shell curl -o Public_Finance.zip "http://sticerd.lse.ac.uk/eopp/_new/data/indian_data/Public_Finance.zip" 24 | * unzipfile Public_Finance.zip, replace 25 | 26 | * /* Politics */ 27 | * shell curl -o Politics.zip "http://sticerd.lse.ac.uk/eopp/_new/data/indian_data/Politics.zip" 28 | * unzipfile Politics.zip, replace 29 | 30 | /*----------------------*/ 31 | /* Import industry data */ 32 | /*----------------------*/ 33 | use industry.dta 34 | 35 | /* Drop missing data */ 36 | drop if missing(state) | missing(year) 37 | 38 | /* Save as temp file to merge to Socioeconomics.dta */ 39 | tempfile industry 40 | save `industry' 41 | 42 | /*------------------------*/ 43 | /* Poverty and inequality */ 44 | /*------------------------*/ 45 | 46 | /* Import poverty and inequality */ 47 | use poverty_and_inequality.dta 48 | 49 | /* Drop missing data */ 50 | drop if missing(state) | missing(year) 51 | 52 | /* Save as temp file to merge to Socioeconomics.dta */ 53 | 
tempfile poverty_and_inequality 54 | save `poverty_and_inequality' 55 | 56 | /*----------------*/ 57 | /* Socioeconomics */ 58 | /*----------------*/ 59 | 60 | /* Import socioeconomics */ 61 | use Socioeconomic.dta, clear 62 | 63 | /* Drop missing data */ 64 | drop if missing(state) | missing(year) 65 | 66 | /* Save as temp file to merge to Socioeconomics.dta */ 67 | tempfile socioeconomic 68 | save `socioeconomic' 69 | 70 | 71 | 72 | /*----------------*/ 73 | /* Public finance */ 74 | /*----------------*/ 75 | 76 | /* Import public finance */ 77 | use public_finance.dta, clear 78 | 79 | /* Drop missing data */ 80 | drop if missing(state) | missing(year) 81 | 82 | /* Save as temp file to merge to Socioeconomics.dta */ 83 | tempfile public_finance 84 | save `public_finance' 85 | 86 | /*----------*/ 87 | /* Politics */ 88 | /*----------*/ 89 | 90 | /* Import politics */ 91 | use politics.dta, clear 92 | 93 | /* Merge by state-year */ 94 | merge 1:1 state year using "`industry'", gen(_mindustry) 95 | merge 1:1 state year using "`poverty_and_inequality'", gen(_mpi) 96 | merge 1:1 state year using "`socioeconomic'", gen(_socioeconomic) 97 | merge 1:1 state year using "`public_finance'", gen(_public_finance) 98 | 99 | /* Declare the state-year panel */ 100 | xtset state year 101 | 102 | /* Restrict to 1958 to 1992 */ 103 | keep if inrange(year, 1958, 1992) 104 | 105 | /* Generate relevant variables */ 106 | gen log_employm = log(employm * 1000) 107 | gen lnstrict = L.nstrict // Labor regulation (lagged) 108 | gen log_pop = log(pop1 + pop2) // Log population 109 | gen log_devexppc = log(devexp) // Log development expenditure per capita 110 | gen log_regmanpc = log(nsdpmanr) - log_pop // Log registered manufacturing output per capita 111 | gen log_uregmanpc = log(nsdpuman) - log_pop // Log unregistered manufacturing output per capita 112 | gen log_ffcappc = log(ffcap / employm) // Log registered manufacturing fixed capital per capita 113 | gen log_fvaladdpe = log(fvaladd) - log_pop 114 | gen log_instcap = log(instcap) // Log installed electricity capacity per capita 115 | gen mdlloc_wkr = mdlloc / (workers) // Workdays lost to lockouts per worker 116 | gen mdldis_wkr = mdldis / (workers) // Workdays lost to strikes per worker 117 | gen janata = lkdp + jp + jd 118 | gen hard_left = cpi + cpm 119 | gen regional = oth 120 | gen congress = inc + incu + ics 121 | 122 | tabstat nstrict mdldis_wkr mdlloc_wkr log_regmanpc log_uregmanpc /// 123 | log_employm log_ffcappc log_fvaladdpe h2 h1 log_devexppc /// 124 | log_instcap log_pop congress hard_left janata regional, /// 125 | c(s) s(mean sd N) 126 | 127 | /* Column 1 */ 128 | eststo col1: regress log_regmanpc lnstrict i.year i.state, cluster(state) 129 | estadd local state_trends "NO" 130 | 131 | /* Column 2 */ 132 | eststo col2: regress log_regmanpc lnstrict log_devexppc log_instcap log_pop i.year i.state, cluster(state) 133 | estadd local state_trends "NO" 134 | 135 | /* Column 3 */ 136 | eststo col3: regress log_regmanpc lnstrict log_devexppc log_instcap log_pop congress hard_left janata regional i.year i.state, cluster(state) 137 | estadd local state_trends "NO" 138 | 139 | /* Column 4 */ 140 | eststo col4: regress log_regmanpc lnstrict log_devexppc log_instcap log_pop congress hard_left janata regional i.year i.state i.state#c.year, cluster(state) 141 | estadd local state_trends "YES" 142 | 143 | esttab, se /// 144 | nomtitles /// 145 | noobs /// 146 | ar2 /// 147 | scalars("state_trends 
State-specific trends") /// 149 | keep(lnstrict log_devexppc log_instcap log_pop congress hard_left janata regional) 150 | eststo clear 151 | 152 | /* End of script */ 153 | -------------------------------------------------------------------------------- /03 Making Regression Make Sense/03 Making Regression Make Sense.md: -------------------------------------------------------------------------------- 1 | # 03 Making Regression Make Sense 2 | ## 3.4 Regression Details 3 | 4 | ### Figure 3-1-2 5 | Completed in [Stata](Figure%203-1-2.do), [R](Figure%203-1-2.r), [Python](Figure%203-1-2.py) and [Julia](Figure%203-1-2.jl) 6 | 7 | ![Figure 3-1-2 in Julia](https://github.com/vikjam/mostly-harmless-replication/blob/master/03%20Making%20Regression%20Make%20Sense/Figure%203-1-2-Julia.png?raw=true) 8 | 9 | ### Table 3-3-2 10 | Completed in [Stata](Table%203-3-2.do), [R](Table%203-3-2.r), [Python](Table%203-3-2.py) and [Julia](Table%203-3-2.jl) 11 | 12 | _Covariate means in the NSW and observational control samples_ 13 | 14 | | |NSW Treat |NSW Control |Full CPS-1 |Full CPS-3 |P-score CPS-1 |P-score CPS-3 | 15 | |:------------------|:---------|:-----------|:----------|:----------|:-------------|:-------------| 16 | |Age |25.82 |25.05 |33.23 |28.03 |25.63 |25.97 | 17 | |Years of schooling |10.35 |10.09 |12.03 |10.24 |10.49 |10.42 | 18 | |Black |0.84 |0.83 |0.07 |0.2 |0.96 |0.52 | 19 | |Hispanic |0.06 |0.11 |0.07 |0.14 |0.03 |0.2 | 20 | |Dropout |0.71 |0.83 |0.3 |0.6 |0.6 |0.63 | 21 | |Married |0.19 |0.15 |0.71 |0.51 |0.26 |0.29 | 22 | |1974 earnings |2,096 |2,107 |14,017 |5,619 |2,821 |2,969 | 23 | |1975 earnings |1,532 |1,267 |13,651 |2,466 |1,950 |1,859 | 24 | |Number of Obs. |185 |260 |15,992 |429 |352 |157 | 25 | 26 | ### Figure 3-1-3 27 | Completed in [Stata](Figure%203-1-3.do), [R](Figure%203-1-3.r) and [Python](Figure%203-1-3.py) 28 | 29 | ``` 30 | /* Old-fashioned standard errors */ 31 | 32 | Source | SS df MS Number of obs = 329509 33 | -------------+------------------------------ F( 1,329507) =43782.56 34 | Model | 17808.83 1 17808.83 Prob > F = 0.0000 35 | Residual | 134029.045329507 .406756292 R-squared = 0.1173 36 | -------------+------------------------------ Adj R-squared = 0.1173 37 | Total | 151837.875329508 .460801788 Root MSE = .63777 38 | 39 | ------------------------------------------------------------------------------ 40 | lwklywge | Coef. Std. Err. t P>|t| [95% Conf. Interval] 41 | -------------+---------------------------------------------------------------- 42 | educ | .070851 .0003386 209.24 0.000 .0701874 .0715147 43 | _cons | 4.995182 .0044644 1118.88 0.000 4.986432 5.003932 44 | ------------------------------------------------------------------------------ 45 | 46 | /* Robust standard errors */ 47 | 48 | Linear regression Number of obs = 329509 49 | F( 1,329507) =34577.15 50 | Prob > F = 0.0000 51 | R-squared = 0.1173 52 | Root MSE = .63777 53 | 54 | ------------------------------------------------------------------------------ 55 | | Robust 56 | lwklywge | Coef. Std. Err. t P>|t| [95% Conf. 
Interval] 57 | -------------+---------------------------------------------------------------- 58 | educ | .070851 .000381 185.95 0.000 .0701042 .0715978 59 | _cons | 4.995182 .0050739 984.49 0.000 4.985238 5.005127 60 | ------------------------------------------------------------------------------ 61 | 62 | /* Old-fashioned standard errors */ 63 | 64 | Source | SS df MS Number of obs = 21 65 | -------------+------------------------------ F( 1, 19) = 485.23 66 | Model | 1.13497742 1 1.13497742 Prob > F = 0.0000 67 | Residual | .04444186 19 .002339045 R-squared = 0.9623 68 | -------------+------------------------------ Adj R-squared = 0.9603 69 | Total | 1.17941928 20 .058970964 Root MSE = .04836 70 | 71 | ------------------------------------------------------------------------------ 72 | lwklywge | Coef. Std. Err. t P>|t| [95% Conf. Interval] 73 | -------------+---------------------------------------------------------------- 74 | educ | .070851 .0032164 22.03 0.000 .064119 .0775831 75 | _cons | 4.995183 .0424075 117.79 0.000 4.906423 5.083943 76 | ------------------------------------------------------------------------------ 77 | 78 | /* Robust standard errors */ 79 | 80 | Linear regression Number of obs = 21 81 | F( 1, 19) = 231.81 82 | Prob > F = 0.0000 83 | R-squared = 0.9623 84 | Root MSE = .04836 85 | 86 | ------------------------------------------------------------------------------ 87 | | Robust 88 | lwklywge | Coef. Std. Err. t P>|t| [95% Conf. Interval] 89 | -------------+---------------------------------------------------------------- 90 | educ | .070851 .0046535 15.23 0.000 .0611112 .0805908 91 | _cons | 4.995183 .0479533 104.17 0.000 4.894815 5.09555 92 | ------------------------------------------------------------------------------ 93 | ``` 94 | --------------------------------------------------------------------------------