├── README.md ├── figures ├── figures3-4 │ ├── extra-files │ │ ├── figure-3-matrix-eigvals.csv │ │ └── figure-4-matrix-eigvals.csv │ ├── figure3-data.csv │ ├── figure3-simulate.py │ ├── figure4-data.csv │ ├── figure4-simulate.py │ └── figures3-4-plot.R ├── population_estimators.py └── tools.py ├── real-data-experiment ├── README.md ├── anchorRegression.py ├── data │ └── README.md ├── figs │ └── README.md ├── notebooks │ ├── results_tempC.ipynb │ ├── results_tempC_figure12.ipynb │ └── results_tempC_proxies.ipynb ├── results │ └── README.md ├── run_exp_temp.py ├── run_exp_temp_proxies.py └── utils.py └── synthetic-experiments ├── appendix-B-identification.ipynb ├── experiment1 ├── experiment1-plot.R └── experiment1-simulate.py ├── experiment2 ├── experiment2-plot.R └── experiment2-simulate.py ├── experiment3 ├── experiment3-plot.R └── experiment3-simulate.py ├── experiment4 ├── experiment4-plot.R └── experiment4-simulate.py ├── population_estimators.py └── tools.py /README.md: -------------------------------------------------------------------------------- 1 | # Code for replication of "Regularizing Towards Causal Invariance: Linear Models with Proxies" 2 | 3 | * Section 5 (Synthetic Experiments): `synthetic-experiments`, depends on 4 | + Computation: `numpy`, `pandas`, `tqdm` 5 | + Plotting: `R`, with `tidyverse`, `dplyr`, `readr`, `ggplot2`, `tikzDevice` 6 | * Section 6 (Pollution): `real-data-experiment`, depends on 7 | + Computation: `Pandas`, `numpy`, `scipy`, `scikit-learn` 8 | + Notebooks: `jupyter` 9 | + Plotting: `seaborn`, `matplotlib` 10 | -------------------------------------------------------------------------------- /figures/figures3-4/extra-files/figure-3-matrix-eigvals.csv: -------------------------------------------------------------------------------- 1 | 5.199122989862130950e+00;1.758349216789134672e+01;9.891977568801584297e-01;-1.465871678669836420e-01;9.891977568801583187e-01;-1.465871678669836142e-01 2 | 
2.266055106894913962e+00;5.999999999999999112e+00;1.465871678669836420e-01;9.891977568801584297e-01;1.465871678669836142e-01;9.891977568801583187e-01 3 | -------------------------------------------------------------------------------- /figures/figures3-4/extra-files/figure-4-matrix-eigvals.csv: -------------------------------------------------------------------------------- 1 | 3.000000000000000000e+00;3.497041666068850141e+00;1.700000000000000000e+01;9.474786857846720922e-01;3.198189174888669273e-01 2 | -3.000000000000000000e+00;5.929583339311506052e-01;0.000000000000000000e+00;-3.198189174888669273e-01;9.474786857846720922e-01 3 | -------------------------------------------------------------------------------- /figures/figures3-4/figure3-simulate.py: -------------------------------------------------------------------------------- 1 | ### Loading libraries 2 | import numpy as np 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | # Load file tools and population_estimators from parent folder 7 | import os, sys, inspect 8 | currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) 9 | parentdir = os.path.dirname(currentdir) 10 | sys.path.insert(0,parentdir) 11 | from population_estimators import pack_params, gamma_ar, gamma_par, gamma_ols, gamma_cross 12 | from tools import Id, simulate, N, get_mse, inv 13 | 14 | ### Set seed 15 | np.random.seed(1) 16 | 17 | ### Dimensions 18 | d = {"A": 2, "W": 2, 'Z': 2, "Y": 1, "X": 2, "H": 1} 19 | d['O'] = d['X'] + d['Y'] + d['H'] 20 | 21 | ### Parameters 22 | beta = Id(d['A'], d['W']) 23 | pars = {'M': np.random.poisson(lam=2, size=(d['O'], d['A'])), 24 | 'B': np.random.normal(size=(d['O'], d['O']))/3, 25 | 'beta': beta} 26 | np.fill_diagonal(pars['B'], 0) 27 | noise_W = np.array([[0.5, 0], [-0.8, 1.5]]) 28 | lamb = 5 29 | 30 | 31 | ### Save matrix eigendecomposition for plotting in R 32 | M_par = Id(2) + lamb*beta@inv(beta.T@beta + noise_W@noise_W.T)@beta.T 33 | d_par, U_par = 
np.linalg.eig(M_par) 34 | 35 | # Modifications to Figure 3 36 | lamb2 = lamb/np.linalg.eigvals(beta@inv(beta.T@beta + noise_W@noise_W.T)@beta.T).min() 37 | M_par2 = Id(2) + lamb2*beta@inv(beta.T@beta + noise_W@noise_W.T)@beta.T 38 | d_par2, U_par2 = np.linalg.eig(M_par2) 39 | 40 | # Save eigenvalues for plotting ellipses 41 | np.savetxt("figures/figures3-4/extra-files/figure-3-matrix-eigvals.csv", np.concatenate((np.array([d_par, d_par2]).T, U_par, U_par2), axis=1), delimiter=";") 42 | 43 | # 1) Compute population estimators from parameters 44 | c = {"Y": [0], "X": [1, 2]} 45 | params = pack_params(pars, c, d, noise_W) 46 | 47 | gammas = {"ols": gamma_ols(params), 48 | "par5": gamma_par(params, lamb), 49 | "par10": gamma_par(params, lamb2), 50 | "cross": gamma_cross(params, lamb), 51 | "ar": gamma_ar(params, lamb) 52 | } 53 | 54 | # 2) Simulate interventions for scatter plot 55 | results = {k: [] for k in gammas.keys()} 56 | for intervention_strength in tqdm(np.arange(50)/8): 57 | # Interventions 58 | vs = N(int(8*(intervention_strength + 0.1)**1.1), d['A']) 59 | for v in vs: 60 | # Normalize 61 | v *= intervention_strength # /norm(v) 62 | 63 | # Evaluate estimators in intervened dataset 64 | for method, gamma in gammas.items(): 65 | results[method].append([intervention_strength, 66 | get_mse(simulate(n=50000, d=d, pars=pars, v=v, noise_W=noise_W), gamma), 67 | method, 68 | v]) 69 | 70 | # Convert to dataframe 71 | df = pd.concat(pd.DataFrame(results[method], columns=( 72 | "Strength", "MSE", "Method", "A")) for method in gammas.keys()).reset_index(drop=True) 73 | # Add columns with intervened value to plot in A-space 74 | df = df.join(pd.DataFrame(df.A.tolist(), index=df.index, 75 | columns=[f"A{i}" for i in range(d['A'])])) 76 | df.to_csv("figures/figures3-4/figure-3-data.csv") 77 | -------------------------------------------------------------------------------- /figures/figures3-4/figure4-simulate.py: 
-------------------------------------------------------------------------------- 1 | ### Loading libraries 2 | import numpy as np 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | # Load file tools from parent folder 7 | import os, sys, inspect 8 | currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) 9 | parentdir = os.path.dirname(currentdir) 10 | sys.path.insert(0,parentdir) 11 | from tools import Id, simulate, ols, ar, N, get_mse, tar 12 | 13 | 14 | ### Set seed 15 | np.random.seed(1) 16 | 17 | ### Dimensions 18 | d = {"A": 2, "W": 2, 'Z': 2, "Y": 1, "X": 2, "H": 1} 19 | d['O'] = d['X'] + d['Y'] + d['H'] 20 | 21 | ### Parameters 22 | beta = Id(d['A'], d['W']) 23 | pars = {'M': np.random.poisson(lam=2, size=(d['O'], d['A'])), 24 | 'B': np.random.normal(size=(d['O'], d['O']))/3, 25 | 'beta': beta} 26 | np.fill_diagonal(pars['B'], 0) 27 | noise_W = np.array([[0.5, 0], [-0.8, 1.5]]) 28 | 29 | 30 | 31 | # 1) Simulate 32 | n = 100000 33 | data = simulate(n, d, pars, noise_W=noise_W) 34 | A, X, Y, W, Z = data['A'], data['X'], data['Y'], data['W'], data['Z'] 35 | 36 | ### Simulation setups 37 | # rotat = np.diag([np.sqrt(2), 1]) 38 | rotat = np.array([[1.6, 0.8], 39 | [-0.8, .5]]) 40 | shift = np.array([3, -3]) 41 | 42 | 43 | # Set lambda 44 | eta = shift.reshape(-1, 1) 45 | lamb = np.linalg.eigvals(eta@eta.T).max() - 1 46 | 47 | ### Save matrix eigendecomposition for plotting in R 48 | radius, U = np.linalg.eig(rotat@rotat.T) 49 | np.savetxt("figures/figures3-4/extra-files/figure-4-matrix-eigvals.csv", np.concatenate((np.array([shift, radius, np.array([lamb,0])]).T, U), axis=1), delimiter=";") 50 | 51 | 52 | 53 | # 1) Compute population estimators 54 | gamma_tar, alpha_tar = tar(X, Y, A, Sigma = rotat@rotat.T, nu=shift) 55 | gammas = {"ols": ols(X, Y), 56 | "tar": gamma_tar, 57 | "ar": ar(X, Y, A, lamb=lamb), 58 | } 59 | 60 | 61 | 62 | # 2) Simulate interventions for scatter plot 63 | results = {k: [] for k in gammas.keys()} 64 
| for intervention_strength in tqdm(np.arange(80)/8): 65 | # Interventions 66 | vs = N(int(np.round(8*(intervention_strength + 0.1)**1.1)), d['A']) 67 | for v in vs: 68 | # Normalize 69 | v *= intervention_strength # /norm(v) 70 | 71 | # Evaluate estimators in intervened dataset 72 | for method, gamma in gammas.items(): 73 | results[method].append([intervention_strength, 74 | get_mse(simulate(n=1000, d=d, pars=pars, v = v, cov_A = rotat@rotat.T), gamma, (alpha_tar if method=="tar" else 0)), 75 | method, 76 | v]) 77 | 78 | # Convert to dataframe 79 | df = pd.concat(pd.DataFrame(results[method], columns=( 80 | "Strength", "MSE", "Method", "A")) for method in gammas.keys()).reset_index(drop=True) 81 | # Add columns with intervened value to plot in A-space 82 | df = df.join(pd.DataFrame(df.A.tolist(), index=df.index, 83 | columns=[f"A{i}" for i in range(d['A'])])) 84 | df.to_csv("figures/figures3-4/figure-4-data.csv") 85 | -------------------------------------------------------------------------------- /figures/figures3-4/figures3-4-plot.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(grid) 3 | set.seed(1) 4 | 5 | # Load data 6 | df1 = read_csv("figure-3-data.csv") 7 | df2 = read_csv("figure-4-data.csv") %>% subset(Method=="tar") 8 | 9 | # Combine (because OLS data in figure 3 is the same as in figure 4) 10 | df <- rbind(df1, df2) 11 | method.labels <- c('ols' = "OLS", 12 | 'ar' = "AR($A$) = xPAR($W,Z$)", 13 | 'par5' = "${PAR}_{\\lambda_1}(W$)", 14 | 'par10' = "${PAR}_{\\lambda_2}(W$)", 15 | 'tar' = "TAR($A$)" 16 | ) 17 | region.labels <- c("ols"="$C_{OLS}$", 18 | "ar"="$C_A(\\lambda_1) = C_{W,Z}(\\lambda_1)$", 19 | "par5"="$C_W(\\lambda_1)$", 20 | "par10"="$C_W(\\lambda_2)$", 21 | "tar"="Targeted distr.") 22 | 23 | #### Functions for making circles and ellipses 24 | ellipse <- function(center = c(0, 0), a = 1, b = 1, npoints=100, method="ols", rotation=diag(2), lty="solid"){ 25 | xx = seq(-a, a, 
length.out = npoints) 26 | yy = b/a * sqrt(a**2 - xx**2) 27 | out = t(rotation%*%rbind(c(xx, rev(xx)), c(yy, -rev(yy)))) 28 | out = sweep(out, 2, center, '+') 29 | return(data.frame(x=out[,1], y=out[,2], Method=method, lty=lty)) 30 | } 31 | 32 | # The theoretical ellipse is specified by eigvals and eigenvectors of E[AAT] + lamb*Omega_W 33 | matrix.info <- read.table("extra-files/figure-3-matrix-eigvals.csv", sep=";", header=F) 34 | radius.par5 = sqrt(matrix.info[,1]) 35 | radius.par10 = sqrt(matrix.info[,2]) 36 | 37 | U5 = as.matrix(matrix.info[,3:4]) 38 | U10 = as.matrix(matrix.info[,5:6]) 39 | 40 | # Compute guarantee sets 41 | regions <- rbind(ellipse(lty="ols"), # OLS 42 | ellipse(a=sqrt(1+5), b=sqrt(1+5), lty="ar"), #OLS <- AR 43 | ellipse(a=radius.par5[1],b=radius.par5[2], method="par5", rotation=U5, lty="par5"), #PAR5 44 | ellipse(a=sqrt(1+5), b=sqrt(1+5), method="par5", lty="ar"), #PAR5 <- AR 45 | ellipse(a=radius.par10[1],b=radius.par10[2], method="par10", rotation=U10, lty="par10"), #PAR10 46 | ellipse(a=sqrt(1+5), b=sqrt(1+5), method="par10", lty="ar"), #PAR10 <- AR 47 | ellipse(a=sqrt(1+5), b=sqrt(1+5), method="ar", lty="ar")) #AR 48 | regions$Method <- factor(regions$Method, levels=names(method.labels)) 49 | regions$lty <- factor(regions$lty, levels=c("ols", "par5", "par10", "ar")) 50 | 51 | regions$Primary <- (as.character(regions$lty) == as.character(regions$Method)) 52 | 53 | # The theoretical ellipse is specified by eigvals and eigenvectors of E[AAT] + lamb*Omega_W 54 | matrix.info.target <- read.table("extra-files/figure-4-matrix-eigvals.csv", sep=";", header=F) 55 | shift = as.vector(matrix.info.target[,1]) 56 | radius = as.vector(matrix.info.target[,2]) 57 | lamb = as.vector(matrix.info.target[,3])[1] 58 | rotat = as.matrix(matrix.info.target[,4:5]) 59 | 60 | targ <- ellipse(center=shift, a=sqrt(radius[1]), b=sqrt(radius[2]), rotation=rotat, method="tar", lty="tar") %>% select(-Method) 61 | targ$lty <- factor(targ$lty, 
levels=names(region.labels)) 62 | targ$Method <- factor(targ$Method, levels=names(region.labels)) 63 | 64 | # Cut points to be inside interval (to cut away whitespace in tikzDevice) 65 | lims = 5 66 | df_ = df %>% 67 | subset(Method %in% c("ols", "par5", "par10", "ar")) %>% 68 | subset((-lims < A0) & (A0 < lims) & (-lims < A1) & (A1 < lims)) 69 | 70 | midpoint <- mean(log10(subset(df_, Method=="ols")$MSE))*1.2 71 | 72 | # Order factors for plotting order 73 | df_$Method <- factor(df_$Method, levels=names(method.labels)) 74 | 75 | # Plot 76 | p <- ggplot(df_) + 77 | geom_point(aes(x=A0, y=A1, color=MSE), alpha=1, size=2) + 78 | geom_path(data=regions, aes(x,y, lty = lty, alpha=Primary), size=0.6, color="#000066", show.legend = T) + 79 | labs(x = "$\\nu_1$", y="$\\nu_2$")+ 80 | theme_bw(base_size = 9) + 81 | scale_color_gradient2( 82 | low="#1c0f00", mid="#fcf78f", high="#D1654C", 83 | trans = "log10", 84 | midpoint = midpoint, 85 | limits = c(1, 10), oob=scales::squish, 86 | ) + 87 | scale_linetype_manual(values=c("solid", "11", "22", "33"), 88 | breaks = c("ols", "ar", "par5", "par10"), 89 | labels=as_labeller(region.labels) 90 | )+ 91 | scale_x_continuous(breaks = c(-5, 0, 5)) + scale_y_continuous(breaks=c(-5, 0, 5)) + 92 | scale_alpha_manual(values=c(0.4, 1), name=NULL, breaks=NULL) + 93 | guides(lty=guide_legend(title=NULL), 94 | alpha=guide_legend(title=NULL), 95 | color=guide_colorbar(order=1, title="MSPE", 96 | barheight=unit(1.5, "cm") 97 | )) + 98 | coord_fixed(ratio = 1, xlim=c(-lims, lims), ylim = c(-lims, lims)) + 99 | theme(panel.grid.minor = element_blank(), 100 | plot.title = element_blank(), 101 | legend.spacing.x = unit(0.05, 'cm'), 102 | legend.text = element_text(margin = margin(l = 2, unit = "pt")), 103 | legend.margin=margin(-10,0,0,0), 104 | plot.margin = margin(0, 0, 0, -5), 105 | legend.title.align = 0) + 106 | facet_wrap(~Method, ncol=5, labeller=as_labeller(method.labels)) 107 | print(p) 108 | 109 | 110 | ### Target AR plot 111 | # Cut 
points to be inside interval 112 | lims = 5 113 | df_ = df %>% 114 | subset(Method %in% c("tar", "ols")) %>% 115 | subset((-lims < A0) & (A0 < lims) & (-lims < A1) & (A1 < lims)) 116 | 117 | # Order factors for plotting order 118 | df_$Method <- factor(df_$Method, levels=names(method.labels)) 119 | 120 | p <- ggplot(df_) + 121 | geom_point(aes(x=A0, y=A1, color=MSE), alpha=1, size=2) + 122 | geom_path(data=subset(regions, Method=="ols" & lty=="ols"), aes(x,y, lty = lty), size=0.6, color="#000066", alpha = 1, show.legend = T) + 123 | geom_path(data=targ, aes(x,y, lty=lty), size=0.6, color="#000066", alpha = 1, show.legend = T) + 124 | geom_point(aes(x=0, y=0), shape=4, size=0.8, color="#000066", show.legend=F) + 125 | labs(x = "$\\nu_1$", y="$\\nu_2$")+ 126 | theme_bw(base_size = 9) + 127 | scale_color_gradient2( 128 | low="#1c0f00", mid="#fcf78f", high="#D1654C", 129 | trans = "log10", 130 | midpoint = midpoint, 131 | limits = c(1, 10), oob=scales::squish, 132 | ) + 133 | scale_linetype_manual(values=c("solid", "11", "22", "33", "11"), 134 | breaks = c("ols", "ar", "par5", "par10", "tar"), 135 | labels=as_labeller(region.labels) 136 | )+ 137 | scale_x_continuous(breaks = c(-5, 0, 5)) + scale_y_continuous(breaks=c(-5, 0, 5)) + 138 | guides(lty=guide_legend(title=NULL), 139 | color=guide_colorbar(order=1, title="MSPE", 140 | barheight=unit(1.5, "cm") 141 | )) + 142 | coord_fixed(ratio = 1, xlim=c(-lims, lims), ylim = c(-lims, lims)) + 143 | theme(panel.grid.minor = element_blank(), 144 | plot.title = element_blank(), 145 | legend.spacing.x = unit(0.05, 'cm'), 146 | legend.text = element_text(margin = margin(l = 2, unit = "pt")), 147 | legend.margin=margin(-10,0,0,0), 148 | plot.margin = margin(0, 0, 0, -5), 149 | legend.title.align = 0) + 150 | facet_wrap(~Method, ncol=5, labeller=as_labeller(method.labels)) 151 | print(p) -------------------------------------------------------------------------------- /figures/population_estimators.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | inv = np.linalg.inv; Id = lambda d, d2=None: np.eye(d, d2) 3 | """ 4 | For some experiments we need the population versions of estimators instead of sample estimators. 5 | This file has functions to compute such estimators, based on moments that are derived from parameter matrices. 6 | """ 7 | 8 | def pack_params(pars, c, d, noise_W): 9 | """ 10 | The population estimators are computed based on moments. 11 | We compute the moments based on the parameters M and B and the noise distributions. 12 | 13 | In the naming below, the variable XW corresponds to E[XW^T] etc. 14 | We let O denote the stacked outcome (Y, X, H) 15 | """ 16 | # Unpack inputs. c is a vector containing indices (e.g. O_1 is Y, O_2 is X_1,...) 17 | cY = c['Y']; cX = c['X'] # input 18 | beta, M, B = pars['beta'], pars['M'], pars['B'] 19 | 20 | # Store the inverse of the matrix (Id - B) 21 | IB = inv(Id(d['O']) - B) 22 | 23 | # Compute moments relating to the outcome O 24 | OA = IB@M 25 | OW = IB@M@beta 26 | OO = IB@(M@M.T + Id(d['O']))@IB.T 27 | 28 | # Compute moment E[WW^T] 29 | if len(np.shape(noise_W)) == 0: 30 | WW = beta.T@beta + noise_W**2*Id(d['W']) 31 | else: 32 | WW = beta.T@beta + noise_W@noise_W.T 33 | 34 | # Compute covariance of A and cross proxies 35 | AA = Id(d['A']) 36 | ZW = beta.T@beta 37 | #Covariances relating to X 38 | XX = OO[cX][:,cX]; XY = OO[cX][:,cY]; XW = OW[cX]; XA = OA[cX]; XZ = XW 39 | # Covariances relating to Y 40 | YW = OW[cY]; YA = OA[cY]; YZ = YW 41 | 42 | # Return dict with all moments 43 | return {"IB": IB, "OA": OA, "OW":OW, "OO":OO, "WW":WW, "AA":AA, "ZW":ZW,"XX":XX, 44 | "XY":XY, "XW":XW, "XA":XA, "XZ":XZ,"YW":YW, "YA":YA, "YZ":YZ, "M":M} 45 | 46 | # OLS 47 | def gamma_ols(params): 48 | # Unpack moments 49 | XX, XY = params['XX'], params['XY'] 50 | # Return estimator based on moments 51 | return inv(XX)@XY 52 | 53 | # Proxy anchor regression 54 | def 
gamma_par(params, lamb): 55 | # Unpack moments 56 | XX, XW, WW, XY, YW = params['XX'], params['XW'], params['WW'], params['XY'], params['YW'] 57 | # Return estimator based on moments 58 | return inv(XX + lamb*XW@inv(WW)@XW.T)@(XY + lamb*XW@inv(WW)@YW.T) 59 | 60 | # Anchor regression 61 | def gamma_ar(params, lamb): 62 | # Unpack moments 63 | XX, AA, XY, XA, YA = params['XX'], params['AA'], params['XY'], params['XA'], params['YA'] 64 | # Return estimator based on moments 65 | return inv(XX + lamb*XA@inv(AA)@XA.T)@(XY + lamb*XA@inv(AA)@YA.T) 66 | 67 | def gamma_cross(params, lamb): 68 | # Unpack moments 69 | XX, XW, ZW, XZ, XY, YW, YZ = params['XX'], params['XW'], params['ZW'], params['XZ'], params['XY'], params['YW'], params['YZ'] 70 | # Compute "denominator" (left-side inverse) 71 | denom = 2*XX + lamb*(XW@inv(ZW)@XZ.T + XZ@inv(ZW).T@XW.T) 72 | # Compute "numerator" 73 | num = 2*XY + lamb*(XW@inv(ZW)@YZ.T + XZ@inv(ZW).T@YW.T) 74 | return inv(denom)@num 75 | 76 | def get_mse_v(gamma, v, params, c): 77 | """Compute the population mse of using an estimator gamma""" 78 | # Unpack 79 | M, IB = params['M'], params['IB'] 80 | cY, cX = c['Y'], c['X'] 81 | # Compute w_gamma 82 | w_gamma = (IB[cY,] - gamma.T@IB[cX,]).T 83 | # Output population MSE 84 | return (w_gamma.T@M@v@v.T@M.T@w_gamma + w_gamma.T@w_gamma)[0,0] 85 | -------------------------------------------------------------------------------- /figures/tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | """ 4 | This file contains implementations of estimators and a function to simulate data, used for experiments in the paper 5 | """ 6 | 7 | ### Define convenience functions 8 | inv = np.linalg.inv; norm = np.linalg.norm; Id = lambda d, d2=None: np.eye(d, d2) 9 | # Column and row bind 10 | def cb(*args): return np.concatenate(args, axis=1) 11 | def rb(*args): return np.concatenate(args, axis=0) 12 | 13 | # Multivariate gaussian 14 | N = lambda d, n=1: 
np.random.normal(size=(d, n)) 15 | 16 | ### Simulate function 17 | def simulate(n, d, pars, v=None, shift=0, cov_A=None, noise_W=None, noise_Z=None): 18 | """ Simulate data from parameters. Dimensions d_A x n, etc. """ 19 | # Unpack 20 | d_A = d['A']; d_W = d['W']; d_X = d['X']; d_Y = d['Y']; d_O = d['O']; d_Z = d['Z'] 21 | M = pars['M']; B = pars['B']; beta = pars['beta'] 22 | 23 | # If no noise is provided, use spherical unit variance as proxy noise 24 | if noise_W is None: noise_W = Id(d_W) 25 | elif len(np.shape(noise_W)) == 0: noise_W = noise_W*Id(d_W) 26 | # If no noise for secondary proxy is supplied, use the same as W 27 | if noise_Z is None: noise_Z = noise_W 28 | elif len(np.shape(noise_Z)) == 0: noise_Z = noise_Z*Id(d_Z) 29 | 30 | # If no parameter beta_z is provided, use same as beta_W 31 | if "beta_z" in pars.keys(): beta_z = pars['beta_z'] 32 | else: beta_z = beta 33 | 34 | # If covariance matrix for A is given, use this, else use spherical noise 35 | # Since changed covariance matrices are only used for targeted, assumes also a v is given 36 | if cov_A is not None: 37 | A = np.random.multivariate_normal(v, cov=cov_A, size=n).T 38 | else: 39 | # Use either the intervention v tiled several times (fixed A), or a mean-zero gaussian 40 | A = (N(d_A, n) if v is None else np.tile(np.reshape(v, (d_A, 1)), n)) + shift 41 | # Compute the outcome O = (Y, X, H) 42 | O = inv(Id(d['O'])-B)@(M@A + N(d_O, n)) 43 | Y, X, H = np.split(O, [d_Y, d_Y+d_X]) 44 | #Simulate proxies 45 | W = beta.T@A + noise_W@N(d_W, n) 46 | Z = beta_z.T@A + noise_Z@N(d_Z, n) 47 | return {'A': A, 'W': W, 'Y': Y, 'X': X, 'H': H, 'Z': Z} 48 | 49 | # Mean function 50 | def E(X): 51 | return X.mean(axis=1).reshape(-1, 1) 52 | 53 | ### Estimators 54 | # Ordinary least squares 55 | def ols(X, Y, intercept=False): 56 | if intercept: 57 | X = np.concatenate((np.ones((1, X.shape[1])), X)) 58 | return inv(X@X.T)@X@Y.T 59 | 60 | # Anchor regression estimator 61 | def ar(X, Y, A, lamb=1, 
intercept=False): 62 | if intercept: 63 | X = np.concatenate((np.ones((1, X.shape[1])), X)) 64 | return inv(X@X.T + lamb*X@A.T@inv(A@A.T)@A@X.T)@(X@Y.T + lamb*X@A.T@inv(A@A.T)@A@Y.T) 65 | 66 | # Cross estimator 67 | def cross(X, Y, W, Z, lamb=1): 68 | ZW = inv(Z@W.T) 69 | denom = 2*X@X.T + lamb*(X@W.T@ZW@Z@X.T + X@Z.T@ZW.T@W@X.T) 70 | num = 2*X@Y.T + lamb*(X@W.T@ZW@Z@Y.T + X@Z.T@ZW.T@W@Y.T) 71 | return inv(denom)@num 72 | 73 | # Targeted anchor regression, targeted to covariance Sigma and mean shift nu 74 | def tar(X, Y, A, Sigma, nu=0): 75 | # Get dimensions 76 | d_A, n = A.shape 77 | if len(np.shape(nu)) == 0: 78 | nu = np.tile(nu, d_A).reshape(d_A, 1) 79 | 80 | # Compute alpha and gamma 81 | gamma = inv(X@X.T/n + X@A.T@inv(A@A.T)@(Sigma - A@A.T/n)@inv(A@A.T)@A@X.T)@(X@Y.T/n + X@A.T@inv(A@A.T)@(Sigma - A@A.T/n)@inv(A@A.T)@A@Y.T) 82 | alpha = (Y - gamma.T@X)@A.T@inv(A@A.T)@nu 83 | return gamma, alpha 84 | 85 | # IV estimator 86 | def iv(X, Y, A): 87 | return inv(X@A.T@inv(A@A.T)@A@X.T)@X@A.T@inv(A@A.T)@A@Y.T 88 | 89 | # Function to evaluate the prediction MSE of a dataset and some gamma 90 | def get_mse(data, gamma, alpha = 0): 91 | return ((data['Y'] - gamma.T@data['X'] - alpha)**2).mean() 92 | -------------------------------------------------------------------------------- /real-data-experiment/README.md: -------------------------------------------------------------------------------- 1 | # Executing real-data experiment 2 | 3 | Dependencies: 4 | * pandas, numpy, scipy, scikit-learn 5 | * seaborn, matplotlib 6 | 7 | 1. Execute scripts in this order 8 | * `run_exp_temp.py` 9 | * `run_exp_temp_proxies.py` 10 | 11 | 2. 
Run the notebooks 12 | * `results_tempC.ipynb` 13 | * `results_tempC_proxies.ipynb` 14 | * `results_tempC_figure12.ipynb` 15 | -------------------------------------------------------------------------------- /real-data-experiment/anchorRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.base import RegressorMixin, BaseEstimator 4 | from sklearn.linear_model._base import LinearModel 5 | from sklearn.utils.validation import check_X_y, check_array, check_is_fitted 6 | 7 | # Convenience functions 8 | inv = np.linalg.inv 9 | 10 | class AnchorRegression(LinearModel): 11 | def __init__(self, lamb=1, fit_intercept=False, normalize=False, copy_X=False): 12 | self.lamb = lamb 13 | self.fit_intercept=fit_intercept 14 | self.normalize=normalize 15 | self.copy_X = copy_X 16 | 17 | def fit(self, X, y, A=None): 18 | X, y = self._validate_data(X, y, y_numeric=True) 19 | 20 | X, y, X_offset, y_offset, X_scale = self._preprocess_data( 21 | X, y, fit_intercept=self.fit_intercept, normalize=self.normalize, 22 | copy=self.copy_X, sample_weight=None, 23 | return_mean=True) 24 | 25 | if type(A) is not np.ndarray: 26 | A = A.values 27 | 28 | # Center A 29 | A = A - A.mean(axis=0) 30 | 31 | self.coef_ = \ 32 | inv(X.T@X + self.lamb*X.T@A@inv(A.T@A)@A.T@X)@( 33 | X.T@y + self.lamb*X.T@A@inv(A.T@A)@A.T@y) 34 | 35 | self._set_intercept(X_offset, y_offset, X_scale) 36 | 37 | self.is_fitted_ = True 38 | return self 39 | 40 | class CrossProxyAnchorRegression(LinearModel): 41 | def __init__(self, lamb=1, fit_intercept=False, normalize=False, copy_X=False): 42 | self.lamb = lamb 43 | self.fit_intercept=fit_intercept 44 | self.normalize=normalize 45 | self.copy_X = copy_X 46 | 47 | def fit(self, X, y, W, Z): 48 | X, y = self._validate_data(X, y, y_numeric=True) 49 | 50 | X, y, X_offset, y_offset, X_scale = self._preprocess_data( 51 | X, y, fit_intercept=self.fit_intercept, normalize=self.normalize, 52 | 
copy=self.copy_X, sample_weight=None, 53 | return_mean=True) 54 | 55 | if type(W) is not np.ndarray: 56 | W = W.values 57 | if type(Z) is not np.ndarray: 58 | Z = Z.values 59 | 60 | # Center W 61 | W = W - W.mean(axis=0) 62 | Z = Z - Z.mean(axis=0) 63 | 64 | # Transpose to align with formatting of synth experiments 65 | W = W.T; Z = Z.T; X = X.T; Y = y.T 66 | 67 | ZW = inv(Z@W.T) 68 | denom = 2*X@X.T + self.lamb*(X@W.T@ZW@Z@X.T + X@Z.T@ZW.T@W@X.T) 69 | num = 2*X@Y.T + self.lamb*(X@W.T@ZW@Z@Y.T + X@Z.T@ZW.T@W@Y.T) 70 | self.coef_ = inv(denom)@num 71 | 72 | self._set_intercept(X_offset, y_offset, X_scale) 73 | self.is_fitted_ = True 74 | return self 75 | 76 | class TargetedAnchorRegression(LinearModel): 77 | def __init__(self, fit_intercept=False, normalize=False, copy_X=False): 78 | self.fit_intercept=fit_intercept 79 | self.normalize=normalize 80 | self.copy_X = copy_X 81 | 82 | def fit(self, X, y, A=None, nu=None): 83 | ''' 84 | Targeted shift where nu is the shifted A 85 | ''' 86 | X, y = self._validate_data(X, y, y_numeric=True) 87 | 88 | X, y, X_offset, y_offset, X_scale = self._preprocess_data( 89 | X, y, fit_intercept=self.fit_intercept, normalize=self.normalize, 90 | copy=self.copy_X, sample_weight=None, 91 | return_mean=True) 92 | 93 | if type(A) is not np.ndarray: 94 | A = A.values 95 | 96 | # Center columns of A and nu, with respect to A 97 | n, d_A = A.shape 98 | mean_A = A.mean(axis=0) 99 | A = A - mean_A 100 | nu = nu - mean_A 101 | 102 | Sig_A = np.cov(A.T, bias=True) 103 | Sig_nu = np.cov(nu.T, bias=True) 104 | mean_nu = np.mean(nu, axis=0).T 105 | 106 | if len(np.shape(Sig_A)) == 0: 107 | Sig_A = np.tile(Sig_A, d_A).reshape(d_A, 1) 108 | Sig_nu = np.tile(Sig_nu, d_A).reshape(d_A, 1) 109 | 110 | # Transpose to align with formatting of synth experiments 111 | A = A.T; X = X.T; Y = y.T 112 | 113 | Omega = inv(A@A.T)@(Sig_nu - Sig_A)@inv(A@A.T) 114 | 115 | gamma = inv(X@X.T/n + X@A.T @ Omega @ A@X.T)@( 116 | X@Y.T/n + X@A.T @ Omega @ A@Y.T) 117 | alpha 
= (Y - gamma.T@X)@A.T@inv(A@A.T)@mean_nu 118 | 119 | self.coef_ = gamma 120 | self.intercept_ = y_offset + alpha 121 | 122 | self.is_fitted_ = True 123 | return self 124 | 125 | class CrossTargetedAnchorRegression(LinearModel): 126 | def __init__(self, fit_intercept=False, normalize=False, copy_X=False): 127 | self.fit_intercept=fit_intercept 128 | self.normalize=normalize 129 | self.copy_X = copy_X 130 | 131 | def fit(self, X, y, W=None, nu=None, Z=None): 132 | ''' 133 | Targeted shift where nu is the shifted W 134 | ''' 135 | X, y = self._validate_data(X, y, y_numeric=True) 136 | 137 | X, y, X_offset, y_offset, X_scale = self._preprocess_data( 138 | X, y, fit_intercept=self.fit_intercept, normalize=self.normalize, 139 | copy=self.copy_X, sample_weight=None, 140 | return_mean=True) 141 | 142 | if type(W) is not np.ndarray: 143 | W = W.values 144 | 145 | # Center columns of W and nu, with respect to W 146 | n, d_W = W.shape 147 | mean_W = W.mean(axis=0) 148 | W = W - mean_W 149 | nu = nu - mean_W 150 | 151 | Sig_W = np.cov(W.T, bias=True) 152 | Sig_nu = np.cov(nu.T, bias=True) 153 | mean_nu = np.mean(nu, axis=0).T 154 | 155 | if len(np.shape(Sig_W)) == 0: 156 | Sig_W = np.tile(Sig_W, d_W).reshape(d_W, 1) 157 | Sig_nu = np.tile(Sig_nu, d_W).reshape(d_W, 1) 158 | 159 | # Transpose to align with formatting of synth experiments 160 | Z = Z.T; W = W.T; X = X.T; Y = y.T 161 | 162 | Omega = inv(W@Z.T)@(Sig_nu - Sig_W)@inv(Z@W.T) 163 | 164 | gamma = inv(X@X.T/n + X@W.T @ Omega @ W@X.T)@( 165 | X@Y.T/n + X@W.T @ Omega @ W@Y.T) 166 | alpha = (Y - gamma.T@X)@Z.T@inv(W@Z.T)@mean_nu 167 | 168 | self.coef_ = gamma 169 | self.intercept_ = y_offset + alpha 170 | 171 | self.is_fitted_ = True 172 | return self 173 | 174 | 175 | class MeanPredictor(BaseEstimator): 176 | def __init__(self): 177 | pass 178 | 179 | def fit(self, X, y): 180 | X, y = check_X_y(X, y, accept_sparse=True) 181 | self.mean_ = y.mean() 182 | 183 | self.is_fitted_ = True 184 | # `fit` should always return `self` 
185 | return self 186 | 187 | def predict(self, X): 188 | X = check_array(X, accept_sparse=True) 189 | check_is_fitted(self, 'is_fitted_') 190 | return np.ones(X.shape[0], dtype=np.int64) * self.mean_ 191 | -------------------------------------------------------------------------------- /real-data-experiment/data/README.md: -------------------------------------------------------------------------------- 1 | # Data Folder 2 | 3 | Download data from https://archive.ics.uci.edu/ml/datasets/PM2.5+Data+of+Five+Chinese+Cities 4 | 5 | Place into this folder. Relevant files are: 6 | * 'BeijingPM20100101_20151231.csv' 7 | * 'GuangzhouPM20100101_20151231.csv' 8 | * 'ShenyangPM20100101_20151231.csv' 9 | * 'ChengduPM20100101_20151231.csv' 10 | * 'ShanghaiPM20100101_20151231.csv' 11 | -------------------------------------------------------------------------------- /real-data-experiment/figs/README.md: -------------------------------------------------------------------------------- 1 | # Figures 2 | 3 | Once the relevant notebooks are run, this will contain two figures 4 | * (Figure 11): best_case_performance_TempC.pdf 5 | * (Figure 12): coefficient_comparison.pdf 6 | 7 | -------------------------------------------------------------------------------- /real-data-experiment/notebooks/results_tempC.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tables 1, 2, and Figure 11 (Temperature as proxy)\n", 8 | "\n", 9 | "Builds the portions of Tables 1, 2 that do not include W, Z\n", 10 | "\n", 11 | "Requires that `run_exp_temp.py` is run" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "RPATH = '../results'\n", 21 | "FPATH = '../figs'\n", 22 | "\n", 23 | "import numpy as np\n", 24 | "import pandas as pd\n", 25 | "import seaborn as sns\n", 26 | "import 
matplotlib.pyplot as plt\n", 27 | "import pickle as pkl" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "results = pd.read_csv(f\"{RPATH}/all_res_test_TempC.csv\")\n", 37 | "results = results.drop('Unnamed: 0', axis=1)\n", 38 | "\n", 39 | "residuals = results.drop(\"Lambda\", axis=1).query('Environment == \"Test\"')\n", 40 | "lambs = results.drop(\"Residual\", axis=1).query('Environment == \"Test\"')\n", 41 | "\n", 42 | "mse = lambda v: np.mean(v**2)\n", 43 | "\n", 44 | "# Get RMSE by season, city\n", 45 | "pt = pd.pivot_table(residuals, \n", 46 | " index=['Test_Season', 'City'], \n", 47 | " columns = 'Estimator', \n", 48 | " aggfunc={'Residual': mse})\n", 49 | "\n", 50 | "pt.columns = pt.columns.droplevel(0)\n", 51 | "pt = pt.rename(columns = {\n", 52 | " 'OLS (All)': 'OLS (TempC)', \n", 53 | " 'PA (TempC)': 'OLS + Est. Bias',\n", 54 | " 'TAR (TempC)': 'PTAR (TempC)', \n", 55 | " 'AR (TempC)': 'PAR (TempC)'\n", 56 | "})\n", 57 | "newcols = [\n", 58 | " 'OLS',\n", 59 | " 'OLS (TempC)',\n", 60 | " 'OLS + Est. 
Bias',\n", 61 | " 'PAR (TempC)',\n", 62 | " 'PTAR (TempC)'\n", 63 | "]\n", 64 | "pt = pt[newcols].reset_index()" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "# Table 1" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 3, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "wins = lambda v: int(np.sum(v < 0))\n", 81 | "loss = lambda v: int(np.sum(v > 0))\n", 82 | "\n", 83 | "lambs_ar = lambs.query(f'Environment == \"Test\" & Estimator == \"AR (TempC)\"').groupby(\n", 84 | " ['City', 'Test_Season']).mean()[['Lambda']]\n", 85 | "\n", 86 | "lambs_ar = lambs_ar.reset_index().set_index(['City', 'Test_Season'])\n", 87 | "\n", 88 | "pt_diff = pt.copy()\n", 89 | "for est in newcols:\n", 90 | " pt_diff[est] = pt[est] - pt['OLS']\n", 91 | "\n", 92 | "pt_pos_lamb = pt.set_index(['City', 'Test_Season']).merge(lambs_ar, left_index=True, right_index=True)\n", 93 | "pt_pos_lamb = pt_pos_lamb.query(\"Lambda > 0\").drop(\"Lambda\", axis=1).reset_index()\n", 94 | "\n", 95 | "pt_diff_pos_lamb = pt_pos_lamb.copy()\n", 96 | "for est in newcols:\n", 97 | " pt_diff_pos_lamb[est] = pt_pos_lamb[est] - pt_pos_lamb['OLS']" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 4, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "\\begin{tabular}{lrrrr}\n", 110 | "\\toprule\n", 111 | "{} & Mean & Win & min & max \\\\\n", 112 | "Estimator & & & & \\\\\n", 113 | "\\midrule\n", 114 | "OLS & 0.537 & 0 & 0.000 & 0.000 \\\\\n", 115 | "OLS (TempC) & 0.536 & 5 & -0.028 & 0.026 \\\\\n", 116 | "OLS + Est. 
Bias & 0.569 & 4 & -0.072 & 0.150 \\\\\n", 117 | "PAR (TempC) & 0.531 & 6 & -0.041 & 0.006 \\\\\n", 118 | "PTAR (TempC) & 0.525 & 8 & -0.061 & 0.001 \\\\\n", 119 | "\\bottomrule\n", 120 | "\\end{tabular}\n", 121 | "\n" 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | "lt = pd.melt(pt_pos_lamb, id_vars=['Test_Season', 'City'], value_name = 'MSE', var_name = 'Estimator')\n", 127 | "\n", 128 | "mean_result = lt.groupby('Estimator', as_index=True).agg(\n", 129 | " **{'Mean': pd.NamedAgg(column='MSE', aggfunc=np.mean)}\n", 130 | ").reindex(newcols)\n", 131 | "\n", 132 | "pt_diff_long = pd.melt(pt_diff_pos_lamb, id_vars=['Test_Season', 'City'], value_name = 'MSE', var_name='Estimator')\n", 133 | "\n", 134 | "diff_ols = pt_diff_long.groupby('Estimator', as_index=True).agg(\n", 135 | " **{'min': pd.NamedAgg(column='MSE', aggfunc=np.min), \n", 136 | " 'max': pd.NamedAgg(column='MSE', aggfunc=np.max)}\n", 137 | ").reindex(newcols)\n", 138 | "\n", 139 | "win_loss_ols = pt_diff_long.groupby('Estimator', as_index=True).agg(\n", 140 | " **{'Win': pd.NamedAgg(column='MSE', aggfunc=wins)}\n", 141 | ").reindex(newcols)\n", 142 | "\n", 143 | "print(pd.concat([mean_result, win_loss_ols.astype(int), diff_ols], axis=1).to_latex(float_format=\"{:.3f}\".format))" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "## Table 2\n", 151 | "\n", 152 | "This does not include W, Z, see `results_tempC_proxies.ipynb` for those" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 5, 158 | "metadata": { 159 | "scrolled": false 160 | }, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "\\begin{tabular}{lrrrr}\n", 167 | "\\toprule\n", 168 | "{} & Mean & Diff & min & max \\\\\n", 169 | "Estimator & & & & \\\\\n", 170 | "\\midrule\n", 171 | "OLS & 0.457 & 0.000 & 0.000 & 0.000 \\\\\n", 172 | "OLS (TempC) & 0.455 & -0.002 & -0.028 & 0.026 \\\\\n", 173 | "OLS + Est. 
Bias & 0.474 & 0.018 & -0.072 & 0.150 \\\\\n", 174 | "PAR (TempC) & 0.454 & -0.003 & -0.041 & 0.006 \\\\\n", 175 | "PTAR (TempC) & 0.450 & -0.007 & -0.061 & 0.002 \\\\\n", 176 | "\\bottomrule\n", 177 | "\\end{tabular}\n", 178 | "\n" 179 | ] 180 | } 181 | ], 182 | "source": [ 183 | "pt_diff = pt.copy()\n", 184 | "for est in newcols:\n", 185 | " pt_diff[est] = pt[est] - pt['OLS']\n", 186 | "\n", 187 | "lt = pd.melt(pt, id_vars=['Test_Season', 'City'], value_name = 'MSE')\n", 188 | "\n", 189 | "mean_result = lt.groupby('Estimator', as_index=True).agg(\n", 190 | " **{'Mean': pd.NamedAgg(column='MSE', aggfunc=np.mean)}\n", 191 | ").reindex(newcols)\n", 192 | "\n", 193 | "pt_diff_long = pd.melt(pt_diff, id_vars=['Test_Season', 'City'], value_name = 'MSE')\n", 194 | "\n", 195 | "diff_ols = pt_diff_long.groupby('Estimator', as_index=True).agg(\n", 196 | " **{'Diff': pd.NamedAgg(column='MSE', aggfunc=np.mean),\n", 197 | " 'min': pd.NamedAgg(column='MSE', aggfunc=np.min), \n", 198 | " 'max': pd.NamedAgg(column='MSE', aggfunc=np.max)}\n", 199 | ").reindex(newcols)\n", 200 | "\n", 201 | "print(pd.concat([mean_result, diff_ols], axis=1).to_latex(float_format=\"{:.3f}\".format))" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 6, 207 | "metadata": { 208 | "scrolled": true 209 | }, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/plain": [ 214 | "'City == 0 & Test_Season == 2'" 215 | ] 216 | }, 217 | "execution_count": 6, 218 | "metadata": {}, 219 | "output_type": "execute_result" 220 | } 221 | ], 222 | "source": [ 223 | "best_city, best_season = pt_diff.sort_values(\n", 224 | " f\"PAR (TempC)\", ascending=True).head(1)[['City', 'Test_Season']].values[0]\n", 225 | "\n", 226 | "best_case_query = f\"City == {best_city} & Test_Season == {best_season}\"\n", 227 | "\n", 228 | "best_case_query" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "# Figure 11" 236 | ] 237 | }, 238 | { 239 | "cell_type": 
"code", 240 | "execution_count": 7, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "rmse_df = pd.read_csv(f\"{RPATH}/all_rmse_test_TempC.csv\")\n", 245 | "rmse_df = rmse_df.drop('Unnamed: 0', axis=1)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 8, 251 | "metadata": {}, 252 | "outputs": [ 253 | { 254 | "data": { 255 | "text/plain": [ 256 | "['AR (TempC)', 'Mean', 'OLS', 'OLS (All)', 'PA (TempC)', 'TAR (TempC)']" 257 | ] 258 | }, 259 | "execution_count": 8, 260 | "metadata": {}, 261 | "output_type": "execute_result" 262 | } 263 | ], 264 | "source": [ 265 | "plot_est = list(rmse_df.groupby('Estimator').mean().index.values)\n", 266 | "plot_est" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 9, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "plot_est = [f for f in plot_est if 'Mean' not in f]" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 10, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "plot_df = rmse_df.query('Environment == \"Test\" & Estimator in @plot_est').copy()\n", 285 | "plot_df['MSE'] = plot_df['RMSE'] **2" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 11, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "best_case = plot_df.query(best_case_query).copy()" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 12, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "order = [\n", 304 | " 'OLS',\n", 305 | " 'AR (TempC)',\n", 306 | " 'TAR (TempC)'\n", 307 | "]\n", 308 | "\n", 309 | "labels = [\n", 310 | " 'OLS',\n", 311 | " 'PAR (TempC)',\n", 312 | " 'PTAR (TempC)' \n", 313 | "]" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 13, 319 | "metadata": {}, 320 | "outputs": [ 321 | { 322 | "data": { 323 | "image/png": "\n", 324 | "text/plain": [ 325 | "
" 326 | ] 327 | }, 328 | "metadata": { 329 | "needs_background": "light" 330 | }, 331 | "output_type": "display_data" 332 | } 333 | ], 334 | "source": [ 335 | "plt.rcParams.update({'font.size': 20})\n", 336 | "fig, ax = plt.subplots(ncols = 1, sharey = True, figsize=(10, 10))\n", 337 | "sns.boxplot(y = 'Estimator', x='MSE', order = order, data=best_case, ax=ax)\n", 338 | "ax.set_ylabel(None)\n", 339 | "ax.set_yticklabels(labels)\n", 340 | "plt.tight_layout()\n", 341 | "plt.savefig(f\"{FPATH}/best_case_performance_TempC.pdf\")" 342 | ] 343 | } 344 | ], 345 | "metadata": { 346 | "kernelspec": { 347 | "display_name": "Python 3", 348 | "language": "python", 349 | "name": "python3" 350 | }, 351 | "language_info": { 352 | "codemirror_mode": { 353 | "name": "ipython", 354 | "version": 3 355 | }, 356 | "file_extension": ".py", 357 | "mimetype": "text/x-python", 358 | "name": "python", 359 | "nbconvert_exporter": "python", 360 | "pygments_lexer": "ipython3", 361 | "version": "3.7.9" 362 | } 363 | }, 364 | "nbformat": 4, 365 | "nbformat_minor": 4 366 | } 367 | -------------------------------------------------------------------------------- /real-data-experiment/notebooks/results_tempC_figure12.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Figure 12 (Temperature as proxy)\n", 8 | "\n", 9 | "To get coefficients, we retrain the model, since this is not saved during the larger experiment" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import pandas as pd\n", 20 | "\n", 21 | "import pdb\n", 22 | "\n", 23 | "from sklearn import linear_model as lm\n", 24 | "from sklearn import preprocessing \n", 25 | "from sklearn import model_selection as ms\n", 26 | "from sklearn import pipeline\n", 27 | "from sklearn.model_selection 
import train_test_split as tt_split\n", 28 | "\n", 29 | "from scipy.stats import skew\n", 30 | "from scipy.stats.stats import pearsonr\n", 31 | "\n", 32 | "import seaborn as sns\n", 33 | "import matplotlib.pyplot as plt\n", 34 | "\n", 35 | "import sys; sys.path.insert(0, '../')\n", 36 | "\n", 37 | "import utils\n", 38 | "\n", 39 | "from anchorRegression import AnchorRegression as AR\n", 40 | "from anchorRegression import CrossProxyAnchorRegression as XAR\n", 41 | "from anchorRegression import TargetedAnchorRegression as TAR\n", 42 | "from anchorRegression import CrossTargetedAnchorRegression as XTAR\n", 43 | "from anchorRegression import MeanPredictor" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## 1. Load data" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "Note that the best scenario is determined in `results_tempC.ipynb` " 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "CITY = 0\n", 67 | "TEST_SEASON = 2\n", 68 | "\n", 69 | "proxy = 'TempC'\n", 70 | "proxies = [proxy]" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 3, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "DATA_PATH = \"../data\"\n", 80 | "\n", 81 | "files = [\n", 82 | " 'BeijingPM20100101_20151231.csv',\n", 83 | " 'GuangzhouPM20100101_20151231.csv',\n", 84 | " 'ShenyangPM20100101_20151231.csv',\n", 85 | " 'ChengduPM20100101_20151231.csv',\n", 86 | " 'ShanghaiPM20100101_20151231.csv'\n", 87 | "]\n", 88 | "\n", 89 | "dfs = [pd.read_csv(f\"{DATA_PATH}/{f}\") for f in files]" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "raw_df = dfs[CITY].drop('No', axis=1)\n", 99 | "filt_df = raw_df.dropna()\n", 100 | "\n", 101 | "df, X, y = utils.process_df(filt_df)" 102 | ] 103 | }, 104 | { 105 | 
"cell_type": "code", 106 | "execution_count": 5, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "dev_year = 2013\n", 111 | "drop_season = TEST_SEASON" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 6, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "import utils\n", 121 | "\n", 122 | "data = utils.get_dev_train_test_data(df, X, y, drop_season, dev_year, proxies)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "## 2. Fit Estimators" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 7, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "from copy import deepcopy" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 8, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "lr = pipeline.Pipeline([('scaler', preprocessing.StandardScaler()), \n", 148 | " ('pred', lm.LinearRegression(fit_intercept=True, normalize=False))])\n", 149 | "\n", 150 | "baselines = {\n", 151 | " 'OLS': \n", 152 | " {'pipe': deepcopy(lr), \n", 153 | " 'drop_cols': proxies, \n", 154 | " 'fit_params': None}\n", 155 | "}" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 9, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "tar_estimators = utils.construct_tar(data, proxies, drop_all=True)\n", 165 | "ar_estimators = utils.construct_ar(data, proxies, drop_all=True)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 10, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "estimators = {**baselines,\n", 175 | " **ar_estimators, \n", 176 | " **tar_estimators}" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 11, 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "name": "stdout", 186 | "output_type": "stream", 187 | "text": [ 188 | "AR (TempC): {'pred__lamb': 40.0}\n" 189 | ] 190 | } 191 
| ], 192 | "source": [ 193 | "for k, est in estimators.items():\n", 194 | " if 'tune_lambda' in est.keys() and est['tune_lambda']:\n", 195 | " best_lambda = utils.get_best_lambda(est, data)\n", 196 | " \n", 197 | " print(f\"{k}: {best_lambda}\")\n", 198 | " \n", 199 | " estimators[k]['pipe'] = estimators[k]['pipe'].set_params(\n", 200 | " **best_lambda)\n", 201 | " \n", 202 | " estimators[k].update(best_lambda)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 12, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "for k, est in estimators.items():\n", 212 | " \n", 213 | " perf = {}\n", 214 | " \n", 215 | " # Get cross-validated training errors\n", 216 | " this_X_train = utils.get_estimator_X(data['train']['X'], est)\n", 217 | " this_X_test = utils.get_estimator_X(data['test']['X'], est)\n", 218 | " y_train = data['train']['y']\n", 219 | " y_test = data['test']['y']\n", 220 | "\n", 221 | " preds_train_cv = ms.cross_val_predict(est['pipe'], this_X_train, y_train, fit_params=est['fit_params'], cv=10)\n", 222 | " resid_train_cv = preds_train_cv - y_train\n", 223 | " \n", 224 | " perf['Train (CV)'] = {\n", 225 | " 'preds': preds_train_cv,\n", 226 | " 'resid': resid_train_cv.values\n", 227 | " }\n", 228 | " \n", 229 | " # Train on the full training set \n", 230 | " if est['fit_params'] is not None:\n", 231 | " est['fit'] = est['pipe'].fit(this_X_train, y_train, **est['fit_params'])\n", 232 | " else:\n", 233 | " est['fit'] = est['pipe'].fit(this_X_train, y_train)\n", 234 | " \n", 235 | " # Evaluate on the test set\n", 236 | " preds_test = est['fit'].predict(this_X_test)\n", 237 | " resid_test = preds_test - y_test\n", 238 | "\n", 239 | " perf['Test'] = {\n", 240 | " 'preds': preds_test,\n", 241 | " 'resid': resid_test.values\n", 242 | " }\n", 243 | " \n", 244 | " estimators[k]['perf'] = perf" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "## Construct Figure 12\n", 252 | 
"\n", 253 | "First, examine the intercepts, then the coefficients." 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 13, 259 | "metadata": {}, 260 | "outputs": [ 261 | { 262 | "name": "stdout", 263 | "output_type": "stream", 264 | "text": [ 265 | "Intercept: 4.087 [OLS]\n", 266 | "Intercept: 4.087 [AR (TempC)]\n", 267 | "Intercept: 3.885 [TAR (TempC)]\n" 268 | ] 269 | } 270 | ], 271 | "source": [ 272 | "for k, est in estimators.items():\n", 273 | " if k != 'Mean':\n", 274 | " print(f\"Intercept: {est['fit']['pred'].intercept_:.3f} [{k}]\")" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 14, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "labels = {\n", 284 | " 'OLS' : 'OLS',\n", 285 | " 'AR (TempC)' : 'PAR (TempC)',\n", 286 | " 'TAR (TempC)' : 'PTAR (TempC)' \n", 287 | "}" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 15, 293 | "metadata": {}, 294 | "outputs": [ 295 | { 296 | "data": { 297 | "image/png": "\n", 298 | "text/plain": [ 299 | "
" 300 | ] 301 | }, 302 | "metadata": { 303 | "needs_background": "light" 304 | }, 305 | "output_type": "display_data" 306 | } 307 | ], 308 | "source": [ 309 | "plt.rcParams.update({'font.size': 18})\n", 310 | "f, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10), sharex=True, sharey=True)\n", 311 | "\n", 312 | "coefs = []\n", 313 | "ests = []\n", 314 | "for k, est in estimators.items():\n", 315 | " if k in ['OLS', 'TAR (TempC)', 'AR (TempC)']:\n", 316 | " this_X = utils.get_estimator_X(data['train']['X'], est)\n", 317 | " coefs.append(pd.Series(est['fit']['pred'].coef_, index = this_X.columns).sort_values())\n", 318 | " ests.append(labels[k])\n", 319 | "\n", 320 | "coef = pd.concat(coefs, axis=1)\n", 321 | "coef.columns = ests\n", 322 | "\n", 323 | "coef.plot(kind = \"barh\", ax=ax)\n", 324 | "#ax.set_title(\"Comparison of Coefficients\")\n", 325 | "\n", 326 | "plt.tight_layout()\n", 327 | "plt.savefig(\"../figs/coefficient_comparison.pdf\")" 328 | ] 329 | } 330 | ], 331 | "metadata": { 332 | "kernelspec": { 333 | "display_name": "Python 3", 334 | "language": "python", 335 | "name": "python3" 336 | }, 337 | "language_info": { 338 | "codemirror_mode": { 339 | "name": "ipython", 340 | "version": 3 341 | }, 342 | "file_extension": ".py", 343 | "mimetype": "text/x-python", 344 | "name": "python", 345 | "nbconvert_exporter": "python", 346 | "pygments_lexer": "ipython3", 347 | "version": "3.7.9" 348 | } 349 | }, 350 | "nbformat": 4, 351 | "nbformat_minor": 4 352 | } 353 | -------------------------------------------------------------------------------- /real-data-experiment/notebooks/results_tempC_proxies.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tables 1, 2 (Proxies of Temperature)\n", 8 | "\n", 9 | "Builds the portions of Tables 1, 2 that include W, Z\n", 10 | "\n", 11 | "Requires that `run_exp_temp_proxies.py` is run" 12 | 
] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "RPATH = '../results'\n", 21 | "FPATH = '../figs'" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import numpy as np\n", 31 | "import pandas as pd\n", 32 | "import seaborn as sns\n", 33 | "import matplotlib.pyplot as plt\n", 34 | "import pickle as pkl" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "results = pd.read_csv(f\"{RPATH}/all_res_test_anchor_prox_TempC_lamb_fixed.csv\")\n", 44 | "results = results.drop('Unnamed: 0', axis=1)\n", 45 | "\n", 46 | "residuals = results.drop(\"Lambda\", axis=1).query('Environment == \"Test\"')\n", 47 | "lambs = results.drop(\"Residual\", axis=1).query('Environment == \"Test\"')\n", 48 | "\n", 49 | "mse = lambda v: np.mean(v**2)\n", 50 | "\n", 51 | "# Get MSE by season, city\n", 52 | "pt = pd.pivot_table(residuals, \n", 53 | " index=['Test_Season', 'City'], \n", 54 | " columns = 'Estimator', \n", 55 | " aggfunc={'Residual': mse})\n", 56 | "\n", 57 | "pt.columns = pt.columns.droplevel(0)\n", 58 | "pt = pt.rename(columns = {\n", 59 | " 'TAR (W)': 'PTAR (W)',\n", 60 | " 'xTAR (W, Z)': 'xPTAR (W, Z)',\n", 61 | " 'AR (W)': 'PAR (W)',\n", 62 | " 'xAR (W, Z)': 'xPAR (W, Z)'\n", 63 | "})\n", 64 | "newcols = [\n", 65 | " 'OLS',\n", 66 | " 'PAR (W)',\n", 67 | " 'xPAR (W, Z)',\n", 68 | " 'PTAR (W)',\n", 69 | " 'xPTAR (W, Z)', \n", 70 | "]\n", 71 | "pt = pt[newcols].reset_index()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "# Table 1 (W, Z)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 4, 84 | "metadata": { 85 | "scrolled": true 86 | }, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | 
"\\begin{tabular}{lrrrr}\n", 93 | "\\toprule\n", 94 | "{} & Mean & Win & min & max \\\\\n", 95 | "Estimator & & & & \\\\\n", 96 | "\\midrule\n", 97 | "OLS & 0.537 & 0 & 0.000 & 0.000 \\\\\n", 98 | "PAR (W) & 0.531 & 6 & -0.037 & 0.006 \\\\\n", 99 | "xPAR (W, Z) & 0.531 & 6 & -0.039 & 0.007 \\\\\n", 100 | "PTAR (W) & 0.529 & 8 & -0.038 & 0.001 \\\\\n", 101 | "xPTAR (W, Z) & 0.526 & 7 & -0.059 & 0.001 \\\\\n", 102 | "\\bottomrule\n", 103 | "\\end{tabular}\n", 104 | "\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "wins = lambda v: int(np.sum(v < 0))\n", 110 | "loss = lambda v: int(np.sum(v > 0))\n", 111 | "\n", 112 | "lambs_ar = lambs.query(f'Environment == \"Test\" & Estimator == \"AR (W)\"').groupby(\n", 113 | " ['City', 'Test_Season']).mean()[['Lambda']]\n", 114 | "\n", 115 | "lambs_ar = lambs_ar.reset_index().set_index(['City', 'Test_Season'])\n", 116 | "\n", 117 | "pt_diff = pt.copy()\n", 118 | "for est in newcols:\n", 119 | " pt_diff[est] = pt[est] - pt['OLS']\n", 120 | "\n", 121 | "pt_pos_lamb = pt.set_index(['City', 'Test_Season']).merge(lambs_ar, left_index=True, right_index=True)\n", 122 | "pt_pos_lamb = pt_pos_lamb.query(\"Lambda > 0\").drop(\"Lambda\", axis=1).reset_index()\n", 123 | "\n", 124 | "pt_diff_pos_lamb = pt_pos_lamb.copy()\n", 125 | "for est in newcols:\n", 126 | " pt_diff_pos_lamb[est] = pt_pos_lamb[est] - pt_pos_lamb['OLS']\n", 127 | "\n", 128 | "lt = pd.melt(pt_pos_lamb, id_vars=['Test_Season', 'City'], value_name = 'MSE', var_name = 'Estimator')\n", 129 | "\n", 130 | "mean_result = lt.groupby('Estimator', as_index=True).agg(\n", 131 | " **{'Mean': pd.NamedAgg(column='MSE', aggfunc=np.mean)}\n", 132 | ").reindex(newcols)\n", 133 | "\n", 134 | "pt_diff_long = pd.melt(pt_diff_pos_lamb, id_vars=['Test_Season', 'City'], value_name = 'MSE', var_name='Estimator')\n", 135 | "\n", 136 | "diff_ols = pt_diff_long.groupby('Estimator', as_index=True).agg(\n", 137 | " **{'min': pd.NamedAgg(column='MSE', aggfunc=np.min), \n", 138 | " 'max': 
pd.NamedAgg(column='MSE', aggfunc=np.max)}\n", 139 | ").reindex(newcols)\n", 140 | "\n", 141 | "win_loss_ols = pt_diff_long.groupby('Estimator', as_index=True).agg(\n", 142 | " **{'Win': pd.NamedAgg(column='MSE', aggfunc=wins)}\n", 143 | ").reindex(newcols)\n", 144 | "\n", 145 | "print(pd.concat([mean_result, win_loss_ols.astype(int), diff_ols], axis=1).to_latex(float_format=\"{:.3f}\".format))" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "# Table 2 (W, Z)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 5, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "\\begin{tabular}{lrrrr}\n", 165 | "\\toprule\n", 166 | "{} & Mean & Diff & min & max \\\\\n", 167 | "Estimator & & & & \\\\\n", 168 | "\\midrule\n", 169 | "OLS & 0.457 & 0.000 & 0.000 & 0.000 \\\\\n", 170 | "PAR (W) & 0.454 & -0.002 & -0.037 & 0.006 \\\\\n", 171 | "xPAR (W, Z) & 0.454 & -0.003 & -0.039 & 0.007 \\\\\n", 172 | "PTAR (W) & 0.452 & -0.005 & -0.038 & 0.001 \\\\\n", 173 | "xPTAR (W, Z) & 0.450 & -0.007 & -0.059 & 0.003 \\\\\n", 174 | "\\bottomrule\n", 175 | "\\end{tabular}\n", 176 | "\n" 177 | ] 178 | } 179 | ], 180 | "source": [ 181 | "pt_diff = pt.copy()\n", 182 | "for est in newcols:\n", 183 | " pt_diff[est] = pt[est] - pt['OLS']\n", 184 | "\n", 185 | "lt = pd.melt(pt, id_vars=['Test_Season', 'City'], value_name = 'MSE')\n", 186 | "\n", 187 | "mean_result = lt.groupby('Estimator', as_index=True).agg(\n", 188 | " **{'Mean': pd.NamedAgg(column='MSE', aggfunc=np.mean)}\n", 189 | ").reindex(newcols)\n", 190 | "\n", 191 | "pt_diff_long = pd.melt(pt_diff, id_vars=['Test_Season', 'City'], value_name = 'MSE')\n", 192 | "\n", 193 | "diff_ols = pt_diff_long.groupby('Estimator', as_index=True).agg(\n", 194 | " **{'Diff': pd.NamedAgg(column='MSE', aggfunc=np.mean),\n", 195 | " 'min': pd.NamedAgg(column='MSE', aggfunc=np.min), \n", 196 | " 'max': 
pd.NamedAgg(column='MSE', aggfunc=np.max)}\n", 197 | ").reindex(newcols)\n", 198 | "\n", 199 | "print(pd.concat([mean_result, diff_ols], axis=1).to_latex(float_format=\"{:.3f}\".format))" 200 | ] 201 | } 202 | ], 203 | "metadata": { 204 | "kernelspec": { 205 | "display_name": "Python 3", 206 | "language": "python", 207 | "name": "python3" 208 | }, 209 | "language_info": { 210 | "codemirror_mode": { 211 | "name": "ipython", 212 | "version": 3 213 | }, 214 | "file_extension": ".py", 215 | "mimetype": "text/x-python", 216 | "name": "python", 217 | "nbconvert_exporter": "python", 218 | "pygments_lexer": "ipython3", 219 | "version": "3.7.9" 220 | } 221 | }, 222 | "nbformat": 4, 223 | "nbformat_minor": 4 224 | } 225 | -------------------------------------------------------------------------------- /real-data-experiment/results/README.md: -------------------------------------------------------------------------------- 1 | # Results Folder 2 | 3 | This is used to store outputs of the various scripts 4 | -------------------------------------------------------------------------------- /real-data-experiment/run_exp_temp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Real-Data Experiments: Pollution in 5 Chinese Cities 5 | # Link: https://archive.ics.uci.edu/ml/datasets/PM2.5+Data+of+Five+Chinese+Cities 6 | 7 | # * Year / Month / Day / Hour 8 | # * Season 9 | # * DEWP: Dew Point (Celsius Degree) 10 | # * TEMP: Temperature (Celsius Degree) 11 | # * HUMI: Humidity (%) 12 | # * PRES: Pressure (hPa) 13 | # * cbwd: Combined wind direction 14 | # * Iws: Cumulated wind speed (m/s) 15 | # * precipitation: hourly precipitation (mm) 16 | # * Iprec: Cumulated precipitation (mm) 17 | # * (Target) PM2.5 concentration (ug/m^3) 18 | 19 | import numpy as np 20 | import pandas as pd 21 | from copy import deepcopy 22 | from numpy.random import default_rng 23 | import pickle as pkl 24 | 25 | import pdb 
26 | 27 | from sklearn import linear_model as lm 28 | from sklearn import preprocessing 29 | from sklearn import model_selection as ms 30 | from sklearn import pipeline 31 | from sklearn.model_selection import train_test_split as tt_split 32 | 33 | import seaborn as sns 34 | import matplotlib.pyplot as plt 35 | 36 | from anchorRegression import AnchorRegression as AR 37 | from anchorRegression import CrossProxyAnchorRegression as XAR 38 | from anchorRegression import TargetedAnchorRegression as TAR 39 | from anchorRegression import CrossTargetedAnchorRegression as XTAR 40 | from anchorRegression import MeanPredictor 41 | import utils 42 | 43 | drop_all = True 44 | 45 | proxies = ['TempC'] 46 | 47 | cities = np.arange(5) 48 | seasons = np.arange(1, 5) 49 | 50 | all_res_dfs = [] 51 | all_rmse_dfs = [] 52 | prox_info = {} 53 | 54 | for CITY in cities: 55 | print(f"City: {CITY}") 56 | 57 | DATA_PATH = "data" 58 | 59 | files = [ 60 | 'BeijingPM20100101_20151231.csv', 61 | 'GuangzhouPM20100101_20151231.csv', 62 | 'ShenyangPM20100101_20151231.csv', 63 | 'ChengduPM20100101_20151231.csv', 64 | 'ShanghaiPM20100101_20151231.csv' 65 | ] 66 | 67 | dfs = [pd.read_csv(f"{DATA_PATH}/{f}") for f in files] 68 | 69 | raw_df = dfs[CITY].drop('No', axis=1) 70 | filt_df = raw_df.dropna() 71 | 72 | df, X, y = utils.process_df(filt_df) 73 | 74 | # Get Proxy Info for this city / season 75 | lr = pipeline.Pipeline([('scaler', preprocessing.StandardScaler()), 76 | ('lr', lm.LinearRegression(fit_intercept=True))]) 77 | 78 | this_city_prox_info = {} 79 | 80 | for prox in proxies: 81 | this_city_prox_info[prox] = {} 82 | 83 | # Get leave-one-out correlation with error 84 | this_X = X.drop([prox], axis=1) 85 | resid = y - lr.fit(this_X, y).predict(this_X) 86 | this_city_prox_info[prox]['corr_resid_lone'] = \ 87 | np.corrcoef(resid, X[prox].values)[0, 1] 88 | 89 | # Get leave-all-out correlation with error 90 | this_X = X.drop(proxies, axis=1) 91 | resid = y - lr.fit(this_X, y).predict(this_X) 92 
| this_city_prox_info[prox]['corr_resid_lall'] = \ 93 | np.corrcoef(resid, X[prox].values)[0, 1] 94 | 95 | prox_info[CITY] = this_city_prox_info 96 | 97 | for test_season in seasons: 98 | print(f"\t Season: {test_season}") 99 | 100 | dev_year = 2013 101 | data = utils.get_dev_train_test_data( 102 | df, X, y, test_season, dev_year, proxies) 103 | 104 | baselines = utils.construct_baselines( 105 | data, proxies, drop_all=drop_all) 106 | tar_baselines = utils.construct_tar_baseline( 107 | data, proxies, drop_all=drop_all) 108 | tar_estimators = utils.construct_tar( 109 | data, proxies, drop_all=drop_all) 110 | ar_estimators = utils.construct_ar( 111 | data, proxies, drop_all=drop_all) 112 | 113 | if len(proxies) > 1: 114 | xtar_estimators = utils.construct_xtar( 115 | data, proxies, drop_all=drop_all) 116 | xar_estimators = utils.construct_xar( 117 | data, proxies, drop_all=drop_all) 118 | 119 | estimators = { 120 | **baselines, 121 | **tar_baselines, 122 | **tar_estimators, **xtar_estimators, 123 | **ar_estimators, **xar_estimators} 124 | else: 125 | estimators = { 126 | **baselines, 127 | **tar_baselines, 128 | **tar_estimators, 129 | **ar_estimators} 130 | 131 | for k, est in estimators.items(): 132 | if 'tune_lambda' in est.keys() and est['tune_lambda']: 133 | best_lambda = utils.get_best_lambda(est, data) 134 | 135 | print(f"\t\t {k}: {best_lambda}") 136 | 137 | estimators[k]['pipe'] = estimators[k]['pipe'].set_params( 138 | **best_lambda) 139 | 140 | estimators[k].update(best_lambda) 141 | 142 | for k, est in estimators.items(): 143 | 144 | perf = {} 145 | 146 | # Get cross-validated training errors 147 | this_X_train = utils.get_estimator_X(data['train']['X'], est) 148 | this_X_test = utils.get_estimator_X(data['test']['X'], est) 149 | y_train = data['train']['y'] 150 | y_test = data['test']['y'] 151 | 152 | preds_train_cv = ms.cross_val_predict(est['pipe'], 153 | this_X_train, y_train, fit_params=est['fit_params'], cv=10) 154 | resid_train_cv = preds_train_cv 
- y_train 155 | 156 | perf['Train (CV)'] = { 157 | 'preds': preds_train_cv, 158 | 'resid': resid_train_cv.values 159 | } 160 | 161 | # Train on the full training set 162 | if est['fit_params'] is not None: 163 | est['fit'] = est['pipe'].fit(this_X_train, y_train, **est['fit_params']) 164 | else: 165 | est['fit'] = est['pipe'].fit(this_X_train, y_train) 166 | 167 | # Evaluate on the test set 168 | preds_test = est['fit'].predict(this_X_test) 169 | resid_test = preds_test - y_test 170 | 171 | perf['Test'] = { 172 | 'preds': preds_test, 173 | 'resid': resid_test.values 174 | } 175 | 176 | estimators[k]['perf'] = perf 177 | 178 | res_dfs = [] 179 | 180 | for key, est in estimators.items(): 181 | for env_name, perf in est['perf'].items(): 182 | rs = pd.DataFrame(perf['resid'], columns=['Residual']) 183 | rs['City'] = CITY 184 | rs['Test_Season'] = test_season 185 | rs['Type'] = key.split()[0] 186 | rs['Estimator'] = key 187 | rs['Environment'] = env_name 188 | if 'lamb' in estimators[key]['fit']['pred'].get_params(): 189 | rs['Lambda'] = estimators[key]['fit']['pred'].get_params()['lamb'] 190 | else: 191 | rs['Lambda'] = np.nan 192 | 193 | res_dfs.append(rs) 194 | all_res_dfs.append(rs) 195 | 196 | res_df = pd.concat(res_dfs, axis=0) 197 | 198 | rmse = lambda v: np.sqrt(np.mean(v**2)) 199 | 200 | rng = default_rng(0) 201 | 202 | # bootstrap the RMSE 203 | n_boot_iter = 1000 204 | 205 | # For each estimator 206 | for key, est in estimators.items(): 207 | # For each environment (train / test) 208 | for env_name, perf in est['perf'].items(): 209 | # Bootstrap RMSE 210 | rmse_set = [] 211 | for _ in range(n_boot_iter): 212 | rmse_set.append(rmse(rng.choice(perf['resid'], size=len(perf['resid'])))) 213 | rmse_set = np.array(rmse_set) 214 | 215 | # Save distribution of results 216 | estimators[key]['perf'][env_name]['rmse_boot'] = rmse_set 217 | 218 | rmse_dfs = [] 219 | 220 | for key, est in estimators.items(): 221 | for env_name, perf in est['perf'].items(): 222 | rs = 
pd.DataFrame(perf['rmse_boot'], columns=['RMSE']) 223 | rs['City'] = CITY 224 | rs['Test_Season'] = test_season 225 | rs['Type'] = key.split()[0] 226 | rs['Estimator'] = key 227 | rs['Environment'] = env_name 228 | rmse_dfs.append(rs) 229 | all_rmse_dfs.append(rs) 230 | 231 | rmse_df = pd.concat(rmse_dfs, axis=0) 232 | 233 | all_res_df = pd.concat(all_res_dfs, axis=0) 234 | all_rmse_df = pd.concat(all_rmse_dfs, axis=0) 235 | all_res_df.to_csv(f"results/all_res_test_{proxies[0]}.csv") 236 | all_rmse_df.to_csv(f"results/all_rmse_test_{proxies[0]}.csv") 237 | with open(f'results/prox_info_test_{proxies[0]}.pkl', 'wb') as f: 238 | pkl.dump(prox_info, f, protocol=pkl.HIGHEST_PROTOCOL) 239 | -------------------------------------------------------------------------------- /real-data-experiment/run_exp_temp_proxies.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Real-Data Experiments: Pollution in 5 Chinese Cities 5 | 6 | # https://archive.ics.uci.edu/ml/datasets/PM2.5+Data+of+Five+Chinese+Cities 7 | 8 | # This dataset contains hourly pollution readings (PM2.5 concentration) from five cities in China: Beijing, Guangzhou, Shenyang, Chengdu, and Shanghai, over the course of several years. 
# In Shanghai, for instance, there are >20k readings, accompanied by several features

# * Year / Month / Day / Hour
# * Season
# * DEWP: Dew Point (Celsius Degree)
# * TEMP: Temperature (Celsius Degree)
# * HUMI: Humidity (%)
# * PRES: Pressure (hPa)
# * cbwd: Combined wind direction
# * Iws: Cumulated wind speed (m/s)
# * precipitation: hourly precipitation (mm)
# * Iprec: Cumulated precipitation (mm)
# * (Target) PM2.5 concentration (ug/m^3)

import numpy as np
import pandas as pd
from copy import deepcopy
from numpy.random import default_rng
import pickle as pkl

import pdb

from sklearn import linear_model as lm
from sklearn import preprocessing
from sklearn import model_selection as ms
from sklearn import pipeline
from sklearn.model_selection import train_test_split as tt_split

import seaborn as sns
import matplotlib.pyplot as plt

from anchorRegression import AnchorRegression as AR
from anchorRegression import CrossProxyAnchorRegression as XAR
from anchorRegression import TargetedAnchorRegression as TAR
from anchorRegression import CrossTargetedAnchorRegression as XTAR
from anchorRegression import MeanPredictor
import utils

drop_all = True

# From previous run, for comparability, set to None to re-tune.
# Keys are (city index, test season); values are the lambda used for every
# estimator that has 'tune_lambda' set.
lambda_list = {
    (0, 1): 40.0,
    (0, 2): 40.0,
    (0, 3): 40.0,
    (0, 4): 40.0,
    (1, 1): 0.0,
    (1, 2): 0.0,
    (1, 3): 0.0,
    (1, 4): 0.0,
    (2, 1): 40.0,
    (2, 2): 0.0,
    (2, 3): 0.0,
    (2, 4): 0.0,
    (3, 1): 40.0,
    (3, 2): 0.0,
    (3, 3): 0.0,
    (3, 4): 0.0,
    (4, 1): 0.0,
    (4, 2): 40.0,
    (4, 3): 40.0,
    (4, 4): 40.0
}

anchor = 'TempC'
proxies = ['W', 'Z']
create_proxies = True
rho = 0.9

cities = np.arange(5)
seasons = np.arange(1, 5)

# Loop-invariant configuration, hoisted out of the city loop.
DATA_PATH = "data"

files = [
    'BeijingPM20100101_20151231.csv',
    'GuangzhouPM20100101_20151231.csv',
    'ShenyangPM20100101_20151231.csv',
    'ChengduPM20100101_20151231.csv',
    'ShanghaiPM20100101_20151231.csv'
]

# Number of bootstrap resamples used for the RMSE distribution
n_boot_iter = 1000


def rmse(v):
    """Root mean squared value of the residual vector v."""
    return np.sqrt(np.mean(v**2))


all_res_dfs = []
all_rmse_dfs = []

for CITY in cities:
    print(f"City: {CITY}")

    # Only this city's file is needed; the original re-read all five CSVs
    # on every iteration and then discarded four of them.
    raw_df = pd.read_csv(f"{DATA_PATH}/{files[CITY]}").drop('No', axis=1)
    filt_df = raw_df.dropna()

    df, X, y = utils.process_df(filt_df)

    for test_season in seasons:
        print(f"\t Season: {test_season}")

        dev_year = 2013
        if create_proxies:
            data = utils.get_dev_train_test_data_anchors(
                df, X, y, test_season, dev_year,
                anchor, proxies, rho)
        else:
            data = utils.get_dev_train_test_data(
                df, X, y, test_season, dev_year, proxies)

        baselines = utils.construct_baselines(
            data, proxies, drop_all=drop_all)
        tar_baselines = utils.construct_tar_baseline(
            data, proxies, drop_all=drop_all)
        tar_estimators = utils.construct_tar(
            data, proxies, drop_all=drop_all)
        ar_estimators = utils.construct_ar(
            data, proxies, drop_all=drop_all)

        if len(proxies) > 1:
            # The cross-proxy estimators need two distinct proxies
            xtar_estimators = utils.construct_xtar(
                data, proxies, drop_all=drop_all)
            xar_estimators = utils.construct_xar(
                data, proxies, drop_all=drop_all)

            estimators = {
                **baselines,
                **tar_baselines,
                **tar_estimators, **xtar_estimators,
                **ar_estimators, **xar_estimators}
        else:
            estimators = {
                **baselines,
                **tar_baselines,
                **tar_estimators,
                **ar_estimators}

        # Fix (or tune) lambda for every estimator that asks for it
        for k, est in estimators.items():
            if 'tune_lambda' in est.keys() and est['tune_lambda']:
                if lambda_list is not None:
                    best_lambda = {
                        'pred__lamb': lambda_list[(CITY, test_season)]
                    }
                else:
                    best_lambda = utils.get_best_lambda(est, data)
                print(f"\t\t {k}: {best_lambda}")

                estimators[k]['pipe'] = estimators[k]['pipe'].set_params(
                    **best_lambda)

                estimators[k].update(best_lambda)

        for k, est in estimators.items():

            perf = {}

            # Get cross-validated training errors
            this_X_train = utils.get_estimator_X(data['train']['X'], est)
            this_X_test = utils.get_estimator_X(data['test']['X'], est)
            y_train = data['train']['y']
            y_test = data['test']['y']

            preds_train_cv = ms.cross_val_predict(
                est['pipe'], this_X_train, y_train,
                fit_params=est['fit_params'], cv=10)
            resid_train_cv = preds_train_cv - y_train

            perf['Train (CV)'] = {
                'preds': preds_train_cv,
                'resid': resid_train_cv.values
            }

            # Train on the full training set
            if est['fit_params'] is not None:
                est['fit'] = est['pipe'].fit(
                    this_X_train, y_train, **est['fit_params'])
            else:
                est['fit'] = est['pipe'].fit(this_X_train, y_train)

            # Evaluate on the test set
            preds_test = est['fit'].predict(this_X_test)
            resid_test = preds_test - y_test

            perf['Test'] = {
                'preds': preds_test,
                'resid': resid_test.values
            }

            estimators[k]['perf'] = perf

        # Long-format residuals: one row per observation / estimator / env
        for key, est in estimators.items():
            for env_name, perf in est['perf'].items():
                rs = pd.DataFrame(perf['resid'], columns=['Residual'])
                rs['City'] = CITY
                rs['Test_Season'] = test_season
                rs['Type'] = key.split()[0]
                rs['Estimator'] = key
                rs['Environment'] = env_name
                pred_params = estimators[key]['fit']['pred'].get_params()
                if 'lamb' in pred_params:
                    rs['Lambda'] = pred_params['lamb']
                else:
                    rs['Lambda'] = np.nan

                all_res_dfs.append(rs)

        # Re-seeded for every (city, season) so each bootstrap is reproducible
        rng = default_rng(0)

        # Bootstrap the RMSE, for each estimator and environment (train / test)
        for key, est in estimators.items():
            for env_name, perf in est['perf'].items():
                rmse_set = []
                for _ in range(n_boot_iter):
                    rmse_set.append(rmse(rng.choice(
                        perf['resid'], size=len(perf['resid']))))
                rmse_set = np.array(rmse_set)

                # Save distribution of results
                estimators[key]['perf'][env_name]['rmse_boot'] = rmse_set

        for key, est in estimators.items():
            for env_name, perf in est['perf'].items():
                rs = pd.DataFrame(perf['rmse_boot'], columns=['RMSE'])
                rs['City'] = CITY
                rs['Test_Season'] = test_season
                rs['Type'] = key.split()[0]
                rs['Estimator'] = key
                rs['Environment'] = env_name
                all_rmse_dfs.append(rs)

all_res_df = pd.concat(all_res_dfs, axis=0)
all_rmse_df = pd.concat(all_rmse_dfs, axis=0)
all_res_df.to_csv(f"results/all_res_test_anchor_prox_{anchor}_lamb_fixed.csv")
all_rmse_df.to_csv(f"results/all_rmse_test_anchor_prox_{anchor}_lamb_fixed.csv")
# Grid of candidate regularisation strengths searched by get_best_lambda
LAMBDA_RANGE = np.linspace(0, 40, 100)


def process_df(df):
    """Turn a raw (NaN-free) city table into a model-ready frame.

    Returns a tuple ``(df, X, y)``:

    * ``df``: the input without the per-station PM columns, indexed by
      timestamp, with the added columns ``avg_pm`` and ``target``
      (``log1p(avg_pm)``) and the time/wind columns cast to categoricals;
    * ``X``: the feature matrix (renamed weather columns plus wind-direction
      dummies; skewed numeric columns are log1p-transformed);
    * ``y``: ``df['target']``.

    The input frame itself is not modified.
    """
    # The dataset contains PM (pollution) readings across different posts in
    # the city. We average them here (NaNs propagate into the average).
    PM_vars = [f for f in df.columns if "PM" in f]
    avg_pm = sum(df[f] for f in PM_vars) / len(PM_vars)

    df = df.drop(PM_vars, axis=1)
    df['avg_pm'] = avg_pm

    # Create a date-time index
    time_vars = ['year', 'month', 'day', 'hour']
    df.index = pd.to_datetime(df[time_vars])

    df['target'] = np.log1p(df['avg_pm'])

    # Construct Features
    drop_vars = ['year', 'month', 'day', 'hour', 'season']

    cat_feats = ['month', 'day', 'hour', 'season', 'cbwd']
    for f in cat_feats:
        df[f] = pd.Categorical(df[f])

    X = df.drop(drop_vars + ['target', 'avg_pm'], axis=1)

    # Better variable names
    X = X.rename(columns={'DEWP': 'DewPt',
                          'HUMI': 'Humidity',
                          'PRES': 'Press',
                          'TEMP': 'TempC',
                          'cbwd': 'WindDir',
                          'Iws': 'WindSp',
                          'precipitation': 'PrecipHr',
                          'Iprec': 'PrecipCm'})

    # Note that we drop the first dummy variable, because we'll be using OLS
    X = pd.get_dummies(X, drop_first=True)
    y = df['target']

    numeric_feats = [f for f in X.columns
                     if f not in time_vars
                     and 'WindDir' not in f
                     and 'season' not in f]

    # Remove the current hour's precipitation from the cumulated amount
    X['PrecipCm'] = X['PrecipCm'] - X['PrecipHr']

    # Log-transform skewed numeric features (skewness above 0.75)
    skewness = X[numeric_feats].apply(lambda col: skew(col.dropna()))
    skewed_feats = skewness[skewness > 0.75].index
    if len(skewed_feats):
        X[skewed_feats] = np.log1p(X[skewed_feats])

    return df, X, y


def _split_masks(df, test_season, dev_year):
    """Return the boolean row masks ``(dev, train, test)`` for one split.

    dev: other seasons in ``dev_year``; train: other seasons in other years;
    test: ``test_season`` in other years.
    """
    dev_ids = np.logical_and(df.season != test_season, df.year == dev_year)
    train_ids = np.logical_and(df.season != test_season, df.year != dev_year)
    test_ids = np.logical_and(df.season == test_season, df.year != dev_year)
    return dev_ids, train_ids, test_ids


def get_dev_train_test_data_anchors(df, X, y, test_season, dev_year,
                                    anchor, proxy_names, rho=0.9):
    """Split into dev/train/test and replace ``anchor`` by synthetic proxies.

    Each proxy is the anchor column plus independent Gaussian noise whose
    standard deviation is chosen so that, on the training rows,
    ``var(anchor) / (var(anchor) + var(noise)) == rho``. The anchor column is
    removed from the returned feature matrices; the proxies are kept both as
    feature columns and under their own keys.

    The noise generator is seeded with 0 on every call, so repeated calls
    with the same inputs return the same proxies. The caller's ``X`` is left
    untouched.

    Raises:
        ValueError: if ``rho`` is not in (0, 1].
    """
    if not 0 < rho <= 1:
        raise ValueError(f"rho must be in (0, 1], got {rho}")

    dev_ids, train_ids, test_ids = _split_masks(df, test_season, dev_year)

    rng = np.random.default_rng(0)

    # Work on a copy so that the proxy columns do not leak into the caller's
    # feature matrix (which is re-used across test seasons).
    X = X.copy()

    # Construct proxies, so that the signal-to-variance ratio in train is rho:
    # rho = var_A / (var_A + var_eps)
    A = X[[anchor]]
    var_A = float(A[train_ids].var().iloc[0])
    sigma_eps = np.sqrt((var_A / rho) - var_A)
    print(f"\t\t\t Sigma for proxies is: {sigma_eps:.3f}")

    for prox in proxy_names:
        X[prox] = rng.normal(A.to_numpy(), sigma_eps)[:, 0]

    data = {}
    for split, ids in (('dev', dev_ids),
                       ('train', train_ids),
                       ('test', test_ids)):
        data[split] = {
            'G': df[ids][['season']],
            'X': X[ids].drop(anchor, axis=1),
            'y': y[ids]
        }
        for prox in proxy_names:
            data[split][prox] = X[ids][[prox]]

    return data


def get_dev_train_test_data(df, X, y, test_season, dev_year, proxies):
    """Split into dev/train/test using existing columns of ``X`` as proxies.

    Every split is a dict with the season labels ``'G'``, the features
    ``'X'``, the target ``'y'`` and one single-column frame per proxy name.
    """
    dev_ids, train_ids, test_ids = _split_masks(df, test_season, dev_year)

    data = {}
    for split, ids in (('dev', dev_ids),
                       ('train', train_ids),
                       ('test', test_ids)):
        data[split] = {
            'G': df[ids][['season']],
            'X': X[ids],
            'y': y[ids]
        }
        for prox in proxies:
            data[split][prox] = X[ids][[prox]]

    return data


def construct_baselines(data, proxies, drop_all=False):
    """Build the baselines: mean predictor, OLS on all columns, OLS w/o proxies.

    ``data`` and ``drop_all`` are accepted for signature parity with the
    other ``construct_*`` helpers but are not used here.
    """
    mp = pipeline.Pipeline([('pred', MeanPredictor())])

    # `normalize` is no longer passed: False was its default and the
    # parameter was removed from LinearRegression in scikit-learn 1.2.
    lr = pipeline.Pipeline([('scaler', preprocessing.StandardScaler()),
                            ('pred', lm.LinearRegression(fit_intercept=True))])

    baselines = {
        'Mean':
            {'pipe': mp,
             'fit_params': None},
        'OLS (All)':
            {'pipe': deepcopy(lr),
             'fit_params': None},
        'OLS':
            {'pipe': deepcopy(lr),
             'fit_params': None,
             'drop_cols': proxies},
    }

    return baselines


def construct_tar_baseline(data, proxies, drop_all=False):
    """Build one "PA (<proxy>)" estimator per proxy.

    Same pipeline as construct_tar, but ``nu`` is a constant array holding
    the test-set mean of the proxy instead of its per-row test values.
    """
    tar = pipeline.Pipeline(
        [('scaler', preprocessing.StandardScaler()),
         ('pred', TAR(fit_intercept=True, normalize=False))])

    estimators = {}
    for prox in proxies:
        estimators[f"PA ({prox})"] = {
            'pipe': deepcopy(tar),
            'drop_cols': proxies if drop_all else [prox],
            'fit_params': {
                'pred__A': data['train'][prox],
                'pred__nu': np.ones_like(data['test'][prox]
                                         )*data['test'][prox].mean().values[0]
            }
        }

    return estimators


def construct_tar(data, proxies, drop_all=False):
    """Build one "TAR (<proxy>)" estimator per proxy.

    The training proxy is passed as ``A`` and the test proxy as ``nu``.
    """
    tar = pipeline.Pipeline(
        [('scaler', preprocessing.StandardScaler()),
         ('pred', TAR(fit_intercept=True, normalize=False))])

    estimators = {}
    for prox in proxies:
        estimators[f"TAR ({prox})"] = {
            'pipe': deepcopy(tar),
            'drop_cols': proxies if drop_all else [prox],
            'fit_params': {
                'pred__A': data['train'][prox],
                'pred__nu': data['test'][prox]}
        }

    return estimators


def construct_xtar(data, proxies, drop_all=False):
    """Build one "xTAR (W, Z)" estimator per ordered pair of proxies.

    Ordered pairs are used because the first proxy of the pair also supplies
    ``nu`` (its test-set values).
    """
    xtar = pipeline.Pipeline(
        [('scaler', preprocessing.StandardScaler()),
         ('pred', XTAR(fit_intercept=True, normalize=False))])

    estimators = {}
    for first, second in it.permutations(proxies, 2):
        estimators[f"xTAR ({first}, {second})"] = {
            'pipe': deepcopy(xtar),
            'drop_cols': proxies if drop_all else [first, second],
            'fit_params': {
                'pred__W': data['train'][first],
                'pred__Z': data['train'][second],
                'pred__nu': data['test'][first]}
        }

    return estimators


def construct_ar(data, proxies, drop_all=False):
    """Build one tunable "AR (<proxy>)" estimator per proxy.

    ``fit_params_dev`` carries the dev-set proxy used by get_best_lambda;
    ``fit_params`` carries the training-set proxy used for the final fit.
    """
    ar = pipeline.Pipeline(
        [('scaler', preprocessing.StandardScaler()),
         ('pred', AR(lamb=0, fit_intercept=True, normalize=False))])

    estimators = {}
    for prox in proxies:
        estimators[f"AR ({prox})"] = {
            'pipe': deepcopy(ar),
            'drop_cols': proxies if drop_all else [prox],
            'fit_params_dev': {
                'pred__A': data['dev'][prox],
            },
            'fit_params': {
                'pred__A': data['train'][prox],
            },
            'tune_lambda': True
        }

    return estimators


def construct_xar(data, proxies, drop_all=False):
    """Build one tunable "xAR (W, Z)" estimator per unordered pair of proxies."""
    xar = pipeline.Pipeline(
        [('scaler', preprocessing.StandardScaler()),
         ('pred', XAR(lamb=0, fit_intercept=True, normalize=False))])

    estimators = {}
    for first, second in it.combinations(proxies, 2):
        estimators[f"xAR ({first}, {second})"] = {
            'pipe': deepcopy(xar),
            'drop_cols': proxies if drop_all else [first, second],
            'fit_params_dev': {
                'pred__W': data['dev'][first],
                'pred__Z': data['dev'][second]
            },
            'fit_params': {
                'pred__W': data['train'][first],
                'pred__Z': data['train'][second]
            },
            'tune_lambda': True
        }

    return estimators


def get_estimator_X(X, est):
    """Return a copy of ``X`` without the columns listed in est['drop_cols'].

    If the estimator spec has no 'drop_cols' entry, a plain copy is returned.
    """
    if 'drop_cols' in est.keys():
        return X.drop(est['drop_cols'], axis=1)
    return X.copy()


def get_best_lambda(est, data):
    """Tune ``pred__lamb`` on the dev split and return the best parameters.

    Runs a grid search over LAMBDA_RANGE with leave-one-group-out
    cross-validation (groups are the dev-set seasons), scored by negative
    RMSE, and returns ``best_params_`` (a dict keyed by 'pred__lamb').
    """
    X = get_estimator_X(data['dev']['X'], est)
    y = data['dev']['y']
    G = data['dev']['G']

    fit_params = est['fit_params_dev']

    logo = ms.LeaveOneGroupOut()
    lamb_params = {'pred__lamb': LAMBDA_RANGE}

    est_cv = ms.GridSearchCV(
        est['pipe'], lamb_params, cv=logo,
        scoring='neg_root_mean_squared_error')
    est_cv = est_cv.fit(X, y, **fit_params, groups=np.ravel(G))

    return est_cv.best_params_
2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "###############################\n", 30 | "# Define a covariance matrix\n", 31 | "###############################\n", 32 | "\n", 33 | "cov = {'wx': 1, 'xy': 3, 'wy': 2, 'ww': 9, 'xx': 9, 'yy': 9}\n", 34 | "\n", 35 | "#############################\n", 36 | "# Choose Lambda\n", 37 | "#############################\n", 38 | "lda = 5" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Generate valid parameters" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "Here we generate valid parameterizations. Notable restrictions implied by the model\n", 53 | "1. $\\rho_W$ is lower bounded by the squared covariance of W, X\n", 54 | "2. $\\sigma_Y^2$ cannot go below zero" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "[13.11309058 5.79808822 8.08882119]\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "# Verify that the chosen covariance matrix is PD\n", 72 | "cov_mat = np.array(\n", 73 | " [[cov['xx'], cov['xy'], cov['wx']],\n", 74 | " [cov['xy'], cov['yy'], cov['wy']],\n", 75 | " [cov['wx'], cov['wy'], cov['ww']]])\n", 76 | "\n", 77 | "w, _ = np.linalg.eig(cov_mat)\n", 78 | "assert np.all(w > 0) # Verify that covariance is PD\n", 79 | "print(w)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "corr_wx = cov['wx'] / np.sqrt(cov['xx'] * cov['ww'])\n", 89 | "assert corr_wx < 1 and corr_wx > 0\n", 90 | "\n", 91 | "# Generate a range of feasible rho_w\n", 92 | "# The smallest rw corresponds to the largest rx, which is 1.\n", 93 | "min_rw = corr_wx**2\n", 94 | "rw = np.linspace(min_rw+0.001, 1, 1000)\n", 95 | "#rw = np.linspace(0.06, 1, 1000)\n", 96 | "\n", 97 | "# Generate all remaining 
values\n", 98 | "rx = corr_wx**2 / rw\n", 99 | "\n", 100 | "sx = cov['xx'] * (1 - rx)\n", 101 | "sw = cov['ww'] * (1 - rw)\n", 102 | "\n", 103 | "bw = np.sqrt(cov['ww'] * rw)\n", 104 | "bx = np.sqrt(cov['xx'] * rx)\n", 105 | "by = (1 / (bw * (1 - rx))) * (cov['wy'] - (cov['xy'] * cov['wx']) / cov['xx'])\n", 106 | "a = (cov['xy'] - by * bx) / cov['xx']\n", 107 | "\n", 108 | "sy = cov['yy'] - by**2 - (2*a*by*bx) - (a**2) * cov['xx']" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 5, 114 | "metadata": { 115 | "scrolled": true 116 | }, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "$\\rho_W$ cannot go below 0.012345679012345678\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "print(f\"$\\\\rho_W$ cannot go below {min_rw}\")" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 6, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "# Verify that all observed moments match\n", 137 | "e_wx = bw * bx\n", 138 | "assert np.all(np.isclose(e_wx, cov['wx']))\n", 139 | "\n", 140 | "e_xy = by * bx + a * cov['xx']\n", 141 | "assert np.all(np.isclose(e_xy, cov['xy']))\n", 142 | "\n", 143 | "e_wy = bw * (by + a * bx)\n", 144 | "assert np.all(np.isclose(e_wy, cov['wy']))\n", 145 | "\n", 146 | "e_ww = bw**2 + sw\n", 147 | "assert np.all(np.isclose(e_ww, cov['ww']))\n", 148 | "\n", 149 | "e_xx = bx**2 + sx\n", 150 | "assert np.all(np.isclose(e_xx, cov['xx']))\n", 151 | "\n", 152 | "e_yy = (a**2)*cov['xx'] + (2*a*by*bx) + by**2 + sy\n", 153 | "assert np.all(np.isclose(e_yy, cov['yy']))" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "Later, we will filter out all instances where `sy < 0`, which will remove some of these" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "## Compute solutions\n", 168 | "\n", 169 | "We compute the anchor regression 
solution (which relies upon the unidentified variables) for each parameterization, and can then compare this to the solution when a single proxy is used, which is fixed (because it uses only the observed distribution over X, Y, W)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 7, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "# Anchor Regression Solution\n", 179 | "g_a = ((a * bx**2 + bx * by)*(1 + lda) + a * sx) \\\n", 180 | " / ((bx**2 * (1 + lda) + sx))\n", 181 | "\n", 182 | "# Proxy Anchor Regression solution\n", 183 | "g_w = (cov['xy'] * cov['ww'] + lda * cov['wy']) \\\n", 184 | " / (cov['xx'] * cov['ww'] + lda * cov['wx']) " 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "We can then compute the expected loss, under the worst-case shift" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 8, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "def get_loss(g, shift, noise=True):\n", 201 | " loss = (((a - g) * bx + by)**2)*(shift) + ((a - g)**2) * sx\n", 202 | " if noise:\n", 203 | " loss = loss + sy\n", 204 | " return loss\n", 205 | "\n", 206 | "loss_g_a = get_loss(g_a, 1 + lda, noise=True)\n", 207 | "loss_g_w = get_loss(g_w, 1 + lda, noise=True)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 9, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "results = np.array([\n", 217 | " # Parameters\n", 218 | " rw, rx, sw, sx, sy, \n", 219 | " bw, bx, by, a, \n", 220 | " # AR solution\n", 221 | " g_a, \n", 222 | " # Loss under different models\n", 223 | " loss_g_a, loss_g_w, \n", 224 | " loss_g_w - loss_g_a]).T\n", 225 | "\n", 226 | "results_df = pd.DataFrame(results, columns=[\n", 227 | " 'rho_W', 'rho_X', 'sig_W', 'sig_X', 'sig_Y', \n", 228 | " 'b_W', 'b_X', 'b_Y', 'a', \n", 229 | " 'g_a', \n", 230 | " 'loss_g_a', 'loss_g_w', \n", 231 | " 'diff_loss_w_a'])\n", 232 | "\n", 
233 | "# Filter for parameterizations that are valid\n", 234 | "results_df = results_df.query('sig_Y > 0 & rho_X < 1')" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 10, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "name": "stdout", 244 | "output_type": "stream", 245 | "text": [ 246 | "After filtering out negative values of sigma_Y, the minimum $\\rho_W$ is 0.05186371556741927\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "print(f\"After filtering out negative values of sigma_Y, the minimum $\\\\rho_W$ is {results_df['rho_W'].min()}\")" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "## Plot results" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 11, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "def plot_vars(var_list, df, id_vars=['rho_W']):\n", 268 | " \n", 269 | " long_df = pd.melt(df, id_vars=id_vars)\n", 270 | " long_df.rename(columns = {'variable': 'Parameter'}, inplace=True)\n", 271 | " \n", 272 | " f, ax = plt.subplots(figsize=(6.5, 6.5))\n", 273 | " sns.lineplot(x='rho_W', y='value', hue='Parameter', \n", 274 | " data=long_df.query('Parameter in @var_list'),\n", 275 | " ax=ax)\n", 276 | " ax.set_xlabel('$\\\\rho_W$')\n", 277 | " \n", 278 | " return f, ax" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "### Figure 10(a)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 13, 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "data": { 295 | "image/png": "\n", 296 | "text/plain": [ 297 | "
" 298 | ] 299 | }, 300 | "metadata": { 301 | "needs_background": "light" 302 | }, 303 | "output_type": "display_data" 304 | } 305 | ], 306 | "source": [ 307 | "var_list = ['b_W', 'b_X', 'b_Y', 'a', 'sig_Y']\n", 308 | "f, ax = plot_vars(var_list, results_df)\n", 309 | "\n", 310 | "plt.legend(title='Parameters', loc='upper right', labels=['$\\\\sigma_Y$', '$\\\\beta_W$', '$\\\\beta_X$', '$\\\\beta_Y$', '$\\\\alpha$'])\n", 311 | "#plt.savefig(\"figs/id_example_parameters.pdf\")\n", 312 | "plt.show()" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": {}, 318 | "source": [ 319 | "### Figure 10(b)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 14, 325 | "metadata": {}, 326 | "outputs": [ 327 | { 328 | "data": { 329 | "image/png": "\n", 330 | "text/plain": [ 331 | "
" 332 | ] 333 | }, 334 | "metadata": { 335 | "needs_background": "light" 336 | }, 337 | "output_type": "display_data" 338 | } 339 | ], 340 | "source": [ 341 | "var_list = ['g_a']\n", 342 | "f, ax = plot_vars(var_list, results_df)\n", 343 | "\n", 344 | "# Plot the value of gamma when W is used \n", 345 | "ax.axhline(y=g_w, xmin=0, xmax=1, label='g_w', color='green')\n", 346 | "\n", 347 | "plt.legend(title='Parameters', loc='upper right')\n", 348 | " #colors=['green', 'blue'])\n", 349 | "#plt.savefig(\"figs/id_example_gamma.pdf\")\n", 350 | "plt.show()" 351 | ] 352 | } 353 | ], 354 | "metadata": { 355 | "kernelspec": { 356 | "display_name": "Python 3", 357 | "language": "python", 358 | "name": "python3" 359 | }, 360 | "language_info": { 361 | "codemirror_mode": { 362 | "name": "ipython", 363 | "version": 3 364 | }, 365 | "file_extension": ".py", 366 | "mimetype": "text/x-python", 367 | "name": "python", 368 | "nbconvert_exporter": "python", 369 | "pygments_lexer": "ipython3", 370 | "version": "3.7.9" 371 | } 372 | }, 373 | "nbformat": 4, 374 | "nbformat_minor": 4 375 | } 376 | -------------------------------------------------------------------------------- /synthetic-experiments/experiment1/experiment1-plot.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(dplyr) 3 | 4 | # Import and convert data 5 | df = read_csv("experiment1-data.csv") 6 | method.names = c("ar" = "AR(A)", "cross" = "xPAR(W, Z)", "par" = "PAR(W)", "ols" = "OLS") 7 | df$method <- factor(df$method, levels=names(method.names), labels=method.names) 8 | 9 | # Dataframe containing theoretical values 10 | df.theo <- df %>% subset(n == "theo") %>% select(method, value, x) 11 | 12 | # Create data frame containing finite sample values 13 | df <- df %>% 14 | subset(n != "theo" & method %in% c("xPAR(W, Z)", "PAR(W)")) %>% 15 | transform(n = factor(as.integer(n))) 16 | levels(df$n) <- paste0("$n = ", levels(df$n), "$") 17 | 18 | # Plot 19 | p 
<- ggplot(df, aes(x=x, y=value, colour=method, fill=method)) + 20 | geom_line(data=df.theo, aes(linetype=method, colour=method), size=0.8) + 21 | stat_summary(geom="line", fun=median, alpha=1, size=1)+ 22 | stat_summary(geom="ribbon", fun.data=median_hilow, fun.args=list(conf.int=0.5), size=0.1, alpha=0.1, show.legend = F) + 23 | coord_cartesian(ylim=c(0.65, 2.5)) + 24 | scale_linetype_manual(values=c("22", "26", "22", "22"), breaks = c("PAR(W)"), labels="", name="Population")+ 25 | scale_color_brewer(palette = "Dark2", name="Method", breaks = levels(df.theo$method), labels = method.names) + 26 | scale_fill_brewer(palette = "Dark2", name="Method", labels = method.names) + 27 | labs(y="MSPE under $do(A:=\\nu)$", 28 | x="Signal-to-variance ratio") + 29 | scale_x_continuous(breaks = c(0:4)/4, labels=c("0\\%", "25\\%", "50\\%", "75\\%", "100\\%")) + 30 | theme_bw(base_size=9) + 31 | theme(legend.position = "bottom", 32 | legend.spacing.y = unit(0.05, 'cm'), 33 | legend.spacing.x = unit(0.05, 'cm'), 34 | legend.margin=margin(c(-10,0,-5, 0)), 35 | legend.key.width = unit(0.3,"cm"), 36 | legend.title = element_text(size = 9), 37 | plot.margin = margin(0, 0, 0, 0)) + 38 | guides(color=guide_legend(order=2, title=NULL), 39 | linetype=guide_legend(order=1, override.aes = list(lty = "22"))) + 40 | facet_wrap(~n) 41 | print(p) 42 | -------------------------------------------------------------------------------- /synthetic-experiments/experiment1/experiment1-simulate.py: -------------------------------------------------------------------------------- 1 | ### Loading libraries 2 | import pandas as pd 3 | import numpy as np 4 | from tqdm import tqdm 5 | 6 | # Load file tools.py and population_estimators from parent folder 7 | import os, sys, inspect 8 | currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) 9 | parentdir = os.path.dirname(currentdir) 10 | sys.path.insert(0,parentdir) 11 | 12 | from tools import ols, ar, get_mse, cross, simulate 13 
from population_estimators import pack_params, gamma_par, gamma_ar, gamma_ols, gamma_cross, get_mse_v

"""
This file simulates data for the experiment in Section 5.1
See the appendix Section E.1 for details on this experiment
"""

### Set seed
np.random.seed(1)

### Dict with dimensions of variables
d = {"A": 3, "W": 3, 'Z': 3, "Y": 1, "X": 3, "H": 1}
# We store the joint dimension of the outcomes X, Y and H as d['O']
d['O'] = d['X'] + d['Y'] + d['H']

### Specify locations of parameters.
# E.g. c['X'] specifies indices of (Y, X, H) containing X
c = {"Y": [0], "X": [1, 2, 3]}

### Create parameter matrix M (d['O'] rows, d['A'] columns)
M = np.array([[1, 0, -2],
              [0, 2, 1],
              [-1, 3, 0],
              [2, 2, -3],
              [0, -2, 2]])

### Create parameter matrix B (only rows 0 and 3 are non-zero)
B = np.zeros((d['O'], d['O']))
B[0] = np.array([0, -2, 2, 0, 1])
B[3] = np.array([3, 0, 0, 0, 1])

# Pack parameters in dict
pars = {'M': M,
        'B': B,
        'beta': np.diag([1.0, 1, 1]),
        'beta_z': np.diag([1.0, 1, 1])}

# Fix lambda
lamb = 5


# 1) Simulate
results = []

# We label the population values with a number to avoid type errors in the
# numeric results array (they are computed by closed-form equations, not from
# a sample). The label is replaced by the string "theo" before saving.
theo = 1e9

# Select fixed intervention direction
v = np.array([[-4, 0.5, 1.0]]).T

# Normalize to unit length, scale to trust region boundary, and upscale by 20%
v = 1.2*v*np.sqrt(1+lamb)/np.sqrt(v.T@v)

# Select points over x-axis (0.05, 0.10, ..., 1.00)
x_ax = np.arange(1, 21)/20
for x in tqdm(x_ax):
    # Compute s^2 (the proxy noise variance; x = 1 gives s^2 = 0)
    s2 = (1-x)/x

    # Pack parameter inputs to population mean functions
    params = pack_params(pars, c, d, np.sqrt(s2))

    # We only compute the population version once at every value of s^2.
    # The order (ar, par, cross, ols) must match the keys of `gammas` below,
    # because those keys become the column names of the results table.
    results.append([theo, x] + [get_mse_v(gamma_ar(params, lamb), v, params, c),
                                get_mse_v(gamma_par(params, lamb), v, params, c),
                                get_mse_v(gamma_cross(params, lamb), v, params, c),
                                get_mse_v(gamma_ols(params), v, params, c)])

    # Loop over simulations
    for n in [250, 2500]:
        for _ in range(5000):
            # Simulate training data
            data = simulate(n, d, pars, noise_W=np.sqrt(s2), noise_Z=np.sqrt(s2))
            X, Y, A, W, Z = data['X'], data['Y'], data['A'], data['W'], data['Z']

            # Compute estimators from training data
            # ("par" is the same estimator as "ar", fitted on the proxy W
            # instead of the anchor A)
            gammas = {
                'ar': ar(X, Y, A, lamb=lamb),
                "par": ar(X, Y, W, lamb=lamb),
                "cross": cross(X, Y, W, Z, lamb=lamb),
                "ols": ols(X, Y)}

            # Simulate test data from intervention do(A:=v)
            test_data = simulate(n, d, pars, noise_W=np.sqrt(s2), noise_Z=np.sqrt(s2), v=v)

            # Append results
            results.append([n, x] + [get_mse(test_data, gamma)
                                     for gamma in gammas.values()])

# Store data in dataframe
df = pd.DataFrame(np.array(results), columns=["n", "x"] + list(gammas.keys()))
df = df.melt(["x", "n"], var_name="method")
# Encode population values as "theo" instead of 1e9 (a pandas column can hold
# the string next to the numbers, which the numeric numpy array could not)
df['n'] = df['n'].replace(theo, "theo")

# Plotting data is done with ggplot in R.
# See file 'experiment1-plot.R'
df.to_csv("experiment1-data.csv")
--------------------------------------------------------------------------------
/synthetic-experiments/experiment2/experiment2-plot.R:
--------------------------------------------------------------------------------
library(tidyverse)
library(dplyr)

# Load the simulation output written by experiment2-simulate.py
df <- read_csv("experiment2-data.csv")

# Assumed signal-to-variance ratio (drawn as a vertical reference line)
svr <- 0.4

# Human-readable legend labels, keyed by the raw variable names
var.names <- c(
  "actual" = "PAR($W$) MSPE, true",
  "belief" = "PAR($W$) MSPE, est.",
  "ols" = "OLS MSPE"
)
df$variable <- factor(df$variable, levels = names(var.names), labels = var.names)

# Median line with a 50% band per method, plus the assumed-SVR marker
p <- ggplot(df, aes(x = x, y = value, colour = variable, fill = variable, lty = "Default")) +
  stat_summary(geom = "line", fun = median, alpha = 1, size = 0.8, show.legend = TRUE) +
  stat_summary(
    data = df, geom = "ribbon", fun.data = median_hilow,
    fun.args = list(conf.int = 0.5), size = 0.00, alpha = 0.1, show.legend = FALSE
  ) +
  geom_vline(mapping = aes(xintercept = svr, lty = "Assumed SVR"), size = 0.6, show.legend = FALSE) +
  labs(y = "MSPE", x = "True signal-to-variance ratio") +
  scale_x_continuous(breaks = (1:4) / 4, labels = c("25\\%", "50\\%", "75\\%", "100\\%")) +
  scale_linetype_manual(values = c("22", "solid"), breaks = "Assumed SVR", name = NULL) +
  scale_color_brewer(palette = "Dark2", aesthetics = c("color", "fill"), name = NULL) +
  theme_bw(base_size = 9) +
  theme(
    legend.margin = margin(0, 0, 0, 0),
    legend.spacing.y = unit(3, "pt"),
    plot.margin = margin(0, 0, 0, 0)
  )
print(p)
--------------------------------------------------------------------------------
/synthetic-experiments/experiment2/experiment2-simulate.py:
--------------------------------------------------------------------------------
### Loading libraries
import pandas as pd
import numpy as np
from tqdm import tqdm

# Load file tools.py and population_estimators from parent folder
import os,
sys, inspect  # tail of the `import os, sys, inspect` statement begun on the previous line of the dump
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)

from tools import Id, ols, ar, inv, simulate


"""
This file simulates data for the experiment in Section 5.2
See the appendix Section E.2 for details on this experiment
"""

### Reproducibility
np.random.seed(1)

### Dimensions of every variable; d['O'] is the stacked outcome (Y, X, H)
d = {"A": 3, "W": 3, 'Z': 3, "Y": 1, "X": 3, "H": 1}
d['O'] = d['X'] + d['Y'] + d['H']

### Row indices of Y and X inside the stacked outcome (Y, X, H)
cY = [0]
cX = [1, 2, 3]

### Parameter matrix M
M = np.array([[1, 0, -2],
              [0, 2, 1],
              [-1, 3, 0],
              [2, 2, -3],
              [0, -2, 2]])

### Parameter matrix B: only rows 0 and 3 are non-zero
B = np.zeros((d['O'], d['O']))
B[0] = np.array([0, -2, 2, 0, 1])
B[3] = np.array([3, 0, 0, 0, 1])

# All model parameters in one dict, as expected by simulate()
pars = {
    'M': M,
    'B': B,
    'beta': np.diag([1.0, 1, 1]),
    'beta_z': np.diag([1.0, 1, 1]),
}

# Regularization strength
lamb = 5

# 1) Simulate
results = []

# (Id - B)^{-1}, needed for the actual worst-case losses
IB = inv(Id(d['O']) - B)

# Assumed signal-to-variance ratio: 40%
svr = 0.4


def worst_case_loss(gamma):
    """Return the actual worst-case loss of `gamma` as a 1x1 array.

    The worst-case intervention v is taken along b_gamma and scaled to
    squared length 1 + lamb*svr.
    """
    w_gamma = (IB[cY,] - gamma.T@IB[cX,]).T
    b_gamma = (w_gamma.T@M).T
    v = b_gamma*np.sqrt((1+lamb*svr)/(b_gamma.T@b_gamma))
    return (b_gamma.T@v)**2 + w_gamma.T@w_gamma


# Loop over the x-axis: 0.05, 0.10, ..., 1.00
x_ax = np.arange(1, 21)/20
for x in tqdm(x_ax):

    # Noise variance s^2 = (1-x)/x, where x is the actual signal-to-variance ratio
    s2 = (1-x)/x
    for n in [1000]:
        for _ in range(1000):
            # Simulate one data set
            data = simulate(n, d, pars, noise_W=np.sqrt(s2), noise_Z=np.sqrt(s2))
            X, Y, A, W, Z = data['X'], data['Y'], data['A'], data['W'], data['Z']

            # Believed worst-case loss of PAR, estimated from the sample
            gamma_par = ar(X, Y, W, lamb=lamb)
            R = (Y - gamma_par.T@X)
            WCL_belief = np.mean(R**2) + lamb*R@W.T@inv(W@W.T)@W@R.T/n

            # Actual worst-case loss of PAR
            WCL_actual = worst_case_loss(gamma_par)

            # Actual worst-case loss of OLS
            gamma_ols = ols(X, Y)
            WCL_ols = worst_case_loss(gamma_ols)

            # One result row per repetition
            results.append([n, x, WCL_belief[0,0], WCL_actual[0,0], WCL_ols[0,0]])

# Long-format data frame
df = pd.DataFrame(np.array(results), columns=("n", "x", "belief", "actual", "ols"))
df = df.melt(["x", "n"])

# Plotting data is done with ggplot in R.
# See file 'experiment2-plot.R'
df.to_csv("experiment2-data.csv")
--------------------------------------------------------------------------------
/synthetic-experiments/experiment3/experiment3-plot.R:
--------------------------------------------------------------------------------
library(ggplot2)
library(dplyr)
library(readr)
library(tikzDevice)

# Legend labels per method and axis labels per predictor
method.names <- c("par5" = "PAR($W$)", "cross5" = "xPAR($W,Z$)", "ar5" = "AR($A$)")
predictor.names <- c(
  "$X_{\\rm{causal}}^1$", "$X_{\\rm{causal}}^2$", "$X_{\\rm{causal}}^3$",
  "$X_{\\rm{anti-causal}}^1$", "$X_{\\rm{anti-causal}}^2$", "$X_{\\rm{anti-causal}}^3$"
)

# Read the simulation output and summarise |coefficient| per predictor and method.
# NOTE: inside transform() columns must be given as `name = value`; the
# previous `Causal <- factor(Causal)` was an assignment, so Causal was never
# converted and an unnamed extra column was appended instead.
df = read_csv("experiment3-data.csv") %>%
  transform(
    Method = factor(Method, levels = names(method.names)),
    Causal = factor(Causal)
  ) %>%
  subset(Method != "ar5") %>%
  group_by(X.coord, Method, Causal) %>%
  summarize(
    sd = sd(abs(Weight)),
    Weight = mean(abs(Weight)),
    ymin = Weight - sd,
    ymax = Weight + sd
  ) %>%
  ungroup()

# Horizontal dodged bars of mean |coefficient| with +/- one sd error bars
p <- ggplot(subset(df, Method != "ar5"), aes(x = X.coord, y = abs(Weight), fill = Method)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
  geom_errorbar(
    aes(ymin = ymin, ymax = ymax),
    stat = "identity", position = position_dodge(width = 0.9), size = 0.1, width = 0.25
  ) +
  coord_flip() +
  labs(x = NULL, y = "$|$Regression coefficients$|$") +
  scale_x_continuous(breaks = c(1:6), labels = predictor.names) +
  scale_fill_brewer(palette = "Dark2", breaks = c("cross5", "par5"), labels = as_labeller(method.names)) +
  theme_bw(base_size = 9) +
  theme(legend.key.size = unit(0.8, "line")) +
  guides(fill = guide_legend(title = NULL, override.aes = list(size = 0.3)))
print(p)
--------------------------------------------------------------------------------
/synthetic-experiments/experiment3/experiment3-simulate.py:
--------------------------------------------------------------------------------
### Loading
# libraries
import pandas as pd
import numpy as np
from tqdm import tqdm

# Load file tools from parent folder
import os, sys, inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)
from tools import Id, ar, cross, cb, simulate

"""
This file simulates data for the experiment in Section 5.3
See the appendix Section E.3 for details on this experiment
"""

### Set seed
np.random.seed(1)

### Dict with dimensions of variables
d = {"A": 6,
     "W": 6,
     'Z': 6,
     "Y": 1,
     "X": 6,
     "H": 1}
# We store the joint dimension of the outcomes X, Y and H as d['O']
d['O'] = d['X'] + d['Y'] + d['H']

### Specify locations of parameters.
# E.g. cX1 specifies indices of (Y, X, H) containing X1
#      cA2 specifies indices of A containing A2
cY = [0]
cX1 = [1, 2, 3]
cX2 = [4, 5, 6]
cA1 = [0, 1, 2]
cA2 = [3, 4, 5]

# Create parameter matrix M: A1 enters X1 and A2 enters X2, all with weight one.
# Each block is sized from its own index sets (the second block used to be
# sized from cX1/cA1, which only worked because both groups have three entries).
M = np.zeros((d['O'], d['A']))
M[np.ix_(cX1, cA1)] = np.ones((len(cX1), len(cA1)))
M[np.ix_(cX2, cA2)] = np.ones((len(cX2), len(cA2)))

# Create parameter matrix B: Y depends on X1 (weights 1/4), X2 depends on Y (weights 4)
B = np.zeros((d['O'], d['O']))
B[np.ix_(cY, cX1)] = [1/4, 1/4, 1/4]
B[np.ix_(cX2, cY)] = np.array([[4, 4, 4]]).T

# Pack parameters in dict
pars = {'M': M,
        'B': B,
        'beta': Id(d['A']),
        'beta_z': Id(d['A'])
        }

# 'noise' is handed to simulate() as noise_W, which multiplies it onto standard
# normal draws; the entries therefore act as noise scales (1 for the proxies
# of A1, 3 for the proxies of A2), i.e. the proxy of A2 is the noisier one.
noise = np.diag([1 for i in cA1] + [3 for i in cA2])


# 1) Simulate
n = 10000
# Per-repetition frames are collected here and concatenated once after the
# loop; concatenating inside the loop re-copies the growing frame every time.
frames = []

# Loop repeats experiment 1000 times
for _ in tqdm(range(1000)):
    # Simulate data
    data = simulate(n, d, pars, noise_W=noise)
    X, Y, A, W, Z = data['X'], data['Y'], data['A'], data['W'], data['Z']

    # Fit estimators
    par5 = ar(X, Y, W, lamb=5)
    cross5 = cross(X, Y, W, Z, lamb=5)
    ar5 = ar(X, Y, A, lamb=5)

    # One row per predictor, one column per estimator
    df = pd.DataFrame(cb(par5, cross5, ar5), columns=["par5", "cross5", "ar5"])

    # 'Causal' encodes whether the predictor is causal (first three) or not
    df['Causal'] = 3*[1] + 3*[0]
    # 'X.coord' encodes variable number (e.g. X_1, X_2, X_3, ...)
    df['X.coord'] = np.arange(1, 7)
    # Melt to long format and keep for the final concatenation
    frames.append(df.melt(id_vars=["Causal", "X.coord"], var_name="Method", value_name="Weight"))

# Overall results (same rows, order and index as repeated pairwise concatenation)
out = pd.concat(frames)

# Plotting data is done with ggplot in R.
# See file 'experiment3-plot.R'
out.to_csv("experiment3-data.csv")
--------------------------------------------------------------------------------
/synthetic-experiments/experiment4/experiment4-plot.R:
--------------------------------------------------------------------------------
library(ggplot2)
library(dplyr)

# Load the simulation output; mse is converted to numeric explicitly
# after reading
df <- read.csv2("experiment4-data.csv", sep = ",", stringsAsFactors = TRUE)
df$mse <- as.numeric(as.character(df$mse))

# Legend labels per method, and the order in which methods are drawn
method.names <- c(
  "ols" = "OLS",
  "ar" = "AR(A)",
  "tar" = "TAR(A)"
)
df$method <- factor(df$method, levels = c("tar", "ar", "ols"))

# Facet titles per simulation setup
setup.labeller <- c(
  "correct_shift" = "Anticipated shift occuring",
  "incorrect_shift" = "Anticipated shift not occuring"
)

# Overlaid density histograms of the MSPE, one facet per setup
p <- ggplot(df, aes(x = mse, color = method, fill = method)) +
  geom_histogram(aes(y = ..density..), position = "identity", bins = 100, alpha = 0.8) +
  facet_wrap(~setup, ncol = 1, labeller = as_labeller(setup.labeller)) +
  scale_fill_brewer(palette = "Dark2", name = "Method", labels = method.names) +
  scale_color_brewer(palette = "Dark2", name = "Method", labels = method.names) +
  theme_bw(base_size = 9) +
  theme(
    legend.title = element_blank(),
    panel.grid = element_blank(),
    legend.key.size = unit(0.8, "line"),
    legend.spacing.y = unit(0.1, "cm"),
    plot.margin = margin(0, 0, 0, 0)
  ) +
  labs(x = "MSPE", y = NULL)
print(p)
--------------------------------------------------------------------------------
/synthetic-experiments/experiment4/experiment4-simulate.py:
--------------------------------------------------------------------------------
### Loading libraries
import numpy as np
import pandas as pd
from tqdm import tqdm

# Load file tools.py and population_estimators from parent folder
import os, sys, inspect
currentdir =
os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)
from tools import Id, simulate, ols, ar, get_mse, tar

"""
This file simulates data for the experiment in Section 5.4
See the appendix Section E.4 for details on this experiment
"""


### Reproducibility
np.random.seed(1)


### Dimensions of every variable; d['O'] is the stacked outcome (Y, X, H)
d = {"A": 2, "W": 2, 'Z': 2, "Y": 1, "X": 2, "H": 1}
d['O'] = d['X'] + d['Y'] + d['H']

### Parameter matrix M
M = np.array([[2, 1],
              [0, 1],
              [2, 2],
              [0, 3]])

### Parameter matrix B
B = np.array([[ 0,    -0.06, 0.07, 0.04],
              [ 0.05,  0,    0.19, 0.03],
              [ 0.11, -0.11, 0,    0.1 ],
              [-0.02,  0.02, 0.09, 0]])

### All model parameters in one dict, as expected by simulate()
pars = {'M': M,
        'B': B,
        'beta': Id(d['A'], d['W'])}


### Anticipated distribution shift of A: covariance rotat @ rotat.T and mean `shift`
rotat = np.diag([np.sqrt(2), 1])
shift = np.array([0, 2])
cov_shifted = rotat@rotat.T
# Both setups target the same anticipated shift; they differ only in whether
# the test data is actually drawn from it.
sim_setups = {
    "incorrect_shift": {"eta_tar": shift, "eta_sim": np.zeros(2),
                        "cov_A_tar": cov_shifted, "cov_A_sim": Id(d['A'])},
    "correct_shift": {"eta_tar": shift, "eta_sim": shift,
                      "cov_A_tar": cov_shifted, "cov_A_sim": cov_shifted}
}
# We select lambda such that B B.T + eta eta.T <= (1+lambda) Id  (EAA.T = Id)
eta = shift.reshape(-1, 1)
lamb = np.linalg.eigvals(cov_shifted + eta@eta.T).max() - 1

### Simulate
results = []
n = 10000  # training size
m = 10000  # test size
for i in tqdm(range(10000)):
    # Training data
    data = simulate(n, d, pars)
    A, X, Y, W, Z = data['A'], data['X'], data['Y'], data['W'], data['Z']

    # Fit OLS and anchor regression; lamb is set to 4 here, replacing the
    # value computed before the loop
    lamb = 4
    gammas = {"ols": ols(X, Y),
              "ar": ar(X, Y, A, lamb=lamb)}

    for setup, s in sim_setups.items():
        # Settings of this setup
        eta_tar = s["eta_tar"]
        eta_sim = s["eta_sim"]
        cov_A_tar = s["cov_A_tar"]
        cov_A_sim = s["cov_A_sim"]

        # Targeted estimator for the anticipated shift
        gamma_tar, alpha_tar = tar(X, Y, A, Sigma = cov_A_tar, nu=eta_tar)

        # Test data drawn with the setup's actual mean and covariance of A
        _data = simulate(m, d, pars, v = eta_sim, cov_A = cov_A_sim)

        # One result row per repetition and setup
        results.append([
            get_mse(_data, gammas['ols']),
            get_mse(_data, gammas['ar']),
            get_mse(_data, gamma_tar, alpha=alpha_tar),
            setup
        ])

# Long-format data frame
df = pd.DataFrame(results, columns=["ols", "ar", "tar", "setup"]).melt(id_vars = "setup", value_name="mse", var_name="method")

# Plotting data is done with ggplot in R. See file 'experiment4-plot.R'
df.to_csv("experiment4-data.csv")
# --------------------------------------------------------------------------------
# /synthetic-experiments/population_estimators.py:
# --------------------------------------------------------------------------------
import numpy as np
inv = np.linalg.inv; Id = lambda d, d2=None: np.eye(d, d2)
"""
For some experiments we need the population versions of estimators instead of sample estimators.
This file has functions for such estimators, based on moments that are derived from parameter matrices.
"""

def pack_params(pars, c, d, noise_W):
    """
    The population estimators are computed based on moments.
    We compute the moments based on the parameters M, B and the noise distributions.

    In the naming below, the variable XW corresponds to E[XW^T] etc.
    We let O denote the stacked outcome (Y, X, H)
    """
    # Unpack inputs. c holds the row indices of Y and X inside O (e.g. O_1 is Y, O_2 is X_1,...)
def pack_params(pars, c, d, noise_W):
    """
    Compute the population moments that the population estimators are built from.

    The moments follow from the parameter matrices and the noise distributions.
    In the naming below, the variable XW corresponds to E[XW^T] etc.
    We let O denote the stacked outcome (Y, X, H).

    pars    : dict with 'M', 'B', 'beta' and optionally 'beta_z'
              (if 'beta_z' is missing, 'beta' is used for Z as well, as in simulate())
    c       : dict with the row indices of Y ('Y') and X ('X') inside O
    d       : dict with the dimensions 'O', 'W' and 'A'
    noise_W : scalar (W noise is noise_W * Id) or matrix (W noise is noise_W @ N)

    Returns a dict with the moments, the matrix IB = (Id - B)^{-1} and M.
    """
    # Unpack inputs. c holds the row indices of Y and X inside O (e.g. O_1 is Y, O_2 is X_1,...)
    cY = c['Y']; cX = c['X']
    beta, M, B = pars['beta'], pars['M'], pars['B']
    # The second proxy Z may have its own loading; previously it was silently
    # assumed to equal beta even when 'beta_z' was supplied.
    beta_z = pars.get('beta_z', beta)

    # Store the inverse of the matrix (Id - B)
    IB = inv(Id(d['O']) - B)

    # Compute moments relating to the outcome O (E[AA^T] = Id, unit outcome noise)
    OA = IB@M
    OW = IB@M@beta
    OZ = IB@M@beta_z
    OO = IB@(M@M.T + Id(d['O']))@IB.T

    # Compute moment E[WW^T]
    if len(np.shape(noise_W)) == 0:
        WW = beta.T@beta + noise_W**2*Id(d['W'])
    else:
        WW = beta.T@beta + noise_W@noise_W.T

    # Compute covariance of A and cross proxies
    AA = Id(d['A'])
    ZW = beta_z.T@beta
    # Covariances relating to X
    XX = OO[cX][:,cX]; XY = OO[cX][:,cY]; XW = OW[cX]; XA = OA[cX]; XZ = OZ[cX]
    # Covariances relating to Y
    YW = OW[cY]; YA = OA[cY]; YZ = OZ[cY]

    # Return dict with all moments
    return {"IB": IB, "OA": OA, "OW":OW, "OO":OO, "WW":WW, "AA":AA, "ZW":ZW,"XX":XX,
            "XY":XY, "XW":XW, "XA":XA, "XZ":XZ,"YW":YW, "YA":YA, "YZ":YZ, "M":M}

# OLS
def gamma_ols(params):
    """Population OLS coefficient E[XX^T]^{-1} E[XY^T]."""
    # Unpack moments
    XX, XY = params['XX'], params['XY']
    # Return estimator based on moments
    return inv(XX)@XY

# Proxy anchor regression
def gamma_par(params, lamb):
    """Population anchor regression coefficient with the proxy W as anchor."""
    # Unpack moments
    XX, XW, WW, XY, YW = params['XX'], params['XW'], params['WW'], params['XY'], params['YW']
    # lamb * E[XW^T] E[WW^T]^{-1}, shared by both factors (WW is inverted once)
    penalty = lamb*XW@inv(WW)
    # Return estimator based on moments
    return inv(XX + penalty@XW.T)@(XY + penalty@YW.T)

# Anchor regression
def gamma_ar(params, lamb):
    """Population anchor regression coefficient with A itself as anchor."""
    # Unpack moments
    XX, AA, XY, XA, YA = params['XX'], params['AA'], params['XY'], params['XA'], params['YA']
    # lamb * E[XA^T] E[AA^T]^{-1}, shared by both factors (AA is inverted once)
    penalty = lamb*XA@inv(AA)
    # Return estimator based on moments
    return inv(XX + penalty@XA.T)@(XY + penalty@YA.T)

def gamma_cross(params, lamb):
    """Population cross-proxy coefficient using both proxies W and Z."""
    # Unpack moments
    XX, XW, ZW, XZ, XY, YW, YZ = params['XX'], params['XW'], params['ZW'], params['XZ'], params['XY'], params['YW'], params['YZ']
    # ZW is inverted once and reused in all four terms
    ZW_inv = inv(ZW)
    left = XW@ZW_inv
    right = XZ@ZW_inv.T
    # Compute "denominator" (left-side inverse)
    denom = 2*XX + lamb*(left@XZ.T + right@XW.T)
    # Compute "numerator"
    num = 2*XY + lamb*(left@YZ.T + right@YW.T)
    return inv(denom)@num

def get_mse_v(gamma, v, params, c):
    """Compute the population mse of using an estimator gamma when A is fixed to v"""
    # Unpack
    M, IB = params['M'], params['IB']
    cY, cX = c['Y'], c['X']
    # Compute w_gamma
    w_gamma = (IB[cY,] - gamma.T@IB[cX,]).T
    # Output population MSE
    return (w_gamma.T@M@v@v.T@M.T@w_gamma + w_gamma.T@w_gamma)[0,0]

# --------------------------------------------------------------------------------
# /synthetic-experiments/tools.py:
# --------------------------------------------------------------------------------
import numpy as np

"""
This file contains implementations of estimators and a function to simulate data, used for experiments in the paper
"""

### Define convenience functions
inv = np.linalg.inv
norm = np.linalg.norm

def Id(d, d2=None):
    """Identity matrix of shape d x d (or d x d2)."""
    return np.eye(d, d2)

# Column and row bind
def cb(*args): return np.concatenate(args, axis=1)
def rb(*args): return np.concatenate(args, axis=0)

# Multivariate gaussian
def N(d, n=1):
    """d x n matrix of independent standard normal draws."""
    return np.random.normal(size=(d, n))

### Simulate function
def simulate(n, d, pars, v=None, shift=0, cov_A=None, noise_W=None, noise_Z=None):
    """
    Simulate data from parameters. All outputs have one column per sample (d_A x n, etc.).

    n       : number of samples
    d       : dict with the dimensions 'A', 'W', 'Z', 'X', 'Y', 'O'
    pars    : dict with 'M', 'B', 'beta' and optionally 'beta_z' (defaults to 'beta')
    v       : if cov_A is None, the fixed value of A in every sample (A is standard
              normal when v is None); if cov_A is given, the mean of A (zero when v is None)
    shift   : added to A (only when cov_A is None)
    cov_A   : optional covariance matrix of A
    noise_W : scalar or matrix multiplied onto the standard normal noise of W (default Id)
    noise_Z : same for Z (default: the noise of W)

    Returns a dict with the arrays 'A', 'W', 'Y', 'X', 'H', 'Z'.
    """
    # Unpack
    d_A = d['A']; d_W = d['W']; d_X = d['X']; d_Y = d['Y']; d_O = d['O']; d_Z = d['Z']
    M = pars['M']; B = pars['B']; beta = pars['beta']

    # If no noise is provided, use spherical unit variance as proxy noise
    if noise_W is None: noise_W = Id(d_W)
    elif len(np.shape(noise_W)) == 0: noise_W = noise_W*Id(d_W)
    # If no noise for secondary proxy is supplied, use the same as W
    if noise_Z is None: noise_Z = noise_W
    elif len(np.shape(noise_Z)) == 0: noise_Z = noise_Z*Id(d_Z)

    # If no parameter beta_z is provided, use same as beta_W
    beta_z = pars.get('beta_z', beta)

    if cov_A is not None:
        # Gaussian A with the given covariance. The mean defaults to zero, and a
        # column-vector v is flattened, since multivariate_normal needs a 1-d mean
        # (both cases used to raise).
        mean_A = np.zeros(d_A) if v is None else np.reshape(v, d_A)
        A = np.random.multivariate_normal(mean_A, cov=cov_A, size=n).T
    else:
        # Use either the intervention v tiled several times (fixed A), or a mean-zero gaussian
        A = (N(d_A, n) if v is None else np.tile(np.reshape(v, (d_A, 1)), n)) + shift
    # Compute the outcome O = (Y, X, H)
    O = inv(Id(d['O'])-B)@(M@A + N(d_O, n))
    Y, X, H = np.split(O, [d_Y, d_Y+d_X])
    # Simulate proxies
    W = beta.T@A + noise_W@N(d_W, n)
    Z = beta_z.T@A + noise_Z@N(d_Z, n)
    return {'A': A, 'W': W, 'Y': Y, 'X': X, 'H': H, 'Z': Z}

# Mean function
def E(X):
    """Row-wise mean of X, returned as a column vector."""
    return X.mean(axis=1).reshape(-1, 1)

### Estimators
# Ordinary least squares
def ols(X, Y, intercept=False):
    """OLS coefficient of Y on X (samples in columns); optionally prepend a row of ones to X."""
    if intercept:
        X = np.concatenate((np.ones((1, X.shape[1])), X))
    return inv(X@X.T)@X@Y.T

# Anchor regression estimator
def ar(X, Y, A, lamb=1, intercept=False):
    """Anchor regression coefficient of Y on X with anchor A and penalty lamb."""
    if intercept:
        X = np.concatenate((np.ones((1, X.shape[1])), X))
    # lamb * X A^T (A A^T)^{-1} A, computed once and shared by both factors
    penalty = lamb*X@A.T@inv(A@A.T)@A
    return inv(X@X.T + penalty@X.T)@(X@Y.T + penalty@Y.T)

# Cross estimator
def cross(X, Y, W, Z, lamb=1):
    """Cross-proxy estimator using the two proxies W and Z."""
    ZW_inv = inv(Z@W.T)
    left = X@W.T@ZW_inv@Z
    right = X@Z.T@ZW_inv.T@W
    denom = 2*X@X.T + lamb*(left@X.T + right@X.T)
    num = 2*X@Y.T + lamb*(left@Y.T + right@Y.T)
    return inv(denom)@num

# Targeted anchor regression, targeted to covariance Sigma and mean shift nu
def tar(X, Y, A, Sigma, nu=0):
    """
    Targeted anchor regression.

    Returns (gamma, alpha): the coefficient vector and the offset that
    get_mse() subtracts from the residual. A scalar nu is expanded to a
    d_A x 1 column.
    """
    # Get dimensions
    d_A, n = A.shape
    if len(np.shape(nu)) == 0:
        nu = np.tile(nu, d_A).reshape(d_A, 1)

    # (A A^T)^{-1} is needed several times; invert once
    AAt = A@A.T
    AAt_inv = inv(AAt)
    # X A^T (AA^T)^{-1} (Sigma - AA^T/n) (AA^T)^{-1} A, shared by both factors
    target = X@A.T@AAt_inv@(Sigma - AAt/n)@AAt_inv@A

    # Compute alpha and gamma
    gamma = inv(X@X.T/n + target@X.T)@(X@Y.T/n + target@Y.T)
    alpha = (Y - gamma.T@X)@A.T@AAt_inv@nu
    return gamma, alpha

# IV estimator
def iv(X, Y, A):
    """Instrumental variable estimator of Y on X with instrument A."""
    AAt_inv = inv(A@A.T)
    return inv(X@A.T@AAt_inv@A@X.T)@X@A.T@AAt_inv@A@Y.T

# Function to evaluate the prediction MSE of a dataset and some gamma
def get_mse(data, gamma, alpha = 0):
    """Mean squared residual of Y - gamma^T X - alpha on `data`."""
    return ((data['Y'] - gamma.T@data['X'] - alpha)**2).mean()
# --------------------------------------------------------------------------------