ECON526
University of British Columbia
\[ \def\Er{{\mathrm{E}}} \def\En{{\mathbb{E}_n}} \def\cov{{\mathrm{Cov}}} \def\var{{\mathrm{Var}}} \def\R{{\mathbb{R}}} \def\indep{{\perp\!\!\!\perp}} \newcommand\norm[1]{\left\lVert#1\right\rVert} \def\rank{{\mathrm{rank}}} \newcommand{\inpr}{ \overset{p^*_{\scriptscriptstyle n}}{\longrightarrow}} \def\inprob{{\,{\buildrel p \over \rightarrow}\,}} \def\indist{\,{\buildrel d \over \rightarrow}\,} \DeclareMathOperator*{\plim}{plim} \DeclareMathOperator*{\argmin}{argmin} \]
```python
import numpy as np
import pandas as pd

def pfizerrctdata():
    # counts from the Pfizer vaccine RCT: n = participants, y = infections,
    # suffix "o" = over-65 subgroup, 1 = treated, 0 = placebo
    n1 = 19965
    n0 = 20172
    y1 = 9
    y0 = 169
    n1o = 4044
    n0o = 4067
    y1o = 1
    y0o = 19
    over65 = np.zeros(n1 + n0)
    over65[0:(n1o + n0o)] = 1
    treat = np.concatenate([np.ones(n1o), np.zeros(n0o), np.ones(n1 - n1o), np.zeros(n0 - n0o)])
    infected = np.concatenate([np.ones(y1o), np.zeros(n1o - y1o), np.ones(y0o), np.zeros(n0o - y0o),
                               np.ones(y1 - y1o), np.zeros(n1 - n1o - (y1 - y1o)),
                               np.ones(y0 - y0o), np.zeros(n0 - n0o - y0 + y0o)])
    data = pd.DataFrame({'treat': treat, 'infected': infected, 'over65': over65})
    return data

data = pfizerrctdata()
```
```python
data.groupby(['over65', 'treat']).mean()
```

| over65 | treat | infected |
|---|---|---|
| 0.0 | 0.0 | 0.009314 |
| 0.0 | 1.0 | 0.000502 |
| 1.0 | 0.0 | 0.004672 |
| 1.0 | 1.0 | 0.000247 |
```python
def ATE(data, treatment='treat', y='infected'):
    # difference in means between treated and control, with its standard error
    means = data.groupby(treatment)[y].mean()
    ATE = means[1] - means[0]
    se = sum(data.groupby(treatment)[y].var() / data.groupby(treatment)[y].count())**0.5
    return ATE, se

ate = ATE(data.loc[data['over65'] == 1, :])
print(f"difference in means = {ate[0]}, SE = {ate[1]}")
```

difference in means = -0.0044244682964888074, SE = 0.0010976149240857532
OLS SE: 0.0011004170544556028, manual SE: 0.0010976149240857532
- But the standard errors are very slightly different
- The default in `smf.ols` assumes homoskedasticity, \(\Er[\epsilon^2|X] = \sigma^2\)
- With \(y\) and \(T\) binary, \(\Er[\epsilon^2|T] = P(y=1|T)(1-P(y=1|T))\), which varies with \(T\)
OLS HC SE: 0.0010977499295368124, manual SE: 0.0010976149240857532
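The comparison above presumably comes from regressing `infected` on `treat` with the default and a heteroskedasticity-robust covariance estimator; a minimal sketch, assuming the over-65 subsample and an HC3 covariance (the exact HC variant used above is not shown):

```python
import statsmodels.formula.api as smf

sub = data.loc[data['over65'] == 1, :]
ols_fit = smf.ols("infected ~ treat", data=sub).fit()               # homoskedastic SEs
hc_fit = smf.ols("infected ~ treat", data=sub).fit(cov_type="HC3")  # heteroskedasticity-robust SEs
print(f"OLS SE: {ols_fit.bse['treat']}, HC SE: {hc_fit.bse['treat']}")
```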
- Regression being a linear approximation to \(\Er[Y|X]\) does not mean \(\beta\) necessarily has the sign you want
- In the example below, \(\Er[Y|x_1=1, x_2] > \Er[Y|x_1=0,x_2]\) for all \(x_2\), but \(\beta_1 < 0\)
```python
import numpy as np

n = 100_000

def EYX(x):
    # E[Y|x1,x2] is increasing in x1 everywhere, but jumps down when x2 > 0
    return (-1.0 + 1.0 * x[1]) if x[2] > 0 else (1.0 + 0.1 * x[1])

x2 = np.random.randn(n)
x1 = (x2 * 3 + np.random.randn(n) > 0).astype(float)   # x1 is positively correlated with x2
X = np.column_stack((np.ones(n), x1, x2))
y = np.array([EYX(row) for row in X]) + np.random.randn(n)
beta = np.linalg.lstsq(X, y, rcond=None)[0]
print(f"beta_1 = {beta[1]:.3}")
mean_dEydx2 = np.mean([EYX([1, 1, x2i]) - EYX([1, 0, x2i]) for x2i in x2])
print(f"mean( E[y|x1=1,x2] - E[y|x1=0,x2] ) = {mean_dEydx2:.3}")
```

beta_1 = -0.163
mean( E[y|x1=1,x2] - E[y|x1=0,x2] ) = 0.548
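The sign flip comes from forcing a linear-in-\(x_2\) approximation onto a conditional mean that jumps at \(x_2 = 0\). As an added sketch (not in the original), a specification flexible enough to contain the true \(\Er[Y|X]\), adding an indicator for \(x_2 > 0\) and its interaction with \(x_1\), recovers positive coefficients on \(x_1\):

```python
# include 1{x2 > 0} and its interaction with x1, so the true E[Y|X] lies in the model's span
d = (x2 > 0).astype(float)
Xflex = np.column_stack((np.ones(n), x1, x2, d, x1 * d))
bflex = np.linalg.lstsq(Xflex, y, rcond=None)[0]
# the coefficient on x1 should be near 0.1 and on x1*d near 0.9, both positive
print(f"beta_1 = {bflex[1]:.3}, beta_(x1*d) = {bflex[4]:.3}")
```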
```python
import os
import requests

# download the 2023 CPS Merged Outgoing Rotation Group extract if it is not already saved
url = 'https://www.nber.org/morg/annual/morg23.dta'
local_filename = 'data/morg23.dta'
if not os.path.exists(local_filename):
    response = requests.get(url)
    with open(local_filename, 'wb') as file:
        file.write(response.content)

cps = pd.read_stata(local_filename)
cps["female"] = (cps.sex == 2)
cps["log_earn"] = np.log(cps["earnwke"])
cps["log_uhours"] = np.log(cps.uhourse)
cps["log_hourslw"] = np.log(cps.hourslw)
cps.replace(-np.inf, np.nan, inplace=True)   # zero hours or earnings give -inf logs
cps["nevermarried"] = cps.marital == 7
cps["wasmarried"] = (cps.marital >= 4) & (cps.marital <= 6)
cps["married"] = cps.marital <= 3
```
```python
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col

lm = list()
lm.append(smf.ols(formula="log_earn ~ female", data=cps,
                  missing="drop").fit(cov_type='HC3'))
lm.append(smf.ols(formula="log_earn ~ female + log_hourslw + log_uhours", data=cps,
                  missing="drop").fit(cov_type='HC3'))
lm.append(smf.ols(formula="log_earn ~ female + log_hourslw + log_uhours + wasmarried + married", data=cps,
                  missing="drop").fit(cov_type='HC3'))
lm.append(smf.ols(formula="log_earn ~ female*(wasmarried+married) + log_hourslw + log_uhours", data=cps,
                  missing="drop").fit(cov_type='HC3'))
summary_col(lm, stars=True, model_names=[f"{i+1}" for i in range(len(lm))])
```

| | 1 | 2 | 3 | 4 |
|---|---|---|---|---|
| Intercept | 6.9948*** | 2.2378*** | 2.2410*** | 2.2166*** |
| | (0.0031) | (0.0295) | (0.0280) | (0.0280) |
| female[T.True] | -0.2772*** | -0.1318*** | -0.1340*** | -0.0715*** |
| | (0.0046) | (0.0038) | (0.0038) | (0.0061) |
| log_hourslw | | 0.0424*** | 0.0443*** | 0.0434*** |
| | | (0.0077) | (0.0076) | (0.0075) |
| log_uhours | | 1.2623*** | 1.2113*** | 1.2108*** |
| | | (0.0119) | (0.0115) | (0.0115) |
| wasmarried[T.True] | | | 0.1474*** | 0.1831*** |
| | | | (0.0058) | (0.0089) |
| married[T.True] | | | 0.2896*** | 0.3361*** |
| | | | (0.0041) | (0.0055) |
| female[T.True]:wasmarried[T.True] | | | | -0.0727*** |
| | | | | (0.0118) |
| female[T.True]:married[T.True] | | | | -0.0988*** |
| | | | | (0.0080) |
| R-squared | 0.0287 | 0.3632 | 0.3898 | 0.3906 |
| R-squared Adj. | 0.0287 | 0.3631 | 0.3898 | 0.3906 |
\[y_i = x_i \beta + w_i'\gamma + u_i\]

- Can equivalently calculate \(\beta\) by partialling out \(w\): regress \(y\) on \(w\), regress \(x\) on \(w\), then regress the \(y\)-residuals on the \(x\)-residuals (the Frisch-Waugh-Lovell theorem)
```python
# partial log hours out of both log earnings and the female indicator, then regress residuals
ey = smf.ols(formula="log_earn ~ log_hourslw + log_uhours", data=cps, missing="drop").fit().resid
ex = smf.ols(formula="I(1*female) ~ log_hourslw + log_uhours", data=cps, missing="drop").fit().resid
edf = pd.concat([ex, ey], axis=1)
edf.columns = ['ex', 'ey']
smf.ols('ey ~ ex', data=edf).fit(cov_type="HC3").params['ex']
```

np.float64(-0.13176431840840097)
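As a check (an added sketch, not in the original), the partialled-out slope should equal the coefficient on `female` in model 2 above:

```python
# the Frisch-Waugh-Lovell slope equals the female coefficient from the full regression
print(smf.ols('ey ~ ex', data=edf).fit(cov_type="HC3").params['ex'])
print(lm[1].params['female[T.True]'])
```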
If we want \[ y_i = \beta_0 + x_i \beta + w_i'\gamma + u_i \] but only regress \(y\) on \(x\), then \[ \hat{\beta}^s = \hat{\beta} + \frac{ \sum (x_i - \bar{x})w_i'}{\sum (x_i - \bar{x})^2} \hat{\gamma} \] and \[ \hat{\beta}^s \inprob \beta + \frac{ \Er[(x_i - \Er[x])w_i']}{\var(x_i)} \gamma \]
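A small simulated check of the short-regression identity (an added sketch; the data generating process below is made up for illustration):

```python
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
n = 10_000
w = rng.normal(size=n)
x = 0.5 * w + rng.normal(size=n)                    # x is correlated with the omitted w
y = 1.0 + 2.0 * x + 3.0 * w + rng.normal(size=n)    # beta = 2, gamma = 3

long = sm.OLS(y, sm.add_constant(np.column_stack([x, w]))).fit()   # y on x and w
short = sm.OLS(y, sm.add_constant(x)).fit()                        # y on x only
# identity: short beta = long beta + (sum (x - xbar) w / sum (x - xbar)^2) * gamma hat
implied = long.params[1] + np.cov(x, w)[0, 1] / np.cov(x, w)[0, 0] * long.params[2]
print(f"short beta = {short.params[1]:.4f}, implied by the formula = {implied:.4f}")
```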
- Useful for: reasoning about the likely direction and magnitude of omitted variables bias when a relevant \(w\) is left out
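Returning to the wage regressions, the fixed-effects results printed below look like output from `pyfixest`'s `feols`; a sketch of a call that could produce them (the package, the formula, and the `vcov` choice are assumptions inferred from the printed output):

```python
import pyfixest as pf

# boolean dummies may need to be numeric for the labels to match the output below (an assumption)
for c in ["female", "wasmarried", "married"]:
    cps[c] = cps[c].astype(float)

fit = pf.feols(
    "log_earn ~ female*(wasmarried + married) + log_hourslw + log_uhours + age + I(age**2)"
    " | race + grade92 + unionmme + unioncov + ind17 + occ18",
    data=cps, vcov="hetero")
fit.summary()
```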
###
Estimation: OLS
Dep. var.: log_earn, Fixed effects: race+grade92+unionmme+unioncov+ind17+occ18
Inference: hetero
Observations: 104607
| Coefficient | Estimate | Std. Error | t value | Pr(>|t|) | 2.5% | 97.5% |
|:------------------|-----------:|-------------:|----------:|-----------:|-------:|--------:|
| female | -0.084 | 0.006 | -14.684 | 0.000 | -0.095 | -0.073 |
| wasmarried | 0.082 | 0.009 | 9.532 | 0.000 | 0.065 | 0.099 |
| married | 0.126 | 0.005 | 23.115 | 0.000 | 0.115 | 0.137 |
| log_hourslw | 0.044 | 0.007 | 6.623 | 0.000 | 0.031 | 0.058 |
| log_uhours | 0.988 | 0.011 | 93.299 | 0.000 | 0.967 | 1.009 |
| age | 0.003 | 0.000 | 19.689 | 0.000 | 0.003 | 0.003 |
| I(age ** 2) | -0.000 | 0.000 | -1.170 | 0.242 | -0.000 | 0.000 |
| female:wasmarried | -0.059 | 0.011 | -5.485 | 0.000 | -0.080 | -0.038 |
| female:married | -0.069 | 0.007 | -9.695 | 0.000 | -0.083 | -0.055 |
---
RMSE: 0.524 R2: 0.582 R2 Within: 0.314
- Consider an RCT with outcome \(Y\), treatment \(T\), and other covariates \(X\) (in the simulation below, \(D\) and \(Z\))
- Should we estimate the ATE in a regression that also includes \(X\)?
```python
np.random.seed(54)
n = 1000                                  # sample size
Z = np.random.normal(size=n)              # generate Z
Y0 = -Z + np.random.normal(size=n)        # conditional average baseline response is -Z
Y1 = Z + np.random.normal(size=n)         # conditional average treated response is +Z, so the CATE is 2Z
D = np.random.binomial(1, .2, size=n)     # treatment indicator; only 20% get treated
Y = Y1 * D + Y0 * (1 - D)                 # observed Y
Z = Z - Z.mean()                          # demean Z
data = pd.DataFrame({"Y": Y, "D": D, "Z": Z})
print(f"Unobservable sample ATE = {np.mean(Y1-Y0):.3}")
```

Unobservable sample ATE = 0.072
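The "simple" and "additive" columns below presumably come from regressing \(Y\) on \(D\) alone and on \(D\) and \(Z\); a sketch (the list name, model names, and HC3 covariance are assumptions):

```python
lm_rct = [
    smf.ols("Y ~ D", data=data).fit(cov_type="HC3"),      # simple difference in means
    smf.ols("Y ~ D + Z", data=data).fit(cov_type="HC3"),  # additive in Z
]
summary_col(lm_rct, model_names=["simple", "additive"])
```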
| | simple | additive |
|---|---|---|
| Intercept | -0.0924 | -0.0789 |
| | (0.0501) | (0.0386) |
| D | -0.0370 | -0.1046 |
| | (0.1166) | (0.1440) |
| Z | | -0.5630 |
| | | (0.0561) |
| R-squared | 0.0001 | 0.1551 |
| R-squared Adj. | -0.0009 | 0.1534 |
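The "interactive" column in the next table adds the interaction of \(D\) with the demeaned \(Z\); continuing the sketch above (again with assumed names and covariance choice):

```python
lm_rct.append(smf.ols("Y ~ Z*D", data=data).fit(cov_type="HC3"))  # fully interacted, Z demeaned
summary_col(lm_rct, model_names=["simple", "additive", "interactive"])
```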
| | simple | additive | interactive |
|---|---|---|---|
| Intercept | -0.0924 | -0.0789 | -0.0681 |
| | (0.0501) | (0.0386) | (0.0352) |
| D | -0.0370 | -0.1046 | 0.0463 |
| | (0.1166) | (0.1440) | (0.0766) |
| Z | | -0.5630 | -1.0111 |
| | | (0.0561) | (0.0341) |
| Z:D | | | 2.1349 |
| | | | (0.0748) |
| R-squared | 0.0001 | 0.1551 | 0.5245 |
| R-squared Adj. | -0.0009 | 0.1534 | 0.5231 |
```python
filename = 'data/collections_email.csv'
url = 'https://raw.githubusercontent.com/matheusfacure/python-causality-handbook/refs/heads/master/causal-inference-for-the-brave-and-true/data/collections_email.csv'
if not os.path.exists(filename):
    response = requests.get(url)
    with open(filename, 'wb') as file:
        file.write(response.content)

data = pd.read_csv(filename)
data.describe()
```

| | payments | email | opened | agreement | credit_limit | risk_score |
|---|---|---|---|---|---|---|
| count | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 |
| mean | 669.672000 | 0.490800 | 0.273400 | 0.160800 | 1194.845188 | 0.480812 |
| std | 103.970065 | 0.499965 | 0.445749 | 0.367383 | 480.978996 | 0.100376 |
| min | 330.000000 | 0.000000 | 0.000000 | 0.000000 | 193.695573 | 0.131784 |
| 25% | 600.000000 | 0.000000 | 0.000000 | 0.000000 | 843.049867 | 0.414027 |
| 50% | 670.000000 | 0.000000 | 0.000000 | 0.000000 | 1127.640297 | 0.486389 |
| 75% | 730.000000 | 1.000000 | 1.000000 | 0.000000 | 1469.096523 | 0.552727 |
| max | 1140.000000 | 1.000000 | 1.000000 | 1.000000 | 3882.178408 | 0.773459 |
```python
lm = list()
lm.append(smf.ols(formula="payments ~ email", data=data).fit(cov_type='HC3'))
lm.append(smf.ols(formula="payments ~ email + credit_limit + risk_score", data=data).fit(cov_type='HC3'))
lm.append(smf.ols(formula="payments ~ email + credit_limit + risk_score + opened + agreement", data=data).fit(cov_type='HC3'))
summary_col(lm, stars=True, model_names=[f"{i+1}" for i in range(len(lm))])
```

| | 1 | 2 | 3 |
|---|---|---|---|
| Intercept | 669.9764*** | 490.8653*** | 488.4416*** |
| | (2.0973) | (10.2265) | (10.2052) |
| email | -0.6203 | 4.4304** | -1.6095 |
| | (2.9401) | (2.1273) | (2.7095) |
| credit_limit | | 0.1511*** | 0.1507*** |
| | | (0.0085) | (0.0085) |
| risk_score | | -8.0516 | -2.0929 |
| | | (40.6389) | (40.4986) |
| opened | | | 3.9808 |
| | | | (3.9791) |
| agreement | | | 11.7093*** |
| | | | (4.2158) |
| R-squared | 0.0000 | 0.4774 | 0.4796 |
| R-squared Adj. | -0.0002 | 0.4771 | 0.4791 |
```python
filename = 'data/hospital_treatment.csv'
url = 'https://raw.githubusercontent.com/matheusfacure/python-causality-handbook/refs/heads/master/causal-inference-for-the-brave-and-true/data/hospital_treatment.csv'
if not os.path.exists(filename):
    response = requests.get(url)
    with open(filename, 'wb') as file:
        file.write(response.content)

drug = pd.read_csv(filename)
print(drug.apply([np.mean, np.std]).to_markdown() + f"\n| N | {len(drug)} |\n")
```

| | hospital | treatment | severity | days |
|---|---|---|---|---|
| mean | 0.6375 | 0.625 | 15.4758 | 42.1125 |
| std | 0.480722 | 0.484123 | 7.14637 | 15.9429 |
| N | 80 |
- `hospital` \(\in \{0,1\}\)
- `treatment` \(\in \{0,1\}\)
- `severity` prior to treatment assignment
- `days` in hospital

| hospital | treatment | severity | days |
|---|---|---|---|
| 0 | 0.0689655 | 7.94499 | 29.6207 |
| 1 | 0.941176 | 19.758 | 49.2157 |
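The grouped means above show that treatment assignment and severity differ sharply across hospitals; a sketch of how they could be computed (the exact call in the original is not shown):

```python
# average treatment rate, severity, and days in hospital, by hospital
drug.groupby("hospital").mean()
```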
```python
models = [
    smf.ols("days ~ treatment", data=drug).fit(cov_type='HC0'),
    smf.ols("days ~ treatment", data=drug.query("hospital==0")).fit(cov_type='HC0'),
    smf.ols("days ~ treatment", data=drug.query("hospital==1")).fit(cov_type='HC0'),
    smf.ols("days ~ treatment + severity", data=drug).fit(cov_type='HC0'),
    smf.ols("days ~ treatment + severity + hospital", data=drug).fit(cov_type='HC0'),
]
summary_col(models, model_names=['all', 'hosp 0', 'hosp 1', 'all', 'all'])
```

| | all I | hosp 0 I | hosp 1 I | all II | all III |
|---|---|---|---|---|---|
| Intercept | 33.2667 | 30.4074 | 59.0000 | 11.6641 | 11.0111 |
| | (3.0510) | (2.8561) | (4.9889) | (2.1262) | (2.1632) |
| treatment | 14.1533 | -11.4074 | -10.3958 | -7.5912 | -5.0945 |
| | (3.5481) | (4.5450) | (5.2627) | (2.5753) | (4.0915) |
| severity | | | | 2.2741 | 2.3865 |
| | | | | (0.1910) | (0.1961) |
| hospital | | | | | -4.1535 |
| | | | | | (4.3465) |
| R-squared | 0.1847 | 0.0388 | 0.0436 | 0.7878 | 0.7902 |
| R-squared Adj. | 0.1743 | 0.0032 | 0.0241 | 0.7823 | 0.7820 |
```python
# center severity, hospital, and their product so main effects are evaluated at the sample mean
drug['severity_c'] = drug['severity'] - drug['severity'].mean()
drug['hospital_c'] = drug['hospital'] - drug['hospital'].mean()
drug['hs_c'] = drug['severity']*drug['hospital'] - np.mean(drug['severity']*drug['hospital'])
models = [
    smf.ols("days ~ treatment*severity_c", data=drug).fit(cov_type='HC0'),
    smf.ols("days ~ treatment*hospital_c", data=drug).fit(cov_type='HC0'),
    smf.ols("days ~ treatment*(hospital_c + severity_c + hs_c)", data=drug).fit(cov_type='HC0')
]
summary_col(models, model_names=['I', 'II', 'III'])
```

| | I | II | III |
|---|---|---|---|
| Intercept | 46.6189 | 48.6352 | 46.8291 |
| | (2.4925) | (3.3447) | (3.2836) |
| treatment | -7.5209 | -10.7625 | -4.5251 |
| | (2.6555) | (3.7377) | (3.3474) |
| severity_c | 2.2342 | | 2.7264 |
| | (0.2919) | | (0.2508) |
| treatment:severity_c | 0.0867 | | 6.0732 |
| | (0.3601) | | (0.2508) |
| hospital_c | | 28.5926 | 29.3104 |
| | | (5.7486) | (14.2210) |
| treatment:hospital_c | | 1.0116 | 13.1092 |
| | | (6.9536) | (15.0690) |
| hs_c | | | -1.7888 |
| | | | (0.5150) |
| treatment:hs_c | | | -4.6814 |
| | | | (0.5822) |
| R-squared | 0.7880 | 0.3760 | 0.8075 |
| R-squared Adj. | 0.7796 | 0.3514 | 0.7887 |