ECON526
University of British Columbia
\[ \def\Er{{\mathrm{E}}} \def\En{{\mathbb{En}}} \def\cov{{\mathrm{Cov}}} \def\var{{\mathrm{Var}}} \def\R{{\mathbb{R}}} \def\indep{{\perp\!\!\!\perp}} \newcommand\norm[1]{\left\lVert#1\right\rVert} \def\rank{{\mathrm{rank}}} \newcommand{\inpr}{ \overset{p^*_{\scriptscriptstyle n}}{\longrightarrow}} \def\inprob{{\,{\buildrel p \over \rightarrow}\,}} \def\indist{\,{\buildrel d \over \rightarrow}\,} \DeclareMathOperator*{\plim}{plim} \DeclareMathOperator*{\argmin}{argmin} \]
def pfizerrctdata():
n1 = 19965
n0 = 20172
y1 = 9
y0 = 169
n1o = 4044
n0o = 4067
y1o = 1
y0o = 19
over65 = np.zeros(n1 + n0)
over65[0:(n1o+n0o)] = 1
treat = np.concatenate([np.ones(n1o), np.zeros(n0o), np.ones(n1-n1o), np.zeros(n0-n0o)])
infected = np.concatenate([np.ones(y1o), np.zeros(n1o-y1o), np.ones(y0o), np.zeros(n0o-y0o),
np.ones(y1-y1o), np.zeros(n1-n1o-(y1-y1o)),
np.ones(y0-y0o), np.zeros(n0-n0o-y0+y0o)])
data = pd.DataFrame({'treat': treat, 'infected': infected, 'over65': over65})
return data
data = pfizerrctdata()
data.groupby(['over65', 'treat']).mean()
infected | ||
---|---|---|
over65 | treat | |
0.0 | 0.0 | 0.009314 |
1.0 | 0.000502 | |
1.0 | 0.0 | 0.004672 |
1.0 | 0.000247 |
def ATE(data, treatment='treat', y='infected'):
means=data.groupby(treatment)[y].mean()
ATE = means[1] - means[0]
se = sum(data.groupby(treatment)[y].var()/data.groupby(treatment)[y].count())**0.5
return ATE, se
ate=ATE(data.loc[data['over65']==1,:])
print(f"difference in means = {ate[0]}, SE = {ate[1]}")
difference in means = -0.0044244682964888074, SE = 0.0010976149240857532
olsse = np.sqrt(np.diag(smf.ols('infected ~ treat',data=data.loc[data['over65']==1]).fit().cov_params())[1])
print(f"OLS SE: {olsse}, manual SE: {ate[1]}")
OLS SE: 0.0011004170544555998, manual SE: 0.0010976149240857532
But standard error very slightly different
Default of smf.ols
assumes homoskedasticiy \(\Er[\epsilon^2|X] = \sigma^2\)
With \(y\) and \(T\), binary, \(\Er[\epsilon^2|T] = P(y=1|T)(1-P(y=1|T))\)
olshcse = np.sqrt(np.diag(smf.ols('infected ~ treat',data=data.loc[data['over65']==1]).fit(cov_type="HC3").cov_params())[1])
print(f"OLS HC SE: {olshcse}, manual SE: {ate[1]}")
OLS HC SE: 0.001097749929536793, manual SE: 0.0010976149240857532
import os
import requests
url = 'https://www.nber.org/morg/annual/morg23.dta'
local_filename = 'data/morg23.dta'
if not os.path.exists(local_filename):
response = requests.get(url)
with open(local_filename, 'wb') as file:
file.write(response.content)
cps=pd.read_stata(local_filename)
cps["female"] = (cps.sex==2)
cps["log_earn"] = np.log(cps["earnwke"])
cps["log_uhours"] = np.log(cps.uhourse)
cps["log_hourslw"] = np.log(cps.hourslw)
cps.replace(-np.inf, np.nan, inplace=True)
cps["nevermarried"] = cps.marital==7
cps["wasmarried"] = (cps.marital >= 4) & (cps.marital <= 6)
cps["married"] = cps.marital <= 3
lm = list()
lm.append(smf.ols(formula="log_earn ~ female", data=cps,
missing="drop").fit(cov_type='HC3'))
lm.append(smf.ols(formula="log_earn ~ female + log_hourslw + log_uhours", data=cps,
missing="drop").fit(cov_type='HC3'))
lm.append(smf.ols(formula="log_earn ~ female + log_hourslw + log_uhours + wasmarried + married", data=cps,
missing="drop").fit(cov_type='HC3'))
lm.append(smf.ols(formula="log_earn ~ female*(wasmarried+married) + log_hourslw + log_uhours", data=cps,
missing="drop").fit(cov_type='HC3'))
summary_col(lm, stars=True, model_names=[f"{i+1}" for i in range(len(lm))])
1 | 2 | 3 | 4 | |
Intercept | 6.9948*** | 2.2378*** | 2.2410*** | 2.2166*** |
(0.0031) | (0.0295) | (0.0280) | (0.0280) | |
female[T.True] | -0.2772*** | -0.1318*** | -0.1340*** | -0.0715*** |
(0.0046) | (0.0038) | (0.0038) | (0.0061) | |
log_hourslw | 0.0424*** | 0.0443*** | 0.0434*** | |
(0.0077) | (0.0076) | (0.0075) | ||
log_uhours | 1.2623*** | 1.2113*** | 1.2108*** | |
(0.0119) | (0.0115) | (0.0115) | ||
wasmarried[T.True] | 0.1474*** | 0.1831*** | ||
(0.0058) | (0.0089) | |||
married[T.True] | 0.2896*** | 0.3361*** | ||
(0.0041) | (0.0055) | |||
female[T.True]:wasmarried[T.True] | -0.0727*** | |||
(0.0118) | ||||
female[T.True]:married[T.True] | -0.0988*** | |||
(0.0080) | ||||
R-squared | 0.0287 | 0.3632 | 0.3898 | 0.3906 |
R-squared Adj. | 0.0287 | 0.3631 | 0.3898 | 0.3906 |
\[y_i = x_i \beta + w_i'\gamma + u_i\] - Can equivalently calculate \(\beta\) by
ey=smf.ols(formula="log_earn ~ log_hourslw + log_uhours", data=cps, missing="drop").fit().resid
ex=smf.ols(formula="I(1*female) ~ log_hourslw + log_uhours", data=cps, missing="drop").fit().resid
edf = pd.concat([ex,ey],axis=1)
edf.columns=['ex','ey']
smf.ols('ey ~ ex', data=edf).fit(cov_type="HC3").params[1]
np.float64(-0.131764318408401)
If we want \[ y_i = \beta_0 + x_i \beta + w_i'\gamma + u_i \]
But only regression \(y\) on \(x\), then \[ \hat{\beta}^s = \hat{\beta} + \frac{ \sum (x_i - \bar{x})w_i'}{\sum (x_i - \bar{x})^2} \hat{\gamma} \] and \[ \hat{\beta}^s \inprob \beta + \frac{ \Er[(x_i - \Er[x])w_i']}{\var(x_i)} \gamma \]
Useful for:
import pyfixest as pf
controls="age + I(age**2) | race + grade92 + unionmme + unioncov + ind17 + occ18"
allcon=pf.feols("log_earn ~ female*(wasmarried + married) + log_hourslw + log_uhours + " + controls, data=cps,vcov='hetero')
allcon.summary()
###
Estimation: OLS
Dep. var.: log_earn, Fixed effects: race+grade92+unionmme+unioncov+ind17+occ18
Inference: hetero
Observations: 104607
| Coefficient | Estimate | Std. Error | t value | Pr(>|t|) | 2.5% | 97.5% |
|:------------------|-----------:|-------------:|----------:|-----------:|-------:|--------:|
| female | -0.084 | 0.006 | -14.742 | 0.000 | -0.095 | -0.073 |
| wasmarried | 0.082 | 0.009 | 9.570 | 0.000 | 0.065 | 0.099 |
| married | 0.126 | 0.005 | 23.207 | 0.000 | 0.115 | 0.137 |
| log_hourslw | 0.044 | 0.007 | 6.649 | 0.000 | 0.031 | 0.058 |
| log_uhours | 0.988 | 0.011 | 93.671 | 0.000 | 0.967 | 1.009 |
| age | 0.003 | 0.000 | 19.767 | 0.000 | 0.003 | 0.003 |
| I(age ** 2) | -0.000 | 0.000 | -1.174 | 0.240 | -0.000 | 0.000 |
| female:wasmarried | -0.059 | 0.011 | -5.507 | 0.000 | -0.080 | -0.038 |
| female:married | -0.069 | 0.007 | -9.734 | 0.000 | -0.083 | -0.055 |
---
RMSE: 0.524 R2: 0.582 R2 Within: 0.314
RCT with outcome \(Y\), treatment \(T\), other variables \(X\)
Should we estimate ATE in a regression that includes \(X\)?
np.random.seed(54)
n = 1000 # sample size
Z = np.random.normal(size=n) # generate Z
Y0 = -Z + np.random.normal(size=n) # conditional average baseline response is -Z
Y1 = Z + np.random.normal(size=n) # conditional average treatment effect is +Z
D = np.random.binomial(1, .2, size=n) # treatment indicator; only 20% get treated
Y = Y1 * D + Y0 * (1 - D) # observed Y
Z = Z - Z.mean() # demean Z
data = pd.DataFrame({"Y": Y, "D": D, "Z": Z})
print(f"Unobservable sample ATE = {np.mean(Y1-Y0):.3}")
Unobservable sample ATE = 0.072
hc = 'HC0'
m1=smf.ols('Y ~ D',data=data).fit(cov_type=hc)
madd=smf.ols('Y ~ D + Z',data=data).fit(cov_type=hc)
summary_col([m1, madd], model_names=['simple','additive'])
simple | additive | |
Intercept | -0.0924 | -0.0789 |
(0.0501) | (0.0386) | |
D | -0.0370 | -0.1046 |
(0.1166) | (0.1440) | |
Z | -0.5630 | |
(0.0561) | ||
R-squared | 0.0001 | 0.1551 |
R-squared Adj. | -0.0009 | 0.1534 |
minteract=smf.ols('Y ~ D + Z*D',data=data).fit(cov_type=hc)
summary_col([m1, madd, minteract],model_names=['simple','additive','interactive'])
simple | additive | interactive | |
Intercept | -0.0924 | -0.0789 | -0.0681 |
(0.0501) | (0.0386) | (0.0352) | |
D | -0.0370 | -0.1046 | 0.0463 |
(0.1166) | (0.1440) | (0.0766) | |
Z | -0.5630 | -1.0111 | |
(0.0561) | (0.0341) | ||
Z:D | 2.1349 | ||
(0.0748) | |||
R-squared | 0.0001 | 0.1551 | 0.5245 |
R-squared Adj. | -0.0009 | 0.1534 | 0.5231 |
filename = 'data/collections_email.csv'
url = 'https://raw.githubusercontent.com/matheusfacure/python-causality-handbook/refs/heads/master/causal-inference-for-the-brave-and-true/data/collections_email.csv'
if not os.path.exists(filename):
response = requests.get(url)
with open(filename, 'wb') as file:
file.write(response.content)
data = pd.read_csv(filename)
data.describe()
payments | opened | agreement | credit_limit | risk_score | ||
---|---|---|---|---|---|---|
count | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 |
mean | 669.672000 | 0.490800 | 0.273400 | 0.160800 | 1194.845188 | 0.480812 |
std | 103.970065 | 0.499965 | 0.445749 | 0.367383 | 480.978996 | 0.100376 |
min | 330.000000 | 0.000000 | 0.000000 | 0.000000 | 193.695573 | 0.131784 |
25% | 600.000000 | 0.000000 | 0.000000 | 0.000000 | 843.049867 | 0.414027 |
50% | 670.000000 | 0.000000 | 0.000000 | 0.000000 | 1127.640297 | 0.486389 |
75% | 730.000000 | 1.000000 | 1.000000 | 0.000000 | 1469.096523 | 0.552727 |
max | 1140.000000 | 1.000000 | 1.000000 | 1.000000 | 3882.178408 | 0.773459 |
lm = list()
lm.append(smf.ols(formula="payments ~ email", data=data).fit(cov_type='HC3'))
lm.append(smf.ols(formula="payments ~ email + credit_limit + risk_score",data=data).fit(cov_type='HC3'))
lm.append(smf.ols(formula="payments ~ email + credit_limit + risk_score + opened + agreement",data=data).fit(cov_type='HC3'))
summary_col(lm, stars=True, model_names=[f"{i+1}" for i in range(len(lm))])
1 | 2 | 3 | |
Intercept | 669.9764*** | 490.8653*** | 488.4416*** |
(2.0973) | (10.2265) | (10.2052) | |
-0.6203 | 4.4304** | -1.6095 | |
(2.9401) | (2.1273) | (2.7095) | |
credit_limit | 0.1511*** | 0.1507*** | |
(0.0085) | (0.0085) | ||
risk_score | -8.0516 | -2.0929 | |
(40.6389) | (40.4986) | ||
opened | 3.9808 | ||
(3.9791) | |||
agreement | 11.7093*** | ||
(4.2158) | |||
R-squared | 0.0000 | 0.4774 | 0.4796 |
R-squared Adj. | -0.0002 | 0.4771 | 0.4791 |
filename = 'data/hospital_treatment.csv'
url = 'https://raw.githubusercontent.com/matheusfacure/python-causality-handbook/refs/heads/master/causal-inference-for-the-brave-and-true/data/hospital_treatment.csv'
if not os.path.exists(filename):
response = requests.get(url)
with open(filename, 'wb') as file:
file.write(response.content)
drug = pd.read_csv(filename)
print(drug.apply([np.mean, np.std]).to_markdown() + f"\n| N | {len(drug)} |\n")
hospital | treatment | severity | days | |
---|---|---|---|---|
mean | 0.6375 | 0.625 | 15.4758 | 42.1125 |
std | 0.480722 | 0.484123 | 7.14637 | 15.9429 |
N | 80 |
hospital
\(\in \{0,1\}\)treatment
\(\in \{0,1\}\)severity
prior to treatment assignmentdays
in hospitalhospital | treatment | severity | days |
---|---|---|---|
0 | 0.0689655 | 7.94499 | 29.6207 |
1 | 0.941176 | 19.758 | 49.2157 |
models = [
smf.ols("days ~ treatment", data=drug).fit(cov_type='HC0'),
smf.ols("days ~ treatment", data=drug.query("hospital==0")).fit(cov_type='HC0'),
smf.ols("days ~ treatment", data=drug.query("hospital==1")).fit(cov_type='HC0'),
smf.ols("days ~ treatment + severity ", data=drug).fit(cov_type='HC0'),
smf.ols("days ~ treatment + severity + hospital", data=drug).fit(cov_type='HC0'),
]
summary_col(models, model_names=['all','hosp 0', 'hosp 1', 'all', 'all'])
all I | hosp 0 I | hosp 1 I | all II | all III | |
Intercept | 33.2667 | 30.4074 | 59.0000 | 11.6641 | 11.0111 |
(3.0510) | (2.8561) | (4.9889) | (2.1262) | (2.1632) | |
treatment | 14.1533 | -11.4074 | -10.3958 | -7.5912 | -5.0945 |
(3.5481) | (4.5450) | (5.2627) | (2.5753) | (4.0915) | |
severity | 2.2741 | 2.3865 | |||
(0.1910) | (0.1961) | ||||
hospital | -4.1535 | ||||
(4.3465) | |||||
R-squared | 0.1847 | 0.0388 | 0.0436 | 0.7878 | 0.7902 |
R-squared Adj. | 0.1743 | 0.0032 | 0.0241 | 0.7823 | 0.7820 |
drug['severity_c'] = drug['severity'] - drug['severity'].mean()
drug['hospital_c'] = drug['hospital'] - drug['hospital'].mean()
drug['hs_c'] = drug['severity']*drug['hospital'] - np.mean(drug['severity']*drug['hospital'])
models = [
smf.ols("days ~ treatment*severity_c", data=drug).fit(cov_type='HC0'),
smf.ols("days ~ treatment*hospital_c", data=drug).fit(cov_type='HC0'),
smf.ols("days ~ treatment*(hospital_c + severity_c + hs_c)", data=drug).fit(cov_type='HC0')
]
summary_col(models, model_names=['I','II','III'])
I | II | III | |
Intercept | 46.6189 | 48.6352 | 46.8291 |
(2.4925) | (3.3447) | (3.2836) | |
treatment | -7.5209 | -10.7625 | -4.5251 |
(2.6555) | (3.7377) | (3.3474) | |
severity_c | 2.2342 | 2.7264 | |
(0.2919) | (0.2508) | ||
treatment:severity_c | 0.0867 | 6.0732 | |
(0.3601) | (0.2508) | ||
hospital_c | 28.5926 | 29.3104 | |
(5.7486) | (14.2210) | ||
treatment:hospital_c | 1.0116 | 13.1092 | |
(6.9536) | (15.0690) | ||
hs_c | -1.7888 | ||
(0.5150) | |||
treatment:hs_c | -4.6814 | ||
(0.5822) | |||
R-squared | 0.7880 | 0.3760 | 0.8075 |
R-squared Adj. | 0.7796 | 0.3514 | 0.7887 |