import os
os.getcwd()

'C:\\Users\\rsb84'

os.chdir("C://Users/rsb84/Downloads/lalonde")

import pandas as pd
df_control=pd.read_csv("nswre74_control.csv", sep = None)
df_treated=pd.read_csv("nswre74_treated.csv", sep = None)

C:\Users\rsb84\AppData\Local\Temp\ipykernel_60964\2330761546.py:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support sep=None with delim_whitespace=False; you can avoid this warning by specifying engine='python'.
  df_control=pd.read_csv("nswre74_control.csv", sep = None)
C:\Users\rsb84\AppData\Local\Temp\ipykernel_60964\2330761546.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support sep=None with delim_whitespace=False; you can avoid this warning by specifying engine='python'.
  df_treated=pd.read_csv("nswre74_treated.csv", sep = None)

df_control.head(10)

df_treated.head(10)

df_control.shape

(260, 10)

df_treated.shape

(185, 10)

# Convert categorical variables to category dtype
df_control['Black'] = df_control['Black'].astype('category')
df_control['Hispanic'] = df_control['Hispanic'].astype('category')
df_control['married'] = df_control['married'].astype('category')
df_control['nodegree'] = df_control['nodegree'].astype('category')

# Keep as numeric (continuous)
df_control['RE74'] = df_control['RE74'].astype(float)
df_control['RE75'] = df_control['RE75'].astype(float)
df_control['RE78'] = df_control['RE78'].astype(float)

# Verify
print("Variable types for df_control:")
print(df_control.dtypes)

Variable types for df_control:
﻿treatment_indicator       int64
age                        int64
education                  int64
Black                   category
Hispanic                category
married                 category
nodegree                category
RE74                     float64
RE75                     float64
RE78                     float64
dtype: object

# Convert categorical variables to category dtype
df_treated['Black'] = df_treated['Black'].astype('category')
df_treated['Hispanic'] = df_treated['Hispanic'].astype('category')
df_treated['married'] = df_treated['married'].astype('category')
df_treated['nodegree'] = df_treated['nodegree'].astype('category')

# Keep as numeric (continuous)
df_treated['RE74'] = df_treated['RE74'].astype(float)
df_treated['RE75'] = df_treated['RE75'].astype(float)
df_treated['RE78'] = df_treated['RE78'].astype(float)

# Verify
print("Variable types for df_treated:")
print(df_treated.dtypes)

Variable types for df_treated:
﻿treatment_indicator       int64
age                        int64
education                  int64
Black                   category
Hispanic                category
married                 category
nodegree                category
RE74                     float64
RE75                     float64
RE78                     float64
dtype: object

import seaborn as sns
import matplotlib.pyplot as plt

sns.displot(data=df_control, x="RE78", kind="kde")
plt.show()

sns.displot(data=df_treated, x="RE78", kind="kde")
plt.show()

from scipy.stats import anderson
result = anderson(df_control["RE78"])

print(round(result.statistic, 4))
print(result.significance_level)
print(result.critical_values)

13.7453
[15.  10.   5.   2.5  1. ]
[0.567 0.646 0.775 0.904 1.076]

result = anderson(df_treated["RE78"])

print(round(result.statistic, 4))
print(result.significance_level)
print(result.critical_values)

9.6406
[15.  10.   5.   2.5  1. ]
[0.564 0.643 0.771 0.899 1.07 ]

import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.power import TTestIndPower

# ==================================================
# STEP 0 — PREP DATA
# ==================================================

df_treated["treatment_indicator"] = 1
df_control["treatment_indicator"] = 0
df = pd.concat([df_treated, df_control], ignore_index=True)

# Ensure categorical variables are numeric 0/1
binary_cols = ["Black", "Hispanic", "married", "nodegree"]
for c in binary_cols:
    df[c] = df[c].astype(int)

# ==================================================
# STEP 1 — CHECK ATTRITION / MISSING OUTCOMES
# ==================================================

print("\nMissing RE78 by treatment status:")
print(df.groupby("treatment_indicator")["RE78"].apply(lambda x: x.isna().mean()))

Missing RE78 by treatment status:
treatment_indicator
0    0.0
1    0.0
Name: RE78, dtype: float64

# ==================================================
# STEP 2 — BASELINE BALANCE TABLE
# ==================================================

pre_vars = [
    "age", "education", "Black", "Hispanic",
    "married", "nodegree", "RE74", "RE75"
]

def standardized_mean_diff(x_t, x_c):
    m1, m0 = np.mean(x_t), np.mean(x_c)
    s1, s0 = np.var(x_t, ddof=1), np.var(x_c, ddof=1)
    pooled_sd = np.sqrt((s1 + s0) / 2)
    return (m1 - m0) / pooled_sd

balance_rows = []

treated = df[df.treatment_indicator == 1]
control = df[df.treatment_indicator == 0]

for v in pre_vars:
    mt = treated[v].mean()
    mc = control[v].mean()
    smd = standardized_mean_diff(treated[v], control[v])
    ttest_p = stats.ttest_ind(treated[v], control[v], equal_var=False).pvalue

    balance_rows.append([v, mt, mc, smd, ttest_p])

balance_table = pd.DataFrame(
    balance_rows,
    columns=["Variable", "Mean Treated", "Mean Control", "Std Mean Diff", "p-value"]
)

print("\nBASELINE BALANCE TABLE")
print(balance_table)

BASELINE BALANCE TABLE
    Variable  Mean Treated  Mean Control  Std Mean Diff   p-value
0        age     25.816216     25.053846       0.107277  0.265944
1  education     10.345946     10.088462       0.141220  0.150169
2      Black      0.843243      0.826923       0.043887  0.647357
3   Hispanic      0.059459      0.107692      -0.174561  0.064043
4    married      0.189189      0.153846       0.093641  0.334248
5   nodegree      0.708108      0.834615      -0.303986  0.002037
6       RE74   2095.573689   2107.026658      -0.002160  0.981863
7       RE75   1532.055314   1266.909002       0.083863  0.385273

# ==================================================
# STEP 3 — RAW DIFFERENCE IN MEANS (WELCH TEST)
# ==================================================

t_stat, p_val = stats.ttest_ind(
    treated["RE78"].dropna(),
    control["RE78"].dropna(),
    equal_var=False
)

print("\nWelch t-test for RE78:")
print("t =", t_stat)
print("p =", p_val)

Welch t-test for RE78:
t = 2.674145513783345
p = 0.007892977714517335

# ==================================================
# STEP 4 — COHEN'S D
# ==================================================

def cohens_d(x, y):
    nx, ny = len(x), len(y)
    vx, vy = np.var(x, ddof=1), np.var(y, ddof=1)
    pooled = np.sqrt(((nx-1)*vx + (ny-1)*vy) / (nx+ny-2))
    return (np.mean(x) - np.mean(y)) / pooled

d = cohens_d(treated["RE78"].dropna(), control["RE78"].dropna())
print("\nCohen's d =", d)

Cohen's d = 0.2727154073584651

# ==================================================
# STEP 5 — REGRESSION-ADJUSTED TREATMENT EFFECT
# (controls for pre-treatment variables)
# ==================================================

formula = """
RE78 ~ treatment_indicator
      + age + education + Black + Hispanic
      + married + nodegree + RE74 + RE75
"""

model_adj = smf.ols(formula, data=df).fit(cov_type="HC3")

print("\nREGRESSION-ADJUSTED TREATMENT EFFECT")
print(model_adj.summary())

# Treatment effect and CI
b = model_adj.params["treatment_indicator"]
ci = model_adj.conf_int().loc["treatment_indicator"]

print("\nAdjusted treatment effect =", b)
print("95% CI =", tuple(ci))

REGRESSION-ADJUSTED TREATMENT EFFECT
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   RE78   R-squared:                       0.055
Model:                            OLS   Adj. R-squared:                  0.035
Method:                 Least Squares   F-statistic:                     2.482
Date:                Sun, 15 Feb 2026   Prob (F-statistic):            0.00901
Time:                        20:45:12   Log-Likelihood:                -4534.2
No. Observations:                 445   AIC:                             9088.
Df Residuals:                     435   BIC:                             9129.
Df Model:                           9                                         
Covariance Type:                  HC3                                         
=======================================================================================
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept             785.0615   3354.573      0.234      0.815   -5789.781    7359.904
treatment_indicator  1676.3426    685.302      2.446      0.014     333.174    3019.511
age                    55.3167     42.015      1.317      0.188     -27.032     137.665
education             395.7343    201.592      1.963      0.050       0.620     790.848
Black               -2159.5221   1041.932     -2.073      0.038   -4201.672    -117.372
Hispanic              164.0327   1412.943      0.116      0.908   -2605.285    2933.350
married              -138.7252    894.575     -0.155      0.877   -1892.061    1614.610
nodegree              -70.6807   1036.275     -0.068      0.946   -2101.741    1960.380
RE74                    0.0821      0.116      0.707      0.479      -0.146       0.310
RE75                    0.0528      0.133      0.396      0.692      -0.208       0.314
==============================================================================
Omnibus:                      284.809   Durbin-Watson:                   2.059
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             3770.859
Skew:                           2.545   Prob(JB):                         0.00
Kurtosis:                      16.321   Cond. No.                     7.19e+04
==============================================================================

Notes:
[1] Standard Errors are heteroscedasticity robust (HC3)
[2] The condition number is large, 7.19e+04. This might indicate that there are
strong multicollinearity or other numerical problems.

Adjusted treatment effect = 1676.3426437677613
95% CI = (333.1744674915333, 3019.510820043989)

import statsmodels.formula.api as smf

# ---------------------------------------------
# STEP 1 — Ensure categorical variables coded
# ---------------------------------------------
df["Black"] = df["Black"].astype(int)
df["Hispanic"] = df["Hispanic"].astype(int)
df["married"] = df["married"].astype(int)
df["nodegree"] = df["nodegree"].astype(int)

# ---------------------------------------------
# STEP 2 — Heterogeneity model with interactions
# ---------------------------------------------
heterogeneity_model = smf.ols(
    """
    RE78 ~ treatment_indicator
          + age + education + Black + Hispanic + married + nodegree + RE74 + RE75
          + treatment_indicator:age
          + treatment_indicator:education
          + treatment_indicator:Black
          + treatment_indicator:Hispanic
          + treatment_indicator:married
          + treatment_indicator:nodegree
          + treatment_indicator:RE74
          + treatment_indicator:RE75
    """,
    data=df
).fit(cov_type="HC3")

print(heterogeneity_model.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   RE78   R-squared:                       0.066
Model:                            OLS   Adj. R-squared:                  0.029
Method:                 Least Squares   F-statistic:                     1.707
Date:                Mon, 16 Feb 2026   Prob (F-statistic):             0.0387
Time:                        15:12:49   Log-Likelihood:                -4531.5
No. Observations:                 445   AIC:                             9099.
Df Residuals:                     427   BIC:                             9173.
Df Model:                          17                                         
Covariance Type:                  HC3                                         
=================================================================================================
                                    coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Intercept                      4133.2607   3404.239      1.214      0.225   -2538.925    1.08e+04
treatment_indicator           -5641.6850   7046.741     -0.801      0.423   -1.95e+04    8169.674
age                              33.3734     49.369      0.676      0.499     -63.389     130.136
education                       183.0770    209.136      0.875      0.381    -226.823     592.977
Black                         -2942.0851   1429.179     -2.059      0.040   -5743.224    -140.946
Hispanic                       -458.4562   1728.939     -0.265      0.791   -3847.114    2930.202
married                        -901.0297   1008.749     -0.893      0.372   -2878.142    1076.083
nodegree                        153.6691   1109.292      0.139      0.890   -2020.503    2327.841
RE74                              0.0822      0.108      0.764      0.445      -0.129       0.293
RE75                              0.0457      0.194      0.236      0.814      -0.335       0.426
treatment_indicator:age          50.1885     93.829      0.535      0.593    -133.714     234.090
treatment_indicator:education   440.8840    437.722      1.007      0.314    -417.035    1298.803
treatment_indicator:Black      1802.0711   2207.535      0.816      0.414   -2524.617    6128.759
treatment_indicator:Hispanic    762.7829   3445.055      0.221      0.825   -5989.402    7514.967
treatment_indicator:married    1933.4699   1896.954      1.019      0.308   -1784.492    5651.432
treatment_indicator:nodegree   -472.6784   2217.472     -0.213      0.831   -4818.844    3873.487
treatment_indicator:RE74         -0.0427      0.347     -0.123      0.902      -0.723       0.637
treatment_indicator:RE75          0.0430      0.319      0.135      0.893      -0.583       0.669
==============================================================================
Omnibus:                      283.242   Durbin-Watson:                   2.072
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             3774.272
Skew:                           2.523   Prob(JB):                         0.00
Kurtosis:                      16.345   Cond. No.                     1.74e+05
==============================================================================

Notes:
[1] Standard Errors are heteroscedasticity robust (HC3)
[2] The condition number is large, 1.74e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

# ==================================================
# STEP 6 — ANCOVA MODEL (BEST PRACTICE)
# Adjust only for baseline outcome(s)
# ==================================================

ancova = smf.ols(
    "RE78 ~ treatment_indicator + RE75 + RE74",
    data=df
).fit(cov_type="HC3")

print("\nANCOVA MODEL")
print(ancova.summary())

print("\nANCOVA treatment effect:",
      ancova.params["treatment_indicator"])

print("95% CI:",
      tuple(ancova.conf_int().loc["treatment_indicator"]))

ANCOVA MODEL
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   RE78   R-squared:                       0.026
Model:                            OLS   Adj. R-squared:                  0.019
Method:                 Least Squares   F-statistic:                     3.018
Date:                Sat, 14 Feb 2026   Prob (F-statistic):             0.0296
Time:                        23:41:44   Log-Likelihood:                -4540.9
No. Observations:                 445   AIC:                             9090.
Df Residuals:                     441   BIC:                             9106.
Df Model:                           3                                         
Covariance Type:                  HC3                                         
=======================================================================================
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept            4293.1881    372.284     11.532      0.000    3563.525    5022.851
treatment_indicator  1772.6031    678.147      2.614      0.009     443.458    3101.748
RE75                    0.0851      0.132      0.645      0.519      -0.174       0.344
RE74                    0.0730      0.118      0.620      0.535      -0.158       0.304
==============================================================================
Omnibus:                      280.213   Durbin-Watson:                   2.079
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             3423.977
Skew:                           2.521   Prob(JB):                         0.00
Kurtosis:                      15.619   Cond. No.                     1.42e+04
==============================================================================

Notes:
[1] Standard Errors are heteroscedasticity robust (HC3)
[2] The condition number is large, 1.42e+04. This might indicate that there are
strong multicollinearity or other numerical problems.

ANCOVA treatment effect: 1772.6030779774037
95% CI: (443.4584714004443, 3101.7476845543633)

# ==================================================
# STEP 7 — LOG-INCOME ROBUSTNESS CHECK
# (common for skewed earnings)
# ==================================================

df["log_RE78"] = np.log(df["RE78"] + 1)

log_model = smf.ols(
    "log_RE78 ~ treatment_indicator + RE75 + RE74",
    data=df
).fit(cov_type="HC3")

print("\nLOG-INCOME MODEL")
print(log_model.summary())

LOG-INCOME MODEL
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               log_RE78   R-squared:                       0.022
Model:                            OLS   Adj. R-squared:                  0.015
Method:                 Least Squares   F-statistic:                     4.108
Date:                Sat, 14 Feb 2026   Prob (F-statistic):            0.00682
Time:                        23:42:15   Log-Likelihood:                -1247.4
No. Observations:                 445   AIC:                             2503.
Df Residuals:                     441   BIC:                             2519.
Df Model:                           3                                         
Covariance Type:                  HC3                                         
=======================================================================================
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               5.3606      0.270     19.850      0.000       4.831       5.890
treatment_indicator     0.9834      0.383      2.566      0.010       0.232       1.734
RE75                 9.824e-05   6.88e-05      1.428      0.153   -3.66e-05       0.000
RE74                 5.915e-06    4.6e-05      0.128      0.898   -8.43e-05    9.61e-05
==============================================================================
Omnibus:                      822.558   Durbin-Watson:                   2.012
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               66.422
Skew:                          -0.679   Prob(JB):                     3.77e-15
Kurtosis:                       1.682   Cond. No.                     1.42e+04
==============================================================================

Notes:
[1] Standard Errors are heteroscedasticity robust (HC3)
[2] The condition number is large, 1.42e+04. This might indicate that there are
strong multicollinearity or other numerical problems.

# ==================================================
# STEP 8 — CALCULATE THE ANCOVA-ADJUSTED STANDARDIZED EFFECT & POWER
# ==================================================

import numpy as np
from statsmodels.stats.power import TTestIndPower

# ANCOVA model already estimated as: ancova

beta_treat = ancova.params["treatment_indicator"]
resid_sd = np.sqrt(ancova.mse_resid)

effect_size_adj = beta_treat / resid_sd

print("Adjusted standardized effect =", effect_size_adj)

power_adj = TTestIndPower().solve_power(
    effect_size=effect_size_adj,
    nobs1=185,
    ratio=260/185, # control group size / treatment group size
    alpha=0.05
)

print("Power of ANCOVA-adjusted effect =", power_adj)

Adjusted standardized effect = 0.26993798318416534
Power of ANCOVA-adjusted effect = 0.799656649204108

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# -------------------------------
# SETTINGS
# -------------------------------
n_treat = 185
n_control = 260
alpha = 0.05
n_sim = 10000   # increase for precision

# -------------------------------
# EXTRACT PARAMETERS FROM FITTED ANCOVA
# -------------------------------
params = ancova.params

intercept = params["Intercept"]
beta_treat = params["treatment_indicator"]

# all other covariates (exclude intercept + treatment)
beta_cov = params.drop(["Intercept", "treatment_indicator"])

sigma = np.sqrt(ancova.mse_resid)

# -------------------------------
# SIMULATE ONE DATASET
# -------------------------------
def simulate_dataset():
    
    n = n_treat + n_control
    
    df = pd.DataFrame({
        "treatment_indicator": np.r_[np.ones(n_treat), np.zeros(n_control)],
        "age": np.random.normal(30, 10, n),
        "education": np.random.normal(12, 2, n),
        "Black": np.random.binomial(1, 0.3, n),
        "Hispanic": np.random.binomial(1, 0.2, n),
        "married": np.random.binomial(1, 0.5, n),
        "nodegree": np.random.binomial(1, 0.4, n),
        "RE74": np.random.normal(20000, 10000, n),
        "RE75": np.random.normal(21000, 10000, n),
    })
    
    # generate outcome
    y = (
        intercept
        + beta_treat * df["treatment_indicator"]
        + df[beta_cov.index] @ beta_cov
        + np.random.normal(0, sigma, n)
    )
    
    df["RE78"] = y
    return df

# -------------------------------
# RUN MONTE CARLO
# -------------------------------
reject_count = 0

for _ in range(n_sim):
    
    sim_df = simulate_dataset()
    
    model = smf.ols(
        "RE78 ~ treatment_indicator + age + education + Black + Hispanic + married + nodegree + RE74 + RE75",
        data=sim_df
    ).fit()
    
    if model.pvalues["treatment_indicator"] < alpha:
        reject_count += 1

power_mc = reject_count / n_sim
print("Monte Carlo ANCOVA power =", power_mc)

Monte Carlo ANCOVA power = 0.7938

	Interpretation
< 0.10	excellent balance
0.10–0.20	acceptable
> 0.25	concerning imbalance

"Causal Impact of Job Training on Earnings: A Randomized Experimental Evaluation with ANCOVA and Regression Adjustment"¶

by Rob Boswell¶

Feb. 2026¶

Next, we will verify randomization worked for the covariates through a baseline balance table. This step is important because, especially when using a small number of samples, it is still possible that randomization may not leave covariates completely balanced.¶

	age	education	Black	Hispanic	married	nodegree	RE78
0	23	10	1	0	0	1	0.000
1	26	12	0	0	0	0	12383.680
2	22	9	1	0	0	1	0.000
3	18	9	1	0	0	1	10740.080
4	45	11	1	0	0	1	11796.470
5	18	9	1	0	0	1	9227.052
6	24	8	0	0	0	1	10569.270
7	34	11	1	0	1	1	6040.335
8	24	4	0	1	0	1	3880.833
9	36	10	1	0	0	1	0.000

	treatment_indicator	age	education	Black	Hispanic	married	nodegree	RE78
0	1	37	11	1	0	1	1	9930.0460
1	1	22	9	0	1	0	1	3595.8940
2	1	30	12	1	0	0	0	24909.4500
3	1	27	11	1	0	0	1	7506.1460
4	1	33	8	1	0	0	1	289.7899
5	1	22	9	1	0	0	1	4056.4940
6	1	23	12	1	0	0	0	0.0000
7	1	32	11	1	0	0	1	8472.1580
8	1	22	16	1	0	0	0	2164.0220
9	1	33	12	0	0	1	0	12418.0700

d	magnitude
0.2	small
0.5	medium
0.8	large