######################
######################
# Part 2: Matching and Weighting
######################
######################

# Load required packages
library(MatchIt)
library(cobalt)
library(tidyverse)
library(janitor)
library(haven)
library(here)
library(WeightIt)
library(marginaleffects)

# Print the working directory using 'here()'
# This ensures that the current working directory is set correctly for the workshop.
print(here())

# Read in the data using a relative path for reproducibility
shadishclark <- read_dta(here("shadishclark.dta"))

# View the initial rows of the data to check it's loaded correctly
View(shadishclark)

#### Data Preparation ####

# Creating the treatment variable
# We are defining 'treatment' based on the 'vm' column
shadishclark <- shadishclark %>%
	mutate(treatment = case_when(
		vm == 1 ~ 1, # If 'vm' is 1, treatment is set to 1
		vm == 2 ~ 0, # If 'vm' is 2, treatment is set to 0
		TRUE ~ NA_real_ # For all other cases, treatment is set to NA
	))

# Display the frequency distribution of the treatment variable
tabyl(shadishclark$treatment)

# -----------------------------------------------------------------------------
# Initial Balance Assessment
# -----------------------------------------------------------------------------

# Here we assess the baseline balance between treatment and control groups
# without any matching technique applied.

# Fit a basic model to assess baseline balance
m.out <- matchit(
	treatment ~ vocabpre + numbmath + likemath + likelit +
		preflit + pagree + pconsc + pemot + pintell + mars + beck +
		cauc + afram + other + age + male + married + momdegr +
		daddegr + credit + majormi + actcomp + hsgpaar + collgpaa +
		mathpre + pextra,
	data = shadishclark,
	method = NULL
) # Using NULL method means no matching

# Generate summary statistics to assess initial balance between groups.
summary(m.out)

# Visualize the balance across all covariates before matching
plot(summary(m.out))

# -----------------------------------------------------------------------------
# Propensity Score (PS) Matching
# -----------------------------------------------------------------------------

# 1. Nearest Neighbor Matching without Caliper

m.out.matched <- matchit(
	treatment ~ vocabpre + numbmath + likemath + likelit +
		preflit + pagree + pconsc + pemot + pintell + mars + beck +
		cauc + afram + other + age + male + married + momdegr +
		daddegr + credit + majormi + actcomp + hsgpaar + collgpaa +
		mathpre + pextra,
	data = shadishclark,
	distance = "logit",
	method = "nearest"
)

# Extract and attach propensity scores to the original dataset
shadishclark$propensity_score <- m.out.matched$distance

# Display the initial rows showing the treatment and the corresponding propensity scores
head(select(shadishclark, treatment, propensity_score))

# Display balance statistics after matching
m.out.matched
summary(m.out.matched)
plot(summary(m.out.matched))
plot(m.out.matched, type = "jitter", interactive = F)

# 2. Nearest Neighbor Matching with Caliper

m.out.matched <- matchit(
	treatment ~ vocabpre + numbmath + likemath + likelit +
		preflit + pagree + pconsc + pemot + pintell + mars + beck +
		cauc + afram + other + age + male + married + momdegr +
		daddegr + credit + majormi + actcomp + hsgpaar + collgpaa +
		mathpre + pextra,
	data = shadishclark,
	distance = "logit",
	caliper = 0.25, # lets play with the caliper
	method = "nearest"
)

# Display balance statistics after matching
summary(m.out.matched)
plot(summary(m.out.matched))
plot(m.out.matched, type = "jitter", interactive = F)

# 3. Nearest Neighbor Matching with Replacement

m.out.matched <- matchit(
	treatment ~ vocabpre + numbmath + likemath + likelit +
		preflit + pagree + pconsc + pemot + pintell + mars + beck +
		cauc + afram + other + age + male + married + momdegr +
		daddegr + credit + majormi + actcomp + hsgpaar + collgpaa +
		mathpre + pextra,
	data = shadishclark,
	distance = "logit",
	caliper = 0.25,
	replace = T, # with or wihtout replacement
	method = "nearest"
)

# Display balance statistics after matching
summary(m.out.matched)
plot(summary(m.out.matched))
plot(m.out.matched, type = "jitter", interactive = F)

# 4. Optimal Matching

m.out.matched <- matchit(
	treatment ~ vocabpre + numbmath + likemath + likelit +
		preflit + pagree + pconsc + pemot + pintell + mars + beck +
		cauc + afram + other + age + male + married + momdegr +
		daddegr + credit + majormi + actcomp + hsgpaar + collgpaa +
		mathpre + pextra,
	data = shadishclark,
	distance = "logit",
	method = "optimal"
)

# Display balance statistics after matching
summary(m.out.matched)
plot(summary(m.out.matched))
plot(m.out.matched, type = "jitter", interactive = F)

# 5. Exact Matching on a Specific Variable (e.g., "married")

m.out.matched <- matchit(
	treatment ~ vocabpre + numbmath + likemath + likelit +
		preflit + pagree + pconsc + pemot + pintell + mars + beck +
		cauc + afram + other + age + male + married + momdegr +
		daddegr + credit + majormi + actcomp + hsgpaar + collgpaa +
		mathpre + pextra,
	data = shadishclark,
	exact = "married", # lets do some exact matching
	distance = "logit",
	caliper = 0.25,
	replace = T, # with or wihtout replacement
	method = "nearest"
)

# Display balance statistics after matching
summary(m.out.matched)
plot(summary(m.out.matched))
plot(m.out.matched, type = "jitter", interactive = F)

# -----------------------------------------------------------------------------
# Propensity Score (PS) Weighting
# -----------------------------------------------------------------------------

# Examine initial inbalance
bal.tab(
	treatment ~ vocabpre + numbmath + likemath + likelit +
		preflit + pagree + pconsc + pemot + pintell + mars + beck +
		cauc + afram + other + age + male + married + momdegr +
		daddegr + credit + majormi + actcomp + hsgpaar + collgpaa +
		mathpre + pextra,
	data = shadishclark,
	estimand = "ATE",
	thresholds = c(m = .25)
)

# IPTW for Average Treatment Effect (ATE)
w.out <- weightit(
	treatment ~ vocabpre + numbmath + likemath + likelit +
		preflit + pagree + pconsc + pemot + pintell + mars + beck +
		cauc + afram + other + age + male + married + momdegr +
		daddegr + credit + majormi + actcomp + hsgpaar + collgpaa +
		mathpre + pextra,
	data = shadishclark,
	method = "ps",
	estimand = "ATE"
)

w.out
summary(w.out)

bal.tab(w.out, stats = c("m", "v"), thresholds = c(m = .25))

# IPTW for Average Treatment Effect on the Treated (ATT)
w.out <- weightit(
	treatment ~ vocabpre + numbmath + likemath + likelit +
		preflit + pagree + pconsc + pemot + pintell + mars + beck +
		cauc + afram + other + age + male + married + momdegr +
		daddegr + credit + majormi + actcomp + hsgpaar + collgpaa +
		mathpre + pextra,
	data = shadishclark,
	method = "ps",
	estimand = "ATT"
)

# -----------------------------------------------------------------------------
# Naive Comparison (Without Matching or Weighting)
# -----------------------------------------------------------------------------

# Simple regression for treatment effect
naive.model <- lm(mathall ~ treatment, data = shadishclark)
summary(naive.model)

# -----------------------------------------------------------------------------
# Outcome Modeling after Weighting
# -----------------------------------------------------------------------------

# IPTW for Average Treatment Effect on the Treated (ATT)
w.out <- weightit(
  treatment ~ vocabpre + numbmath + likemath + likelit +
    preflit + pagree + pconsc + pemot + pintell + mars + beck +
    cauc + afram + other + age + male + married + momdegr +
    daddegr + credit + majormi + actcomp + hsgpaar + collgpaa +
    mathpre + pextra,
  data = shadishclark,
  method = "ps",
  estimand = "ATT"
)


# Fit a linear regression model on the weighted data
weighted.data <- shadishclark
weighted.data$weights <- w.out$weights

outcome.model.weighted <- lm(mathall ~ treatment, data = weighted.data, weights = weighted.data$weights)  
summary(outcome.model.weighted)

# -----------------------------------------------------------------------------
# Doubly Robust Estimation after Weighting
# -----------------------------------------------------------------------------

# Incorporate covariates in the outcome model to make it doubly robust
doubly.robust.model.weighted <- lm(mathall ~ treatment + vocabpre + numbmath + likemath + likelit +
                                     preflit + pagree + pconsc + pemot + pintell + mars + beck +
                                     cauc + afram + other + age + male + married + momdegr +
                                     daddegr + credit + majormi + actcomp + hsgpaar + collgpaa +
                                     mathpre + pextra,
                                   data = weighted.data, weights = weighted.data$weights)

summary(doubly.robust.model.weighted)

# -----------------------------------------------------------------------------
# Handling Standard Errors in Outcome Modeling with Weighted Data
# -----------------------------------------------------------------------------

avg_comparisons(doubly.robust.model.weighted,
                variables = "treatment",
                vcov = "HC3",
                wts = weighted.data$weights)