3 分层（Stratification）

3.1 探索性分析

Show the code

data(lalonde, package='Matching')
lalonde <- lalonde %>% mutate(across(treat, as.factor))
str(lalonde)
#> 'data.frame':    445 obs. of  12 variables:
#>  $ age    : int  37 22 30 27 33 22 23 32 22 33 ...
#>  $ educ   : int  11 9 12 11 8 9 12 11 16 12 ...
#>  $ black  : int  1 0 1 1 1 1 1 1 1 0 ...
#>  $ hisp   : int  0 1 0 0 0 0 0 0 0 0 ...
#>  $ married: int  1 0 0 0 0 0 0 0 0 1 ...
#>  $ nodegr : int  1 1 0 1 1 1 0 1 0 0 ...
#>  $ re74   : num  0 0 0 0 0 0 0 0 0 0 ...
#>  $ re75   : num  0 0 0 0 0 0 0 0 0 0 ...
#>  $ re78   : num  9930 3596 24910 7506 290 ...
#>  $ u74    : int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ u75    : int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ treat  : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...

Show the code

ggplot(lalonde, aes(x = age, y = educ, shape = factor(married), color =treat)) + 
    geom_point() + 
    scale_color_manual(values = c(`0`="pink",`1`="skyblue"))+
    theme(legend.position = 'bottom')

Show the code

GGally::ggpairs(
    lalonde %>% select(age,educ, re78, treat),
    columns = c(1,2,3),
    mapping = ggplot2::aes(color = treat == 1),
    upper = list(continuous = 'cor'),
    lower = list(continuous = GGally::wrap("points", alpha = 0.3), 
                 combo = GGally::wrap("dot_no_facet", alpha = 0.4)),
    diag = list(continuous = "densityDiag", discrete = "barDiag", na = "naDiag", alpha = 0.4),
)

3.2 估计倾向得分（逻辑回归）

Show the code

lalonde_formula <- treat ~ age + I(age^2) + educ + I(educ^2) + black +
    hisp + married + nodegr + re74  + I(re74^2) + re75 + I(re75^2)

logit_fit <- glm(formula = lalonde_formula,
              data = lalonde,
              family = binomial(link = 'logit'))

倾向性得分就是模型的拟合值，

Show the code

lalonde$ps <- fitted(logit_fit)

检查倾向得分的分布，以确保我们有良好的重叠

Show the code

ggplot() +
    geom_histogram(data = subset(lalonde, treat == 1), aes(x = ps, y = after_stat(count)),
                   binwidth = 0.05, fill = "skyblue", color = "black", alpha = 0.7) +
    geom_histogram(data = subset(lalonde, treat == 0), aes(x = ps, y = - after_stat(count)),
                   binwidth = 0.05, fill = "pink", color = "black", alpha = 0.7) +
    labs(y = "Count", x = "PS")

Show the code



ggplot(lalonde, aes(x = ps, color = treat)) + 
    geom_density() +
    xlab('Propensity Score')+
    scale_color_manual(values = c(`0`="pink",`1`="skyblue"))

3.3 倾向得分分层

根据倾向分数使用五分位数进行分层

Show the code

breaks5 <- psa::get_strata_breaks(lalonde$ps)
str(breaks5)
#> List of 2
#>  $ breaks: Named num [1:6] 0.0849 0.3403 0.3594 0.408 0.5112 ...
#>   ..- attr(*, "names")= chr [1:6] "0%" "20%" "40%" "60%" ...
#>  $ labels:'data.frame':  5 obs. of  4 variables:
#>   ..$ strata: chr [1:5] "A" "B" "C" "D" ...
#>   ..$ xmin  : num [1:5] 0.0849 0.3403 0.3594 0.408 0.5112
#>   ..$ xmax  : num [1:5] 0.34 0.359 0.408 0.511 0.83
#>   ..$ xmid  : num [1:5] 0.213 0.35 0.384 0.46 0.671

lalonde$lr_strata5 <- cut(x = lalonde$ps, 
                          breaks = breaks5$breaks, 
                          include.lowest = TRUE, 
                          labels = breaks5$labels$strata)
table(lalonde$treat, lalonde$lr_strata5)
#>    
#>      A  B  C  D  E
#>   0 66 61 51 42 40
#>   1 23 28 38 47 49

Show the code

ggplot(lalonde, aes(x = ps, color = treat)) + 
    geom_density(aes(fill = treat), alpha = 0.2) +
    geom_vline(xintercept = breaks5$breaks, alpha = 0.5) +
    geom_text(data = breaks5$labels, 
              aes(x = xmid, y = 0, label = strata),
              color = 'black', vjust = 1) +
    xlab('Propensity Score') + ylab('Density') +
    xlim(c(0, 1))+
    scale_fill_manual(values = c(`0`="pink",`1`="skyblue"))

Show the code

ggplot() +
    geom_vline(xintercept = breaks5$breaks) +
    geom_point(data = lalonde, aes(x = ps, y = log(re78 + 1), color = treat), alpha = 0.5) +
    geom_text(data = breaks5$labels, aes(x = xmid, y = 0, label = strata), color = 'black', vjust = 1) +
    xlab('Propensity Score')+
    scale_color_manual(values = c(`0`="pink2",`1`="skyblue2"))

3.4 平衡诊断

3.4.1 效应大小

Show the code

covars <- all.vars(lalonde_formula)
covars <- lalonde[,covars[-1]]

PSAgraphics::cv.bal.psa(covariates = covars, 
                        treatment = lalonde$treat,
                        propensity = lalonde$ps,
                        strata = lalonde$lr_strata)

3.4.2 数值型协变量平衡图

Show the code

PSAgraphics::box.psa(continuous = lalonde$age, 
                     treatment = lalonde$treat, 
                     strata = lalonde$lr_strata,
                     xlab = "Strata", 
                     balance = FALSE,
                     main = 'Covariate: age')

3.4.3 分类型协变量平衡图

Show the code

PSAgraphics::cat.psa(categorical = lalonde$nodegr, 
                     treatment = lalonde$treat, 
                     strata = lalonde$lr_strata, 
                     xlab = 'Strata',
                     balance = FALSE,
                     main = 'Covariate: nodegr')

#> $`treatment:stratum.proportions`
#>   0:A 1:A 0:B   1:B   0:C 1:C   0:D   1:D   0:E   1:E
#> 0   0   0   0 0.036 0.039   0 0.333 0.383 0.675 0.714
#> 1   1   1   1 0.964 0.961   1 0.667 0.617 0.325 0.286

3.5 估计因果效应

Show the code

lalonde %>% 
ggplot(aes(x = ps, y = re78, color = treat)) + 
    geom_point(alpha = 0.5) +
    geom_smooth(method = 'loess', formula = y ~ x, alpha = 0.5) +
    xlab('Propensity Score') +
    theme(legend.position = 'bottom')

具有大约 95% 置信区间（灰色）的 Loess 回归线在整个倾向得分范围内重叠，说明异质或不均匀

3.5.1 分层

Show the code

lalonde$treat <- as.numeric(lalonde$treat)-1
dat <- lalonde |> mutate(
    ate_weight = psa::calculate_ps_weights(treat, ps, estimand = 'ATE'),
    att_weight = psa::calculate_ps_weights(treat, ps, estimand = 'ATT'),
    atc_weight = psa::calculate_ps_weights(treat, ps, estimand = 'ATC'),
    atm_weight = psa::calculate_ps_weights(treat, ps, estimand = 'ATM')
)

ggplot(dat, aes(x = ps, color = factor(treat))) + 
    geom_density(aes(fill = factor(treat)), alpha = 0.2) +
    geom_vline(xintercept = breaks5$breaks, alpha = 0.5) +
    geom_text(data = breaks5$labels, 
              aes(x = xmid, y = 0, label = strata),
              color = 'black', vjust = 0, size = 4) +
    xlab('Propensity Score') + ylab('Density') +
    xlim(c(0, 1))

Show the code

psa::stratification_plot(ps = dat$ps,
                    treatment = dat$treat,
                    outcome = dat$re78, 
                    n_strata = 5)

Show the code

PSAgraphics::loess.psa(response = log(lalonde$re78 + 1),
                       treatment = lalonde$treat,
                       propensity = lalonde$ps)

#> $ATE
#> [1] 0.9008386
#> 
#> $se.wtd
#> [1] 0.3913399
#> 
#> $CI95
#> [1] 0.1181588 1.6835185
#> 
#> $summary.strata
#>    counts.0 counts.1  means.0  means.1 diff.means
#> 1        34       11 6.268705 6.474912  0.2062076
#> 2        32       12 5.491717 5.659280  0.1675631
#> 3        31       14 5.467712 5.703584  0.2358722
#> 4        30       14 5.425593 5.747613  0.3220194
#> 5        27       18 5.397146 5.831117  0.4339703
#> 6        24       20 5.302660 6.339721  1.0370619
#> 7        21       23 5.125331 6.607936  1.4826043
#> 8        21       24 5.036908 6.594808  1.5578999
#> 9        22       22 5.182703 6.981383  1.7986801
#> 10       18       27 6.047529 7.820786  1.7732573

psa::loess_plot(ps = lalonde$ps,
                outcome = log(lalonde$re78 + 1),
                treatment = lalonde$treat == 1,
                outcomeTitle = 'log(re78)',
                
                plot.strata = 5,
                points.treat.alpha = 0.5,
                points.control.alpha = 0.5,
                percentPoints.treat = 1,
                percentPoints.control = 1,
                se = FALSE, 
                method = 'loess')

Show the code

PSAgraphics::circ.psa(response = log(lalonde$re78 + 1), 
                      treatment = lalonde$treat == 1, 
                      strata = lalonde$lr_strata5,
                      revc = T,
                      xlab = 'Treatment',
                      ylab = 'Control')

#> $summary.strata
#>   n.FALSE n.TRUE means.FALSE means.TRUE
#> A      66     23    6.280406   6.600537
#> B      61     28    4.409935   5.129193
#> C      51     38    6.212981   6.455034
#> D      42     47    4.705981   6.208840
#> E      40     49    5.783529   7.576461
#> 
#> $wtd.Mn.TRUE
#> [1] 6.394013
#> 
#> $wtd.Mn.FALSE
#> [1] 5.478567
#> 
#> $ATE
#> [1] -0.9154463
#> 
#> $se.wtd
#> [1] 0.394155
#> 
#> $approx.t
#> [1] -2.322554
#> 
#> $df
#> [1] 435
#> 
#> $CI.95
#> [1] -1.6901314 -0.1407612

将绘制每个层的平均处理（x 轴）与对照（y 轴）的对比。均值被投影到垂直于单位线的线（即线y=x），以便刻度线表示差异的分布。绿色条对应于 95% 置信区间。圆圈的大小与每个层内的样本大小成正比。因为y=x落在该线上的点表示零差（即y−x=0). 推而广之，如果绿线表示的置信区间跨越单位线，则无法否定原假设。然而，在此示例中，存在统计学上显著的处理效果。

3.5.2 平均治疗效应ATE

Show the code

ggplot() +
    geom_histogram(data = dat[dat$treat == 1,],
                   aes(x = ps, y = after_stat(count)),
                   bins = 50, alpha = 0.5) +
    geom_histogram(data = dat[dat$treat == 1,],
                   aes(x = ps, weight = ate_weight, y = after_stat(count)),
                   bins = 50, 
                   fill = "skyblue2", alpha = 0.5) +
    geom_histogram(data = dat[dat$treat == 0,],
                   aes(x = ps, y = -after_stat(count)),
                   bins = 50, alpha = 0.5) +
    geom_histogram(data = dat[dat$treat == 0,],
                   aes(x = ps, weight = ate_weight, y = -after_stat(count)),
                   bins = 50, 
                   fill = "pink2", alpha = 0.5) +
    ggtitle('Average Treatment Effect (ATE)')

3.5.3 接受治疗者的平均治疗效果（ATT）

Show the code

ggplot() +
    geom_histogram(data = dat[dat$treat == 1,],
                   aes(x = ps, y = after_stat(count)),
                   bins = 50, alpha = 0.5) +
    geom_histogram(data = dat[dat$treat == 1,],
                   aes(x = ps, weight = att_weight, y = after_stat(count)),
                   bins = 50, 
                   fill = "skyblue2", alpha = 0.5) +
    geom_histogram(data = dat[dat$treat == 0,],
                   aes(x = ps, y = -after_stat(count)),
                   bins = 50, alpha = 0.5) +
    geom_histogram(data = dat[dat$treat == 0,],
                   aes(x = ps, weight = att_weight, y = -after_stat(count)),
                   bins = 50, 
                   fill = "pink2", alpha = 0.5) +
    ggtitle('Average Treatment Effect Among the Treated (ATT)')

3.5.4 对照组的平均治疗效果（ATC）

Show the code

ggplot() +
    geom_histogram(data = dat[dat$treat == 1,],
                   aes(x = ps, y = after_stat(count)),
                   bins = 50, alpha = 0.5) +
    geom_histogram(data = dat[dat$treat == 1,],
                   aes(x = ps, weight = atc_weight, y = after_stat(count)),
                   bins = 50, 
                   fill = "skyblue2", alpha = 0.5) +
    geom_histogram(data = dat[dat$treat == 0,],
                   aes(x = ps, y = -after_stat(count)),
                   bins = 50, alpha = 0.5) +
    geom_histogram(data = dat[dat$treat == 0,],
                   aes(x = ps, weight = atc_weight, y = -after_stat(count)),
                   bins = 50, 
                   fill = "pink", alpha = 0.5) +
    ggtitle('Average Treatment Effect Among the Control (ATC)')

3.5.5 Average Treatment Effect Among the Evenly Matched (ATM)

Show the code

ggplot() +
    geom_histogram(data = dat[dat$treat == 1,],
                   aes(x = ps, y = after_stat(count)),
                   bins = 50, alpha = 0.5) +
    geom_histogram(data = dat[dat$treat == 1,],
                   aes(x = ps, weight = atm_weight, y = after_stat(count)),
                   bins = 50, 
                   fill = "skyblue2", alpha = 0.5) +
    geom_histogram(data = dat[dat$treat == 0,],
                   aes(x = ps, y = -after_stat(count)),
                   bins = 50, alpha = 0.5) +
    geom_histogram(data = dat[dat$treat == 0,],
                   aes(x = ps, weight = atm_weight, y = -after_stat(count)),
                   bins = 50, 
                   fill = "pink2", alpha = 0.5) +
    ggtitle('Average Treatment Effect Among the Evenly Matched (ATM)')

3.6 敏感性分析

评估该效果的稳健性

Rosenbaum （2012）建议另一种测试灵敏度的方法是测试原假设两次。我们将在这里使用分类树方法来估计倾向得分和分层。

Show the code

library(tree)
tree_out <- tree::tree(lalonde_formula,
                       data = lalonde)

plot(tree_out); text(tree_out)

Show the code



lalonde$tree_ps <- predict(tree_out)
table(lalonde$tree_ps, lalonde$treat, useNA = 'ifany')
#>                    
#>                       0   1
#>   0.332             167  83
#>   0.344827586206897  19  10
#>   0.351851851851852  35  19
#>   0.612903225806452  24  38
#>   0.659090909090909  15  29
#>   1                   0   6
lalonde$tree_strata <- predict(tree_out, type = 'where')
table(lalonde$tree_strata, lalonde$treat, useNA = 'ifany')
#>     
#>        0   1
#>   3  167  83
#>   5   15  29
#>   6   35  19
#>   9   24  38
#>   10  19  10
#>   11   0   6

3.1 探索性分析

3.2 估计倾向得分（逻辑回归）

3.3 倾向得分分层

3.4 平衡诊断

3.4.1 效应大小

3.4.2 数值型协变量平衡图

3.4.3 分类型协变量平衡图

3.5 估计因果效应

3.5.1 分层

3.5.2 平均治疗效应ATE

3.5.3 接受治疗者的平均治疗效果 （ATT）

3.5.4 对照组的平均治疗效果 （ATC）

3.5.5 Average Treatment Effect Among the Evenly Matched (ATM)

3.6 敏感性分析

3.5.3 接受治疗者的平均治疗效果（ATT）

3.5.4 对照组的平均治疗效果（ATC）