23  分层(Stratification)

23.1 估计倾向得分(逻辑回归)

Code
data(lalonde, package='Matching')
glimpse(lalonde)
#> Rows: 445
#> Columns: 12
#> $ age     <int> 37, 22, 30, 27, 33, 22, 23, 32, 22, 33, 19, 21, 18, 27, 17, 19…
#> $ educ    <int> 11, 9, 12, 11, 8, 9, 12, 11, 16, 12, 9, 13, 8, 10, 7, 10, 13, …
#> $ black   <int> 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
#> $ hisp    <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
#> $ married <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
#> $ nodegr  <int> 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,…
#> $ re74    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
#> $ re75    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
#> $ re78    <dbl> 9930.05, 3595.89, 24909.50, 7506.15, 289.79, 4056.49, 0.00, 84…
#> $ u74     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
#> $ u75     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
#> $ treat   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…

lalonde_formu <- treat ~ age + I(age^2) + educ + I(educ^2) + black +
    hisp + married + nodegr + re74  + I(re74^2) + re75 + I(re75^2)
lr_out <- glm(formula = lalonde_formu,
              data = lalonde,
              family = binomial(link = 'logit'))

倾向性得分就是模型的拟合值,检查倾向得分的分布,以确保我们有良好的重叠

Code
lalonde$lr_ps <- fitted(lr_out)

ggplot(lalonde, aes(x = lr_ps, color = as.logical(treat))) + 
    geom_density() +
    xlab('Propensity Score')

23.1.1 分层

根据倾向分数使用五分位数进行分层

Code
breaks5 <- psa::get_strata_breaks(lalonde$lr_ps)
str(breaks5)
#> List of 2
#>  $ breaks: Named num [1:6] 0.0849 0.3403 0.3594 0.408 0.5112 ...
#>   ..- attr(*, "names")= chr [1:6] "0%" "20%" "40%" "60%" ...
#>  $ labels:'data.frame':  5 obs. of  4 variables:
#>   ..$ strata: chr [1:5] "A" "B" "C" "D" ...
#>   ..$ xmin  : num [1:5] 0.0849 0.3403 0.3594 0.408 0.5112
#>   ..$ xmax  : num [1:5] 0.34 0.359 0.408 0.511 0.83
#>   ..$ xmid  : num [1:5] 0.213 0.35 0.384 0.46 0.671

lalonde$lr_strata5 <- cut(x = lalonde$lr_ps, 
                          breaks = breaks5$breaks, 
                          include.lowest = TRUE, 
                          labels = breaks5$labels$strata)
table(lalonde$treat, lalonde$lr_strata5)
#>    
#>      A  B  C  D  E
#>   0 66 61 51 42 40
#>   1 23 28 38 47 49
Code
ggplot(lalonde, aes(x = lr_ps, color = as.logical(treat))) + 
    geom_density(aes(fill = as.logical(treat)), alpha = 0.2) +
    geom_vline(xintercept = breaks5$breaks, alpha = 0.5) +
    geom_text(data = breaks5$labels, 
              aes(x = xmid, y = 0, label = strata),
              color = 'black', vjust = 1) +
    xlab('Propensity Score') + ylab('Density') +
    xlim(c(0, 1))

Code
ggplot() +
    geom_vline(xintercept = breaks5$breaks) +
    geom_point(data = lalonde, aes(x = lr_ps, y = log(re78 + 1), color = as.logical(treat)), alpha = 0.5) +
    geom_text(data = breaks5$labels, aes(x = xmid, y = 0, label = strata), color = 'black', vjust = 1) +
    xlab('Propensity Score')

23.1.2 查看平衡混杂效应

Code
covars <- all.vars(lalonde_formu)
covars <- lalonde[,covars[-1]]
PSAgraphics::cv.bal.psa(covariates = covars, 
                        treatment = lalonde$treat,
                        propensity = lalonde$lr_ps,
                        strata = lalonde$lr_strata)

23.1.2.1 数值变量的协变量平衡图

Code
PSAgraphics::box.psa(continuous = lalonde$age, 
                     treatment = lalonde$treat, 
                     strata = lalonde$lr_strata,
                     xlab = "Strata", 
                     balance = FALSE,
                     main = 'Covariate: age')

23.1.2.2 分类变量的协变量平衡图

Code
PSAgraphics::cat.psa(categorical = lalonde$nodegr, 
                     treatment = lalonde$treat, 
                     strata = lalonde$lr_strata, 
                     xlab = 'Strata',
                     balance = FALSE,
                     main = 'Covariate: nodegr')

#> $`treatment:stratum.proportions`
#>   0:A 1:A 0:B   1:B   0:C 1:C   0:D   1:D   0:E   1:E
#> 0   0   0   0 0.036 0.039   0 0.333 0.383 0.675 0.714
#> 1   1   1   1 0.964 0.961   1 0.667 0.617 0.325 0.286

23.2 估计因果效应

Code
PSAgraphics::loess.psa(response = log(lalonde$re78 + 1),
                       treatment = lalonde$treat,
                       propensity = lalonde$lr_ps)

#> $ATE
#> [1] 0.9008386
#> 
#> $se.wtd
#> [1] 0.3913399
#> 
#> $CI95
#> [1] 0.1181588 1.6835185
#> 
#> $summary.strata
#>    counts.0 counts.1  means.0  means.1 diff.means
#> 1        34       11 6.268705 6.474912  0.2062076
#> 2        32       12 5.491717 5.659280  0.1675631
#> 3        31       14 5.467712 5.703584  0.2358722
#> 4        30       14 5.425593 5.747613  0.3220194
#> 5        27       18 5.397146 5.831117  0.4339703
#> 6        24       20 5.302660 6.339721  1.0370619
#> 7        21       23 5.125331 6.607936  1.4826043
#> 8        21       24 5.036908 6.594808  1.5578999
#> 9        22       22 5.182703 6.981383  1.7986801
#> 10       18       27 6.047529 7.820786  1.7732573

psa::loess_plot(ps = lalonde$lr_ps,
                outcome = log(lalonde$re78 + 1),
                treatment = lalonde$treat == 1,
                outcomeTitle = 'log(re78)',
                
                plot.strata = 5,
                points.treat.alpha = 0.5,
                points.control.alpha = 0.5,
                percentPoints.treat = 1,
                percentPoints.control = 1,
                se = FALSE, 
                method = 'loess')

Code
PSAgraphics::circ.psa(response = log(lalonde$re78 + 1), 
                      treatment = lalonde$treat == 1, 
                      strata = lalonde$lr_strata5)

#> $summary.strata
#>   n.FALSE n.TRUE means.FALSE means.TRUE
#> A      66     23    6.280406   6.600537
#> B      61     28    4.409935   5.129193
#> C      51     38    6.212981   6.455034
#> D      42     47    4.705981   6.208840
#> E      40     49    5.783529   7.576461
#> 
#> $wtd.Mn.FALSE
#> [1] 5.478567
#> 
#> $wtd.Mn.TRUE
#> [1] 6.394013
#> 
#> $ATE
#> [1] 0.9154463
#> 
#> $se.wtd
#> [1] 0.394155
#> 
#> $approx.t
#> [1] 2.322554
#> 
#> $df
#> [1] 435
#> 
#> $CI.95
#> [1] 0.1407612 1.6901314

23.3 敏感性分析

评估该效果的稳健性

23.3.1 估计倾向得分(分类树)

Code
library(tree)
tree_out <- tree::tree(lalonde_formu,
                       data = lalonde)

plot(tree_out); text(tree_out)

Code


lalonde$tree_ps <- predict(tree_out)
table(lalonde$tree_ps, lalonde$treat, useNA = 'ifany')
#>                    
#>                       0   1
#>   0.332             167  83
#>   0.344827586206897  19  10
#>   0.351851851851852  35  19
#>   0.612903225806452  24  38
#>   0.659090909090909  15  29
#>   1                   0   6
lalonde$tree_strata <- predict(tree_out, type = 'where')
table(lalonde$tree_strata, lalonde$treat, useNA = 'ifany')
#>     
#>        0   1
#>   3  167  83
#>   5   15  29
#>   6   35  19
#>   9   24  38
#>   10  19  10
#>   11   0   6