1 机器学习

典型建模过程的原理图

典型建模过程的原理图

1.1 探索性数据分析

Code
library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
#> ✔ broom        1.0.6     ✔ rsample      1.2.1
#> ✔ dials        1.2.1     ✔ tune         1.2.1
#> ✔ infer        1.0.7     ✔ workflows    1.1.4
#> ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
#> ✔ parsnip      1.2.1     ✔ yardstick    1.3.1
#> ✔ recipes      1.1.0
data(ames)

ggplot(ames, aes(x = Sale_Price)) + 
  geom_histogram(bins = 50, col= "white")

Code




ames <- ames |>mutate(Sale_Price = log10(Sale_Price))

ggplot(ames, aes(x = Sale_Price)) + 
  geom_histogram(bins = 50, col= "white")+
    geom_vline(xintercept =quantile(ames$Sale_Price),lty=5 )

1.2 拆分训练集、验证集和测试集 rsample

1.2.1 简单抽样

Code
set.seed(10)
ames_split <- initial_split(ames, prop = c(0.8))
ames_split

ames_train <- training(ames_split)
ames_test  <-  testing(ames_split)
dim(ames_train)

1.2.2 分层抽样

Code
set.seed(100)
ames_split <- initial_split(ames, prop = 0.80, strata = Sale_Price)
ames_train <- training(ames_split)
ames_test  <-  testing(ames_split)

dim(ames_train)
#> [1] 2342   74

1.2.3 验证集

Code
set.seed(101)

# To put 60% into training, 20% in validation, and remaining 20% in testing:
ames_split <- initial_validation_split(ames, prop = c(0.6, 0.2),
                                       strata = Sale_Price)
ames_split

ames_train <- training(ames_split)
ames_test <- testing(ames_split)
ames_valid <- validation(ames_split)

1.3 模型选择 parsnip

Code
parsnip_addin()
Code
show_engines('linear_reg')
#> # A tibble: 7 × 2
#>   engine mode      
#>   <chr>  <chr>     
#> 1 lm     regression
#> 2 glm    regression
#> 3 glmnet regression
#> 4 stan   regression
#> 5 spark  regression
#> 6 keras  regression
#> 7 brulee regression
show_engines("logistic_reg")
#> # A tibble: 7 × 2
#>   engine    mode          
#>   <chr>     <chr>         
#> 1 glm       classification
#> 2 glmnet    classification
#> 3 LiblineaR classification
#> 4 spark     classification
#> 5 keras     classification
#> 6 stan      classification
#> 7 brulee    classification
Code
lm_model <- linear_reg() |> 
    set_engine("lm") |> 
    set_mode(mode = "regression")
lm_model
#> Linear Regression Model Specification (regression)
#> 
#> Computational engine: lm

logistic_reg() |>
  set_mode("classification") |>
  set_engine("glm") 
#> Logistic Regression Model Specification (classification)
#> 
#> Computational engine: glm

rand_forest(trees = 1000, min_n = 5) |>
  set_engine("ranger", verbose = TRUE) |>
  set_mode("regression") 
#> Random Forest Model Specification (regression)
#> 
#> Main Arguments:
#>   trees = 1000
#>   min_n = 5
#> 
#> Engine-Specific Arguments:
#>   verbose = TRUE
#> 
#> Computational engine: ranger

decision_tree(min_n = 2) |>
  set_engine("rpart") |>
  set_mode("regression")
#> Decision Tree Model Specification (regression)
#> 
#> Main Arguments:
#>   min_n = 2
#> 
#> Computational engine: rpart

1.4 模型工作流 workflows

1.4.1 线性模型

预处理 Preprocessor

Code

#  Preprocessor
# None

lm_wflow <- 
  workflow() %>% 
  add_model(lm_model)

lm_wflow
#> ══ Workflow ════════════════════════════════════════════════════════════════════
#> Preprocessor: None
#> Model: linear_reg()
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Linear Regression Model Specification (regression)
#> 
#> Computational engine: lm


# Formula
lm_wflow <- lm_wflow |> 
    add_formula(Sale_Price ~ Longitude + Latitude)

lm_wflow
#> ══ Workflow ════════════════════════════════════════════════════════════════════
#> Preprocessor: Formula
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Sale_Price ~ Longitude + Latitude
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Linear Regression Model Specification (regression)
#> 
#> Computational engine: lm

#  拟合
lm_fit <- fit(lm_wflow, ames_train)
lm_fit
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Formula
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Sale_Price ~ Longitude + Latitude
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> 
#> Call:
#> stats::lm(formula = ..y ~ ., data = data)
#> 
#> Coefficients:
#> (Intercept)    Longitude     Latitude  
#>    -314.938       -2.138        2.853


# 更换公式 再拟合
lm_wflow %>% update_formula(Sale_Price ~ Longitude) |> fit(ames_train)
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Formula
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Sale_Price ~ Longitude
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> 
#> Call:
#> stats::lm(formula = ..y ~ ., data = data)
#> 
#> Coefficients:
#> (Intercept)    Longitude  
#>    -191.309       -2.099



# Variables:outcomes ~ predictors
lm_wflow <- 
  lm_wflow %>% 
  remove_formula() %>% 
  add_variables(outcomes  = Sale_Price, predictors = c(Longitude, Latitude))  # c(ends_with("tude"))

lm_wflow |> fit(ames_train)
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Variables
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Outcomes: Sale_Price
#> Predictors: c(Longitude, Latitude)
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> 
#> Call:
#> stats::lm(formula = ..y ~ ., data = data)
#> 
#> Coefficients:
#> (Intercept)    Longitude     Latitude  
#>    -314.938       -2.138        2.853


# recepe

1.4.2 预测

Code

# 回归 "numeric" , "conf_int","pred_int","raw".
#  censored regression   "time","hazard","survival"
# 分类  "class", "prob",
# "quantile"

# When NULL, predict() will choose an appropriate value based on the model's mode.
predict(lm_fit, ames_test)  # "numeric"
#> # A tibble: 588 × 1
#>    .pred
#>    <dbl>
#>  1  5.23
#>  2  5.29
#>  3  5.28
#>  4  5.27
#>  5  5.26
#>  6  5.24
#>  7  5.24
#>  8  5.24
#>  9  5.24
#> 10  5.30
#> # ℹ 578 more rows

1.4.3 混合效应模型 multilevelmod

Code
library(multilevelmod)
multilevel_spec <- linear_reg() %>% set_engine("lmer")
multilevel_spec
#> Linear Regression Model Specification (regression)
#> 
#> Computational engine: lmer

df <-  read_delim("data/lme_anova.txt",) |> pivot_longer(cols = 3:7,names_to = "time",values_to = "BP") |> 
    mutate_at(1:3,as.factor)
#> Rows: 15 Columns: 7
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: "\t"
#> chr (1): induced_method
#> dbl (6): subject, t0, t1, t2, t3, t4
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df
#> # A tibble: 75 × 4
#>    subject induced_method time     BP
#>    <fct>   <fct>          <fct> <dbl>
#>  1 1       A              t0      120
#>  2 1       A              t1      108
#>  3 1       A              t2      112
#>  4 1       A              t3      120
#>  5 1       A              t4      117
#>  6 2       A              t0      118
#>  7 2       A              t1      109
#>  8 2       A              t2      115
#>  9 2       A              t3      126
#> 10 2       A              t4      123
#> # ℹ 65 more rows
multilevel_workflow <- 
  workflow() %>% 
  add_variables(outcome = BP, predictors = c(induced_method,time,subject)) %>% 
  add_model(multilevel_spec, 
            # This formula is given to the model
            formula = BP ~ induced_method+time + ( 1| subject))

multilevel_workflow
#> ══ Workflow ════════════════════════════════════════════════════════════════════
#> Preprocessor: Variables
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Outcomes: BP
#> Predictors: c(induced_method, time, subject)
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Linear Regression Model Specification (regression)
#> 
#> Computational engine: lmer
multilevel_workflow |>  fit(data =df) 
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Variables
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Outcomes: BP
#> Predictors: c(induced_method, time, subject)
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Linear mixed model fit by REML ['lmerMod']
#> Formula: BP ~ induced_method + time + (1 | subject)
#>    Data: data
#> REML criterion at convergence: 431.0591
#> Random effects:
#>  Groups   Name        Std.Dev.
#>  subject  (Intercept) 3.441   
#>  Residual             4.434   
#> Number of obs: 75, groups:  subject, 15
#> Fixed Effects:
#>     (Intercept)  induced_methodB  induced_methodC           timet1  
#>         118.360            4.800            8.520           -4.400  
#>          timet2           timet3           timet4  
#>          -4.467            9.400            6.067

1.4.4 生存模型 censored

type = "time" type = "survival" type = "linear_pred" type = "quantile" type = "hazard"

Code
library(censored)
#> Loading required package: survival

parametric_spec <- survival_reg()

parametric_workflow <- 
  workflow() %>% 
  add_variables(outcome = c(fustat, futime), predictors = c(age, rx)) %>% 
  add_model(parametric_spec, 
            formula = Surv(futime, fustat) ~ age + strata(rx))

parametric_fit <- fit(parametric_workflow, data = ovarian)
parametric_fit
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Variables
#> Model: survival_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Outcomes: c(fustat, futime)
#> Predictors: c(age, rx)
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Call:
#> survival::survreg(formula = Surv(futime, fustat) ~ age + strata(rx), 
#>     data = data, model = TRUE)
#> 
#> Coefficients:
#> (Intercept)         age 
#>  12.8734120  -0.1033569 
#> 
#> Scale:
#>      rx=1      rx=2 
#> 0.7695509 0.4703602 
#> 
#> Loglik(model)= -89.4   Loglik(intercept only)= -97.1
#>  Chisq= 15.36 on 1 degrees of freedom, p= 8.88e-05 
#> n= 26

1.4.5 工作流集 workflowsets

Code
location <- list(
  longitude = Sale_Price ~ Longitude,
  latitude = Sale_Price ~ Latitude,
  coords = Sale_Price ~ Longitude + Latitude,
  neighborhood = Sale_Price ~ Neighborhood
)

library(workflowsets)
location_models <- workflow_set(preproc = location, models = list(lm = lm_model))
location_models
#> # A workflow set/tibble: 4 × 4
#>   wflow_id        info             option    result    
#>   <chr>           <list>           <list>    <list>    
#> 1 longitude_lm    <tibble [1 × 4]> <opts[0]> <list [0]>
#> 2 latitude_lm     <tibble [1 × 4]> <opts[0]> <list [0]>
#> 3 coords_lm       <tibble [1 × 4]> <opts[0]> <list [0]>
#> 4 neighborhood_lm <tibble [1 × 4]> <opts[0]> <list [0]>
location_models$info[[1]]
#> # A tibble: 1 × 4
#>   workflow   preproc model      comment
#>   <list>     <chr>   <chr>      <chr>  
#> 1 <workflow> formula linear_reg ""
extract_workflow(location_models, id = "coords_lm")
#> ══ Workflow ════════════════════════════════════════════════════════════════════
#> Preprocessor: Formula
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Sale_Price ~ Longitude + Latitude
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Linear Regression Model Specification (regression)
#> 
#> Computational engine: lm


location_models <-
   location_models %>%
   mutate(fit = map(info, ~ fit(.x$workflow[[1]], ames_train)))
location_models
#> # A workflow set/tibble: 4 × 5
#>   wflow_id        info             option    result     fit       
#>   <chr>           <list>           <list>    <list>     <list>    
#> 1 longitude_lm    <tibble [1 × 4]> <opts[0]> <list [0]> <workflow>
#> 2 latitude_lm     <tibble [1 × 4]> <opts[0]> <list [0]> <workflow>
#> 3 coords_lm       <tibble [1 × 4]> <opts[0]> <list [0]> <workflow>
#> 4 neighborhood_lm <tibble [1 × 4]> <opts[0]> <list [0]> <workflow>

location_models$fit[[1]]
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Formula
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Sale_Price ~ Longitude
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> 
#> Call:
#> stats::lm(formula = ..y ~ ., data = data)
#> 
#> Coefficients:
#> (Intercept)    Longitude  
#>    -191.309       -2.099
Code
final_lm_res <- last_fit(lm_wflow, ames_split)
final_lm_res
#> # Resampling results
#> # Manual resampling 
#> # A tibble: 1 × 6
#>   splits             id               .metrics .notes   .predictions .workflow 
#>   <list>             <chr>            <list>   <list>   <list>       <list>    
#> 1 <split [2342/588]> train/test split <tibble> <tibble> <tibble>     <workflow>
extract_workflow(final_lm_res)
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Variables
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Outcomes: Sale_Price
#> Predictors: c(Longitude, Latitude)
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> 
#> Call:
#> stats::lm(formula = ..y ~ ., data = data)
#> 
#> Coefficients:
#> (Intercept)    Longitude     Latitude  
#>    -314.938       -2.138        2.853

collect_metrics(final_lm_res)
#> # A tibble: 2 × 4
#>   .metric .estimator .estimate .config             
#>   <chr>   <chr>          <dbl> <chr>               
#> 1 rmse    standard       0.157 Preprocessor1_Model1
#> 2 rsq     standard       0.152 Preprocessor1_Model1
collect_predictions(final_lm_res) %>% slice(1:5)
#> # A tibble: 5 × 5
#>   .pred id                .row Sale_Price .config             
#>   <dbl> <chr>            <int>      <dbl> <chr>               
#> 1  5.23 train/test split     1       5.33 Preprocessor1_Model1
#> 2  5.29 train/test split     6       5.29 Preprocessor1_Model1
#> 3  5.28 train/test split    13       5.26 Preprocessor1_Model1
#> 4  5.27 train/test split    14       5.23 Preprocessor1_Model1
#> 5  5.26 train/test split    16       5.73 Preprocessor1_Model1

1.5 特征工程 recipes

1.5.1 虚拟变量

Code
simple_ames <- 
  recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type,
         data = ames_train) %>%
  step_log(Gr_Liv_Area, base = 10) %>% 
  step_dummy(all_nominal_predictors())# 因子或字符变量,名义nominal
# all_numeric_predictors()  all_numeric()  all_predictors()  all_outcomes()

# 跨模型循环使用
simple_ames
#> 
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#> 
#> ── Inputs
#> Number of variables by role
#> outcome:   1
#> predictor: 4
#> 
#> ── Operations
#> • Log transformation on: Gr_Liv_Area
#> • Dummy variables from: all_nominal_predictors()
Code
lm_wflow <- 
  lm_wflow %>% 
    #一次只能有一种预处理方法,需要在添加配方之前删除现有的预处理器
  remove_variables() %>% 
  add_recipe(simple_ames)

lm_wflow
#> ══ Workflow ════════════════════════════════════════════════════════════════════
#> Preprocessor: Recipe
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> 2 Recipe Steps
#> 
#> • step_log()
#> • step_dummy()
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Linear Regression Model Specification (regression)
#> 
#> Computational engine: lm

lm_fit <- fit(lm_wflow, ames_train)
predict(lm_fit, ames_test)
#> Warning in predict.lm(object = object$fit, newdata = new_data, type =
#> "response", : prediction from rank-deficient fit; consider predict(.,
#> rankdeficient="NA")
#> # A tibble: 588 × 1
#>    .pred
#>    <dbl>
#>  1  5.24
#>  2  5.27
#>  3  5.24
#>  4  5.20
#>  5  5.66
#>  6  5.12
#>  7  5.08
#>  8  5.22
#>  9  5.00
#> 10  5.37
#> # ℹ 578 more rows

# 提取模型信息
lm_fit %>% 
  extract_recipe(estimated = TRUE)
#> 
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#> 
#> ── Inputs
#> Number of variables by role
#> outcome:   1
#> predictor: 4
#> 
#> ── Training information
#> Training data contained 2342 data points and no incomplete rows.
#> 
#> ── Operations
#> • Log transformation on: Gr_Liv_Area | Trained
#> • Dummy variables from: Neighborhood and Bldg_Type | Trained

lm_fit %>% 
  extract_fit_parsnip() %>% 
  tidy()
#> # A tibble: 35 × 5
#>    term                            estimate std.error statistic   p.value
#>    <chr>                              <dbl>     <dbl>     <dbl>     <dbl>
#>  1 (Intercept)                     -1.10     0.234       -4.70  2.72e-  6
#>  2 Gr_Liv_Area                      0.646    0.0143      45.0   1.88e-318
#>  3 Year_Built                       0.00217  0.000118    18.5   3.21e- 71
#>  4 Neighborhood_College_Creek       0.00897  0.00841      1.07  2.87e-  1
#>  5 Neighborhood_Old_Town           -0.0178   0.00846     -2.11  3.51e-  2
#>  6 Neighborhood_Edwards            -0.0413   0.00769     -5.37  8.59e-  8
#>  7 Neighborhood_Somerset            0.0433   0.00992      4.37  1.30e-  5
#>  8 Neighborhood_Northridge_Heights  0.129    0.0102      12.7   6.96e- 36
#>  9 Neighborhood_Gilbert            -0.0437   0.00939     -4.65  3.43e-  6
#> 10 Neighborhood_Sawyer             -0.00785  0.00841     -0.933 3.51e-  1
#> # ℹ 25 more rows
Code

simple_ames <- 
  recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type,
         data = ames_train) %>%
  step_log(Gr_Liv_Area, base = 10) %>% 
  step_other(Neighborhood, threshold = 0.01) %>% 
  step_dummy(all_nominal_predictors())
Code
ggplot(ames_train, aes(x = Gr_Liv_Area, y = 10^Sale_Price)) + 
  geom_point(alpha = .2) + 
  facet_wrap(~ Bldg_Type) + 
  geom_smooth(method = lm, formula = y ~ x, se = FALSE, color = "blue") + 
  scale_x_log10() + 
  scale_y_log10() + 
  labs(x = "Gross Living Area", y = "Sale Price (USD)")

1.5.2 交互项

step_interact(~ interaction terms) , +分隔不同交互效应

Code
simple_ames <- 
  recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type,
         data = ames_train) %>%
  step_log(Gr_Liv_Area, base = 10) %>% 
  step_other(Neighborhood, threshold = 0.01) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  # Gr_Liv_Area is on the log scale from a previous step
  step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") )

simple_ames 
#> 
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#> 
#> ── Inputs
#> Number of variables by role
#> outcome:   1
#> predictor: 4
#> 
#> ── Operations
#> • Log transformation on: Gr_Liv_Area
#> • Collapsing factor levels for: Neighborhood
#> • Dummy variables from: all_nominal_predictors()
#> • Interactions with: Gr_Liv_Area:starts_with("Bldg_Type_")

1.5.3 样条函数

添加非线性特征

Code
library(patchwork)
library(splines)
plot_smoother <- function(deg_free) {
  ggplot(ames_train, aes(x = Latitude, y = 10^Sale_Price)) + 
    geom_point(alpha = .2) + 
    scale_y_log10() +
    geom_smooth(
      method = lm,
      formula = y ~ ns(x, df = deg_free),# natural splines.
      color = "lightblue",
      se = FALSE
    ) +
    labs(title = paste(deg_free, "Spline Terms"),
         y = "Sale Price (USD)")
}

( plot_smoother(1) + plot_smoother(5) ) / ( plot_smoother(20) + plot_smoother(100) )

Code
recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + Latitude,
         data = ames_train) %>%
  step_ns(Latitude, deg_free = 20)
#> 
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#> 
#> ── Inputs
#> Number of variables by role
#> outcome:   1
#> predictor: 5
#> 
#> ── Operations
#> • Natural splines on: Latitude

1.5.4 特征提取

PCA,

Code
  # Use a regular expression to capture house size predictors: 
recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + Latitude+Total_Bsmt_SF+First_Flr_SF+Gr_Liv_Area,
         data = ames_train) %>%
    step_pca(matches("(SF$)|(Gr_Liv)"))
#> 
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#> 
#> ── Inputs
#> Number of variables by role
#> outcome:   1
#> predictor: 7
#> 
#> ── Operations
#> • PCA extraction with: matches("(SF$)|(Gr_Liv)")

1.5.5 行采样

Downsampling,Upsampling,Hybrid

step_filter() step_sample() step_slice() step_arrange() skip TRUE

Code
library(themis)
step_downsample(outcome_column_name)

1.5.6 一般转换

step_mutate() 比,Bedroom_AbvGr / Full_Bath

1.5.7 tidy()

Code
ames_rec <- 
  recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 
           Latitude + Longitude, data = ames_train) %>%
  step_log(Gr_Liv_Area, base = 10) %>% 
  step_other(Neighborhood, threshold = 0.01) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") ) %>% 
  step_ns(Latitude, Longitude, deg_free = 20)
tidy(ames_rec)
#> # A tibble: 5 × 6
#>   number operation type     trained skip  id            
#>    <int> <chr>     <chr>    <lgl>   <lgl> <chr>         
#> 1      1 step      log      FALSE   FALSE log_qeD8W     
#> 2      2 step      other    FALSE   FALSE other_Hirvj   
#> 3      3 step      dummy    FALSE   FALSE dummy_HJDrx   
#> 4      4 step      interact FALSE   FALSE interact_B7kcu
#> 5      5 step      ns       FALSE   FALSE ns_yBiqt
Code
ames_rec <- 
  recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 
           Latitude + Longitude, data = ames_train) %>%
  step_log(Gr_Liv_Area, base = 10) %>% 
  step_other(Neighborhood, threshold = 0.01, id = "my_id") %>% 
  step_dummy(all_nominal_predictors()) %>% 
  step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") ) %>% 
  step_ns(Latitude, Longitude, deg_free = 20)

lm_wflow <- 
  workflow() %>% 
  add_model(lm_model) %>% 
  add_recipe(ames_rec)

lm_fit <- fit(lm_wflow, ames_train)

estimated_recipe <- 
  lm_fit %>% 
  extract_recipe(estimated = TRUE)

tidy(estimated_recipe, id = "my_id")
#> # A tibble: 21 × 3
#>    terms        retained           id   
#>    <chr>        <chr>              <chr>
#>  1 Neighborhood North_Ames         my_id
#>  2 Neighborhood College_Creek      my_id
#>  3 Neighborhood Old_Town           my_id
#>  4 Neighborhood Edwards            my_id
#>  5 Neighborhood Somerset           my_id
#>  6 Neighborhood Northridge_Heights my_id
#>  7 Neighborhood Gilbert            my_id
#>  8 Neighborhood Sawyer             my_id
#>  9 Neighborhood Northwest_Ames     my_id
#> 10 Neighborhood Sawyer_West        my_id
#> # ℹ 11 more rows
tidy(estimated_recipe, number = 2)
#> # A tibble: 21 × 3
#>    terms        retained           id   
#>    <chr>        <chr>              <chr>
#>  1 Neighborhood North_Ames         my_id
#>  2 Neighborhood College_Creek      my_id
#>  3 Neighborhood Old_Town           my_id
#>  4 Neighborhood Edwards            my_id
#>  5 Neighborhood Somerset           my_id
#>  6 Neighborhood Northridge_Heights my_id
#>  7 Neighborhood Gilbert            my_id
#>  8 Neighborhood Sawyer             my_id
#>  9 Neighborhood Northwest_Ames     my_id
#> 10 Neighborhood Sawyer_West        my_id
#> # ℹ 11 more rows

1.5.8 列角色

Code
ames_rec %>% update_role(Year_Built, new_role = "Street")
#> 
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#> 
#> ── Inputs
#> Number of variables by role
#> outcome:   1
#> predictor: 5
#> Street:    1
#> 
#> ── Operations
#> • Log transformation on: Gr_Liv_Area
#> • Collapsing factor levels for: Neighborhood
#> • Dummy variables from: all_nominal_predictors()
#> • Interactions with: Gr_Liv_Area:starts_with("Bldg_Type_")
#> • Natural splines on: Latitude and Longitude

1.5.9 自然语言处理

textrecipes

1.6 模型评估 yardstick

1.6.1 回归指标

Code
ames_test_res <- predict(lm_fit, new_data = ames_test %>% select(-Sale_Price))
ames_test_res
#> # A tibble: 588 × 1
#>    .pred
#>    <dbl>
#>  1  5.24
#>  2  5.27
#>  3  5.24
#>  4  5.21
#>  5  5.65
#>  6  5.13
#>  7  5.04
#>  8  5.22
#>  9  5.00
#> 10  5.32
#> # ℹ 578 more rows

ames_test_res <- bind_cols(ames_test_res, ames_test %>% select(Sale_Price))
ames_test_res
#> # A tibble: 588 × 2
#>    .pred Sale_Price
#>    <dbl>      <dbl>
#>  1  5.24       5.33
#>  2  5.27       5.29
#>  3  5.24       5.26
#>  4  5.21       5.23
#>  5  5.65       5.73
#>  6  5.13       5.17
#>  7  5.04       5.06
#>  8  5.22       5.26
#>  9  5.00       5.02
#> 10  5.32       5.24
#> # ℹ 578 more rows
Code
ggplot(ames_test_res, aes(x = Sale_Price, y = .pred)) + 
  # Create a diagonal line:
  geom_abline(lty = 2) + 
  geom_point(alpha = 0.5) + 
  labs(y = "Predicted Sale Price (log10)", x = "Sale Price (log10)") +
  # Scale and size the x- and y-axis uniformly:
  coord_obs_pred()

1.6.1.1 均方根误差RMSE

Code
rmse(ames_test_res, truth = Sale_Price, estimate = .pred)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 rmse    standard      0.0772

1.6.1.2 决定系数R2,平均绝对误差MAE

Code
ames_metrics <- metric_set(rmse, rsq, mae)
ames_metrics(ames_test_res, truth = Sale_Price, estimate = .pred)
#> # A tibble: 3 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 rmse    standard      0.0772
#> 2 rsq     standard      0.795 
#> 3 mae     standard      0.0550

1.6.2 二分类指标

Code
data(two_class_example)
tibble(two_class_example)
#> # A tibble: 500 × 4
#>    truth   Class1   Class2 predicted
#>    <fct>    <dbl>    <dbl> <fct>    
#>  1 Class2 0.00359 0.996    Class2   
#>  2 Class1 0.679   0.321    Class1   
#>  3 Class2 0.111   0.889    Class2   
#>  4 Class1 0.735   0.265    Class1   
#>  5 Class2 0.0162  0.984    Class2   
#>  6 Class1 0.999   0.000725 Class1   
#>  7 Class1 0.999   0.000799 Class1   
#>  8 Class1 0.812   0.188    Class1   
#>  9 Class2 0.457   0.543    Class2   
#> 10 Class2 0.0976  0.902    Class2   
#> # ℹ 490 more rows
Code
# 混淆矩阵
# A confusion matrix: 
conf_mat(two_class_example, truth = truth, estimate = predicted)
#>           Truth
#> Prediction Class1 Class2
#>     Class1    227     50
#>     Class2     31    192
Code
# Accuracy:
accuracy(two_class_example, truth, predicted)
#> # A tibble: 1 × 3
#>   .metric  .estimator .estimate
#>   <chr>    <chr>          <dbl>
#> 1 accuracy binary         0.838
Code
# Matthews correlation coefficient:
mcc(two_class_example, truth, predicted)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 mcc     binary         0.677
Code
# F1 metric:
f_meas(two_class_example, truth, predicted)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 f_meas  binary         0.849
Code
# Combining these three classification metrics together
classification_metrics <- metric_set(accuracy, mcc, f_meas)
classification_metrics(two_class_example, truth = truth, estimate = predicted)
#> # A tibble: 3 × 3
#>   .metric  .estimator .estimate
#>   <chr>    <chr>          <dbl>
#> 1 accuracy binary         0.838
#> 2 mcc      binary         0.677
#> 3 f_meas   binary         0.849

感兴趣 事件水平

第二级逻辑将结果编码为0/1(在这种情况下,第二个值是事件)

Code
f_meas(two_class_example, truth, predicted, event_level = "second")
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 f_meas  binary         0.826

1.6.2.1 ROC,AUC

不使用预测类列,对于两类问题,感兴趣事件的概率列将传递到函数中

Code
two_class_curve <- roc_curve(two_class_example, truth, Class1)
two_class_curve
#> # A tibble: 502 × 3
#>    .threshold specificity sensitivity
#>         <dbl>       <dbl>       <dbl>
#>  1 -Inf           0                 1
#>  2    1.79e-7     0                 1
#>  3    4.50e-6     0.00413           1
#>  4    5.81e-6     0.00826           1
#>  5    5.92e-6     0.0124            1
#>  6    1.22e-5     0.0165            1
#>  7    1.40e-5     0.0207            1
#>  8    1.43e-5     0.0248            1
#>  9    2.38e-5     0.0289            1
#> 10    3.30e-5     0.0331            1
#> # ℹ 492 more rows
roc_auc(two_class_example, truth, Class1)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 roc_auc binary         0.939
Code
autoplot(two_class_curve)+
    annotate("text",x=0.5,y=0.25,label="AUC=0.939")

1.6.3 多分类指标

Code
data(hpc_cv)
tibble(hpc_cv)
#> # A tibble: 3,467 × 7
#>    obs   pred     VF      F       M          L Resample
#>    <fct> <fct> <dbl>  <dbl>   <dbl>      <dbl> <chr>   
#>  1 VF    VF    0.914 0.0779 0.00848 0.0000199  Fold01  
#>  2 VF    VF    0.938 0.0571 0.00482 0.0000101  Fold01  
#>  3 VF    VF    0.947 0.0495 0.00316 0.00000500 Fold01  
#>  4 VF    VF    0.929 0.0653 0.00579 0.0000156  Fold01  
#>  5 VF    VF    0.942 0.0543 0.00381 0.00000729 Fold01  
#>  6 VF    VF    0.951 0.0462 0.00272 0.00000384 Fold01  
#>  7 VF    VF    0.914 0.0782 0.00767 0.0000354  Fold01  
#>  8 VF    VF    0.918 0.0744 0.00726 0.0000157  Fold01  
#>  9 VF    VF    0.843 0.128  0.0296  0.000192   Fold01  
#> 10 VF    VF    0.920 0.0728 0.00703 0.0000147  Fold01  
#> # ℹ 3,457 more rows
Code
accuracy(hpc_cv, obs, pred)
#> # A tibble: 1 × 3
#>   .metric  .estimator .estimate
#>   <chr>    <chr>          <dbl>
#> 1 accuracy multiclass     0.709
mcc(hpc_cv, obs, pred)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 mcc     multiclass     0.515

二分类可拓展到多分类

Code
sensitivity(hpc_cv, obs, pred, estimator = "macro")
#> # A tibble: 1 × 3
#>   .metric     .estimator .estimate
#>   <chr>       <chr>          <dbl>
#> 1 sensitivity macro          0.560
sensitivity(hpc_cv, obs, pred, estimator = "macro_weighted")
#> # A tibble: 1 × 3
#>   .metric     .estimator     .estimate
#>   <chr>       <chr>              <dbl>
#> 1 sensitivity macro_weighted     0.709
sensitivity(hpc_cv, obs, pred, estimator = "micro")
#> # A tibble: 1 × 3
#>   .metric     .estimator .estimate
#>   <chr>       <chr>          <dbl>
#> 1 sensitivity micro          0.709

多分类

Code
roc_auc(hpc_cv, obs, VF, F, M, L)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 roc_auc hand_till      0.829
roc_auc(hpc_cv, obs, VF, F, M, L, estimator = "macro_weighted")
#> # A tibble: 1 × 3
#>   .metric .estimator     .estimate
#>   <chr>   <chr>              <dbl>
#> 1 roc_auc macro_weighted     0.868
Code
hpc_cv %>% 
  group_by(Resample) %>% 
  accuracy(obs, pred)
#> # A tibble: 10 × 4
#>    Resample .metric  .estimator .estimate
#>    <chr>    <chr>    <chr>          <dbl>
#>  1 Fold01   accuracy multiclass     0.726
#>  2 Fold02   accuracy multiclass     0.712
#>  3 Fold03   accuracy multiclass     0.758
#>  4 Fold04   accuracy multiclass     0.712
#>  5 Fold05   accuracy multiclass     0.712
#>  6 Fold06   accuracy multiclass     0.697
#>  7 Fold07   accuracy multiclass     0.675
#>  8 Fold08   accuracy multiclass     0.721
#>  9 Fold09   accuracy multiclass     0.673
#> 10 Fold10   accuracy multiclass     0.699

# Four 1-vs-all ROC curves for each fold
hpc_cv %>% 
  group_by(Resample) %>% 
  roc_curve(obs, VF, F, M, L) %>% 
  autoplot()

1.7 重采样 rsample

Code
ames_rec
#> 
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#> 
#> ── Inputs
#> Number of variables by role
#> outcome:   1
#> predictor: 6
#> 
#> ── Operations
#> • Log transformation on: Gr_Liv_Area
#> • Collapsing factor levels for: Neighborhood
#> • Dummy variables from: all_nominal_predictors()
#> • Interactions with: Gr_Liv_Area:starts_with("Bldg_Type_")
#> • Natural splines on: Latitude and Longitude
Code
rf_model <- 
  rand_forest(trees = 1000) %>% 
  set_engine("ranger") %>% 
  set_mode("regression")

rf_wflow <- 
  workflow() %>% 
  add_formula(
    Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 
      Latitude + Longitude) %>% 
  add_model(rf_model) 

rf_fit <- rf_wflow %>% fit(data = ames_train)
Code
estimate_perf <- function(model, dat) {
  # Capture the names of the `model` and `dat` objects
  cl <- match.call()
  obj_name <- as.character(cl$model)
  data_name <- as.character(cl$dat)
  data_name <- gsub("ames_", "", data_name)
  
  # Estimate these metrics:
  reg_metrics <- metric_set(rmse, rsq)
  
  model %>%
    predict(dat) %>%
    bind_cols(dat %>% select(Sale_Price)) %>%
    reg_metrics(Sale_Price, .pred) %>%
    select(-.estimator) %>%
    mutate(object = obj_name, data = data_name)
}

estimate_perf(rf_fit, ames_train)
#> # A tibble: 2 × 4
#>   .metric .estimate object data 
#>   <chr>       <dbl> <chr>  <chr>
#> 1 rmse       0.0364 rf_fit train
#> 2 rsq        0.962  rf_fit train
estimate_perf(lm_fit, ames_train)
#> # A tibble: 2 × 4
#>   .metric .estimate object data 
#>   <chr>       <dbl> <chr>  <chr>
#> 1 rmse       0.0749 lm_fit train
#> 2 rsq        0.824  lm_fit train
Code
estimate_perf(rf_fit, ames_test)
#> # A tibble: 2 × 4
#>   .metric .estimate object data 
#>   <chr>       <dbl> <chr>  <chr>
#> 1 rmse       0.0677 rf_fit test 
#> 2 rsq        0.843  rf_fit test

1.7.1 交叉验证

V-fold cross-validation,数据被随机划分为样本量大致相等的V组(称为折叠),例如10重交叉验证

Code
set.seed(1001)
# 10-fold cross-validation
ames_folds <- vfold_cv(ames_train, v = 10)

# 分析集/评估集
ames_folds
#> #  10-fold cross-validation 
#> # A tibble: 10 × 2
#>    splits             id    
#>    <list>             <chr> 
#>  1 <split [2107/235]> Fold01
#>  2 <split [2107/235]> Fold02
#>  3 <split [2108/234]> Fold03
#>  4 <split [2108/234]> Fold04
#>  5 <split [2108/234]> Fold05
#>  6 <split [2108/234]> Fold06
#>  7 <split [2108/234]> Fold07
#>  8 <split [2108/234]> Fold08
#>  9 <split [2108/234]> Fold09
#> 10 <split [2108/234]> Fold10


# 检索分区数据
ames_folds$splits[[1]] %>% analysis() |> dim()
#> [1] 2107   74

model_spec %>% fit_resamples(formula, resamples, ...)

model_spec %>% fit_resamples(recipe, resamples, ...)

workflow %>% fit_resamples( resamples, ...)

Code
keep_pred <- control_resamples(save_pred = TRUE, save_workflow = TRUE)

set.seed(1003)
rf_res <- 
  rf_wflow %>% 
  fit_resamples(resamples = ames_folds, control = keep_pred)
rf_res
#> # Resampling results
#> # 10-fold cross-validation 
#> # A tibble: 10 × 5
#>    splits             id     .metrics         .notes           .predictions
#>    <list>             <chr>  <list>           <list>           <list>      
#>  1 <split [2107/235]> Fold01 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
#>  2 <split [2107/235]> Fold02 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
#>  3 <split [2108/234]> Fold03 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
#>  4 <split [2108/234]> Fold04 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
#>  5 <split [2108/234]> Fold05 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
#>  6 <split [2108/234]> Fold06 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
#>  7 <split [2108/234]> Fold07 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
#>  8 <split [2108/234]> Fold08 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
#>  9 <split [2108/234]> Fold09 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
#> 10 <split [2108/234]> Fold10 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>
collect_metrics(rf_res)
#> # A tibble: 2 × 6
#>   .metric .estimator   mean     n std_err .config             
#>   <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
#> 1 rmse    standard   0.0726    10 0.00258 Preprocessor1_Model1
#> 2 rsq     standard   0.834     10 0.0116  Preprocessor1_Model1
collect_metrics(rf_res,summarize = F)
#> # A tibble: 20 × 5
#>    id     .metric .estimator .estimate .config             
#>    <chr>  <chr>   <chr>          <dbl> <chr>               
#>  1 Fold01 rmse    standard      0.0647 Preprocessor1_Model1
#>  2 Fold01 rsq     standard      0.861  Preprocessor1_Model1
#>  3 Fold02 rmse    standard      0.0662 Preprocessor1_Model1
#>  4 Fold02 rsq     standard      0.860  Preprocessor1_Model1
#>  5 Fold03 rmse    standard      0.0793 Preprocessor1_Model1
#>  6 Fold03 rsq     standard      0.808  Preprocessor1_Model1
#>  7 Fold04 rmse    standard      0.0861 Preprocessor1_Model1
#>  8 Fold04 rsq     standard      0.761  Preprocessor1_Model1
#>  9 Fold05 rmse    standard      0.0738 Preprocessor1_Model1
#> 10 Fold05 rsq     standard      0.857  Preprocessor1_Model1
#> 11 Fold06 rmse    standard      0.0678 Preprocessor1_Model1
#> 12 Fold06 rsq     standard      0.860  Preprocessor1_Model1
#> 13 Fold07 rmse    standard      0.0644 Preprocessor1_Model1
#> 14 Fold07 rsq     standard      0.873  Preprocessor1_Model1
#> 15 Fold08 rmse    standard      0.0673 Preprocessor1_Model1
#> 16 Fold08 rsq     standard      0.829  Preprocessor1_Model1
#> 17 Fold09 rmse    standard      0.0719 Preprocessor1_Model1
#> 18 Fold09 rsq     standard      0.840  Preprocessor1_Model1
#> 19 Fold10 rmse    standard      0.0848 Preprocessor1_Model1
#> 20 Fold10 rsq     standard      0.790  Preprocessor1_Model1
assess_res <- collect_predictions(rf_res)
assess_res
#> # A tibble: 2,342 × 5
#>    .pred id      .row Sale_Price .config             
#>    <dbl> <chr>  <int>      <dbl> <chr>               
#>  1  5.17 Fold01    10       5.05 Preprocessor1_Model1
#>  2  5.04 Fold01    27       5.06 Preprocessor1_Model1
#>  3  5.11 Fold01    47       5.10 Preprocessor1_Model1
#>  4  5.14 Fold01    52       5.11 Preprocessor1_Model1
#>  5  5.09 Fold01    59       5.05 Preprocessor1_Model1
#>  6  4.88 Fold01    63       4.77 Preprocessor1_Model1
#>  7  5.01 Fold01    65       5.10 Preprocessor1_Model1
#>  8  5.16 Fold01    66       5    Preprocessor1_Model1
#>  9  4.96 Fold01    67       5.10 Preprocessor1_Model1
#> 10  4.84 Fold01    68       4.91 Preprocessor1_Model1
#> # ℹ 2,332 more rows
Code
assess_res %>% 
  ggplot(aes(x = Sale_Price, y = .pred)) + 
  geom_point(alpha = .15) +
  geom_abline(color = "red") + 
  coord_obs_pred() + 
  ylab("Predicted")

Code
# 找到残差最大的2个
over_predicted <- 
  assess_res %>% 
  mutate(residual = Sale_Price - .pred) %>% 
  arrange(desc(abs(residual))) %>% 
  slice(1:2)
over_predicted
#> # A tibble: 2 × 6
#>   .pred id      .row Sale_Price .config              residual
#>   <dbl> <chr>  <int>      <dbl> <chr>                   <dbl>
#> 1  4.97 Fold10    33       4.11 Preprocessor1_Model1   -0.858
#> 2  4.93 Fold04   323       4.12 Preprocessor1_Model1   -0.814

ames_train %>% 
  slice(over_predicted$.row) %>% 
  select(Gr_Liv_Area, Neighborhood, Year_Built, Bedroom_AbvGr, Full_Bath)
#> # A tibble: 2 × 5
#>   Gr_Liv_Area Neighborhood           Year_Built Bedroom_AbvGr Full_Bath
#>         <int> <fct>                       <int>         <int>     <int>
#> 1         832 Old_Town                     1923             2         1
#> 2         733 Iowa_DOT_and_Rail_Road       1952             2         1

重复交叉验证

Code
# 10-fold cross-validation repeated 5 times 
vfold_cv(ames_train, v = 10, repeats = 5)
#> #  10-fold cross-validation repeated 5 times 
#> # A tibble: 50 × 3
#>    splits             id      id2   
#>    <list>             <chr>   <chr> 
#>  1 <split [2107/235]> Repeat1 Fold01
#>  2 <split [2107/235]> Repeat1 Fold02
#>  3 <split [2108/234]> Repeat1 Fold03
#>  4 <split [2108/234]> Repeat1 Fold04
#>  5 <split [2108/234]> Repeat1 Fold05
#>  6 <split [2108/234]> Repeat1 Fold06
#>  7 <split [2108/234]> Repeat1 Fold07
#>  8 <split [2108/234]> Repeat1 Fold08
#>  9 <split [2108/234]> Repeat1 Fold09
#> 10 <split [2108/234]> Repeat1 Fold10
#> # ℹ 40 more rows

留一交叉验证

leave-one-out (LOO) cross-validation

loo_cv()

蒙特卡罗交叉验证

Monte Carlo cross-validation,MCCV,将固定比例的数据分配给分析集和评估集。该比例的数据每次都是随机选择的,导致评估集不相互排斥

Code
mc_cv(ames_train, prop = 9/10, times = 20)
#> # Monte Carlo cross-validation (0.9/0.1) with 20 resamples  
#> # A tibble: 20 × 2
#>    splits             id        
#>    <list>             <chr>     
#>  1 <split [2107/235]> Resample01
#>  2 <split [2107/235]> Resample02
#>  3 <split [2107/235]> Resample03
#>  4 <split [2107/235]> Resample04
#>  5 <split [2107/235]> Resample05
#>  6 <split [2107/235]> Resample06
#>  7 <split [2107/235]> Resample07
#>  8 <split [2107/235]> Resample08
#>  9 <split [2107/235]> Resample09
#> 10 <split [2107/235]> Resample10
#> 11 <split [2107/235]> Resample11
#> 12 <split [2107/235]> Resample12
#> 13 <split [2107/235]> Resample13
#> 14 <split [2107/235]> Resample14
#> 15 <split [2107/235]> Resample15
#> 16 <split [2107/235]> Resample16
#> 17 <split [2107/235]> Resample17
#> 18 <split [2107/235]> Resample18
#> 19 <split [2107/235]> Resample19
#> 20 <split [2107/235]> Resample20

1.7.2 验证集

Code
set.seed(101)

# To put 60% into training, 20% in validation, and remaining 20% in testing:
ames_validation_split <- initial_validation_split(ames, prop = c(0.6, 0.2))
ames_validation_split
#> <Training/Validation/Testing/Total>
#> <1758/586/586/2930>


# Object used for resampling: 
val_set <- validation_set(ames_validation_split)
val_set
#> # A tibble: 1 × 2
#>   splits             id        
#>   <list>             <chr>     
#> 1 <split [1758/586]> validation
Code
val_res <- rf_wflow %>% fit_resamples(resamples = val_set)
val_res
#> # Resampling results
#> # Validation Set (0.75/0.25) 
#> # A tibble: 1 × 4
#>   splits             id         .metrics         .notes          
#>   <list>             <chr>      <list>           <list>          
#> 1 <split [1758/586]> validation <tibble [2 × 4]> <tibble [0 × 3]>
collect_metrics(val_res)
#> # A tibble: 2 × 6
#>   .metric .estimator   mean     n std_err .config             
#>   <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
#> 1 rmse    standard   0.0809     1      NA Preprocessor1_Model1
#> 2 rsq     standard   0.818      1      NA Preprocessor1_Model1

1.7.3 自助法

Bootstrap resampling

replacement

out-of-bag sample

Code
bootstraps(ames_train, times = 5)
#> # Bootstrap sampling 
#> # A tibble: 5 × 2
#>   splits             id        
#>   <list>             <chr>     
#> 1 <split [2342/882]> Bootstrap1
#> 2 <split [2342/849]> Bootstrap2
#> 3 <split [2342/870]> Bootstrap3
#> 4 <split [2342/875]> Bootstrap4
#> 5 <split [2342/849]> Bootstrap5

1.7.4 Rolling forecast origin resampling

滚动预测原点重采样

时间序列数据

Code
time_slices <- 
  tibble(x = 1:365) %>% 
  rolling_origin(initial = 6 * 30, assess = 30, skip = 29, cumulative = FALSE)

data_range <- function(x) {
  summarize(x, first = min(x), last = max(x))
}

map_dfr(time_slices$splits, ~   analysis(.x) %>% data_range())
#> # A tibble: 6 × 2
#>   first  last
#>   <int> <int>
#> 1     1   180
#> 2    31   210
#> 3    61   240
#> 4    91   270
#> 5   121   300
#> 6   151   330

map_dfr(time_slices$splits, ~ assessment(.x) %>% data_range())
#> # A tibble: 6 × 2
#>   first  last
#>   <int> <int>
#> 1   181   210
#> 2   211   240
#> 3   241   270
#> 4   271   300
#> 5   301   330
#> 6   331   360

1.7.5 并行计算

Code
parallel::detectCores(logical = FALSE)
#> [1] 4
parallel::detectCores(logical = TRUE)
#> [1] 8

1.7.6 保存重采样对象

Code
ames_rec <- 
  recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 
           Latitude + Longitude, data = ames_train) %>%
  step_other(Neighborhood, threshold = 0.01) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") ) %>% 
  step_ns(Latitude, Longitude, deg_free = 20)

lm_wflow <-  
  workflow() %>% 
  add_recipe(ames_rec) %>% 
  add_model(linear_reg() %>% set_engine("lm")) 

lm_fit <- lm_wflow %>% fit(data = ames_train)

# Select the recipe: 
extract_recipe(lm_fit, estimated = TRUE)
#> 
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#> 
#> ── Inputs
#> Number of variables by role
#> outcome:   1
#> predictor: 6
#> 
#> ── Training information
#> Training data contained 2342 data points and no incomplete rows.
#> 
#> ── Operations
#> • Collapsing factor levels for: Neighborhood | Trained
#> • Dummy variables from: Neighborhood and Bldg_Type | Trained
#> • Interactions with: Gr_Liv_Area:(Bldg_Type_TwoFmCon + Bldg_Type_Duplex +
#>   Bldg_Type_Twnhs + Bldg_Type_TwnhsE) | Trained
#> • Natural splines on: Latitude and Longitude | Trained
Code
get_model <- function(x) {
  extract_fit_parsnip(x) %>% tidy()
}

get_model(lm_fit)
#> # A tibble: 72 × 5
#>    term                             estimate  std.error statistic   p.value
#>    <chr>                               <dbl>      <dbl>     <dbl>     <dbl>
#>  1 (Intercept)                      0.895    0.303          2.95  3.17e-  3
#>  2 Gr_Liv_Area                      0.000177 0.00000444    39.8   4.16e-263
#>  3 Year_Built                       0.00204  0.000139      14.7   1.51e- 46
#>  4 Neighborhood_College_Creek      -0.0283   0.0334        -0.849 3.96e-  1
#>  5 Neighborhood_Old_Town           -0.0489   0.0125        -3.91  9.48e-  5
#>  6 Neighborhood_Edwards            -0.0823   0.0273        -3.01  2.62e-  3
#>  7 Neighborhood_Somerset            0.0735   0.0192         3.83  1.33e-  4
#>  8 Neighborhood_Northridge_Heights  0.148    0.0278         5.33  1.07e-  7
#>  9 Neighborhood_Gilbert             0.0306   0.0218         1.41  1.60e-  1
#> 10 Neighborhood_Sawyer             -0.111    0.0257        -4.33  1.56e-  5
#> # ℹ 62 more rows
Code
ctrl <- control_resamples(extract = get_model)

lm_res <- lm_wflow %>%  fit_resamples(resamples = ames_folds, control = ctrl)
lm_res
#> # Resampling results
#> # 10-fold cross-validation 
#> # A tibble: 10 × 5
#>    splits             id     .metrics         .notes           .extracts       
#>    <list>             <chr>  <list>           <list>           <list>          
#>  1 <split [2107/235]> Fold01 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#>  2 <split [2107/235]> Fold02 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#>  3 <split [2108/234]> Fold03 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#>  4 <split [2108/234]> Fold04 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#>  5 <split [2108/234]> Fold05 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#>  6 <split [2108/234]> Fold06 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#>  7 <split [2108/234]> Fold07 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#>  8 <split [2108/234]> Fold08 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#>  9 <split [2108/234]> Fold09 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#> 10 <split [2108/234]> Fold10 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
Code
lm_res$.extracts[[1]]
#> # A tibble: 1 × 2
#>   .extracts         .config             
#>   <list>            <chr>               
#> 1 <tibble [72 × 5]> Preprocessor1_Model1

lm_res$.extracts[[1]][[1]]
#> [[1]]
#> # A tibble: 72 × 5
#>    term                             estimate  std.error statistic   p.value
#>    <chr>                               <dbl>      <dbl>     <dbl>     <dbl>
#>  1 (Intercept)                      0.950    0.324          2.93  3.42e-  3
#>  2 Gr_Liv_Area                      0.000175 0.00000481    36.3   4.09e-223
#>  3 Year_Built                       0.00201  0.000149      13.5   1.08e- 39
#>  4 Neighborhood_College_Creek      -0.0273   0.0359        -0.762 4.46e-  1
#>  5 Neighborhood_Old_Town           -0.0543   0.0134        -4.04  5.51e-  5
#>  6 Neighborhood_Edwards            -0.0825   0.0293        -2.81  4.95e-  3
#>  7 Neighborhood_Somerset            0.0706   0.0207         3.40  6.76e-  4
#>  8 Neighborhood_Northridge_Heights  0.139    0.0298         4.65  3.55e-  6
#>  9 Neighborhood_Gilbert             0.0197   0.0235         0.835 4.04e-  1
#> 10 Neighborhood_Sawyer             -0.117    0.0273        -4.30  1.77e-  5
#> # ℹ 62 more rows
Code
all_coef <- map_dfr(lm_res$.extracts, ~ .x[[1]][[1]])
# Show the replicates for a single predictor:
filter(all_coef, term == "Year_Built")
#> # A tibble: 10 × 5
#>    term       estimate std.error statistic  p.value
#>    <chr>         <dbl>     <dbl>     <dbl>    <dbl>
#>  1 Year_Built  0.00201  0.000149      13.5 1.08e-39
#>  2 Year_Built  0.00203  0.000152      13.3 6.61e-39
#>  3 Year_Built  0.00189  0.000147      12.9 1.85e-36
#>  4 Year_Built  0.00210  0.000145      14.5 1.98e-45
#>  5 Year_Built  0.00211  0.000148      14.3 2.19e-44
#>  6 Year_Built  0.00204  0.000147      13.9 4.52e-42
#>  7 Year_Built  0.00215  0.000150      14.3 1.97e-44
#>  8 Year_Built  0.00213  0.000144      14.7 9.43e-47
#>  9 Year_Built  0.00208  0.000151      13.8 1.77e-41
#> 10 Year_Built  0.00212  0.000148      14.3 4.48e-44
Back to top