1 机器学习

1.1 探索性数据分析

Code

library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
#> ✔ broom        1.0.6     ✔ rsample      1.2.1
#> ✔ dials        1.2.1     ✔ tune         1.2.1
#> ✔ infer        1.0.7     ✔ workflows    1.1.4
#> ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
#> ✔ parsnip      1.2.1     ✔ yardstick    1.3.1
#> ✔ recipes      1.1.0
data(ames)

ggplot(ames, aes(x = Sale_Price)) + 
  geom_histogram(bins = 50, col= "white")

Code





ames <- ames |>mutate(Sale_Price = log10(Sale_Price))

ggplot(ames, aes(x = Sale_Price)) + 
  geom_histogram(bins = 50, col= "white")+
    geom_vline(xintercept =quantile(ames$Sale_Price),lty=5 )

1.2 拆分训练集、验证集和测试集 `rsample`

1.2.1 简单抽样

Code

set.seed(10)
ames_split <- initial_split(ames, prop = c(0.8))
ames_split

ames_train <- training(ames_split)
ames_test  <-  testing(ames_split)
dim(ames_train)

1.2.2 分层抽样

Code

set.seed(100)
ames_split <- initial_split(ames, prop = 0.80, strata = Sale_Price)
ames_train <- training(ames_split)
ames_test  <-  testing(ames_split)

dim(ames_train)
#> [1] 2342   74

1.2.3 验证集

Code

set.seed(101)

# To put 60% into training, 20% in validation, and remaining 20% in testing:
ames_split <- initial_validation_split(ames, prop = c(0.6, 0.2),
                                       strata = Sale_Price)
ames_split

ames_train <- training(ames_split)
ames_test <- testing(ames_split)
ames_valid <- validation(ames_split)

1.3 模型选择 `parsnip`

Code

parsnip_addin()

Code

show_engines('linear_reg')
#> # A tibble: 7 × 2
#>   engine mode      
#>   <chr>  <chr>     
#> 1 lm     regression
#> 2 glm    regression
#> 3 glmnet regression
#> 4 stan   regression
#> 5 spark  regression
#> 6 keras  regression
#> 7 brulee regression
show_engines("logistic_reg")
#> # A tibble: 7 × 2
#>   engine    mode          
#>   <chr>     <chr>         
#> 1 glm       classification
#> 2 glmnet    classification
#> 3 LiblineaR classification
#> 4 spark     classification
#> 5 keras     classification
#> 6 stan      classification
#> 7 brulee    classification

Code

lm_model <- linear_reg() |> 
    set_engine("lm") |> 
    set_mode(mode = "regression")
lm_model
#> Linear Regression Model Specification (regression)
#> 
#> Computational engine: lm

logistic_reg() |>
  set_mode("classification") |>
  set_engine("glm") 
#> Logistic Regression Model Specification (classification)
#> 
#> Computational engine: glm

rand_forest(trees = 1000, min_n = 5) |>
  set_engine("ranger", verbose = TRUE) |>
  set_mode("regression") 
#> Random Forest Model Specification (regression)
#> 
#> Main Arguments:
#>   trees = 1000
#>   min_n = 5
#> 
#> Engine-Specific Arguments:
#>   verbose = TRUE
#> 
#> Computational engine: ranger

decision_tree(min_n = 2) |>
  set_engine("rpart") |>
  set_mode("regression")
#> Decision Tree Model Specification (regression)
#> 
#> Main Arguments:
#>   min_n = 2
#> 
#> Computational engine: rpart

1.4 模型工作流 `workflows`

1.4.1 线性模型

预处理 Preprocessor

Code


#  Preprocessor
# None

lm_wflow <- 
  workflow() %>% 
  add_model(lm_model)

lm_wflow
#> ══ Workflow ════════════════════════════════════════════════════════════════════
#> Preprocessor: None
#> Model: linear_reg()
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Linear Regression Model Specification (regression)
#> 
#> Computational engine: lm


# Formula
lm_wflow <- lm_wflow |> 
    add_formula(Sale_Price ~ Longitude + Latitude)

lm_wflow
#> ══ Workflow ════════════════════════════════════════════════════════════════════
#> Preprocessor: Formula
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Sale_Price ~ Longitude + Latitude
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Linear Regression Model Specification (regression)
#> 
#> Computational engine: lm

#  拟合
lm_fit <- fit(lm_wflow, ames_train)
lm_fit
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Formula
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Sale_Price ~ Longitude + Latitude
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> 
#> Call:
#> stats::lm(formula = ..y ~ ., data = data)
#> 
#> Coefficients:
#> (Intercept)    Longitude     Latitude  
#>    -314.938       -2.138        2.853


# 更换公式 再拟合
lm_wflow %>% update_formula(Sale_Price ~ Longitude) |> fit(ames_train)
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Formula
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Sale_Price ~ Longitude
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> 
#> Call:
#> stats::lm(formula = ..y ~ ., data = data)
#> 
#> Coefficients:
#> (Intercept)    Longitude  
#>    -191.309       -2.099



# Variables:outcomes ~ predictors
lm_wflow <- 
  lm_wflow %>% 
  remove_formula() %>% 
  add_variables(outcomes  = Sale_Price, predictors = c(Longitude, Latitude))  # c(ends_with("tude"))

lm_wflow |> fit(ames_train)
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Variables
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Outcomes: Sale_Price
#> Predictors: c(Longitude, Latitude)
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> 
#> Call:
#> stats::lm(formula = ..y ~ ., data = data)
#> 
#> Coefficients:
#> (Intercept)    Longitude     Latitude  
#>    -314.938       -2.138        2.853


# recepe

1.4.2 预测

Code


# 回归 "numeric" , "conf_int","pred_int","raw".
#  censored regression   "time"，"hazard","survival"
# 分类  "class", "prob",
# "quantile"

# When NULL, predict() will choose an appropriate value based on the model's mode.
predict(lm_fit, ames_test)  # "numeric"
#> # A tibble: 588 × 1
#>    .pred
#>    <dbl>
#>  1  5.23
#>  2  5.29
#>  3  5.28
#>  4  5.27
#>  5  5.26
#>  6  5.24
#>  7  5.24
#>  8  5.24
#>  9  5.24
#> 10  5.30
#> # ℹ 578 more rows

1.4.3 混合效应模型 `multilevelmod`

Code

library(multilevelmod)
multilevel_spec <- linear_reg() %>% set_engine("lmer")
multilevel_spec
#> Linear Regression Model Specification (regression)
#> 
#> Computational engine: lmer

df <-  read_delim("data/lme_anova.txt",) |> pivot_longer(cols = 3:7,names_to = "time",values_to = "BP") |> 
    mutate_at(1:3,as.factor)
#> Rows: 15 Columns: 7
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: "\t"
#> chr (1): induced_method
#> dbl (6): subject, t0, t1, t2, t3, t4
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df
#> # A tibble: 75 × 4
#>    subject induced_method time     BP
#>    <fct>   <fct>          <fct> <dbl>
#>  1 1       A              t0      120
#>  2 1       A              t1      108
#>  3 1       A              t2      112
#>  4 1       A              t3      120
#>  5 1       A              t4      117
#>  6 2       A              t0      118
#>  7 2       A              t1      109
#>  8 2       A              t2      115
#>  9 2       A              t3      126
#> 10 2       A              t4      123
#> # ℹ 65 more rows
multilevel_workflow <- 
  workflow() %>% 
  add_variables(outcome = BP, predictors = c(induced_method,time,subject)) %>% 
  add_model(multilevel_spec, 
            # This formula is given to the model
            formula = BP ~ induced_method+time + ( 1| subject))

multilevel_workflow
#> ══ Workflow ════════════════════════════════════════════════════════════════════
#> Preprocessor: Variables
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Outcomes: BP
#> Predictors: c(induced_method, time, subject)
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Linear Regression Model Specification (regression)
#> 
#> Computational engine: lmer
multilevel_workflow |>  fit(data =df) 
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Variables
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Outcomes: BP
#> Predictors: c(induced_method, time, subject)
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Linear mixed model fit by REML ['lmerMod']
#> Formula: BP ~ induced_method + time + (1 | subject)
#>    Data: data
#> REML criterion at convergence: 431.0591
#> Random effects:
#>  Groups   Name        Std.Dev.
#>  subject  (Intercept) 3.441   
#>  Residual             4.434   
#> Number of obs: 75, groups:  subject, 15
#> Fixed Effects:
#>     (Intercept)  induced_methodB  induced_methodC           timet1  
#>         118.360            4.800            8.520           -4.400  
#>          timet2           timet3           timet4  
#>          -4.467            9.400            6.067

1.4.4 生存模型 `censored`

type = "time" type = "survival" type = "linear_pred" type = "quantile" type = "hazard"

Code

library(censored)
#> Loading required package: survival

parametric_spec <- survival_reg()

parametric_workflow <- 
  workflow() %>% 
  add_variables(outcome = c(fustat, futime), predictors = c(age, rx)) %>% 
  add_model(parametric_spec, 
            formula = Surv(futime, fustat) ~ age + strata(rx))

parametric_fit <- fit(parametric_workflow, data = ovarian)
parametric_fit
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Variables
#> Model: survival_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Outcomes: c(fustat, futime)
#> Predictors: c(age, rx)
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Call:
#> survival::survreg(formula = Surv(futime, fustat) ~ age + strata(rx), 
#>     data = data, model = TRUE)
#> 
#> Coefficients:
#> (Intercept)         age 
#>  12.8734120  -0.1033569 
#> 
#> Scale:
#>      rx=1      rx=2 
#> 0.7695509 0.4703602 
#> 
#> Loglik(model)= -89.4   Loglik(intercept only)= -97.1
#>  Chisq= 15.36 on 1 degrees of freedom, p= 8.88e-05 
#> n= 26

1.4.5 工作流集 `workflowsets`

Code

location <- list(
  longitude = Sale_Price ~ Longitude,
  latitude = Sale_Price ~ Latitude,
  coords = Sale_Price ~ Longitude + Latitude,
  neighborhood = Sale_Price ~ Neighborhood
)

library(workflowsets)
location_models <- workflow_set(preproc = location, models = list(lm = lm_model))
location_models
#> # A workflow set/tibble: 4 × 4
#>   wflow_id        info             option    result    
#>   <chr>           <list>           <list>    <list>    
#> 1 longitude_lm    <tibble [1 × 4]> <opts[0]> <list [0]>
#> 2 latitude_lm     <tibble [1 × 4]> <opts[0]> <list [0]>
#> 3 coords_lm       <tibble [1 × 4]> <opts[0]> <list [0]>
#> 4 neighborhood_lm <tibble [1 × 4]> <opts[0]> <list [0]>
location_models$info[[1]]
#> # A tibble: 1 × 4
#>   workflow   preproc model      comment
#>   <list>     <chr>   <chr>      <chr>  
#> 1 <workflow> formula linear_reg ""
extract_workflow(location_models, id = "coords_lm")
#> ══ Workflow ════════════════════════════════════════════════════════════════════
#> Preprocessor: Formula
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Sale_Price ~ Longitude + Latitude
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Linear Regression Model Specification (regression)
#> 
#> Computational engine: lm


location_models <-
   location_models %>%
   mutate(fit = map(info, ~ fit(.x$workflow[[1]], ames_train)))
location_models
#> # A workflow set/tibble: 4 × 5
#>   wflow_id        info             option    result     fit       
#>   <chr>           <list>           <list>    <list>     <list>    
#> 1 longitude_lm    <tibble [1 × 4]> <opts[0]> <list [0]> <workflow>
#> 2 latitude_lm     <tibble [1 × 4]> <opts[0]> <list [0]> <workflow>
#> 3 coords_lm       <tibble [1 × 4]> <opts[0]> <list [0]> <workflow>
#> 4 neighborhood_lm <tibble [1 × 4]> <opts[0]> <list [0]> <workflow>

location_models$fit[[1]]
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Formula
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Sale_Price ~ Longitude
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> 
#> Call:
#> stats::lm(formula = ..y ~ ., data = data)
#> 
#> Coefficients:
#> (Intercept)    Longitude  
#>    -191.309       -2.099

Code

final_lm_res <- last_fit(lm_wflow, ames_split)
final_lm_res
#> # Resampling results
#> # Manual resampling 
#> # A tibble: 1 × 6
#>   splits             id               .metrics .notes   .predictions .workflow 
#>   <list>             <chr>            <list>   <list>   <list>       <list>    
#> 1 <split [2342/588]> train/test split <tibble> <tibble> <tibble>     <workflow>
extract_workflow(final_lm_res)
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Variables
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Outcomes: Sale_Price
#> Predictors: c(Longitude, Latitude)
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> 
#> Call:
#> stats::lm(formula = ..y ~ ., data = data)
#> 
#> Coefficients:
#> (Intercept)    Longitude     Latitude  
#>    -314.938       -2.138        2.853

collect_metrics(final_lm_res)
#> # A tibble: 2 × 4
#>   .metric .estimator .estimate .config             
#>   <chr>   <chr>          <dbl> <chr>               
#> 1 rmse    standard       0.157 Preprocessor1_Model1
#> 2 rsq     standard       0.152 Preprocessor1_Model1
collect_predictions(final_lm_res) %>% slice(1:5)
#> # A tibble: 5 × 5
#>   .pred id                .row Sale_Price .config             
#>   <dbl> <chr>            <int>      <dbl> <chr>               
#> 1  5.23 train/test split     1       5.33 Preprocessor1_Model1
#> 2  5.29 train/test split     6       5.29 Preprocessor1_Model1
#> 3  5.28 train/test split    13       5.26 Preprocessor1_Model1
#> 4  5.27 train/test split    14       5.23 Preprocessor1_Model1
#> 5  5.26 train/test split    16       5.73 Preprocessor1_Model1

1.5 特征工程 `recipes`

1.5.1 虚拟变量

Code

simple_ames <- 
  recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type,
         data = ames_train) %>%
  step_log(Gr_Liv_Area, base = 10) %>% 
  step_dummy(all_nominal_predictors())# 因子或字符变量，名义nominal
# all_numeric_predictors()  all_numeric()  all_predictors()  all_outcomes()

# 跨模型循环使用
simple_ames
#> 
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#> 
#> ── Inputs
#> Number of variables by role
#> outcome:   1
#> predictor: 4
#> 
#> ── Operations
#> • Log transformation on: Gr_Liv_Area
#> • Dummy variables from: all_nominal_predictors()

Code

lm_wflow <- 
  lm_wflow %>% 
    #一次只能有一种预处理方法，需要在添加配方之前删除现有的预处理器
  remove_variables() %>% 
  add_recipe(simple_ames)

lm_wflow
#> ══ Workflow ════════════════════════════════════════════════════════════════════
#> Preprocessor: Recipe
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> 2 Recipe Steps
#> 
#> • step_log()
#> • step_dummy()
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Linear Regression Model Specification (regression)
#> 
#> Computational engine: lm

lm_fit <- fit(lm_wflow, ames_train)
predict(lm_fit, ames_test)
#> Warning in predict.lm(object = object$fit, newdata = new_data, type =
#> "response", : prediction from rank-deficient fit; consider predict(.,
#> rankdeficient="NA")
#> # A tibble: 588 × 1
#>    .pred
#>    <dbl>
#>  1  5.24
#>  2  5.27
#>  3  5.24
#>  4  5.20
#>  5  5.66
#>  6  5.12
#>  7  5.08
#>  8  5.22
#>  9  5.00
#> 10  5.37
#> # ℹ 578 more rows

# 提取模型信息
lm_fit %>% 
  extract_recipe(estimated = TRUE)
#> 
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#> 
#> ── Inputs
#> Number of variables by role
#> outcome:   1
#> predictor: 4
#> 
#> ── Training information
#> Training data contained 2342 data points and no incomplete rows.
#> 
#> ── Operations
#> • Log transformation on: Gr_Liv_Area | Trained
#> • Dummy variables from: Neighborhood and Bldg_Type | Trained

lm_fit %>% 
  extract_fit_parsnip() %>% 
  tidy()
#> # A tibble: 35 × 5
#>    term                            estimate std.error statistic   p.value
#>    <chr>                              <dbl>     <dbl>     <dbl>     <dbl>
#>  1 (Intercept)                     -1.10     0.234       -4.70  2.72e-  6
#>  2 Gr_Liv_Area                      0.646    0.0143      45.0   1.88e-318
#>  3 Year_Built                       0.00217  0.000118    18.5   3.21e- 71
#>  4 Neighborhood_College_Creek       0.00897  0.00841      1.07  2.87e-  1
#>  5 Neighborhood_Old_Town           -0.0178   0.00846     -2.11  3.51e-  2
#>  6 Neighborhood_Edwards            -0.0413   0.00769     -5.37  8.59e-  8
#>  7 Neighborhood_Somerset            0.0433   0.00992      4.37  1.30e-  5
#>  8 Neighborhood_Northridge_Heights  0.129    0.0102      12.7   6.96e- 36
#>  9 Neighborhood_Gilbert            -0.0437   0.00939     -4.65  3.43e-  6
#> 10 Neighborhood_Sawyer             -0.00785  0.00841     -0.933 3.51e-  1
#> # ℹ 25 more rows

Code


simple_ames <- 
  recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type,
         data = ames_train) %>%
  step_log(Gr_Liv_Area, base = 10) %>% 
  step_other(Neighborhood, threshold = 0.01) %>% 
  step_dummy(all_nominal_predictors())

Code

ggplot(ames_train, aes(x = Gr_Liv_Area, y = 10^Sale_Price)) + 
  geom_point(alpha = .2) + 
  facet_wrap(~ Bldg_Type) + 
  geom_smooth(method = lm, formula = y ~ x, se = FALSE, color = "blue") + 
  scale_x_log10() + 
  scale_y_log10() + 
  labs(x = "Gross Living Area", y = "Sale Price (USD)")

1.5.2 交互项

step_interact(~ interaction terms) , +分隔不同交互效应

Code

simple_ames <- 
  recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type,
         data = ames_train) %>%
  step_log(Gr_Liv_Area, base = 10) %>% 
  step_other(Neighborhood, threshold = 0.01) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  # Gr_Liv_Area is on the log scale from a previous step
  step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") )

simple_ames 
#> 
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#> 
#> ── Inputs
#> Number of variables by role
#> outcome:   1
#> predictor: 4
#> 
#> ── Operations
#> • Log transformation on: Gr_Liv_Area
#> • Collapsing factor levels for: Neighborhood
#> • Dummy variables from: all_nominal_predictors()
#> • Interactions with: Gr_Liv_Area:starts_with("Bldg_Type_")

1.5.3 样条函数

添加非线性特征

Code

library(patchwork)
library(splines)
plot_smoother <- function(deg_free) {
  ggplot(ames_train, aes(x = Latitude, y = 10^Sale_Price)) + 
    geom_point(alpha = .2) + 
    scale_y_log10() +
    geom_smooth(
      method = lm,
      formula = y ~ ns(x, df = deg_free),# natural splines.
      color = "lightblue",
      se = FALSE
    ) +
    labs(title = paste(deg_free, "Spline Terms"),
         y = "Sale Price (USD)")
}

( plot_smoother(1) + plot_smoother(5) ) / ( plot_smoother(20) + plot_smoother(100) )

Code

recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + Latitude,
         data = ames_train) %>%
  step_ns(Latitude, deg_free = 20)
#> 
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#> 
#> ── Inputs
#> Number of variables by role
#> outcome:   1
#> predictor: 5
#> 
#> ── Operations
#> • Natural splines on: Latitude

1.5.4 特征提取

PCA,

Code

  # Use a regular expression to capture house size predictors: 
recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + Latitude+Total_Bsmt_SF+First_Flr_SF+Gr_Liv_Area,
         data = ames_train) %>%
    step_pca(matches("(SF$)|(Gr_Liv)"))
#> 
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#> 
#> ── Inputs
#> Number of variables by role
#> outcome:   1
#> predictor: 7
#> 
#> ── Operations
#> • PCA extraction with: matches("(SF$)|(Gr_Liv)")

1.5.5 行采样

Downsampling，Upsampling，Hybrid

step_filter() step_sample() step_slice() step_arrange() skip TRUE

Code

library(themis)
step_downsample(outcome_column_name)

1.5.6 一般转换

step_mutate() 比，Bedroom_AbvGr / Full_Bath

1.5.7 `tidy()`

Code

ames_rec <- 
  recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 
           Latitude + Longitude, data = ames_train) %>%
  step_log(Gr_Liv_Area, base = 10) %>% 
  step_other(Neighborhood, threshold = 0.01) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") ) %>% 
  step_ns(Latitude, Longitude, deg_free = 20)
tidy(ames_rec)
#> # A tibble: 5 × 6
#>   number operation type     trained skip  id            
#>    <int> <chr>     <chr>    <lgl>   <lgl> <chr>         
#> 1      1 step      log      FALSE   FALSE log_qeD8W     
#> 2      2 step      other    FALSE   FALSE other_Hirvj   
#> 3      3 step      dummy    FALSE   FALSE dummy_HJDrx   
#> 4      4 step      interact FALSE   FALSE interact_B7kcu
#> 5      5 step      ns       FALSE   FALSE ns_yBiqt

Code

ames_rec <- 
  recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 
           Latitude + Longitude, data = ames_train) %>%
  step_log(Gr_Liv_Area, base = 10) %>% 
  step_other(Neighborhood, threshold = 0.01, id = "my_id") %>% 
  step_dummy(all_nominal_predictors()) %>% 
  step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") ) %>% 
  step_ns(Latitude, Longitude, deg_free = 20)

lm_wflow <- 
  workflow() %>% 
  add_model(lm_model) %>% 
  add_recipe(ames_rec)

lm_fit <- fit(lm_wflow, ames_train)

estimated_recipe <- 
  lm_fit %>% 
  extract_recipe(estimated = TRUE)

tidy(estimated_recipe, id = "my_id")
#> # A tibble: 21 × 3
#>    terms        retained           id   
#>    <chr>        <chr>              <chr>
#>  1 Neighborhood North_Ames         my_id
#>  2 Neighborhood College_Creek      my_id
#>  3 Neighborhood Old_Town           my_id
#>  4 Neighborhood Edwards            my_id
#>  5 Neighborhood Somerset           my_id
#>  6 Neighborhood Northridge_Heights my_id
#>  7 Neighborhood Gilbert            my_id
#>  8 Neighborhood Sawyer             my_id
#>  9 Neighborhood Northwest_Ames     my_id
#> 10 Neighborhood Sawyer_West        my_id
#> # ℹ 11 more rows
tidy(estimated_recipe, number = 2)
#> # A tibble: 21 × 3
#>    terms        retained           id   
#>    <chr>        <chr>              <chr>
#>  1 Neighborhood North_Ames         my_id
#>  2 Neighborhood College_Creek      my_id
#>  3 Neighborhood Old_Town           my_id
#>  4 Neighborhood Edwards            my_id
#>  5 Neighborhood Somerset           my_id
#>  6 Neighborhood Northridge_Heights my_id
#>  7 Neighborhood Gilbert            my_id
#>  8 Neighborhood Sawyer             my_id
#>  9 Neighborhood Northwest_Ames     my_id
#> 10 Neighborhood Sawyer_West        my_id
#> # ℹ 11 more rows

1.5.8 列角色

Code

ames_rec %>% update_role(Year_Built, new_role = "Street")
#> 
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#> 
#> ── Inputs
#> Number of variables by role
#> outcome:   1
#> predictor: 5
#> Street:    1
#> 
#> ── Operations
#> • Log transformation on: Gr_Liv_Area
#> • Collapsing factor levels for: Neighborhood
#> • Dummy variables from: all_nominal_predictors()
#> • Interactions with: Gr_Liv_Area:starts_with("Bldg_Type_")
#> • Natural splines on: Latitude and Longitude

1.5.9 自然语言处理

textrecipes

1.6 模型评估 `yardstick`

1.6.1 回归指标

Code

ames_test_res <- predict(lm_fit, new_data = ames_test %>% select(-Sale_Price))
ames_test_res
#> # A tibble: 588 × 1
#>    .pred
#>    <dbl>
#>  1  5.24
#>  2  5.27
#>  3  5.24
#>  4  5.21
#>  5  5.65
#>  6  5.13
#>  7  5.04
#>  8  5.22
#>  9  5.00
#> 10  5.32
#> # ℹ 578 more rows

ames_test_res <- bind_cols(ames_test_res, ames_test %>% select(Sale_Price))
ames_test_res
#> # A tibble: 588 × 2
#>    .pred Sale_Price
#>    <dbl>      <dbl>
#>  1  5.24       5.33
#>  2  5.27       5.29
#>  3  5.24       5.26
#>  4  5.21       5.23
#>  5  5.65       5.73
#>  6  5.13       5.17
#>  7  5.04       5.06
#>  8  5.22       5.26
#>  9  5.00       5.02
#> 10  5.32       5.24
#> # ℹ 578 more rows

Code

ggplot(ames_test_res, aes(x = Sale_Price, y = .pred)) + 
  # Create a diagonal line:
  geom_abline(lty = 2) + 
  geom_point(alpha = 0.5) + 
  labs(y = "Predicted Sale Price (log10)", x = "Sale Price (log10)") +
  # Scale and size the x- and y-axis uniformly:
  coord_obs_pred()

1.6.1.1 均方根误差RMSE

Code

rmse(ames_test_res, truth = Sale_Price, estimate = .pred)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 rmse    standard      0.0772

1.6.1.2 决定系数R²，平均绝对误差MAE

Code

ames_metrics <- metric_set(rmse, rsq, mae)
ames_metrics(ames_test_res, truth = Sale_Price, estimate = .pred)
#> # A tibble: 3 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 rmse    standard      0.0772
#> 2 rsq     standard      0.795 
#> 3 mae     standard      0.0550

1.6.2 二分类指标

Code

data(two_class_example)
tibble(two_class_example)
#> # A tibble: 500 × 4
#>    truth   Class1   Class2 predicted
#>    <fct>    <dbl>    <dbl> <fct>    
#>  1 Class2 0.00359 0.996    Class2   
#>  2 Class1 0.679   0.321    Class1   
#>  3 Class2 0.111   0.889    Class2   
#>  4 Class1 0.735   0.265    Class1   
#>  5 Class2 0.0162  0.984    Class2   
#>  6 Class1 0.999   0.000725 Class1   
#>  7 Class1 0.999   0.000799 Class1   
#>  8 Class1 0.812   0.188    Class1   
#>  9 Class2 0.457   0.543    Class2   
#> 10 Class2 0.0976  0.902    Class2   
#> # ℹ 490 more rows

Code

# 混淆矩阵
# A confusion matrix: 
conf_mat(two_class_example, truth = truth, estimate = predicted)
#>           Truth
#> Prediction Class1 Class2
#>     Class1    227     50
#>     Class2     31    192

Code

# Accuracy:
accuracy(two_class_example, truth, predicted)
#> # A tibble: 1 × 3
#>   .metric  .estimator .estimate
#>   <chr>    <chr>          <dbl>
#> 1 accuracy binary         0.838

Code

# Matthews correlation coefficient:
mcc(two_class_example, truth, predicted)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 mcc     binary         0.677

Code

# F1 metric:
f_meas(two_class_example, truth, predicted)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 f_meas  binary         0.849

Code

# Combining these three classification metrics together
classification_metrics <- metric_set(accuracy, mcc, f_meas)
classification_metrics(two_class_example, truth = truth, estimate = predicted)
#> # A tibble: 3 × 3
#>   .metric  .estimator .estimate
#>   <chr>    <chr>          <dbl>
#> 1 accuracy binary         0.838
#> 2 mcc      binary         0.677
#> 3 f_meas   binary         0.849

感兴趣事件水平

第二级逻辑将结果编码为0/1（在这种情况下，第二个值是事件）

Code

f_meas(two_class_example, truth, predicted, event_level = "second")
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 f_meas  binary         0.826

1.6.2.1 ROC，AUC

不使用预测类列,对于两类问题，感兴趣事件的概率列将传递到函数中

Code

two_class_curve <- roc_curve(two_class_example, truth, Class1)
two_class_curve
#> # A tibble: 502 × 3
#>    .threshold specificity sensitivity
#>         <dbl>       <dbl>       <dbl>
#>  1 -Inf           0                 1
#>  2    1.79e-7     0                 1
#>  3    4.50e-6     0.00413           1
#>  4    5.81e-6     0.00826           1
#>  5    5.92e-6     0.0124            1
#>  6    1.22e-5     0.0165            1
#>  7    1.40e-5     0.0207            1
#>  8    1.43e-5     0.0248            1
#>  9    2.38e-5     0.0289            1
#> 10    3.30e-5     0.0331            1
#> # ℹ 492 more rows
roc_auc(two_class_example, truth, Class1)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 roc_auc binary         0.939

Code

autoplot(two_class_curve)+
    annotate("text",x=0.5,y=0.25,label="AUC=0.939")

1.6.3 多分类指标

Code

data(hpc_cv)
tibble(hpc_cv)
#> # A tibble: 3,467 × 7
#>    obs   pred     VF      F       M          L Resample
#>    <fct> <fct> <dbl>  <dbl>   <dbl>      <dbl> <chr>   
#>  1 VF    VF    0.914 0.0779 0.00848 0.0000199  Fold01  
#>  2 VF    VF    0.938 0.0571 0.00482 0.0000101  Fold01  
#>  3 VF    VF    0.947 0.0495 0.00316 0.00000500 Fold01  
#>  4 VF    VF    0.929 0.0653 0.00579 0.0000156  Fold01  
#>  5 VF    VF    0.942 0.0543 0.00381 0.00000729 Fold01  
#>  6 VF    VF    0.951 0.0462 0.00272 0.00000384 Fold01  
#>  7 VF    VF    0.914 0.0782 0.00767 0.0000354  Fold01  
#>  8 VF    VF    0.918 0.0744 0.00726 0.0000157  Fold01  
#>  9 VF    VF    0.843 0.128  0.0296  0.000192   Fold01  
#> 10 VF    VF    0.920 0.0728 0.00703 0.0000147  Fold01  
#> # ℹ 3,457 more rows

Code

accuracy(hpc_cv, obs, pred)
#> # A tibble: 1 × 3
#>   .metric  .estimator .estimate
#>   <chr>    <chr>          <dbl>
#> 1 accuracy multiclass     0.709
mcc(hpc_cv, obs, pred)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 mcc     multiclass     0.515

二分类可拓展到多分类

Code

sensitivity(hpc_cv, obs, pred, estimator = "macro")
#> # A tibble: 1 × 3
#>   .metric     .estimator .estimate
#>   <chr>       <chr>          <dbl>
#> 1 sensitivity macro          0.560
sensitivity(hpc_cv, obs, pred, estimator = "macro_weighted")
#> # A tibble: 1 × 3
#>   .metric     .estimator     .estimate
#>   <chr>       <chr>              <dbl>
#> 1 sensitivity macro_weighted     0.709
sensitivity(hpc_cv, obs, pred, estimator = "micro")
#> # A tibble: 1 × 3
#>   .metric     .estimator .estimate
#>   <chr>       <chr>          <dbl>
#> 1 sensitivity micro          0.709

多分类

Code

roc_auc(hpc_cv, obs, VF, F, M, L)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 roc_auc hand_till      0.829
roc_auc(hpc_cv, obs, VF, F, M, L, estimator = "macro_weighted")
#> # A tibble: 1 × 3
#>   .metric .estimator     .estimate
#>   <chr>   <chr>              <dbl>
#> 1 roc_auc macro_weighted     0.868

Code

hpc_cv %>% 
  group_by(Resample) %>% 
  accuracy(obs, pred)
#> # A tibble: 10 × 4
#>    Resample .metric  .estimator .estimate
#>    <chr>    <chr>    <chr>          <dbl>
#>  1 Fold01   accuracy multiclass     0.726
#>  2 Fold02   accuracy multiclass     0.712
#>  3 Fold03   accuracy multiclass     0.758
#>  4 Fold04   accuracy multiclass     0.712
#>  5 Fold05   accuracy multiclass     0.712
#>  6 Fold06   accuracy multiclass     0.697
#>  7 Fold07   accuracy multiclass     0.675
#>  8 Fold08   accuracy multiclass     0.721
#>  9 Fold09   accuracy multiclass     0.673
#> 10 Fold10   accuracy multiclass     0.699

# Four 1-vs-all ROC curves for each fold
hpc_cv %>% 
  group_by(Resample) %>% 
  roc_curve(obs, VF, F, M, L) %>% 
  autoplot()

1.7 重采样 `rsample`

Code

ames_rec
#> 
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#> 
#> ── Inputs
#> Number of variables by role
#> outcome:   1
#> predictor: 6
#> 
#> ── Operations
#> • Log transformation on: Gr_Liv_Area
#> • Collapsing factor levels for: Neighborhood
#> • Dummy variables from: all_nominal_predictors()
#> • Interactions with: Gr_Liv_Area:starts_with("Bldg_Type_")
#> • Natural splines on: Latitude and Longitude

Code

rf_model <- 
  rand_forest(trees = 1000) %>% 
  set_engine("ranger") %>% 
  set_mode("regression")

rf_wflow <- 
  workflow() %>% 
  add_formula(
    Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 
      Latitude + Longitude) %>% 
  add_model(rf_model) 

rf_fit <- rf_wflow %>% fit(data = ames_train)

Code

estimate_perf <- function(model, dat) {
  # Capture the names of the `model` and `dat` objects
  cl <- match.call()
  obj_name <- as.character(cl$model)
  data_name <- as.character(cl$dat)
  data_name <- gsub("ames_", "", data_name)
  
  # Estimate these metrics:
  reg_metrics <- metric_set(rmse, rsq)
  
  model %>%
    predict(dat) %>%
    bind_cols(dat %>% select(Sale_Price)) %>%
    reg_metrics(Sale_Price, .pred) %>%
    select(-.estimator) %>%
    mutate(object = obj_name, data = data_name)
}

estimate_perf(rf_fit, ames_train)
#> # A tibble: 2 × 4
#>   .metric .estimate object data 
#>   <chr>       <dbl> <chr>  <chr>
#> 1 rmse       0.0364 rf_fit train
#> 2 rsq        0.962  rf_fit train
estimate_perf(lm_fit, ames_train)
#> # A tibble: 2 × 4
#>   .metric .estimate object data 
#>   <chr>       <dbl> <chr>  <chr>
#> 1 rmse       0.0749 lm_fit train
#> 2 rsq        0.824  lm_fit train

Code

estimate_perf(rf_fit, ames_test)
#> # A tibble: 2 × 4
#>   .metric .estimate object data 
#>   <chr>       <dbl> <chr>  <chr>
#> 1 rmse       0.0677 rf_fit test 
#> 2 rsq        0.843  rf_fit test

1.7.1 交叉验证

V-fold cross-validation，数据被随机划分为样本量大致相等的V组（称为折叠），例如10重交叉验证

Code

set.seed(1001)
# 10-fold cross-validation
ames_folds <- vfold_cv(ames_train, v = 10)

# 分析集/评估集
ames_folds
#> #  10-fold cross-validation 
#> # A tibble: 10 × 2
#>    splits             id    
#>    <list>             <chr> 
#>  1 <split [2107/235]> Fold01
#>  2 <split [2107/235]> Fold02
#>  3 <split [2108/234]> Fold03
#>  4 <split [2108/234]> Fold04
#>  5 <split [2108/234]> Fold05
#>  6 <split [2108/234]> Fold06
#>  7 <split [2108/234]> Fold07
#>  8 <split [2108/234]> Fold08
#>  9 <split [2108/234]> Fold09
#> 10 <split [2108/234]> Fold10


# 检索分区数据
ames_folds$splits[[1]] %>% analysis() |> dim()
#> [1] 2107   74

model_spec %>% fit_resamples(formula, resamples, ...)

model_spec %>% fit_resamples(recipe, resamples, ...)

workflow %>% fit_resamples( resamples, ...)

Code

keep_pred <- control_resamples(save_pred = TRUE, save_workflow = TRUE)

set.seed(1003)
rf_res <- 
  rf_wflow %>% 
  fit_resamples(resamples = ames_folds, control = keep_pred)
rf_res
#> # Resampling results
#> # 10-fold cross-validation 
#> # A tibble: 10 × 5
#>    splits             id     .metrics         .notes           .predictions
#>    <list>             <chr>  <list>           <list>           <list>      
#>  1 <split [2107/235]> Fold01 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
#>  2 <split [2107/235]> Fold02 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
#>  3 <split [2108/234]> Fold03 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
#>  4 <split [2108/234]> Fold04 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
#>  5 <split [2108/234]> Fold05 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
#>  6 <split [2108/234]> Fold06 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
#>  7 <split [2108/234]> Fold07 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
#>  8 <split [2108/234]> Fold08 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
#>  9 <split [2108/234]> Fold09 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
#> 10 <split [2108/234]> Fold10 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>
collect_metrics(rf_res)
#> # A tibble: 2 × 6
#>   .metric .estimator   mean     n std_err .config             
#>   <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
#> 1 rmse    standard   0.0726    10 0.00258 Preprocessor1_Model1
#> 2 rsq     standard   0.834     10 0.0116  Preprocessor1_Model1
collect_metrics(rf_res,summarize = F)
#> # A tibble: 20 × 5
#>    id     .metric .estimator .estimate .config             
#>    <chr>  <chr>   <chr>          <dbl> <chr>               
#>  1 Fold01 rmse    standard      0.0647 Preprocessor1_Model1
#>  2 Fold01 rsq     standard      0.861  Preprocessor1_Model1
#>  3 Fold02 rmse    standard      0.0662 Preprocessor1_Model1
#>  4 Fold02 rsq     standard      0.860  Preprocessor1_Model1
#>  5 Fold03 rmse    standard      0.0793 Preprocessor1_Model1
#>  6 Fold03 rsq     standard      0.808  Preprocessor1_Model1
#>  7 Fold04 rmse    standard      0.0861 Preprocessor1_Model1
#>  8 Fold04 rsq     standard      0.761  Preprocessor1_Model1
#>  9 Fold05 rmse    standard      0.0738 Preprocessor1_Model1
#> 10 Fold05 rsq     standard      0.857  Preprocessor1_Model1
#> 11 Fold06 rmse    standard      0.0678 Preprocessor1_Model1
#> 12 Fold06 rsq     standard      0.860  Preprocessor1_Model1
#> 13 Fold07 rmse    standard      0.0644 Preprocessor1_Model1
#> 14 Fold07 rsq     standard      0.873  Preprocessor1_Model1
#> 15 Fold08 rmse    standard      0.0673 Preprocessor1_Model1
#> 16 Fold08 rsq     standard      0.829  Preprocessor1_Model1
#> 17 Fold09 rmse    standard      0.0719 Preprocessor1_Model1
#> 18 Fold09 rsq     standard      0.840  Preprocessor1_Model1
#> 19 Fold10 rmse    standard      0.0848 Preprocessor1_Model1
#> 20 Fold10 rsq     standard      0.790  Preprocessor1_Model1
assess_res <- collect_predictions(rf_res)
assess_res
#> # A tibble: 2,342 × 5
#>    .pred id      .row Sale_Price .config             
#>    <dbl> <chr>  <int>      <dbl> <chr>               
#>  1  5.17 Fold01    10       5.05 Preprocessor1_Model1
#>  2  5.04 Fold01    27       5.06 Preprocessor1_Model1
#>  3  5.11 Fold01    47       5.10 Preprocessor1_Model1
#>  4  5.14 Fold01    52       5.11 Preprocessor1_Model1
#>  5  5.09 Fold01    59       5.05 Preprocessor1_Model1
#>  6  4.88 Fold01    63       4.77 Preprocessor1_Model1
#>  7  5.01 Fold01    65       5.10 Preprocessor1_Model1
#>  8  5.16 Fold01    66       5    Preprocessor1_Model1
#>  9  4.96 Fold01    67       5.10 Preprocessor1_Model1
#> 10  4.84 Fold01    68       4.91 Preprocessor1_Model1
#> # ℹ 2,332 more rows

Code

assess_res %>% 
  ggplot(aes(x = Sale_Price, y = .pred)) + 
  geom_point(alpha = .15) +
  geom_abline(color = "red") + 
  coord_obs_pred() + 
  ylab("Predicted")

Code

# 找到残差最大的2个
over_predicted <- 
  assess_res %>% 
  mutate(residual = Sale_Price - .pred) %>% 
  arrange(desc(abs(residual))) %>% 
  slice(1:2)
over_predicted
#> # A tibble: 2 × 6
#>   .pred id      .row Sale_Price .config              residual
#>   <dbl> <chr>  <int>      <dbl> <chr>                   <dbl>
#> 1  4.97 Fold10    33       4.11 Preprocessor1_Model1   -0.858
#> 2  4.93 Fold04   323       4.12 Preprocessor1_Model1   -0.814

ames_train %>% 
  slice(over_predicted$.row) %>% 
  select(Gr_Liv_Area, Neighborhood, Year_Built, Bedroom_AbvGr, Full_Bath)
#> # A tibble: 2 × 5
#>   Gr_Liv_Area Neighborhood           Year_Built Bedroom_AbvGr Full_Bath
#>         <int> <fct>                       <int>         <int>     <int>
#> 1         832 Old_Town                     1923             2         1
#> 2         733 Iowa_DOT_and_Rail_Road       1952             2         1

重复交叉验证

Code

# 10-fold cross-validation repeated 5 times 
vfold_cv(ames_train, v = 10, repeats = 5)
#> #  10-fold cross-validation repeated 5 times 
#> # A tibble: 50 × 3
#>    splits             id      id2   
#>    <list>             <chr>   <chr> 
#>  1 <split [2107/235]> Repeat1 Fold01
#>  2 <split [2107/235]> Repeat1 Fold02
#>  3 <split [2108/234]> Repeat1 Fold03
#>  4 <split [2108/234]> Repeat1 Fold04
#>  5 <split [2108/234]> Repeat1 Fold05
#>  6 <split [2108/234]> Repeat1 Fold06
#>  7 <split [2108/234]> Repeat1 Fold07
#>  8 <split [2108/234]> Repeat1 Fold08
#>  9 <split [2108/234]> Repeat1 Fold09
#> 10 <split [2108/234]> Repeat1 Fold10
#> # ℹ 40 more rows

留一交叉验证

leave-one-out (LOO) cross-validation

loo_cv()

蒙特卡罗交叉验证

Monte Carlo cross-validation，MCCV，将固定比例的数据分配给分析集和评估集。该比例的数据每次都是随机选择的，导致评估集不相互排斥

Code

mc_cv(ames_train, prop = 9/10, times = 20)
#> # Monte Carlo cross-validation (0.9/0.1) with 20 resamples  
#> # A tibble: 20 × 2
#>    splits             id        
#>    <list>             <chr>     
#>  1 <split [2107/235]> Resample01
#>  2 <split [2107/235]> Resample02
#>  3 <split [2107/235]> Resample03
#>  4 <split [2107/235]> Resample04
#>  5 <split [2107/235]> Resample05
#>  6 <split [2107/235]> Resample06
#>  7 <split [2107/235]> Resample07
#>  8 <split [2107/235]> Resample08
#>  9 <split [2107/235]> Resample09
#> 10 <split [2107/235]> Resample10
#> 11 <split [2107/235]> Resample11
#> 12 <split [2107/235]> Resample12
#> 13 <split [2107/235]> Resample13
#> 14 <split [2107/235]> Resample14
#> 15 <split [2107/235]> Resample15
#> 16 <split [2107/235]> Resample16
#> 17 <split [2107/235]> Resample17
#> 18 <split [2107/235]> Resample18
#> 19 <split [2107/235]> Resample19
#> 20 <split [2107/235]> Resample20

1.7.2 验证集

Code

set.seed(101)

# To put 60% into training, 20% in validation, and remaining 20% in testing:
ames_validation_split <- initial_validation_split(ames, prop = c(0.6, 0.2))
ames_validation_split
#> <Training/Validation/Testing/Total>
#> <1758/586/586/2930>


# Object used for resampling: 
val_set <- validation_set(ames_validation_split)
val_set
#> # A tibble: 1 × 2
#>   splits             id        
#>   <list>             <chr>     
#> 1 <split [1758/586]> validation

Code

val_res <- rf_wflow %>% fit_resamples(resamples = val_set)
val_res
#> # Resampling results
#> # Validation Set (0.75/0.25) 
#> # A tibble: 1 × 4
#>   splits             id         .metrics         .notes          
#>   <list>             <chr>      <list>           <list>          
#> 1 <split [1758/586]> validation <tibble [2 × 4]> <tibble [0 × 3]>
collect_metrics(val_res)
#> # A tibble: 2 × 6
#>   .metric .estimator   mean     n std_err .config             
#>   <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
#> 1 rmse    standard   0.0809     1      NA Preprocessor1_Model1
#> 2 rsq     standard   0.818      1      NA Preprocessor1_Model1

1.7.3 自助法

Bootstrap resampling

replacement

out-of-bag sample

Code

bootstraps(ames_train, times = 5)
#> # Bootstrap sampling 
#> # A tibble: 5 × 2
#>   splits             id        
#>   <list>             <chr>     
#> 1 <split [2342/882]> Bootstrap1
#> 2 <split [2342/849]> Bootstrap2
#> 3 <split [2342/870]> Bootstrap3
#> 4 <split [2342/875]> Bootstrap4
#> 5 <split [2342/849]> Bootstrap5

1.7.4 Rolling forecast origin resampling

滚动预测原点重采样

时间序列数据

Code

time_slices <- 
  tibble(x = 1:365) %>% 
  rolling_origin(initial = 6 * 30, assess = 30, skip = 29, cumulative = FALSE)

data_range <- function(x) {
  summarize(x, first = min(x), last = max(x))
}

map_dfr(time_slices$splits, ~   analysis(.x) %>% data_range())
#> # A tibble: 6 × 2
#>   first  last
#>   <int> <int>
#> 1     1   180
#> 2    31   210
#> 3    61   240
#> 4    91   270
#> 5   121   300
#> 6   151   330

map_dfr(time_slices$splits, ~ assessment(.x) %>% data_range())
#> # A tibble: 6 × 2
#>   first  last
#>   <int> <int>
#> 1   181   210
#> 2   211   240
#> 3   241   270
#> 4   271   300
#> 5   301   330
#> 6   331   360

1.7.5 并行计算

Code

parallel::detectCores(logical = FALSE)
#> [1] 4
parallel::detectCores(logical = TRUE)
#> [1] 8

1.7.6 保存重采样对象

Code

ames_rec <- 
  recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 
           Latitude + Longitude, data = ames_train) %>%
  step_other(Neighborhood, threshold = 0.01) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") ) %>% 
  step_ns(Latitude, Longitude, deg_free = 20)

lm_wflow <-  
  workflow() %>% 
  add_recipe(ames_rec) %>% 
  add_model(linear_reg() %>% set_engine("lm")) 

lm_fit <- lm_wflow %>% fit(data = ames_train)

# Select the recipe: 
extract_recipe(lm_fit, estimated = TRUE)
#> 
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#> 
#> ── Inputs
#> Number of variables by role
#> outcome:   1
#> predictor: 6
#> 
#> ── Training information
#> Training data contained 2342 data points and no incomplete rows.
#> 
#> ── Operations
#> • Collapsing factor levels for: Neighborhood | Trained
#> • Dummy variables from: Neighborhood and Bldg_Type | Trained
#> • Interactions with: Gr_Liv_Area:(Bldg_Type_TwoFmCon + Bldg_Type_Duplex +
#>   Bldg_Type_Twnhs + Bldg_Type_TwnhsE) | Trained
#> • Natural splines on: Latitude and Longitude | Trained

Code

get_model <- function(x) {
  extract_fit_parsnip(x) %>% tidy()
}

get_model(lm_fit)
#> # A tibble: 72 × 5
#>    term                             estimate  std.error statistic   p.value
#>    <chr>                               <dbl>      <dbl>     <dbl>     <dbl>
#>  1 (Intercept)                      0.895    0.303          2.95  3.17e-  3
#>  2 Gr_Liv_Area                      0.000177 0.00000444    39.8   4.16e-263
#>  3 Year_Built                       0.00204  0.000139      14.7   1.51e- 46
#>  4 Neighborhood_College_Creek      -0.0283   0.0334        -0.849 3.96e-  1
#>  5 Neighborhood_Old_Town           -0.0489   0.0125        -3.91  9.48e-  5
#>  6 Neighborhood_Edwards            -0.0823   0.0273        -3.01  2.62e-  3
#>  7 Neighborhood_Somerset            0.0735   0.0192         3.83  1.33e-  4
#>  8 Neighborhood_Northridge_Heights  0.148    0.0278         5.33  1.07e-  7
#>  9 Neighborhood_Gilbert             0.0306   0.0218         1.41  1.60e-  1
#> 10 Neighborhood_Sawyer             -0.111    0.0257        -4.33  1.56e-  5
#> # ℹ 62 more rows

Code

ctrl <- control_resamples(extract = get_model)

lm_res <- lm_wflow %>%  fit_resamples(resamples = ames_folds, control = ctrl)
lm_res
#> # Resampling results
#> # 10-fold cross-validation 
#> # A tibble: 10 × 5
#>    splits             id     .metrics         .notes           .extracts       
#>    <list>             <chr>  <list>           <list>           <list>          
#>  1 <split [2107/235]> Fold01 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#>  2 <split [2107/235]> Fold02 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#>  3 <split [2108/234]> Fold03 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#>  4 <split [2108/234]> Fold04 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#>  5 <split [2108/234]> Fold05 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#>  6 <split [2108/234]> Fold06 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#>  7 <split [2108/234]> Fold07 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#>  8 <split [2108/234]> Fold08 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#>  9 <split [2108/234]> Fold09 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#> 10 <split [2108/234]> Fold10 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>

Code

lm_res$.extracts[[1]]
#> # A tibble: 1 × 2
#>   .extracts         .config             
#>   <list>            <chr>               
#> 1 <tibble [72 × 5]> Preprocessor1_Model1

lm_res$.extracts[[1]][[1]]
#> [[1]]
#> # A tibble: 72 × 5
#>    term                             estimate  std.error statistic   p.value
#>    <chr>                               <dbl>      <dbl>     <dbl>     <dbl>
#>  1 (Intercept)                      0.950    0.324          2.93  3.42e-  3
#>  2 Gr_Liv_Area                      0.000175 0.00000481    36.3   4.09e-223
#>  3 Year_Built                       0.00201  0.000149      13.5   1.08e- 39
#>  4 Neighborhood_College_Creek      -0.0273   0.0359        -0.762 4.46e-  1
#>  5 Neighborhood_Old_Town           -0.0543   0.0134        -4.04  5.51e-  5
#>  6 Neighborhood_Edwards            -0.0825   0.0293        -2.81  4.95e-  3
#>  7 Neighborhood_Somerset            0.0706   0.0207         3.40  6.76e-  4
#>  8 Neighborhood_Northridge_Heights  0.139    0.0298         4.65  3.55e-  6
#>  9 Neighborhood_Gilbert             0.0197   0.0235         0.835 4.04e-  1
#> 10 Neighborhood_Sawyer             -0.117    0.0273        -4.30  1.77e-  5
#> # ℹ 62 more rows

Code

all_coef <- map_dfr(lm_res$.extracts, ~ .x[[1]][[1]])
# Show the replicates for a single predictor:
filter(all_coef, term == "Year_Built")
#> # A tibble: 10 × 5
#>    term       estimate std.error statistic  p.value
#>    <chr>         <dbl>     <dbl>     <dbl>    <dbl>
#>  1 Year_Built  0.00201  0.000149      13.5 1.08e-39
#>  2 Year_Built  0.00203  0.000152      13.3 6.61e-39
#>  3 Year_Built  0.00189  0.000147      12.9 1.85e-36
#>  4 Year_Built  0.00210  0.000145      14.5 1.98e-45
#>  5 Year_Built  0.00211  0.000148      14.3 2.19e-44
#>  6 Year_Built  0.00204  0.000147      13.9 4.52e-42
#>  7 Year_Built  0.00215  0.000150      14.3 1.97e-44
#>  8 Year_Built  0.00213  0.000144      14.7 9.43e-47
#>  9 Year_Built  0.00208  0.000151      13.8 1.77e-41
#> 10 Year_Built  0.00212  0.000148      14.3 4.48e-44

1 机器学习

1.1 探索性数据分析

1.2 拆分训练集、验证集和测试集 rsample

1.2.1 简单抽样

1.2.2 分层抽样

1.2.3 验证集

1.3 模型选择 parsnip

1.4 模型工作流 workflows

1.4.1 线性模型

1.4.2 预测

1.4.3 混合效应模型 multilevelmod

1.4.4 生存模型 censored

1.4.5 工作流集 workflowsets

1.5 特征工程 recipes

1.5.1 虚拟变量

1.5.2 交互项

1.5.3 样条函数

1.5.4 特征提取

1.5.5 行采样

1.5.6 一般转换

1.5.7 tidy()

1.5.8 列角色

1.5.9 自然语言处理

1.6 模型评估 yardstick

1.6.1 回归指标

1.6.1.1 均方根误差RMSE

1.6.1.2 决定系数R2，平均绝对误差MAE

1.6.2 二分类指标

1.6.2.1 ROC，AUC

1.6.3 多分类指标

1.7 重采样 rsample

1.7.1 交叉验证

1.7.2 验证集

1.7.3 自助法

1.7.4 Rolling forecast origin resampling

1.7.5 并行计算

1.7.6 保存重采样对象

1.2 拆分训练集、验证集和测试集 `rsample`

1.3 模型选择 `parsnip`

1.4 模型工作流 `workflows`

1.4.3 混合效应模型 `multilevelmod`

1.4.4 生存模型 `censored`

1.4.5 工作流集 `workflowsets`

1.5 特征工程 `recipes`

1.5.7 `tidy()`

1.6 模型评估 `yardstick`

1.6.1.2 决定系数R²，平均绝对误差MAE

1.7 重采样 `rsample`