1 Classification

1.1 Support Vector Machines

Support Vector Machine (SVM): used for binary and multiclass classification; it finds the optimal decision boundary, i.e. the separating hyperplane that maximizes the margin between classes.
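As a sketch only (this chapter does not fit an SVM), such a model could be specified in the same parsnip style used for the models below; `svm_rbf()`, the `kernlab` engine, and `cost = 1` are illustrative choices, not something taken from this chapter:

```r
# Illustrative only: an RBF-kernel SVM specification in parsnip.
library(tidymodels)

svm_spec <- svm_rbf(cost = 1) |>
  set_mode("classification") |>
  set_engine("kernlab")

# Fitting would follow the same pattern as the models below,
# once Smarket has been loaded (see the next section):
# svm_fit <- svm_spec |>
#   fit(Direction ~ Lag1 + Lag2, data = Smarket)
```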

1.2 K-Nearest Neighbors

Code
Smarket <- read_csv("data/Smarket.csv")
#> Rows: 1250 Columns: 9
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (1): Direction
#> dbl (8): Year, Lag1, Lag2, Lag3, Lag4, Lag5, Volume, Today
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Smarket$Direction <- factor(Smarket$Direction)
head(Smarket)
#> # A tibble: 6 × 9
#>    Year   Lag1   Lag2   Lag3   Lag4   Lag5 Volume  Today Direction
#>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl> <fct>    
#> 1  2001  0.381 -0.192 -2.62  -1.06   5.01    1.19  0.959 Up       
#> 2  2001  0.959  0.381 -0.192 -2.62  -1.06    1.30  1.03  Up       
#> 3  2001  1.03   0.959  0.381 -0.192 -2.62    1.41 -0.623 Down     
#> 4  2001 -0.623  1.03   0.959  0.381 -0.192   1.28  0.614 Up       
#> 5  2001  0.614 -0.623  1.03   0.959  0.381   1.21  0.213 Up       
#> 6  2001  0.213  0.614 -0.623  1.03   0.959   1.35  1.39  Up
Code
knn_spec <- nearest_neighbor(neighbors = 3) |> 
  set_mode("classification") |> 
  set_engine("kknn")

knn_fit <- knn_spec |>
  fit(Direction ~ Lag1 + Lag2, data = Smarket)

knn_fit
#> parsnip model object
#> 
#> 
#> Call:
#> kknn::train.kknn(formula = Direction ~ Lag1 + Lag2, data = data,     ks = min_rows(3, data, 5))
#> 
#> Type of response variable: nominal
#> Minimal misclassification: 0.5064
#> Best kernel: optimal
#> Best k: 3
Code
augment(knn_fit, new_data = Smarket) |> 
  conf_mat(truth = Direction, estimate = .pred_class) 
#>           Truth
#> Prediction Down  Up
#>       Down  602   0
#>       Up      0 648
Code
augment(knn_fit, new_data = Smarket) |>
  accuracy(truth = Direction, estimate = .pred_class) 
#> # A tibble: 1 × 3
#>   .metric  .estimator .estimate
#>   <chr>    <chr>          <dbl>
#> 1 accuracy binary             1

1.3 LDA

Code
library(discrim)  # provides discrim_linear()

lda_spec <- discrim_linear() |>
  set_mode("classification") |>
  set_engine("MASS")
lda_fit <- lda_spec |>
  fit(Direction ~ Lag1 + Lag2, data = Smarket)

lda_fit
#> parsnip model object
#> 
#> Call:
#> lda(Direction ~ Lag1 + Lag2, data = data)
#> 
#> Prior probabilities of groups:
#>   Down     Up 
#> 0.4816 0.5184 
#> 
#> Group means:
#>             Lag1        Lag2
#> Down  0.05068605  0.03229734
#> Up   -0.03969136 -0.02244444
#> 
#> Coefficients of linear discriminants:
#>             LD1
#> Lag1 -0.7567605
#> Lag2 -0.4707872

1.4 Model Comparison

Code
models <- list("LDA" = lda_fit,
               "KNN" = knn_fit)
preds <- imap_dfr(models, augment, 
                  new_data = Smarket, .id = "model")

preds |>
  dplyr::select(model, Direction, .pred_class, .pred_Down, .pred_Up)
#> # A tibble: 2,500 × 5
#>    model Direction .pred_class .pred_Down .pred_Up
#>    <chr> <fct>     <fct>            <dbl>    <dbl>
#>  1 LDA   Up        Up               0.486    0.514
#>  2 LDA   Up        Down             0.503    0.497
#>  3 LDA   Down      Down             0.510    0.490
#>  4 LDA   Up        Up               0.482    0.518
#>  5 LDA   Up        Up               0.485    0.515
#>  6 LDA   Up        Up               0.492    0.508
#>  7 LDA   Down      Down             0.509    0.491
#>  8 LDA   Up        Up               0.490    0.510
#>  9 LDA   Up        Up               0.477    0.523
#> 10 LDA   Up        Down             0.505    0.495
#> # ℹ 2,490 more rows

1.4.1 Sensitivity and Specificity

Code
multi_metric <- metric_set(sensitivity, specificity)

preds |>
  group_by(model) |>
  multi_metric(truth = Direction, estimate = .pred_class)
#> # A tibble: 4 × 4
#>   model .metric     .estimator .estimate
#>   <chr> <chr>       <chr>          <dbl>
#> 1 KNN   sensitivity binary         1    
#> 2 LDA   sensitivity binary         0.189
#> 3 KNN   specificity binary         1    
#> 4 LDA   specificity binary         0.843

1.4.2 ROC Curves

Code
preds |>
  group_by(model) |>
  roc_curve(Direction, .pred_Down) |>
  autoplot()
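As a single-number companion to the curves (not part of the original comparison), the area under each curve can be computed from the same grouped predictions with yardstick's `roc_auc()`:

```r
# Area under the ROC curve for each model, using the same
# grouped predictions and first-level event (.pred_Down) as above.
preds |>
  group_by(model) |>
  roc_auc(Direction, .pred_Down)
```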

1.5 Naive Bayes

Naive Bayes: a simple and efficient classification algorithm based on Bayes' theorem, with a "naive" assumption of conditional independence between predictors.
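A sketch of how this could be fit in the same tidymodels style as the models above; the `discrim` extension package and the `klaR` engine are assumptions, since this chapter does not actually fit the model:

```r
# Illustrative only: a naive Bayes specification via the discrim
# extension package, fit to the same Smarket predictors as above.
library(tidymodels)
library(discrim)  # provides naive_Bayes()

nb_spec <- naive_Bayes() |>
  set_mode("classification") |>
  set_engine("klaR")

nb_fit <- nb_spec |>
  fit(Direction ~ Lag1 + Lag2, data = Smarket)
```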

1.6 Tree-Based Models

  • Decision Tree: a classifier that makes predictions by following a tree-structured sequence of decision rules.

  • Random Forest: an ensemble model built from many decision trees.

  • Gradient Boosting Trees: an ensemble fit as an additive model via a forward stagewise algorithm.
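The three models above could be specified in parsnip as follows; this is a sketch only, and the engines (`rpart`, `ranger`, `xgboost`) and tuning values are illustrative assumptions, not choices made in this chapter:

```r
# Illustrative only: parsnip specifications for the three
# tree-based models listed above.
library(tidymodels)

# Single decision tree
tree_spec <- decision_tree() |>
  set_mode("classification") |>
  set_engine("rpart")

# Random forest: an ensemble of 500 trees
rf_spec <- rand_forest(trees = 500) |>
  set_mode("classification") |>
  set_engine("ranger")

# Gradient boosting: trees added sequentially at learn_rate
boost_spec <- boost_tree(trees = 500, learn_rate = 0.1) |>
  set_mode("classification") |>
  set_engine("xgboost")
```

Each specification would then be fit with the same `fit(Direction ~ Lag1 + Lag2, data = Smarket)` pattern used earlier.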
