1 聚类

https://tidyclust.tidymodels.org/

Code
# Simulate 50 points in two Gaussian clusters of 25 each:
# rows 1-25 centered at (0, 0), rows 26-50 centered at (3, -4)
set.seed(10)
x_df <- tibble(
  V1 = rnorm(n = 50, mean = rep(c(0, 3), each = 25)),
  V2 = rnorm(n = 50, mean = rep(c(0, -4), each = 25))
)
Code
# Scatter plot colored by the true (simulated) group labels
x_df %>%
  ggplot(aes(V1, V2, color = rep(c("A", "B"), each = 25))) +
  geom_point() +
  labs(color = "groups")

聚类分析的一般步骤

  1. 选择合适的变量

  2. 缩放数据:标准化

  3. 寻找异常点

  4. 计算距离:dist(x, method = ),默认使用欧几里得距离

  5. 选择聚类方法和算法

  6. 确定类的数目

  7. 获得最终的聚类解决方案

  8. 结果可视化

  9. 解读类

  10. 验证结果

1.1 划分聚类 partitioning clustering

1.1.1 K Means Cluster Specification

num_clusters = 3 指定中心点(centroids)即类的个数;nstart = 20 指定随机初始位置的个数,多次随机初始化有助于找到全局最优解,而不是陷入局部最优解

Code
# K-means model specification: 3 clusters, stats engine.
# nstart = 20 re-runs the algorithm from 20 random initializations and
# keeps the best solution, reducing the risk of a poor local optimum.
kmeans_spec <- tidyclust::k_means(num_clusters = 3) %>%
  set_mode("partition") %>%
  set_engine("stats") %>%
  set_args(nstart = 20)

kmeans_spec
#> K Means Cluster Specification (partition)
#> 
#> Main Arguments:
#>   num_clusters = 3
#> 
#> Engine-Specific Arguments:
#>   nstart = 20
#> 
#> Computational engine: stats

K-means algorithm starts with random initialization

Code
# Fit k-means on all columns of x_df (formula ~. selects every variable)
set.seed(100)
kmeans_fit <- kmeans_spec %>%
  fit(~., data = x_df)

# Underlying stats::kmeans object with centers, sizes, and SS breakdown
kmeans_fit$fit
#> K-means clustering with 3 clusters of sizes 14, 12, 24
#> 
#> Cluster means:
#>           V1         V2
#> 2 -0.4354713 -0.8929796
#> 1 -0.0594887  0.8269786
#> 3  2.6977371 -3.9171729
#> 
#> Clustering vector:
#>  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
#>  1  1  2  2  2  2  1  2  1  2  1  1  1  2  2  2  2  1  2  1  2  1  1  1  1  3 
#> 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 
#>  3  3  3  3  3  3  3  3  3  3  3  1  3  3  3  3  3  3  3  3  3  3  3  3 
#> 
#> Within cluster sum of squares by cluster:
#> [1] 19.593523  9.502891 32.730828
#>  (between_SS / total_SS =  83.4 %)
#> 
#> Available components:
#> 
#> [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
#> [6] "betweenss"    "size"         "iter"         "ifault"

# Cluster centers as a tidy tibble (tidyclust accessor)
extract_centroids(kmeans_fit)
#> # A tibble: 3 × 3
#>   .cluster       V1     V2
#>   <fct>       <dbl>  <dbl>
#> 1 Cluster_1 -0.435  -0.893
#> 2 Cluster_2 -0.0595  0.827
#> 3 Cluster_3  2.70   -3.92
# Raw centers matrix from the underlying stats::kmeans fit
kmeans_fit$fit$centers
#>           V1         V2
#> 2 -0.4354713 -0.8929796
#> 1 -0.0594887  0.8269786
#> 3  2.6977371 -3.9171729

# Integer cluster assignment for each of the 50 observations
kmeans_fit$fit$cluster
#>  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
#>  1  1  2  2  2  2  1  2  1  2  1  1  1  2  2  2  2  1  2  1  2  1  1  1  1  3 
#> 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 
#>  3  3  3  3  3  3  3  3  3  3  3  1  3  3  3  3  3  3  3  3  3  3  3  3
Code
# Predicted cluster membership (.pred_cluster factor) for new data
predict(kmeans_fit, new_data = x_df)
#> # A tibble: 50 × 1
#>    .pred_cluster
#>    <fct>        
#>  1 Cluster_1    
#>  2 Cluster_1    
#>  3 Cluster_2    
#>  4 Cluster_2    
#>  5 Cluster_2    
#>  6 Cluster_2    
#>  7 Cluster_1    
#>  8 Cluster_2    
#>  9 Cluster_1    
#> 10 Cluster_2    
#> # ℹ 40 more rows
# augment() appends .pred_cluster alongside the original columns
augment(kmeans_fit, new_data = x_df)
#> # A tibble: 50 × 3
#>         V1     V2 .pred_cluster
#>      <dbl>  <dbl> <fct>        
#>  1  0.0187 -0.401 Cluster_1    
#>  2 -0.184  -0.335 Cluster_1    
#>  3 -1.37    1.37  Cluster_2    
#>  4 -0.599   2.14  Cluster_2    
#>  5  0.295   0.506 Cluster_2    
#>  6  0.390   0.786 Cluster_2    
#>  7 -1.21   -0.902 Cluster_1    
#>  8 -0.364   0.533 Cluster_2    
#>  9 -1.63   -0.646 Cluster_1    
#> 10 -0.256   0.291 Cluster_2    
#> # ℹ 40 more rows
Code
# Visualize the fitted three-cluster assignment
augment(kmeans_fit, new_data = x_df) %>%
  ggplot(aes(V1, V2, color = .pred_cluster)) +
  geom_point()

tune_cluster()找到最适合的类的数目

Code
# Mark num_clusters as tunable, then bundle model + formula in a workflow
kmeans_spec_tuned <- kmeans_spec %>% 
  set_args(num_clusters = tune())

kmeans_wf <- workflow() %>%
  add_model(kmeans_spec_tuned) %>%
  add_formula(~.)
Code
# Tune num_clusters over 1..10 using 10 bootstrap resamples
set.seed(1000)
x_boots <- bootstraps(x_df, times = 10)

num_clusters_grid <- tibble(num_clusters = seq(1, 10))

tune_res <- tune_cluster(
  object = kmeans_wf,
  resamples = x_boots,
  grid = num_clusters_grid
)
Code
# SSE metrics (total and within-cluster) per candidate number of clusters
tune_res %>%
  collect_metrics()
#> # A tibble: 20 × 7
#>    num_clusters .metric          .estimator   mean     n std_err .config        
#>           <int> <chr>            <chr>       <dbl> <int>   <dbl> <chr>          
#>  1            1 sse_total        standard   381.      10  10.4   Preprocessor1_…
#>  2            1 sse_within_total standard   381.      10  10.4   Preprocessor1_…
#>  3            2 sse_total        standard   381.      10  10.4   Preprocessor1_…
#>  4            2 sse_within_total standard    81.4     10   4.36  Preprocessor1_…
#>  5            3 sse_total        standard   381.      10  10.4   Preprocessor1_…
#>  6            3 sse_within_total standard    56.8     10   3.34  Preprocessor1_…
#>  7            4 sse_total        standard   381.      10  10.4   Preprocessor1_…
#>  8            4 sse_within_total standard    40.5     10   2.33  Preprocessor1_…
#>  9            5 sse_total        standard   381.      10  10.4   Preprocessor1_…
#> 10            5 sse_within_total standard    29.8     10   1.71  Preprocessor1_…
#> 11            6 sse_total        standard   381.      10  10.4   Preprocessor1_…
#> 12            6 sse_within_total standard    21.8     10   1.43  Preprocessor1_…
#> 13            7 sse_total        standard   381.      10  10.4   Preprocessor1_…
#> 14            7 sse_within_total standard    17.0     10   1.04  Preprocessor1_…
#> 15            8 sse_total        standard   381.      10  10.4   Preprocessor1_…
#> 16            8 sse_within_total standard    13.6     10   0.842 Preprocessor1_…
#> 17            9 sse_total        standard   381.      10  10.4   Preprocessor1_…
#> 18            9 sse_within_total standard    11.0     10   0.669 Preprocessor1_…
#> 19           10 sse_total        standard   381.      10  10.4   Preprocessor1_…
#> 20           10 sse_within_total standard     8.82    10   0.595 Preprocessor1_…

elbow method 找到最理想的类的个数。

Code
# Elbow plot: within-cluster SSE vs. number of clusters
tune_res %>%
  autoplot()

调整后的聚类

Code
# Refit with the chosen number of clusters (k = 2, per the elbow plot)
final_kmeans <- kmeans_wf %>%
  update_model(kmeans_spec %>% set_args(num_clusters = 2)) %>%
  fit(x_df)
Code
# Plot the final two-cluster solution
augment(final_kmeans, new_data = x_df) %>%
  ggplot(aes(V1, V2, color = .pred_cluster)) +
  geom_point()

1.2 分层聚类(小样本)Hierarchical Clustering

算法

  1. 定义每个观测为一类

  2. 计算每类与其他各类的距离

  3. 把距离最短的两类合并成新的一类,总的类的个数减一

  4. 重复2,3步骤,直到所有的类聚成单个类为止

1.2.1 hclust specification

Code
# Hierarchical clustering with three different linkage methods
res_hclust_complete <- tidyclust::hier_clust(linkage_method = "complete") %>%
  fit(~., data = x_df)

res_hclust_average <- hier_clust(linkage_method = "average") %>%
  fit(~., data = x_df)

res_hclust_single <- hier_clust(linkage_method = "single") %>%
  fit(~., data = x_df)

factoextra package 提取模型信息和可视化

Code
library(factoextra)
# Dendrogram of the complete-linkage fit, cut into k = 2 clusters
res_hclust_complete %>%
  extract_fit_engine() %>%
  fviz_dend(main = "complete", k = 2)
#> Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
#> of ggplot2 3.3.4.
#> ℹ The deprecated feature was likely used in the factoextra package.
#>   Please report the issue at <https://github.com/kassambara/factoextra/issues>.

Code
# Dendrogram of the average-linkage fit, cut into k = 2 clusters
res_hclust_average %>%
  extract_fit_engine() %>%
  fviz_dend(main = "average", k = 2)

Code
# Dendrogram of the single-linkage fit, cut into k = 2 clusters
res_hclust_single %>%
  extract_fit_engine() %>%
  fviz_dend(main = "single", k = 2)

Code
# Recipe + workflow: normalize predictors, then cluster with complete linkage
hier_rec <- recipe(~., data = x_df) %>%
  step_normalize(all_numeric_predictors()) # standardize: mean 0, sd 1

hier_wf <- workflow() %>%
  add_recipe(hier_rec) %>%
  add_model(hier_clust(linkage_method = "complete"))

hier_fit <- hier_wf %>%
  fit(data = x_df) 

# Dendrogram of the normalized fit, cut into k = 2 clusters
hier_fit %>%
  extract_fit_engine() %>%
  fviz_dend(k = 2)

Back to top