1 聚类
https://tidyclust.tidymodels.org/
Code
聚类分析的一般步骤
选择合适的变量
缩放数据 :标准化
寻找异常点
计算距离: dist(x,method = ) 默认欧几里得距离
选择聚类方法和算法
确定类的数目
获得最终的聚类解决方案
结果可视化
解读类
验证结果
1.1 划分聚类 partitioning clustering
1.1.1 K Means Cluster Specification
num_clusters = 3
指定中心点(centroids)即类的个数,nstart = 20
指定初始位置的个数,希望找到全局最大值而不是局部最大值
Code
kmeans_spec <-tidyclust::k_means(num_clusters = 3) %>%
set_mode("partition") %>%
set_engine("stats") %>%
set_args(nstart = 20)
kmeans_spec
#> K Means Cluster Specification (partition)
#>
#> Main Arguments:
#> num_clusters = 3
#>
#> Engine-Specific Arguments:
#> nstart = 20
#>
#> Computational engine: stats
K-means algorithm starts with random initialization
Code
set.seed(100)
kmeans_fit <- kmeans_spec %>%
fit(~., data = x_df)
kmeans_fit$fit
#> K-means clustering with 3 clusters of sizes 14, 12, 24
#>
#> Cluster means:
#> V1 V2
#> 2 -0.4354713 -0.8929796
#> 1 -0.0594887 0.8269786
#> 3 2.6977371 -3.9171729
#>
#> Clustering vector:
#> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
#> 1 1 2 2 2 2 1 2 1 2 1 1 1 2 2 2 2 1 2 1 2 1 1 1 1 3
#> 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
#> 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3
#>
#> Within cluster sum of squares by cluster:
#> [1] 19.593523 9.502891 32.730828
#> (between_SS / total_SS = 83.4 %)
#>
#> Available components:
#>
#> [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
#> [6] "betweenss" "size" "iter" "ifault"
extract_centroids(kmeans_fit)
#> # A tibble: 3 × 3
#> .cluster V1 V2
#> <fct> <dbl> <dbl>
#> 1 Cluster_1 -0.435 -0.893
#> 2 Cluster_2 -0.0595 0.827
#> 3 Cluster_3 2.70 -3.92
kmeans_fit$fit$centers
#> V1 V2
#> 2 -0.4354713 -0.8929796
#> 1 -0.0594887 0.8269786
#> 3 2.6977371 -3.9171729
kmeans_fit$fit$cluster
#> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
#> 1 1 2 2 2 2 1 2 1 2 1 1 1 2 2 2 2 1 2 1 2 1 1 1 1 3
#> 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
#> 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3
Code
predict(kmeans_fit, new_data = x_df)
#> # A tibble: 50 × 1
#> .pred_cluster
#> <fct>
#> 1 Cluster_1
#> 2 Cluster_1
#> 3 Cluster_2
#> 4 Cluster_2
#> 5 Cluster_2
#> 6 Cluster_2
#> 7 Cluster_1
#> 8 Cluster_2
#> 9 Cluster_1
#> 10 Cluster_2
#> # ℹ 40 more rows
augment(kmeans_fit, new_data = x_df)
#> # A tibble: 50 × 3
#> V1 V2 .pred_cluster
#> <dbl> <dbl> <fct>
#> 1 0.0187 -0.401 Cluster_1
#> 2 -0.184 -0.335 Cluster_1
#> 3 -1.37 1.37 Cluster_2
#> 4 -0.599 2.14 Cluster_2
#> 5 0.295 0.506 Cluster_2
#> 6 0.390 0.786 Cluster_2
#> 7 -1.21 -0.902 Cluster_1
#> 8 -0.364 0.533 Cluster_2
#> 9 -1.63 -0.646 Cluster_1
#> 10 -0.256 0.291 Cluster_2
#> # ℹ 40 more rows
Code
tune_cluster()
找到最适合的类的数目
Code
set.seed(1000)
x_boots <- bootstraps(x_df, times = 10)
num_clusters_grid <- tibble(num_clusters = seq(1, 10))
tune_res <- tune_cluster(
object = kmeans_wf,
resamples = x_boots,
grid = num_clusters_grid
)
Code
tune_res %>%
collect_metrics()
#> # A tibble: 20 × 7
#> num_clusters .metric .estimator mean n std_err .config
#> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 1 sse_total standard 381. 10 10.4 Preprocessor1_…
#> 2 1 sse_within_total standard 381. 10 10.4 Preprocessor1_…
#> 3 2 sse_total standard 381. 10 10.4 Preprocessor1_…
#> 4 2 sse_within_total standard 81.4 10 4.36 Preprocessor1_…
#> 5 3 sse_total standard 381. 10 10.4 Preprocessor1_…
#> 6 3 sse_within_total standard 56.8 10 3.34 Preprocessor1_…
#> 7 4 sse_total standard 381. 10 10.4 Preprocessor1_…
#> 8 4 sse_within_total standard 40.5 10 2.33 Preprocessor1_…
#> 9 5 sse_total standard 381. 10 10.4 Preprocessor1_…
#> 10 5 sse_within_total standard 29.8 10 1.71 Preprocessor1_…
#> 11 6 sse_total standard 381. 10 10.4 Preprocessor1_…
#> 12 6 sse_within_total standard 21.8 10 1.43 Preprocessor1_…
#> 13 7 sse_total standard 381. 10 10.4 Preprocessor1_…
#> 14 7 sse_within_total standard 17.0 10 1.04 Preprocessor1_…
#> 15 8 sse_total standard 381. 10 10.4 Preprocessor1_…
#> 16 8 sse_within_total standard 13.6 10 0.842 Preprocessor1_…
#> 17 9 sse_total standard 381. 10 10.4 Preprocessor1_…
#> 18 9 sse_within_total standard 11.0 10 0.669 Preprocessor1_…
#> 19 10 sse_total standard 381. 10 10.4 Preprocessor1_…
#> 20 10 sse_within_total standard 8.82 10 0.595 Preprocessor1_…
elbow method 找到最理想的类的个数。
Code
tune_res %>%
autoplot()
调整后的聚类
1.2 分层聚类(小样本)Hierarchical Clustering
算法
定义每个观测为一类
计算每类与其他各类的距离
把距离最短的两类合并成新的一类,总的类的个数减一
重复2,3步骤,直到所有的类聚成单个类为止
1.2.1 hclust specification
Code
res_hclust_complete <- tidyclust::hier_clust(linkage_method = "complete") %>%
fit(~., data = x_df)
res_hclust_average <- hier_clust(linkage_method = "average") %>%
fit(~., data = x_df)
res_hclust_single <- hier_clust(linkage_method = "single") %>%
fit(~., data = x_df)
factoextra package 提取模型信息和可视化
Code
library(factoextra)
res_hclust_complete %>%
extract_fit_engine() %>%
fviz_dend(main = "complete", k = 2)
#> Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
#> of ggplot2 3.3.4.
#> ℹ The deprecated feature was likely used in the factoextra package.
#> Please report the issue at <https://github.com/kassambara/factoextra/issues>.
Code
res_hclust_average %>%
extract_fit_engine() %>%
fviz_dend(main = "average", k = 2)
Code
res_hclust_single %>%
extract_fit_engine() %>%
fviz_dend(main = "single", k = 2)
Code
hier_rec <- recipe(~., data = x_df) %>%
step_normalize(all_numeric_predictors()) # 标准化
hier_wf <- workflow() %>%
add_recipe(hier_rec) %>%
add_model(hier_clust(linkage_method = "complete"))
hier_fit <- hier_wf %>%
fit(data = x_df)
hier_fit %>%
extract_fit_engine() %>%
fviz_dend(k = 2)