9 数据清洗

9.1 行操作dplyr

https://dplyr.tidyverse.org/index.html “分割-应用-组合”（Split-Apply-Combine）

9.1.1 `filter()`

Show the code

dplyr::filter(mpg,model=="a4")

manufacturer	model	displ	year	cyl	trans	drv	cty	hwy	fl	class
audi	a4	1.8	1999	4	auto(l5)	f	18	29	p	compact
audi	a4	1.8	1999	4	manual(m5)	f	21	29	p	compact
audi	a4	2.0	2008	4	manual(m6)	f	20	31	p	compact
audi	a4	2.0	2008	4	auto(av)	f	21	30	p	compact
audi	a4	2.8	1999	6	auto(l5)	f	16	26	p	compact
audi	a4	2.8	1999	6	manual(m5)	f	18	26	p	compact
audi	a4	3.1	2008	6	auto(av)	f	18	27	p	compact

Show the code




dplyr::filter(mpg,drv %in% c("r","f")) |> DT::datatable()

9.1.2 `arrange()`

Show the code

arrange(mpg, year,displ,cty)|> DT::datatable()

Show the code


arrange(mpg,desc(displ))|> DT::datatable()

9.1.3 `distinct()`

Show the code

distinct(mpg,manufacturer)

manufacturer
audi
chevrolet
dodge
ford
honda
hyundai
jeep
land rover
lincoln
mercury
nissan
pontiac
subaru
toyota
volkswagen

Show the code

count(mpg,manufacturer,sort = TRUE)

manufacturer	n
dodge	37
toyota	34
volkswagen	27
ford	25
chevrolet	19
audi	18
hyundai	14
subaru	14
nissan	13
honda	9
jeep	8
pontiac	5
land rover	4
mercury	4
lincoln	3

Show the code


mpg |> 
    distinct(manufacturer,cyl) |> 
    count(cyl)

cyl	n
4	9
5	1
6	11
8	11

Show the code


# 将在数据集中找到唯一行的第一个出现并丢弃其余行
distinct(mpg,manufacturer,.keep_all = TRUE)

manufacturer	model	displ	year	cyl	trans	drv	cty	hwy	fl	class
audi	a4	1.8	1999	4	auto(l5)	f	18	29	p	compact
chevrolet	c1500 suburban 2wd	5.3	2008	8	auto(l4)	r	14	20	r	suv
dodge	caravan 2wd	2.4	1999	4	auto(l3)	f	18	24	r	minivan
ford	expedition 2wd	4.6	1999	8	auto(l4)	r	11	17	r	suv
honda	civic	1.6	1999	4	manual(m5)	f	28	33	r	subcompact
hyundai	sonata	2.4	1999	4	auto(l4)	f	18	26	r	midsize
jeep	grand cherokee 4wd	3.0	2008	6	auto(l5)	4	17	22	d	suv
land rover	range rover	4.0	1999	8	auto(l4)	4	11	15	p	suv
lincoln	navigator 2wd	5.4	1999	8	auto(l4)	r	11	17	r	suv
mercury	mountaineer 4wd	4.0	1999	6	auto(l5)	4	14	17	r	suv
nissan	altima	2.4	1999	4	manual(m5)	f	21	29	r	compact
pontiac	grand prix	3.1	1999	6	auto(l4)	f	18	26	r	midsize
subaru	forester awd	2.5	1999	4	manual(m5)	4	18	25	r	suv
toyota	4runner 4wd	2.7	1999	4	manual(m5)	4	15	20	r	suv
volkswagen	gti	2.0	1999	4	manual(m5)	f	21	29	r	compact

9.1.4 `slice()`

Show the code

slice_head(mpg , n=6)

manufacturer	model	displ	year	cyl	trans	drv	cty	hwy	fl	class
audi	a4	1.8	1999	4	auto(l5)	f	18	29	p	compact
audi	a4	1.8	1999	4	manual(m5)	f	21	29	p	compact
audi	a4	2.0	2008	4	manual(m6)	f	20	31	p	compact
audi	a4	2.0	2008	4	auto(av)	f	21	30	p	compact
audi	a4	2.8	1999	6	auto(l5)	f	16	26	p	compact
audi	a4	2.8	1999	6	manual(m5)	f	18	26	p	compact

Show the code

slice_max(mpg, order_by = displ ,n=6)

manufacturer	model	displ	year	cyl	trans	drv	cty	hwy	fl	class
chevrolet	corvette	7.0	2008	8	manual(m6)	r	15	24	p	2seater
chevrolet	k1500 tahoe 4wd	6.5	1999	8	auto(l4)	4	14	17	d	suv
chevrolet	corvette	6.2	2008	8	manual(m6)	r	16	26	p	2seater
chevrolet	corvette	6.2	2008	8	auto(s6)	r	15	25	p	2seater
jeep	grand cherokee 4wd	6.1	2008	8	auto(l5)	4	11	14	p	suv
chevrolet	c1500 suburban 2wd	6.0	2008	8	auto(l4)	r	12	17	r	suv

Show the code

slice_min(mpg,order_by = displ,prop = 0.01)

manufacturer	model	displ	year	cyl	trans	drv	cty	hwy	fl	class
honda	civic	1.6	1999	4	manual(m5)	f	28	33	r	subcompact
honda	civic	1.6	1999	4	auto(l4)	f	24	32	r	subcompact
honda	civic	1.6	1999	4	manual(m5)	f	25	32	r	subcompact
honda	civic	1.6	1999	4	manual(m5)	f	23	29	p	subcompact
honda	civic	1.6	1999	4	auto(l4)	f	24	32	r	subcompact

Show the code


slice_sample(mpg,n=6,weight_by =hwy ,replace = T)

manufacturer	model	displ	year	cyl	trans	drv	cty	hwy	fl	class
ford	expedition 2wd	5.4	1999	8	auto(l4)	r	11	17	r	suv
volkswagen	new beetle	1.9	1999	4	manual(m5)	f	35	44	d	subcompact
dodge	ram 1500 pickup 4wd	4.7	2008	8	manual(m6)	4	9	12	e	pickup
jeep	grand cherokee 4wd	4.7	2008	8	auto(l5)	4	14	19	r	suv
chevrolet	corvette	7.0	2008	8	manual(m6)	r	15	24	p	2seater
subaru	forester awd	2.5	2008	4	auto(l4)	4	18	23	p	suv

9.2 列操作 dplyr

9.2.1 `glimpse()`

Show the code

glimpse(mpg) %>% select(1:3) %>% head()
#> Rows: 234
#> Columns: 11
#> $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
#> $ model        <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
#> $ displ        <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.…
#> $ year         <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 200…
#> $ cyl          <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, …
#> $ trans        <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto…
#> $ drv          <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4", "4…
#> $ cty          <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, 1…
#> $ hwy          <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25, 2…
#> $ fl           <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p…
#> $ class        <chr> "compact", "compact", "compact", "compact", "compact", "c…

manufacturer	model	displ
audi	a4	1.8
audi	a4	1.8
audi	a4	2.0
audi	a4	2.0
audi	a4	2.8
audi	a4	2.8

9.2.2 `pull()`

提取单个列

Show the code

mpg %>% 
    pull(var = cyl)
#>   [1] 4 4 4 4 6 6 6 4 4 4 4 6 6 6 6 6 6 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 4 4 6 6 6
#>  [38] 4 6 6 6 6 6 6 6 6 6 6 6 6 6 6 8 8 8 8 8 6 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
#>  [75] 8 8 8 6 6 6 6 8 8 6 6 8 8 8 8 8 6 6 6 6 8 8 8 8 8 4 4 4 4 4 4 4 4 4 4 4 4
#> [112] 4 6 6 6 4 4 4 4 6 6 6 6 6 6 8 8 8 8 8 8 8 8 8 8 8 8 6 6 8 8 4 4 4 4 6 6 6
#> [149] 6 6 6 6 6 8 6 6 6 6 8 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 6 6 6 8 4 4 4 4 6 6
#> [186] 6 4 4 4 4 6 6 6 4 4 4 4 4 8 8 4 4 4 6 6 6 6 4 4 4 4 6 4 4 4 4 4 5 5 6 6 4
#> [223] 4 4 4 5 5 4 4 4 4 6 6 6

9.2.3 `mutate()`

Show the code

mutate(mpg,
       id=nrow(mpg),
       .before = 1)|> DT::datatable()

Show the code


mutate(mpg,
       id=nrow(mpg),
       .after = 1)|> DT::datatable()

Show the code


mutate(mpg,
       id=nrow(mpg),
       .keep = "used")|> DT::datatable()

9.2.4 `rename()`

Show the code

rename(mpg,生产厂商=manufacturer) |> head()

生产厂商	model	displ	year	cyl	trans	drv	cty	hwy	fl	class
audi	a4	1.8	1999	4	auto(l5)	f	18	29	p	compact
audi	a4	1.8	1999	4	manual(m5)	f	21	29	p	compact
audi	a4	2.0	2008	4	manual(m6)	f	20	31	p	compact
audi	a4	2.0	2008	4	auto(av)	f	21	30	p	compact
audi	a4	2.8	1999	6	auto(l5)	f	16	26	p	compact
audi	a4	2.8	1999	6	manual(m5)	f	18	26	p	compact

9.2.5 `relocate()`

Show the code

relocate(mpg , model:year ,.before = 1) |> head()

model	displ	year	manufacturer	cyl	trans	drv	cty	hwy	fl	class
a4	1.8	1999	audi	4	auto(l5)	f	18	29	p	compact
a4	1.8	1999	audi	4	manual(m5)	f	21	29	p	compact
a4	2.0	2008	audi	4	manual(m6)	f	20	31	p	compact
a4	2.0	2008	audi	4	auto(av)	f	21	30	p	compact
a4	2.8	1999	audi	6	auto(l5)	f	16	26	p	compact
a4	2.8	1999	audi	6	manual(m5)	f	18	26	p	compact

Show the code

relocate(mpg , model:year ,.after = cyl) |> head()

manufacturer	cyl	model	displ	year	trans	drv	cty	hwy	fl	class
audi	4	a4	1.8	1999	auto(l5)	f	18	29	p	compact
audi	4	a4	1.8	1999	manual(m5)	f	21	29	p	compact
audi	4	a4	2.0	2008	manual(m6)	f	20	31	p	compact
audi	4	a4	2.0	2008	auto(av)	f	21	30	p	compact
audi	6	a4	2.8	1999	auto(l5)	f	16	26	p	compact
audi	6	a4	2.8	1999	manual(m5)	f	18	26	p	compact

9.2.6 `select()`

:用于选择一系列连续变量。
!用于获取一组变量的补集。
&以及|用于选择交集或并集。
c()用于组合选择。

选择帮助程序( selection helpers )选择特定的列：

everything()：匹配所有变量。
last_col()：选择最后一个变量，可能带有偏移量。
group_cols()：选择所有分组列。
starts_with()：前缀开头。
ends_with()：后缀结尾。
contains()：包含文本字符串。
matches()：匹配正则表达式。
num_range()：匹配 x01、x02、x03 等数值范围。

或者从存储在字符向量中的变量：

all_of()：匹配字符向量中的变量名称。所有名称必须存在，否则抛出越界错误（out-of-bounds error）。
any_of()：与all_of()相同，只是对于不存在的名称不会抛出任何错误。

或者使用谓词( predicate )函数：

where(fn)：将该函数应用于所有变量并选择该函数返回TRUE的变量。

Show the code

dplyr::select(mpg,model:year)|> DT::datatable()

Show the code

select(mpg,c(3,4,5))|> DT::datatable()

Show the code

select(mpg,where(is.numeric))|> DT::datatable()

Show the code

select(mpg,where(~is.numeric(.x) && mean(.x,na.rm=TRUE)<50))|> DT::datatable()

9.2.7 `across(.cols, .fns, ...)`

Show the code

iris <- as_tibble(iris)
iris %>%
  mutate(across(where(is.double) & !c(Petal.Length, Petal.Width), round)) %>% 
    head(6)

Sepal.Length	Sepal.Width	Petal.Length	Petal.Width	Species
5	4	1.4	0.2	setosa
5	3	1.4	0.2	setosa
5	3	1.3	0.2	setosa
5	3	1.5	0.2	setosa
5	4	1.4	0.2	setosa
5	4	1.7	0.4	setosa

Show the code


cols <- c("Sepal.Length", "Petal.Width")
iris %>%
  mutate(across(all_of(cols), round))

Sepal.Length	Sepal.Width	Petal.Length	Petal.Width	Species
5	3.5	1.4	0	setosa
5	3.0	1.4	0	setosa
5	3.2	1.3	0	setosa
5	3.1	1.5	0	setosa
5	3.6	1.4	0	setosa
5	3.9	1.7	0	setosa
5	3.4	1.4	0	setosa
5	3.4	1.5	0	setosa
4	2.9	1.4	0	setosa
5	3.1	1.5	0	setosa
5	3.7	1.5	0	setosa
5	3.4	1.6	0	setosa
5	3.0	1.4	0	setosa
4	3.0	1.1	0	setosa
6	4.0	1.2	0	setosa
6	4.4	1.5	0	setosa
5	3.9	1.3	0	setosa
5	3.5	1.4	0	setosa
6	3.8	1.7	0	setosa
5	3.8	1.5	0	setosa
5	3.4	1.7	0	setosa
5	3.7	1.5	0	setosa
5	3.6	1.0	0	setosa
5	3.3	1.7	0	setosa
5	3.4	1.9	0	setosa
5	3.0	1.6	0	setosa
5	3.4	1.6	0	setosa
5	3.5	1.5	0	setosa
5	3.4	1.4	0	setosa
5	3.2	1.6	0	setosa
5	3.1	1.6	0	setosa
5	3.4	1.5	0	setosa
5	4.1	1.5	0	setosa
6	4.2	1.4	0	setosa
5	3.1	1.5	0	setosa
5	3.2	1.2	0	setosa
6	3.5	1.3	0	setosa
5	3.6	1.4	0	setosa
4	3.0	1.3	0	setosa
5	3.4	1.5	0	setosa
5	3.5	1.3	0	setosa
4	2.3	1.3	0	setosa
4	3.2	1.3	0	setosa
5	3.5	1.6	1	setosa
5	3.8	1.9	0	setosa
5	3.0	1.4	0	setosa
5	3.8	1.6	0	setosa
5	3.2	1.4	0	setosa
5	3.7	1.5	0	setosa
5	3.3	1.4	0	setosa
7	3.2	4.7	1	versicolor
6	3.2	4.5	2	versicolor
7	3.1	4.9	2	versicolor
6	2.3	4.0	1	versicolor
6	2.8	4.6	2	versicolor
6	2.8	4.5	1	versicolor
6	3.3	4.7	2	versicolor
5	2.4	3.3	1	versicolor
7	2.9	4.6	1	versicolor
5	2.7	3.9	1	versicolor
5	2.0	3.5	1	versicolor
6	3.0	4.2	2	versicolor
6	2.2	4.0	1	versicolor
6	2.9	4.7	1	versicolor
6	2.9	3.6	1	versicolor
7	3.1	4.4	1	versicolor
6	3.0	4.5	2	versicolor
6	2.7	4.1	1	versicolor
6	2.2	4.5	2	versicolor
6	2.5	3.9	1	versicolor
6	3.2	4.8	2	versicolor
6	2.8	4.0	1	versicolor
6	2.5	4.9	2	versicolor
6	2.8	4.7	1	versicolor
6	2.9	4.3	1	versicolor
7	3.0	4.4	1	versicolor
7	2.8	4.8	1	versicolor
7	3.0	5.0	2	versicolor
6	2.9	4.5	2	versicolor
6	2.6	3.5	1	versicolor
6	2.4	3.8	1	versicolor
6	2.4	3.7	1	versicolor
6	2.7	3.9	1	versicolor
6	2.7	5.1	2	versicolor
5	3.0	4.5	2	versicolor
6	3.4	4.5	2	versicolor
7	3.1	4.7	2	versicolor
6	2.3	4.4	1	versicolor
6	3.0	4.1	1	versicolor
6	2.5	4.0	1	versicolor
6	2.6	4.4	1	versicolor
6	3.0	4.6	1	versicolor
6	2.6	4.0	1	versicolor
5	2.3	3.3	1	versicolor
6	2.7	4.2	1	versicolor
6	3.0	4.2	1	versicolor
6	2.9	4.2	1	versicolor
6	2.9	4.3	1	versicolor
5	2.5	3.0	1	versicolor
6	2.8	4.1	1	versicolor
6	3.3	6.0	2	virginica
6	2.7	5.1	2	virginica
7	3.0	5.9	2	virginica
6	2.9	5.6	2	virginica
6	3.0	5.8	2	virginica
8	3.0	6.6	2	virginica
5	2.5	4.5	2	virginica
7	2.9	6.3	2	virginica
7	2.5	5.8	2	virginica
7	3.6	6.1	2	virginica
6	3.2	5.1	2	virginica
6	2.7	5.3	2	virginica
7	3.0	5.5	2	virginica
6	2.5	5.0	2	virginica
6	2.8	5.1	2	virginica
6	3.2	5.3	2	virginica
6	3.0	5.5	2	virginica
8	3.8	6.7	2	virginica
8	2.6	6.9	2	virginica
6	2.2	5.0	2	virginica
7	3.2	5.7	2	virginica
6	2.8	4.9	2	virginica
8	2.8	6.7	2	virginica
6	2.7	4.9	2	virginica
7	3.3	5.7	2	virginica
7	3.2	6.0	2	virginica
6	2.8	4.8	2	virginica
6	3.0	4.9	2	virginica
6	2.8	5.6	2	virginica
7	3.0	5.8	2	virginica
7	2.8	6.1	2	virginica
8	3.8	6.4	2	virginica
6	2.8	5.6	2	virginica
6	2.8	5.1	2	virginica
6	2.6	5.6	1	virginica
8	3.0	6.1	2	virginica
6	3.4	5.6	2	virginica
6	3.1	5.5	2	virginica
6	3.0	4.8	2	virginica
7	3.1	5.4	2	virginica
7	3.1	5.6	2	virginica
7	3.1	5.1	2	virginica
6	2.7	5.1	2	virginica
7	3.2	5.9	2	virginica
7	3.3	5.7	2	virginica
7	3.0	5.2	2	virginica
6	2.5	5.0	2	virginica
6	3.0	5.2	2	virginica
6	3.4	5.4	2	virginica
6	3.0	5.1	2	virginica

Show the code



iris %>%
  group_by(Species) %>%
  summarise(across(starts_with("Sepal"), ~ mean(.x, na.rm = TRUE),.names = "mean_{.col}"))

Species	mean_Sepal.Length	mean_Sepal.Width
setosa	5.006	3.428
versicolor	5.936	2.770
virginica	6.588	2.974

Show the code



# 命名列表
iris %>%
  group_by(Species) %>%
  summarise(across(starts_with("Sepal"), list(mean = mean, sd = sd), .names = "{.col}_{.fn}"))

Species	Sepal.Length_mean	Sepal.Length_sd	Sepal.Width_mean	Sepal.Width_sd
setosa	5.006	0.3524897	3.428	0.3790644
versicolor	5.936	0.5161711	2.770	0.3137983
virginica	6.588	0.6358796	2.974	0.3224966

Show the code


iris %>%
  group_by(Species) %>%
  summarise(across(starts_with("Sepal"), list(mean, sd), .names = "{.col}_function{.fn}"))

Species	Sepal.Length_function1	Sepal.Length_function2	Sepal.Width_function1	Sepal.Width_function2
setosa	5.006	0.3524897	3.428	0.3790644
versicolor	5.936	0.5161711	2.770	0.3137983
virginica	6.588	0.6358796	2.974	0.3224966

Show the code

iris %>%
  dplyr::filter(if_any(ends_with("Width"), ~ . > 4))

Sepal.Length	Sepal.Width	Petal.Length	Petal.Width	Species
5.7	4.4	1.5	0.4	setosa
5.2	4.1	1.5	0.1	setosa
5.5	4.2	1.4	0.2	setosa

Show the code

iris %>%
  dplyr::filter(if_all(ends_with("Width"), ~ . > 2))

Sepal.Length	Sepal.Width	Petal.Length	Petal.Width	Species
6.3	3.3	6.0	2.5	virginica
7.1	3.0	5.9	2.1	virginica
6.5	3.0	5.8	2.2	virginica
7.6	3.0	6.6	2.1	virginica
7.2	3.6	6.1	2.5	virginica
6.8	3.0	5.5	2.1	virginica
5.8	2.8	5.1	2.4	virginica
6.4	3.2	5.3	2.3	virginica
7.7	3.8	6.7	2.2	virginica
7.7	2.6	6.9	2.3	virginica
6.9	3.2	5.7	2.3	virginica
6.7	3.3	5.7	2.1	virginica
6.4	2.8	5.6	2.1	virginica
6.4	2.8	5.6	2.2	virginica
7.7	3.0	6.1	2.3	virginica
6.3	3.4	5.6	2.4	virginica
6.9	3.1	5.4	2.1	virginica
6.7	3.1	5.6	2.4	virginica
6.9	3.1	5.1	2.3	virginica
6.8	3.2	5.9	2.3	virginica
6.7	3.3	5.7	2.5	virginica
6.7	3.0	5.2	2.3	virginica
6.2	3.4	5.4	2.3	virginica

9.3 按组操作

9.3.1 `group_by()`

Show the code

mpg |> 
    group_by(cyl,drv) |> 
    summarise(n=n(),
              .groups = "drop_last") #删除最后一个分组条件

cyl	drv	n
4	4	23
4	f	58
5	f	4
6	4	32
6	f	43
6	r	4
8	4	48
8	f	1
8	r	21

Show the code

# drop 删除所有分组
# keep 保留所有分组

9.3.2 `.by =`

Show the code

mpg %>% 
    summarise(n=n(),
              .by = c(cyl,drv))

cyl	drv	n
4	f	58
6	f	43
4	4	23
6	4	32
8	4	48
8	r	21
6	r	4
8	f	1
5	f	4

9.3.3 计数

Show the code

starwars %>% count(sex, gender, sort = TRUE)

sex	gender	n
male	masculine	60
female	feminine	16
none	masculine	5
NA	NA	4
hermaphroditic	masculine	1
none	feminine	1

Show the code


starwars %>% group_by(sex, gender) %>% 
    summarise(n=n(),.groups = "drop") %>% 
    arrange(desc(n))

sex	gender	n
male	masculine	60
female	feminine	16
none	masculine	5
NA	NA	4
hermaphroditic	masculine	1
none	feminine	1

9.3.4 `rowwise()`

Show the code

options(digits = 3)
set.seed(123)
df <- tibble(x = runif(6), y = runif(6), z = runif(6))
df

x	y	z
0.288	0.528	0.678
0.788	0.892	0.573
0.409	0.551	0.103
0.883	0.457	0.900
0.940	0.957	0.246
0.046	0.453	0.042

Show the code


df %>% mutate(m = pmin(x, y, z))

x	y	z	m
0.288	0.528	0.678	0.288
0.788	0.892	0.573	0.573
0.409	0.551	0.103	0.103
0.883	0.457	0.900	0.457
0.940	0.957	0.246	0.246
0.046	0.453	0.042	0.042

9.3.5 `reframe()`

Show the code

df <- tibble(
  g = c(1, 1, 1, 2, 2, 2, 2),
  x = c(1:7)
)
df %>%
  reframe(x = mean(x), .by = g)

g	x
1	2.0
2	5.5

9.4 按数据框操作

9.4.1 绑定操作

9.4.1.1 `bing_cols()`

Show the code

df1 <- tibble(x = 1:3)
df2 <- tibble(y = 3:1)
bind_cols(df1, df2)

x	y
1	3
2	2
3	1

9.4.1.2 `bind_rows()`

Show the code

df1 <- tibble(x = 1:2, y = letters[1:2])
df2 <- tibble(x = 4:5, z = 1:2)

bind_rows(df1, df2)

x	y	z
1	a	NA
2	b	NA
4	NA	1
5	NA	2

9.4.2 集合操作

集合运算要求变量名（列）完全相同,把观测（行）看成是集合中的元素

Show the code

x<-tibble(ID=c(1,2),X=c("a1",'a2'))
y<-tibble(ID=c(2,3),X=c("a2",'a3'))
x;y

ID	X
1	a1
2	a2

ID	X
2	a2
3	a3

Show the code

dplyr::intersect(x,y)

ID	X
2	a2

Show the code

union(x,y)

ID	X
1	a1
2	a2
3	a3

Show the code

union_all(x,y)

ID	X
1	a1
2	a2
2	a2
3	a3

Show the code


dplyr::setdiff(x,y) #返回在x中不在y中

ID	X
1	a1

Show the code

symdiff(x,y)  # 在x不在y中的行和在y中不在x中的行

ID	X
1	a1
3	a3

Show the code

setequal(x,y)#判断是否相等
#> [1] FALSE

9.4.3 连接操作

Show the code

x <- tribble(
  ~ID, ~X,
  1, "x1",
  2, "x2",
  3, "x3"
)
y <- tribble(
  ~id, ~Y,
  1, "y1",
  2, "y2",
  4, "y4"
)
x;y

ID	X
1	x1
2	x2
3	x3

id	Y
1	y1
2	y2
4	y4

9.4.3.1 mutate-joins

Show the code

# 左连接
left_join(x, y, by = join_by(ID == id))

ID	X	Y
1	x1	y1
2	x2	y2
3	x3	NA

Show the code


# 右连接
right_join(x, y, by = join_by(ID == id))

ID	X	Y
1	x1	y1
2	x2	y2
4	NA	y4

Show the code



# 全连接
full_join(x, y, by = join_by(ID == id))

ID	X	Y
1	x1	y1
2	x2	y2
3	x3	NA
4	NA	y4

Show the code


# 内连接
inner_join(x, y, by = join_by(ID == id))

ID	X	Y
1	x1	y1
2	x2	y2

Show the code

inner_join(x,y, by = join_by(ID >=id), keep = TRUE)

ID	X	id	Y
1	x1	1	y1
2	x2	1	y1
2	x2	2	y2
3	x3	1	y1
3	x3	2	y2

Show the code

#滚动联接  rolling join 类似于不等式连接，但仅匹配最近一个值。
inner_join(x,y, join_by(closest(ID >=id)))

ID	X	id	Y
1	x1	1	y1
2	x2	2	y2
3	x3	2	y2

9.4.3.2 filter-joins

对第一个数据框进行筛选

Show the code

# 半连接
semi_join(x, y, by = join_by(ID == id))

ID	X
1	x1
2	x2

Show the code


# 反连接
anti_join(x, y, by = join_by(ID == id))

ID	X
3	x3

9.4.3.3 cross-join

Show the code

#交叉连接
cross_join(x,y)         #nrow(x) * nrow(y)

ID	X	id	Y
1	x1	1	y1
1	x1	2	y2
1	x1	4	y4
2	x2	1	y1
2	x2	2	y2
2	x2	4	y4
3	x3	1	y1
3	x3	2	y2
3	x3	4	y4

Show the code



nest_join(x,y,by = join_by(ID==id))->nested

nested[[3]]
#> [[1]]
#> # A tibble: 1 × 1
#>   Y    
#>   <chr>
#> 1 y1   
#> 
#> [[2]]
#> # A tibble: 1 × 1
#>   Y    
#>   <chr>
#> 1 y2   
#> 
#> [[3]]
#> # A tibble: 0 × 1
#> # ℹ 1 variable: Y <chr>

9.4.3.4 `group_nest()`

Show the code

library(tidyverse)
library(broom)
data(crickets, package = "modeldata")
names(crickets)
#> [1] "species" "temp"    "rate"

split_by_species <- crickets %>% 
  group_nest(species) 
split_by_species

species	data
O. exclamationis	20.8, 20.8, 24.0, 24.0, 24.0, 24.0, 26.2, 26.2, 26.2, 26.2, 28.4, 29.0, 30.4, 30.4, 67.9, 65.1, 77.3, 78.7, 79.4, 80.4, 85.8, 86.6, 87.5, 89.1, 98.6, 100.8, 99.3, 101.7
O. niveus	17.2, 18.3, 18.3, 18.3, 18.9, 18.9, 20.4, 21.0, 21.0, 22.1, 23.5, 24.2, 25.9, 26.5, 26.5, 26.5, 28.6, 44.3, 47.2, 47.6, 49.6, 50.3, 51.8, 60.0, 58.5, 58.9, 60.7, 69.8, 70.9, 76.2, 76.1, 77.0, 77.7, 84.7

Show the code


model_by_species <- 
  split_by_species %>% 
  mutate(model = map(data, ~ lm(rate ~ temp, data = .x)))
model_by_species

species	data	model
O. exclamationis	20.8, 20.8, 24.0, 24.0, 24.0, 24.0, 26.2, 26.2, 26.2, 26.2, 28.4, 29.0, 30.4, 30.4, 67.9, 65.1, 77.3, 78.7, 79.4, 80.4, 85.8, 86.6, 87.5, 89.1, 98.6, 100.8, 99.3, 101.7	c(`(Inte.... \| \|O. niveus \|17.2, 18.3, 18.3, 18.3, 18.9, 18.9, 20.4, 21.0, 21.0, 22.1, 23.5, 24.2, 25.9, 26.5, 26.5, 26.5, 28.6, 44.3, 47.2, 47.6, 49.6, 50.3, 51.8, 60.0, 58.5, 58.9, 60.7, 69.8, 70.9, 76.2, 76.1, 77.0, 77.7, 84.7 \|c(`(Inte….

Show the code


model_by_species %>% 
  mutate(coef = map(model, tidy)) %>% 
  select(species, coef) %>% 
  unnest(cols = c(coef))

species	term	estimate	std.error	statistic	p.value
O. exclamationis	(Intercept)	-11.04	4.765	-2.32	0.039
O. exclamationis	temp	3.75	0.184	20.41	0.000
O. niveus	(Intercept)	-15.39	2.347	-6.56	0.000
O. niveus	temp	3.52	0.105	33.59	0.000

9.4.3.5 `group_split()`

9.5 整洁操作

9.5.1 `pivot_longer()`

9.5.1.1 列名包含一个变量名

Show the code

billboard|> DT::datatable()

Show the code

billboard_longer <- billboard |> 
  pivot_longer(
    cols = starts_with("wk"), 
    names_to = "week", 
    values_to = "排名" ,
    values_drop_na = TRUE,
  ) |> 
    mutate(
        week=parse_number(week)
    )
billboard_longer|> DT::datatable()

Show the code

billboard_longer |> 
  ggplot(aes(x = week, y = 排名, group = track)) + 
  geom_line(alpha = 0.25) + 
  scale_y_reverse()

9.5.1.2 列名包含多个变量名

Show the code

who2 |> DT::datatable()

Show the code

who2 |> 
  pivot_longer(
    cols = !(country:year),
    names_to = c("diagnosis", "gender", "age"), 
    names_sep = "_",
    values_to = "count",
    values_drop_na = T
  ) |> head()

country	year	diagnosis	gender	age	count
Afghanistan	1997	sp	m	014	0
Afghanistan	1997	sp	m	1524	10
Afghanistan	1997	sp	m	2534	6
Afghanistan	1997	sp	m	3544	3
Afghanistan	1997	sp	m	4554	5
Afghanistan	1997	sp	m	5564	2

9.5.1.3 列名包含变量名和变量值

Show the code

household

family	dob_child1	dob_child2	name_child1	name_child2
1	1998-11-26	2000-01-29	Susan	Jose
2	1996-06-22	NA	Mark	NA
3	2002-07-11	2004-04-05	Sam	Seth
4	2004-10-10	2009-08-27	Craig	Khai
5	2000-12-05	2005-02-28	Parker	Gracie

Show the code

household |> 
  pivot_longer(
    cols = !family, 
    names_to = c(".value", "child"), # 使用透视列名称的第一个组件作为变量名称
    names_sep = "_", 
    values_drop_na = TRUE
  )

family	child	dob	name
1	child1	1998-11-26	Susan
1	child2	2000-01-29	Jose
2	child1	1996-06-22	Mark
3	child1	2002-07-11	Sam
3	child2	2004-04-05	Seth
4	child1	2004-10-10	Craig
4	child2	2009-08-27	Khai
5	child1	2000-12-05	Parker
5	child2	2005-02-28	Gracie

Show the code

df <- tibble(ID=c("A","B","C"),
       GDP_2019=c(114,251,152),
       GDP_2018=c(215,115,141),
       GDP_2017=c(141,244,243)) 
df

ID	GDP_2019	GDP_2018	GDP_2017
A	114	215	141
B	251	115	244
C	152	141	243

Show the code


df %>% 
  pivot_longer(
    cols = -ID,
    names_to =c(".value","year"),
    names_sep = "_") %>% 
    arrange(year)

ID	year	GDP
A	2017	141
B	2017	244
C	2017	243
A	2018	215
B	2018	115
C	2018	141
A	2019	114
B	2019	251
C	2019	152

Show the code


df <- tribble(
  ~class1_name,~class1_major,~class2_name,~class2_major,~class3_name,~class3_major,
  '张非',"math",'李斯','English','王武','statistic',
  '钟爱','English','陈述','math','孙健','medicine'
) 
df

class1_name	class1_major	class2_name	class2_major	class3_name	class3_major
张非	math	李斯	English	王武	statistic
钟爱	English	陈述	math	孙健	medicine

Show the code

df%>% 
  pivot_longer(
    col=starts_with("class"),
    names_to = c("group",".value"),
    names_sep = "_",
  )

group	name	major
class1	张非	math
class2	李斯	English
class3	王武	statistic
class1	钟爱	English
class2	陈述	math
class3	孙健	medicine

9.5.2 `pivot_wider()`

Show the code

cms_patient_experience|> DT::datatable()

Show the code

cms_patient_experience |> 
  distinct(measure_cd, measure_title)

measure_cd	measure_title
CAHPS_GRP_1	CAHPS for MIPS SSM: Getting Timely Care, Appointments, and Information
CAHPS_GRP_2	CAHPS for MIPS SSM: How Well Providers Communicate
CAHPS_GRP_3	CAHPS for MIPS SSM: Patient’s Rating of Provider
CAHPS_GRP_5	CAHPS for MIPS SSM: Health Promotion and Education
CAHPS_GRP_8	CAHPS for MIPS SSM: Courteous and Helpful Office Staff
CAHPS_GRP_12	CAHPS for MIPS SSM: Stewardship of Patient Resources

Show the code

cms_patient_experience |>
    pivot_wider(
        id_cols = starts_with("org"),# 唯一标识列
        names_from = measure_cd,
        values_from = prf_rate
    ) |> DT::datatable()

9.5.3 `tidyr::separate_longer_*()`

Show the code

# df |> separate_longer_delim(cols, delim)
# df |> separate_longer_position(cols, width)
# df |> separate_wider_delim(cols, delim, names)
# df |> separate_wider_position(cols, widths)
# df |> separate_wider_regex(cols,patterns)

Show the code

df1 <- tibble(x = c("a,b,c", "d,e", "f"))
df1 |> 
  separate_longer_delim(x, delim = ",")

x
a
b
c
d
e
f

Show the code


df2 <- tibble(x = c("1211", "131", "21"))
df2 |> 
  separate_longer_position(x, width = 1)

x
1
2
1
1
1
3
1
2
1

9.5.4 `tidyr::separate_wider_*()`

Show the code

df3 <- tibble(x = c("a10.1.2022", "b10.2.2011", "e15.1.2015"))
df3

x
a10.1.2022
b10.2.2011
e15.1.2015

Show the code

df3 |> 
  separate_wider_delim(
    x,
    delim = ".",
    names = c("code", "edition", "year")
  )

code	edition	year
a10	1	2022
b10	2	2011
e15	1	2015

Show the code

df3 |> 
  separate_wider_delim(
    x,
    delim = ".",
    names = c("code", NA, "year")
  )

code	year
a10	2022
b10	2011
e15	2015

Show the code


df4 <- tibble(x = c("202215TX", "202122LA", "202325CA")) 
df4 |> 
  separate_wider_position(
    x,
    widths = c(year = 4, age = 2, state = 2)
  )

year	age	state
2022	15	TX
2021	22	LA
2023	25	CA

Show the code


df5 <- tribble(
    ~ str,
    "<Sheryl>-F_34",
    "<Kisha>-F_45",
    "<Brandon>-N_33",
    "<Sharon>-F_38",
    "<Penny>-F_58",
    "<Justin>-M_41",
    "<Patricia>-F_84",
)
df5 |>
    separate_wider_regex(str,
                         patterns = c(
                             "<",
                             name = "[A-Za-z]+",
                             #匹配 ≥1 个字母
                             ">-",
                             gender = ".",
                             #匹配除\n以外任意字符
                             "_",
                             age = "[0-9]+"      #匹配 ≥1 个数字
                         ))

name	gender	age
Sheryl	F	34
Kisha	F	45
Brandon	N	33
Sharon	F	38
Penny	F	58
Justin	M	41
Patricia	F	84

9.6 重编码

Show the code

students <- read_csv("data/students.csv", na = c("N/A", ""))
students

Student ID	Full Name	favourite.food	mealPlan	AGE
1	Sunil Huffmann	Strawberry yoghurt	Lunch only	4
2	Barclay Lynn	French fries	Lunch only	5
3	Jayendra Lyne	NA	Breakfast and lunch	7
4	Leon Rossini	Anchovies	Lunch only	NA
5	Chidiegwu Dunkel	Pizza	Breakfast and lunch	five
6	G??ven? Attila	Ice cream	Lunch only	6

9.6.0.1 变量名

Show the code

if(!require(janitor)) install.packages("janitor")

# snake_case
students |>
    janitor::clean_names(case="snake") #"title"  "lower_camel" "upper_camel"

student_id	full_name	favourite_food	meal_plan	age
1	Sunil Huffmann	Strawberry yoghurt	Lunch only	4
2	Barclay Lynn	French fries	Lunch only	5
3	Jayendra Lyne	NA	Breakfast and lunch	7
4	Leon Rossini	Anchovies	Lunch only	NA
5	Chidiegwu Dunkel	Pizza	Breakfast and lunch	five
6	G??ven? Attila	Ice cream	Lunch only	6

9.1 行操作dplyr

9.1.1 filter()

9.1.2 arrange()

9.1.3 distinct()

9.1.4 slice()

9.2 列操作 dplyr

9.2.1 glimpse()

9.2.2 pull()

9.2.3 mutate()

9.2.4 rename()

9.2.5 relocate()

9.2.6 select()

9.2.7 across(.cols, .fns, ...)

9.3 按组操作

9.3.1 group_by()

9.3.2 .by =

9.3.3 计数

9.3.4 rowwise()

9.3.5 reframe()

9.4 按数据框操作

9.4.1 绑定操作

9.4.1.1 bing_cols()

9.4.1.2 bind_rows()

9.4.2 集合操作

9.4.3 连接操作

9.4.3.1 mutate-joins

9.4.3.2 filter-joins

9.4.3.3 cross-join

9.4.3.4 group_nest()

9.4.3.5 group_split()

9.5 整洁操作

9.5.1 pivot_longer()

9.5.1.1 列名包含一个变量名

9.5.1.2 列名包含多个变量名

9.5.1.3 列名包含变量名和变量值

9.5.2 pivot_wider()

9.5.3 tidyr::separate_longer_*()

9.5.4 tidyr::separate_wider_*()

9.6 重编码

9.6.0.1 变量名

9.6.0.2 变量值

缺失值

9.1.1 `filter()`

9.1.2 `arrange()`

9.1.3 `distinct()`

9.1.4 `slice()`

9.2.1 `glimpse()`

9.2.2 `pull()`

9.2.3 `mutate()`

9.2.4 `rename()`

9.2.5 `relocate()`

9.2.6 `select()`

9.2.7 `across(.cols, .fns, ...)`

9.3.1 `group_by()`

9.3.2 `.by =`

9.3.4 `rowwise()`

9.3.5 `reframe()`

9.4.1.1 `bing_cols()`

9.4.1.2 `bind_rows()`

9.4.3.4 `group_nest()`

9.4.3.5 `group_split()`

9.5.1 `pivot_longer()`

9.5.2 `pivot_wider()`

9.5.3 `tidyr::separate_longer_*()`

9.5.4 `tidyr::separate_wider_*()`