9.1 行操作dplyr

https://dplyr.tidyverse.org/index.html “分割-应用-组合”(Split-Apply-Combine)

9.1.1 filter()

Show the code
dplyr::filter(mpg,model=="a4")
manufacturer model displ year cyl trans drv cty hwy fl class
audi a4 1.8 1999 4 auto(l5) f 18 29 p compact
audi a4 1.8 1999 4 manual(m5) f 21 29 p compact
audi a4 2.0 2008 4 manual(m6) f 20 31 p compact
audi a4 2.0 2008 4 auto(av) f 21 30 p compact
audi a4 2.8 1999 6 auto(l5) f 16 26 p compact
audi a4 2.8 1999 6 manual(m5) f 18 26 p compact
audi a4 3.1 2008 6 auto(av) f 18 27 p compact
Show the code



dplyr::filter(mpg,drv %in% c("r","f")) |> DT::datatable()

9.1.2 arrange()

Show the code
arrange(mpg, year,displ,cty)|> DT::datatable()
Show the code

arrange(mpg,desc(displ))|> DT::datatable()

9.1.3 distinct()

Show the code
distinct(mpg,manufacturer)
manufacturer
audi
chevrolet
dodge
ford
honda
hyundai
jeep
land rover
lincoln
mercury
nissan
pontiac
subaru
toyota
volkswagen
Show the code
count(mpg,manufacturer,sort = TRUE)
manufacturer n
dodge 37
toyota 34
volkswagen 27
ford 25
chevrolet 19
audi 18
hyundai 14
subaru 14
nissan 13
honda 9
jeep 8
pontiac 5
land rover 4
mercury 4
lincoln 3
Show the code

mpg |> 
    distinct(manufacturer,cyl) |> 
    count(cyl)
cyl n
4 9
5 1
6 11
8 11
Show the code

# 将在数据集中找到唯一行的第一个出现并丢弃其余行
distinct(mpg,manufacturer,.keep_all = TRUE)
manufacturer model displ year cyl trans drv cty hwy fl class
audi a4 1.8 1999 4 auto(l5) f 18 29 p compact
chevrolet c1500 suburban 2wd 5.3 2008 8 auto(l4) r 14 20 r suv
dodge caravan 2wd 2.4 1999 4 auto(l3) f 18 24 r minivan
ford expedition 2wd 4.6 1999 8 auto(l4) r 11 17 r suv
honda civic 1.6 1999 4 manual(m5) f 28 33 r subcompact
hyundai sonata 2.4 1999 4 auto(l4) f 18 26 r midsize
jeep grand cherokee 4wd 3.0 2008 6 auto(l5) 4 17 22 d suv
land rover range rover 4.0 1999 8 auto(l4) 4 11 15 p suv
lincoln navigator 2wd 5.4 1999 8 auto(l4) r 11 17 r suv
mercury mountaineer 4wd 4.0 1999 6 auto(l5) 4 14 17 r suv
nissan altima 2.4 1999 4 manual(m5) f 21 29 r compact
pontiac grand prix 3.1 1999 6 auto(l4) f 18 26 r midsize
subaru forester awd 2.5 1999 4 manual(m5) 4 18 25 r suv
toyota 4runner 4wd 2.7 1999 4 manual(m5) 4 15 20 r suv
volkswagen gti 2.0 1999 4 manual(m5) f 21 29 r compact

9.1.4 slice()

Show the code
slice_head(mpg , n=6)
manufacturer model displ year cyl trans drv cty hwy fl class
audi a4 1.8 1999 4 auto(l5) f 18 29 p compact
audi a4 1.8 1999 4 manual(m5) f 21 29 p compact
audi a4 2.0 2008 4 manual(m6) f 20 31 p compact
audi a4 2.0 2008 4 auto(av) f 21 30 p compact
audi a4 2.8 1999 6 auto(l5) f 16 26 p compact
audi a4 2.8 1999 6 manual(m5) f 18 26 p compact
Show the code
slice_max(mpg, order_by = displ ,n=6)
manufacturer model displ year cyl trans drv cty hwy fl class
chevrolet corvette 7.0 2008 8 manual(m6) r 15 24 p 2seater
chevrolet k1500 tahoe 4wd 6.5 1999 8 auto(l4) 4 14 17 d suv
chevrolet corvette 6.2 2008 8 manual(m6) r 16 26 p 2seater
chevrolet corvette 6.2 2008 8 auto(s6) r 15 25 p 2seater
jeep grand cherokee 4wd 6.1 2008 8 auto(l5) 4 11 14 p suv
chevrolet c1500 suburban 2wd 6.0 2008 8 auto(l4) r 12 17 r suv
Show the code
slice_min(mpg,order_by = displ,prop = 0.01)
manufacturer model displ year cyl trans drv cty hwy fl class
honda civic 1.6 1999 4 manual(m5) f 28 33 r subcompact
honda civic 1.6 1999 4 auto(l4) f 24 32 r subcompact
honda civic 1.6 1999 4 manual(m5) f 25 32 r subcompact
honda civic 1.6 1999 4 manual(m5) f 23 29 p subcompact
honda civic 1.6 1999 4 auto(l4) f 24 32 r subcompact
Show the code

slice_sample(mpg,n=6,weight_by =hwy ,replace = T)
manufacturer model displ year cyl trans drv cty hwy fl class
ford expedition 2wd 5.4 1999 8 auto(l4) r 11 17 r suv
volkswagen new beetle 1.9 1999 4 manual(m5) f 35 44 d subcompact
dodge ram 1500 pickup 4wd 4.7 2008 8 manual(m6) 4 9 12 e pickup
jeep grand cherokee 4wd 4.7 2008 8 auto(l5) 4 14 19 r suv
chevrolet corvette 7.0 2008 8 manual(m6) r 15 24 p 2seater
subaru forester awd 2.5 2008 4 auto(l4) 4 18 23 p suv

9.2 列操作 dplyr

9.2.1 glimpse()

Show the code
glimpse(mpg) %>% select(1:3) %>% head()
#> Rows: 234
#> Columns: 11
#> $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
#> $ model        <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
#> $ displ        <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.…
#> $ year         <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 200…
#> $ cyl          <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, …
#> $ trans        <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto…
#> $ drv          <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4", "4…
#> $ cty          <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, 1…
#> $ hwy          <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25, 2…
#> $ fl           <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p…
#> $ class        <chr> "compact", "compact", "compact", "compact", "compact", "c…
manufacturer model displ
audi a4 1.8
audi a4 1.8
audi a4 2.0
audi a4 2.0
audi a4 2.8
audi a4 2.8

9.2.2 pull()

提取单个列

Show the code
mpg %>% 
    pull(var = cyl)
#>   [1] 4 4 4 4 6 6 6 4 4 4 4 6 6 6 6 6 6 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 4 4 6 6 6
#>  [38] 4 6 6 6 6 6 6 6 6 6 6 6 6 6 6 8 8 8 8 8 6 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
#>  [75] 8 8 8 6 6 6 6 8 8 6 6 8 8 8 8 8 6 6 6 6 8 8 8 8 8 4 4 4 4 4 4 4 4 4 4 4 4
#> [112] 4 6 6 6 4 4 4 4 6 6 6 6 6 6 8 8 8 8 8 8 8 8 8 8 8 8 6 6 8 8 4 4 4 4 6 6 6
#> [149] 6 6 6 6 6 8 6 6 6 6 8 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 6 6 6 8 4 4 4 4 6 6
#> [186] 6 4 4 4 4 6 6 6 4 4 4 4 4 8 8 4 4 4 6 6 6 6 4 4 4 4 6 4 4 4 4 4 5 5 6 6 4
#> [223] 4 4 4 5 5 4 4 4 4 6 6 6

9.2.3 mutate()

Show the code
mutate(mpg,
       id=nrow(mpg),
       .before = 1)|> DT::datatable()
Show the code

mutate(mpg,
       id=nrow(mpg),
       .after = 1)|> DT::datatable()
Show the code

mutate(mpg,
       id=nrow(mpg),
       .keep = "used")|> DT::datatable()

9.2.4 rename()

Show the code
rename(mpg,生产厂商=manufacturer) |> head()
生产厂商 model displ year cyl trans drv cty hwy fl class
audi a4 1.8 1999 4 auto(l5) f 18 29 p compact
audi a4 1.8 1999 4 manual(m5) f 21 29 p compact
audi a4 2.0 2008 4 manual(m6) f 20 31 p compact
audi a4 2.0 2008 4 auto(av) f 21 30 p compact
audi a4 2.8 1999 6 auto(l5) f 16 26 p compact
audi a4 2.8 1999 6 manual(m5) f 18 26 p compact

9.2.5 relocate()

Show the code
relocate(mpg , model:year ,.before = 1) |> head()
model displ year manufacturer cyl trans drv cty hwy fl class
a4 1.8 1999 audi 4 auto(l5) f 18 29 p compact
a4 1.8 1999 audi 4 manual(m5) f 21 29 p compact
a4 2.0 2008 audi 4 manual(m6) f 20 31 p compact
a4 2.0 2008 audi 4 auto(av) f 21 30 p compact
a4 2.8 1999 audi 6 auto(l5) f 16 26 p compact
a4 2.8 1999 audi 6 manual(m5) f 18 26 p compact
Show the code
relocate(mpg , model:year ,.after = cyl) |> head()
manufacturer cyl model displ year trans drv cty hwy fl class
audi 4 a4 1.8 1999 auto(l5) f 18 29 p compact
audi 4 a4 1.8 1999 manual(m5) f 21 29 p compact
audi 4 a4 2.0 2008 manual(m6) f 20 31 p compact
audi 4 a4 2.0 2008 auto(av) f 21 30 p compact
audi 6 a4 2.8 1999 auto(l5) f 16 26 p compact
audi 6 a4 2.8 1999 manual(m5) f 18 26 p compact

9.2.6 select()

  • :用于选择一系列连续变量。

  • !用于获取一组变量的补集。

  • &以及|用于选择交集 或并集。

  • c()用于组合选择。

选择帮助程序( selection helpers )选择特定的列:

或者从存储在字符向量中的变量:

  • all_of():匹配字符向量中的变量名称。所有 名称必须存在,否则抛出越界错误(out-of-bounds error)。

  • any_of():与all_of()相同,只是对于不存在的名称不会抛出任何错误。

或者使用谓词( predicate )函数:

  • where(fn):将该函数应用于所有变量并选择该函数返回TRUE的变量。
Show the code
dplyr::select(mpg,model:year)|> DT::datatable()
Show the code
select(mpg,c(3,4,5))|> DT::datatable()
Show the code
select(mpg,where(is.numeric))|> DT::datatable()
Show the code
select(mpg,where(~is.numeric(.x) && mean(.x,na.rm=TRUE)<50))|> DT::datatable()

9.2.7 across(.cols, .fns, ...)

Show the code
iris <- as_tibble(iris)
iris %>%
  mutate(across(where(is.double) & !c(Petal.Length, Petal.Width), round)) %>% 
    head(6)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
5 4 1.4 0.2 setosa
5 3 1.4 0.2 setosa
5 3 1.3 0.2 setosa
5 3 1.5 0.2 setosa
5 4 1.4 0.2 setosa
5 4 1.7 0.4 setosa
Show the code

cols <- c("Sepal.Length", "Petal.Width")
iris %>%
  mutate(across(all_of(cols), round))
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
5 3.5 1.4 0 setosa
5 3.0 1.4 0 setosa
5 3.2 1.3 0 setosa
5 3.1 1.5 0 setosa
5 3.6 1.4 0 setosa
5 3.9 1.7 0 setosa
5 3.4 1.4 0 setosa
5 3.4 1.5 0 setosa
4 2.9 1.4 0 setosa
5 3.1 1.5 0 setosa
5 3.7 1.5 0 setosa
5 3.4 1.6 0 setosa
5 3.0 1.4 0 setosa
4 3.0 1.1 0 setosa
6 4.0 1.2 0 setosa
6 4.4 1.5 0 setosa
5 3.9 1.3 0 setosa
5 3.5 1.4 0 setosa
6 3.8 1.7 0 setosa
5 3.8 1.5 0 setosa
5 3.4 1.7 0 setosa
5 3.7 1.5 0 setosa
5 3.6 1.0 0 setosa
5 3.3 1.7 0 setosa
5 3.4 1.9 0 setosa
5 3.0 1.6 0 setosa
5 3.4 1.6 0 setosa
5 3.5 1.5 0 setosa
5 3.4 1.4 0 setosa
5 3.2 1.6 0 setosa
5 3.1 1.6 0 setosa
5 3.4 1.5 0 setosa
5 4.1 1.5 0 setosa
6 4.2 1.4 0 setosa
5 3.1 1.5 0 setosa
5 3.2 1.2 0 setosa
6 3.5 1.3 0 setosa
5 3.6 1.4 0 setosa
4 3.0 1.3 0 setosa
5 3.4 1.5 0 setosa
5 3.5 1.3 0 setosa
4 2.3 1.3 0 setosa
4 3.2 1.3 0 setosa
5 3.5 1.6 1 setosa
5 3.8 1.9 0 setosa
5 3.0 1.4 0 setosa
5 3.8 1.6 0 setosa
5 3.2 1.4 0 setosa
5 3.7 1.5 0 setosa
5 3.3 1.4 0 setosa
7 3.2 4.7 1 versicolor
6 3.2 4.5 2 versicolor
7 3.1 4.9 2 versicolor
6 2.3 4.0 1 versicolor
6 2.8 4.6 2 versicolor
6 2.8 4.5 1 versicolor
6 3.3 4.7 2 versicolor
5 2.4 3.3 1 versicolor
7 2.9 4.6 1 versicolor
5 2.7 3.9 1 versicolor
5 2.0 3.5 1 versicolor
6 3.0 4.2 2 versicolor
6 2.2 4.0 1 versicolor
6 2.9 4.7 1 versicolor
6 2.9 3.6 1 versicolor
7 3.1 4.4 1 versicolor
6 3.0 4.5 2 versicolor
6 2.7 4.1 1 versicolor
6 2.2 4.5 2 versicolor
6 2.5 3.9 1 versicolor
6 3.2 4.8 2 versicolor
6 2.8 4.0 1 versicolor
6 2.5 4.9 2 versicolor
6 2.8 4.7 1 versicolor
6 2.9 4.3 1 versicolor
7 3.0 4.4 1 versicolor
7 2.8 4.8 1 versicolor
7 3.0 5.0 2 versicolor
6 2.9 4.5 2 versicolor
6 2.6 3.5 1 versicolor
6 2.4 3.8 1 versicolor
6 2.4 3.7 1 versicolor
6 2.7 3.9 1 versicolor
6 2.7 5.1 2 versicolor
5 3.0 4.5 2 versicolor
6 3.4 4.5 2 versicolor
7 3.1 4.7 2 versicolor
6 2.3 4.4 1 versicolor
6 3.0 4.1 1 versicolor
6 2.5 4.0 1 versicolor
6 2.6 4.4 1 versicolor
6 3.0 4.6 1 versicolor
6 2.6 4.0 1 versicolor
5 2.3 3.3 1 versicolor
6 2.7 4.2 1 versicolor
6 3.0 4.2 1 versicolor
6 2.9 4.2 1 versicolor
6 2.9 4.3 1 versicolor
5 2.5 3.0 1 versicolor
6 2.8 4.1 1 versicolor
6 3.3 6.0 2 virginica
6 2.7 5.1 2 virginica
7 3.0 5.9 2 virginica
6 2.9 5.6 2 virginica
6 3.0 5.8 2 virginica
8 3.0 6.6 2 virginica
5 2.5 4.5 2 virginica
7 2.9 6.3 2 virginica
7 2.5 5.8 2 virginica
7 3.6 6.1 2 virginica
6 3.2 5.1 2 virginica
6 2.7 5.3 2 virginica
7 3.0 5.5 2 virginica
6 2.5 5.0 2 virginica
6 2.8 5.1 2 virginica
6 3.2 5.3 2 virginica
6 3.0 5.5 2 virginica
8 3.8 6.7 2 virginica
8 2.6 6.9 2 virginica
6 2.2 5.0 2 virginica
7 3.2 5.7 2 virginica
6 2.8 4.9 2 virginica
8 2.8 6.7 2 virginica
6 2.7 4.9 2 virginica
7 3.3 5.7 2 virginica
7 3.2 6.0 2 virginica
6 2.8 4.8 2 virginica
6 3.0 4.9 2 virginica
6 2.8 5.6 2 virginica
7 3.0 5.8 2 virginica
7 2.8 6.1 2 virginica
8 3.8 6.4 2 virginica
6 2.8 5.6 2 virginica
6 2.8 5.1 2 virginica
6 2.6 5.6 1 virginica
8 3.0 6.1 2 virginica
6 3.4 5.6 2 virginica
6 3.1 5.5 2 virginica
6 3.0 4.8 2 virginica
7 3.1 5.4 2 virginica
7 3.1 5.6 2 virginica
7 3.1 5.1 2 virginica
6 2.7 5.1 2 virginica
7 3.2 5.9 2 virginica
7 3.3 5.7 2 virginica
7 3.0 5.2 2 virginica
6 2.5 5.0 2 virginica
6 3.0 5.2 2 virginica
6 3.4 5.4 2 virginica
6 3.0 5.1 2 virginica
Show the code


iris %>%
  group_by(Species) %>%
  summarise(across(starts_with("Sepal"), ~ mean(.x, na.rm = TRUE),.names = "mean_{.col}"))
Species mean_Sepal.Length mean_Sepal.Width
setosa 5.006 3.428
versicolor 5.936 2.770
virginica 6.588 2.974
Show the code


# 命名列表
iris %>%
  group_by(Species) %>%
  summarise(across(starts_with("Sepal"), list(mean = mean, sd = sd), .names = "{.col}_{.fn}"))
Species Sepal.Length_mean Sepal.Length_sd Sepal.Width_mean Sepal.Width_sd
setosa 5.006 0.3524897 3.428 0.3790644
versicolor 5.936 0.5161711 2.770 0.3137983
virginica 6.588 0.6358796 2.974 0.3224966
Show the code

iris %>%
  group_by(Species) %>%
  summarise(across(starts_with("Sepal"), list(mean, sd), .names = "{.col}_function{.fn}"))
Species Sepal.Length_function1 Sepal.Length_function2 Sepal.Width_function1 Sepal.Width_function2
setosa 5.006 0.3524897 3.428 0.3790644
versicolor 5.936 0.5161711 2.770 0.3137983
virginica 6.588 0.6358796 2.974 0.3224966
Show the code
iris %>%
  dplyr::filter(if_any(ends_with("Width"), ~ . > 4))
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
5.7 4.4 1.5 0.4 setosa
5.2 4.1 1.5 0.1 setosa
5.5 4.2 1.4 0.2 setosa
Show the code
iris %>%
  dplyr::filter(if_all(ends_with("Width"), ~ . > 2))
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
6.3 3.3 6.0 2.5 virginica
7.1 3.0 5.9 2.1 virginica
6.5 3.0 5.8 2.2 virginica
7.6 3.0 6.6 2.1 virginica
7.2 3.6 6.1 2.5 virginica
6.8 3.0 5.5 2.1 virginica
5.8 2.8 5.1 2.4 virginica
6.4 3.2 5.3 2.3 virginica
7.7 3.8 6.7 2.2 virginica
7.7 2.6 6.9 2.3 virginica
6.9 3.2 5.7 2.3 virginica
6.7 3.3 5.7 2.1 virginica
6.4 2.8 5.6 2.1 virginica
6.4 2.8 5.6 2.2 virginica
7.7 3.0 6.1 2.3 virginica
6.3 3.4 5.6 2.4 virginica
6.9 3.1 5.4 2.1 virginica
6.7 3.1 5.6 2.4 virginica
6.9 3.1 5.1 2.3 virginica
6.8 3.2 5.9 2.3 virginica
6.7 3.3 5.7 2.5 virginica
6.7 3.0 5.2 2.3 virginica
6.2 3.4 5.4 2.3 virginica

9.3 按组操作

9.3.1 group_by()

Show the code
mpg |> 
    group_by(cyl,drv) |> 
    summarise(n=n(),
              .groups = "drop_last") #删除最后一个分组条件
cyl drv n
4 4 23
4 f 58
5 f 4
6 4 32
6 f 43
6 r 4
8 4 48
8 f 1
8 r 21
Show the code
# drop 删除所有分组
# keep 保留所有分组

9.3.2 .by =

Show the code
mpg %>% 
    summarise(n=n(),
              .by = c(cyl,drv))
cyl drv n
4 f 58
6 f 43
4 4 23
6 4 32
8 4 48
8 r 21
6 r 4
8 f 1
5 f 4

9.3.3 计数

Show the code
starwars %>% count(sex, gender, sort = TRUE)
sex gender n
male masculine 60
female feminine 16
none masculine 5
NA NA 4
hermaphroditic masculine 1
none feminine 1
Show the code

starwars %>% group_by(sex, gender) %>% 
    summarise(n=n(),.groups = "drop") %>% 
    arrange(desc(n))
sex gender n
male masculine 60
female feminine 16
none masculine 5
NA NA 4
hermaphroditic masculine 1
none feminine 1

9.3.4 rowwise()

Show the code
options(digits = 3)
set.seed(123)
df <- tibble(x = runif(6), y = runif(6), z = runif(6))
df
x y z
0.288 0.528 0.678
0.788 0.892 0.573
0.409 0.551 0.103
0.883 0.457 0.900
0.940 0.957 0.246
0.046 0.453 0.042
Show the code

df %>% mutate(m = pmin(x, y, z))
x y z m
0.288 0.528 0.678 0.288
0.788 0.892 0.573 0.573
0.409 0.551 0.103 0.103
0.883 0.457 0.900 0.457
0.940 0.957 0.246 0.246
0.046 0.453 0.042 0.042

9.3.5 reframe()

Show the code
df <- tibble(
  g = c(1, 1, 1, 2, 2, 2, 2),
  x = c(1:7)
)
df %>%
  reframe(x = mean(x), .by = g)
g x
1 2.0
2 5.5

9.4 按数据框操作

9.4.1 绑定操作

9.4.1.1 bing_cols()

Show the code
df1 <- tibble(x = 1:3)
df2 <- tibble(y = 3:1)
bind_cols(df1, df2)
x y
1 3
2 2
3 1

9.4.1.2 bind_rows()

Show the code
df1 <- tibble(x = 1:2, y = letters[1:2])
df2 <- tibble(x = 4:5, z = 1:2)

bind_rows(df1, df2)
x y z
1 a NA
2 b NA
4 NA 1
5 NA 2

9.4.2 集合操作

集合运算 要求变量名(列)完全相同,把观测(行)看成是集合中的元素

Show the code
x<-tibble(ID=c(1,2),X=c("a1",'a2'))
y<-tibble(ID=c(2,3),X=c("a2",'a3'))
x;y
ID X
1 a1
2 a2
ID X
2 a2
3 a3
Show the code
dplyr::intersect(x,y) 
ID X
2 a2
Show the code
union(x,y)
ID X
1 a1
2 a2
3 a3
Show the code
union_all(x,y)
ID X
1 a1
2 a2
2 a2
3 a3
Show the code

dplyr::setdiff(x,y) #返回在x中不在y中
ID X
1 a1
Show the code
symdiff(x,y)  # 在x不在y中的行和在y中不在x中的行
ID X
1 a1
3 a3
Show the code
setequal(x,y)#判断是否相等
#> [1] FALSE

9.4.3 连接操作

Show the code
x <- tribble(
  ~ID, ~X,
  1, "x1",
  2, "x2",
  3, "x3"
)
y <- tribble(
  ~id, ~Y,
  1, "y1",
  2, "y2",
  4, "y4"
)
x;y
ID X
1 x1
2 x2
3 x3
id Y
1 y1
2 y2
4 y4

9.4.3.1 mutate-joins

Show the code
# 左连接
left_join(x, y, by = join_by(ID == id))
ID X Y
1 x1 y1
2 x2 y2
3 x3 NA
Show the code

# 右连接
right_join(x, y, by = join_by(ID == id))
ID X Y
1 x1 y1
2 x2 y2
4 NA y4
Show the code


# 全连接
full_join(x, y, by = join_by(ID == id))
ID X Y
1 x1 y1
2 x2 y2
3 x3 NA
4 NA y4
Show the code

# 内连接
inner_join(x, y, by = join_by(ID == id))
ID X Y
1 x1 y1
2 x2 y2
Show the code
inner_join(x,y, by = join_by(ID >=id), keep = TRUE)
ID X id Y
1 x1 1 y1
2 x2 1 y1
2 x2 2 y2
3 x3 1 y1
3 x3 2 y2
Show the code
#滚动联接  rolling join 类似于不等式连接,但仅匹配最近一个值。
inner_join(x,y, join_by(closest(ID >=id)))
ID X id Y
1 x1 1 y1
2 x2 2 y2
3 x3 2 y2

9.4.3.2 filter-joins

对第一个数据框进行 筛选

Show the code
# 半连接
semi_join(x, y, by = join_by(ID == id))
ID X
1 x1
2 x2
Show the code

# 反连接
anti_join(x, y, by = join_by(ID == id))
ID X
3 x3

9.4.3.3 cross-join

Show the code
#交叉连接
cross_join(x,y)         #nrow(x) * nrow(y)
ID X id Y
1 x1 1 y1
1 x1 2 y2
1 x1 4 y4
2 x2 1 y1
2 x2 2 y2
2 x2 4 y4
3 x3 1 y1
3 x3 2 y2
3 x3 4 y4
Show the code


nest_join(x,y,by = join_by(ID==id))->nested

nested[[3]]
#> [[1]]
#> # A tibble: 1 × 1
#>   Y    
#>   <chr>
#> 1 y1   
#> 
#> [[2]]
#> # A tibble: 1 × 1
#>   Y    
#>   <chr>
#> 1 y2   
#> 
#> [[3]]
#> # A tibble: 0 × 1
#> # ℹ 1 variable: Y <chr>

9.4.3.4 group_nest()

Show the code
library(tidyverse)
library(broom)
data(crickets, package = "modeldata")
names(crickets)
#> [1] "species" "temp"    "rate"

split_by_species <- crickets %>% 
  group_nest(species) 
split_by_species
species data
O. exclamationis 20.8, 20.8, 24.0, 24.0, 24.0, 24.0, 26.2, 26.2, 26.2, 26.2, 28.4, 29.0, 30.4, 30.4, 67.9, 65.1, 77.3, 78.7, 79.4, 80.4, 85.8, 86.6, 87.5, 89.1, 98.6, 100.8, 99.3, 101.7
O. niveus 17.2, 18.3, 18.3, 18.3, 18.9, 18.9, 20.4, 21.0, 21.0, 22.1, 23.5, 24.2, 25.9, 26.5, 26.5, 26.5, 28.6, 44.3, 47.2, 47.6, 49.6, 50.3, 51.8, 60.0, 58.5, 58.9, 60.7, 69.8, 70.9, 76.2, 76.1, 77.0, 77.7, 84.7
Show the code

model_by_species <- 
  split_by_species %>% 
  mutate(model = map(data, ~ lm(rate ~ temp, data = .x)))
model_by_species
species data model
O. exclamationis 20.8, 20.8, 24.0, 24.0, 24.0, 24.0, 26.2, 26.2, 26.2, 26.2, 28.4, 29.0, 30.4, 30.4, 67.9, 65.1, 77.3, 78.7, 79.4, 80.4, 85.8, 86.6, 87.5, 89.1, 98.6, 100.8, 99.3, 101.7 c((Inte.... | |O. niveus |17.2, 18.3, 18.3, 18.3, 18.9, 18.9, 20.4, 21.0, 21.0, 22.1, 23.5, 24.2, 25.9, 26.5, 26.5, 26.5, 28.6, 44.3, 47.2, 47.6, 49.6, 50.3, 51.8, 60.0, 58.5, 58.9, 60.7, 69.8, 70.9, 76.2, 76.1, 77.0, 77.7, 84.7 |c((Inte….
Show the code

model_by_species %>% 
  mutate(coef = map(model, tidy)) %>% 
  select(species, coef) %>% 
  unnest(cols = c(coef))
species term estimate std.error statistic p.value
O. exclamationis (Intercept) -11.04 4.765 -2.32 0.039
O. exclamationis temp 3.75 0.184 20.41 0.000
O. niveus (Intercept) -15.39 2.347 -6.56 0.000
O. niveus temp 3.52 0.105 33.59 0.000

9.4.3.5 group_split()

9.5 整洁操作

9.5.1 pivot_longer()

9.5.1.1 列名包含一个变量名

Show the code
billboard|> DT::datatable()
Show the code
billboard_longer <- billboard |> 
  pivot_longer(
    cols = starts_with("wk"), 
    names_to = "week", 
    values_to = "排名" ,
    values_drop_na = TRUE,
  ) |> 
    mutate(
        week=parse_number(week)
    )
billboard_longer|> DT::datatable()
Show the code
billboard_longer |> 
  ggplot(aes(x = week, y = 排名, group = track)) + 
  geom_line(alpha = 0.25) + 
  scale_y_reverse()

歌曲排名随时间变化的折线图

9.5.1.2 列名包含多个变量名

Show the code
who2 |> DT::datatable()
Show the code
who2 |> 
  pivot_longer(
    cols = !(country:year),
    names_to = c("diagnosis", "gender", "age"), 
    names_sep = "_",
    values_to = "count",
    values_drop_na = T
  ) |> head()
country year diagnosis gender age count
Afghanistan 1997 sp m 014 0
Afghanistan 1997 sp m 1524 10
Afghanistan 1997 sp m 2534 6
Afghanistan 1997 sp m 3544 3
Afghanistan 1997 sp m 4554 5
Afghanistan 1997 sp m 5564 2

9.5.1.3 列名包含变量名和变量值

Show the code
household
family dob_child1 dob_child2 name_child1 name_child2
1 1998-11-26 2000-01-29 Susan Jose
2 1996-06-22 NA Mark NA
3 2002-07-11 2004-04-05 Sam Seth
4 2004-10-10 2009-08-27 Craig Khai
5 2000-12-05 2005-02-28 Parker Gracie
Show the code
household |> 
  pivot_longer(
    cols = !family, 
    names_to = c(".value", "child"), # 使用透视列名称的第一个组件作为变量名称
    names_sep = "_", 
    values_drop_na = TRUE
  )
family child dob name
1 child1 1998-11-26 Susan
1 child2 2000-01-29 Jose
2 child1 1996-06-22 Mark
3 child1 2002-07-11 Sam
3 child2 2004-04-05 Seth
4 child1 2004-10-10 Craig
4 child2 2009-08-27 Khai
5 child1 2000-12-05 Parker
5 child2 2005-02-28 Gracie
Show the code
df <- tibble(ID=c("A","B","C"),
       GDP_2019=c(114,251,152),
       GDP_2018=c(215,115,141),
       GDP_2017=c(141,244,243)) 
df
ID GDP_2019 GDP_2018 GDP_2017
A 114 215 141
B 251 115 244
C 152 141 243
Show the code

df %>% 
  pivot_longer(
    cols = -ID,
    names_to =c(".value","year"),
    names_sep = "_") %>% 
    arrange(year)
ID year GDP
A 2017 141
B 2017 244
C 2017 243
A 2018 215
B 2018 115
C 2018 141
A 2019 114
B 2019 251
C 2019 152
Show the code

df <- tribble(
  ~class1_name,~class1_major,~class2_name,~class2_major,~class3_name,~class3_major,
  '张非',"math",'李斯','English','王武','statistic',
  '钟爱','English','陈述','math','孙健','medicine'
) 
df
class1_name class1_major class2_name class2_major class3_name class3_major
张非 math 李斯 English 王武 statistic
钟爱 English 陈述 math 孙健 medicine
Show the code
df%>% 
  pivot_longer(
    col=starts_with("class"),
    names_to = c("group",".value"),
    names_sep = "_",
  )
group name major
class1 张非 math
class2 李斯 English
class3 王武 statistic
class1 钟爱 English
class2 陈述 math
class3 孙健 medicine

9.5.2 pivot_wider()

Show the code
cms_patient_experience|> DT::datatable()
Show the code
cms_patient_experience |> 
  distinct(measure_cd, measure_title)
measure_cd measure_title
CAHPS_GRP_1 CAHPS for MIPS SSM: Getting Timely Care, Appointments, and Information
CAHPS_GRP_2 CAHPS for MIPS SSM: How Well Providers Communicate
CAHPS_GRP_3 CAHPS for MIPS SSM: Patient’s Rating of Provider
CAHPS_GRP_5 CAHPS for MIPS SSM: Health Promotion and Education
CAHPS_GRP_8 CAHPS for MIPS SSM: Courteous and Helpful Office Staff
CAHPS_GRP_12 CAHPS for MIPS SSM: Stewardship of Patient Resources
Show the code
cms_patient_experience |>
    pivot_wider(
        id_cols = starts_with("org"),# 唯一标识列
        names_from = measure_cd,
        values_from = prf_rate
    ) |> DT::datatable()

9.5.3 tidyr::separate_longer_*()

Show the code
# df |> separate_longer_delim(cols, delim)
# df |> separate_longer_position(cols, width)
# df |> separate_wider_delim(cols, delim, names)
# df |> separate_wider_position(cols, widths)
# df |> separate_wider_regex(cols,patterns)
Show the code
df1 <- tibble(x = c("a,b,c", "d,e", "f"))
df1 |> 
  separate_longer_delim(x, delim = ",")
x
a
b
c
d
e
f
Show the code

df2 <- tibble(x = c("1211", "131", "21"))
df2 |> 
  separate_longer_position(x, width = 1)
x
1
2
1
1
1
3
1
2
1

9.5.4 tidyr::separate_wider_*()

Show the code
df3 <- tibble(x = c("a10.1.2022", "b10.2.2011", "e15.1.2015"))
df3
x
a10.1.2022
b10.2.2011
e15.1.2015
Show the code
df3 |> 
  separate_wider_delim(
    x,
    delim = ".",
    names = c("code", "edition", "year")
  )
code edition year
a10 1 2022
b10 2 2011
e15 1 2015
Show the code
df3 |> 
  separate_wider_delim(
    x,
    delim = ".",
    names = c("code", NA, "year")
  )
code year
a10 2022
b10 2011
e15 2015
Show the code

df4 <- tibble(x = c("202215TX", "202122LA", "202325CA")) 
df4 |> 
  separate_wider_position(
    x,
    widths = c(year = 4, age = 2, state = 2)
  )
year age state
2022 15 TX
2021 22 LA
2023 25 CA
Show the code

df5 <- tribble(
    ~ str,
    "<Sheryl>-F_34",
    "<Kisha>-F_45",
    "<Brandon>-N_33",
    "<Sharon>-F_38",
    "<Penny>-F_58",
    "<Justin>-M_41",
    "<Patricia>-F_84",
)
df5 |>
    separate_wider_regex(str,
                         patterns = c(
                             "<",
                             name = "[A-Za-z]+",
                             #匹配 ≥1 个字母
                             ">-",
                             gender = ".",
                             #匹配除\n以外任意字符
                             "_",
                             age = "[0-9]+"      #匹配 ≥1 个数字
                         ))
name gender age
Sheryl F 34
Kisha F 45
Brandon N 33
Sharon F 38
Penny F 58
Justin M 41
Patricia F 84

9.6 重编码

Show the code
students <- read_csv("data/students.csv", na = c("N/A", ""))
students
Student ID Full Name favourite.food mealPlan AGE
1 Sunil Huffmann Strawberry yoghurt Lunch only 4
2 Barclay Lynn French fries Lunch only 5
3 Jayendra Lyne NA Breakfast and lunch 7
4 Leon Rossini Anchovies Lunch only NA
5 Chidiegwu Dunkel Pizza Breakfast and lunch five
6 G??ven? Attila Ice cream Lunch only 6

9.6.0.1 变量名

Show the code
if(!require(janitor)) install.packages("janitor")

# snake_case
students |>
    janitor::clean_names(case="snake") #"title"  "lower_camel" "upper_camel"
student_id full_name favourite_food meal_plan age
1 Sunil Huffmann Strawberry yoghurt Lunch only 4
2 Barclay Lynn French fries Lunch only 5
3 Jayendra Lyne NA Breakfast and lunch 7
4 Leon Rossini Anchovies Lunch only NA
5 Chidiegwu Dunkel Pizza Breakfast and lunch five
6 G??ven? Attila Ice cream Lunch only 6

9.6.0.2 变量值

缺失值