# 数据清洗
```{r include=FALSE}
if(!require(DT)) install.packages("DT")
if(!require(janitor)) install.packages("janitor")
```
## 行操作**dplyr**
<https://dplyr.tidyverse.org/index.html> "分割-应用-组合"(Split-Apply-Combine)
### `filter()`
```{r}
dplyr::filter(mpg,model=="a4")
dplyr::filter(mpg,drv %in% c("r","f")) |> DT::datatable()
```
### `arrange()`
```{r}
arrange(mpg, year,displ,cty)|> DT::datatable()
arrange(mpg,desc(displ))|> DT::datatable()
```
### `distinct()`
```{r}
distinct(mpg,manufacturer)
count(mpg,manufacturer,sort = TRUE)
mpg |>
distinct(manufacturer,cyl) |>
count(cyl)
# 将在数据集中找到唯一行的第一个出现并丢弃其余行
distinct(mpg,manufacturer,.keep_all = TRUE)
```
### `slice()`
```{r}
slice_head(mpg , n=6)
slice_max(mpg, order_by = displ ,n=6)
slice_min(mpg,order_by = displ,prop = 0.01)
slice_sample(mpg,n=6,weight_by =hwy ,replace = T)
```
## 列操作 **dplyr**
### `glimpse()`
```{r}
glimpse(mpg) %>% select(1:3) %>% head()
```
### `pull()`
提取单个列
```{r}
mpg %>%
pull(var = cyl)
```
### `mutate()`
```{r}
mutate(mpg,
id=nrow(mpg),
.before = 1)|> DT::datatable()
mutate(mpg,
id=nrow(mpg),
.after = 1)|> DT::datatable()
mutate(mpg,
id=nrow(mpg),
.keep = "used")|> DT::datatable()
```
### `rename()`
```{r}
rename(mpg,生产厂商=manufacturer) |> head()
```
### `relocate()`
```{r}
relocate(mpg , model:year ,.before = 1) |> head()
relocate(mpg , model:year ,.after = cyl) |> head()
```
### `select()`
- `:`用于选择一系列连续变量。
- `!`用于获取一组变量的补集。
- `&`以及`|`用于选择交集 或并集。
- [`c()`](https://rdrr.io/r/base/c.html)用于组合选择。
选择帮助程序*( selection helpers )*选择特定的列:
- [`everything()`](https://tidyselect.r-lib.org/reference/everything.html):匹配所有变量。
- [`last_col()`](https://tidyselect.r-lib.org/reference/everything.html):选择最后一个变量,可能带有偏移量。
- [`group_cols()`](https://dplyr.tidyverse.org/reference/group_cols.html):选择所有分组列。
- [`starts_with()`](https://tidyselect.r-lib.org/reference/starts_with.html):前缀开头。
- [`ends_with()`](https://tidyselect.r-lib.org/reference/starts_with.html):后缀结尾。
- [`contains()`](https://tidyselect.r-lib.org/reference/starts_with.html):包含文本字符串。
- [`matches()`](https://tidyselect.r-lib.org/reference/starts_with.html):匹配正则表达式。
- [`num_range()`](https://tidyselect.r-lib.org/reference/starts_with.html):匹配 x01、x02、x03 等数值范围。
或者从存储在字符向量中的变量:
- [`all_of()`](https://tidyselect.r-lib.org/reference/all_of.html):匹配字符向量中的变量名称。所有 名称必须存在,否则抛出越界错误(out-of-bounds error)。
- [`any_of()`](https://tidyselect.r-lib.org/reference/all_of.html):与`all_of()`相同,只是对于不存在的名称不会抛出任何错误。
或者使用谓词( predicate )函数:
- [where(fn)](https://tidyselect.r-lib.org/reference/where.html):将该函数应用于所有变量并选择该函数返回`TRUE`的变量。
```{r}
dplyr::select(mpg,model:year)|> DT::datatable()
select(mpg,c(3,4,5))|> DT::datatable()
select(mpg,where(is.numeric))|> DT::datatable()
select(mpg,where(~is.numeric(.x) && mean(.x,na.rm=TRUE)<50))|> DT::datatable()
```
### `across(.cols, .fns, ...)`
```{r}
iris <- as_tibble(iris)
iris %>%
mutate(across(where(is.double) & !c(Petal.Length, Petal.Width), round)) %>%
head(6)
cols <- c("Sepal.Length", "Petal.Width")
iris %>%
mutate(across(all_of(cols), round))
iris %>%
group_by(Species) %>%
summarise(across(starts_with("Sepal"), ~ mean(.x, na.rm = TRUE),.names = "mean_{.col}"))
# 命名列表
iris %>%
group_by(Species) %>%
summarise(across(starts_with("Sepal"), list(mean = mean, sd = sd), .names = "{.col}_{.fn}"))
iris %>%
group_by(Species) %>%
summarise(across(starts_with("Sepal"), list(mean, sd), .names = "{.col}_function{.fn}"))
```
```{r}
iris %>%
dplyr::filter(if_any(ends_with("Width"), ~ . > 4))
```
```{r}
iris %>%
dplyr::filter(if_all(ends_with("Width"), ~ . > 2))
```
## 按组操作
### `group_by()`
```{r}
mpg |>
group_by(cyl,drv) |>
summarise(n=n(),
.groups = "drop_last") #删除最后一个分组条件
# drop 删除所有分组
# keep 保留所有分组
```
### `.by =`
```{r}
mpg %>%
summarise(n=n(),
.by = c(cyl,drv))
```
### 计数
```{r}
starwars %>% count(sex, gender, sort = TRUE)
starwars %>% group_by(sex, gender) %>%
summarise(n=n(),.groups = "drop") %>%
arrange(desc(n))
```
### `rowwise()`
```{r}
options(digits = 3)
set.seed(123)
df <- tibble(x = runif(6), y = runif(6), z = runif(6))
df
df %>% mutate(m = pmin(x, y, z))
```
### `reframe()`
```{r}
df <- tibble(
g = c(1, 1, 1, 2, 2, 2, 2),
x = c(1:7)
)
df %>%
reframe(x = mean(x), .by = g)
```
## 按数据框操作
### 绑定操作
#### `bing_cols()`
```{r}
df1 <- tibble(x = 1:3)
df2 <- tibble(y = 3:1)
bind_cols(df1, df2)
```
#### `bind_rows()`
```{r}
df1 <- tibble(x = 1:2, y = letters[1:2])
df2 <- tibble(x = 4:5, z = 1:2)
bind_rows(df1, df2)
```
### 集合操作
集合运算 要求变量名(列)完全相同,把观测(行)看成是集合中的元素
```{r}
x<-tibble(ID=c(1,2),X=c("a1",'a2'))
y<-tibble(ID=c(2,3),X=c("a2",'a3'))
x;y
```
```{r}
dplyr::intersect(x,y)
union(x,y)
union_all(x,y)
dplyr::setdiff(x,y) #返回在x中不在y中
symdiff(x,y) # 在x不在y中的行和在y中不在x中的行
setequal(x,y)#判断是否相等
```
### 连接操作
```{r}
x <- tribble(
~ID, ~X,
1, "x1",
2, "x2",
3, "x3"
)
y <- tribble(
~id, ~Y,
1, "y1",
2, "y2",
4, "y4"
)
x;y
```
#### mutate-joins
```{r}
# 左连接
left_join(x, y, by = join_by(ID == id))
# 右连接
right_join(x, y, by = join_by(ID == id))
# 全连接
full_join(x, y, by = join_by(ID == id))
# 内连接
inner_join(x, y, by = join_by(ID == id))
inner_join(x,y, by = join_by(ID >=id), keep = TRUE)
#滚动联接 rolling join 类似于不等式连接,但仅匹配最近一个值。
inner_join(x,y, join_by(closest(ID >=id)))
```
#### filter-joins
对第一个数据框进行 筛选
```{r}
# 半连接
semi_join(x, y, by = join_by(ID == id))
# 反连接
anti_join(x, y, by = join_by(ID == id))
```
#### cross-join
```{r}
#交叉连接
cross_join(x,y) #nrow(x) * nrow(y)
nest_join(x,y,by = join_by(ID==id))->nested
nested[[3]]
```
#### `group_nest()`
```{r}
library(tidyverse)
library(broom)
data(crickets, package = "modeldata")
names(crickets)
split_by_species <- crickets %>%
group_nest(species)
split_by_species
model_by_species <-
split_by_species %>%
mutate(model = map(data, ~ lm(rate ~ temp, data = .x)))
model_by_species
model_by_species %>%
mutate(coef = map(model, tidy)) %>%
select(species, coef) %>%
unnest(cols = c(coef))
```
#### `group_split()`
## 整洁操作
### `pivot_longer()`
#### 列名包含一个变量名
```{r}
billboard|> DT::datatable()
```
```{r}
billboard_longer <- billboard |>
pivot_longer(
cols = starts_with("wk"),
names_to = "week",
values_to = "排名" ,
values_drop_na = TRUE,
) |>
mutate(
week=parse_number(week)
)
billboard_longer|> DT::datatable()
```
```{r}
#| fig-cap: "歌曲排名随时间变化的折线图"
billboard_longer |>
ggplot(aes(x = week, y = 排名, group = track)) +
geom_line(alpha = 0.25) +
scale_y_reverse()
```
#### 列名包含多个变量名
```{r}
who2 |> DT::datatable()
```
```{r}
who2 |>
pivot_longer(
cols = !(country:year),
names_to = c("diagnosis", "gender", "age"),
names_sep = "_",
values_to = "count",
values_drop_na = T
) |> head()
```
#### 列名包含变量名和变量值
```{r}
household
```
```{r}
household |>
pivot_longer(
cols = !family,
names_to = c(".value", "child"), # 使用透视列名称的第一个组件作为变量名称
names_sep = "_",
values_drop_na = TRUE
)
```
```{r}
df <- tibble(ID=c("A","B","C"),
GDP_2019=c(114,251,152),
GDP_2018=c(215,115,141),
GDP_2017=c(141,244,243))
df
df %>%
pivot_longer(
cols = -ID,
names_to =c(".value","year"),
names_sep = "_") %>%
arrange(year)
```
```{r}
df <- tribble(
~class1_name,~class1_major,~class2_name,~class2_major,~class3_name,~class3_major,
'张非',"math",'李斯','English','王武','statistic',
'钟爱','English','陈述','math','孙健','medicine'
)
df
df%>%
pivot_longer(
col=starts_with("class"),
names_to = c("group",".value"),
names_sep = "_",
)
```
### `pivot_wider()`
```{r}
cms_patient_experience|> DT::datatable()
```
```{r}
cms_patient_experience |>
distinct(measure_cd, measure_title)
```
```{r}
cms_patient_experience |>
pivot_wider(
id_cols = starts_with("org"),# 唯一标识列
names_from = measure_cd,
values_from = prf_rate
) |> DT::datatable()
```
### `tidyr::separate_longer_*()`
```{r}
# df |> separate_longer_delim(cols, delim)
# df |> separate_longer_position(cols, width)
# df |> separate_wider_delim(cols, delim, names)
# df |> separate_wider_position(cols, widths)
# df |> separate_wider_regex(cols,patterns)
```
```{r}
df1 <- tibble(x = c("a,b,c", "d,e", "f"))
df1 |>
separate_longer_delim(x, delim = ",")
df2 <- tibble(x = c("1211", "131", "21"))
df2 |>
separate_longer_position(x, width = 1)
```
### `tidyr::separate_wider_*()`
```{r}
df3 <- tibble(x = c("a10.1.2022", "b10.2.2011", "e15.1.2015"))
df3
df3 |>
separate_wider_delim(
x,
delim = ".",
names = c("code", "edition", "year")
)
df3 |>
separate_wider_delim(
x,
delim = ".",
names = c("code", NA, "year")
)
df4 <- tibble(x = c("202215TX", "202122LA", "202325CA"))
df4 |>
separate_wider_position(
x,
widths = c(year = 4, age = 2, state = 2)
)
df5 <- tribble(
~ str,
"<Sheryl>-F_34",
"<Kisha>-F_45",
"<Brandon>-N_33",
"<Sharon>-F_38",
"<Penny>-F_58",
"<Justin>-M_41",
"<Patricia>-F_84",
)
df5 |>
separate_wider_regex(str,
patterns = c(
"<",
name = "[A-Za-z]+",
#匹配 ≥1 个字母
">-",
gender = ".",
#匹配除\n以外任意字符
"_",
age = "[0-9]+" #匹配 ≥1 个数字
))
```
## 重编码
```{r}
students <- read_csv("data/students.csv", na = c("N/A", ""))
students
```
#### 变量名
```{r}
if(!require(janitor)) install.packages("janitor")
# snake_case
students |>
janitor::clean_names(case="snake") #"title" "lower_camel" "upper_camel"
```
#### 变量值
##### 缺失值