# base 函数
## 赋值运算符
| **Assignment operator in R** | **Description** |
|:--:|:--:|
| `<-` | Left assignment |
| = | Left assignment (not recommended) and argument assignment |
| `->` | Right assignment |
| `<<-` | Left lexicographic assignment (for advanced users) |
| `->>` | Right lexicographic assignment (for advanced users) |
## 其他运算符
| **Miscellaneous operator in R** | **Description** |
|:--:|:--:|
| \$ | Named list or dataframe column subset |
| : | Sequence generator |
| :: | Accessing functions of packages It is not usually needed |
| ::: | Accessing internal functions of packages |
| \~ | Model formulae |
| \@ | Accessing slots in S4 classes (Advanced) |
## 逻辑索引
```{r}
1 : length (mtcars$ mpg)
seq_along (mtcars$ mpg)
x <- c (1 ,3 ,6 ,4 ,0 ,7 )
# 返回值为TRUE的索引
which (x> 2 )
which.max (x)
which.min (x)
# 返回值
x[x> 2 ]
x[which (x> 2 )]
# 包含
# match(x, table, nomatch = NA_integer_, incomparables = NULL)
# x %in% table
1 : 10 %in% c (1 ,3 ,5 ,9 )
match (1 : 10 , c (1 ,3 ,5 ,9 ))
all (colnames (mpg)== colnames (mpg))
any (colnames (mpg)== colnames (mtcars))
x <- c (9 : 20 , 1 : 5 , 3 : 7 , 0 : 8 )
duplicated (x)
xu <- x[! duplicated (x)]
xu
```
## 子集运算
`[` ,`$` ,`[[`
S4 对象: `@` `slot`
### `[ ]`
```{r}
x <- c (2.1 , 4.2 , 3.3 , 5.4 )
x[c (3 , 1 )]
x[- c (3 , 1 )]
x[c (TRUE , TRUE , FALSE , FALSE )]
x[x > 3 ]
x[c (TRUE , FALSE )] # recycling rules 循环
x[]
x[0 ]
```
### `$`, `[[ ]]`
`$` 是一个简写运算符, `x$y` 大致相当于 `x[["y"]]` ,从左到右部分赋值
```{r}
x <- list (abc = 1 )
x$ a
x[["a" ]]
options (warnPartialMatchDollar = TRUE )
x$ a
```
## 生成数值序列
```{r}
seq (from= 1 ,to= 30 ,by= 3 ) # 生成一个序列
seq (from= 1 ,to= 30 ,length= 10 )
sequence (30 ,by= 3 )
rep (x = c ("A" ,1 ,"B" ,2 ),times = 3 ) #重复序列
rep (x = c ("A" ,1 ,"B" ,2 ),times = c (1 ,2 ,3 ,4 ))
rep (x = c ("A" ,1 ,"B" ,2 ),each = 3 ,times = 2 )
```
## 排序
```{r}
x <- c ("b" , "c" , "a" )
order (x)
x[order (x)]
```
```{r eval=FALSE}
sort()
dplyr::arrange()
# 分配秩次
rank()
```
## 拆分
```{r}
set.seed (3 )
df <- CO2[sample (1 : nrow (CO2), 10 ), ]
head (df)
dfs <- split (df, f = list (df$ Type, df$ Treatment))
dfs
```
## 合并
```{r}
set.seed (61 )
employee_id <- 1 : 10
employee_name <- c ("Andrew" , "Susan" , "John" , "Joe" , "Jack" ,
"Jacob" , "Mary" , "Kate" , "Jacqueline" , "Ivy" )
employee_salary <- round (rnorm (10 , mean = 1500 , sd = 200 ))
employee_age <- round (rnorm (10 , mean = 50 , sd = 8 ))
employee_position <- c ("CTO" , "CFO" , "Administrative" , rep ("Technician" , 7 ))
df_1 <- data.frame (id = employee_id[1 : 8 ], name = employee_name[1 : 8 ],
month_salary = employee_salary[1 : 8 ])
df_2 <- data.frame (id = employee_id[- 5 ], name = employee_name[- 5 ],
age = employee_age[- 5 ], position = employee_position[- 5 ])
df_1
df_2
```
### 连接
```{r}
# 内连接
merge (x = df_1, y = df_2, by = c ("id" , "name" ))
# 全连接
merge (x = df_1, y = df_2, all = TRUE )
# 左连接
merge (x = df_1, y = df_2, all.x = TRUE )
# 右连接
merge (x = df_1, y = df_2, all.y = TRUE )
# 交叉连接
Merged <- merge (x = df_1, y = df_2, by = NULL )
head (Merged)
```
### 按行名合并
```{r}
df1 <- data.frame (var = c ("one" , "two" , "three" , "four" , "five" ),
data = c (1 , 5 , 1 , 6 , 8 ))
rownames (df1) <- c ("A" , "B" , "C" , "D" , "E" )
df1
df2 <- data.frame (var = c ("three" , "one" , "eight" , "two" , "nine" ),
data = c (1 , 5 , 1 , 6 , 8 ))
rownames (df2) <- c ("E" , "A" , "B" , "D" , "C" )
df2
```
```{r}
merge (df1, df2, by = 0 , all = TRUE )
merge (df1, df2, by = "row.names" , all = TRUE ) # Equivalent
```
## AsIs `I()`
`I()` 函数来自于stats包,它是as.is函数的别名。I()函数用于改变对象的类别,指示R在进行操作时应该将该对象视为其原始形式,不对它进行任何转换。
```{r error=TRUE}
summary(mtcars$mpg) |> as_tibble()
lapply(summary(mtcars$mpg), I)
lapply(summary(mtcars$mpg), I) |> as_tibble()
sapply(summary(mtcars$mpg), I, simplify = F)
sapply(summary(mtcars$mpg), I, simplify = F) |> as_tibble()
```
## 数学函数
```{r}
x<- c (25 ,- 4 ,3.66 ,3.42 ,- 5.99 )
abs (x) #绝对值
sqrt (x) #平方根
ceiling (x) #向上取整
floor (x) #向下取整
trunc (x) #整数部分
round (x,digits = 2 ) #四舍五入,保留2位小数
signif (x,digits = 2 ) #四舍五入,保留2有效数字
log (x,base= 4 ) #对x取以base为底的对数
log (x) #自然对数
log10 (x) #常用对数
exp (x) #e指数函数
```
## 统计函数
```{r}
mean (x,trim = 0.05 ,na.rm = TRUE ) #算术平均值
median (x) #中位数
sd (x) #标准差
var (x) #方差
mad (x) #绝对中位差
quantile (x,probs = c (0 ,0.25 ,0.5 ,0.75 ,1 )) #分位数
range (x) # 值域
scale (x,center = TRUE ,scale = TRUE ) #标准化(均值为0、标准差为1)
scale (x,center = TRUE ,scale = FALSE ) #中心化:减去均值
x <- c (1 ,3 ,5 ,7 ,9 ,11 ,13 )
# Find the "previous" (lag()) or "next" (lead()) values in a vector
dplyr:: lag (x,n= 2 ) # n阶滞后
dplyr:: lead (x,n= 2 ) # n阶前移
# 滞后差分 lag阶滞后 difference阶差分
diff (x,lag = 1 ,difference= 1 ) # 隔0个值后位减前位,进行1次
diff (x,lag = 1 ,difference= 2 ) ## 隔0个值后位减前位,进行2次
diff (x,lag = 1 ,difference= 3 ) # 隔0个值后位减前位,进行3次
diff (x,lag = 2 ,difference= 1 ) # 隔1个值后位减前位,进行1次
diff (x,lag = 2 ,difference= 2 ) ## 隔1个值后位减前位,进行2次
```
## 字符串函数
```{r}
# 匹配
data <- data.frame (
name = c ("Alice" , "Bob" , "Carol" , "Dave" , "Eve" ),
description = c ("Software developer" , "Data analyst" , "UX designer" , "Project manager" , "Data scientist" )
)
data
data$ has_data_analyst <- str_detect (data$ description, "Data analyst" )
print (data)
data$ has_data_grepl <- grepl ("Data" , data$ description)
print (data)
```
```{r}
#子串
substr (x= "qwertyyuio" ,start = 2 ,stop= 4 )
str_sub (string = "qwertyyuio" ,start = 2 ,end = 4 )
#查找替换
sub (pattern = " " ,replacement = "." ,
x= "hello world hello !" ,ignore.case = FALSE ,fixed = FALSE )
str_replace_all ("hello world hello !" ," " ,replacement = "." )
# 查找,返回下标
grep (pattern = "v" ,x= c ("a" ,"v" ,"D" ,"A" ,"f" ,"J" ),ignore.case = FALSE ,fixed = FALSE )
# 分隔,\\转义字符
strsplit (x= "a.fa.fag" ,split = " \\ ." ,fixed = FALSE )
#连接
paste ("x" ,c ("a" ,"b" ),sep= "" ,collapse = "?" )
paste0 ("x" ,c ("A" ,"B" ),collapse= "?" )
cat ("hello" ,"BOb" ," \b\n " ," \b Isn \' R" ," \t " ,"GREAT? \n " ,sep = " " )
#
toupper ("abc" ) #大写转换
tolower ("aaAGEErg" ) #小写转换
```
## 打印函数 `sprintf()`
```{r}
a <- "string"
sprintf ("This is where a %s goes." , a)
x <- 8
sprintf ("Regular:%04d" , x)
sprintf ("%f" , pi) # "3.141593"
sprintf ("%.3f" , pi) # "3.142"
sprintf ("%1.0f" , pi) # "3"
sprintf ("%5.1f" , pi) # " 3.1"
sprintf ("%05.1f" , pi) # "003.1"
sprintf ("%+f" , pi) # "+3.141593"
sprintf ("% f" , pi) # " 3.141593"
sprintf ("%-10f" , pi) # "3.141593 " (left justified)
sprintf ("%e" , pi) #"3.141593e+00"
sprintf ("%E" , pi) # "3.141593E+00"
sprintf ("%g" , pi) # "3.14159"
sprintf ("%g" , 1e6 * pi) # "3.14159e+06" (exponential)
sprintf ("%.9g" , 1e6 * pi) # "3141592.65" ("fixed")
sprintf ("%G" , 1e-6 * pi) # "3.14159E-06"
x <- "string"
sprintf ("Substitute in multiple strings: %s %s" , x, "string2" )
sprintf ("A single percent sign here: %%" )
```
## apply函数簇
### `apply()`
```{r}
apply (X = mtcars, MARGIN = 2 , FUN = mean)
```
### `aggregate()`
```{r}
aggregate (x = mtcars,
by = list (am= mtcars$ am,cyl= mtcars$ cyl),
FUN = mean)
aggregate (.~ am+ cyl,
data = mtcars,
FUN = mean)
aggregate (mpg~ am+ cyl,
data = mtcars,
FUN = summary)
```
### `by()`, `tapply()`
`by()` 是应用于数据框的tapply的面向对象包装器。
```{r}
by (data = mtcars[,1 : 3 ],
INDICES = list (cyl= mtcars$ cyl),
FUN = summary)
by (data = mtcars[,1 : 3 ],
INDICES = factor (mtcars$ cyl),
FUN = function (x) lm ( disp ~ mpg, data = x))
tapply (X = mtcars[,1 : 3 ],
INDEX = list (cyl= mtcars$ cyl),
FUN = summary)
```
### `lapply()`, `sapply()`
`lapply()` returns a list of the same length as X,
`sapply()` is a user-friendly version and wrapper of `lapply()` by default returning a vector or matrix
```{r}
x <- list (a = 1 : 10 , beta = exp (- 3 : 3 ), logic = c (TRUE ,FALSE ,FALSE ,TRUE ))
lapply (X = x,FUN = quantile)
sapply (x, quantile)
```
### `vapply`
```{r}
by_cyl <- split (mtcars, mtcars$ cyl)
models <- lapply (by_cyl, function (data) lm (mpg ~ wt, data = data))
models
vapply (models, function (x) coef (x)[[2 ]], double (1 ))
```
## 矩阵运算
```{r}
X <- matrix (data = 1 : 9 ,ncol = 3 ,nrow = 3 )
X
diag (X)
det (X)
eigen (X)
X
# 协方差矩阵
cov (X)
# 相关系数矩阵
cor (X)
```
## 统计分布函数
形如`[dpqr]distribution_abbreviation` ,其中密度函数`d` ,分布函数`p` ,分位数函数`q` ,随机数生成函数`r`
```{r}
### 正态分布
dnorm (3 ,0 ,2 ) #N(0,4)在 3 处的密度值
pnorm (1 : 3 ,0 ,2 )#N(0,4)在1,2,3处的分布概率值
qnorm (1 -0.025 ,0 ,1 )# N(0,1)的上0.025分位数
rnorm (5 ,3 ,3 ) # 生成5个服从N(3,9)的随机数
### 二项分布
1 - sum (dbinom (0 : 1 ,400 ,0.02 ))# 命中率为0.02,独立射击400次,至少击中2次的概率
ggplot ()
### 多元正态分布
mean<- c (230.7 ,146.7 ,3.6 )
sigma<- matrix (c (15360.8 ,6721.2 ,- 47.1 ,
6721.2 ,4700.9 ,- 16.5 ,
- 47.1 ,16.5 ,0.3 ),nrow = 3 ,ncol = 3 )
library (MASS)
multi <- mvrnorm (500 ,mean,sigma)
head (multi)
```