7.1 赋值运算符

Assignment operator in R Description
<- Left assignment
= Left assignment (not recommended) and argument assignment
-> Right assignment
<<- Left lexicographic assignment (for advanced users)
->> Right lexicographic assignment (for advanced users)

7.2 其他运算符

Miscellaneous operator in R Description
$ Named list or dataframe column subset
: Sequence generator
:: Accessing functions of packages It is not usually needed
::: Accessing internal functions of packages
~ Model formulae
@ Accessing slots in S4 classes (Advanced)

7.3 逻辑索引

Show the code
1:length(mtcars$mpg)
#>  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
#> [26] 26 27 28 29 30 31 32
seq_along(mtcars$mpg)
#>  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
#> [26] 26 27 28 29 30 31 32

x <- c(1,3,6,4,0,7)
# 返回值为TRUE的索引
which(x>2)
#> [1] 2 3 4 6
which.max(x)
#> [1] 6
which.min(x)
#> [1] 5

# 返回值
x[x>2]
#> [1] 3 6 4 7
x[which(x>2)]
#> [1] 3 6 4 7

# 包含
# match(x, table, nomatch = NA_integer_, incomparables = NULL)
# x %in% table
1:10 %in% c(1,3,5,9)
#>  [1]  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE

match(1:10, c(1,3,5,9))
#>  [1]  1 NA  2 NA  3 NA NA NA  4 NA

all(colnames(mpg)==colnames(mpg))
#> [1] TRUE
any(colnames(mpg)==colnames(mtcars))
#> [1] FALSE


x <- c(9:20, 1:5, 3:7, 0:8)
duplicated(x)
#>  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#> [13] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE
#> [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE

xu <- x[!duplicated(x)]
xu
#>  [1]  9 10 11 12 13 14 15 16 17 18 19 20  1  2  3  4  5  6  7  0  8

7.4 子集运算

[$[[

S4 对象: @ slot

7.4.1 [ ]

Show the code
x <- c(2.1, 4.2, 3.3, 5.4)

x[c(3, 1)]
#> [1] 3.3 2.1

x[-c(3, 1)]
#> [1] 4.2 5.4

x[c(TRUE, TRUE, FALSE, FALSE)]
#> [1] 2.1 4.2
x[x > 3]
#> [1] 4.2 3.3 5.4

x[c(TRUE, FALSE)] # recycling rules 循环
#> [1] 2.1 3.3

x[]
#> [1] 2.1 4.2 3.3 5.4

x[0]
#> numeric(0)

7.4.2 $, [[ ]]

$是一个简写运算符, x$y大致相当于 x[["y"]] ,从左到右部分赋值

Show the code
x <- list(abc = 1)
x$a
#> [1] 1

x[["a"]]
#> NULL

options(warnPartialMatchDollar = TRUE)
x$a
#> [1] 1

7.5 生成数值序列

Show the code
seq(from=1,to=30,by=3)  # 生成一个序列
#>  [1]  1  4  7 10 13 16 19 22 25 28
seq(from=1,to=30,length=10) 
#>  [1]  1.000000  4.222222  7.444444 10.666667 13.888889 17.111111 20.333333
#>  [8] 23.555556 26.777778 30.000000

sequence(30,by=3)
#>  [1]  1  4  7 10 13 16 19 22 25 28 31 34 37 40 43 46 49 52 55 58 61 64 67 70 73
#> [26] 76 79 82 85 88


rep(x = c("A",1,"B",2),times = 3)           #重复序列
#>  [1] "A" "1" "B" "2" "A" "1" "B" "2" "A" "1" "B" "2"
rep(x = c("A",1,"B",2),times = c(1,2,3,4))
#>  [1] "A" "1" "1" "B" "B" "B" "2" "2" "2" "2"
rep(x = c("A",1,"B",2),each = 3,times = 2)
#>  [1] "A" "A" "A" "1" "1" "1" "B" "B" "B" "2" "2" "2" "A" "A" "A" "1" "1" "1" "B"
#> [20] "B" "B" "2" "2" "2"

7.6 排序

Show the code
x <- c("b", "c", "a")
order(x)
#> [1] 3 1 2
x[order(x)]
#> [1] "a" "b" "c"
Show the code
sort()
dplyr::arrange()

# 分配秩次
rank()

7.7 拆分

Show the code
set.seed(3)

df <- CO2[sample(1:nrow(CO2), 10), ]
head(df)
Plant Type Treatment conc uptake
5 Qn1 Quebec nonchilled 500 35.3
58 Mn3 Mississippi nonchilled 175 19.4
12 Qn2 Quebec nonchilled 500 40.6
36 Qc3 Quebec chilled 95 15.1
83 Mc3 Mississippi chilled 675 18.9
8 Qn2 Quebec nonchilled 95 13.6
Show the code
dfs <- split(df, f = list(df$Type, df$Treatment))
dfs
#> $Quebec.nonchilled
#>    Plant   Type  Treatment conc uptake
#> 5    Qn1 Quebec nonchilled  500   35.3
#> 12   Qn2 Quebec nonchilled  500   40.6
#> 8    Qn2 Quebec nonchilled   95   13.6
#> 20   Qn3 Quebec nonchilled  675   43.9
#> 
#> $Mississippi.nonchilled
#>    Plant        Type  Treatment conc uptake
#> 58   Mn3 Mississippi nonchilled  175   19.4
#> 55   Mn2 Mississippi nonchilled  675   31.1
#> 
#> $Quebec.chilled
#>    Plant   Type Treatment conc uptake
#> 36   Qc3 Quebec   chilled   95   15.1
#> 40   Qc3 Quebec   chilled  500   38.9
#> 
#> $Mississippi.chilled
#>    Plant        Type Treatment conc uptake
#> 83   Mc3 Mississippi   chilled  675   18.9
#> 74   Mc2 Mississippi   chilled  350   13.0

7.8 合并

Show the code
set.seed(61)

employee_id <- 1:10
employee_name <- c("Andrew", "Susan", "John", "Joe", "Jack",
                   "Jacob", "Mary", "Kate", "Jacqueline", "Ivy")
employee_salary <- round(rnorm(10, mean = 1500, sd = 200))
employee_age <- round(rnorm(10, mean = 50, sd = 8))
employee_position <- c("CTO", "CFO", "Administrative", rep("Technician", 7))


df_1 <- data.frame(id = employee_id[1:8], name = employee_name[1:8],
                   month_salary = employee_salary[1:8])
df_2 <- data.frame(id = employee_id[-5], name = employee_name[-5],
                   age = employee_age[-5], position = employee_position[-5])

df_1
id name month_salary
1 Andrew 1424
2 Susan 1425
3 John 1156
4 Joe 1570
5 Jack 1223
6 Jacob 1462
7 Mary 1641
8 Kate 1603
Show the code
df_2
id name age position
1 Andrew 40 CTO
2 Susan 38 CFO
3 John 54 Administrative
4 Joe 66 Technician
6 Jacob 38 Technician
7 Mary 53 Technician
8 Kate 56 Technician
9 Jacqueline 55 Technician
10 Ivy 43 Technician

7.8.1 连接

Show the code
# 内连接
merge(x = df_1, y = df_2, by = c("id", "name")) 
id name month_salary age position
1 Andrew 1424 40 CTO
2 Susan 1425 38 CFO
3 John 1156 54 Administrative
4 Joe 1570 66 Technician
6 Jacob 1462 38 Technician
7 Mary 1641 53 Technician
8 Kate 1603 56 Technician
Show the code

# 全连接
merge(x = df_1, y = df_2, all = TRUE)
id name month_salary age position
1 Andrew 1424 40 CTO
2 Susan 1425 38 CFO
3 John 1156 54 Administrative
4 Joe 1570 66 Technician
5 Jack 1223 NA NA
6 Jacob 1462 38 Technician
7 Mary 1641 53 Technician
8 Kate 1603 56 Technician
9 Jacqueline NA 55 Technician
10 Ivy NA 43 Technician
Show the code

# 左连接
merge(x = df_1, y = df_2, all.x = TRUE)
id name month_salary age position
1 Andrew 1424 40 CTO
2 Susan 1425 38 CFO
3 John 1156 54 Administrative
4 Joe 1570 66 Technician
5 Jack 1223 NA NA
6 Jacob 1462 38 Technician
7 Mary 1641 53 Technician
8 Kate 1603 56 Technician
Show the code

# 右连接
merge(x = df_1, y = df_2, all.y = TRUE)
id name month_salary age position
1 Andrew 1424 40 CTO
2 Susan 1425 38 CFO
3 John 1156 54 Administrative
4 Joe 1570 66 Technician
6 Jacob 1462 38 Technician
7 Mary 1641 53 Technician
8 Kate 1603 56 Technician
9 Jacqueline NA 55 Technician
10 Ivy NA 43 Technician
Show the code

# 交叉连接
Merged <- merge(x = df_1, y = df_2, by = NULL)
head(Merged)
id.x name.x month_salary id.y name.y age position
1 Andrew 1424 1 Andrew 40 CTO
2 Susan 1425 1 Andrew 40 CTO
3 John 1156 1 Andrew 40 CTO
4 Joe 1570 1 Andrew 40 CTO
5 Jack 1223 1 Andrew 40 CTO
6 Jacob 1462 1 Andrew 40 CTO

7.8.2 按行名合并

Show the code
df1 <- data.frame(var = c("one", "two", "three", "four", "five"),
                  data = c(1, 5, 1, 6, 8))
rownames(df1) <- c("A", "B", "C", "D", "E")
df1
var data
A one 1
B two 5
C three 1
D four 6
E five 8
Show the code

df2 <- data.frame(var = c("three", "one", "eight", "two", "nine"),
                  data = c(1, 5, 1, 6, 8))
rownames(df2) <- c("E", "A", "B", "D", "C")
df2
var data
E three 1
A one 5
B eight 1
D two 6
C nine 8
Show the code
merge(df1, df2, by = 0, all = TRUE) 
Row.names var.x data.x var.y data.y
A one 1 one 5
B two 5 eight 1
C three 1 nine 8
D four 6 two 6
E five 8 three 1
Show the code
merge(df1, df2, by = "row.names", all = TRUE) # Equivalent
Row.names var.x data.x var.y data.y
A one 1 one 5
B two 5 eight 1
C three 1 nine 8
D four 6 two 6
E five 8 three 1

7.9 AsIs I()

I()函数来自于stats包,它是as.is函数的别名。I()函数用于改变对象的类别,指示R在进行操作时应该将该对象视为其原始形式,不对它进行任何转换。

Show the code

summary(mtcars$mpg) |> as_tibble()
#> Error in dimnames(x) <- dnx: 'dimnames' applied to non-array

lapply(summary(mtcars$mpg), I) 
#> $Min.
#> [1] 10.4
#> 
#> $`1st Qu.`
#> [1] 15.425
#> 
#> $Median
#> [1] 19.2
#> 
#> $Mean
#> [1] 20.09062
#> 
#> $`3rd Qu.`
#> [1] 22.8
#> 
#> $Max.
#> [1] 33.9
lapply(summary(mtcars$mpg), I) |> as_tibble()
Min. 1st Qu. Median Mean 3rd Qu. Max.
10.4 15.425 19.2 20.090625 22.8 33.9
Show the code

sapply(summary(mtcars$mpg), I, simplify = F) 
#> $Min.
#> [1] 10.4
#> 
#> $`1st Qu.`
#> [1] 15.425
#> 
#> $Median
#> [1] 19.2
#> 
#> $Mean
#> [1] 20.09062
#> 
#> $`3rd Qu.`
#> [1] 22.8
#> 
#> $Max.
#> [1] 33.9
sapply(summary(mtcars$mpg), I, simplify = F) |> as_tibble()
Min. 1st Qu. Median Mean 3rd Qu. Max.
10.4 15.425 19.2 20.090625 22.8 33.9

7.10 数学函数

Show the code
x<-c(25,-4,3.66,3.42,-5.99)
abs(x)                       #绝对值
#> [1] 25.00  4.00  3.66  3.42  5.99
sqrt(x)                      #平方根
#> [1] 5.000000      NaN 1.913113 1.849324      NaN
ceiling(x)                   #向上取整
#> [1] 25 -4  4  4 -5
floor(x)                     #向下取整
#> [1] 25 -4  3  3 -6
trunc(x)                     #整数部分
#> [1] 25 -4  3  3 -5
round(x,digits = 2)          #四舍五入,保留2位小数
#> [1] 25.00 -4.00  3.66  3.42 -5.99
signif(x,digits = 2)         #四舍五入,保留2有效数字
#> [1] 25.0 -4.0  3.7  3.4 -6.0
log(x,base=4)                #对x取以base为底的对数
#> [1] 2.3219281       NaN 0.9359218 0.8869982       NaN
log(x)                          #自然对数
#> [1] 3.218876      NaN 1.297463 1.229641      NaN
log10(x)                        #常用对数
#> [1] 1.3979400       NaN 0.5634811 0.5340261       NaN
exp(x)                       #e指数函数
#> [1] 7.200490e+10 1.831564e-02 3.886134e+01 3.056942e+01 2.503664e-03

7.11 统计函数

Show the code
mean(x,trim = 0.05,na.rm = TRUE)     #算术平均值
#> [1] 4.418
median(x)                            #中位数
#> [1] 3.42
sd(x)                                #标准差
#> [1] 12.29206
var(x)                               #方差
#> [1] 151.0946
mad(x)                               #绝对中位差
#> [1] 11.00089
quantile(x,probs = c(0,0.25,0.5,0.75,1))     #分位数
#>    0%   25%   50%   75%  100% 
#> -5.99 -4.00  3.42  3.66 25.00
range(x)                             # 值域
#> [1] -5.99 25.00
scale(x,center = TRUE,scale = TRUE)  #标准化(均值为0、标准差为1)
#>             [,1]
#> [1,]  1.67441487
#> [2,] -0.68483259
#> [3,] -0.06166585
#> [4,] -0.08119065
#> [5,] -0.84672578
#> attr(,"scaled:center")
#> [1] 4.418
#> attr(,"scaled:scale")
#> [1] 12.29206
scale(x,center = TRUE,scale =FALSE)  #中心化:减去均值
#>         [,1]
#> [1,]  20.582
#> [2,]  -8.418
#> [3,]  -0.758
#> [4,]  -0.998
#> [5,] -10.408
#> attr(,"scaled:center")
#> [1] 4.418


x <- c(1,3,5,7,9,11,13)
# Find the "previous" (lag()) or "next" (lead()) values in a vector
dplyr::lag(x,n=2)                      # n阶滞后
#> [1] NA NA  1  3  5  7  9
dplyr::lead(x,n=2)                     # n阶前移
#> [1]  5  7  9 11 13 NA NA

# 滞后差分  lag阶滞后  difference阶差分
diff(x,lag = 1,difference=1)     # 隔0个值后位减前位,进行1次
#> [1] 2 2 2 2 2 2
diff(x,lag = 1,difference=2)     ## 隔0个值后位减前位,进行2次
#> [1] 0 0 0 0 0
diff(x,lag = 1,difference=3)     # 隔0个值后位减前位,进行3次
#> [1] 0 0 0 0

diff(x,lag = 2,difference=1)  # 隔1个值后位减前位,进行1次
#> [1] 4 4 4 4 4
diff(x,lag = 2,difference=2) ## 隔1个值后位减前位,进行2次
#> [1] 0 0 0

7.12 字符串函数

Show the code

# 匹配
data <- data.frame(
  name = c("Alice", "Bob", "Carol", "Dave", "Eve"),
  description = c("Software developer", "Data analyst", "UX designer", "Project manager", "Data scientist")
)
data
name description
Alice Software developer
Bob Data analyst
Carol UX designer
Dave Project manager
Eve Data scientist
Show the code

data$has_data_analyst <- str_detect(data$description, "Data analyst")
print(data)
#>    name        description has_data_analyst
#> 1 Alice Software developer            FALSE
#> 2   Bob       Data analyst             TRUE
#> 3 Carol        UX designer            FALSE
#> 4  Dave    Project manager            FALSE
#> 5   Eve     Data scientist            FALSE

data$has_data_grepl <- grepl("Data", data$description)
print(data)
#>    name        description has_data_analyst has_data_grepl
#> 1 Alice Software developer            FALSE          FALSE
#> 2   Bob       Data analyst             TRUE           TRUE
#> 3 Carol        UX designer            FALSE          FALSE
#> 4  Dave    Project manager            FALSE          FALSE
#> 5   Eve     Data scientist            FALSE           TRUE
Show the code
#子串
substr(x="qwertyyuio",start = 2,stop=4)   
#> [1] "wer"
str_sub(string = "qwertyyuio",start = 2,end = 4)
#> [1] "wer"
#查找替换
sub(pattern = " ",replacement = ".",
    x="hello world hello !",ignore.case = FALSE,fixed = FALSE)  
#> [1] "hello.world hello !"

str_replace_all("hello world hello !"," ",replacement = ".")
#> [1] "hello.world.hello.!"
# 查找,返回下标
grep(pattern = "v",x=c("a","v","D","A","f","J"),ignore.case = FALSE,fixed = FALSE)   
#> [1] 2

 # 分隔,\\转义字符
strsplit(x="a.fa.fag",split = "\\.",fixed = FALSE) 
#> [[1]]
#> [1] "a"   "fa"  "fag"


#连接
paste("x",c("a","b"),sep="",collapse = "?")  
#> [1] "xa?xb"
paste0("x",c("A","B"),collapse="?")
#> [1] "xA?xB"

cat("hello","BOb","\b\n","\bIsn\' R","\t","GREAT?\n",sep = " " )
#> hello BOb 
#>  Isn' R      GREAT?

#
toupper("abc")              #大写转换
#> [1] "ABC"
tolower("aaAGEErg")         #小写转换
#> [1] "aaageerg"

7.13 打印函数 sprintf()

Show the code
a <- "string"
sprintf("This is where a %s goes.", a)
#> [1] "This is where a string goes."

x <- 8
sprintf("Regular:%04d", x)
#> [1] "Regular:0008"

sprintf("%f", pi)         # "3.141593"
#> [1] "3.141593"
sprintf("%.3f", pi)       # "3.142"
#> [1] "3.142"
sprintf("%1.0f", pi)      # "3"
#> [1] "3"
sprintf("%5.1f", pi)      # "  3.1"
#> [1] "  3.1"
sprintf("%05.1f", pi)     # "003.1"
#> [1] "003.1"
sprintf("%+f", pi)        # "+3.141593"
#> [1] "+3.141593"
sprintf("% f", pi)        # " 3.141593"
#> [1] " 3.141593"
sprintf("%-10f", pi)      # "3.141593  "   (left justified)
#> [1] "3.141593  "
sprintf("%e", pi)         #"3.141593e+00"
#> [1] "3.141593e+00"
sprintf("%E", pi)         # "3.141593E+00"
#> [1] "3.141593E+00"
sprintf("%g", pi)         # "3.14159"
#> [1] "3.14159"
sprintf("%g",   1e6 * pi) # "3.14159e+06"  (exponential)
#> [1] "3.14159e+06"
sprintf("%.9g", 1e6 * pi) # "3141592.65"   ("fixed")
#> [1] "3141592.65"
sprintf("%G", 1e-6 * pi)  # "3.14159E-06"
#> [1] "3.14159E-06"


x <- "string"
sprintf("Substitute in multiple strings: %s %s", x, "string2")
#> [1] "Substitute in multiple strings: string string2"

sprintf("A single percent sign here: %%")
#> [1] "A single percent sign here: %"

7.14 apply函数簇

7.14.1 apply()

Show the code
apply(X = mtcars, MARGIN = 2, FUN = mean)
#>        mpg        cyl       disp         hp       drat         wt       qsec 
#>  20.090625   6.187500 230.721875 146.687500   3.596563   3.217250  17.848750 
#>         vs         am       gear       carb 
#>   0.437500   0.406250   3.687500   2.812500

7.14.2 aggregate()

Show the code
aggregate(x = mtcars,
          by = list(am=mtcars$am,cyl=mtcars$cyl),
          FUN = mean)  
am cyl mpg cyl disp hp drat wt qsec vs am gear carb
0 4 22.90000 4 135.8667 84.66667 3.770000 2.935000 20.97000 1.000 0 3.666667 1.666667
1 4 28.07500 4 93.6125 81.87500 4.183750 2.042250 18.45000 0.875 1 4.250000 1.500000
0 6 19.12500 6 204.5500 115.25000 3.420000 3.388750 19.21500 1.000 0 3.500000 2.500000
1 6 20.56667 6 155.0000 131.66667 3.806667 2.755000 16.32667 0.000 1 4.333333 4.666667
0 8 15.05000 8 357.6167 194.16667 3.120833 4.104083 17.14250 0.000 0 3.000000 3.083333
1 8 15.40000 8 326.0000 299.50000 3.880000 3.370000 14.55000 0.000 1 5.000000 6.000000
Show the code
aggregate(.~am+cyl,
          data = mtcars,
          FUN = mean)
am cyl mpg disp hp drat wt qsec vs gear carb
0 4 22.90000 135.8667 84.66667 3.770000 2.935000 20.97000 1.000 3.666667 1.666667
1 4 28.07500 93.6125 81.87500 4.183750 2.042250 18.45000 0.875 4.250000 1.500000
0 6 19.12500 204.5500 115.25000 3.420000 3.388750 19.21500 1.000 3.500000 2.500000
1 6 20.56667 155.0000 131.66667 3.806667 2.755000 16.32667 0.000 4.333333 4.666667
0 8 15.05000 357.6167 194.16667 3.120833 4.104083 17.14250 0.000 3.000000 3.083333
1 8 15.40000 326.0000 299.50000 3.880000 3.370000 14.55000 0.000 5.000000 6.000000
Show the code
aggregate(mpg~am+cyl,
          data = mtcars,
          FUN = summary)
am cyl mpg
0 4 21.50000
1 4 21.40000
0 6 17.80000
1 6 19.70000
0 8 10.40000
1 8 15.00000

7.14.3 by(), tapply()

by()是应用于数据框的tapply的面向对象包装器。

Show the code
by(data = mtcars[,1:3],
   INDICES = list(cyl=mtcars$cyl),
   FUN = summary)
#> cyl: 4
#>       mpg             cyl         disp       
#>  Min.   :21.40   Min.   :4   Min.   : 71.10  
#>  1st Qu.:22.80   1st Qu.:4   1st Qu.: 78.85  
#>  Median :26.00   Median :4   Median :108.00  
#>  Mean   :26.66   Mean   :4   Mean   :105.14  
#>  3rd Qu.:30.40   3rd Qu.:4   3rd Qu.:120.65  
#>  Max.   :33.90   Max.   :4   Max.   :146.70  
#> ------------------------------------------------------------ 
#> cyl: 6
#>       mpg             cyl         disp      
#>  Min.   :17.80   Min.   :6   Min.   :145.0  
#>  1st Qu.:18.65   1st Qu.:6   1st Qu.:160.0  
#>  Median :19.70   Median :6   Median :167.6  
#>  Mean   :19.74   Mean   :6   Mean   :183.3  
#>  3rd Qu.:21.00   3rd Qu.:6   3rd Qu.:196.3  
#>  Max.   :21.40   Max.   :6   Max.   :258.0  
#> ------------------------------------------------------------ 
#> cyl: 8
#>       mpg             cyl         disp      
#>  Min.   :10.40   Min.   :8   Min.   :275.8  
#>  1st Qu.:14.40   1st Qu.:8   1st Qu.:301.8  
#>  Median :15.20   Median :8   Median :350.5  
#>  Mean   :15.10   Mean   :8   Mean   :353.1  
#>  3rd Qu.:16.25   3rd Qu.:8   3rd Qu.:390.0  
#>  Max.   :19.20   Max.   :8   Max.   :472.0

by(data = mtcars[,1:3],
   INDICES = factor(mtcars$cyl),
   FUN = function(x) lm( disp ~ mpg, data = x))
#> factor(mtcars$cyl): 4
#> 
#> Call:
#> lm(formula = disp ~ mpg, data = x)
#> 
#> Coefficients:
#> (Intercept)          mpg  
#>     233.067       -4.798  
#> 
#> ------------------------------------------------------------ 
#> factor(mtcars$cyl): 6
#> 
#> Call:
#> lm(formula = disp ~ mpg, data = x)
#> 
#> Coefficients:
#> (Intercept)          mpg  
#>     125.122        2.947  
#> 
#> ------------------------------------------------------------ 
#> factor(mtcars$cyl): 8
#> 
#> Call:
#> lm(formula = disp ~ mpg, data = x)
#> 
#> Coefficients:
#> (Intercept)          mpg  
#>      560.87       -13.76

tapply(X = mtcars[,1:3],
       INDEX = list(cyl=mtcars$cyl),
       FUN = summary)
#> $`4`
#>       mpg             cyl         disp       
#>  Min.   :21.40   Min.   :4   Min.   : 71.10  
#>  1st Qu.:22.80   1st Qu.:4   1st Qu.: 78.85  
#>  Median :26.00   Median :4   Median :108.00  
#>  Mean   :26.66   Mean   :4   Mean   :105.14  
#>  3rd Qu.:30.40   3rd Qu.:4   3rd Qu.:120.65  
#>  Max.   :33.90   Max.   :4   Max.   :146.70  
#> 
#> $`6`
#>       mpg             cyl         disp      
#>  Min.   :17.80   Min.   :6   Min.   :145.0  
#>  1st Qu.:18.65   1st Qu.:6   1st Qu.:160.0  
#>  Median :19.70   Median :6   Median :167.6  
#>  Mean   :19.74   Mean   :6   Mean   :183.3  
#>  3rd Qu.:21.00   3rd Qu.:6   3rd Qu.:196.3  
#>  Max.   :21.40   Max.   :6   Max.   :258.0  
#> 
#> $`8`
#>       mpg             cyl         disp      
#>  Min.   :10.40   Min.   :8   Min.   :275.8  
#>  1st Qu.:14.40   1st Qu.:8   1st Qu.:301.8  
#>  Median :15.20   Median :8   Median :350.5  
#>  Mean   :15.10   Mean   :8   Mean   :353.1  
#>  3rd Qu.:16.25   3rd Qu.:8   3rd Qu.:390.0  
#>  Max.   :19.20   Max.   :8   Max.   :472.0

7.14.4 lapply(), sapply()

lapply() returns a list of the same length as X,

sapply() is a user-friendly version and wrapper of lapply() by default returning a vector or matrix

Show the code
x <- list(a = 1:10, beta = exp(-3:3), logic = c(TRUE,FALSE,FALSE,TRUE))
lapply(X = x,FUN = quantile)
#> $a
#>    0%   25%   50%   75%  100% 
#>  1.00  3.25  5.50  7.75 10.00 
#> 
#> $beta
#>          0%         25%         50%         75%        100% 
#>  0.04978707  0.25160736  1.00000000  5.05366896 20.08553692 
#> 
#> $logic
#>   0%  25%  50%  75% 100% 
#>  0.0  0.0  0.5  1.0  1.0

sapply(x, quantile)
#>          a        beta logic
#> 0%    1.00  0.04978707   0.0
#> 25%   3.25  0.25160736   0.0
#> 50%   5.50  1.00000000   0.5
#> 75%   7.75  5.05366896   1.0
#> 100% 10.00 20.08553692   1.0

7.14.5 vapply

Show the code
by_cyl <- split(mtcars, mtcars$cyl)
models <- lapply(by_cyl, function(data) lm(mpg ~ wt, data = data))
models
#> $`4`
#> 
#> Call:
#> lm(formula = mpg ~ wt, data = data)
#> 
#> Coefficients:
#> (Intercept)           wt  
#>      39.571       -5.647  
#> 
#> 
#> $`6`
#> 
#> Call:
#> lm(formula = mpg ~ wt, data = data)
#> 
#> Coefficients:
#> (Intercept)           wt  
#>       28.41        -2.78  
#> 
#> 
#> $`8`
#> 
#> Call:
#> lm(formula = mpg ~ wt, data = data)
#> 
#> Coefficients:
#> (Intercept)           wt  
#>      23.868       -2.192
vapply(models, function(x) coef(x)[[2]], double(1))
#>         4         6         8 
#> -5.647025 -2.780106 -2.192438

7.15 矩阵运算

Show the code
X <- matrix(data = 1:9,ncol = 3,nrow = 3)
X
#>      [,1] [,2] [,3]
#> [1,]    1    4    7
#> [2,]    2    5    8
#> [3,]    3    6    9

diag(X)
#> [1] 1 5 9
det(X)
#> [1] 0
eigen(X)
#> eigen() decomposition
#> $values
#> [1]  1.611684e+01 -1.116844e+00 -5.700691e-16
#> 
#> $vectors
#>            [,1]       [,2]       [,3]
#> [1,] -0.4645473 -0.8829060  0.4082483
#> [2,] -0.5707955 -0.2395204 -0.8164966
#> [3,] -0.6770438  0.4038651  0.4082483

X
#>      [,1] [,2] [,3]
#> [1,]    1    4    7
#> [2,]    2    5    8
#> [3,]    3    6    9
# 协方差矩阵
cov(X)
#>      [,1] [,2] [,3]
#> [1,]    1    1    1
#> [2,]    1    1    1
#> [3,]    1    1    1

# 相关系数矩阵
cor(X)
#>      [,1] [,2] [,3]
#> [1,]    1    1    1
#> [2,]    1    1    1
#> [3,]    1    1    1

7.16 统计分布函数

形如[dpqr]distribution_abbreviation,其中密度函数d,分布函数p,分位数函数q,随机数生成函数r

Show the code
### 正态分布
dnorm(3,0,2)  #N(0,4)在 3 处的密度值
#> [1] 0.0647588
pnorm(1:3,0,2)#N(0,4)在1,2,3处的分布概率值
#> [1] 0.6914625 0.8413447 0.9331928
qnorm(1-0.025,0,1)# N(0,1)的上0.025分位数
#> [1] 1.959964
rnorm(5,3,3)  # 生成5个服从N(3,9)的随机数
#> [1] 3.401818 2.115231 1.348325 5.882787 3.584287


### 二项分布
1-sum(dbinom(0:1,400,0.02))# 命中率为0.02,独立射击400次,至少击中2次的概率
#> [1] 0.9971655
ggplot()

Show the code



### 多元正态分布
mean<-c(230.7,146.7,3.6)
sigma<-matrix(c(15360.8,6721.2,-47.1,
                6721.2,4700.9,-16.5,
                -47.1,16.5,0.3),nrow = 3,ncol = 3)
library(MASS)
multi <- mvrnorm(500,mean,sigma)
head(multi)
#>          [,1]     [,2]     [,3]
#> [1,] 365.6210 131.9249 3.193131
#> [2,] 277.0565 215.3377 3.732176
#> [3,] 188.6413  64.2639 4.161602
#> [4,] 180.2383 113.7082 3.760537
#> [5,] 327.8288 211.1226 3.160674
#> [6,] 383.8595 220.1269 2.838524