3 数据结构

数据结构是一种组织和存储数据的方式，它使得数据可以更高效地访问和操作。数据结构通常是由基本数据类型组合而成的。例如，数组、链表、栈、队列、树和图等都是数据结构。

R中的数据结构包括 原子向量（atomic vector）和泛型向量（generic vector）。

原子向量的所有元素属于同一种基本数据类型（logical，integer，double，character，complex，raw；其中 integer 和 double 统称 numeric），按维度可分为一维的 vector、二维的 matrix 和多维的 array。
泛型向量即列表（list），其元素可以是任意类型的对象（原子向量、函数、其他列表等），且各元素的类型可以互不相同。

3.1 属性

在R中任何一个对象都具有属性：

names,dimnames
length
dimensions（例如matrices，arrays）
class
自定义属性

在R中有多种方式查看对象的属性：

class()函数，从面向对象编程的角度，知道其class属性后就可以通过methods()函数查找相应的泛型函数（generic function）对其操作。

Show the code

m1 <- matrix(1:6,nrow = 2)
m1
#>      [,1] [,2] [,3]
#> [1,]    1    3    5
#> [2,]    2    4    6
class(m1)
#> [1] "matrix" "array"
methods(class="matrix")
#>  [1] anyDuplicated as.data.frame as.raster     as_tibble     boxplot      
#>  [6] coerce        determinant   duplicated    edit          head         
#> [11] initialize    isSymmetric   Math          Math2         Ops          
#> [16] relist        subset        summary       tail          unique       
#> see '?methods' for accessing help and source code

attributes() 主要是用于列出对象所有已有的属性。

Show the code

attributes(m1)
#> $dim
#> [1] 2 3

attr(x = m1,which = "dim") <- c(3,2)
m1
#>      [,1] [,2]
#> [1,]    1    4
#> [2,]    2    5
#> [3,]    3    6

# 或者
a <- structure(
  m1, 
  dim=c(1,6)
)
str(attributes(a))
#> List of 1
#>  $ dim: int [1:2] 1 6
a
#>      [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,]    1    2    3    4    5    6

大多数操作都会丢失大多数属性。

只有两个属性是默认保留的：

“names”，一个字符向量，为每个元素命名。

Show the code

# When creating it: 
x <- c(a = 1, b = 2, c = 3)
x
#> a b c 
#> 1 2 3
attributes(x)
#> $names
#> [1] "a" "b" "c"
names(x)
#> [1] "a" "b" "c"

# By assigning a character vector to names()
x <- 1:3
names(x) <- c("a", "b", "c")

# Inline, with setNames():
x <- setNames(1:3, c("a", "b", "c"))

“dim”，dimensions 的缩写，整数向量，用于将向量转换为矩阵或数组。

对于原子向量，dimension 属性通常用于创建矩阵或数组。

Show the code

z <- 1:6
dim(z) <- c(3, 2, 1)
z
#> , , 1
#> 
#>      [,1] [,2]
#> [1,]    1    4
#> [2,]    2    5
#> [3,]    3    6
class(z)
#> [1] "array"
attributes(z)
#> $dim
#> [1] 3 2 1
dim(z)
#> [1] 3 2 1

对于列表，dimension 属性可用于创建列表矩阵或列表数组：

Show the code

l <- list(1:5, "a", TRUE, 1.0)
dim(l) <- c(2, 2)
l
#>      [,1]      [,2]
#> [1,] integer,5 TRUE
#> [2,] "a"       1
class(l)
#> [1] "matrix" "array"
attributes(l)
#> $dim
#> [1] 2 2
l[[1,1]]
#> [1] 1 2 3 4 5

向量	矩阵	数组
names()	rownames(),colnames()	dimnames()
length()	nrow(),ncol()	dim()
c()	rbind(),cbind()	abind::abind()
—	t()	aperm()
is.null(dim(x))	is.matrix()	is.array()

3.2 原子向量

3.2.1 vector

向量是一组有序元素的集合。一个 vector 可以包含任意数量的元素。但是，向量的所有元素必须属于同一类型。

Show the code

# A single value is already a vector of length one.
"a" == c("a")
#> [1] TRUE
is.vector("a")
#> [1] TRUE

# c() combines its arguments into a vector or a list.

c("a", "b", "c")
#> [1] "a" "b" "c"
# Combining lists yields a list, which is.vector() also reports as a vector.
# Spell out TRUE: T is an ordinary variable that can be reassigned.
c(list(1), list(TRUE)) |> is.vector()
#> [1] TRUE

3.2.2 matrices

矩阵是一个具有维度属性（dim）的原子向量，所有元素必须是同一类型。

matrix(data, nrow = 1, ncol = 1, byrow = FALSE, dimnames = list(rnames, cnames))

Show the code

# Cell values, supplied in row-major order (see byrow = TRUE below).
num <- c(16, 22, 24, 28)
# Labels for the two rows and the two columns.
rnames <- c("R1", "R2")
cnames <- c("C1", "C2")
# Assemble a 2 x 2 matrix, filling row by row and attaching the labels.
m <- matrix(
  data = num,
  nrow = 2,
  ncol = 2,
  byrow = TRUE,
  dimnames = list(rnames, cnames)
)
m
#>    C1 C2
#> R1 16 22
#> R2 24 28
class(m)
#> [1] "matrix" "array"
attributes(m)
#> $dim
#> [1] 2 2
#> 
#> $dimnames
#> $dimnames[[1]]
#> [1] "R1" "R2"
#> 
#> $dimnames[[2]]
#> [1] "C1" "C2"
dim(m)
#> [1] 2 2
rownames(m)
#> [1] "R1" "R2"
colnames(m)
#> [1] "C1" "C2"

3.2.2.1 稀疏矩阵

Sparse matrix

稀疏矩阵的典型构造方式是通过三元组 (i, j, x)：行下标 i、列下标 j 以及对应位置上的非零值 x。

Show the code

library(Matrix)
i <- c(1, 3:8) # 行指标
j <- c(2, 9, 6:10) # 列指标
x <- 7 * (1:7) # 数据
sparseMatrix(i, j, x = x)
#> 8 x 10 sparse Matrix of class "dgCMatrix"
#>                              
#> [1,] . 7 . . .  .  .  .  .  .
#> [2,] . . . . .  .  .  .  .  .
#> [3,] . . . . .  .  .  . 14  .
#> [4,] . . . . . 21  .  .  .  .
#> [5,] . . . . .  . 28  .  .  .
#> [6,] . . . . .  .  . 35  .  .
#> [7,] . . . . .  .  .  . 42  .
#> [8,] . . . . .  .  .  .  . 49

稀疏矩阵对象仅存储非零元素，更节省内存

Show the code


# Size of the square identity matrix used for the comparison.
N <- 100

# Dense storage: diag() builds the full N x N matrix.
m <- diag(1, N, N)
# Sparse storage: only the N diagonal entries are given as (i, j, x) triplets.
sp <- sparseMatrix(seq_len(N), seq_len(N), x = 1)

# Compare the memory footprint of the two representations.
object.size(m)
#> 80216 bytes
object.size(sp)
#> 3104 bytes

3.2.3 array

数组也是一个具有维度属性（dim）的原子向量，所有元素必须是同一类型。

array(data, dim = c(...), dimnames = list(dim1, dim2, ...))，其中 dim 是给出各维度长度的数值向量。

Show the code

# Values that fill the array in column-major order.
v <- 1:24
# One label vector per dimension: rows, columns, and layers.
dim1 <- c("A1", "A2", "A3")
dim2 <- c("B1", "B2", "B3", "B4")
dim3 <- c("C1", "C2")
# Build a 3 x 4 x 2 array and attach the labels to each dimension.
array_3d <- array(
  data = v,
  dim = c(3, 4, 2),
  dimnames = list(dim1, dim2, dim3)
)
array_3d
#> , , C1
#> 
#>    B1 B2 B3 B4
#> A1  1  4  7 10
#> A2  2  5  8 11
#> A3  3  6  9 12
#> 
#> , , C2
#> 
#>    B1 B2 B3 B4
#> A1 13 16 19 22
#> A2 14 17 20 23
#> A3 15 18 21 24
class(array_3d)
#> [1] "array"
attributes(array_3d)
#> $dim
#> [1] 3 4 2
#> 
#> $dimnames
#> $dimnames[[1]]
#> [1] "A1" "A2" "A3"
#> 
#> $dimnames[[2]]
#> [1] "B1" "B2" "B3" "B4"
#> 
#> $dimnames[[3]]
#> [1] "C1" "C2"
dim(array_3d)
#> [1] 3 4 2
dimnames(array_3d)
#> [[1]]
#> [1] "A1" "A2" "A3"
#> 
#> [[2]]
#> [1] "B1" "B2" "B3" "B4"
#> 
#> [[3]]
#> [1] "C1" "C2"

3.3 S3类原子向量

要保留其他属性，需要创建S3 类。

base R 中使用的四个重要的 S3类原子向量：

分类数据，其中值来自factor向量中记录的一组固定水平。
日期（具有日期分辨率），记录在Date向量中。
日期时间（具有秒或亚秒分辨率），存储在 POSIXct 向量中。
持续时间，存储在difftime向量中。

3.3.1 factor

因子，分类变量，建立在具有两个属性（class，levels）的 integer向量之上。

factor(x, levels = c(v1, v2, …), labels = levels, ordered = FALSE)：因子在内存中以整数编码 1, 2, …, k（k 为水平数）存储，每个整数对应 levels 中的一个水平。

Show the code

# 存储形式
x <- factor(c("a", "b", "b", "a"))
x
#> [1] a b b a
#> Levels: a b
typeof(x)
#> [1] "integer"
attributes(x)
#> $levels
#> [1] "a" "b"
#> 
#> $class
#> [1] "factor"
levels(x)
#> [1] "a" "b"
class(x)
#> [1] "factor"


# 名义变量 nominal variable
diabetes<-c("t1","t2","t1","t1") 
attributes(diabetes)
#> NULL
diabetes<-factor(diabetes)
attributes(diabetes)
#> $levels
#> [1] "t1" "t2"
#> 
#> $class
#> [1] "factor"


# 顺序变量 ordinal variable     默认水平根据字母顺序而定
status<-c("poor","better","best","poor")
status<-factor(status,ordered = TRUE) 
str(status) 
#>  Ord.factor w/ 3 levels "best"<"better"<..: 3 2 1 3
status<-factor(status,ordered =TRUE,levels = c("poor","better","best")) 
str(status) 
#>  Ord.factor w/ 3 levels "poor"<"better"<..: 1 2 3 1


#改变外在标签
sex<-c(1,2,2,1)
sex
#> [1] 1 2 2 1
sex<-factor(sex,levels=c(1,2),labels = c("男","女")) 
str(sex) 
#>  Factor w/ 2 levels "男","女": 1 2 2 1
sex
#> [1] 男 女 女 男
#> Levels: 男 女


age <- c(29, 44, 45, 68, 99)
# 连续型变量→因子
cut(
    age,
    breaks = c(0, 18, 45, 65, Inf),
    labels = c("minor", "young", "middle_age", "elder"),
    include.lowest = TRUE,
    right = TRUE
)
#> [1] young young young elder elder
#> Levels: minor young middle_age elder

3.3.2 Date

日期向量建立在 double 向量之上，具有class "Date"属性。

日期默认格式："%Y-%m-%d" xxxx-xx-xx,例如：2023-03-15

Show the code

today <- Sys.Date()
today
#> [1] "2024-11-07"
typeof(today)
#> [1] "double"
attributes(today)
#> $class
#> [1] "Date"
class(today)
#> [1] "Date"

as.Date(c("02 14-2002","01 04-2013"),"%m %d-%Y") #以"%m %d-%Y"格式读入
#> [1] "2002-02-14" "2013-01-04"
format(Sys.Date(),"%Y/%m/%d") #以"%Y/%m/%d"格式输出
#> [1] "2024/11/07"


# 双精度值（通过剥离类来查看）表示自 1970 年 1 月 1 日以来的天数
date <- as.Date("1970-02-01")
unclass(date)
#> [1] 31

3.3.3 Datetime

POSIXct 和 POSIXlt

“POSIX”是可移植操作系统接口（Portable Operating System Interface）的缩写，这是一个跨平台标准系列。“ct”代表 calendar time（对应 C 中的 time_t 类型），“lt”代表 local time（对应 C 中的 struct tm 类型）。

POSIXct 向量建立在 double向量之上，其中值表示自 1970-01-01 以来的秒数

Show the code

now_ct <- as.POSIXct("2024-04-20 15:45", tz = "Asia/Shanghai")
now_ct
#> [1] "2024-04-20 15:45:00 CST"
typeof(now_ct)
#> [1] "double"
attributes(now_ct)
#> $class
#> [1] "POSIXct" "POSIXt" 
#> 
#> $tzone
#> [1] "Asia/Shanghai"
class(now_ct)
#> [1] "POSIXct" "POSIXt"

3.3.4 Durations

持续时间（表示两个日期或两个日期时间之间的时间量）存储在 difftime 向量中。difftime 建立在 double 向量之上，并带有 units 属性，用于说明数值应按何种时间单位解释。

Show the code

units_1 <- as.difftime(1, units = "weeks")  # units is one of "auto", "secs", "mins", "hours", "days", "weeks"

units_1
#> Time difference of 1 weeks
typeof(units_1)
#> [1] "double"
attributes(units_1)
#> $class
#> [1] "difftime"
#> 
#> $units
#> [1] "weeks"
class(units_1)
#> [1] "difftime"
units(units_1)
#> [1] "weeks"
units(units_1) <- "days"

attributes(units_1)
#> $units
#> [1] "days"
#> 
#> $class
#> [1] "difftime"

3.4 泛型向量

从技术上讲，列表的每个元素实际上是相同的类型，因为每个元素都是对另一个对象的引用（见 Figure 3.1），而被引用的对象可以是任何类型。

3.4.1 list

列表不存储值本身，而是存储对它们的引用：

Show the code

l1 <- list(1,2,3)
typeof(l1)
#> [1] "list"

列表，list(name1=object1,name2=object2,...)

Show the code

# A named list can mix heterogeneous components: a string, a matrix,
# a data frame, and another list.
list1 <- list(
  title = "My list",
  # Character matrix filled row by row, with row and column labels.
  matr = matrix(
    c("a1", "b1", "a2", "b2"),
    nrow = 2,
    ncol = 2,
    byrow = TRUE,
    dimnames = list(c("X1", "X2"), c("Y1", "Y2"))
  ),
  df = data.frame(
    # A one-column matrix is accepted as a data frame column.
    id = matrix(
      c("Lisa", "BOb", "John", "Jule"),
      nrow = 4,
      ncol = 1,
      byrow = TRUE
    ),
    int = c(3, 5, 7, 9),
    # TRUE/FALSE instead of T/F, which are reassignable variables.
    TF = c(TRUE, TRUE, TRUE, FALSE)
  ),
  list = list(a = c(1, 2, 3), b = c("A", "B"))
)
list1
#> $title
#> [1] "My list"
#> 
#> $matr
#>    Y1   Y2  
#> X1 "a1" "b1"
#> X2 "a2" "b2"
#> 
#> $df
#>     id int    TF
#> 1 Lisa   3  TRUE
#> 2  BOb   5  TRUE
#> 3 John   7  TRUE
#> 4 Jule   9 FALSE
#> 
#> $list
#> $list$a
#> [1] 1 2 3
#> 
#> $list$b
#> [1] "A" "B"
typeof(list1)
#> [1] "list"
attributes(list1)
#> $names
#> [1] "title" "matr"  "df"    "list"
class(list1)
#> [1] "list"
names(list1)
#> [1] "title" "matr"  "df"    "list"

3.4.2 data frame/tibble

建立在"list"之上的两个最重要的 S3 类是data.frame 和 tibble

数据框是由等长列向量组成的命名列表，带有 names、row.names 和 class = "data.frame" 三个属性；它是一种特殊的列表，要求每一列的长度必须相同。

Show the code

# Numeric literals cannot carry leading zeros (001231 is parsed as 1231), so
# the IDs are written as the numbers R actually stores. Use character strings
# such as "001231" if zero-padded identifiers are required.
id <- c(1231, 1241, 1413, 1244)
age <- c(21, 14, 52, 15)
diabetes <- c("t1", "t2", "t1", "t1")
status <- c("poor", "better", "best", "poor")
# Combine the four column vectors into a data frame with explicit row names.
df1 <- data.frame(
  patientID = id,
  age,
  diabetes,
  status,
  row.names = c(1, 2, 3, 4),
  stringsAsFactors = FALSE
)
df1

patientID	age	diabetes	status
1231	21	t1	poor
1241	14	t2	better
1413	52	t1	best
1244	15	t1	poor

Show the code

typeof(df1)
#> [1] "list"
attributes(df1)
#> $names
#> [1] "patientID" "age"       "diabetes"  "status"   
#> 
#> $class
#> [1] "data.frame"
#> 
#> $row.names
#> [1] "1" "2" "3" "4"
class(df1)
#> [1] "data.frame"
names(df1)
#> [1] "patientID" "age"       "diabetes"  "status"
colnames(df1)
#> [1] "patientID" "age"       "diabetes"  "status"
row.names(df1)
#> [1] "1" "2" "3" "4"
rownames(df1)
#> [1] "1" "2" "3" "4"

library(tibble)
tibble(
  x = c(1, 2, 5), 
  y = c("h", "m", "g"),
  z = c(0.08, 0.83, 0.60)
)

x	y	z
1	h	0.08
2	m	0.83
5	g	0.60

Show the code


tribble(
  ~x, ~y, ~z,
  1, "h", 0.08,
  2, "m", 0.83,
  5, "g", 0.60
)

x	y	z
1	h	0.08
2	m	0.83
5	g	0.60

tibble 与 data frame 共享相同的结构，区别在于：class 属性更多（"tbl_df"、"tbl"、"data.frame"）；不会自动进行强制类型转换；不会自动修改非法列名（而是保留原名，引用时需用反引号括起）；只会循环补齐（recycle）长度为 1 的向量；允许在构造过程中引用前面刚创建的变量。

Show the code

df2 <- tibble(x = 1:3, y = letters[1:3])
typeof(df2)
#> [1] "list"
attributes(df2)
#> $class
#> [1] "tbl_df"     "tbl"        "data.frame"
#> 
#> $row.names
#> [1] 1 2 3
#> 
#> $names
#> [1] "x" "y"


names(tibble(`1` = 1))
#> [1] "1"

tibble(x = 1:4, y = 1)

x	y
1	1
2	1
3	1
4	1

Show the code

tibble(x = 1:4, y = 1:2)
#> Error in `tibble()`:
#> ! Tibble columns must have compatible sizes.
#> • Size 4: Existing data.
#> • Size 2: Column `y`.
#> ℹ Only values of size one are recycled.

tibble(
  x = 1:3,
  y = x * 2
)

x	y
1	2
2	4
3	6

3.4.2.1 行名→列

Show the code

df3 <- data.frame(
  age = c(35, 27, 18),
  hair = c("blond", "brown", "black"),
  row.names = c("Bob", "Susan", "Sam")
)
df3 |> rownames_to_column(var = "name")

name	age	hair
Bob	35	blond
Susan	27	brown
Sam	18	black

Show the code

as_tibble(df3, rownames = "name")

name	age	hair
Bob	35	blond
Susan	27	brown
Sam	18	black

Show the code



is_tibble(df2)
#> [1] TRUE
is_tibble(df3)
#> [1] FALSE
as_tibble(df3)

age	hair
35	blond
27	brown
18	black

3.4.2.2 列表列

Show the code

df <- data.frame(x = 1:3)
df$y <- list(1:2, 1:3, 1:4)
df

x	y
1	1, 2
2	1, 2, 3
3	1, 2, 3, 4

Show the code


data.frame(
  x = 1:3, 
  y = I(list(1:2, 1:3, 1:4))
)

x	y
1	1, 2
2	1, 2, 3
3	1, 2, 3, 4

Show the code


tibble(
  x = 1:3, 
  y = list(1:2, 1:3, 1:4)
)

x	y
1	1, 2
2	1, 2, 3
3	1, 2, 3, 4

3.4.2.3 矩阵和数据框列

矩阵列或数据框列的行数必须与所在数据框的行数相等。

Show the code

dfm <- data.frame(
  x = 1:3 * 10
)
dfm$y <- matrix(1:9, nrow = 3)
dfm$z <- data.frame(a = 3:1, b = letters[1:3], stringsAsFactors = FALSE)

dfm

x	y.1	y.2	y.3	z.a	z.b
10	1	4	7	3	a
20	2	5	8	2	b
30	3	6	9	1	c