4.1 Read

4.1.1 文本格式

常用的分隔符如逗号, ,制表符\t,分号;,空格space,…… 等。

4.1.1.1 utils::

utilities,a piece of computer software that performs a particular task

read.table() read.csv() read.csv2

Show the code
# 列名(第1行),行标识符 (第1列)
read.table("data/read_write/leadership.txt",header = TRUE,sep=",",row.names = 1)
date country gender age q1 q2 q3 q4 q5
Bob 10/24/14 US M 29 5 4 5 5 5
Mary 10/28/14 US F 44 3 5 2 5 5
Lily 10/01/14 UK F 45 3 5 5 5 2
July 10/12/14 UK M 68 3 3 4 NA NA
Frank 05/01/14 UK F 99 2 2 1 2 1

4.1.1.2 readr::

read_csv() read_tsv() read_delim()

Show the code
readr::read_csv("data/read_write/leadership.txt",
                col_names = TRUE,
                id = "file_path") |>
    column_to_rownames(var = "id")
file_path date country gender age q1 q2 q3 q4 q5
Bob data/read_write/leadership.txt 10/24/14 US M 29 5 4 5 5 5
Mary data/read_write/leadership.txt 10/28/14 US F 44 3 5 2 5 5
Lily data/read_write/leadership.txt 10/01/14 UK F 45 3 5 5 5 2
July data/read_write/leadership.txt 10/12/14 UK M 68 3 3 4 NA NA
Frank data/read_write/leadership.txt 05/01/14 UK F 99 2 2 1 2 1

4.1.1.3 批量读取csv - 相同格式

Show the code
sales_files <- c("data/read_write//01-sales.csv",
                     "data/read_write//02-sales.csv",
                 "data/read_write//03-sales.csv")
read_csv(sales_files, id = "file_path")
file_path month year brand item n
data/read_write//01-sales.csv January 2019 1 1234 3
data/read_write//01-sales.csv January 2019 1 8721 9
data/read_write//01-sales.csv January 2019 1 1822 2
data/read_write//01-sales.csv January 2019 2 3333 1
data/read_write//01-sales.csv January 2019 2 2156 9
data/read_write//01-sales.csv January 2019 2 3987 6
data/read_write//01-sales.csv January 2019 2 3827 6
data/read_write//02-sales.csv February 2019 1 1234 8
data/read_write//02-sales.csv February 2019 1 8721 2
data/read_write//02-sales.csv February 2019 1 1822 3
data/read_write//02-sales.csv February 2019 2 3333 1
data/read_write//02-sales.csv February 2019 2 2156 3
data/read_write//02-sales.csv February 2019 2 3987 6
data/read_write//03-sales.csv March 2019 1 1234 3
data/read_write//03-sales.csv March 2019 1 3627 1
data/read_write//03-sales.csv March 2019 1 8820 3
data/read_write//03-sales.csv March 2019 2 7253 1
data/read_write//03-sales.csv March 2019 2 8766 3
data/read_write//03-sales.csv March 2019 2 8288 6

4.1.2 Excel

4.1.2.1 批量读取 sheet

Show the code
library(readxl)
filename <- "data/read_write/multi-sheet-iris_data.xlsx"

excel_sheets(filename)
#> [1] "iris"       "setosa"     "versicolor" "virginica"

excel_sheets(filename) %>% 
    map(~read_excel(filename, sheet = .x)) %>% 
    set_names(excel_sheets(filename)) %>% map(head)
#> $iris
#> # A tibble: 6 × 6
#>   id    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#>   <chr>        <dbl>       <dbl>        <dbl>       <dbl> <chr>  
#> 1 1              5.1         3.5          1.4         0.2 setosa 
#> 2 2              4.9         3            1.4         0.2 setosa 
#> 3 3              4.7         3.2          1.3         0.2 setosa 
#> 4 4              4.6         3.1          1.5         0.2 setosa 
#> 5 5              5           3.6          1.4         0.2 setosa 
#> 6 6              5.4         3.9          1.7         0.4 setosa 
#> 
#> $setosa
#> # A tibble: 6 × 6
#>   id    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#>   <chr>        <dbl>       <dbl>        <dbl>       <dbl> <chr>  
#> 1 1              5.1         3.5          1.4         0.2 setosa 
#> 2 2              4.9         3            1.4         0.2 setosa 
#> 3 3              4.7         3.2          1.3         0.2 setosa 
#> 4 4              4.6         3.1          1.5         0.2 setosa 
#> 5 5              5           3.6          1.4         0.2 setosa 
#> 6 6              5.4         3.9          1.7         0.4 setosa 
#> 
#> $versicolor
#> # A tibble: 6 × 6
#>   id    Sepal.Length Sepal.Width Petal.Length Petal.Width Species   
#>   <chr>        <dbl>       <dbl>        <dbl>       <dbl> <chr>     
#> 1 51             7           3.2          4.7         1.4 versicolor
#> 2 52             6.4         3.2          4.5         1.5 versicolor
#> 3 53             6.9         3.1          4.9         1.5 versicolor
#> 4 54             5.5         2.3          4           1.3 versicolor
#> 5 55             6.5         2.8          4.6         1.5 versicolor
#> 6 56             5.7         2.8          4.5         1.3 versicolor
#> 
#> $virginica
#> # A tibble: 6 × 6
#>   id    Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
#>   <chr>        <dbl>       <dbl>        <dbl>       <dbl> <chr>    
#> 1 101            6.3         3.3          6           2.5 virginica
#> 2 102            5.8         2.7          5.1         1.9 virginica
#> 3 103            7.1         3            5.9         2.1 virginica
#> 4 104            6.3         2.9          5.6         1.8 virginica
#> 5 105            6.5         3            5.8         2.2 virginica
#> 6 106            7.6         3            6.6         2.1 virginica
Show the code
read_excel_sheets <- function(filename, single_tbl = FALSE) {
  sheets <- readxl::excel_sheets(filename)
  
  if (single_tbl){ #指示函数应返回单个表还是表的列表的逻辑值
    x <- purrr::map_df(sheets, readxl::read_excel, path = filename)
  } else {
    x <- purrr::map(sheets, ~ readxl::read_excel(filename, sheet = .x))
    purrr::set_names(x, sheets)
  }
  
  x
}


read_excel_sheets(filename, F) %>% map(head)
#> [[1]]
#> # A tibble: 6 × 6
#>   id    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#>   <chr>        <dbl>       <dbl>        <dbl>       <dbl> <chr>  
#> 1 1              5.1         3.5          1.4         0.2 setosa 
#> 2 2              4.9         3            1.4         0.2 setosa 
#> 3 3              4.7         3.2          1.3         0.2 setosa 
#> 4 4              4.6         3.1          1.5         0.2 setosa 
#> 5 5              5           3.6          1.4         0.2 setosa 
#> 6 6              5.4         3.9          1.7         0.4 setosa 
#> 
#> [[2]]
#> # A tibble: 6 × 6
#>   id    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#>   <chr>        <dbl>       <dbl>        <dbl>       <dbl> <chr>  
#> 1 1              5.1         3.5          1.4         0.2 setosa 
#> 2 2              4.9         3            1.4         0.2 setosa 
#> 3 3              4.7         3.2          1.3         0.2 setosa 
#> 4 4              4.6         3.1          1.5         0.2 setosa 
#> 5 5              5           3.6          1.4         0.2 setosa 
#> 6 6              5.4         3.9          1.7         0.4 setosa 
#> 
#> [[3]]
#> # A tibble: 6 × 6
#>   id    Sepal.Length Sepal.Width Petal.Length Petal.Width Species   
#>   <chr>        <dbl>       <dbl>        <dbl>       <dbl> <chr>     
#> 1 51             7           3.2          4.7         1.4 versicolor
#> 2 52             6.4         3.2          4.5         1.5 versicolor
#> 3 53             6.9         3.1          4.9         1.5 versicolor
#> 4 54             5.5         2.3          4           1.3 versicolor
#> 5 55             6.5         2.8          4.6         1.5 versicolor
#> 6 56             5.7         2.8          4.5         1.3 versicolor
#> 
#> [[4]]
#> # A tibble: 6 × 6
#>   id    Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
#>   <chr>        <dbl>       <dbl>        <dbl>       <dbl> <chr>    
#> 1 101            6.3         3.3          6           2.5 virginica
#> 2 102            5.8         2.7          5.1         1.9 virginica
#> 3 103            7.1         3            5.9         2.1 virginica
#> 4 104            6.3         2.9          5.6         1.8 virginica
#> 5 105            6.5         3            5.8         2.2 virginica
#> 6 106            7.6         3            6.6         2.1 virginica

read_excel_sheets(filename, T) %>% head()
id Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa

4.1.2.2 批量读取多个Excel

Show the code

excel_file_paths <- list.files(path = ".",pattern = ".xlsx",recursive = T)

excel_file_paths
#> [1] "data/read_write/multi-sheet-iris_data.xlsx"


map(excel_file_paths, read_excel) %>% map(head)
#> [[1]]
#> # A tibble: 6 × 6
#>   id    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#>   <chr>        <dbl>       <dbl>        <dbl>       <dbl> <chr>  
#> 1 1              5.1         3.5          1.4         0.2 setosa 
#> 2 2              4.9         3            1.4         0.2 setosa 
#> 3 3              4.7         3.2          1.3         0.2 setosa 
#> 4 4              4.6         3.1          1.5         0.2 setosa 
#> 5 5              5           3.6          1.4         0.2 setosa 
#> 6 6              5.4         3.9          1.7         0.4 setosa

4.1.3 脚本载入

Show the code
source("data/read_write/add_function.R")
add(c(1,24,4),4)
#> [1]  5 28  8
Note

add_function

Show the code
add <- function(x,y){
  return(x+y)
}

4.1.4 二进制格式

RDS,R’s custom binary format called RDS

read_rds()

Show the code
read_rds("data/read_write/leadership.rds")
date country gender age q1 q2 q3 q4 q5
Bob 10/24/14 US M 29 5 4 5 5 5
Mary 10/28/14 US F 44 3 5 2 5 5
Lily 10/01/14 UK F 45 3 5 5 5 2
July 10/12/14 UK M 68 3 3 4 NA NA
Frank 05/01/14 UK F 99 2 2 1 2 1

Parquet,a fast binary and columnar storage file format(列式存储文件)

arrow

Show the code
library(arrow)
read_parquet(file = "data/read_write/leadership.parquet" )
date country gender age q1 q2 q3 q4 q5
10/24/14 US M 29 5 4 5 5 5
10/28/14 US F 44 3 5 2 5 5
10/01/14 UK F 45 3 5 5 5 2
10/12/14 UK M 68 3 3 4 NA NA
05/01/14 UK F 99 2 2 1 2 1

4.1.5 数据库

Show the code

4.1.6 等宽格式

readr::read_fwf()

Show the code
' First     Last  Sex Number
 Currer     Bell    F      2
    Dr.    Seuss    M     49
    ""   Student   NA     21' |> read_fwf()
X1 X2 X3
First Last Sex Number
Currer Bell F 2
Dr. Seuss M 49
“” Student NA 21

4.1.7 网络下载

Show the code
url = "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/co2.csv"
download.file(url, "data/read_write/网络下载.csv")
Show the code
read_csv("data/read_write/网络下载.csv") %>% head()
rownames time value
1 1959.000 315.42
2 1959.083 316.31
3 1959.167 316.50
4 1959.250 317.56
5 1959.333 318.13
6 1959.417 318.00

4.1.8 JSON

javascript object notation

Show the code
library(jsonlite)
read_json(path = "data/read_write/JSON.json",simplifyVector = TRUE)
#> $author
#> [1] "wal"
#> 
#> $description
#> [1] ""
#> 
#> $id
#> [1] 1.71086e+15
#> 
#> $title
#> [1] "3龙门币3赤金"
#> 
#> $buildingType
#> [1] 243
#> 
#> $planTimes
#> [1] "3班"
#> 
#> $plans
#>    name description Fiammetta.enable Fiammetta.target Fiammetta.order
#> 1 A白班                         TRUE             巫恋             pre
#> 2 B夜班                         TRUE           龙舌兰             pre
#>   drones.room drones.index drones.enable drones.order
#> 1     trading            1          TRUE          pre
#> 2     trading            1          TRUE          pre
#>                                                                                                                                      rooms.trading
#> 1         FALSE, FALSE, FALSE, LMD, LMD, LMD, 巫恋, 龙舌兰, 柏喙, 孑, 银灰, 琳琅诗怀雅, 空弦, 但书, 伺夜, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
#> 2 FALSE, FALSE, FALSE, LMD, LMD, LMD, 巫恋, 龙舌兰, 卡夫卡, 能天使, 德克萨斯, 拉普兰德, 黑键, 慕斯, 石英, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
#>                                                                                                                                              rooms.manufacture
#> 1         FALSE, FALSE, FALSE, Pure Gold, Pure Gold, Pure Gold, 槐琥, 至简, 清流, 苍苔, 斑点, 夜烟, 温蒂, 森蚺, 异客, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
#> 2 FALSE, FALSE, FALSE, Pure Gold, Pure Gold, Pure Gold, 淬羽赫默, 多萝西, 白面鸮, 砾, 迷迭香, 香草, 火神, 贝娜, 泡泡, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
#>                                                                               rooms.power
#> 1 FALSE, FALSE, FALSE, 承曦格雷伊, 澄闪, 格雷伊, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
#> 2       FALSE, FALSE, FALSE, 缪尔赛思, 雷蛇, 黍, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
#>                                                                                  rooms.dormitory
#> 1 FALSE, FALSE, FALSE, FALSE, 菲亚梅塔, 重岳, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE
#> 2 FALSE, FALSE, FALSE, FALSE, 菲亚梅塔, 塑心, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE
#>                                                     rooms.control
#> 1 FALSE, 阿米娅, 麒麟R夜刀, 火龙S黑角, 灵知, 玛恩纳, FALSE, FALSE
#> 2             FALSE, 重岳, 诗怀雅, 凯尔希, 令, 琴柳, FALSE, FALSE
#>                       rooms.meeting                    rooms.hire
#> 1     FALSE, 陈, 白雪, FALSE, FALSE FALSE, 艾雅法拉, FALSE, FALSE
#> 2 FALSE, 远山, 塞雷娅, FALSE, FALSE     FALSE, 絮雨, FALSE, FALSE
#>          rooms.processing                     period
#> 1     FALSE, FALSE, FALSE               07:00, 21:59
#> 2 FALSE, 年, FALSE, FALSE 22:00, 00:00, 23:59, 06:59
#> 
#> $scheduleType
#> $scheduleType$planTimes
#> [1] 2
#> 
#> $scheduleType$trading
#> [1] 3
#> 
#> $scheduleType$manufacture
#> [1] 3
#> 
#> $scheduleType$power
#> [1] 3
#> 
#> $scheduleType$dormitory
#> [1] 4

4.1.9 SPSS/SAS/stata DTA

read_sas(file)

read_dta(file)

read_stata(file)

Show the code
library(haven)
sav <- read_spss(file = "data/read_write/非正态分布的均值比较.sav")

table(sav$group)
#> 
#>   1   4 
#> 252 957
read_sav(file = "data/read_write/非正态分布的均值比较.sav") %>% head()
x group
11989.43 1
35152.95 1
45632.33 1
12113.40 1
3997.77 1
209557.65 1

4.2 Write

4.2.1 文本输出

Show the code
sink("data/read_write/sink_output.txt",append = F,split = T)  # split =T 输出到控制台和文件
cat(letters)
#> a b c d e f g h i j k l m n o p q r s t u v w x y z
sink(NULL)

4.2.2 图片输出

Rstudio 直接 Export save as Image/PDF,copy to clipboard

Show the code

# bmp,jpeg,png,SVG
svg(file = "data/read_write/svg_output.svg")       
plot(1:25,1:25,pch=1:25,col="green",bg="red",type = "b")
while(!is.null(dev.list()))  dev.off() 

4.2.3 Excel

Show the code
library(writexl)
library(dplyr)

df <- iris %>% as_tibble(rownames = "id")
df %>% head()
id Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
Show the code
l <- df  |>
  group_split(Species)

names(l) <- sapply(unique(iris$Species), as.character)

lt <- c(iris = list(df),l)

write_xlsx(lt, path = "data/read_write/multi-sheet-iris_data.xlsx")