9  字符文本的操纵

9.1 stringr

相关包:stringi + stringr + glue + stringx

Show the code
x <- c("apple", "banana", "pear", "why")

# Concatenation: collapse a character vector into one string
str_c(x, collapse = "; ")
#> [1] "apple; banana; pear; why"
paste0("Hello ", c("John", "Susan"))
#> [1] "Hello John"  "Hello Susan"
# paste() with sep = "" joins the pieces without a separator, like paste0();
# collapse then merges the resulting vector into a single string.
paste("x", c("a", "b"), sep = "", collapse = "?")
#> [1] "xa?xb"
paste0("x", c("A", "B"), collapse = "?")
#> [1] "xA?xB"
cat("hello", "BOb", "\b\n", "\bIsn\' R", "\t", "GREAT?\n", sep = " ")
#> hello BOb 
#>  Isn' R      GREAT?

# Length, substring, and duplication
str_length(x)
#> [1] 5 6 4 3
str_sub(x, start = 1, end = 2)
#> [1] "ap" "ba" "pe" "wh"
str_dup(x, times = 2)
#> [1] "appleapple"   "bananabanana" "pearpear"     "whywhy"


# Whitespace: padding and trimming
str_pad(x, width = 10, side = "both")
#> [1] "  apple   " "  banana  " "   pear   " "   why    "
x <- c("  a   ", "b   ", "   c")
str_trim(x, side = "left")
#> [1] "a   " "b   " "c"

# Build one long string from several pieces; str_c() adds no separator here.
jabberwocky <- str_c(
  "`Twas brillig, and the slithy toves ",
  "did gyre and gimble in the wabe: ",
  "All mimsy were the borogoves, ",
  "and the mome raths outgrabe. "
)
# str_wrap() returns a single string with "\n" inserted so lines fit the width.
str_wrap(jabberwocky, width = 40)
#> [1] "`Twas brillig, and the slithy toves did\ngyre and gimble in the wabe: All mimsy\nwere the borogoves, and the mome raths\noutgrabe."
# cat() renders the embedded "\n" as real line breaks.
cat(str_wrap(jabberwocky, width = 40))
#> `Twas brillig, and the slithy toves did
#> gyre and gimble in the wabe: All mimsy
#> were the borogoves, and the mome raths
#> outgrabe.

# Truncation: shorten to 20 characters, with the "..." placed on the
# right, on the left, or in the center of the string
x <- "This string is moderately long"
rbind(
  str_trunc(x, 20, "right"),
  str_trunc(x, 20, "left"),
  str_trunc(x, 20, "center")
)
#>      [,1]                  
#> [1,] "This string is mo..."
#> [2,] "...s moderately long"
#> [3,] "This stri...ely long"

# Case conversion (locale sensitive)
x <- "I like horses."
str_to_upper(x)
#> [1] "I LIKE HORSES."
str_to_title(x)
#> [1] "I Like Horses."
str_to_lower(x)
#> [1] "i like horses."
# With the Turkish locale, upper-case "I" lower-cases to the dotless "ı".
str_to_lower(x, locale = "tr")
#> [1] "ı like horses."

# Ordering and sorting
x <- c("y", "i", "k")
str_order(x)
#> [1] 2 3 1

str_sort(x, locale = "en")
#> [1] "i" "k" "y"

9.2 匹配正则表达式

Show the code
# Sample inputs: no phone number, one number (two formats), and two numbers.
strings <- c(
  "apple", 
  "219 733 8965", 
  "329-293-8753", 
  "Work: 579-499-7527; Home: 543.355.3679"
)
# Phone pattern with three capture groups: 3 digits (the first one 2-9),
# 3 digits, 4 digits, each pair separated by one of "-", " " or ".".
phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"
# Detect: one logical per string (stringr first, then the base R equivalent)
str_detect(strings, phone)
#> [1] FALSE  TRUE  TRUE  TRUE
grepl(pattern = phone, x = strings)
#> [1] FALSE  TRUE  TRUE  TRUE

# Subset: keep the strings that contain a match
# (stringr first, then the base R equivalent)
str_subset(strings, phone)
#> [1] "219 733 8965"                          
#> [2] "329-293-8753"                          
#> [3] "Work: 579-499-7527; Home: 543.355.3679"
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
grep(pattern = phone, x = strings, value = TRUE)
#> [1] "219 733 8965"                          
#> [2] "329-293-8753"                          
#> [3] "Work: 579-499-7527; Home: 543.355.3679"

# Indices of the strings that contain a match (stringr, then base R)
str_which(strings, phone)
#> [1] 2 3 4
grep(pattern = phone, x = strings)
#> [1] 2 3 4


# Count: number of matches in each string
str_count(strings, phone)
#> [1] 0 1 1 2

# Locate: start/end position of the first match (NA when there is none)
str_locate(strings, phone)
#>      start end
#> [1,]    NA  NA
#> [2,]     1  12
#> [3,]     1  12
#> [4,]     7  18
# All matches: a list holding one start/end matrix per input string
str_locate_all(strings, phone)
#> [[1]]
#>      start end
#> 
#> [[2]]
#>      start end
#> [1,]     1  12
#> 
#> [[3]]
#>      start end
#> [1,]     1  12
#> 
#> [[4]]
#>      start end
#> [1,]     7  18
#> [2,]    27  38

# Extract: the text of the first match (NA when there is none)
str_extract(strings, phone)
#> [1] NA             "219 733 8965" "329-293-8753" "579-499-7527"
# Every match, as a list with one character vector per input string
str_extract_all(strings, phone)
#> [[1]]
#> character(0)
#> 
#> [[2]]
#> [1] "219 733 8965"
#> 
#> [[3]]
#> [1] "329-293-8753"
#> 
#> [[4]]
#> [1] "579-499-7527" "543.355.3679"
# simplify = TRUE returns a character matrix instead, padded with ""
str_extract_all(strings, phone, simplify = TRUE)
#>      [,1]           [,2]          
#> [1,] ""             ""            
#> [2,] "219 733 8965" ""            
#> [3,] "329-293-8753" ""            
#> [4,] "579-499-7527" "543.355.3679"

# Capture groups (): column 1 is the whole match, then one column per group
str_match(strings, phone)
#>      [,1]           [,2]  [,3]  [,4]  
#> [1,] NA             NA    NA    NA    
#> [2,] "219 733 8965" "219" "733" "8965"
#> [3,] "329-293-8753" "329" "293" "8753"
#> [4,] "579-499-7527" "579" "499" "7527"
# All matches: a list of matrices, one row per match
str_match_all(strings, phone)
#> [[1]]
#>      [,1] [,2] [,3] [,4]
#> 
#> [[2]]
#>      [,1]           [,2]  [,3]  [,4]  
#> [1,] "219 733 8965" "219" "733" "8965"
#> 
#> [[3]]
#>      [,1]           [,2]  [,3]  [,4]  
#> [1,] "329-293-8753" "329" "293" "8753"
#> 
#> [[4]]
#>      [,1]           [,2]  [,3]  [,4]  
#> [1,] "579-499-7527" "579" "499" "7527"
#> [2,] "543.355.3679" "543" "355" "3679"

# Replace: str_replace() changes only the first match in each string
str_replace(strings, phone, replacement = "XXX-XXX-XXXX")
#> [1] "apple"                                 
#> [2] "XXX-XXX-XXXX"                          
#> [3] "XXX-XXX-XXXX"                          
#> [4] "Work: XXX-XXX-XXXX; Home: 543.355.3679"

# str_replace_all() changes every match
str_replace_all(strings, phone, replacement = "XXX-XXX-XXXX")
#> [1] "apple"                                 
#> [2] "XXX-XXX-XXXX"                          
#> [3] "XXX-XXX-XXXX"                          
#> [4] "Work: XXX-XXX-XXXX; Home: XXX-XXX-XXXX"

x <- c("apple", "pear", "banana")
str_remove(x, "[aeiou]")      # remove the first match; same as str_replace(x, pattern, "")
#> [1] "pple"  "par"   "bnana"
str_remove_all(x, "[aeiou]")   # remove every match; same as str_replace_all(x, pattern, "")
#> [1] "ppl" "pr"  "bnn"

# Split: into all pieces (list result), or into at most n pieces (matrix result)
str_split("a-b-c", pattern = "-")
#> [[1]]
#> [1] "a" "b" "c"
str_split_fixed("a-b-c", "-", n = 2)
#>      [,1] [,2] 
#> [1,] "a"  "b-c"

9.3 匹配固定项

Show the code
# Two ways to write the same visible character:
# a1 is the single code point U+00E1; a2 is "a" followed by the combining accent U+0301.
a1 <- "\u00e1"
a2 <- "a\u0301"
c(a1, a2)
#> [1] "á" "á"
# fixed() compares the literal encoded text, so the two forms do not match.
str_detect(a1, fixed(a2))
#> [1] FALSE
# coll() compares using collation rules, under which the two forms are equal.
str_detect(a1, coll(a2))
#> [1] TRUE

# boundary("word") works on word boundaries: split into, count, or extract words
x <- "This is a sentence."
str_split(x, boundary("word"))
#> [[1]]
#> [1] "This"     "is"       "a"        "sentence"
str_count(x, boundary("word"))
#> [1] 4
str_extract_all(x, boundary("word"))
#> [[1]]
#> [1] "This"     "is"       "a"        "sentence"
# An empty pattern splits the string into individual characters
str_split(x, "")
#> [[1]]
#>  [1] "T" "h" "i" "s" " " "i" "s" " " "a" " " "s" "e" "n" "t" "e" "n" "c" "e" "."

# fixed() matches the pattern literally; ignore_case = TRUE makes it case-insensitive
str_view(c("", "a", "."), stringr::fixed("a"))
#> [2] │ <a>
str_view("x X", stringr::fixed("X", ignore_case = TRUE))
#> [1] │ <x> <X>

9.4 str_flatten

Show the code
# One row per (person, fruit) pair
df <- tribble(
  ~name, ~fruit,
  "Carmen", "banana",
  "Carmen", "apple",
  "Marvin", "nectarine",
  "Terence", "cantaloupe",
  "Terence", "papaya",
  "Terence", "mandarin"
)
# Collapse each person's fruits into one string; `last` is the separator
# placed before the final item.
df |>
  group_by(name) |>
  summarize(fruits = str_flatten(fruit, collapse = ", ", last = ", and "))
| name | fruits |
|---|---|
| Carmen | banana, and apple |
| Marvin | nectarine |
| Terence | cantaloupe, papaya, and mandarin |

9.5 str_glue()

Show the code
# Values interpolated into the templates below
name <- "Fred"
age <- 50
anniversary <- as.Date("1991-10-12")

# Each {expression} is evaluated as R code and its value inserted into the text.
# The weekday and month names come from format(), so they follow the session
# locale (the recorded output below was produced under a Chinese locale).
str_glue(
  "My name is {name}, ",
  "my age next year is {age + 1}, ",
  "and my anniversary is {format(anniversary, '%A, %B %d, %Y')}."
)
#> My name is Fred, my age next year is 51, and my anniversary is 星期六, 十月 12, 1991.

# Doubled braces {{ }} produce literal braces instead of interpolating.
str_glue("My name is {name}, not {{name}}.")
#> My name is Fred, not {name}.

9.6 转义(escape)

Show the code
# Escapes in string literals: \' and \" give quote characters, \\ a single backslash
single_quote <- "\'"
double_quote <- '\"'
backslash <- "\\"
x <- c(single_quote, double_quote, backslash)
# Printing shows the escaped representation of each string ...
x
#> [1] "'"  "\"" "\\"

# ... whereas str_view() shows the characters the strings actually contain.
str_view(x)
#> [1] │ '
#> [2] │ "
#> [3] │ \
Show the code
# Other escapes: \n newline, \t tab, \u and \U Unicode code points
x <- c("one\ntwo", "one\ttwo", "\u00b5", "\U0001f604")
x
#> [1] "one\ntwo" "one\ttwo" "µ"        "😄"
# str_view() breaks the line at the newline and marks the tab as {\t}
str_view(x)
#> [1] │ one
#>     │ two
#> [2] │ one{\t}two
#> [3] │ µ
#> [4] │ 😄
Show the code
# A literal "." in a regex is written \. ; inside an R string literal the
# backslash itself has to be escaped, hence "\\."
dot <- "\\."
str_view(dot)
#> [1] │ \.
# Only "a.c" contains a literal dot between "a" and "c"
str_view(c("abc", "a.c", "bef"), "a\\.c")  
#> [2] │ <a.c>

9.7 raw

Show the code
# Raw strings need no escaping of quotes or backslashes.
single_quote <- r"(')"        # r"(...)"; other delimiters: r"[...]", r"{...}", or with dashes, e.g. r"-(...)-", r"---(...)---"
double_quote <- r"["]"
backslash <- r"--(\)--"

x <- c(single_quote, double_quote, backslash)
x
#> [1] "'"  "\"" "\\"
str_view(x)
#> [1] │ '
#> [2] │ "
#> [3] │ \

9.8 正则表达式

9.8.1 直接匹配

Show the code
# Letters in a pattern match themselves: show every element containing "berry".
# NOTE(review): `fruit` is not defined in this chunk; presumably the example
# vector shipped with stringr - confirm.
str_view(fruit, "berry")  
#>  [6] │ bil<berry>
#>  [7] │ black<berry>
#> [10] │ blue<berry>
#> [11] │ boysen<berry>
#> [19] │ cloud<berry>
#> [21] │ cran<berry>
#> [29] │ elder<berry>
#> [32] │ goji <berry>
#> [33] │ goose<berry>
#> [38] │ huckle<berry>
#> [50] │ mul<berry>
#> [70] │ rasp<berry>
#> [73] │ salal <berry>
#> [76] │ straw<berry>

9.8.2 字符类 character classes

Show the code
#   [...]    matches any single character listed inside the brackets
#   [^...]   matches any single character NOT listed inside the brackets
# NOTE(review): `words` is not defined in this chunk; presumably the example
# vector shipped with stringr - confirm.
str_view(words, "[aeiou]x[aeiou]")  # an "x" with a vowel on both sides
#> [284] │ <exa>ct
#> [285] │ <exa>mple
#> [288] │ <exe>rcise
#> [289] │ <exi>st

str_view(words, "[^aeiou]y[^aeiou]") # a "y" with a non-vowel character on both sides
#> [836] │ <sys>tem
#> [901] │ <typ>e

#  -   defines a range: [a-z] matches any lower-case letter, [0-9] any digit
#  \   escapes a special character: [\\^\\-\\]] matches ^, - and ]
x <- "abcd ABCD  12345 -![@#%^."
str_view(x, "[abc\\^\\[]")
#> [1] │ <a><b><c>d ABCD  12345 -!<[>@#%<^>.
str_view(x, "[a-zA-Z]")
#> [1] │ <a><b><c><d> <A><B><C><D>  12345 -![@#%^.
str_view(x, "[^a-z0-9]")
#> [1] │ abcd< ><A><B><C><D>< >< >12345< ><-><!><[><@><#><%><^><.>

9.8.3 量词 Quantifier

Show the code
# {n}    exactly n times
# {n,}   at least n times
# {n,m}  between n and m times
x <- c("1234_abcd  123 a33a bbbc  22_23" , "abb","aab",'a_bbbbb')

str_view(x, "b{2,5}")
#> [1] │ 1234_abcd  123 a33a <bbb>c  22_23
#> [2] │ a<bb>
#> [4] │ a_<bbbbb>
str_view(x, "\\d{2}")
#> [1] │ <12><34>_abcd  <12>3 a<33>a bbbc  <22>_<23>
str_view(x, "\\d{2,}")
#> [1] │ <1234>_abcd  <123> a<33>a bbbc  <22>_<23>
str_view(x, "\\d{2,3}")
#> [1] │ <123>4_abcd  <123> a<33>a bbbc  <22>_<23>



#    ?  0 or 1 time         {0,1}
#    +  1 or more times     {1,}
#    *  0 or more times     {0,}

str_view(x, "ab?")  
#> [1] │ 1234_<ab>cd  123 <a>33<a> bbbc  22_23
#> [2] │ <ab>b
#> [3] │ <a><ab>
#> [4] │ <a>_bbbbb
str_view(x, "ab+")  
#> [1] │ 1234_<ab>cd  123 a33a bbbc  22_23
#> [2] │ <abb>
#> [3] │ a<ab>
str_view(x, "ab*")  
#> [1] │ 1234_<ab>cd  123 <a>33<a> bbbc  22_23
#> [2] │ <abb>
#> [3] │ <a><ab>
#> [4] │ <a>_bbbbb
  
# Greedy quantifiers match as much as possible; all of the quantifiers above are greedy

# Lazy quantifiers: append ? to a quantifier so it matches as little as possible
str_view(x, "\\d{2,3}?")
#> [1] │ <12><34>_abcd  <12>3 a<33>a bbbc  <22>_<23>

str_view(x, "ab??")  # matches just "a"
#> [1] │ 1234_<a>bcd  123 <a>33<a> bbbc  22_23
#> [2] │ <a>bb
#> [3] │ <a><a>b
#> [4] │ <a>_bbbbb
str_view(x, "ab+?")  # matches "ab"
#> [1] │ 1234_<ab>cd  123 a33a bbbc  22_23
#> [2] │ <ab>b
#> [3] │ a<ab>
str_view(x, "ab*?")  # matches just "a"
#> [1] │ 1234_<a>bcd  123 <a>33<a> bbbc  22_23
#> [2] │ <a>bb
#> [3] │ <a><a>b
#> [4] │ <a>_bbbbb

9.8.4 元字符(meta-characters)

Show the code
# \d matches any digit; \D matches anything that is not a digit
str_view(x, "\\d+")
#> [1] │ <1234>_abcd  <123> a<33>a bbbc  <22>_<23>
str_view(x, "\\D+")
#> [1] │ 1234<_abcd  >123< a>33<a bbbc  >22<_>23
#> [2] │ <abb>
#> [3] │ <aab>
#> [4] │ <a_bbbbb>

# \s matches any whitespace (e.g. space, tab, newline); \S matches anything that is not whitespace
str_view(x, "\\s+")
#> [1] │ 1234_abcd<  >123< >a33a< >bbbc<  >22_23
str_view(x, "\\S+")
#> [1] │ <1234_abcd>  <123> <a33a> <bbbc>  <22_23>
#> [2] │ <abb>
#> [3] │ <aab>
#> [4] │ <a_bbbbb>

# \w matches any "word" character (letters, digits, underscore); \W matches any non-word character
str_view(x, "\\w+")
#> [1] │ <1234_abcd>  <123> <a33a> <bbbc>  <22_23>
#> [2] │ <abb>
#> [3] │ <aab>
#> [4] │ <a_bbbbb>
str_view(x, "\\W+")
#> [1] │ 1234_abcd<  >123< >a33a< >bbbc<  >22_23


#   .        matches any character except \n
str_view(fruit, "a...e")
#>  [1] │ <apple>
#>  [7] │ bl<ackbe>rry
#> [48] │ mand<arine>
#> [51] │ nect<arine>
#> [62] │ pine<apple>
#> [64] │ pomegr<anate>
#> [70] │ r<aspbe>rry
#> [73] │ sal<al be>rry

锚点(Anchors)

Show the code
# Anchor a match to the start (^) or the end ($) of the string
str_view(fruit, "^a")
#> [1] │ <a>pple
#> [2] │ <a>pricot
#> [3] │ <a>vocado
str_view(fruit, "a$")
#>  [4] │ banan<a>
#> [15] │ cherimoy<a>
#> [30] │ feijo<a>
#> [36] │ guav<a>
#> [56] │ papay<a>
#> [74] │ satsum<a>

str_view(fruit, "apple")
#>  [1] │ <apple>
#> [62] │ pine<apple>
str_view(fruit, "^apple$") # matches only when the whole string is "apple"
#> [1] │ <apple>


#  \b  word boundary: the start or the end of a word
x <- c("summary(x)", "summarize(df)", "rowsum(x)", "sum(x)")
str_view(x, "sum")
#> [1] │ <sum>mary(x)
#> [2] │ <sum>marize(df)
#> [3] │ row<sum>(x)
#> [4] │ <sum>(x)
# No element of x is exactly "sum", so this call shows nothing.
str_view(x, "^sum$")
# With \b on both sides, only the stand-alone word "sum" matches.
str_view(x, "\\bsum\\b") 
#> [4] │ <sum>(x)

9.8.5 零宽匹配

zero-width match

Show the code
# $, ^ and \b match positions rather than characters, so each match is empty
# (displayed as <>). The three patterns are applied to "abc" one by one.
str_view("abc", c("$", "^", "\\b"))
#> [1] │ abc<>
#> [2] │ <>abc
#> [3] │ <>abc<>
# Replacing a zero-width match inserts the replacement at that position.
str_replace_all("abc", c("$", "^", "\\b"), "++")
#> [1] "abc++"   "++abc"   "++abc++"

9.8.6 或运算符

Show the code
# Alternation: | matches any one of the alternatives
str_view(fruit, "apple|melon|nut")
#>  [1] │ <apple>
#> [13] │ canary <melon>
#> [20] │ coco<nut>
#> [52] │ <nut>
#> [62] │ pine<apple>
#> [72] │ rock <melon>
#> [80] │ water<melon>
# Any doubled vowel
str_view(fruit, "aa|ee|ii|oo|uu")
#>  [9] │ bl<oo>d orange
#> [33] │ g<oo>seberry
#> [47] │ lych<ee>
#> [66] │ purple mangost<ee>n