一、安裝加載R包
if(!require("tidyverse"))install.packages("tidyverse")
if(!require("stringr"))install.packages("stringr")
library(tidyverse)
library(stringr)
二、 字符串基礎
2.1 字符串長度
str_length()
e.g.
str_length(c("a", "R for data science", NA))
#> [1] 1 18 NA
2.2 字符串組合
str_c(..., sep = "", collapse = NULL)
2.2.1 組合2個及以上字符串
e.g.1
str_c("x", "y")
#> [1] "xy"
str_c("x", "y", "z")
#> [1] "xyz"
e.g.2
str_c("x", "y", sep = ", ")
#> [1] "x, y"
e.g.3 注意str_c是向量化的,所以有循環補齊功能
str_c("prefix-", c("a", "b", "c"), "-suffix")
#> [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"
2.2.2 將字符向量合并為字符串
str_c(c("x", "y", "z"), collapse = ". ")
#> [1] "x. y. z"
2.3 字符串取子集
str_sub(string, start = 1L, end = -1L)
1、提取字符串的一部分
x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)
#> [1] "App" "Ban" "Pea"
# 負數代表從后往前數
str_sub(x, -3, -1)
#> [1] "ple" "ana" "ear"
2、提取后賦值
str_sub(x, 1, 1) <- str_to_lower(str_sub(x, 1, 1))
x
#> [1] "apple" "banana" "pear"
2.4 練習
str_trim(string, side = c("both", "left", "right"))
作用:修剪字符串中的空格
str_pad(string, width, side = c("left", "right", "both"), pad = " ")
作用:增加字符串中的空格
三、正則表達式
3.1 基礎匹配
重點:
- .可以代表任意字符
- 若想匹配字符.則需要添加轉義符\,所以字符.用正則表達式表示為\.,然后再寫成字符串的形式為"\\."
- 字符\用正則表達式表示為\\,再寫成字符串的形式為"\\\\"
e.g.
str_view(c("abc", "a.c", "bef"), "a\\.c")

3.2 錨點
重點:
- ^從字符串開頭進行匹配
- $從字符串末尾進行匹配
- 可以使用\b來匹配單詞的邊界
3.3 練習
如何匹配字符串"$^$"
思路:
- 你需要一個轉義符號告訴正則表達式:你要的是$和^這兩個字符本身,而不是它們所代表的特殊含義,所以其正則表達式為\$\^\$
- 使用字符串表示正則表達式;而\在字符串中也表示轉義,所以要再加一個轉義符:"\\$\\^\\$"
- 最后加上錨點:"^\\$\\^\\$$"
3.4 字符類與字符選項
重點:
- \d可以匹配任意數字
- \s可以匹配任意空白字符(如空格、制表符和換行符)
- [abc]可以匹配a、b或c
- [^abc]可以匹配除a、b、c外的任意字符
牢記!因為\在字符串中也表示轉義,所以創建\d的正則表達式需要在字符串中對\進行轉義,因此需輸入"\\d"
還可以使用字符選項(|)創建多個可選的模式
注意:|的優先級很低!!!!
abc|xyz匹配的是abc或xyz,而不是abcyz或abxyz
3.5 重複
正則表達式一個強大的功能是可以控制一個模式能夠匹配多少次
重點1:
- ?:0次或1次
- +:1次或多次
- *:0次或多次
注意:
1、只重複其前方的一個字符!!!
2、這些字符優先級非常高
重點2:
- {n}:匹配n次
- {n,}:匹配n次或更多次
- {,m}:最多匹配m次
- {n,m}:匹配n到m次
重點3:
默認的匹配方式是“貪婪的”:正則表達式會匹配盡量長的字符串。
通過在表示重複次數的符號(如?、+、*、{n,m})后添加一個?,可以將匹配方式改為“懶惰的”,即匹配盡量短的字符串。
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
str_view(x, 'C{2,3}?')

3.6 分組與回溯引用
括號的作用:
1、消除複雜表達式中的歧義,闡明優先級
2、定義“分組”信息,同時可以通過回溯引用(如\1、\2等)來引用這些分組。
注意:
1個括號為1組,\1代表回溯引用第1組即第1個括號里的內容,\2代表回溯引用第2組即第2個括號里的內容,\3代表回溯引用第3組即第3個括號里的內容。
str_view(fruit, "(..)\\1", match = TRUE)

四、工具
學習stringr的多個函數,應用正則表達式:
- 確定與某種模式相匹配的字符串:str_detect
- 找出匹配的位置:str_locate
- 提取出匹配的內容:str_extract
- 使用新值替換匹配內容:str_replace
- 基于匹配拆分字符串:str_split
4.1 匹配檢測
想要確定一個字符串向量能否匹配一種模式,使用str_detect函數,它返回一個與輸入向量具有相同長度的邏輯向量。
x <- c("apple", "banana", "pear")
str_detect(x, "e")
#> [1] TRUE FALSE TRUE
用str_detect()的結果做邏輯取子集,與直接使用str_subset()函數可起到一樣的效果
words[str_detect(words, "x$")]
#> [1] "box" "sex" "six" "tax"
str_subset()函數
str_subset(words, "x$")
#> [1] "box" "sex" "six" "tax"
重點:
然而,字符串通常是數據框的一列,此時我們可以用filter操作
df <- tibble(
word = words,
i = seq_along(word)
)
df %>%
filter(str_detect(word, "x$"))
#> # A tibble: 4 x 2
#> word i
#> <chr> <int>
#> 1 box 108
#> 2 sex 747
#> 3 six 772
#> 4 tax 841
str_detect()函數的一種變體是str_count(),后者不是簡單返回是或否,而是返回字符串中匹配的數量
str_count()
x <- c("apple", "banana", "pear")
str_count(x, "a")
#> [1] 1 3 1
str_count可以同mutate()函數一同使用:
計算word數據中元音字母和輔音字母的數量
df %>%
mutate(
vowels = str_count(word, "[aeiou]"),
consonants = str_count(word, "[^aeiou]")
)
#> # A tibble: 980 x 4
#> word i vowels consonants
#> <chr> <int> <int> <int>
#> 1 a 1 1 0
#> 2 able 2 2 2
#> 3 about 3 3 2
#> 4 absolute 4 4 4
#> 5 accept 5 2 4
#> 6 account 6 3 4
#> # … with 974 more rows
4.2 提取匹配內容
利用stringr的內置數據集sentences做練習
length(sentences)
#> [1] 720
head(sentences)
#> [1] "The birch canoe slid on the smooth planks."
#> [2] "Glue the sheet to the dark blue background."
#> [3] "It's easy to tell the depth of a well."
#> [4] "These days a chicken leg is a rare dish."
#> [5] "Rice is often served in round bowls."
#> [6] "The juice of lemons makes fine punch."
1、創建一個顏色向量
colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- str_c(colours, collapse = "|")
colour_match
#> [1] "red|orange|yellow|green|blue|purple"
2、選取包含這個顏色的句子
has_colour <- str_subset(sentences, colour_match)
3、提取這些句子里所包含的顏色,就知道句子里有哪些顏色了
matches <- str_extract(has_colour, colour_match)
head(matches)
#> [1] "blue" "blue" "red" "red" "red" "blue"
注意:
str_extract()只提取了每個句子的第一個匹配!!(一個句子里可能有2個或以上的顏色單詞)
這是stringr函數的一種通用模式,因為單個匹配可以使用更簡單的數據結構。要想得到所有匹配,可以使用str_extract_all()函數,它會返回一個列表
str_extract_all(has_colour, colour_match) %>%
.[1:3]
#[[1]]
#[1] "blue"
#
#[[2]]
#[1] "blue"
#
#[[3]]
#[1] "red"
如果設置參數simplify = TRUE,那么返回一個矩陣,其中較短的匹配會用空字符串""補齊到與最長的匹配相同的長度
x <- c("a", "a b", "a b c")
str_extract_all(x, "[a-z]", simplify = TRUE)
#> [,1] [,2] [,3]
#> [1,] "a" "" ""
#> [2,] "a" "b" ""
#> [3,] "a" "b" "c"
4.3 分組匹配
同樣探索內置數據集sentences,假設我們想從句子中提取出名詞。可以換個思路:一般a/the后面跟的是名詞,所以匹配“a或the + 空格 + 至少1個非空格字符”即可
noun <- "(a|the) ([^ ]+)"
has_noun <- sentences %>%
str_subset(noun) %>%
head(10)
has_noun %>%
str_extract(noun)
#> [1] "the smooth" "the sheet" "the depth" "a chicken" "the parked"
#> [6] "the sun" "the huge" "the ball" "the woman" "a helps"
str_extract()可以給出完整匹配,str_match()則可以給出每個獨立分組(即每個括號里的內容),返回的不是向量而是矩陣,其中第一列為完整匹配,后面的各列是每個分組的匹配
str_match()
has_noun %>%
str_match(noun)
#> [,1] [,2] [,3]
#> [1,] "the smooth" "the" "smooth"
#> [2,] "the sheet" "the" "sheet"
#> [3,] "the depth" "the" "depth"
#> [4,] "a chicken" "a" "chicken"
#> [5,] "the parked" "the" "parked"
#> [6,] "the sun" "the" "sun"
#> [7,] "the huge" "the" "huge"
#> [8,] "the ball" "the" "ball"
#> [9,] "the woman" "the" "woman"
#> [10,] "a helps" "a" "helps"
與str_extract()函數一樣,如果想要找出每個字符串的所有匹配,需要使用str_match_all()
tidyr::extract( data, col, into, regex = "([[:alnum:]]+)", remove = TRUE, convert = FALSE, ...)
如果數據類型是tibble,使用tidyr::extract()更容易,其工作原理與str_match()相似,只是要求為每個分組提供一個名稱,以作為新列放在tibble中
tibble(sentence = sentences) %>%
extract(sentence,
into = c("article", "noun"),
"(a|the) ([^ ]+)",
remove = FALSE)
#> # A tibble: 720 x 3
#> sentence article noun
#> <chr> <chr> <chr>
#> 1 The birch canoe slid on the smooth planks. the smooth
#> 2 Glue the sheet to the dark blue background. the sheet
#> 3 It's easy to tell the depth of a well. the depth
#> 4 These days a chicken leg is a rare dish. a chicken
#> 5 Rice is often served in round bowls. <NA> <NA>
#> 6 The juice of lemons makes fine punch. <NA> <NA>
#> # … with 714 more rows
4.4 替換匹配內容
str_replace()和str_replace_all()函數可以使用新字符串替換匹配內容。
str_replace(string, pattern, replacement)
1、使用固定字符串替換匹配內容
x <- c("apple", "pear", "banana")
str_replace(x, "[aeiou]", "-")
#> [1] "-pple" "p-ar" "b-nana"
str_replace_all(x, "[aeiou]", "-")
#> [1] "-ppl-" "p--r" "b-n-n-"
2、通過提供一個命名向量,使用str_replace_all()函數可以同時執行多個替換:
x <- c("1 house", "2 cars", "3 people")
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
#> [1] "one house" "two cars" "three people"
3、使用回溯引用來插入匹配中的分組,下面代碼交換了第二個和第三個單詞的順序:
sentences %>%
str_replace("([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2") %>%
head(5)
#> [1] "The canoe birch slid on the smooth planks."
#> [2] "Glue sheet the to the dark blue background."
#> [3] "It's to easy tell the depth of a well."
#> [4] "These a days chicken leg is a rare dish."
#> [5] "Rice often is served in round bowls."
4.5 拆分
str_split()函數可以將字符串拆分為多個片段。
str_split(string, pattern, n = Inf, simplify = FALSE)
舉例:
sentences %>%
head(5) %>%
str_split(" ")
#> [[1]]
#> [1] "The" "birch" "canoe" "slid" "on" "the" "smooth"
#> [8] "planks."
#>
#> [[2]]
#> [1] "Glue" "the" "sheet" "to" "the"
#> [6] "dark" "blue" "background."
#>
#> [[3]]
#> [1] "It's" "easy" "to" "tell" "the" "depth" "of" "a" "well."
#>
#> [[4]]
#> [1] "These" "days" "a" "chicken" "leg" "is" "a"
#> [8] "rare" "dish."
#>
#> [[5]]
#> [1] "Rice" "is" "often" "served" "in" "round" "bowls."
因為字符向量的每個分量會包含不同數量的片段,所以str_split()返回一個列表;如果你拆分的是一個長度為1的向量,那么只要簡單地提取列表第一個元素即可:
"a|b|c|d" %>%
str_split("\\|") %>%
.[[1]]
#> [1] "a" "b" "c" "d"
或者將參數修改為simplify = TRUE,此時返回的是一個矩陣
sentences %>%
head(5) %>%
str_split(" ", simplify = TRUE)
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
#> [1,] "The" "birch" "canoe" "slid" "on" "the" "smooth" "planks."
#> [2,] "Glue" "the" "sheet" "to" "the" "dark" "blue" "background."
#> [3,] "It's" "easy" "to" "tell" "the" "depth" "of" "a"
#> [4,] "These" "days" "a" "chicken" "leg" "is" "a" "rare"
#> [5,] "Rice" "is" "often" "served" "in" "round" "bowls." ""
#> [,9]
#> [1,] ""
#> [2,] ""
#> [3,] "well."
#> [4,] "dish."
#> [5,] ""
還可以設定拆分片段的最大數量:
下面代碼設定拆分的片段最多為2個片段
fields <- c("Name: Hadley", "Country: NZ", "Age: 35")
fields %>% str_split(": ", n = 2, simplify = TRUE)
#> [,1] [,2]
#> [1,] "Name" "Hadley"
#> [2,] "Country" "NZ"
#> [3,] "Age" "35"
boundary()函數,可以通過字母、行、句子和單詞邊界來拆分字符串。
boundary(type = c("character", "line_break", "sentence", "word"), skip_word_none = NA, ...)
e.g.
x <- "This is a sentence. This is another sentence."
str_split(x, " ")[[1]]
#> [1] "This" "is" "a" "sentence." "" "This"
#> [7] "is" "another" "sentence."
str_split(x, boundary("word"))[[1]]
#> [1] "This" "is" "a" "sentence" "This" "is" "another"
#> [8] "sentence"
認真觀察上面的結果,可以發現boundary()函數的拆分結果是要更好的。
五、正則表達式的其他應用
R基礎包中兩個常用函數,它們也可以使用正則表達式
- apropos()函數可以在全局環境空間中搜索所有可用對象。當我們沒法確切地想起函數名稱時,這個函數很好用,舉例:
apropos("replace")
#> [1] "%+replace%" "replace" "replace_na" "setReplaceMethod"
#> [5] "str_replace" "str_replace_all" "str_replace_na" "theme_replace"
- dir()函數可以列出一個目錄下的所有文件。
dir()的pattern參數可以是一個正則表達式,此時它只返回與這個模式相匹配的文件名。舉例:
dir(pattern = "\\.csv$")
[1] "clinical.csv" "raw_ntnbr.csv" "raw_tnbr.csv"
[4] "total_clinical.csv"
這章內容都很重要,可以參考英文版R數據科學:https://r4ds.had.co.nz/strings.html
及其課后習題答案:https://jrnold.github.io/r4ds-exercise-solutions/strings.html#splitting