R正则表达式常见函数

这篇文章主要记录R语言中一些常用的正则表达式函数。

1.模式匹配查询函数

实现查询功能的函数主要有grep、grepl、regexpr、gregexpr、regexec，这些函数的主要区别在于输出结果的格式不同，共同点是都包含正则表达式pattern和文本text这两个参数。

1.1grep

grep函数是R正则表达式中最基础的函数，其参数pattern是待匹配的模式，参数text通常是向量格式的文本。grep的返回结果是一个数值向量，其内容是text向量中匹配pattern的元素的索引位置。

text = c("one word", "a sentence", "you and me", "three two one")
pat = "one"
grep(pat, text)

## [1] 1 4

当我们想要修改返回结果时，可以考虑增加value参数，value=TRUE可以输出text向量中符合匹配模式的值。

grep(pat, text, value = TRUE)

## [1] "one word"      "three two one"

invert=TRUE可以获得text中不符合匹配模式的元素（配合value=TRUE时返回其值，否则返回其索引）。

grep(pat, text, invert = TRUE)

## [1] 2 3

grep(pat, text, invert = TRUE, value = TRUE)

## [1] "a sentence" "you and me"

1.2grepl

grepl函数返回的结果是逻辑向量，每个元素取值为TRUE/FALSE。

grepl(pat, text)

## [1]  TRUE FALSE FALSE  TRUE

1.3regexpr

regexpr函数可以获得给定字符串中匹配模式出现的较准确位置。相较于grep函数，regexpr不仅可以判定出文本向量中哪个元素包含匹配模式，还可以识别匹配模式出现在文本向量每个元素中的位置及匹配长度。如果该向量元素没有匹配模式，则返回-1。

regexpr(pat, text)

## [1]  1 -1 -1 11
## attr(,"match.length")
## [1]  3 -1 -1  3
## attr(,"useBytes")
## [1] TRUE

1.4gregexpr

gregexpr与regexpr函数功能类似，其区别在于gregexpr会返回每个元素中所有匹配的位置，且输出结果为列表(list)格式。

gregexpr(pat, text)

## [[1]]
## [1] 1
## attr(,"match.length")
## [1] 3
## attr(,"useBytes")
## [1] TRUE
## 
## [[2]]
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"useBytes")
## [1] TRUE
## 
## [[3]]
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"useBytes")
## [1] TRUE
## 
## [[4]]
## [1] 11
## attr(,"match.length")
## [1] 3
## attr(,"useBytes")
## [1] TRUE

1.5regexec

regexec函数与gregexpr非常接近，返回结果都是列表，且都带有“match.length”属性。

(x = regexec(pat, text))

## [[1]]
## [1] 1
## attr(,"match.length")
## [1] 3
## 
## [[2]]
## [1] -1
## attr(,"match.length")
## [1] -1
## 
## [[3]]
## [1] -1
## attr(,"match.length")
## [1] -1
## 
## [[4]]
## [1] 11
## attr(,"match.length")
## [1] 3

2.模式替换函数

模式替换函数主要有sub和gsub，二者的区别在于sub函数只替换每个文本元素中第一个匹配的内容，gsub则替换text中所有匹配的内容。两个函数语法一致，形如：sub(pattern, replacement, text)。

Rstring = c("The R Foundation", "for Statistical Computing", "R is FREE software", 
    "R is a collaborative project")
# 用'RR'替换'R'
sub("R", "RR", Rstring)

## [1] "The RR Foundation"             "for Statistical Computing"    
## [3] "RR is FREE software"           "RR is a collaborative project"

gsub("R", "RR", Rstring)

## [1] "The RR Foundation"             "for Statistical Computing"    
## [3] "RR is FRREE software"          "RR is a collaborative project"

3.模式拆分函数

按照指定特征对文本进行拆分是非常常见的工作，strsplit函数就是实现这一功能的函数。

sentence = c("R is a collaborative project with many contributors")
strsplit(sentence, " ")

## [[1]]
## [1] "R"             "is"            "a"             "collaborative"
## [5] "project"       "with"          "many"          "contributors"

tels = c("510-548-2238", "707-231-2440", "650-752-1300")
strsplit(tels, "-")

## [[1]]
## [1] "510"  "548"  "2238"
## 
## [[2]]
## [1] "707"  "231"  "2440"
## 
## [[3]]
## [1] "650"  "752"  "1300"

4.stringr包中实现正则表达的函数

stringr包的所有函数都有共同特征，其结构形如：str_function(string, pattern)。下面分别介绍一些常用函数：

library(stringr)

4.1str_detect

str_detect函数用于检测字符向量中是否存在匹配模式,返回逻辑值TRUE/FALSE

some_objs = c("pen", "pencil", "marker", "spray")
str_detect(some_objs, "pen")

## [1]  TRUE  TRUE FALSE FALSE

some_objs[str_detect(some_objs, "pen")]

## [1] "pen"    "pencil"


# 匹配日期格式的例子
strings = c("12 Jun 2002", " 8 September 2004 ", "22-July-2009 ", "01 01 2001", 
    "date", "02.06.2000", "xxx-yyy-zzzz", "$2,600")
# 定义日期格式
dates = "([0-9]{1,2})[- .]([a-zA-Z]+)[- .]([0-9]{4})"
str_detect(strings, dates)

## [1]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE

4.2str_extract和str_extract_all

str_extract函数的功能是提取文本向量每个元素中第一个匹配的值，无匹配的元素返回NA，结果是一个与文本向量等长的向量。

paris_tweets = c("#Paris is chock-full of cultural and culinary attractions", 
    "Some time in #Paris along Canal St.-Martin famous by #Amelie", "While you're in #Paris, stop at cafe: http://goo.gl/yaCbW", 
    "Paris, the city of light")
# 定义模式，提取#Paris
hash = "#[a-zA-Z]{1,}"
str_extract(paris_tweets, hash)

## [1] "#Paris" "#Paris" "#Paris" NA

str_extract_all功能类似str_extract，但是它会提取文本向量每个元素中所有符合匹配模式的值，无匹配时返回character(0)，结果为list格式。

str_extract_all(paris_tweets, hash)

## [[1]]
## [1] "#Paris"
## 
## [[2]]
## [1] "#Paris"  "#Amelie"
## 
## [[3]]
## [1] "#Paris"
## 
## [[4]]
## character(0)

4.3str_match和str_match_all

相较于str_extract和str_extract_all，str_match和str_match_all会给出更详细的匹配信息：除完整匹配外，还会给出模式中各捕获组所匹配的内容。

str_match(strings, dates)

##      [,1]               [,2] [,3]        [,4]  
## [1,] "12 Jun 2002"      "12" "Jun"       "2002"
## [2,] "8 September 2004" "8"  "September" "2004"
## [3,] "22-July-2009"     "22" "July"      "2009"
## [4,] NA                 NA   NA          NA    
## [5,] NA                 NA   NA          NA    
## [6,] NA                 NA   NA          NA    
## [7,] NA                 NA   NA          NA    
## [8,] NA                 NA   NA          NA

str_match_all(paris_tweets, hash)

## [[1]]
##      [,1]    
## [1,] "#Paris"
## 
## [[2]]
##      [,1]     
## [1,] "#Paris" 
## [2,] "#Amelie"
## 
## [[3]]
##      [,1]    
## [1,] "#Paris"
## 
## [[4]]
## character(0)

4.4str_locate和str_locate_all

这两个函数用于定位匹配模式在文本向量每个元素中的位置。同上，前者只定位第一个出现的匹配模式，后者针对所有匹配的模式；str_locate返回的是两列的矩阵，str_locate_all返回的是list。

str_locate(paris_tweets, hash)

##      start end
## [1,]     1   6
## [2,]    14  19
## [3,]    17  22
## [4,]    NA  NA

str_locate_all(paris_tweets, hash)

## [[1]]
##      start end
## [1,]     1   6
## 
## [[2]]
##      start end
## [1,]    14  19
## [2,]    54  60
## 
## [[3]]
##      start end
## [1,]    17  22
## 
## [[4]]
##      start end

4.5str_replace和str_replace_all

这两个函数用于替换，即用指定的字符串替换匹配模式所匹配的内容；str_replace只替换每个元素中的第一个匹配，str_replace_all替换所有匹配。

cities = c("San Francisco", "Barcelona", "Naples", "Paris")
str_replace(cities, "[aeiou]", ";")

## [1] "S;n Francisco" "B;rcelona"     "N;ples"        "P;ris"

str_replace(cities, "[^aeiou]", ";")

## [1] ";an Francisco" ";arcelona"     ";aples"        ";aris"

str_replace_all(cities, pattern = "[aeiou]", ";")

## [1] "S;n Fr;nc;sc;" "B;rc;l;n;"     "N;pl;s"        "P;r;s"

str_replace_all(cities, pattern = "[^aeiou]", ";")

## [1] ";a;;;;a;;i;;o" ";a;;e;o;a"     ";a;;e;"        ";a;i;"

4.6str_split和str_split_fixed

str_split语法形如：str_split(string, pattern, n=Inf)，其中n是返回的分割片段的最大数目。

sentence = c("R is a collaborative project with many contributors")
str_split(sentence, " ")

## [[1]]
## [1] "R"             "is"            "a"             "collaborative"
## [5] "project"       "with"          "many"          "contributors"

# 分割电话号码
tels = c("510-548-2238", "707-231-2440", "650-752-1300")
str_split(tels, "-")

## [[1]]
## [1] "510"  "548"  "2238"
## 
## [[2]]
## [1] "707"  "231"  "2440"
## 
## [[3]]
## [1] "650"  "752"  "1300"

# 说明参数n的用法
flavors = c("chocolate", "vanilla", "cinnamon", "mint", "lemon")
str_split(flavors, "[aeiou]")

## [[1]]
## [1] "ch" "c"  "l"  "t"  ""  
## 
## [[2]]
## [1] "v"  "n"  "ll" ""  
## 
## [[3]]
## [1] "c"  "nn" "m"  "n" 
## 
## [[4]]
## [1] "m"  "nt"
## 
## [[5]]
## [1] "l" "m" "n"

str_split(flavors, "[aeiou]", n = 2)

## [[1]]
## [1] "ch"     "colate"
## 
## [[2]]
## [1] "v"     "nilla"
## 
## [[3]]
## [1] "c"      "nnamon"
## 
## [[4]]
## [1] "m"  "nt"
## 
## [[5]]
## [1] "l"   "mon"

str_split_fixed函数同样用于分割，其语法格式为：str_split_fixed(string, pattern, n)，这里n没有默认值，必须指定。

# 以n为界分割为两部分
str_split_fixed(flavors, "n", 2)

##      [,1]        [,2]   
## [1,] "chocolate" ""     
## [2,] "va"        "illa" 
## [3,] "ci"        "namon"
## [4,] "mi"        "t"    
## [5,] "lemo"      ""

# 以n为界分割为三个部分
str_split_fixed(flavors, "n", 3)

##      [,1]        [,2]   [,3]  
## [1,] "chocolate" ""     ""    
## [2,] "va"        "illa" ""    
## [3,] "ci"        ""     "amon"
## [4,] "mi"        "t"    ""    
## [5,] "lemo"      ""     ""

文章转自：http://blog.163.com/[email protected]/blog/static/692285082014325113834380/

当前位置：以往代写 > 其他教程 >R正则表达式常见函数