Skip to content

Commit 37be343

Browse files
committed
create length, count, detect, extract
1 parent f08cc03 commit 37be343

File tree

5 files changed

+224
-16
lines changed

5 files changed

+224
-16
lines changed

R/count.R

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Count occurrences of `pattern` in every element of `l`.
#
# The class of `pattern` selects the backend:
#   - "text"      : literal substring search on pattern@value
#   - "regex"     : regular-expression search on pattern@value
#   - "character" : a plain string, treated as a regular expression
# Any other class matches no branch, so switch() yields NULL invisibly.
str_count <- function(l, pattern) {
  pattern_class <- class(pattern)
  switch(
    pattern_class,
    character = str_regex_count(l, pattern),
    regex = str_regex_count(l, pattern@value),
    text = str_text_count(l, pattern@value)
  )
}

R/detect.R

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Test whether each element of `l` contains `pattern`.
#
# Arguments:
#   l       - vector of strings to search.
#   pattern - a "text" object (literal search on pattern@value), a "regex"
#             object (regex search on pattern@value), or a plain character
#             string (treated as a regular expression).
#   negate  - when TRUE, the logical result is inverted element-wise.
#
# Returns the logical vector produced by the selected backend, one entry per
# element of `l`.
str_detect <- function(l, pattern, negate = FALSE) {
  result <- switch(
    class(pattern),
    text = str_text_detect(l, pattern@value),
    regex = str_regex_detect(l, pattern@value),
    character = str_regex_detect(l, pattern)
  )
  if (negate) {
    # Vectorised negation; an index loop over seq(length(result)) would
    # iterate over c(1, 0) for an empty result and turn logical(0) into NA.
    result <- !result
  }
  result
}

R/extendr-wrappers.R

+14
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,17 @@ str_regex_count <- function(strings, pattern) .Call(wrap__str_regex_count, strin
1212

1313
str_text_count <- function(strings, sub) .Call(wrap__str_text_count, strings, sub)
1414

15+
# Forwards `strings` and `pattern` to the native routine wrap__str_regex_detect.
str_regex_detect <- function(strings, pattern) .Call(wrap__str_regex_detect, strings, pattern)
16+
17+
# Forwards `strings` and `sub` to the native routine wrap__str_text_detect.
str_text_detect <- function(strings, sub) .Call(wrap__str_text_detect, strings, sub)
18+
19+
# Forwards `strings` and `pattern` to the native routine wrap__str_regex_extract.
str_regex_extract <- function(strings, pattern) .Call(wrap__str_regex_extract, strings, pattern)
20+
21+
# Forwards `strings` and `sub` to the native routine wrap__str_text_extract.
str_text_extract <- function(strings, sub) .Call(wrap__str_text_extract, strings, sub)
22+
23+
# Forwards a single `string` and `pattern` to wrap__single_str_regex_extract_all.
single_str_regex_extract_all <- function(string, pattern) .Call(wrap__single_str_regex_extract_all, string, pattern)
24+
25+
# Forwards a single `string` and `sub` to wrap__single_str_text_extract_all.
single_str_text_extract_all <- function(string, sub) .Call(wrap__single_str_text_extract_all, string, sub)
26+
27+
# Forwards `str` to the native routine wrap__str_length.
str_length <- function(str) .Call(wrap__str_length, str)
28+

R/extract.R

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Extract the first match of `x` from every element of `l`.
#
# The class of `x` selects the backend:
#   - "text"      : literal substring search on x@value
#   - "regex"     : regular-expression search on x@value
#   - "character" : a plain string, treated as a regular expression
# Any other class matches no branch, so switch() yields NULL invisibly.
str_extract <- function(l, x) {
  pattern_class <- class(x)
  switch(
    pattern_class,
    character = str_regex_extract(l, x),
    regex = str_regex_extract(l, x@value),
    text = str_text_extract(l, x@value)
  )
}
9+
10+
# Extract every match of `x` from each element of `l`.
#
# Arguments:
#   l - vector of strings to search.
#   x - a "text" object (literal search on x@value), a "regex" object
#       (regex search on x@value), or a plain character string (treated as a
#       regular expression).
#
# Returns a list with one entry per element of `l`; each entry is the
# character vector of matches found in that element.
str_extract_all <- function(l, x) {
  rs <- list()
  for (v in l) {
    r <- switch(
      class(x),
      text = single_str_text_extract_all(v, x@value),
      regex = single_str_regex_extract_all(v, x@value),
      character = single_str_regex_extract_all(v, x)
    )
    # Wrap in list() so the whole match vector becomes ONE list element.
    # append(rs, r) would splice the individual strings into `rs`, losing the
    # correspondence between inputs and results.
    rs <- append(rs, list(r))
  }
  rs
}

src/rust/src/lib.rs

+166-16
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,14 @@ extendr_module! {
55
mod stringer;
66
fn str_regex_count;
77
fn str_text_count;
8+
fn str_regex_detect;
9+
fn str_text_detect;
10+
fn str_regex_extract;
11+
fn str_text_extract;
12+
fn single_str_regex_extract_all;
13+
fn single_str_text_extract_all;
14+
15+
fn str_length;
816
}
917

1018
#[extendr]
@@ -13,24 +21,166 @@ fn str_regex_count(strings: Vec<String>, pattern: String) -> Vec<i64> {
1321
Ok(re) => re,
1422
Err(err) => panic!("{}", err),
1523
};
16-
strings.iter().map(|x| {
17-
let c = re.split(x.as_str()).count();
18-
if c > 0 {
19-
(c - 1) as i64
20-
} else {
21-
0 as i64
22-
}
23-
}).collect()
24+
strings
25+
.iter()
26+
.map(|x| {
27+
let c = re.split(x.as_str()).count();
28+
if c > 0 {
29+
(c - 1) as i64
30+
} else {
31+
0 as i64
32+
}
33+
})
34+
.collect()
2435
}
2536

2637
#[extendr]
2738
/// Counts the non-overlapping occurrences of the literal `sub` in each string.
///
/// Returns one count per element of `strings`, in the same order. An empty
/// `sub` matches at every character boundary, so the count is then the number
/// of characters plus one.
fn str_text_count(strings: Vec<String>, sub: String) -> Vec<i64> {
    strings
        .iter()
        // Count matches directly instead of splitting and subtracting one;
        // `split` always yields at least one piece, so the old zero guard was dead.
        .map(|x| x.matches(sub.as_str()).count() as i64)
        .collect()
}
51+
52+
#[extendr]
53+
fn str_regex_detect(strings: Vec<String>, pattern: String) -> Vec<bool> {
54+
let re = match Regex::new(pattern.as_str()) {
55+
Ok(re) => re,
56+
Err(err) => panic!("{}", err),
57+
};
58+
strings.iter().map(|x| re.is_match(x.as_str())).collect()
59+
}
60+
61+
#[extendr]
62+
/// Reports, for each input string, whether it contains the literal `sub`.
///
/// The result has one entry per element of `strings`, in the same order.
fn str_text_detect(strings: Vec<String>, sub: String) -> Vec<bool> {
    let needle = sub.as_str();
    let mut found = Vec::with_capacity(strings.len());
    for s in &strings {
        found.push(s.contains(needle));
    }
    found
}
65+
66+
#[extendr]
67+
fn str_regex_extract(strings: Vec<String>, pattern: String) -> Vec<String> {
68+
let re = match Regex::new(pattern.as_str()) {
69+
Ok(re) => re,
70+
Err(err) => panic!("{}", err),
71+
};
72+
strings
73+
.iter()
74+
.map(|x| {
75+
re.find(x.as_str())
76+
.map(|m| {
77+
x.chars()
78+
.skip(m.start())
79+
.take(m.end() - m.start())
80+
.collect()
81+
})
82+
.unwrap_or_else(|| "".to_string())
83+
})
84+
.collect()
85+
}
86+
87+
#[extendr]
88+
/// For each input string, returns a copy of `sub` when the string contains it
/// and an empty string otherwise.
///
/// The result has one entry per element of `strings`, in the same order.
fn str_text_extract(strings: Vec<String>, sub: String) -> Vec<String> {
    let mut extracted = Vec::with_capacity(strings.len());
    for s in &strings {
        let hit = match s.contains(sub.as_str()) {
            true => sub.clone(),
            false => String::new(),
        };
        extracted.push(hit);
    }
    extracted
}
100+
101+
#[extendr]
102+
fn single_str_regex_extract_all(string: String, pattern: String) -> Vec<String> {
103+
let re = match Regex::new(pattern.as_str()) {
104+
Ok(re) => re,
105+
Err(err) => panic!("{}", err),
106+
};
107+
let mut rs = Vec::new();
108+
let mut index = 0;
109+
loop {
110+
match re.find_at(string.as_str(), index) {
111+
Some(m) => {
112+
rs.push(
113+
string.chars()
114+
.skip(m.start())
115+
.take(m.end() - m.start())
116+
.collect(),
117+
);
118+
index = m.end();
119+
}
120+
None => break,
121+
}
122+
}
123+
if rs.len() == 0 {
124+
rs.push("".to_string());
125+
}
126+
rs
127+
}
128+
129+
#[extendr]
130+
/// Extracts every non-overlapping occurrence of the literal `sub` from
/// `string`, in order of appearance.
///
/// When `sub` does not occur, the result is a single empty string rather than
/// an empty vector. An empty `sub` matches at every character boundary.
fn single_str_text_extract_all(string: String, sub: String) -> Vec<String> {
    // `str::matches` tracks the search position itself. The previous manual
    // loop treated an offset relative to the remaining suffix as absolute, so
    // it never advanced past a second occurrence and looped forever.
    let mut rs: Vec<String> = string.matches(sub.as_str()).map(String::from).collect();
    if rs.is_empty() {
        rs.push(String::new());
    }
    rs
}
153+
154+
#[extendr]
155+
/// Returns the number of Unicode scalar values (`char`s) in `str`.
///
/// This counts characters, not bytes: a multi-byte UTF-8 sequence contributes
/// one to the result.
fn str_length(str: String) -> i64 {
    // A `String` is always valid UTF-8, so the standard iterator gives the
    // same answer as decoding lead bytes by hand.
    str.chars().count() as i64
}

0 commit comments

Comments
 (0)