Skip to content

Commit 8475ef2

Browse files
committed
capi: added split
1 parent ab88aa5 commit 8475ef2

File tree

4 files changed

+161
-1
lines changed

4 files changed

+161
-1
lines changed

regex-capi/README.md

-1
Original file line numberDiff line numberDiff line change
@@ -99,5 +99,4 @@ There are a few things missing from the C API that are present in the Rust API.
9999
There's no particular (known) reason why they don't, they just haven't been
100100
implemented yet.
101101

102-
* Splitting a string by a regex.
103102
* Replacing regex matches in a string with some other text.

regex-capi/ctest/test.c

+64
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,69 @@ bool test_iter_capture_names() {
264264
return passed;
265265
}
266266

267+
bool test_iter_split() {
268+
bool passed = true;
269+
270+
rure *re = rure_compile_must("[ \t]+");
271+
272+
const uint8_t *haystack = (const uint8_t *)" \t a b \t c\td e";
273+
size_t haystack_len = strlen((const char *)haystack);
274+
275+
rure_iter_split *it = rure_iter_split_new(re, haystack, haystack_len);
276+
277+
char *match;
278+
bool result = rure_iter_split_next(it, &match);
279+
if (!result) {
280+
if (DEBUG) {
281+
fprintf(stderr,
282+
"[test_iter_split] expected a match, "
283+
"but got none\n");
284+
}
285+
passed = false;
286+
goto done;
287+
}
288+
289+
result = rure_iter_split_next(it, &match);
290+
passed = (strcmp(match, "a") == 0);
291+
if (!passed) {
292+
goto done;
293+
}
294+
295+
result = rure_iter_split_next(it, &match);
296+
passed = (strcmp(match, "b") == 0);
297+
if (!passed) {
298+
goto done;
299+
}
300+
301+
result = rure_iter_split_next(it, &match);
302+
passed = (strcmp(match, "c") == 0);
303+
if (!passed) {
304+
goto done;
305+
}
306+
307+
result = rure_iter_split_next(it, &match);
308+
passed = (strcmp(match, "d") == 0);
309+
if (!passed) {
310+
goto done;
311+
}
312+
313+
result = rure_iter_split_next(it, &match);
314+
passed = (strcmp(match, "e") == 0);
315+
if (!passed) {
316+
goto done;
317+
}
318+
319+
result = rure_iter_split_next(it, &match);
320+
passed = !result;
321+
if (!passed) {
322+
goto done;
323+
}
324+
done:
325+
rure_iter_split_free(it);
326+
rure_free(re);
327+
return passed;
328+
}
329+
267330
/*
268331
* This tests whether we can set the flags correctly. In this case, we disable
269332
* all flags, which includes disabling Unicode mode. When we disable Unicode
@@ -574,6 +637,7 @@ int main() {
574637
run_test(test_captures, "test_captures", &passed);
575638
run_test(test_iter, "test_iter", &passed);
576639
run_test(test_iter_capture_names, "test_iter_capture_names", &passed);
640+
run_test(test_iter_split, "test_iter_split", &passed);
577641
run_test(test_flags, "test_flags", &passed);
578642
run_test(test_compile_error, "test_compile_error", &passed);
579643
run_test(test_compile_error_size_limit, "test_compile_error_size_limit",

regex-capi/include/rure.h

+39
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,19 @@ typedef struct rure_iter rure_iter;
104104
*/
105105
typedef struct rure_iter_capture_names rure_iter_capture_names;
106106

107+
/*
108+
* rure_iter_split is an iterator over substrings in the haystack passed,
109+
* delimited by a match of the rure. Namely, each element of the iterator corresponds
110+
* to a part of the haystack that isn’t matched by the regular expression.
111+
*
112+
* An rure_iter_split value may not outlive its corresponding rure or the
113+
* haystack it was created from, and should be freed before either is freed.
114+
*
115+
* It is not safe to use from multiple threads simultaneously.
116+
*/
117+
typedef struct rure_iter_split rure_iter_split;
118+
119+
107120
/*
108121
* rure_error is an error that caused compilation to fail.
109122
*
@@ -294,6 +307,32 @@ void rure_iter_capture_names_free(rure_iter_capture_names *it);
294307
*/
295308
bool rure_iter_capture_names_next(rure_iter_capture_names *it, char **name);
296309

310+
/*
311+
* rure_iter_split_new creates an iterator of substrings of the haystack given,
312+
* delimited by a match of the regex. Namely, each element of the iterator corresponds
313+
* to a part of the haystack that isn’t matched by the regular expression.
314+
*
315+
* haystack may contain arbitrary bytes, but ASCII compatible text is more
316+
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
317+
* length should be the number of bytes in haystack.
318+
*/
319+
rure_iter_split *rure_iter_split_new(rure *re, const uint8_t *haystack, size_t length);
320+
321+
/*
322+
* rure_iter_split_free frees the iterator given.
323+
*
324+
* It must be called at most once.
325+
*/
326+
void rure_iter_split_free(rure_iter_split *it);
327+
328+
/*
329+
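* rure_iter_split_next advances the iterator and returns true if and only if
330+
* another substring was produced. A pointer to that substring, as a NUL-terminated string owned by the iterator and released by rure_iter_split_free, is written to the provided pointer. It also returns false if the substring contains a NUL byte.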
331+
*
332+
* If no match is found, then subsequent calls will return false indefinitely.
333+
*/
334+
/*
 * `next` is an out-parameter: the substring pointer is stored in *next, so
 * the pointer it refers to must be writable (hence no const qualifier).
 */
bool rure_iter_split_next(rure_iter_split *it, char **next);
335+
297336
/*
298337
* rure_iter_new creates a new iterator.
299338
*

regex-capi/src/rure.rs

+58
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,11 @@ pub struct IterCaptureNames {
5454
name_ptrs: Vec<*mut c_char>,
5555
}
5656

57+
pub struct IterSplit {
58+
split: bytes::Split<'static, 'static>,
59+
split_ptrs: Vec<*mut c_char>,
60+
}
61+
5762
impl Deref for Regex {
5863
type Target = bytes::Regex;
5964
fn deref(&self) -> &bytes::Regex {
@@ -302,6 +307,59 @@ ffi_fn! {
302307
}
303308
}
304309

310+
// Creates a heap-allocated split iterator over `len` bytes starting at
// `haystack`, using the regex behind `re`, and returns an owning raw pointer
// to it. No pieces have been handed out yet, so the pointer list starts empty.
ffi_fn! {
    fn rure_iter_split_new(
        re: *const Regex,
        haystack: *const u8,
        len: size_t,
    ) -> *mut IterSplit {
        // Both raw pointers come from the C caller and are dereferenced
        // without any validation.
        let regex = unsafe { &*re };
        let bytes = unsafe { slice::from_raw_parts(haystack, len) };

        let state = IterSplit {
            split: regex.re.split(bytes),
            split_ptrs: Vec::new(),
        };
        Box::into_raw(Box::new(state))
    }
}
324+
325+
// Releases a split iterator together with every string it has handed out.
ffi_fn! {
    fn rure_iter_split_free(it: *mut IterSplit) {
        unsafe {
            // Take ownership of the allocation back; it is dropped at the end
            // of this block.
            let mut owned = Box::from_raw(it);
            // Each stored pointer came from `CString::into_raw`, so rebuild
            // the `CString` to free its buffer.
            for ptr in owned.split_ptrs.drain(..) {
                drop(CString::from_raw(ptr));
            }
        }
    }
}
336+
337+
// Advances the split iterator. On success the next piece is copied into a
// NUL-terminated string, its pointer is recorded for later release and
// written to `*next`, and `true` is returned. Returns `false` when the
// iterator is exhausted or when the piece cannot be converted to a `CString`.
ffi_fn! {
    fn rure_iter_split_next(
        it: *mut IterSplit,
        next: *mut *const c_char,
    ) -> bool {
        let state = unsafe { &mut *it };

        let piece = match state.split.next() {
            Some(piece) => piece,
            // Matches exhausted
            None => return false,
        };

        // `CString::new` rejects input containing an interior NUL byte.
        let owned = match CString::new(piece) {
            Ok(owned) => owned,
            Err(_) => return false,
        };

        // Keep the raw pointer so the string can be reclaimed when the
        // iterator is freed.
        let raw = owned.into_raw();
        state.split_ptrs.push(raw);
        unsafe {
            *next = raw;
        }
        true
    }
}
362+
305363
ffi_fn! {
306364
fn rure_iter_new(
307365
re: *const Regex,

0 commit comments

Comments
 (0)