Skip to content

Commit 1f9f9cc

Browse files
authored
bytes: escape invalid UTF-8 bytes in debug output for Match
PR #1203
1 parent ab4c8d1 commit 1f9f9cc

File tree

1 file changed

+91
-11
lines changed

1 file changed

+91
-11
lines changed

src/regex/bytes.rs

+91 -11
Original file line number | Diff line number | Diff line change
@@ -1555,18 +1555,13 @@ impl<'h> Match<'h> {
15551555

15561556
impl<'h> core::fmt::Debug for Match<'h> {
15571557
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1558+
use regex_automata::util::escape::DebugHaystack;
1559+
15581560
let mut fmt = f.debug_struct("Match");
1559-
fmt.field("start", &self.start).field("end", &self.end);
1560-
if let Ok(s) = core::str::from_utf8(self.as_bytes()) {
1561-
fmt.field("bytes", &s);
1562-
} else {
1563-
// FIXME: It would be nice if this could be printed as a string
1564-
// with invalid UTF-8 replaced with hex escapes. A alloc would
1565-
// probably okay if that makes it easier, but regex-automata does
1566-
// (at time of writing) have internal routines that do this. So
1567-
// maybe we should expose them.
1568-
fmt.field("bytes", &self.as_bytes());
1569-
}
1561+
fmt.field("start", &self.start)
1562+
.field("end", &self.end)
1563+
.field("bytes", &DebugHaystack(&self.as_bytes()));
1564+
15701565
fmt.finish()
15711566
}
15721567
}
@@ -2620,3 +2615,88 @@ fn no_expansion<T: AsRef<[u8]>>(replacement: &T) -> Option<Cow<'_, [u8]>> {
26202615
None => Some(Cow::Borrowed(replacement)),
26212616
}
26222617
}
2618+
2619+
#[cfg(test)]
2620+
mod tests {
2621+
use super::*;
2622+
use alloc::format;
2623+
2624+
#[test]
2625+
fn test_match_properties() {
2626+
let haystack = b"Hello, world!";
2627+
let m = Match::new(haystack, 7, 12);
2628+
2629+
assert_eq!(m.start(), 7);
2630+
assert_eq!(m.end(), 12);
2631+
assert_eq!(m.is_empty(), false);
2632+
assert_eq!(m.len(), 5);
2633+
assert_eq!(m.as_bytes(), b"world");
2634+
}
2635+
2636+
#[test]
2637+
fn test_empty_match() {
2638+
let haystack = b"";
2639+
let m = Match::new(haystack, 0, 0);
2640+
2641+
assert_eq!(m.is_empty(), true);
2642+
assert_eq!(m.len(), 0);
2643+
}
2644+
2645+
#[test]
2646+
fn test_debug_output_valid_utf8() {
2647+
let haystack = b"Hello, world!";
2648+
let m = Match::new(haystack, 7, 12);
2649+
let debug_str = format!("{:?}", m);
2650+
2651+
assert_eq!(
2652+
debug_str,
2653+
r#"Match { start: 7, end: 12, bytes: "world" }"#
2654+
);
2655+
}
2656+
2657+
#[test]
2658+
fn test_debug_output_invalid_utf8() {
2659+
let haystack = b"Hello, \xFFworld!";
2660+
let m = Match::new(haystack, 7, 13);
2661+
let debug_str = format!("{:?}", m);
2662+
2663+
assert_eq!(
2664+
debug_str,
2665+
r#"Match { start: 7, end: 13, bytes: "\xffworld" }"#
2666+
);
2667+
}
2668+
2669+
#[test]
2670+
fn test_debug_output_various_unicode() {
2671+
let haystack =
2672+
"Hello, 😊 world! 안녕하세요? مرحبا بالعالم!".as_bytes();
2673+
let m = Match::new(haystack, 0, haystack.len());
2674+
let debug_str = format!("{:?}", m);
2675+
2676+
assert_eq!(
2677+
debug_str,
2678+
r#"Match { start: 0, end: 62, bytes: "Hello, 😊 world! 안녕하세요? مرحبا بالعالم!" }"#
2679+
);
2680+
}
2681+
2682+
#[test]
2683+
fn test_debug_output_ascii_escape() {
2684+
let haystack = b"Hello,\tworld!\nThis is a \x1b[31mtest\x1b[0m.";
2685+
let m = Match::new(haystack, 0, haystack.len());
2686+
let debug_str = format!("{:?}", m);
2687+
2688+
assert_eq!(
2689+
debug_str,
2690+
r#"Match { start: 0, end: 38, bytes: "Hello,\tworld!\nThis is a \u{1b}[31mtest\u{1b}[0m." }"#
2691+
);
2692+
}
2693+
2694+
#[test]
2695+
fn test_debug_output_match_in_middle() {
2696+
let haystack = b"The quick brown fox jumps over the lazy dog.";
2697+
let m = Match::new(haystack, 16, 19);
2698+
let debug_str = format!("{:?}", m);
2699+
2700+
assert_eq!(debug_str, r#"Match { start: 16, end: 19, bytes: "fox" }"#);
2701+
}
2702+
}

0 commit comments

Comments
 (0)