From b308817a6434ed04fb8ad324efa4ebd248ef2157 Mon Sep 17 00:00:00 2001
From: Milo Mirate <992859+mmirate@users.noreply.github.com>
Date: Sun, 22 Dec 2024 18:57:36 -0500
Subject: [PATCH 1/2] hir: lift alternations' common suffixes too

This should probably produce better regexes internally; additionally, I
know it will produce Hirs that are more amenable to being walked
recursively (trading away performance for content-intelligence) to
produce human-readable regex syntax. (My own use case involves
automating the operation of a ghastly pre-existing machine that takes
PCREs.)
---
 regex-syntax/src/hir/mod.rs | 63 +++++++++++++++++++++++++++++++++++--
 1 file changed, 61 insertions(+), 2 deletions(-)

diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
index 5db784388..ff69ecc8d 100644
--- a/regex-syntax/src/hir/mod.rs
+++ b/regex-syntax/src/hir/mod.rs
@@ -3047,7 +3047,7 @@ fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> {
             .count();
         prefix = &prefix[..common_len];
         if prefix.is_empty() {
-            return Err(hirs);
+            return lift_common_suffix(hirs).map(Hir::concat);
         }
     }
     let len = prefix.len();
@@ -3068,10 +3068,69 @@ fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> {
         }
     }
     let mut concat = prefix_concat;
-    concat.push(Hir::alternation(suffix_alts));
+    match lift_common_suffix(suffix_alts) {
+        Ok(suffix_concat) => {
+            concat.extend(suffix_concat);
+        }
+        Err(suffix_alts) => {
+            concat.push(Hir::alternation(suffix_alts));
+        }
+    }
     Ok(Hir::concat(concat))
 }
 
+#[allow(clippy::inline_always)]
+#[inline(always)] // prevents blowing the stack
+fn lift_common_suffix(hirs: Vec<Hir>) -> Result<Vec<Hir>, Vec<Hir>> {
+    if hirs.len() <= 1 {
+        return Err(hirs);
+    }
+    let mut suffix = match hirs.last().unwrap().kind() {
+        HirKind::Concat(ref xs) => &**xs,
+        _ => return Err(hirs),
+    };
+    if suffix.is_empty() {
+        return Err(hirs);
+    }
+    for h in hirs.iter().rev().skip(1) {
+        let concat = match h.kind() {
+            HirKind::Concat(ref xs) => xs,
+            _ => return Err(hirs),
+        };
+        let common_len = suffix
+            .iter()
+            .rev()
+            .zip(concat.iter().rev())
+            .take_while(|(x, y)| x == y)
+            .count();
+        suffix = &suffix[suffix.len()-common_len..];
+        if suffix.is_empty() {
+            return Err(hirs);
+        }
+    }
+    let len = suffix.len();
+    assert_ne!(0, len);
+    let mut suffix_concat = vec![];
+    let mut prefix_alts = vec![];
+    for h in hirs {
+        let mut concat = match h.into_kind() {
+            HirKind::Concat(xs) => xs,
+            // We required all sub-expressions to be
+            // concats above, so we're only here if we
+            // have a concat.
+            _ => unreachable!(),
+        };
+        let suffix = concat.split_off(concat.len()-len);
+        prefix_alts.push(Hir::concat(concat));
+        if suffix_concat.is_empty() {
+            suffix_concat = suffix;
+        }
+    }
+    let mut concat = suffix_concat;
+    concat.insert(0, Hir::alternation(prefix_alts));
+    Ok(concat)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;

From 988c40529e7c8a1e5f6a12a374ee06fd53da05ee Mon Sep 17 00:00:00 2001
From: Milo Mirate <992859+mmirate@users.noreply.github.com>
Date: Sun, 22 Dec 2024 19:00:03 -0500
Subject: [PATCH 2/2] appease rustfmt

---
 regex-syntax/src/hir/mod.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
index ff69ecc8d..ea76a2c32 100644
--- a/regex-syntax/src/hir/mod.rs
+++ b/regex-syntax/src/hir/mod.rs
@@ -3103,7 +3103,7 @@ fn lift_common_suffix(hirs: Vec<Hir>) -> Result<Vec<Hir>, Vec<Hir>> {
             .zip(concat.iter().rev())
             .take_while(|(x, y)| x == y)
             .count();
-        suffix = &suffix[suffix.len()-common_len..];
+        suffix = &suffix[suffix.len() - common_len..];
         if suffix.is_empty() {
             return Err(hirs);
         }
@@ -3120,7 +3120,7 @@ fn lift_common_suffix(hirs: Vec<Hir>) -> Result<Vec<Hir>, Vec<Hir>> {
             // have a concat.
             _ => unreachable!(),
         };
-        let suffix = concat.split_off(concat.len()-len);
+        let suffix = concat.split_off(concat.len() - len);
         prefix_alts.push(Hir::concat(concat));
         if suffix_concat.is_empty() {
             suffix_concat = suffix;