Skip to content

Commit

Permalink
api: add new 'Regex::static_captures_len' method
Browse files Browse the repository at this point in the history
This adds a new routine for computing the static number of capture
groups that will appear in every match. If the number of groups is not
invariant across all matches, then there is no static capture length.

This is meant to help implement higher level convenience APIs for
extracting capture groups, such as the one described in #824. We may
wind up including such APIs in the regex crate itself, but this commit
stops short of that. Instead, we just add this new property which should
permit those APIs to exist outside of this crate for now.

Closes #908
  • Loading branch information
BurntSushi committed Apr 17, 2023
1 parent 42dbdcf commit b23e97a
Show file tree
Hide file tree
Showing 7 changed files with 207 additions and 0 deletions.
80 changes: 80 additions & 0 deletions regex-syntax/src/hir/mod.rs
Expand Up @@ -1833,6 +1833,7 @@ struct PropertiesI {
look_set_suffix: LookSet,
utf8: bool,
captures_len: usize,
static_captures_len: Option<usize>,
literal: bool,
alternation_literal: bool,
}
Expand Down Expand Up @@ -1990,6 +1991,44 @@ impl Properties {
self.0.captures_len
}

/// Returns the total number of explicit capturing groups that appear in
/// every possible match.
///
/// If the number of capture groups can vary depending on the match, then
/// this returns `None`. That is, a value is only returned when the number
/// of matching groups is invariant or "static."
///
/// Note that this does not include the implicit capturing group
/// corresponding to the entire match.
///
/// # Example
///
/// This shows a few cases where a static number of capture groups is
/// available and a few cases where it is not.
///
/// ```
/// use regex_syntax::parse;
///
/// let len = |pattern| {
/// parse(pattern).map(|h| h.properties().static_captures_len())
/// };
///
/// assert_eq!(Some(0), len("a")?);
/// assert_eq!(Some(1), len("(a)")?);
/// assert_eq!(Some(1), len("(a)|(b)")?);
/// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?);
/// assert_eq!(None, len("(a)|b")?);
/// assert_eq!(None, len("a|(b)")?);
/// assert_eq!(None, len("(b)*")?);
/// assert_eq!(Some(1), len("(b)+")?);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn static_captures_len(&self) -> Option<usize> {
self.0.static_captures_len
}

/// Return true if and only if this HIR is a simple literal. This is
/// only true when this HIR expression is either itself a `Literal` or a
/// concatenation of only `Literal`s.
Expand Down Expand Up @@ -2100,6 +2139,13 @@ impl Properties {
} else {
LookSet::full()
};
// And also, an empty alternation has no first alternate to seed from, so
// its static capture count starts out as `None`. Otherwise we start with
// the number corresponding to the first alternate. If any subsequent
// alternate has a different number of static capture groups, then we
// overall have a variation and not a static number of groups.
let static_captures_len =
it.peek().and_then(|p| p.borrow().static_captures_len());
// The base case is an empty alternation, which matches nothing.
// Note though that empty alternations aren't possible, because the
// Hir::alternation smart constructor rewrites those as empty character
Expand All @@ -2112,6 +2158,7 @@ impl Properties {
look_set_suffix: fix,
utf8: true,
captures_len: 0,
static_captures_len,
literal: false,
alternation_literal: true,
};
Expand All @@ -2125,6 +2172,9 @@ impl Properties {
props.utf8 = props.utf8 && p.is_utf8();
props.captures_len =
props.captures_len.saturating_add(p.captures_len());
if props.static_captures_len != p.static_captures_len() {
props.static_captures_len = None;
}
props.alternation_literal =
props.alternation_literal && p.is_alternation_literal();
if !min_poisoned {
Expand Down Expand Up @@ -2180,6 +2230,7 @@ impl Properties {
// since it too can match the empty string.
utf8: true,
captures_len: 0,
static_captures_len: Some(0),
literal: false,
alternation_literal: false,
};
Expand All @@ -2196,6 +2247,7 @@ impl Properties {
look_set_suffix: LookSet::empty(),
utf8: core::str::from_utf8(&lit.0).is_ok(),
captures_len: 0,
static_captures_len: Some(0),
literal: true,
alternation_literal: true,
};
Expand All @@ -2212,6 +2264,7 @@ impl Properties {
look_set_suffix: LookSet::empty(),
utf8: class.is_utf8(),
captures_len: 0,
static_captures_len: Some(0),
literal: false,
alternation_literal: false,
};
Expand Down Expand Up @@ -2241,6 +2294,7 @@ impl Properties {
// property borderline useless.
utf8: true,
captures_len: 0,
static_captures_len: Some(0),
literal: false,
alternation_literal: false,
};
Expand Down Expand Up @@ -2268,6 +2322,7 @@ impl Properties {
look_set_suffix: LookSet::empty(),
utf8: p.is_utf8(),
captures_len: p.captures_len(),
static_captures_len: p.static_captures_len(),
literal: false,
alternation_literal: false,
};
Expand All @@ -2278,6 +2333,23 @@ impl Properties {
inner.look_set_prefix = p.look_set_prefix();
inner.look_set_suffix = p.look_set_suffix();
}
// If the static captures len of the sub-expression is not known or is
// zero, then it automatically propagates to the repetition, regardless
// of the repetition. Otherwise, it might change, but only when the
// repetition can match 0 times.
if rep.min == 0
&& inner.static_captures_len.map_or(false, |len| len > 0)
{
// If the repetition must match exactly 0 times, then our captures len
// is guaranteed to be zero. Otherwise, the repetition *can* match 0
// times, in which case its groups may or may not participate, so it's
// impossible to know how many captures will be in the resulting match.
if rep.max == Some(0) {
inner.static_captures_len = Some(0);
} else {
inner.static_captures_len = None;
}
}
Properties(Box::new(inner))
}

Expand All @@ -2286,6 +2358,9 @@ impl Properties {
let p = capture.sub.properties();
Properties(Box::new(PropertiesI {
captures_len: p.captures_len().saturating_add(1),
static_captures_len: p
.static_captures_len()
.map(|len| len.saturating_add(1)),
literal: false,
alternation_literal: false,
..*p.0.clone()
Expand All @@ -2306,6 +2381,7 @@ impl Properties {
look_set_suffix: LookSet::empty(),
utf8: true,
captures_len: 0,
static_captures_len: Some(0),
literal: true,
alternation_literal: true,
};
Expand All @@ -2316,6 +2392,10 @@ impl Properties {
props.utf8 = props.utf8 && p.is_utf8();
props.captures_len =
props.captures_len.saturating_add(p.captures_len());
props.static_captures_len = p
.static_captures_len()
.and_then(|len1| Some((len1, props.static_captures_len?)))
.and_then(|(len1, len2)| Some(len1.saturating_add(len2)));
props.literal = props.literal && p.is_literal();
props.alternation_literal =
props.alternation_literal && p.is_alternation_literal();
Expand Down
35 changes: 35 additions & 0 deletions regex-syntax/src/hir/translate.rs
Expand Up @@ -3204,6 +3204,41 @@ mod tests {
assert_eq!(1, props(r"([a&&b])").captures_len());
}

#[test]
fn analysis_static_captures_len() {
    // Each entry pairs the expected static (explicit) capture count with
    // the pattern it is computed from. `None` means the count can vary
    // from match to match.
    let cases: &[(Option<usize>, &str)] = &[
        (Some(0), r""),
        (Some(0), r"foo|bar"),
        (None, r"(foo)|bar"),
        (None, r"foo|(bar)"),
        (Some(1), r"(foo|bar)"),
        (Some(1), r"(a|b|c|d|e|f)"),
        (Some(1), r"(a)|(b)|(c)|(d)|(e)|(f)"),
        (Some(2), r"(a)(b)|(c)(d)|(e)(f)"),
        (Some(6), r"(a)(b)(c)(d)(e)(f)"),
        (Some(3), r"(a)(b)(extra)|(a)(b)()"),
        (Some(3), r"(a)(b)((?:extra)?)"),
        (None, r"(a)(b)(extra)?"),
        (Some(1), r"(foo)|(bar)"),
        (Some(2), r"(foo)(bar)"),
        (Some(2), r"(foo)+(bar)"),
        (None, r"(foo)*(bar)"),
        (Some(0), r"(foo)?{0}"),
        (None, r"(foo)?{1}"),
        (Some(1), r"(foo){1}"),
        (Some(1), r"(foo){1,}"),
        (Some(1), r"(foo){1,}?"),
        (None, r"(foo){1,}??"),
        (None, r"(foo){0,}"),
        (Some(1), r"(foo)(?:bar)"),
        (Some(2), r"(foo(?:bar)+)(?:baz(boo))"),
        (Some(2), r"(?P<bar>foo)(?:bar)(bal|loon)"),
        (
            Some(2),
            r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#,
        ),
    ];
    for &(expected, pattern) in cases {
        assert_eq!(
            expected,
            props(pattern).static_captures_len(),
            "pattern: {:?}",
            pattern
        );
    }
}

#[test]
fn analysis_is_all_assertions() {
// Positive examples.
Expand Down
2 changes: 2 additions & 0 deletions src/compile.rs
Expand Up @@ -161,6 +161,8 @@ impl Compiler {
self.fill_to_next(patch.hole);
self.compiled.matches = vec![self.insts.len()];
self.push_compiled(Inst::Match(0));
self.compiled.static_captures_len =
expr.properties().static_captures_len();
self.compile_finish()
}

Expand Down
6 changes: 6 additions & 0 deletions src/exec.rs
Expand Up @@ -1361,6 +1361,12 @@ impl Exec {
pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
&self.ro.nfa.capture_name_idx
}

/// Returns the number of capture groups that appear in every match, when
/// that number is the same for all matches. Returns `None` when it can
/// vary.
pub fn static_captures_len(&self) -> Option<usize> {
    // Plain read of the value stored on `self.ro.nfa`.
    let nfa = &self.ro.nfa;
    nfa.static_captures_len
}
}

impl Clone for Exec {
Expand Down
4 changes: 4 additions & 0 deletions src/prog.rs
Expand Up @@ -27,6 +27,9 @@ pub struct Program {
pub captures: Vec<Option<String>>,
/// Pointers to all named capture groups into `captures`.
pub capture_name_idx: Arc<HashMap<String, usize>>,
/// If the number of capture groups is the same for all possible matches,
/// then this is that number.
pub static_captures_len: Option<usize>,
/// A pointer to the start instruction. This can vary depending on how
/// the program was compiled. For example, programs for use with the DFA
/// engine have a `.*?` inserted at the beginning of unanchored regular
Expand Down Expand Up @@ -83,6 +86,7 @@ impl Program {
matches: vec![],
captures: vec![],
capture_name_idx: Arc::new(HashMap::new()),
static_captures_len: None,
start: 0,
byte_classes: vec![0; 256],
only_utf8: true,
Expand Down
40 changes: 40 additions & 0 deletions src/re_bytes.rs
Expand Up @@ -667,6 +667,46 @@ impl Regex {
self.0.capture_names().len()
}

/// Returns the number of capturing groups that participate in every
/// possible match, provided that number is the same for all matches.
///
/// When different matches can involve a different number of capture
/// groups, there is no single "static" count and `None` is returned
/// instead.
///
/// As with [`Regex::captures_len`], the implicit capturing group for the
/// overall match **is** counted. Consequently, any non-None value is at
/// least `1`, and `Some(0)` is never returned.
///
/// # Example
///
/// A handful of patterns with a static number of capture groups, along
/// with a handful of patterns without one.
///
/// ```
/// use regex::bytes::Regex;
///
/// let len = |pattern| {
///     Regex::new(pattern).map(|re| re.static_captures_len())
/// };
///
/// assert_eq!(Some(1), len("a")?);
/// assert_eq!(Some(2), len("(a)")?);
/// assert_eq!(Some(2), len("(a)|(b)")?);
/// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?);
/// assert_eq!(None, len("(a)|b")?);
/// assert_eq!(None, len("a|(b)")?);
/// assert_eq!(None, len("(b)*")?);
/// assert_eq!(Some(2), len("(b)+")?);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn static_captures_len(&self) -> Option<usize> {
    // Bail out with `None` when there is no static count; otherwise add
    // one to account for the implicit whole-match group.
    let inner_len = self.0.static_captures_len()?;
    Some(inner_len.saturating_add(1))
}

/// Returns an empty set of capture locations that can be reused in
/// multiple calls to `captures_read` or `captures_read_at`.
pub fn capture_locations(&self) -> CaptureLocations {
Expand Down
40 changes: 40 additions & 0 deletions src/re_unicode.rs
Expand Up @@ -725,6 +725,46 @@ impl Regex {
self.0.capture_names().len()
}

/// Returns the number of capturing groups that participate in every
/// possible match, provided that number is the same for all matches.
///
/// When different matches can involve a different number of capture
/// groups, there is no single "static" count and `None` is returned
/// instead.
///
/// As with [`Regex::captures_len`], the implicit capturing group for the
/// overall match **is** counted. Consequently, any non-None value is at
/// least `1`, and `Some(0)` is never returned.
///
/// # Example
///
/// A handful of patterns with a static number of capture groups, along
/// with a handful of patterns without one.
///
/// ```
/// use regex::Regex;
///
/// let len = |pattern| {
///     Regex::new(pattern).map(|re| re.static_captures_len())
/// };
///
/// assert_eq!(Some(1), len("a")?);
/// assert_eq!(Some(2), len("(a)")?);
/// assert_eq!(Some(2), len("(a)|(b)")?);
/// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?);
/// assert_eq!(None, len("(a)|b")?);
/// assert_eq!(None, len("a|(b)")?);
/// assert_eq!(None, len("(b)*")?);
/// assert_eq!(Some(2), len("(b)+")?);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn static_captures_len(&self) -> Option<usize> {
    // Bail out with `None` when there is no static count; otherwise add
    // one to account for the implicit whole-match group.
    let inner_len = self.0.static_captures_len()?;
    Some(inner_len.saturating_add(1))
}

/// Returns an empty set of capture locations that can be reused in
/// multiple calls to `captures_read` or `captures_read_at`.
pub fn capture_locations(&self) -> CaptureLocations {
Expand Down

0 comments on commit b23e97a

Please sign in to comment.