Skip to content

Commit

Permalink
impl: fix prefix literal matching bug
Browse files Browse the repository at this point in the history
This commit fixes a bug where it was possible to report a match where
none existed. Basically, in the current regex crate, it just cannot deal
with a mixture of look-around assertions in the prefix of a pattern and
prefix literal optimizations. Before 1.8, this was handled by simply
refusing to extract literals in that case. But in 1.8, with a rewrite of
the literal extractor, literals are now extracted for patterns like
this:

    (?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))

So in 1.8, since it was still using the old engines that can't deal with
this, I added some extra logic to throw away any extracted prefix
literals if a look-around assertion occurred in the prefix of the
pattern. The problem is that the logic I used was "always occurs in the
prefix of the pattern" instead of "may occur in the prefix of the
pattern." In the pattern above, it's the latter case. So it slipped by
and the regex engine tried to use the prefix literals to accelerat the
search. This in turn caused mishandling of the `\b` and led to a false
positive match.

The specific reason why the current regex engines can't deal with this
is because they weren't designed to handle searches that took the
surrounding context into account when resolving look-around assertions.
It was a pretty big oversight on my part many years ago.

The new engines we'll be migrating to Real Soon Now don't have this
problem and can deal with the prefix literal optimizations while
correctly handling look-around assertions in the prefix.

Fixes #981
  • Loading branch information
BurntSushi committed Apr 21, 2023
1 parent 93316a3 commit 4a75247
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 9 deletions.
56 changes: 53 additions & 3 deletions regex-syntax/src/hir/mod.rs
Expand Up @@ -1854,6 +1854,8 @@ struct PropertiesI {
look_set: LookSet,
look_set_prefix: LookSet,
look_set_suffix: LookSet,
look_set_prefix_any: LookSet,
look_set_suffix_any: LookSet,
utf8: bool,
explicit_captures_len: usize,
static_explicit_captures_len: Option<usize>,
Expand Down Expand Up @@ -1909,6 +1911,19 @@ impl Properties {
self.0.look_set_prefix
}

/// Returns a set of all look-around assertions that appear as a _possible_
/// prefix for this HIR value. That is, the set returned corresponds to the
/// set of assertions that _may_ be passed before matching any bytes in a
/// haystack.
///
/// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns
/// true if and only if it's possible for the regex to match through a
/// anchored assertion before consuming any input.
#[inline]
pub fn look_set_prefix_any(&self) -> LookSet {
self.0.look_set_prefix_any
}

/// Returns a set of all look-around assertions that appear as a suffix for
/// this HIR value. That is, the set returned corresponds to the set of
/// assertions that must be passed in order to be considered a match after
Expand All @@ -1921,6 +1936,19 @@ impl Properties {
self.0.look_set_suffix
}

/// Returns a set of all look-around assertions that appear as a _possible_
/// suffix for this HIR value. That is, the set returned corresponds to the
/// set of assertions that _may_ be passed before matching any bytes in a
/// haystack.
///
/// For example, `hir.look_set_suffix_any().contains(Look::End)` returns
/// true if and only if it's possible for the regex to match through a
/// anchored assertion at the end of a match without consuming any input.
#[inline]
pub fn look_set_suffix_any(&self) -> LookSet {
self.0.look_set_suffix_any
}

/// Return true if and only if the corresponding HIR will always match
/// valid UTF-8.
///
Expand Down Expand Up @@ -2188,6 +2216,8 @@ impl Properties {
look_set: LookSet::empty(),
look_set_prefix: fix,
look_set_suffix: fix,
look_set_prefix_any: LookSet::empty(),
look_set_suffix_any: LookSet::empty(),
utf8: true,
explicit_captures_len: 0,
static_explicit_captures_len,
Expand All @@ -2201,6 +2231,8 @@ impl Properties {
props.look_set.set_union(p.look_set());
props.look_set_prefix.set_intersect(p.look_set_prefix());
props.look_set_suffix.set_intersect(p.look_set_suffix());
props.look_set_prefix_any.set_union(p.look_set_prefix_any());
props.look_set_suffix_any.set_union(p.look_set_suffix_any());
props.utf8 = props.utf8 && p.is_utf8();
props.explicit_captures_len = props
.explicit_captures_len
Expand Down Expand Up @@ -2246,6 +2278,8 @@ impl Properties {
look_set: LookSet::empty(),
look_set_prefix: LookSet::empty(),
look_set_suffix: LookSet::empty(),
look_set_prefix_any: LookSet::empty(),
look_set_suffix_any: LookSet::empty(),
// It is debatable whether an empty regex always matches at valid
// UTF-8 boundaries. Strictly speaking, at a byte oriented view,
// it is clearly false. There are, for example, many empty strings
Expand Down Expand Up @@ -2280,6 +2314,8 @@ impl Properties {
look_set: LookSet::empty(),
look_set_prefix: LookSet::empty(),
look_set_suffix: LookSet::empty(),
look_set_prefix_any: LookSet::empty(),
look_set_suffix_any: LookSet::empty(),
utf8: core::str::from_utf8(&lit.0).is_ok(),
explicit_captures_len: 0,
static_explicit_captures_len: Some(0),
Expand All @@ -2297,6 +2333,8 @@ impl Properties {
look_set: LookSet::empty(),
look_set_prefix: LookSet::empty(),
look_set_suffix: LookSet::empty(),
look_set_prefix_any: LookSet::empty(),
look_set_suffix_any: LookSet::empty(),
utf8: class.is_utf8(),
explicit_captures_len: 0,
static_explicit_captures_len: Some(0),
Expand All @@ -2314,6 +2352,8 @@ impl Properties {
look_set: LookSet::singleton(look),
look_set_prefix: LookSet::singleton(look),
look_set_suffix: LookSet::singleton(look),
look_set_prefix_any: LookSet::singleton(look),
look_set_suffix_any: LookSet::singleton(look),
// This requires a little explanation. Basically, we don't consider
// matching an empty string to be equivalent to matching invalid
// UTF-8, even though technically matching every empty string will
Expand Down Expand Up @@ -2355,15 +2395,17 @@ impl Properties {
look_set: p.look_set(),
look_set_prefix: LookSet::empty(),
look_set_suffix: LookSet::empty(),
look_set_prefix_any: p.look_set_prefix_any(),
look_set_suffix_any: p.look_set_suffix_any(),
utf8: p.is_utf8(),
explicit_captures_len: p.explicit_captures_len(),
static_explicit_captures_len: p.static_explicit_captures_len(),
literal: false,
alternation_literal: false,
};
// The repetition operator can match the empty string, then its lookset
// prefix and suffixes themselves remain empty since they are no longer
// required to match.
// If the repetition operator can match the empty string, then its
// lookset prefix and suffixes themselves remain empty since they are
// no longer required to match.
if rep.min > 0 {
inner.look_set_prefix = p.look_set_prefix();
inner.look_set_suffix = p.look_set_suffix();
Expand Down Expand Up @@ -2414,6 +2456,8 @@ impl Properties {
look_set: LookSet::empty(),
look_set_prefix: LookSet::empty(),
look_set_suffix: LookSet::empty(),
look_set_prefix_any: LookSet::empty(),
look_set_suffix_any: LookSet::empty(),
utf8: true,
explicit_captures_len: 0,
static_explicit_captures_len: Some(0),
Expand Down Expand Up @@ -2455,6 +2499,9 @@ impl Properties {
let mut it = concat.iter();
while let Some(x) = it.next() {
props.look_set_prefix.set_union(x.properties().look_set_prefix());
props
.look_set_prefix_any
.set_union(x.properties().look_set_prefix_any());
if x.properties().maximum_len().map_or(true, |x| x > 0) {
break;
}
Expand All @@ -2463,6 +2510,9 @@ impl Properties {
let mut it = concat.iter().rev();
while let Some(x) = it.next() {
props.look_set_suffix.set_union(x.properties().look_set_suffix());
props
.look_set_suffix_any
.set_union(x.properties().look_set_suffix_any());
if x.properties().maximum_len().map_or(true, |x| x > 0) {
break;
}
Expand Down
6 changes: 6 additions & 0 deletions regex-syntax/src/hir/translate.rs
Expand Up @@ -3287,6 +3287,12 @@ mod tests {
assert_eq!(p.minimum_len(), Some(1));
}

#[test]
fn analysis_look_set_prefix_any() {
let p = props(r"(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))");
assert!(p.look_set_prefix_any().contains(Look::WordUnicode));
}

#[test]
fn analysis_is_anchored() {
let is_start = |p| props(p).look_set_prefix().contains(Look::Start);
Expand Down
13 changes: 7 additions & 6 deletions src/exec.rs
Expand Up @@ -274,18 +274,18 @@ impl ExecBuilder {
// prefixes, so disable them.
prefixes = None;
} else if is_set
&& props.look_set_prefix().contains(Look::Start)
&& props.look_set_prefix_any().contains(Look::Start)
{
// Regex sets with anchors do not go well with literal
// optimizations.
prefixes = None;
} else if props.look_set_prefix().contains_word() {
} else if props.look_set_prefix_any().contains_word() {
// The new literal extractor ignores look-around while
// the old one refused to extract prefixes from regexes
// that began with a \b. These old creaky regex internals
// can't deal with it, so we drop it.
prefixes = None;
} else if props.look_set().contains(Look::StartLF) {
} else if props.look_set_prefix_any().contains(Look::StartLF) {
// Similar to the reasoning for word boundaries, this old
// regex engine can't handle literal prefixes with '(?m:^)'
// at the beginning of a regex.
Expand All @@ -298,15 +298,16 @@ impl ExecBuilder {
// Partial anchors unfortunately make it hard to use
// suffixes, so disable them.
suffixes = None;
} else if is_set && props.look_set_suffix().contains(Look::End)
} else if is_set
&& props.look_set_suffix_any().contains(Look::End)
{
// Regex sets with anchors do not go well with literal
// optimizations.
suffixes = None;
} else if props.look_set_suffix().contains_word() {
} else if props.look_set_suffix_any().contains_word() {
// See the prefix case for reasoning here.
suffixes = None;
} else if props.look_set().contains(Look::EndLF) {
} else if props.look_set_suffix_any().contains(Look::EndLF) {
// See the prefix case for reasoning here.
suffixes = None;
}
Expand Down
19 changes: 19 additions & 0 deletions tests/regression.rs
Expand Up @@ -220,3 +220,22 @@ matiter!(empty_group_find, r"()Ј01", "zЈ01", (1, 5));

// See: https://github.com/rust-lang/regex/issues/862
mat!(non_greedy_question_literal, r"ab??", "ab", Some((0, 1)));

// See: https://github.com/rust-lang/regex/issues/981
#[cfg(feature = "unicode")]
#[test]
fn regression_bad_word_boundary() {
let re = regex_new!(r#"(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))"#).unwrap();
let hay = "ubi-Darwin-x86_64.tar.gz";
assert!(!re.is_match(text!(hay)));
let hay = "ubi-Windows-x86_64.zip";
assert!(re.is_match(text!(hay)));
}

// See: https://github.com/rust-lang/regex/issues/982
#[test]
fn regression_unicode_perl_not_enabled() {
let pat = r"(\d+\s?(years|year|y))?\s?(\d+\s?(months|month|m))?\s?(\d+\s?(weeks|week|w))?\s?(\d+\s?(days|day|d))?\s?(\d+\s?(hours|hour|h))?";
let re = regex_new!(pat);
assert!(re.is_ok());
}

0 comments on commit 4a75247

Please sign in to comment.