impl: fix prefix literal matching bug

This commit fixes a bug where it was possible to report a match where none existed. Basically, in the current regex crate, it just cannot deal with a mixture of look-around assertions in the prefix of a pattern and prefix literal optimizations. Before 1.8, this was handled by simply refusing to extract literals in that case. But in 1.8, with a rewrite of the literal extractor, literals are now extracted for patterns like this: (?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_)) So in 1.8, since it was still using the old engines that can't deal with this, I added some extra logic to throw away any extracted prefix literals if a look-around assertion occurred in the prefix of the pattern. The problem is that the logic I used was "always occurs in the prefix of the pattern" instead of "may occur in the prefix of the pattern." In the pattern above, it's the latter case. So it slipped by and the regex engine tried to use the prefix literals to accelerat the search. This in turn caused mishandling of the `\b` and led to a false positive match. The specific reason why the current regex engines can't deal with this is because they weren't designed to handle searches that took the surrounding context into account when resolving look-around assertions. It was a pretty big oversight on my part many years ago. The new engines we'll be migrating to Real Soon Now don't have this problem and can deal with the prefix literal optimizations while correctly handling look-around assertions in the prefix. Fixes #981
rust-lang · Apr 21, 2023 · 4a75247 · 4a75247
1 parent 93316a3
commit 4a75247
Show file tree

Hide file tree

Showing 4 changed files with 85 additions and 9 deletions.
diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
@@ -1854,6 +1854,8 @@ struct PropertiesI {
     look_set: LookSet,
     look_set_prefix: LookSet,
     look_set_suffix: LookSet,
+    look_set_prefix_any: LookSet,
+    look_set_suffix_any: LookSet,
     utf8: bool,
     explicit_captures_len: usize,
     static_explicit_captures_len: Option<usize>,
@@ -1909,6 +1911,19 @@ impl Properties {
         self.0.look_set_prefix
     }
 
+    /// Returns a set of all look-around assertions that appear as a _possible_
+    /// prefix for this HIR value. That is, the set returned corresponds to the
+    /// set of assertions that _may_ be passed before matching any bytes in a
+    /// haystack.
+    ///
+    /// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns
+    /// true if and only if it's possible for the regex to match through a
+    /// anchored assertion before consuming any input.
+    #[inline]
+    pub fn look_set_prefix_any(&self) -> LookSet {
+        self.0.look_set_prefix_any
+    }
+
     /// Returns a set of all look-around assertions that appear as a suffix for
     /// this HIR value. That is, the set returned corresponds to the set of
     /// assertions that must be passed in order to be considered a match after
@@ -1921,6 +1936,19 @@ impl Properties {
         self.0.look_set_suffix
     }
 
+    /// Returns a set of all look-around assertions that appear as a _possible_
+    /// suffix for this HIR value. That is, the set returned corresponds to the
+    /// set of assertions that _may_ be passed before matching any bytes in a
+    /// haystack.
+    ///
+    /// For example, `hir.look_set_suffix_any().contains(Look::End)` returns
+    /// true if and only if it's possible for the regex to match through a
+    /// anchored assertion at the end of a match without consuming any input.
+    #[inline]
+    pub fn look_set_suffix_any(&self) -> LookSet {
+        self.0.look_set_suffix_any
+    }
+
     /// Return true if and only if the corresponding HIR will always match
     /// valid UTF-8.
     ///
@@ -2188,6 +2216,8 @@ impl Properties {
             look_set: LookSet::empty(),
             look_set_prefix: fix,
             look_set_suffix: fix,
+            look_set_prefix_any: LookSet::empty(),
+            look_set_suffix_any: LookSet::empty(),
             utf8: true,
             explicit_captures_len: 0,
             static_explicit_captures_len,
@@ -2201,6 +2231,8 @@ impl Properties {
             props.look_set.set_union(p.look_set());
             props.look_set_prefix.set_intersect(p.look_set_prefix());
             props.look_set_suffix.set_intersect(p.look_set_suffix());
+            props.look_set_prefix_any.set_union(p.look_set_prefix_any());
+            props.look_set_suffix_any.set_union(p.look_set_suffix_any());
             props.utf8 = props.utf8 && p.is_utf8();
             props.explicit_captures_len = props
                 .explicit_captures_len
@@ -2246,6 +2278,8 @@ impl Properties {
             look_set: LookSet::empty(),
             look_set_prefix: LookSet::empty(),
             look_set_suffix: LookSet::empty(),
+            look_set_prefix_any: LookSet::empty(),
+            look_set_suffix_any: LookSet::empty(),
             // It is debatable whether an empty regex always matches at valid
             // UTF-8 boundaries. Strictly speaking, at a byte oriented view,
             // it is clearly false. There are, for example, many empty strings
@@ -2280,6 +2314,8 @@ impl Properties {
             look_set: LookSet::empty(),
             look_set_prefix: LookSet::empty(),
             look_set_suffix: LookSet::empty(),
+            look_set_prefix_any: LookSet::empty(),
+            look_set_suffix_any: LookSet::empty(),
             utf8: core::str::from_utf8(&lit.0).is_ok(),
             explicit_captures_len: 0,
             static_explicit_captures_len: Some(0),
@@ -2297,6 +2333,8 @@ impl Properties {
             look_set: LookSet::empty(),
             look_set_prefix: LookSet::empty(),
             look_set_suffix: LookSet::empty(),
+            look_set_prefix_any: LookSet::empty(),
+            look_set_suffix_any: LookSet::empty(),
             utf8: class.is_utf8(),
             explicit_captures_len: 0,
             static_explicit_captures_len: Some(0),
@@ -2314,6 +2352,8 @@ impl Properties {
             look_set: LookSet::singleton(look),
             look_set_prefix: LookSet::singleton(look),
             look_set_suffix: LookSet::singleton(look),
+            look_set_prefix_any: LookSet::singleton(look),
+            look_set_suffix_any: LookSet::singleton(look),
             // This requires a little explanation. Basically, we don't consider
             // matching an empty string to be equivalent to matching invalid
             // UTF-8, even though technically matching every empty string will
@@ -2355,15 +2395,17 @@ impl Properties {
             look_set: p.look_set(),
             look_set_prefix: LookSet::empty(),
             look_set_suffix: LookSet::empty(),
+            look_set_prefix_any: p.look_set_prefix_any(),
+            look_set_suffix_any: p.look_set_suffix_any(),
             utf8: p.is_utf8(),
             explicit_captures_len: p.explicit_captures_len(),
             static_explicit_captures_len: p.static_explicit_captures_len(),
             literal: false,
             alternation_literal: false,
         };
-        // The repetition operator can match the empty string, then its lookset
-        // prefix and suffixes themselves remain empty since they are no longer
-        // required to match.
+        // If the repetition operator can match the empty string, then its
+        // lookset prefix and suffixes themselves remain empty since they are
+        // no longer required to match.
         if rep.min > 0 {
             inner.look_set_prefix = p.look_set_prefix();
             inner.look_set_suffix = p.look_set_suffix();
@@ -2414,6 +2456,8 @@ impl Properties {
             look_set: LookSet::empty(),
             look_set_prefix: LookSet::empty(),
             look_set_suffix: LookSet::empty(),
+            look_set_prefix_any: LookSet::empty(),
+            look_set_suffix_any: LookSet::empty(),
             utf8: true,
             explicit_captures_len: 0,
             static_explicit_captures_len: Some(0),
@@ -2455,6 +2499,9 @@ impl Properties {
         let mut it = concat.iter();
         while let Some(x) = it.next() {
             props.look_set_prefix.set_union(x.properties().look_set_prefix());
+            props
+                .look_set_prefix_any
+                .set_union(x.properties().look_set_prefix_any());
             if x.properties().maximum_len().map_or(true, |x| x > 0) {
                 break;
             }
@@ -2463,6 +2510,9 @@ impl Properties {
         let mut it = concat.iter().rev();
         while let Some(x) = it.next() {
             props.look_set_suffix.set_union(x.properties().look_set_suffix());
+            props
+                .look_set_suffix_any
+                .set_union(x.properties().look_set_suffix_any());
             if x.properties().maximum_len().map_or(true, |x| x > 0) {
                 break;
             }

diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
@@ -3287,6 +3287,12 @@ mod tests {
         assert_eq!(p.minimum_len(), Some(1));
     }
 
+    #[test]
+    fn analysis_look_set_prefix_any() {
+        let p = props(r"(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))");
+        assert!(p.look_set_prefix_any().contains(Look::WordUnicode));
+    }
+
     #[test]
     fn analysis_is_anchored() {
         let is_start = |p| props(p).look_set_prefix().contains(Look::Start);

diff --git a/src/exec.rs b/src/exec.rs
@@ -274,18 +274,18 @@ impl ExecBuilder {
                     // prefixes, so disable them.
                     prefixes = None;
                 } else if is_set
-                    && props.look_set_prefix().contains(Look::Start)
+                    && props.look_set_prefix_any().contains(Look::Start)
                 {
                     // Regex sets with anchors do not go well with literal
                     // optimizations.
                     prefixes = None;
-                } else if props.look_set_prefix().contains_word() {
+                } else if props.look_set_prefix_any().contains_word() {
                     // The new literal extractor ignores look-around while
                     // the old one refused to extract prefixes from regexes
                     // that began with a \b. These old creaky regex internals
                     // can't deal with it, so we drop it.
                     prefixes = None;
-                } else if props.look_set().contains(Look::StartLF) {
+                } else if props.look_set_prefix_any().contains(Look::StartLF) {
                     // Similar to the reasoning for word boundaries, this old
                     // regex engine can't handle literal prefixes with '(?m:^)'
                     // at the beginning of a regex.
@@ -298,15 +298,16 @@ impl ExecBuilder {
                     // Partial anchors unfortunately make it hard to use
                     // suffixes, so disable them.
                     suffixes = None;
-                } else if is_set && props.look_set_suffix().contains(Look::End)
+                } else if is_set
+                    && props.look_set_suffix_any().contains(Look::End)
                 {
                     // Regex sets with anchors do not go well with literal
                     // optimizations.
                     suffixes = None;
-                } else if props.look_set_suffix().contains_word() {
+                } else if props.look_set_suffix_any().contains_word() {
                     // See the prefix case for reasoning here.
                     suffixes = None;
-                } else if props.look_set().contains(Look::EndLF) {
+                } else if props.look_set_suffix_any().contains(Look::EndLF) {
                     // See the prefix case for reasoning here.
                     suffixes = None;
                 }

diff --git a/tests/regression.rs b/tests/regression.rs
@@ -220,3 +220,22 @@ matiter!(empty_group_find, r"()Ј01", "zЈ01", (1, 5));
 
 // See: https://github.com/rust-lang/regex/issues/862
 mat!(non_greedy_question_literal, r"ab??", "ab", Some((0, 1)));
+
+// See: https://github.com/rust-lang/regex/issues/981
+#[cfg(feature = "unicode")]
+#[test]
+fn regression_bad_word_boundary() {
+    let re = regex_new!(r#"(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))"#).unwrap();
+    let hay = "ubi-Darwin-x86_64.tar.gz";
+    assert!(!re.is_match(text!(hay)));
+    let hay = "ubi-Windows-x86_64.zip";
+    assert!(re.is_match(text!(hay)));
+}
+
+// See: https://github.com/rust-lang/regex/issues/982
+#[test]
+fn regression_unicode_perl_not_enabled() {
+    let pat = r"(\d+\s?(years|year|y))?\s?(\d+\s?(months|month|m))?\s?(\d+\s?(weeks|week|w))?\s?(\d+\s?(days|day|d))?\s?(\d+\s?(hours|hour|h))?";
+    let re = regex_new!(pat);
+    assert!(re.is_ok());
+}