Skip to content

Commit

Permalink
syntax: reject '(?-u)\W' when UTF-8 mode is enabled
Browse files Browse the repository at this point in the history
When Unicode mode is disabled (i.e., (?-u)), the Perl character classes
(\w, \d and \s) revert to their ASCII definitions. The negated forms
of these classes are also derived from their ASCII definitions, and this
means that they may actually match bytes outside of ASCII and thus
possibly invalid UTF-8. For this reason, when the translator is
configured to only produce HIR that matches valid UTF-8, '(?-u)\W'
should be rejected.

Previously, it was not being rejected, which could actually lead to
matches that produced offsets that split codepoints, and thus lead to
panics when match offsets are used to slice a string. For example, this
code

  fn main() {
      let re = regex::Regex::new(r"(?-u)\W").unwrap();
      let haystack = "☃";
      if let Some(m) = re.find(haystack) {
          println!("{:?}", &haystack[m.range()]);
      }
  }

panics with

  byte index 1 is not a char boundary; it is inside '☃' (bytes 0..3) of `☃`

That is, it reports a match at 0..1, which is technically correct, but
the regex itself should have been rejected in the first place since the
top-level Regex API always has UTF-8 mode enabled.

Also, many of the replacement tests were using '(?-u)\W' (or similar)
for some reason. I'm not sure why, so I just removed the '(?-u)' to make
those tests pass. Whether Unicode is enabled or not doesn't seem to be
an interesting detail for those tests. (All haystacks and replacements
appear to be ASCII.)

Fixes #895, Partially addresses #738
  • Loading branch information
BurntSushi committed Apr 17, 2023
1 parent 78c647d commit d258c29
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 19 deletions.
95 changes: 84 additions & 11 deletions regex-syntax/src/hir/translate.rs
Expand Up @@ -305,7 +305,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
let hcls = hir::Class::Unicode(cls);
self.push(HirFrame::Expr(Hir::class(hcls)));
} else {
let cls = self.hir_perl_byte_class(x);
let cls = self.hir_perl_byte_class(x)?;
let hcls = hir::Class::Bytes(cls);
self.push(HirFrame::Expr(Hir::class(hcls)));
}
Expand Down Expand Up @@ -445,7 +445,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
cls.union(&xcls);
self.push(HirFrame::ClassUnicode(cls));
} else {
let xcls = self.hir_perl_byte_class(x);
let xcls = self.hir_perl_byte_class(x)?;
let mut cls = self.pop().unwrap().unwrap_class_bytes();
cls.union(&xcls);
self.push(HirFrame::ClassBytes(cls));
Expand Down Expand Up @@ -879,7 +879,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
fn hir_perl_byte_class(
&self,
ast_class: &ast::ClassPerl,
) -> hir::ClassBytes {
) -> Result<hir::ClassBytes> {
use crate::ast::ClassPerlKind::*;

assert!(!self.flags().unicode());
Expand All @@ -893,7 +893,13 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
if ast_class.negated {
class.negate();
}
class
// Negating a Perl byte class is likely to cause it to match invalid
// UTF-8. That's only OK if the translator is configured to allow such
// things.
if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() {
return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8));
}
Ok(class)
}

/// Converts the given Unicode specific error to an HIR translation error.
Expand Down Expand Up @@ -1971,7 +1977,7 @@ mod tests {

#[test]
#[cfg(feature = "unicode-perl")]
fn class_perl() {
fn class_perl_unicode() {
// Unicode
assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit")));
assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space")));
Expand Down Expand Up @@ -2011,7 +2017,10 @@ mod tests {
);
#[cfg(feature = "unicode-case")]
assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word()));
}

#[test]
fn class_perl_ascii() {
// ASCII only
assert_eq!(
t(r"(?-u)\d"),
Expand Down Expand Up @@ -2040,29 +2049,93 @@ mod tests {

// ASCII only, negated
assert_eq!(
t(r"(?-u)\D"),
t_bytes(r"(?-u)\D"),
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
);
assert_eq!(
t(r"(?-u)\S"),
t_bytes(r"(?-u)\S"),
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
);
assert_eq!(
t(r"(?-u)\W"),
t_bytes(r"(?-u)\W"),
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
);
assert_eq!(
t(r"(?i-u)\D"),
t_bytes(r"(?i-u)\D"),
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
);
assert_eq!(
t(r"(?i-u)\S"),
t_bytes(r"(?i-u)\S"),
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
);
assert_eq!(
t(r"(?i-u)\W"),
t_bytes(r"(?i-u)\W"),
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
);

// ASCII only, negated, with UTF-8 mode enabled.
// In this case, negating any Perl class results in an error because
// all such classes can match invalid UTF-8.
assert_eq!(
t_err(r"(?-u)\D"),
TestError {
kind: hir::ErrorKind::InvalidUtf8,
span: Span::new(
Position::new(5, 1, 6),
Position::new(7, 1, 8),
),
},
);
assert_eq!(
t_err(r"(?-u)\S"),
TestError {
kind: hir::ErrorKind::InvalidUtf8,
span: Span::new(
Position::new(5, 1, 6),
Position::new(7, 1, 8),
),
},
);
assert_eq!(
t_err(r"(?-u)\W"),
TestError {
kind: hir::ErrorKind::InvalidUtf8,
span: Span::new(
Position::new(5, 1, 6),
Position::new(7, 1, 8),
),
},
);
assert_eq!(
t_err(r"(?i-u)\D"),
TestError {
kind: hir::ErrorKind::InvalidUtf8,
span: Span::new(
Position::new(6, 1, 7),
Position::new(8, 1, 9),
),
},
);
assert_eq!(
t_err(r"(?i-u)\S"),
TestError {
kind: hir::ErrorKind::InvalidUtf8,
span: Span::new(
Position::new(6, 1, 7),
Position::new(8, 1, 9),
),
},
);
assert_eq!(
t_err(r"(?i-u)\W"),
TestError {
kind: hir::ErrorKind::InvalidUtf8,
span: Span::new(
Position::new(6, 1, 7),
Position::new(8, 1, 9),
),
},
);
}

#[test]
Expand Down
16 changes: 8 additions & 8 deletions tests/replace.rs
Expand Up @@ -15,15 +15,15 @@ replace!(all, replace_all, r"[0-9]", "age: 26", t!("Z"), "age: ZZ");
replace!(
groups,
replace,
r"(?-u)(\S+)\s+(\S+)",
r"([^ ]+)[ ]+([^ ]+)",
"w1 w2",
t!("$2 $1"),
"w2 w1"
);
replace!(
double_dollar,
replace,
r"(?-u)(\S+)\s+(\S+)",
r"([^ ]+)[ ]+([^ ]+)",
"w1 w2",
t!("$2 $$1"),
"w2 $1"
Expand All @@ -33,7 +33,7 @@ replace!(
replace!(
named,
replace_all,
r"(?-u)(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)",
r"(?P<first>[^ ]+)[ ]+(?P<last>[^ ]+)(?P<space>[ ]*)",
"w1 w2 w3 w4",
t!("$last $first$space"),
"w2 w1 w4 w3"
Expand All @@ -51,39 +51,39 @@ replace!(number_hypen, replace, r"(.)(.)", "ab", t!("$1-$2"), "a-b");
replace!(
simple_expand,
replace_all,
r"(?-u)(\w) (\w)",
r"([a-z]) ([a-z])",
"a b",
t!("$2 $1"),
"b a"
);
replace!(
literal_dollar1,
replace_all,
r"(?-u)(\w+) (\w+)",
r"([a-z]+) ([a-z]+)",
"a b",
t!("$$1"),
"$1"
);
replace!(
literal_dollar2,
replace_all,
r"(?-u)(\w+) (\w+)",
r"([a-z]+) ([a-z]+)",
"a b",
t!("$2 $$c $1"),
"b $c a"
);
replace!(
no_expand1,
replace,
r"(?-u)(\S+)\s+(\S+)",
r"([^ ]+)[ ]+([^ ]+)",
"w1 w2",
no_expand!("$2 $1"),
"$2 $1"
);
replace!(
no_expand2,
replace,
r"(?-u)(\S+)\s+(\S+)",
r"([^ ]+)[ ]+([^ ]+)",
"w1 w2",
no_expand!("$$1"),
"$$1"
Expand Down

0 comments on commit d258c29

Please sign in to comment.