Skip to content

Commit

Permalink
syntax: permit most no-op escape sequences
Browse files Browse the repository at this point in the history
This resolves a long-standing (but somewhat minor) complaint that folks
have with the regex crate: it does not permit escaping punctuation
characters in cases where those characters do not need to be escaped. So
things like \/, \" and \! would result in parse errors. Most other regex
engines permit these, even in cases where they aren't needed.

I had been against doing this for future evolution purposes, but it's
incredibly unlikely that we're ever going to add a new meta character to
the syntax. I literally cannot think of any conceivable future in which
that might happen.

However, we do continue to ban escapes for [0-9A-Za-z<>], because it is
conceivable that we might add new escape sequences for those characters.
(And 0-9 are already banned by virtue of them looking too much like
backreferences, which aren't supported.) For example, we could add
\Q...\E literal syntax. Or \< and \> as start and end word boundaries,
as found in POSIX regex engines.

Fixes #501
  • Loading branch information
BurntSushi committed Apr 17, 2023
1 parent e4006af commit fbdc4a9
Show file tree
Hide file tree
Showing 4 changed files with 185 additions and 61 deletions.
9 changes: 6 additions & 3 deletions regex-syntax/src/ast/mod.rs
Expand Up @@ -588,9 +588,12 @@ impl Literal {
pub enum LiteralKind {
/// The literal is written verbatim, e.g., `a` or `☃`.
Verbatim,
/// The literal is written as an escape because it is punctuation, e.g.,
/// `\*` or `\[`.
Punctuation,
/// The literal is written as an escape because it is otherwise a special
/// regex meta character, e.g., `\*` or `\[`.
Meta,
/// The literal is written as an escape despite the fact that the escape is
/// unnecessary, e.g., `\%` or `\/`.
Superfluous,
/// The literal is written as an octal escape, e.g., `\141`.
Octal,
/// The literal is written as a hex code with a fixed number of digits
Expand Down
133 changes: 81 additions & 52 deletions regex-syntax/src/ast/parse.rs
Expand Up @@ -18,7 +18,7 @@ use alloc::{
use crate::{
ast::{self, Ast, Position, Span},
either::Either,
is_meta_character,
is_escapeable_character, is_meta_character,
};

type Result<T> = core::result::Result<T, ast::Error>;
Expand Down Expand Up @@ -1495,7 +1495,14 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
if is_meta_character(c) {
return Ok(Primitive::Literal(ast::Literal {
span,
kind: ast::LiteralKind::Punctuation,
kind: ast::LiteralKind::Meta,
c,
}));
}
if is_escapeable_character(c) {
return Ok(Primitive::Literal(ast::Literal {
span,
kind: ast::LiteralKind::Superfluous,
c,
}));
}
Expand All @@ -1513,9 +1520,6 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'),
'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'),
'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'),
' ' if self.ignore_whitespace() => {
special(ast::SpecialLiteralKind::Space, ' ')
}
'A' => Ok(Primitive::Assertion(ast::Assertion {
span,
kind: ast::AssertionKind::StartText,
Expand Down Expand Up @@ -2420,13 +2424,9 @@ mod tests {
lit_with(c, span(start..start + c.len_utf8()))
}

/// Create a punctuation literal starting at the given position.
fn punct_lit(c: char, span: Span) -> Ast {
Ast::Literal(ast::Literal {
span,
kind: ast::LiteralKind::Punctuation,
c,
})
/// Create a meta literal starting at the given position.
fn meta_lit(c: char, span: Span) -> Ast {
Ast::Literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c })
}

/// Create a verbatim literal with the given span.
Expand Down Expand Up @@ -2710,24 +2710,24 @@ bar
Ok(concat(
0..36,
vec![
punct_lit('\\', span(0..2)),
punct_lit('.', span(2..4)),
punct_lit('+', span(4..6)),
punct_lit('*', span(6..8)),
punct_lit('?', span(8..10)),
punct_lit('(', span(10..12)),
punct_lit(')', span(12..14)),
punct_lit('|', span(14..16)),
punct_lit('[', span(16..18)),
punct_lit(']', span(18..20)),
punct_lit('{', span(20..22)),
punct_lit('}', span(22..24)),
punct_lit('^', span(24..26)),
punct_lit('$', span(26..28)),
punct_lit('#', span(28..30)),
punct_lit('&', span(30..32)),
punct_lit('-', span(32..34)),
punct_lit('~', span(34..36)),
meta_lit('\\', span(0..2)),
meta_lit('.', span(2..4)),
meta_lit('+', span(4..6)),
meta_lit('*', span(6..8)),
meta_lit('?', span(8..10)),
meta_lit('(', span(10..12)),
meta_lit(')', span(12..14)),
meta_lit('|', span(14..16)),
meta_lit('[', span(16..18)),
meta_lit(']', span(18..20)),
meta_lit('{', span(20..22)),
meta_lit('}', span(22..24)),
meta_lit('^', span(24..26)),
meta_lit('$', span(26..28)),
meta_lit('#', span(28..30)),
meta_lit('&', span(30..32)),
meta_lit('-', span(32..34)),
meta_lit('~', span(34..36)),
]
))
);
Expand Down Expand Up @@ -2879,23 +2879,12 @@ bar
flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
Ast::Literal(ast::Literal {
span: span_range(pat, 4..6),
kind: ast::LiteralKind::Special(
ast::SpecialLiteralKind::Space
),
kind: ast::LiteralKind::Superfluous,
c: ' ',
}),
]
))
);
// ... but only when `x` mode is enabled.
let pat = r"\ ";
assert_eq!(
parser(pat).parse().unwrap_err(),
TestError {
span: span_range(pat, 0..2),
kind: ast::ErrorKind::EscapeUnrecognized,
}
);
}

#[test]
Expand Down Expand Up @@ -4246,7 +4235,7 @@ bar
parser(r"\|").parse_primitive(),
Ok(Primitive::Literal(ast::Literal {
span: span(0..2),
kind: ast::LiteralKind::Punctuation,
kind: ast::LiteralKind::Meta,
c: '|',
}))
);
Expand Down Expand Up @@ -4297,11 +4286,26 @@ bar
}))
);

// We also support superfluous escapes in most cases now too.
for c in ['!', '@', '%', '"', '\'', '/', ' '] {
let pat = format!(r"\{}", c);
assert_eq!(
parser(&pat).parse_primitive(),
Ok(Primitive::Literal(ast::Literal {
span: span(0..2),
kind: ast::LiteralKind::Superfluous,
c,
}))
);
}

// Some superfluous escapes, namely [0-9A-Za-z], are still banned. This
// gives flexibility for future evolution.
assert_eq!(
parser(r"\").parse_escape().unwrap_err(),
parser(r"\e").parse_escape().unwrap_err(),
TestError {
span: span(0..1),
kind: ast::ErrorKind::EscapeUnexpectedEof,
span: span(0..2),
kind: ast::ErrorKind::EscapeUnrecognized,
}
);
assert_eq!(
Expand All @@ -4311,6 +4315,31 @@ bar
kind: ast::ErrorKind::EscapeUnrecognized,
}
);
// But also, < and > are banned, so that we may evolve them into
// start/end word boundary assertions. (Not sure if we will...)
assert_eq!(
parser(r"\<").parse_escape().unwrap_err(),
TestError {
span: span(0..2),
kind: ast::ErrorKind::EscapeUnrecognized,
}
);
assert_eq!(
parser(r"\>").parse_escape().unwrap_err(),
TestError {
span: span(0..2),
kind: ast::ErrorKind::EscapeUnrecognized,
}
);

// An unfinished escape is illegal.
assert_eq!(
parser(r"\").parse_escape().unwrap_err(),
TestError {
span: span(0..1),
kind: ast::ErrorKind::EscapeUnexpectedEof,
}
);
}

#[test]
Expand Down Expand Up @@ -4907,7 +4936,7 @@ bar
lit(span(1..2), 'a'),
ast::ClassSetItem::Literal(ast::Literal {
span: span(2..4),
kind: ast::LiteralKind::Punctuation,
kind: ast::LiteralKind::Meta,
c: ']',
}),
]
Expand All @@ -4925,7 +4954,7 @@ bar
lit(span(1..2), 'a'),
ast::ClassSetItem::Literal(ast::Literal {
span: span(2..4),
kind: ast::LiteralKind::Punctuation,
kind: ast::LiteralKind::Meta,
c: '-',
}),
lit(span(4..5), 'z'),
Expand Down Expand Up @@ -5117,7 +5146,7 @@ bar
span(1..6),
itemset(ast::ClassSetItem::Literal(ast::Literal {
span: span(1..3),
kind: ast::LiteralKind::Punctuation,
kind: ast::LiteralKind::Meta,
c: '^',
})),
itemset(lit(span(5..6), '^')),
Expand All @@ -5133,7 +5162,7 @@ bar
span(1..6),
itemset(ast::ClassSetItem::Literal(ast::Literal {
span: span(1..3),
kind: ast::LiteralKind::Punctuation,
kind: ast::LiteralKind::Meta,
c: '&',
})),
itemset(lit(span(5..6), '&')),
Expand Down Expand Up @@ -5198,7 +5227,7 @@ bar
lit(span(1..2), ']'),
ast::ClassSetItem::Literal(ast::Literal {
span: span(2..4),
kind: ast::LiteralKind::Punctuation,
kind: ast::LiteralKind::Meta,
c: '[',
}),
]
Expand All @@ -5216,7 +5245,7 @@ bar
kind: itemset(ast::ClassSetItem::Literal(
ast::Literal {
span: span(1..3),
kind: ast::LiteralKind::Punctuation,
kind: ast::LiteralKind::Meta,
c: '[',
}
)),
Expand Down
2 changes: 1 addition & 1 deletion regex-syntax/src/ast/print.rs
Expand Up @@ -216,7 +216,7 @@ impl<W: fmt::Write> Writer<W> {

match ast.kind {
Verbatim => self.wtr.write_char(ast.c),
Punctuation => write!(self.wtr, r"\{}", ast.c),
Meta | Superfluous => write!(self.wtr, r"\{}", ast.c),
Octal => write!(self.wtr, r"\{:o}", u32::from(ast.c)),
HexFixed(ast::HexLiteralKind::X) => {
write!(self.wtr, r"\x{:02X}", u32::from(ast.c))
Expand Down
102 changes: 97 additions & 5 deletions regex-syntax/src/lib.rs
Expand Up @@ -215,13 +215,43 @@ pub fn escape_into(text: &str, buf: &mut String) {

/// Returns true if the given character has significance in a regex.
///
/// These are the only characters that are allowed to be escaped, with one
/// exception: an ASCII space character may be escaped when extended mode (with
/// the `x` flag) is enabled. In particular, `is_meta_character(' ')` returns
/// `false`.
/// Generally speaking, these are the only characters which _must_ be escaped
/// in order to match their literal meaning. For example, to match a literal
/// `|`, one could write `\|`. Sometimes escaping isn't always necessary. For
/// example, `-` is treated as a meta character because of its significance
/// for writing ranges inside of character classes, but the regex `-` will
/// match a literal `-` because `-` has no special meaning outside of character
/// classes.
///
/// In order to determine whether a character may be escaped at all, the
/// [`is_escapeable_character`] routine should be used. The difference between
/// `is_meta_character` and `is_escapeable_character` is that the latter will
/// return true for some characters that are _not_ meta characters. For
/// example, `%` and `\%` both match a literal `%` in all contexts. In other
/// words, `is_escapeable_character` includes "superfluous" escapes.
///
/// Note that the set of characters for which this function returns `true` or
/// `false` is fixed and won't change in a semver compatible release.
/// `false` is fixed and won't change in a semver compatible release. (In this
/// case, "semver compatible release" actually refers to the `regex` crate
/// itself, since reducing or expanding the set of meta characters would be a
/// breaking change for not just `regex-syntax` but also `regex` itself.)
///
/// # Example
///
/// ```
/// use regex_syntax::is_meta_character;
///
/// assert!(is_meta_character('?'));
/// assert!(is_meta_character('-'));
/// assert!(is_meta_character('&'));
/// assert!(is_meta_character('#'));
///
/// assert!(!is_meta_character('%'));
/// assert!(!is_meta_character('/'));
/// assert!(!is_meta_character('!'));
/// assert!(!is_meta_character('"'));
/// assert!(!is_meta_character('e'));
/// ```
pub fn is_meta_character(c: char) -> bool {
match c {
'\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{'
Expand All @@ -230,6 +260,68 @@ pub fn is_meta_character(c: char) -> bool {
}
}

/// Returns true if the given character can be escaped in a regex.
///
/// This returns true in all cases that `is_meta_character` returns true, but
/// also returns true in some cases where `is_meta_character` returns false.
/// For example, `%` is not a meta character, but it is escapeable. That is,
/// `%` and `\%` both match a literal `%` in all contexts.
///
/// The purpose of this routine is to provide knowledge about what characters
/// may be escaped. Namely, most regex engines permit "superfluous" escapes
/// where characters without any special significance may be escaped even
/// though there is no actual _need_ to do so.
///
/// This will return false for some characters. For example, `e` is not
/// escapeable. Therefore, `\e` will either result in a parse error (which is
/// true today), or it could backwards compatibly evolve into a new construct
/// with its own meaning. Indeed, that is the purpose of banning _some_
/// superfluous escapes: it provides a way to evolve the syntax in a compatible
/// manner.
///
/// # Example
///
/// ```
/// use regex_syntax::is_escapeable_character;
///
/// assert!(is_escapeable_character('?'));
/// assert!(is_escapeable_character('-'));
/// assert!(is_escapeable_character('&'));
/// assert!(is_escapeable_character('#'));
/// assert!(is_escapeable_character('%'));
/// assert!(is_escapeable_character('/'));
/// assert!(is_escapeable_character('!'));
/// assert!(is_escapeable_character('"'));
///
/// assert!(!is_escapeable_character('e'));
/// ```
pub fn is_escapeable_character(c: char) -> bool {
// Certainly escapeable if it's a meta character.
if is_meta_character(c) {
return true;
}
// Any character that isn't ASCII is definitely not escapeable. There's
// no real need to allow things like \☃ right?
if !c.is_ascii() {
return false;
}
// Otherwise, we basically say that everything is escapeable unless it's a
// letter or digit. Things like \3 are either octal (when enabled) or an
// error, and we should keep it that way. Otherwise, letters are reserved
// for adding new syntax in a backwards compatible way.
match c {
'0'..='9' | 'A'..='Z' | 'a'..='z' => false,
// While not currently supported, we keep these as not escapeable to
// give us some flexibility with respect to supporting the \< and
// \> word boundary assertions in the future. By rejecting them as
// escapeable, \< and \> will result in a parse error. Thus, we can
// turn them into something else in the future without it being a
// backwards incompatible change.
'<' | '>' => false,
_ => true,
}
}

/// Returns true if and only if the given character is a Unicode word
/// character.
///
Expand Down

0 comments on commit fbdc4a9

Please sign in to comment.