Skip to content

Commit

Permalink
syntax: support (?< syntax for named groups
Browse files Browse the repository at this point in the history
It turns out that both '(?P<name>...)' and '(?<name>...)' are rather
common among regex engines. There are several that support just one or
the other. Until this commit, the regex crate supported only the former,
like RE2, RE2/J and Go's regexp package. There are also
several regex engines that support only the latter, such as Onigmo,
Oniguruma, Java, Ruby, Boost, .NET and JavaScript. To decrease friction,
and because there is relatively little cost to doing so, we elect to
support both.

It looks like perhaps RE2 and Go's regexp package will go the same
route, but it isn't fully decided yet:
golang/go#58458

Closes #955, Closes #956
  • Loading branch information
01mf02 authored and BurntSushi committed Mar 15, 2023
1 parent b43da2d commit 0ff592e
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 42 deletions.
13 changes: 9 additions & 4 deletions regex-syntax/src/ast/mod.rs
Expand Up @@ -1162,7 +1162,7 @@ impl Group {
/// Returns true if and only if this group is capturing.
pub fn is_capturing(&self) -> bool {
match self.kind {
GroupKind::CaptureIndex(_) | GroupKind::CaptureName(_) => true,
GroupKind::CaptureIndex(_) | GroupKind::CaptureName { .. } => true,
GroupKind::NonCapturing(_) => false,
}
}
Expand All @@ -1173,7 +1173,7 @@ impl Group {
pub fn capture_index(&self) -> Option<u32> {
match self.kind {
GroupKind::CaptureIndex(i) => Some(i),
GroupKind::CaptureName(ref x) => Some(x.index),
GroupKind::CaptureName { ref name, .. } => Some(name.index),
GroupKind::NonCapturing(_) => None,
}
}
Expand All @@ -1184,8 +1184,13 @@ impl Group {
pub enum GroupKind {
/// `(a)`
CaptureIndex(u32),
/// `(?P<name>a)`
CaptureName(CaptureName),
/// `(?<name>a)` or `(?P<name>a)`
CaptureName {
/// True if the `?P<` syntax is used and false if the `?<` syntax is used.
starts_with_p: bool,
/// The capture name.
name: CaptureName,
},
/// `(?:a)` and `(?i:a)`
NonCapturing(Flags),
}
Expand Down
103 changes: 70 additions & 33 deletions regex-syntax/src/ast/parse.rs
Expand Up @@ -1202,12 +1202,16 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
));
}
let inner_span = self.span();
if self.bump_if("?P<") {
let mut starts_with_p = true;
if self.bump_if("?P<") || {
starts_with_p = false;
self.bump_if("?<")
} {
let capture_index = self.next_capture_index(open_span)?;
let cap = self.parse_capture_name(capture_index)?;
let name = self.parse_capture_name(capture_index)?;
Ok(Either::Right(ast::Group {
span: open_span,
kind: ast::GroupKind::CaptureName(cap),
kind: ast::GroupKind::CaptureName { starts_with_p, name },
ast: Box::new(Ast::Empty(self.span())),
}))
} else if self.bump_if("?") {
Expand Down Expand Up @@ -2800,11 +2804,14 @@ bar
flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
Ast::Group(ast::Group {
span: span_range(pat, 4..pat.len()),
kind: ast::GroupKind::CaptureName(ast::CaptureName {
span: span_range(pat, 9..12),
name: s("foo"),
index: 1,
}),
kind: ast::GroupKind::CaptureName {
starts_with_p: true,
name: ast::CaptureName {
span: span_range(pat, 9..12),
name: s("foo"),
index: 1,
}
},
ast: Box::new(lit_with('a', span_range(pat, 14..15))),
}),
]
Expand Down Expand Up @@ -3819,27 +3826,48 @@ bar

#[test]
fn parse_capture_name() {
assert_eq!(
parser("(?<a>z)").parse(),
Ok(Ast::Group(ast::Group {
span: span(0..7),
kind: ast::GroupKind::CaptureName {
starts_with_p: false,
name: ast::CaptureName {
span: span(3..4),
name: s("a"),
index: 1,
}
},
ast: Box::new(lit('z', 5)),
}))
);
assert_eq!(
parser("(?P<a>z)").parse(),
Ok(Ast::Group(ast::Group {
span: span(0..8),
kind: ast::GroupKind::CaptureName(ast::CaptureName {
span: span(4..5),
name: s("a"),
index: 1,
}),
kind: ast::GroupKind::CaptureName {
starts_with_p: true,
name: ast::CaptureName {
span: span(4..5),
name: s("a"),
index: 1,
}
},
ast: Box::new(lit('z', 6)),
}))
);
assert_eq!(
parser("(?P<abc>z)").parse(),
Ok(Ast::Group(ast::Group {
span: span(0..10),
kind: ast::GroupKind::CaptureName(ast::CaptureName {
span: span(4..7),
name: s("abc"),
index: 1,
}),
kind: ast::GroupKind::CaptureName {
starts_with_p: true,
name: ast::CaptureName {
span: span(4..7),
name: s("abc"),
index: 1,
}
},
ast: Box::new(lit('z', 8)),
}))
);
Expand All @@ -3848,11 +3876,14 @@ bar
parser("(?P<a_1>z)").parse(),
Ok(Ast::Group(ast::Group {
span: span(0..10),
kind: ast::GroupKind::CaptureName(ast::CaptureName {
span: span(4..7),
name: s("a_1"),
index: 1,
}),
kind: ast::GroupKind::CaptureName {
starts_with_p: true,
name: ast::CaptureName {
span: span(4..7),
name: s("a_1"),
index: 1,
}
},
ast: Box::new(lit('z', 8)),
}))
);
Expand All @@ -3861,11 +3892,14 @@ bar
parser("(?P<a.1>z)").parse(),
Ok(Ast::Group(ast::Group {
span: span(0..10),
kind: ast::GroupKind::CaptureName(ast::CaptureName {
span: span(4..7),
name: s("a.1"),
index: 1,
}),
kind: ast::GroupKind::CaptureName {
starts_with_p: true,
name: ast::CaptureName {
span: span(4..7),
name: s("a.1"),
index: 1,
}
},
ast: Box::new(lit('z', 8)),
}))
);
Expand All @@ -3874,11 +3908,14 @@ bar
parser("(?P<a[1]>z)").parse(),
Ok(Ast::Group(ast::Group {
span: span(0..11),
kind: ast::GroupKind::CaptureName(ast::CaptureName {
span: span(4..8),
name: s("a[1]"),
index: 1,
}),
kind: ast::GroupKind::CaptureName {
starts_with_p: true,
name: ast::CaptureName {
span: span(4..8),
name: s("a[1]"),
index: 1,
}
},
ast: Box::new(lit('z', 9)),
}))
);
Expand Down
8 changes: 5 additions & 3 deletions regex-syntax/src/ast/print.rs
Expand Up @@ -160,9 +160,10 @@ impl<W: fmt::Write> Writer<W> {
use crate::ast::GroupKind::*;
match ast.kind {
CaptureIndex(_) => self.wtr.write_str("("),
CaptureName(ref x) => {
self.wtr.write_str("(?P<")?;
self.wtr.write_str(&x.name)?;
CaptureName { ref name, starts_with_p } => {
let start = if starts_with_p { "(?P<" } else { "(?<" };
self.wtr.write_str(start)?;
self.wtr.write_str(&name.name)?;
self.wtr.write_str(">")?;
Ok(())
}
Expand Down Expand Up @@ -505,6 +506,7 @@ mod tests {
fn print_group() {
roundtrip("(?i:a)");
roundtrip("(?P<foo>a)");
roundtrip("(?<foo>a)");
roundtrip("(a)");
}

Expand Down
4 changes: 2 additions & 2 deletions regex-syntax/src/hir/translate.rs
Expand Up @@ -905,8 +905,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir {
let (index, name) = match group.kind {
ast::GroupKind::CaptureIndex(index) => (index, None),
ast::GroupKind::CaptureName(ref cap) => {
(cap.index, Some(cap.name.clone().into_boxed_str()))
ast::GroupKind::CaptureName { ref name, .. } => {
(name.index, Some(name.name.clone().into_boxed_str()))
}
// The HIR doesn't need to use non-capturing groups, since the way
// in which the data type is defined handles this automatically.
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Expand Up @@ -361,6 +361,7 @@ regex matches `abc` at positions `0`, `1`, `2` and `3`.
<pre class="rust">
(exp) numbered capture group (indexed by opening parenthesis)
(?P&lt;name&gt;exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
(?&lt;name&gt;exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
(?:exp) non-capturing group
(?flags) set flags within current group
(?flags:exp) set flags for exp (non-capturing)
Expand Down

0 comments on commit 0ff592e

Please sign in to comment.