Skip to content

Commit

Permalink
syntax: add support for CRLF-aware line anchors
Browse files Browse the repository at this point in the history
This adds Look::StartCRLF and Look::EndCRLF. And also adds a new flag,
'R', for making ^/$ be CRLF aware in multi-line mode. The 'R' flag also
causes '.' to *not* match \r in addition to \n (unless the 's' flag is
enabled of course).

The intended semantics are that CRLF mode makes \r\n, \r and \n line
terminators but with one key property: \r\n is treated as a single line
terminator. That is, ^/$ do not match between \r and \n.

This partially addresses #244 by adding syntax support. Currently, if
you try to use this new flag, the regex compiler will report an error.
We intend to finish support for this once #656 is complete. (Indeed, at
time of writing, CRLF matching works in regex-automata.)
  • Loading branch information
BurntSushi committed Mar 15, 2023
1 parent 968dd0b commit 2a6d72d
Show file tree
Hide file tree
Showing 8 changed files with 226 additions and 28 deletions.
2 changes: 2 additions & 0 deletions regex-syntax/src/ast/mod.rs
Expand Up @@ -1314,6 +1314,8 @@ pub enum Flag {
SwapGreed,
/// `u`
Unicode,
/// `R`
CRLF,
/// `x`
IgnoreWhitespace,
}
Expand Down
30 changes: 30 additions & 0 deletions regex-syntax/src/ast/parse.rs
Expand Up @@ -1381,6 +1381,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
's' => Ok(ast::Flag::DotMatchesNewLine),
'U' => Ok(ast::Flag::SwapGreed),
'u' => Ok(ast::Flag::Unicode),
'R' => Ok(ast::Flag::CRLF),
'x' => Ok(ast::Flag::IgnoreWhitespace),
_ => {
Err(self
Expand Down Expand Up @@ -4084,6 +4085,34 @@ bar
],
})
);
assert_eq!(
parser("i-sR:").parse_flags(),
Ok(ast::Flags {
span: span(0..4),
items: vec![
ast::FlagsItem {
span: span(0..1),
kind: ast::FlagsItemKind::Flag(
ast::Flag::CaseInsensitive
),
},
ast::FlagsItem {
span: span(1..2),
kind: ast::FlagsItemKind::Negation,
},
ast::FlagsItem {
span: span(2..3),
kind: ast::FlagsItemKind::Flag(
ast::Flag::DotMatchesNewLine
),
},
ast::FlagsItem {
span: span(3..4),
kind: ast::FlagsItemKind::Flag(ast::Flag::CRLF),
},
],
})
);

assert_eq!(
parser("isU").parse_flags().unwrap_err(),
Expand Down Expand Up @@ -4145,6 +4174,7 @@ bar
assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine));
assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed));
assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode));
assert_eq!(parser("R").parse_flag(), Ok(ast::Flag::CRLF));
assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace));

assert_eq!(
Expand Down
1 change: 1 addition & 0 deletions regex-syntax/src/ast/print.rs
Expand Up @@ -289,6 +289,7 @@ impl<W: fmt::Write> Writer<W> {
Flag::DotMatchesNewLine => self.wtr.write_str("s"),
Flag::SwapGreed => self.wtr.write_str("U"),
Flag::Unicode => self.wtr.write_str("u"),
Flag::CRLF => self.wtr.write_str("R"),
Flag::IgnoreWhitespace => self.wtr.write_str("x"),
},
}?;
Expand Down
83 changes: 62 additions & 21 deletions regex-syntax/src/hir/mod.rs
Expand Up @@ -471,10 +471,12 @@ impl Hir {

/// Returns an HIR expression for `.`.
///
/// * [`Dot::AnyChar`] maps to `(?su:.)`.
/// * [`Dot::AnyByte`] maps to `(?s-u:.)`.
/// * [`Dot::AnyCharExceptNL`] maps to `(?u-s:.)`.
/// * [`Dot::AnyByteExceptNL`] maps to `(?-su:.)`.
/// * [`Dot::AnyChar`] maps to `(?su-R:.)`.
/// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`.
/// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`.
/// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`.
/// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`.
/// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`.
///
/// Note that this is a convenience routine for constructing the correct
/// character class based on the value of `Dot`. There is no explicit "dot"
Expand All @@ -492,18 +494,32 @@ impl Hir {
cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
Hir::class(Class::Bytes(cls))
}
Dot::AnyCharExceptNL => {
Dot::AnyCharExceptLF => {
let mut cls = ClassUnicode::empty();
cls.push(ClassUnicodeRange::new('\0', '\x09'));
cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
Hir::class(Class::Unicode(cls))
}
Dot::AnyByteExceptNL => {
Dot::AnyCharExceptCRLF => {
let mut cls = ClassUnicode::empty();
cls.push(ClassUnicodeRange::new('\0', '\x09'));
cls.push(ClassUnicodeRange::new('\x0B', '\x0C'));
cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}'));
Hir::class(Class::Unicode(cls))
}
Dot::AnyByteExceptLF => {
let mut cls = ClassBytes::empty();
cls.push(ClassBytesRange::new(b'\0', b'\x09'));
cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
Hir::class(Class::Bytes(cls))
}
Dot::AnyByteExceptCRLF => {
let mut cls = ClassBytes::empty();
cls.push(ClassBytesRange::new(b'\0', b'\x09'));
cls.push(ClassBytesRange::new(b'\x0B', b'\x0C'));
cls.push(ClassBytesRange::new(b'\x0E', b'\xFF'));
Hir::class(Class::Bytes(cls))
}
}
}
}
Expand Down Expand Up @@ -1365,6 +1381,16 @@ pub enum Look {
/// at the end position of the input, or at the position immediately
/// preceding a `\n` character.
EndLF,
/// Match the beginning of a line or the beginning of text. Specifically,
/// this matches at the starting position of the input, or at the position
/// immediately following either a `\r` or `\n` character, but never after
/// a `\r` when a `\n` follows.
StartCRLF,
/// Match the end of a line or the end of text. Specifically, this matches
/// at the end position of the input, or at the position immediately
/// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
/// precedes it.
EndCRLF,
/// Match an ASCII-only word boundary. That is, this matches a position
/// where the left adjacent character and right adjacent character
/// correspond to a word and non-word or a non-word and word character.
Expand All @@ -1380,30 +1406,34 @@ pub enum Look {
}

impl Look {
/// Converts a numeric representation back into its `Look` variant.
///
/// Returns `None` when `repr` does not correspond to any known assertion.
/// The numbering here must stay the exact inverse of `as_repr`.
fn from_repr(repr: u8) -> Option<Look> {
fn from_repr(repr: u16) -> Option<Look> {
match repr {
0 => Some(Look::Start),
1 => Some(Look::End),
2 => Some(Look::StartLF),
3 => Some(Look::EndLF),
4 => Some(Look::WordAscii),
5 => Some(Look::WordAsciiNegate),
6 => Some(Look::WordUnicode),
7 => Some(Look::WordUnicodeNegate),
4 => Some(Look::StartCRLF),
5 => Some(Look::EndCRLF),
6 => Some(Look::WordAscii),
7 => Some(Look::WordAsciiNegate),
8 => Some(Look::WordUnicode),
9 => Some(Look::WordUnicodeNegate),
// Any other value is not a valid assertion.
_ => None,
}
}

/// Returns the numeric representation of this look-around assertion.
///
/// Every variant must map to its own distinct number, and that number must
/// be the one `from_repr` maps back to the same variant; otherwise two
/// assertions become indistinguishable once converted.
fn as_repr(&self) -> u8 {
fn as_repr(&self) -> u16 {
match *self {
Look::Start => 0,
Look::End => 1,
Look::StartLF => 2,
Look::EndLF => 3,
Look::WordAscii => 4,
Look::WordAsciiNegate => 5,
Look::WordUnicode => 6,
Look::WordUnicodeNegate => 7,
Look::StartCRLF => 4,
Look::EndCRLF => 5,
Look::WordAscii => 6,
Look::WordAsciiNegate => 7,
Look::WordUnicode => 8,
Look::WordUnicodeNegate => 9,
}
}

Expand All @@ -1413,6 +1443,8 @@ impl Look {
Look::End => 'z',
Look::StartLF => '^',
Look::EndLF => '$',
Look::StartCRLF => '^',
Look::EndCRLF => '$',
Look::WordAscii => 'b',
Look::WordAsciiNegate => 'B',
Look::WordUnicode => '𝛃',
Expand Down Expand Up @@ -1505,11 +1537,20 @@ pub enum Dot {
/// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
///
/// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
AnyCharExceptNL,
AnyCharExceptLF,
/// Matches the UTF-8 encoding of any Unicode scalar value except for `\r`
/// and `\n`.
///
/// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
AnyCharExceptCRLF,
/// Matches any byte value except for `\n`.
///
/// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
AnyByteExceptNL,
AnyByteExceptLF,
/// Matches any byte value except for `\r` and `\n`.
///
/// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`.
AnyByteExceptCRLF,
}

/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
Expand Down Expand Up @@ -2038,7 +2079,7 @@ impl Properties {
/// example, an [`Hir`] provides properties that return `LookSet`s.
#[derive(Clone, Copy, Default, Eq, PartialEq)]
pub struct LookSet {
bits: u8,
bits: u16,
}

impl LookSet {
Expand Down Expand Up @@ -2170,8 +2211,8 @@ impl Iterator for LookSetIter {
#[inline]
fn next(&mut self) -> Option<Look> {
// We'll never have more than u8::MAX distinct look-around assertions,
// so 'repr' will always fit into a usize.
let repr = u8::try_from(self.set.bits.trailing_zeros()).unwrap();
// so 'repr' will always fit into a u16.
let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
let look = Look::from_repr(repr)?;
self.set.remove(look);
Some(look)
Expand Down
6 changes: 6 additions & 0 deletions regex-syntax/src/hir/print.rs
Expand Up @@ -177,6 +177,12 @@ impl<W: fmt::Write> Visitor for Writer<W> {
hir::Look::EndLF => {
self.wtr.write_str("(?m:$)")?;
}
hir::Look::StartCRLF => {
self.wtr.write_str("(?mR:^)")?;
}
hir::Look::EndCRLF => {
self.wtr.write_str("(?mR:$)")?;
}
hir::Look::WordAscii => {
self.wtr.write_str(r"(?-u:\b)")?;
}
Expand Down

0 comments on commit 2a6d72d

Please sign in to comment.