Skip to content

Commit

Permalink
Use memchr for string lexing (astral-sh#9888)
Browse files Browse the repository at this point in the history
## Summary

On `main`, string lexing consists of walking through the string
character-by-character to search for the closing quote (with some
nuance: we also need to skip escaped characters, and error if we see
newlines in non-triple-quoted strings). This PR rewrites `lex_string` to
instead use `memchr` to search for the closing quote, which is
significantly faster. On my machine, at least, the `globals.py`
benchmark (which contains a lot of docstrings) gets 40% faster...

```text
lexer/numpy/globals.py  time:   [3.6410 µs 3.6496 µs 3.6585 µs]
                        thrpt:  [806.53 MiB/s 808.49 MiB/s 810.41 MiB/s]
                 change:
                        time:   [-40.413% -40.185% -39.984%] (p = 0.00 < 0.05)
                        thrpt:  [+66.623% +67.181% +67.822%]
                        Performance has improved.
Found 2 outliers among 100 measurements (2.00%)
  2 (2.00%) high mild
lexer/unicode/pypinyin.py
                        time:   [12.422 µs 12.445 µs 12.467 µs]
                        thrpt:  [337.03 MiB/s 337.65 MiB/s 338.27 MiB/s]
                 change:
                        time:   [-9.4213% -9.1930% -8.9586%] (p = 0.00 < 0.05)
                        thrpt:  [+9.8401% +10.124% +10.401%]
                        Performance has improved.
Found 3 outliers among 100 measurements (3.00%)
  1 (1.00%) high mild
  2 (2.00%) high severe
lexer/pydantic/types.py time:   [107.45 µs 107.50 µs 107.56 µs]
                        thrpt:  [237.11 MiB/s 237.24 MiB/s 237.35 MiB/s]
                 change:
                        time:   [-4.0108% -3.7005% -3.3787%] (p = 0.00 < 0.05)
                        thrpt:  [+3.4968% +3.8427% +4.1784%]
                        Performance has improved.
Found 7 outliers among 100 measurements (7.00%)
  2 (2.00%) high mild
  5 (5.00%) high severe
lexer/numpy/ctypeslib.py
                        time:   [46.123 µs 46.165 µs 46.208 µs]
                        thrpt:  [360.36 MiB/s 360.69 MiB/s 361.01 MiB/s]
                 change:
                        time:   [-19.313% -18.996% -18.710%] (p = 0.00 < 0.05)
                        thrpt:  [+23.016% +23.451% +23.935%]
                        Performance has improved.
Found 8 outliers among 100 measurements (8.00%)
  3 (3.00%) low mild
  1 (1.00%) high mild
  4 (4.00%) high severe
lexer/large/dataset.py  time:   [231.07 µs 231.19 µs 231.33 µs]
                        thrpt:  [175.87 MiB/s 175.97 MiB/s 176.06 MiB/s]
                 change:
                        time:   [-2.0437% -1.7663% -1.4922%] (p = 0.00 < 0.05)
                        thrpt:  [+1.5148% +1.7981% +2.0864%]
                        Performance has improved.
Found 10 outliers among 100 measurements (10.00%)
  5 (5.00%) high mild
  5 (5.00%) high severe
```
  • Loading branch information
charliermarsh authored and nkxxll committed Mar 4, 2024
1 parent df3da93 commit 795ce55
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 34 deletions.
128 changes: 94 additions & 34 deletions crates/ruff_python_parser/src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -690,48 +690,65 @@ impl<'source> Lexer<'source> {

let value_start = self.offset();

let value_end = loop {
match self.cursor.bump() {
Some('\\') => {
if self.cursor.eat_char('\r') {
self.cursor.eat_char('\n');
} else {
self.cursor.bump();
}
}
Some('\r' | '\n') if !triple_quoted => {
let quote_byte = u8::try_from(quote).expect("char that fits in u8");
let value_end = if triple_quoted {
// For triple-quoted strings, scan until we find the closing quote (ignoring escaped
// quotes) or the end of the file.
loop {
let Some(index) = memchr::memchr(quote_byte, self.cursor.rest().as_bytes()) else {
self.cursor.skip_to_end();

if let Some(fstring) = self.fstrings.current() {
// When we are in an f-string, check whether the initial quote
// matches with f-strings quotes and if it is, then this must be a
// missing '}' token so raise the proper error.
if fstring.quote_char() == quote && !fstring.is_triple_quoted() {
if fstring.quote_char() == quote
&& fstring.is_triple_quoted() == triple_quoted
{
return Err(LexicalError {
error: LexicalErrorType::FStringError(
FStringErrorType::UnclosedLbrace,
),
location: self.offset() - TextSize::new(1),
location: self.cursor.text_len(),
});
}
}
return Err(LexicalError {
error: LexicalErrorType::OtherError(
"EOL while scanning string literal".to_owned(),
),
location: self.offset() - TextSize::new(1),
error: LexicalErrorType::Eof,
location: self.cursor.text_len(),
});
};

// Rare case: if there are an odd number of backslashes before the quote, then
// the quote is escaped and we should continue scanning.
let num_backslashes = self.cursor.rest().as_bytes()[..index]
.iter()
.rev()
.take_while(|&&c| c == b'\\')
.count();

// Advance the cursor past the quote and continue scanning.
self.cursor.skip_bytes(index + 1);

// If the character is escaped, continue scanning.
if num_backslashes % 2 == 1 {
continue;
}
Some(c) if c == quote => {
if triple_quoted {
if self.cursor.eat_char2(quote, quote) {
break self.offset() - TextSize::new(3);
}
} else {
break self.offset() - TextSize::new(1);
}

// Otherwise, if it's followed by two more quotes, then we're done.
if self.cursor.eat_char2(quote, quote) {
break self.offset() - TextSize::new(3);
}
}
} else {
// For non-triple-quoted strings, scan until we find the closing quote, but end early
// if we encounter a newline or the end of the file.
loop {
let Some(index) =
memchr::memchr3(quote_byte, b'\r', b'\n', self.cursor.rest().as_bytes())
else {
self.cursor.skip_to_end();

Some(_) => {}
None => {
if let Some(fstring) = self.fstrings.current() {
// When we are in an f-string, check whether the initial quote
// matches with f-strings quotes and if it is, then this must be a
Expand All @@ -748,23 +765,66 @@ impl<'source> Lexer<'source> {
}
}
return Err(LexicalError {
error: if triple_quoted {
LexicalErrorType::Eof
} else {
LexicalErrorType::StringError
},
error: LexicalErrorType::StringError,
location: self.offset(),
});
};

// Rare case: if there are an odd number of backslashes before the quote, then
// the quote is escaped and we should continue scanning.
let num_backslashes = self.cursor.rest().as_bytes()[..index]
.iter()
.rev()
.take_while(|&&c| c == b'\\')
.count();

// Skip up to the current character.
self.cursor.skip_bytes(index);
let ch = self.cursor.bump();

// If the character is escaped, continue scanning.
if num_backslashes % 2 == 1 {
if ch == Some('\r') {
self.cursor.eat_char('\n');
}
continue;
}

match ch {
Some('\r' | '\n') => {
if let Some(fstring) = self.fstrings.current() {
// When we are in an f-string, check whether the initial quote
// matches with f-strings quotes and if it is, then this must be a
// missing '}' token so raise the proper error.
if fstring.quote_char() == quote && !fstring.is_triple_quoted() {
return Err(LexicalError {
error: LexicalErrorType::FStringError(
FStringErrorType::UnclosedLbrace,
),
location: self.offset() - TextSize::new(1),
});
}
}
return Err(LexicalError {
error: LexicalErrorType::OtherError(
"EOL while scanning string literal".to_owned(),
),
location: self.offset() - TextSize::new(1),
});
}
Some(ch) if ch == quote => {
break self.offset() - TextSize::new(1);
}
_ => unreachable!("memchr2 returned an index that is not a quote or a newline"),
}
}
};

let tok = Tok::String {
Ok(Tok::String {
value: self.source[TextRange::new(value_start, value_end)].to_string(),
kind,
triple_quoted,
};
Ok(tok)
})
}

// This is the main entry point. Call this function to retrieve the next token.
Expand Down
5 changes: 5 additions & 0 deletions crates/ruff_python_parser/src/lexer/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,4 +145,9 @@ impl<'a> Cursor<'a> {

self.chars = self.chars.as_str()[count..].chars();
}

/// Skips to the end of the input stream.
pub(super) fn skip_to_end(&mut self) {
self.chars = "".chars();
}
}

0 comments on commit 795ce55

Please sign in to comment.