Use memchr for string lexing (astral-sh#9888)

## Summary

On `main`, string lexing consists of walking through the string character-by-character to search for the closing quote (with some nuance: we also need to skip escaped characters, and error if we see newlines in non-triple-quoted strings).

This PR rewrites `lex_string` to instead use `memchr` to search for the closing quote, which is significantly faster. On my machine, at least, the `globals.py` benchmark (which contains a lot of docstrings) gets 40% faster...

```text
lexer/numpy/globals.py  time:   [3.6410 µs 3.6496 µs 3.6585 µs]
                        thrpt:  [806.53 MiB/s 808.49 MiB/s 810.41 MiB/s]
                 change:
                        time:   [-40.413% -40.185% -39.984%] (p = 0.00 < 0.05)
                        thrpt:  [+66.623% +67.181% +67.822%]
                        Performance has improved.
Found 2 outliers among 100 measurements (2.00%)
  2 (2.00%) high mild

lexer/unicode/pypinyin.py
                        time:   [12.422 µs 12.445 µs 12.467 µs]
                        thrpt:  [337.03 MiB/s 337.65 MiB/s 338.27 MiB/s]
                 change:
                        time:   [-9.4213% -9.1930% -8.9586%] (p = 0.00 < 0.05)
                        thrpt:  [+9.8401% +10.124% +10.401%]
                        Performance has improved.
Found 3 outliers among 100 measurements (3.00%)
  1 (1.00%) high mild
  2 (2.00%) high severe

lexer/pydantic/types.py time:   [107.45 µs 107.50 µs 107.56 µs]
                        thrpt:  [237.11 MiB/s 237.24 MiB/s 237.35 MiB/s]
                 change:
                        time:   [-4.0108% -3.7005% -3.3787%] (p = 0.00 < 0.05)
                        thrpt:  [+3.4968% +3.8427% +4.1784%]
                        Performance has improved.
Found 7 outliers among 100 measurements (7.00%)
  2 (2.00%) high mild
  5 (5.00%) high severe

lexer/numpy/ctypeslib.py
                        time:   [46.123 µs 46.165 µs 46.208 µs]
                        thrpt:  [360.36 MiB/s 360.69 MiB/s 361.01 MiB/s]
                 change:
                        time:   [-19.313% -18.996% -18.710%] (p = 0.00 < 0.05)
                        thrpt:  [+23.016% +23.451% +23.935%]
                        Performance has improved.
Found 8 outliers among 100 measurements (8.00%)
  3 (3.00%) low mild
  1 (1.00%) high mild
  4 (4.00%) high severe

lexer/large/dataset.py  time:   [231.07 µs 231.19 µs 231.33 µs]
                        thrpt:  [175.87 MiB/s 175.97 MiB/s 176.06 MiB/s]
                 change:
                        time:   [-2.0437% -1.7663% -1.4922%] (p = 0.00 < 0.05)
                        thrpt:  [+1.5148% +1.7981% +2.0864%]
                        Performance has improved.
Found 10 outliers among 100 measurements (10.00%)
  5 (5.00%) high mild
  5 (5.00%) high severe
```
nkxxll · Mar 4, 2024 · 795ce55 · 795ce55
1 parent df3da93
commit 795ce55
Show file tree

Hide file tree

Showing 2 changed files with 99 additions and 34 deletions.
diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs
@@ -690,48 +690,65 @@ impl<'source> Lexer<'source> {
 
         let value_start = self.offset();
 
-        let value_end = loop {
-            match self.cursor.bump() {
-                Some('\\') => {
-                    if self.cursor.eat_char('\r') {
-                        self.cursor.eat_char('\n');
-                    } else {
-                        self.cursor.bump();
-                    }
-                }
-                Some('\r' | '\n') if !triple_quoted => {
+        let quote_byte = u8::try_from(quote).expect("char that fits in u8");
+        let value_end = if triple_quoted {
+            // For triple-quoted strings, scan until we find the closing quote (ignoring escaped
+            // quotes) or the end of the file.
+            loop {
+                let Some(index) = memchr::memchr(quote_byte, self.cursor.rest().as_bytes()) else {
+                    self.cursor.skip_to_end();
+
                     if let Some(fstring) = self.fstrings.current() {
                         // When we are in an f-string, check whether the initial quote
                         // matches with f-strings quotes and if it is, then this must be a
                         // missing '}' token so raise the proper error.
-                        if fstring.quote_char() == quote && !fstring.is_triple_quoted() {
+                        if fstring.quote_char() == quote
+                            && fstring.is_triple_quoted() == triple_quoted
+                        {
                             return Err(LexicalError {
                                 error: LexicalErrorType::FStringError(
                                     FStringErrorType::UnclosedLbrace,
                                 ),
-                                location: self.offset() - TextSize::new(1),
+                                location: self.cursor.text_len(),
                             });
                         }
                     }
                     return Err(LexicalError {
-                        error: LexicalErrorType::OtherError(
-                            "EOL while scanning string literal".to_owned(),
-                        ),
-                        location: self.offset() - TextSize::new(1),
+                        error: LexicalErrorType::Eof,
+                        location: self.cursor.text_len(),
                     });
+                };
+
+                // Rare case: if there are an odd number of backslashes before the quote, then
+                // the quote is escaped and we should continue scanning.
+                let num_backslashes = self.cursor.rest().as_bytes()[..index]
+                    .iter()
+                    .rev()
+                    .take_while(|&&c| c == b'\\')
+                    .count();
+
+                // Advance the cursor past the quote and continue scanning.
+                self.cursor.skip_bytes(index + 1);
+
+                // If the character is escaped, continue scanning.
+                if num_backslashes % 2 == 1 {
+                    continue;
                 }
-                Some(c) if c == quote => {
-                    if triple_quoted {
-                        if self.cursor.eat_char2(quote, quote) {
-                            break self.offset() - TextSize::new(3);
-                        }
-                    } else {
-                        break self.offset() - TextSize::new(1);
-                    }
+
+                // Otherwise, if it's followed by two more quotes, then we're done.
+                if self.cursor.eat_char2(quote, quote) {
+                    break self.offset() - TextSize::new(3);
                 }
+            }
+        } else {
+            // For non-triple-quoted strings, scan until we find the closing quote, but end early
+            // if we encounter a newline or the end of the file.
+            loop {
+                let Some(index) =
+                    memchr::memchr3(quote_byte, b'\r', b'\n', self.cursor.rest().as_bytes())
+                else {
+                    self.cursor.skip_to_end();
 
-                Some(_) => {}
-                None => {
                     if let Some(fstring) = self.fstrings.current() {
                         // When we are in an f-string, check whether the initial quote
                         // matches with f-strings quotes and if it is, then this must be a
@@ -748,23 +765,66 @@ impl<'source> Lexer<'source> {
                         }
                     }
                     return Err(LexicalError {
-                        error: if triple_quoted {
-                            LexicalErrorType::Eof
-                        } else {
-                            LexicalErrorType::StringError
-                        },
+                        error: LexicalErrorType::StringError,
                         location: self.offset(),
                     });
+                };
+
+                // Rare case: if there are an odd number of backslashes before the quote, then
+                // the quote is escaped and we should continue scanning.
+                let num_backslashes = self.cursor.rest().as_bytes()[..index]
+                    .iter()
+                    .rev()
+                    .take_while(|&&c| c == b'\\')
+                    .count();
+
+                // Skip up to the current character.
+                self.cursor.skip_bytes(index);
+                let ch = self.cursor.bump();
+
+                // If the character is escaped, continue scanning.
+                if num_backslashes % 2 == 1 {
+                    if ch == Some('\r') {
+                        self.cursor.eat_char('\n');
+                    }
+                    continue;
+                }
+
+                match ch {
+                    Some('\r' | '\n') => {
+                        if let Some(fstring) = self.fstrings.current() {
+                            // When we are in an f-string, check whether the initial quote
+                            // matches with f-strings quotes and if it is, then this must be a
+                            // missing '}' token so raise the proper error.
+                            if fstring.quote_char() == quote && !fstring.is_triple_quoted() {
+                                return Err(LexicalError {
+                                    error: LexicalErrorType::FStringError(
+                                        FStringErrorType::UnclosedLbrace,
+                                    ),
+                                    location: self.offset() - TextSize::new(1),
+                                });
+                            }
+                        }
+                        return Err(LexicalError {
+                            error: LexicalErrorType::OtherError(
+                                "EOL while scanning string literal".to_owned(),
+                            ),
+                            location: self.offset() - TextSize::new(1),
+                        });
+                    }
+                    Some(ch) if ch == quote => {
+                        break self.offset() - TextSize::new(1);
+                    }
+                    _ => unreachable!("memchr2 returned an index that is not a quote or a newline"),
                 }
             }
         };
 
-        let tok = Tok::String {
+        Ok(Tok::String {
             value: self.source[TextRange::new(value_start, value_end)].to_string(),
             kind,
             triple_quoted,
-        };
-        Ok(tok)
+        })
     }
 
     // This is the main entry point. Call this function to retrieve the next token.

diff --git a/crates/ruff_python_parser/src/lexer/cursor.rs b/crates/ruff_python_parser/src/lexer/cursor.rs
@@ -145,4 +145,9 @@ impl<'a> Cursor<'a> {
 
         self.chars = self.chars.as_str()[count..].chars();
     }
+
+    /// Skips to the end of the input stream.
+    pub(super) fn skip_to_end(&mut self) {
+        self.chars = "".chars();
+    }
 }