Make the lexer lazy
dhruvmanila committed May 17, 2024
1 parent 83152ff commit 2f35044
Showing 14 changed files with 882 additions and 939 deletions.
1,062 changes: 596 additions & 466 deletions crates/ruff_python_parser/src/lexer.rs

(Large diff not shown.)

41 changes: 32 additions & 9 deletions crates/ruff_python_parser/src/lexer/cursor.rs
@@ -1,18 +1,26 @@
use ruff_text_size::{TextLen, TextSize};
use std::str::Chars;

use ruff_text_size::{TextLen, TextSize};

pub(crate) const EOF_CHAR: char = '\0';

/// A cursor represents a pointer in the source code.
#[derive(Clone, Debug)]
pub(super) struct Cursor<'a> {
chars: Chars<'a>,
pub(super) struct Cursor<'src> {
/// An iterator over the [`char`]s of the source code.
chars: Chars<'src>,

/// Length of the source code. This is used as a marker to indicate the start of the current
/// token which is being lexed.
source_length: TextSize,

/// Stores the previous character for debug assertions.
#[cfg(debug_assertions)]
prev_char: char,
}

impl<'a> Cursor<'a> {
pub(crate) fn new(source: &'a str) -> Self {
impl<'src> Cursor<'src> {
pub(crate) fn new(source: &'src str) -> Self {
Self {
source_length: source.text_len(),
chars: source.chars(),
@@ -21,14 +29,14 @@ impl<'a> Cursor<'a> {
}
}

/// Returns the previous token. Useful for debug assertions.
/// Returns the previous character. Useful for debug assertions.
#[cfg(debug_assertions)]
pub(super) const fn previous(&self) -> char {
self.prev_char
}

/// Peeks the next character from the input stream without consuming it.
/// Returns [`EOF_CHAR`] if the file is at the end of the file.
/// Returns [`EOF_CHAR`] if the position is past the end of the file.
pub(super) fn first(&self) -> char {
self.chars.clone().next().unwrap_or(EOF_CHAR)
}
@@ -42,29 +50,44 @@ impl<'a> Cursor<'a> {
}

/// Returns the remaining text to lex.
pub(super) fn rest(&self) -> &'a str {
///
/// Use [`Cursor::text_len`] to get the length of the remaining text.
pub(super) fn rest(&self) -> &'src str {
self.chars.as_str()
}

/// Returns the length of the remaining text.
///
/// Use [`Cursor::rest`] to get the remaining text.
// SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
#[allow(clippy::cast_possible_truncation)]
pub(super) fn text_len(&self) -> TextSize {
TextSize::new(self.chars.as_str().len() as u32)
}

/// Returns the length of the current token.
///
/// This is to be used after setting the start position of the token using
/// [`Cursor::start_token`].
pub(super) fn token_len(&self) -> TextSize {
self.source_length - self.text_len()
}

/// Mark the current position of the cursor as the start of the token which is going to be
/// lexed.
///
/// Use [`Cursor::token_len`] to get the length of the lexed token.
pub(super) fn start_token(&mut self) {
self.source_length = self.text_len();
}

/// Returns `true` if the cursor is at the end of file.
pub(super) fn is_eof(&self) -> bool {
self.chars.as_str().is_empty()
}

/// Consumes the next character
/// Moves the cursor to the next character, returning the previous character.
/// Returns [`None`] if there is no next character.
pub(super) fn bump(&mut self) -> Option<char> {
let prev = self.chars.next()?;

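A minimal sketch of how a lexer loop might use the `start_token`/`token_len` pair above (hypothetical helper, assuming the `Cursor` API in this file and `TextSize` from `ruff_text_size`; not part of this commit):

// Hypothetical sketch: `start_token` records the remaining input length, so
// after consuming characters, `token_len` is the size of the lexed token.
fn lex_name(cursor: &mut Cursor<'_>) -> TextSize {
    cursor.start_token();
    while cursor.first().is_ascii_alphanumeric() || cursor.first() == '_' {
        let _ = cursor.bump();
    }
    cursor.token_len()
}
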
13 changes: 13 additions & 0 deletions crates/ruff_python_parser/src/lexer/fstring.rs
@@ -127,4 +127,17 @@ impl FStrings {
pub(crate) fn current_mut(&mut self) -> Option<&mut FStringContext> {
self.stack.last_mut()
}

pub(crate) fn checkpoint(&self) -> FStringsCheckpoint {
FStringsCheckpoint(self.stack.len())
}

pub(crate) fn rewind(&mut self, checkpoint: FStringsCheckpoint) {
assert!(self.stack.len() >= checkpoint.0);

self.stack.truncate(checkpoint.0);
}
}

#[derive(Debug, Copy, Clone)]
pub(crate) struct FStringsCheckpoint(usize);
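
A checkpoint stores only the stack depth, so `rewind` is a plain truncation: anything pushed after `checkpoint()` is discarded, and the assertion enforces that the stack never shrank below the checkpoint in between. A usage sketch (hypothetical caller names; the same pattern applies to `Indentations` below):

// Hypothetical sketch: speculative lexing guarded by a checkpoint.
let checkpoint = fstrings.checkpoint();
lex_speculatively(&mut fstrings); // may push new `FStringContext` entries
if speculation_failed {
    // Drop everything pushed since the checkpoint.
    fstrings.rewind(checkpoint);
}
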
17 changes: 15 additions & 2 deletions crates/ruff_python_parser/src/lexer/indentation.rs
@@ -82,8 +82,8 @@ impl Indentation {
#[derive(Debug, Copy, Clone, PartialEq)]
pub(super) struct UnexpectedIndentation;

// The indentations stack is used to keep track of the current indentation level
// [See Indentation](docs.python.org/3/reference/lexical_analysis.html#indentation).
/// The indentations stack is used to keep track of the current indentation level
/// [See Indentation](docs.python.org/3/reference/lexical_analysis.html#indentation).
#[derive(Debug, Clone, Default)]
pub(super) struct Indentations {
stack: Vec<Indentation>,
@@ -124,8 +124,21 @@ impl Indentations {
static ROOT: Indentation = Indentation::root();
self.stack.last().unwrap_or(&ROOT)
}

pub(crate) fn checkpoint(&self) -> IndentationsCheckpoint {
IndentationsCheckpoint(self.stack.len())
}

pub(crate) fn rewind(&mut self, checkpoint: IndentationsCheckpoint) {
assert!(self.stack.len() >= checkpoint.0);

self.stack.truncate(checkpoint.0);
}
}

#[derive(Debug, Copy, Clone)]
pub(crate) struct IndentationsCheckpoint(usize);

assert_eq_size!(Indentation, u64);

#[cfg(test)]
1 change: 0 additions & 1 deletion crates/ruff_python_parser/src/lib.rs
@@ -125,7 +125,6 @@ use ruff_text_size::{Ranged, TextRange, TextSize};
mod error;
pub mod lexer;
mod parser;
mod soft_keywords;
mod string;
mod token;
mod token_set;
83 changes: 49 additions & 34 deletions crates/ruff_python_parser/src/parser/expression.rs
@@ -11,11 +11,12 @@ use ruff_python_ast::{
};
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};

use crate::lexer::TokenValue;
use crate::parser::progress::ParserProgress;
use crate::parser::{helpers, FunctionKind, Parser};
use crate::string::{parse_fstring_literal_element, parse_string_literal, StringType};
use crate::token_set::TokenSet;
use crate::{FStringErrorType, Mode, ParseErrorType, Tok, TokenKind};
use crate::{FStringErrorType, Mode, ParseErrorType, TokenKind};

use super::{Parenthesized, RecoveryContextKind};

@@ -459,36 +460,43 @@ impl<'src> Parser<'src> {
let range = self.current_token_range();

if self.at(TokenKind::Name) {
let (Tok::Name { name }, _) = self.bump(TokenKind::Name) else {
let TokenValue::Name(name) = self.bump_value(TokenKind::Name) else {
unreachable!();
};
ast::Identifier {
return ast::Identifier {
id: name.to_string(),
range,
}
};
}

if self.current_token_kind().is_soft_keyword() {
let id = self.src_text(range).to_string();
self.bump_any();
return ast::Identifier { id, range };
}

if self.current_token_kind().is_keyword() {
// Non-soft keyword
self.add_error(
ParseErrorType::OtherError(format!(
"Expected an identifier, but found a keyword '{}' that cannot be used here",
self.current_token_kind()
)),
range,
);

let id = self.src_text(range).to_string();
self.bump_any();
ast::Identifier { id, range }
} else {
if self.current_token_kind().is_keyword() {
let (tok, range) = self.next_token();
self.add_error(
ParseErrorType::OtherError(format!(
"Expected an identifier, but found a keyword '{tok}' that cannot be used here"
)),
range,
);
self.add_error(
ParseErrorType::OtherError("Expected an identifier".into()),
range,
);

ast::Identifier {
id: tok.to_string(),
range,
}
} else {
self.add_error(
ParseErrorType::OtherError("Expected an identifier".into()),
range,
);
ast::Identifier {
id: String::new(),
range: self.missing_node_range(),
}
ast::Identifier {
id: String::new(),
range: self.missing_node_range(),
}
}
}
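
This file repeatedly swaps `self.bump(kind)`, which returned a `(Tok, range)` pair, for `bump_value(kind)`, which returns only the owned `TokenValue`; with the lazy lexer, the range is read from `current_token_range()` before bumping. A sketch of the pattern (hypothetical snippet mirroring the changes above):

// Hypothetical sketch: read the range first, then take the owned value.
// The `unreachable!()` arm is sound because the kind was already checked.
let range = self.current_token_range();
let TokenValue::Name(name) = self.bump_value(TokenKind::Name) else {
    unreachable!()
};
let identifier = ast::Identifier { id: name.to_string(), range };
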
@@ -501,7 +509,7 @@ impl<'src> Parser<'src> {

let lhs = match self.current_token_kind() {
TokenKind::Float => {
let (Tok::Float { value }, _) = self.bump(TokenKind::Float) else {
let TokenValue::Float(value) = self.bump_value(TokenKind::Float) else {
unreachable!()
};

@@ -511,7 +519,7 @@
})
}
TokenKind::Complex => {
let (Tok::Complex { real, imag }, _) = self.bump(TokenKind::Complex) else {
let TokenValue::Complex { real, imag } = self.bump_value(TokenKind::Complex) else {
unreachable!()
};
Expr::NumberLiteral(ast::ExprNumberLiteral {
@@ -520,7 +528,7 @@
})
}
TokenKind::Int => {
let (Tok::Int { value }, _) = self.bump(TokenKind::Int) else {
let TokenValue::Int(value) = self.bump_value(TokenKind::Int) else {
unreachable!()
};
Expr::NumberLiteral(ast::ExprNumberLiteral {
@@ -566,7 +574,7 @@
TokenKind::Lbrace => self.parse_set_or_dict_like_expression(),

kind => {
if kind.is_keyword() {
if kind.is_keyword() || kind.is_soft_keyword() {
Expr::Name(self.parse_name())
} else {
self.add_error(
@@ -1231,7 +1239,8 @@ impl<'src> Parser<'src> {
///
/// See: <https://docs.python.org/3.13/reference/lexical_analysis.html#string-and-bytes-literals>
fn parse_string_or_byte_literal(&mut self) -> StringType {
let (Tok::String { value, flags }, range) = self.bump(TokenKind::String) else {
let range = self.current_token_range();
let TokenValue::String { value, flags } = self.bump_value(TokenKind::String) else {
unreachable!()
};

@@ -1278,7 +1287,7 @@ impl<'src> Parser<'src> {
fn parse_fstring(&mut self) -> ast::FString {
let start = self.node_start();

let (Tok::FStringStart(kind), _) = self.bump(TokenKind::FStringStart) else {
let TokenValue::FStringStart(kind) = self.bump_value(TokenKind::FStringStart) else {
unreachable!()
};
let elements = self.parse_fstring_elements();
@@ -1306,7 +1315,9 @@ impl<'src> Parser<'src> {
FStringElement::Expression(parser.parse_fstring_expression_element())
}
TokenKind::FStringMiddle => {
let (Tok::FStringMiddle { value, flags, .. }, range) = parser.next_token()
let range = parser.current_token_range();
let TokenValue::FStringMiddle { value, flags, .. } =
parser.bump_value(TokenKind::FStringMiddle)
else {
unreachable!()
};
@@ -1396,7 +1407,10 @@ impl<'src> Parser<'src> {

let conversion = if self.eat(TokenKind::Exclamation) {
let conversion_flag_range = self.current_token_range();
if let Tok::Name { name } = self.next_token().0 {
if self.at(TokenKind::Name) {
let TokenValue::Name(name) = self.bump_value(TokenKind::Name) else {
unreachable!();
};
match &*name {
"s" => ConversionFlag::Str,
"r" => ConversionFlag::Repr,
@@ -2229,7 +2243,8 @@ impl<'src> Parser<'src> {
fn parse_ipython_escape_command_expression(&mut self) -> ast::ExprIpyEscapeCommand {
let start = self.node_start();

let (Tok::IpyEscapeCommand { value, kind }, _) = self.bump(TokenKind::IpyEscapeCommand)
let TokenValue::IpyEscapeCommand { value, kind } =
self.bump_value(TokenKind::IpyEscapeCommand)
else {
unreachable!()
};
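
Taken together: the lexer no longer produces every token up front; the parser pulls one token at a time, and the checkpoint/rewind machinery above allows backtracking, which is also why the separate `soft_keywords` pass could be deleted. A conceptual sketch (illustrative names, not the crate's exact API):

// Conceptual sketch: the parser drives the lexer on demand instead of
// consuming a pre-built `Vec<(Tok, TextRange)>`.
let mut lexer = Lexer::new(source, mode);
loop {
    let kind = lexer.next_token(); // lexes exactly one token, lazily
    if kind == TokenKind::EndOfFile {
        break;
    }
    // ... the parser consumes `kind`, checkpointing to backtrack as needed ...
}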
