Make Lexer lazy
dhruvmanila committed May 2, 2024
1 parent c391c8b commit fe0fa75
Showing 7 changed files with 225 additions and 108 deletions.
168 changes: 148 additions & 20 deletions crates/ruff_python_parser/src/lexer.rs
@@ -45,27 +45,39 @@ use crate::lexer::fstring::{FStringContext, FStrings};
use crate::lexer::indentation::{Indentation, Indentations};
use crate::soft_keywords::SoftKeywordTransformer;
use crate::token::Tok;
use crate::Mode;
use crate::{Mode, TokenKind};

mod cursor;
mod fstring;
mod indentation;

/// A lexer for Python source code.
pub struct Lexer<'source> {
// Contains the source code to be lexed.
cursor: Cursor<'source>,
source: &'source str,
#[derive(Debug)]
pub struct Lexer<'src> {
/// Source code to be lexed.
source: &'src str,

/// A pointer to the current character of the source code being lexed.
cursor: Cursor<'src>,

/// The current lexed token.
current: Spanned,

/// Lexer state.
state: State,
// Amount of parenthesis.

/// The nesting level, i.e., the number of unclosed parentheses the lexer is currently inside.
/// If it's greater than zero, the lexer is in a parenthesized context.
nesting: u32,
// Indentation levels.

/// A stack of indentation representing the current indentation level.
indentations: Indentations,
pending_indentation: Option<Indentation>,
// Lexer mode.

/// Lexer mode.
mode: Mode,
// F-string contexts.

/// F-string contexts.
fstrings: FStrings,
}

@@ -141,31 +153,42 @@ pub fn lex_starts_at(
}
}

impl<'source> Lexer<'source> {
impl<'src> Lexer<'src> {
/// Create a new lexer for the given input source code and mode. You probably
/// want to use [`lex`] instead.
pub fn new(input: &'source str, mode: Mode) -> Self {
pub fn new(input: &'src str, mode: Mode) -> Self {
assert!(
u32::try_from(input.len()).is_ok(),
"Lexer only supports files with a size up to 4GB"
);

let mut lxr = Lexer {
let mut lexer = Lexer {
source: input,
cursor: Cursor::new(input),
state: State::AfterNewline,
current: (Tok::EndOfFile, TextRange::default()),
nesting: 0,
indentations: Indentations::default(),
pending_indentation: None,

source: input,
cursor: Cursor::new(input),
mode,
fstrings: FStrings::default(),
};

// TODO: Handle possible mismatch between BOM and explicit encoding declaration.
// spell-checker:ignore feff
lxr.cursor.eat_char('\u{feff}');
lexer.cursor.eat_char('\u{feff}');

lexer
}

lxr
/// Returns the kind of the current token.
pub(crate) fn current_kind(&self) -> TokenKind {
TokenKind::from_token(&self.current.0)
}

/// Returns the range of the current token.
pub(crate) fn current_range(&self) -> TextRange {
self.current.1
}
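
// --- Illustrative sketch (not part of this diff) --------------------------
// With the current token cached on the lexer, a kind check becomes a cheap
// field read instead of a re-lex. `at` is a hypothetical crate-internal
// helper, not an API introduced by this commit.
fn at(lexer: &Lexer, kind: TokenKind) -> bool {
    lexer.current_kind() == kind
}
// --- end sketch ------------------------------------------------------------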

/// Lex an identifier. Also used for keywords and string/bytes literals with a prefix.
@@ -825,9 +848,22 @@ impl<'source> Lexer<'source> {
})
}

// This is the main entry point. Call this function to retrieve the next token.
// This function is used by the iterator implementation.
/// Moves the lexer to the next token.
///
/// Returns the old current token as an owned value.
pub fn next_token(&mut self) -> LexResult {
self.next_token_with_context(LexerContext::Regular)
}

pub(crate) fn next_token_with_context(&mut self, context: LexerContext) -> LexResult {
let next = self.next_token_impl()?;
Ok(match context {
LexerContext::Regular => std::mem::replace(&mut self.current, next),
LexerContext::Peeking => next,
})
}
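
// --- Illustrative sketch (not part of this diff) --------------------------
// Driving the lazy lexer by hand: `current` starts out as an `EndOfFile`
// placeholder and `next_token` returns the *previous* current token, so the
// first call is a priming bump whose result is discarded. Assumes the lexer
// keeps yielding `EndOfFile` once the input is exhausted.
fn dump_tokens(source: &str) {
    let mut lexer = Lexer::new(source, Mode::Module);
    let _ = lexer.next_token(); // prime: discard the placeholder
    loop {
        match lexer.next_token() {
            Ok((Tok::EndOfFile, _)) => break,
            Ok((tok, range)) => println!("{tok:?} @ {range:?}"),
            Err(err) => {
                eprintln!("lex error: {err:?}");
                break;
            }
        }
    }
}
// --- end sketch ------------------------------------------------------------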

fn next_token_impl(&mut self) -> LexResult {
if let Some(fstring) = self.fstrings.current() {
if !fstring.is_in_expression(self.nesting) {
match self.lex_fstring_middle_or_end() {
@@ -1303,7 +1339,7 @@ impl<'source> Lexer<'source> {
}

#[inline]
fn token_text(&self) -> &'source str {
fn token_text(&self) -> &'src str {
&self.source[self.token_range()]
}

@@ -1318,6 +1354,41 @@ impl<'source> Lexer<'source> {
fn token_start(&self) -> TextSize {
self.token_range().start()
}

/// Takes ownership of the current token's value, leaving an `EndOfFile`
/// placeholder as the current token.
pub(crate) fn take_value(&mut self) -> TokenValue {
let (tok, _) = std::mem::replace(&mut self.current, (Tok::EndOfFile, TextRange::default()));
TokenValue::from_token(tok)
}

/// Creates a checkpoint to which the lexer can later return using [`Self::rewind`].
pub(crate) fn checkpoint(&self) -> LexerCheckpoint<'src> {
LexerCheckpoint {
cursor: self.cursor.clone(),
state: self.state,
nesting: self.nesting,
indentations_position: self.indentations.len(),
pending_indentation: self.pending_indentation,
fstrings_position: self.fstrings.len(),
}
}

/// Restores the lexer to the given checkpoint.
///
/// # Panics
///
/// Panics if the current indentation stack is shorter than it was at the checkpoint,
/// or if the lexer has exited an f-string it was inside when the checkpoint was taken.
pub(crate) fn rewind(&mut self, checkpoint: LexerCheckpoint) {
assert!(self.indentations.len() >= checkpoint.indentations_position);
assert!(self.fstrings.len() >= checkpoint.fstrings_position);

self.cursor = checkpoint.cursor;
self.state = checkpoint.state;
self.nesting = checkpoint.nesting;
self.indentations.truncate(checkpoint.indentations_position);
self.pending_indentation = checkpoint.pending_indentation;
self.fstrings.truncate(checkpoint.fstrings_position);
}
}

// Implement iterator pattern for Lexer.
@@ -1461,6 +1532,63 @@ impl std::fmt::Display for LexicalErrorType {
}
}

#[derive(Debug)]
pub(crate) enum TokenValue {
None,
Name(Box<str>),
Int(Int),
Float(f64),
Complex {
real: f64,
imag: f64,
},
String {
value: Box<str>,
kind: AnyStringKind,
},
FStringStart(AnyStringKind),
FStringMiddle {
value: Box<str>,
kind: AnyStringKind,
},
IpyEscapeCommand {
value: Box<str>,
kind: IpyEscapeKind,
},
Comment(Box<str>),
}

impl TokenValue {
pub fn from_token(tok: Tok) -> TokenValue {
match tok {
Tok::Name { name } => TokenValue::Name(name),
Tok::Int { value } => TokenValue::Int(value),
Tok::Float { value } => TokenValue::Float(value),
Tok::Complex { real, imag } => TokenValue::Complex { real, imag },
Tok::String { value, kind } => TokenValue::String { value, kind },
Tok::FStringStart(kind) => TokenValue::FStringStart(kind),
Tok::FStringMiddle { value, kind } => TokenValue::FStringMiddle { value, kind },
Tok::Comment(value) => TokenValue::Comment(value),
_ => TokenValue::None,
}
}
}
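
// --- Illustrative sketch (not part of this diff) --------------------------
// How a crate-internal caller might pair `current_kind` with `take_value` to
// claim a token's owned payload exactly once. `parse_name` is a hypothetical
// helper, not code from this commit.
fn parse_name(lexer: &mut Lexer) -> Option<Box<str>> {
    if lexer.current_kind() == TokenKind::Name {
        if let TokenValue::Name(name) = lexer.take_value() {
            // `take_value` left an `EndOfFile` placeholder in `current`;
            // advance past it to the next real token.
            let _ = lexer.next_token();
            return Some(name);
        }
    }
    None
}
// --- end sketch ------------------------------------------------------------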

struct LexerCheckpoint<'src> {
cursor: Cursor<'src>,
state: State,
nesting: u32,
indentations_position: usize,
pending_indentation: Option<Indentation>,
fstrings_position: usize,
}

#[derive(Copy, Clone, Debug)]
pub(crate) enum LexerContext {
Regular,
Peeking,
}
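
// --- Illustrative sketch (not part of this diff) --------------------------
// One-token lookahead built from `Peeking` plus checkpoint/rewind. A
// `Peeking` lex skips the `mem::replace`, leaving `current` untouched, and
// `rewind` undoes the cursor movement. Note that `LexerCheckpoint` does not
// capture `current`, which is why a `Regular` lex cannot be rolled back by
// `rewind` alone.
fn peek_kind(lexer: &mut Lexer) -> Option<TokenKind> {
    let checkpoint = lexer.checkpoint();
    let peeked = lexer
        .next_token_with_context(LexerContext::Peeking)
        .ok()
        .map(|(tok, _)| TokenKind::from_token(&tok));
    lexer.rewind(checkpoint);
    peeked
}
// --- end sketch ------------------------------------------------------------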

#[derive(Copy, Clone, Debug)]
enum State {
/// Lexer is right at the beginning of the file or after a `Newline` token.
13 changes: 8 additions & 5 deletions crates/ruff_python_parser/src/lexer/cursor.rs
@@ -3,16 +3,19 @@ use std::str::Chars;

pub(crate) const EOF_CHAR: char = '\0';

/// A cursor represents a pointer into the source code being lexed.
#[derive(Clone, Debug)]
pub(super) struct Cursor<'a> {
chars: Chars<'a>,
pub(super) struct Cursor<'src> {
/// An iterator over the [`char`]s of the source code.
chars: Chars<'src>,
/// Length of the source code.
source_length: TextSize,
#[cfg(debug_assertions)]
prev_char: char,
}

impl<'a> Cursor<'a> {
pub(crate) fn new(source: &'a str) -> Self {
impl<'src> Cursor<'src> {
pub(crate) fn new(source: &'src str) -> Self {
Self {
source_length: source.text_len(),
chars: source.chars(),
@@ -42,7 +45,7 @@ impl<'a> Cursor<'a> {
}

/// Returns the remaining text to lex.
pub(super) fn rest(&self) -> &'a str {
pub(super) fn rest(&self) -> &'src str {
self.chars.as_str()
}

8 changes: 8 additions & 0 deletions crates/ruff_python_parser/src/lexer/fstring.rs
@@ -127,4 +127,12 @@ impl FStrings {
pub(crate) fn current_mut(&mut self) -> Option<&mut FStringContext> {
self.stack.last_mut()
}

pub(super) fn len(&self) -> usize {
self.stack.len()
}

pub(super) fn truncate(&mut self, len: usize) {
self.stack.truncate(len)
}
}
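
(The `len`/`truncate` pair added here, together with the matching pair in `indentation.rs` below, lets `LexerCheckpoint` record each stack's depth and `rewind` restore it by truncating.)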
12 changes: 10 additions & 2 deletions crates/ruff_python_parser/src/lexer/indentation.rs
@@ -82,8 +82,8 @@ impl Indentation {
#[derive(Debug, Copy, Clone, PartialEq)]
pub(super) struct UnexpectedIndentation;

// The indentations stack is used to keep track of the current indentation level
// [See Indentation](docs.python.org/3/reference/lexical_analysis.html#indentation).
/// The indentations stack is used to keep track of the current indentation level
/// [See Indentation](docs.python.org/3/reference/lexical_analysis.html#indentation).
#[derive(Debug, Clone, Default)]
pub(super) struct Indentations {
stack: Vec<Indentation>,
@@ -124,6 +124,14 @@ impl Indentations {
static ROOT: Indentation = Indentation::root();
self.stack.last().unwrap_or(&ROOT)
}

pub(super) fn len(&self) -> usize {
self.stack.len()
}

pub(super) fn truncate(&mut self, len: usize) {
self.stack.truncate(len)
}
}

assert_eq_size!(Indentation, u64);
2 changes: 1 addition & 1 deletion crates/ruff_python_parser/src/lib.rs
@@ -117,7 +117,7 @@ pub use crate::parser::Program;
pub use crate::token::{Tok, TokenKind};

use ruff_python_ast::{Expr, Mod, ModModule, PySourceType, Suite};
use ruff_text_size::TextSize;
use ruff_text_size::{TextRange, TextSize};

mod error;
pub mod lexer;
3 changes: 0 additions & 3 deletions crates/ruff_python_parser/src/parser/mod.rs
@@ -84,9 +84,6 @@ pub(crate) struct Parser<'src> {
/// Specify the mode in which the code will be parsed.
mode: Mode,

/// Current token along with its range.
current: Spanned,

/// The ID of the current token. This is used to track the progress of the parser
/// to avoid infinite loops when the parser is stuck.
current_token_id: TokenId,
