Make Lexer lazy #11244

Merged (2 commits) on May 17, 2024

1,055 changes: 590 additions & 465 deletions crates/ruff_python_parser/src/lexer.rs

Large diffs are not rendered by default.

41 changes: 32 additions & 9 deletions crates/ruff_python_parser/src/lexer/cursor.rs
@@ -1,18 +1,26 @@
use ruff_text_size::{TextLen, TextSize};
use std::str::Chars;

use ruff_text_size::{TextLen, TextSize};

pub(crate) const EOF_CHAR: char = '\0';

/// A cursor represents a pointer in the source code.
#[derive(Clone, Debug)]
pub(super) struct Cursor<'a> {
chars: Chars<'a>,
pub(super) struct Cursor<'src> {
/// An iterator over the [`char`]s of the source code.
chars: Chars<'src>,

/// Length of the source code. This is used as a marker to indicate the start of the current
/// token which is being lexed.
source_length: TextSize,

/// Stores the previous character for debug assertions.
#[cfg(debug_assertions)]
prev_char: char,
}

impl<'a> Cursor<'a> {
pub(crate) fn new(source: &'a str) -> Self {
impl<'src> Cursor<'src> {
pub(crate) fn new(source: &'src str) -> Self {
Self {
source_length: source.text_len(),
chars: source.chars(),
@@ -21,14 +29,14 @@ impl<'a> Cursor<'a> {
}
}

/// Returns the previous token. Useful for debug assertions.
/// Returns the previous character. Useful for debug assertions.
#[cfg(debug_assertions)]
pub(super) const fn previous(&self) -> char {
self.prev_char
}

/// Peeks the next character from the input stream without consuming it.
/// Returns [`EOF_CHAR`] if the file is at the end of the file.
/// Returns [`EOF_CHAR`] if the position is past the end of the file.
pub(super) fn first(&self) -> char {
self.chars.clone().next().unwrap_or(EOF_CHAR)
}
@@ -42,29 +50,44 @@ impl<'a> Cursor<'a> {
}

/// Returns the remaining text to lex.
pub(super) fn rest(&self) -> &'a str {
///
/// Use [`Cursor::text_len`] to get the length of the remaining text.
pub(super) fn rest(&self) -> &'src str {
self.chars.as_str()
}

/// Returns the length of the remaining text.
///
/// Use [`Cursor::rest`] to get the remaining text.
// SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
#[allow(clippy::cast_possible_truncation)]
pub(super) fn text_len(&self) -> TextSize {
TextSize::new(self.chars.as_str().len() as u32)
}

/// Returns the length of the current token.
///
/// This is to be used after setting the start position of the token using
/// [`Cursor::start_token`].
pub(super) fn token_len(&self) -> TextSize {
self.source_length - self.text_len()
}

/// Mark the current position of the cursor as the start of the token which is going to be
/// lexed.
///
/// Use [`Cursor::token_len`] to get the length of the lexed token.
pub(super) fn start_token(&mut self) {
self.source_length = self.text_len();
}

/// Returns `true` if the cursor is at the end of file.
pub(super) fn is_eof(&self) -> bool {
self.chars.as_str().is_empty()
}

/// Consumes the next character
/// Moves the cursor to the next character, returning the previous character.
/// Returns [`None`] if there is no next character.
pub(super) fn bump(&mut self) -> Option<char> {
let prev = self.chars.next()?;

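The `start_token`/`token_len` pair above is the whole token-bounds protocol: `start_token` records the remaining source length, and `token_len` is the difference once the token's characters have been consumed. Below is a minimal standalone sketch of that protocol, assuming nothing from the crate: it re-creates a toy `Cursor` with the same method names and swaps `TextSize` for a plain `u32`.

use std::str::Chars;

const EOF_CHAR: char = '\0';

/// Toy re-creation of the crate-private `Cursor`, for illustration only.
struct Cursor<'src> {
    chars: Chars<'src>,
    source_length: u32,
}

impl<'src> Cursor<'src> {
    fn new(source: &'src str) -> Self {
        Self {
            source_length: source.len() as u32,
            chars: source.chars(),
        }
    }

    /// Peeks the next character without consuming it.
    fn first(&self) -> char {
        self.chars.clone().next().unwrap_or(EOF_CHAR)
    }

    /// Length of the text that is still unconsumed.
    fn text_len(&self) -> u32 {
        self.chars.as_str().len() as u32
    }

    /// Length of the token lexed since the last `start_token` call.
    fn token_len(&self) -> u32 {
        self.source_length - self.text_len()
    }

    /// Marks the current position as the start of the next token.
    fn start_token(&mut self) {
        self.source_length = self.text_len();
    }

    /// Consumes and returns the next character, if any.
    fn bump(&mut self) -> Option<char> {
        self.chars.next()
    }
}

fn main() {
    let mut cursor = Cursor::new("lambda x");

    // Lex one identifier: mark the start, then consume while alphanumeric.
    cursor.start_token();
    while cursor.first().is_ascii_alphanumeric() {
        cursor.bump();
    }

    assert_eq!(cursor.token_len(), 6); // "lambda"
}
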
13 changes: 12 additions & 1 deletion crates/ruff_python_parser/src/lexer/fstring.rs
@@ -1,7 +1,7 @@
use ruff_python_ast::AnyStringFlags;

/// The context representing the current f-string that the lexer is in.
#[derive(Debug)]
#[derive(Clone, Debug)]
pub(crate) struct FStringContext {
flags: AnyStringFlags,

@@ -127,4 +127,15 @@ impl FStrings {
pub(crate) fn current_mut(&mut self) -> Option<&mut FStringContext> {
self.stack.last_mut()
}

pub(crate) fn checkpoint(&self) -> FStringsCheckpoint {
FStringsCheckpoint(self.stack.clone())
}

pub(crate) fn rewind(&mut self, checkpoint: FStringsCheckpoint) {
self.stack = checkpoint.0;
}
}

#[derive(Debug, Clone)]
pub(crate) struct FStringsCheckpoint(Vec<FStringContext>);
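
This file and `indentation.rs` below gain the same backtracking primitive: `checkpoint` clones the stack, `rewind` swaps the clone back in. A self-contained sketch of the round-trip, with a toy `Context` standing in for `FStringContext`:

#[derive(Clone, Debug, PartialEq)]
struct Context(u32); // stand-in for `FStringContext`

#[derive(Default)]
struct FStrings {
    stack: Vec<Context>,
}

struct FStringsCheckpoint(Vec<Context>);

impl FStrings {
    fn push(&mut self, ctx: Context) {
        self.stack.push(ctx);
    }

    /// Snapshots the stack by cloning it.
    fn checkpoint(&self) -> FStringsCheckpoint {
        FStringsCheckpoint(self.stack.clone())
    }

    /// Restores the stack to a previously taken snapshot.
    fn rewind(&mut self, checkpoint: FStringsCheckpoint) {
        self.stack = checkpoint.0;
    }
}

fn main() {
    let mut fstrings = FStrings::default();
    fstrings.push(Context(1));

    let checkpoint = fstrings.checkpoint();
    fstrings.push(Context(2)); // speculative lexing enters a nested f-string...
    fstrings.rewind(checkpoint); // ...then backtracks, dropping it again

    assert_eq!(fstrings.stack, vec![Context(1)]);
}

Cloning costs O(depth) per checkpoint, which stays cheap because f-string nesting and indentation stacks are shallow in practice.
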
15 changes: 13 additions & 2 deletions crates/ruff_python_parser/src/lexer/indentation.rs
@@ -82,8 +82,8 @@ impl Indentation {
#[derive(Debug, Copy, Clone, PartialEq)]
pub(super) struct UnexpectedIndentation;

// The indentations stack is used to keep track of the current indentation level
// [See Indentation](docs.python.org/3/reference/lexical_analysis.html#indentation).
/// The indentations stack is used to keep track of the current indentation level
/// [See Indentation](docs.python.org/3/reference/lexical_analysis.html#indentation).
#[derive(Debug, Clone, Default)]
pub(super) struct Indentations {
stack: Vec<Indentation>,
@@ -124,8 +124,19 @@ impl Indentations {
static ROOT: Indentation = Indentation::root();
self.stack.last().unwrap_or(&ROOT)
}

pub(crate) fn checkpoint(&self) -> IndentationsCheckpoint {
IndentationsCheckpoint(self.stack.clone())
}

pub(crate) fn rewind(&mut self, checkpoint: IndentationsCheckpoint) {
self.stack = checkpoint.0;
}
}

#[derive(Debug, Clone)]
pub(crate) struct IndentationsCheckpoint(Vec<Indentation>);

assert_eq_size!(Indentation, u64);

#[cfg(test)]
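
The 1,055-line `lexer.rs` diff is not rendered above, so how these snapshots compose is not visible here; the sketch below is only a hypothetical illustration of a lexer-level checkpoint that bundles the per-component ones and rewinds them together. Every name in it is a stand-in, not code from the PR.

// Hypothetical composition; `u32`/`Vec` stand in for the cursor offset and
// the `Indentations`/`FStrings` stacks snapshotted above.
struct Lexer {
    offset: u32,
    indentations: Vec<u32>,
    fstrings: Vec<u32>,
}

struct LexerCheckpoint {
    offset: u32,
    indentations: Vec<u32>, // would be `IndentationsCheckpoint`
    fstrings: Vec<u32>,     // would be `FStringsCheckpoint`
}

impl Lexer {
    fn checkpoint(&self) -> LexerCheckpoint {
        LexerCheckpoint {
            offset: self.offset,
            indentations: self.indentations.clone(),
            fstrings: self.fstrings.clone(),
        }
    }

    fn rewind(&mut self, checkpoint: LexerCheckpoint) {
        self.offset = checkpoint.offset;
        self.indentations = checkpoint.indentations;
        self.fstrings = checkpoint.fstrings;
    }
}

fn main() {
    let mut lexer = Lexer { offset: 0, indentations: vec![0], fstrings: vec![] };

    let checkpoint = lexer.checkpoint();
    lexer.offset = 42; // lex ahead speculatively...
    lexer.indentations.push(4);
    lexer.rewind(checkpoint); // ...and rewind everything at once

    assert_eq!((lexer.offset, lexer.indentations.len()), (0, 1));
}
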
1 change: 0 additions & 1 deletion crates/ruff_python_parser/src/lib.rs
@@ -125,7 +125,6 @@ use ruff_text_size::{Ranged, TextRange, TextSize};
mod error;
pub mod lexer;
mod parser;
mod soft_keywords;
mod string;
mod token;
mod token_set;
83 changes: 49 additions & 34 deletions crates/ruff_python_parser/src/parser/expression.rs
@@ -11,11 +11,12 @@ use ruff_python_ast::{
};
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};

use crate::lexer::TokenValue;
use crate::parser::progress::ParserProgress;
use crate::parser::{helpers, FunctionKind, Parser};
use crate::string::{parse_fstring_literal_element, parse_string_literal, StringType};
use crate::token_set::TokenSet;
use crate::{FStringErrorType, Mode, ParseErrorType, Tok, TokenKind};
use crate::{FStringErrorType, Mode, ParseErrorType, TokenKind};

use super::{Parenthesized, RecoveryContextKind};

@@ -459,36 +460,43 @@ impl<'src> Parser<'src> {
let range = self.current_token_range();

if self.at(TokenKind::Name) {
let (Tok::Name { name }, _) = self.bump(TokenKind::Name) else {
let TokenValue::Name(name) = self.bump_value(TokenKind::Name) else {
unreachable!();
};
ast::Identifier {
return ast::Identifier {
id: name.to_string(),
range,
}
};
}

if self.current_token_kind().is_soft_keyword() {
let id = self.src_text(range).to_string();
self.bump_any();
return ast::Identifier { id, range };
}

if self.current_token_kind().is_keyword() {
// Non-soft keyword
self.add_error(
ParseErrorType::OtherError(format!(
"Expected an identifier, but found a keyword '{}' that cannot be used here",
self.current_token_kind()
)),
range,
);

let id = self.src_text(range).to_string();
self.bump_any();
ast::Identifier { id, range }
} else {
if self.current_token_kind().is_keyword() {
let (tok, range) = self.next_token();
self.add_error(
ParseErrorType::OtherError(format!(
"Expected an identifier, but found a keyword '{tok}' that cannot be used here"
)),
range,
);
self.add_error(
ParseErrorType::OtherError("Expected an identifier".into()),
range,
);

ast::Identifier {
id: tok.to_string(),
range,
}
} else {
self.add_error(
ParseErrorType::OtherError("Expected an identifier".into()),
range,
);
ast::Identifier {
id: String::new(),
range: self.missing_node_range(),
}
ast::Identifier {
id: String::new(),
range: self.missing_node_range(),
}
}
}
@@ -501,7 +509,7 @@ impl<'src> Parser<'src> {

let lhs = match self.current_token_kind() {
TokenKind::Float => {
let (Tok::Float { value }, _) = self.bump(TokenKind::Float) else {
let TokenValue::Float(value) = self.bump_value(TokenKind::Float) else {
unreachable!()
};

@@ -511,7 +519,7 @@
})
}
TokenKind::Complex => {
let (Tok::Complex { real, imag }, _) = self.bump(TokenKind::Complex) else {
let TokenValue::Complex { real, imag } = self.bump_value(TokenKind::Complex) else {
unreachable!()
};
Expr::NumberLiteral(ast::ExprNumberLiteral {
@@ -520,7 +528,7 @@
})
}
TokenKind::Int => {
let (Tok::Int { value }, _) = self.bump(TokenKind::Int) else {
let TokenValue::Int(value) = self.bump_value(TokenKind::Int) else {
unreachable!()
};
Expr::NumberLiteral(ast::ExprNumberLiteral {
@@ -566,7 +574,7 @@
TokenKind::Lbrace => self.parse_set_or_dict_like_expression(),

kind => {
if kind.is_keyword() {
if kind.is_keyword() || kind.is_soft_keyword() {
Expr::Name(self.parse_name())
} else {
self.add_error(
@@ -1231,7 +1239,8 @@ impl<'src> Parser<'src> {
///
/// See: <https://docs.python.org/3.13/reference/lexical_analysis.html#string-and-bytes-literals>
fn parse_string_or_byte_literal(&mut self) -> StringType {
let (Tok::String { value, flags }, range) = self.bump(TokenKind::String) else {
let range = self.current_token_range();
let TokenValue::String { value, flags } = self.bump_value(TokenKind::String) else {
unreachable!()
};

@@ -1278,7 +1287,7 @@
fn parse_fstring(&mut self) -> ast::FString {
let start = self.node_start();

let (Tok::FStringStart(kind), _) = self.bump(TokenKind::FStringStart) else {
let TokenValue::FStringStart(kind) = self.bump_value(TokenKind::FStringStart) else {
unreachable!()
};
let elements = self.parse_fstring_elements();
@@ -1306,7 +1315,9 @@
FStringElement::Expression(parser.parse_fstring_expression_element())
}
TokenKind::FStringMiddle => {
let (Tok::FStringMiddle { value, flags, .. }, range) = parser.next_token()
let range = parser.current_token_range();
let TokenValue::FStringMiddle { value, flags, .. } =
parser.bump_value(TokenKind::FStringMiddle)
else {
unreachable!()
};
@@ -1396,7 +1407,10 @@

let conversion = if self.eat(TokenKind::Exclamation) {
let conversion_flag_range = self.current_token_range();
if let Tok::Name { name } = self.next_token().0 {
if self.at(TokenKind::Name) {
let TokenValue::Name(name) = self.bump_value(TokenKind::Name) else {
unreachable!();
};
match &*name {
"s" => ConversionFlag::Str,
"r" => ConversionFlag::Repr,
@@ -2229,7 +2243,8 @@
fn parse_ipython_escape_command_expression(&mut self) -> ast::ExprIpyEscapeCommand {
let start = self.node_start();

let (Tok::IpyEscapeCommand { value, kind }, _) = self.bump(TokenKind::IpyEscapeCommand)
let TokenValue::IpyEscapeCommand { value, kind } =
self.bump_value(TokenKind::IpyEscapeCommand)
else {
unreachable!()
};
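
The recurring parser-side change above is mechanical: the old `self.bump(kind)`/`self.next_token()` calls that returned an owned `(Tok, range)` tuple become `self.bump_value(kind)` returning only the token's value, with the range read separately via `current_token_range()` before bumping. A toy sketch of the let-else contract that keeps the `unreachable!()` arms safe (stand-in types, not the crate's real `TokenKind`/`TokenValue`):

// Toy stand-ins for the crate's token types, for illustration only.
#[derive(Debug, PartialEq, Clone, Copy)]
enum TokenKind {
    Name,
    #[allow(dead_code)]
    Int,
}

#[derive(Debug)]
enum TokenValue {
    Name(String),
    #[allow(dead_code)]
    Int(i64),
}

struct Parser {
    tokens: Vec<(TokenKind, TokenValue)>,
}

impl Parser {
    /// Consumes the current token, asserting it has the expected kind,
    /// and returns its owned value.
    fn bump_value(&mut self, kind: TokenKind) -> TokenValue {
        let (current_kind, value) = self.tokens.remove(0);
        debug_assert_eq!(current_kind, kind);
        value
    }
}

fn main() {
    let mut parser = Parser {
        tokens: vec![(TokenKind::Name, TokenValue::Name("x".to_string()))],
    };

    // The caller has already checked that the current token is a `Name`, so
    // any other variant coming back would be a lexer bug: the else arm really
    // is unreachable.
    let TokenValue::Name(name) = parser.bump_value(TokenKind::Name) else {
        unreachable!();
    };
    assert_eq!(name, "x");
}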