Make the lexer lazy
dhruvmanila committed May 17, 2024
1 parent 83152ff commit 2f35044
Showing 14 changed files with 882 additions and 939 deletions.
1,062 changes: 596 additions & 466 deletions crates/ruff_python_parser/src/lexer.rs

(Large diff not shown.)

41 changes: 32 additions & 9 deletions crates/ruff_python_parser/src/lexer/cursor.rs
@@ -1,18 +1,26 @@
use ruff_text_size::{TextLen, TextSize};
use std::str::Chars;

use ruff_text_size::{TextLen, TextSize};

pub(crate) const EOF_CHAR: char = '\0';

/// A cursor represents a pointer in the source code.
#[derive(Clone, Debug)]
pub(super) struct Cursor<'a> {
chars: Chars<'a>,
pub(super) struct Cursor<'src> {
/// An iterator over the [`char`]s of the source code.
chars: Chars<'src>,

/// Length of the source code. This is used as a marker to indicate the start of the current
/// token which is being lexed.
source_length: TextSize,

/// Stores the previous character for debug assertions.
#[cfg(debug_assertions)]
prev_char: char,
}

impl<'a> Cursor<'a> {
pub(crate) fn new(source: &'a str) -> Self {
impl<'src> Cursor<'src> {
pub(crate) fn new(source: &'src str) -> Self {
Self {
source_length: source.text_len(),
chars: source.chars(),
@@ -21,14 +29,14 @@ impl<'a> Cursor<'a> {
}
}

/// Returns the previous token. Useful for debug assertions.
/// Returns the previous character. Useful for debug assertions.
#[cfg(debug_assertions)]
pub(super) const fn previous(&self) -> char {
self.prev_char
}

/// Peeks the next character from the input stream without consuming it.
/// Returns [`EOF_CHAR`] if the file is at the end of the file.
/// Returns [`EOF_CHAR`] if the position is past the end of the file.
pub(super) fn first(&self) -> char {
self.chars.clone().next().unwrap_or(EOF_CHAR)
}
@@ -42,29 +50,44 @@ impl<'a> Cursor<'a> {
}

/// Returns the remaining text to lex.
pub(super) fn rest(&self) -> &'a str {
///
/// Use [`Cursor::text_len`] to get the length of the remaining text.
pub(super) fn rest(&self) -> &'src str {
self.chars.as_str()
}

/// Returns the length of the remaining text.
///
/// Use [`Cursor::rest`] to get the remaining text.
// SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
#[allow(clippy::cast_possible_truncation)]
pub(super) fn text_len(&self) -> TextSize {
TextSize::new(self.chars.as_str().len() as u32)
}

/// Returns the length of the current token.
///
/// This is to be used after setting the start position of the token using
/// [`Cursor::start_token`].
pub(super) fn token_len(&self) -> TextSize {
self.source_length - self.text_len()
}

/// Mark the current position of the cursor as the start of the token which is going to be
/// lexed.
///
/// Use [`Cursor::token_len`] to get the length of the lexed token.
pub(super) fn start_token(&mut self) {
self.source_length = self.text_len();
}

/// Returns `true` if the cursor is at the end of file.
pub(super) fn is_eof(&self) -> bool {
self.chars.as_str().is_empty()
}

/// Consumes the next character
/// Moves the cursor to the next character, returning the previous character.
/// Returns [`None`] if there is no next character.
pub(super) fn bump(&mut self) -> Option<char> {
let prev = self.chars.next()?;

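A minimal sketch of how a lexer loop might use the `start_token`/`token_len` pair above (hypothetical helper, assuming the `Cursor` API in this file and `TextSize` from `ruff_text_size`; not part of this commit):

// Hypothetical sketch: `start_token` records the remaining input length, so
// after consuming characters, `token_len` is the size of the lexed token.
fn lex_name(cursor: &mut Cursor<'_>) -> TextSize {
    cursor.start_token();
    while cursor.first().is_ascii_alphanumeric() || cursor.first() == '_' {
        let _ = cursor.bump();
    }
    cursor.token_len()
}
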
13 changes: 13 additions & 0 deletions crates/ruff_python_parser/src/lexer/fstring.rs
@@ -127,4 +127,17 @@ impl FStrings {
pub(crate) fn current_mut(&mut self) -> Option<&mut FStringContext> {
self.stack.last_mut()
}

pub(crate) fn checkpoint(&self) -> FStringsCheckpoint {
FStringsCheckpoint(self.stack.len())
}

pub(crate) fn rewind(&mut self, checkpoint: FStringsCheckpoint) {
assert!(self.stack.len() >= checkpoint.0);

self.stack.truncate(checkpoint.0);
}
}

#[derive(Debug, Copy, Clone)]
pub(crate) struct FStringsCheckpoint(usize);
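
A checkpoint stores only the stack depth, so `rewind` is a plain truncation: anything pushed after `checkpoint()` is discarded, and the assertion enforces that the stack never shrank below the checkpoint in between. A usage sketch (hypothetical caller names; the same pattern applies to `Indentations` below):

// Hypothetical sketch: speculative lexing guarded by a checkpoint.
let checkpoint = fstrings.checkpoint();
lex_speculatively(&mut fstrings); // may push new `FStringContext` entries
if speculation_failed {
    // Drop everything pushed since the checkpoint.
    fstrings.rewind(checkpoint);
}
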
17 changes: 15 additions & 2 deletions crates/ruff_python_parser/src/lexer/indentation.rs
@@ -82,8 +82,8 @@ impl Indentation {
#[derive(Debug, Copy, Clone, PartialEq)]
pub(super) struct UnexpectedIndentation;

// The indentations stack is used to keep track of the current indentation level
// [See Indentation](docs.python.org/3/reference/lexical_analysis.html#indentation).
/// The indentations stack is used to keep track of the current indentation level
/// [See Indentation](docs.python.org/3/reference/lexical_analysis.html#indentation).
#[derive(Debug, Clone, Default)]
pub(super) struct Indentations {
stack: Vec<Indentation>,
@@ -124,8 +124,21 @@ impl Indentations {
static ROOT: Indentation = Indentation::root();
self.stack.last().unwrap_or(&ROOT)
}

pub(crate) fn checkpoint(&self) -> IndentationsCheckpoint {
IndentationsCheckpoint(self.stack.len())
}

pub(crate) fn rewind(&mut self, checkpoint: IndentationsCheckpoint) {
assert!(self.stack.len() >= checkpoint.0);

self.stack.truncate(checkpoint.0);
}
}

#[derive(Debug, Copy, Clone)]
pub(crate) struct IndentationsCheckpoint(usize);

assert_eq_size!(Indentation, u64);

#[cfg(test)]
1 change: 0 additions & 1 deletion crates/ruff_python_parser/src/lib.rs
@@ -125,7 +125,6 @@ use ruff_text_size::{Ranged, TextRange, TextSize};
mod error;
pub mod lexer;
mod parser;
mod soft_keywords;
mod string;
mod token;
mod token_set;
83 changes: 49 additions & 34 deletions crates/ruff_python_parser/src/parser/expression.rs
@@ -11,11 +11,12 @@ use ruff_python_ast::{
};
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};

use crate::lexer::TokenValue;
use crate::parser::progress::ParserProgress;
use crate::parser::{helpers, FunctionKind, Parser};
use crate::string::{parse_fstring_literal_element, parse_string_literal, StringType};
use crate::token_set::TokenSet;
use crate::{FStringErrorType, Mode, ParseErrorType, Tok, TokenKind};
use crate::{FStringErrorType, Mode, ParseErrorType, TokenKind};

use super::{Parenthesized, RecoveryContextKind};

@@ -459,36 +460,43 @@ impl<'src> Parser<'src> {
let range = self.current_token_range();

if self.at(TokenKind::Name) {
let (Tok::Name { name }, _) = self.bump(TokenKind::Name) else {
let TokenValue::Name(name) = self.bump_value(TokenKind::Name) else {
unreachable!();
};
ast::Identifier {
return ast::Identifier {
id: name.to_string(),
range,
}
};
}

if self.current_token_kind().is_soft_keyword() {
let id = self.src_text(range).to_string();
self.bump_any();
return ast::Identifier { id, range };
}

if self.current_token_kind().is_keyword() {
// Non-soft keyword
self.add_error(
ParseErrorType::OtherError(format!(
"Expected an identifier, but found a keyword '{}' that cannot be used here",
self.current_token_kind()
)),
range,
);

let id = self.src_text(range).to_string();
self.bump_any();
ast::Identifier { id, range }
} else {
if self.current_token_kind().is_keyword() {
let (tok, range) = self.next_token();
self.add_error(
ParseErrorType::OtherError(format!(
"Expected an identifier, but found a keyword '{tok}' that cannot be used here"
)),
range,
);
self.add_error(
ParseErrorType::OtherError("Expected an identifier".into()),
range,
);

ast::Identifier {
id: tok.to_string(),
range,
}
} else {
self.add_error(
ParseErrorType::OtherError("Expected an identifier".into()),
range,
);
ast::Identifier {
id: String::new(),
range: self.missing_node_range(),
}
ast::Identifier {
id: String::new(),
range: self.missing_node_range(),
}
}
}
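
This file repeatedly swaps `self.bump(kind)`, which returned a `(Tok, range)` pair, for `bump_value(kind)`, which returns only the owned `TokenValue`; with the lazy lexer, the range is read from `current_token_range()` before bumping. A sketch of the pattern (hypothetical snippet mirroring the changes above):

// Hypothetical sketch: read the range first, then take the owned value.
// The `unreachable!()` arm is sound because the kind was already checked.
let range = self.current_token_range();
let TokenValue::Name(name) = self.bump_value(TokenKind::Name) else {
    unreachable!()
};
let identifier = ast::Identifier { id: name.to_string(), range };
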
@@ -501,7 +509,7 @@ impl<'src> Parser<'src> {

let lhs = match self.current_token_kind() {
TokenKind::Float => {
let (Tok::Float { value }, _) = self.bump(TokenKind::Float) else {
let TokenValue::Float(value) = self.bump_value(TokenKind::Float) else {
unreachable!()
};

@@ -511,7 +519,7 @@
})
}
TokenKind::Complex => {
let (Tok::Complex { real, imag }, _) = self.bump(TokenKind::Complex) else {
let TokenValue::Complex { real, imag } = self.bump_value(TokenKind::Complex) else {
unreachable!()
};
Expr::NumberLiteral(ast::ExprNumberLiteral {
@@ -520,7 +528,7 @@
})
}
TokenKind::Int => {
let (Tok::Int { value }, _) = self.bump(TokenKind::Int) else {
let TokenValue::Int(value) = self.bump_value(TokenKind::Int) else {
unreachable!()
};
Expr::NumberLiteral(ast::ExprNumberLiteral {
@@ -566,7 +574,7 @@
TokenKind::Lbrace => self.parse_set_or_dict_like_expression(),

kind => {
if kind.is_keyword() {
if kind.is_keyword() || kind.is_soft_keyword() {
Expr::Name(self.parse_name())
} else {
self.add_error(
@@ -1231,7 +1239,8 @@ impl<'src> Parser<'src> {
///
/// See: <https://docs.python.org/3.13/reference/lexical_analysis.html#string-and-bytes-literals>
fn parse_string_or_byte_literal(&mut self) -> StringType {
let (Tok::String { value, flags }, range) = self.bump(TokenKind::String) else {
let range = self.current_token_range();
let TokenValue::String { value, flags } = self.bump_value(TokenKind::String) else {
unreachable!()
};

@@ -1278,7 +1287,7 @@ impl<'src> Parser<'src> {
fn parse_fstring(&mut self) -> ast::FString {
let start = self.node_start();

let (Tok::FStringStart(kind), _) = self.bump(TokenKind::FStringStart) else {
let TokenValue::FStringStart(kind) = self.bump_value(TokenKind::FStringStart) else {
unreachable!()
};
let elements = self.parse_fstring_elements();
@@ -1306,7 +1315,9 @@ impl<'src> Parser<'src> {
FStringElement::Expression(parser.parse_fstring_expression_element())
}
TokenKind::FStringMiddle => {
let (Tok::FStringMiddle { value, flags, .. }, range) = parser.next_token()
let range = parser.current_token_range();
let TokenValue::FStringMiddle { value, flags, .. } =
parser.bump_value(TokenKind::FStringMiddle)
else {
unreachable!()
};
@@ -1396,7 +1407,10 @@ impl<'src> Parser<'src> {

let conversion = if self.eat(TokenKind::Exclamation) {
let conversion_flag_range = self.current_token_range();
if let Tok::Name { name } = self.next_token().0 {
if self.at(TokenKind::Name) {
let TokenValue::Name(name) = self.bump_value(TokenKind::Name) else {
unreachable!();
};
match &*name {
"s" => ConversionFlag::Str,
"r" => ConversionFlag::Repr,
@@ -2229,7 +2243,8 @@ impl<'src> Parser<'src> {
fn parse_ipython_escape_command_expression(&mut self) -> ast::ExprIpyEscapeCommand {
let start = self.node_start();

let (Tok::IpyEscapeCommand { value, kind }, _) = self.bump(TokenKind::IpyEscapeCommand)
let TokenValue::IpyEscapeCommand { value, kind } =
self.bump_value(TokenKind::IpyEscapeCommand)
else {
unreachable!()
};
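
Taken together: the lexer no longer produces every token up front; the parser pulls one token at a time, and the checkpoint/rewind machinery above allows backtracking, which is also why the separate `soft_keywords` pass could be deleted. A conceptual sketch (illustrative names, not the crate's exact API):

// Conceptual sketch: the parser drives the lexer on demand instead of
// consuming a pre-built `Vec<(Tok, TextRange)>`.
let mut lexer = Lexer::new(source, mode);
loop {
    let kind = lexer.next_token(); // lexes exactly one token, lazily
    if kind == TokenKind::EndOfFile {
        break;
    }
    // ... the parser consumes `kind`, checkpointing to backtrack as needed ...
}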
