class Parser:
    """Parser skeleton: maintains a two-token lookahead window over a Lexer.

    ``cur_token`` is the token currently being parsed and ``peek_token`` is
    the token after it.  Both slots are primed by calling ``advance_token()``
    twice in the constructor.
    """

    def __init__(self, lexer: Lexer) -> None:
        self.lexer = lexer
        # Lookahead window; None only before the two priming calls below.
        self.cur_token: Token | None = None
        self.peek_token: Token | None = None
        self.expressions: list[Expression] = []
        # Prime cur_token and peek_token with the first two tokens.
        self.advance_token()
        self.advance_token()

    def advance_token(self) -> None:
        """Shift the lookahead window forward by one non-whitespace token.

        WHITESPACE tokens are skipped transparently.  At the end of the
        source the lexer keeps producing EOF tokens, so both slots converge
        on EOF (assumes next_token() always returns a truthy Token).
        """
        while next_token := self.lexer.next_token():
            if next_token.type != TokenType.WHITESPACE:  # skip whitespaces
                self.cur_token = self.peek_token
                self.peek_token = next_token
                break

    def expect_token_type(self, token: Token, token_types: set[TokenType]) -> bool:
        """Return True iff *token* is one of the expected *token_types*."""
        return token.type in token_types

    def parse_program(self) -> Program:
        """Parse the whole token stream into a Program (not implemented yet)."""
        raise NotImplementedError

    def parse_expression(self) -> Expression:
        """Parse a single expression (not implemented yet)."""
        raise NotImplementedError

# Syntax analysis: the parser skeleton
约 333 字 · 大约 1 分钟
2026-02-18
先预告一下,后续我们会用两种方法实现语法分析:Pratt 解析器和递归下降解析器。但是我先要考虑明白怎么处理输入的符号流。这里最重要的函数就是advance_token()。它会从输入的符号流中读取下一个符号,并更新cur_token和peek_token。由于advance_token()实际上调用的是词法分析器的next_token(),因此到了源代码的结尾会得到 EOF。此外,遇到 WHITESPACE 是会自动跳过的。
还有一个辅助函数expect_token_type(),它会检查作为参数的符号是否是期望的类型。如果不是,就会返回False。
测试部分,由于advance_token()很重要,我们准备了一些测试。举例来说,如果输入一个字符串 3.2 / 5.3:

- 调用一次advance_token()后,cur_token会是3.2,peek_token会是/。
- 调用第二次advance_token()后,cur_token会是/,peek_token会是5.3。
- 调用第三次advance_token()后,cur_token会是5.3,peek_token会是EOF。
import pytest
from pyec.lexer import Lexer
from pyec.parser import Parser
from pyec.token import Token, TokenType
@pytest.mark.parametrize(
    "src_code, times, cur_token, peek_token",
    [
        # After construction (times == 0) the parser has already advanced
        # twice, so cur_token/peek_token hold the first two non-whitespace
        # tokens; each extra advance_token() call shifts the window by one.
        (
            "",
            0,
            Token(TokenType.EOF, "\0", 1, 1),
            Token(TokenType.EOF, "\0", 1, 1),
        ),
        (
            " \t",
            1,
            Token(TokenType.EOF, "\0", 1, 3),
            Token(TokenType.EOF, "\0", 1, 3),
        ),
        (
            "\n \r\n",
            0,
            Token(TokenType.NEWLINE, "\n", 1, 1),
            Token(TokenType.NEWLINE, "\r\n", 2, 2),
        ),
        (
            "1",
            0,
            Token(TokenType.INT, "1", 1, 1),
            Token(TokenType.EOF, "\0", 1, 2),
        ),
        (
            "1 +6",
            2,
            Token(TokenType.INT, "6", 1, 4),
            Token(TokenType.EOF, "\0", 1, 5),
        ),
        (
            "1 +6",
            3,
            Token(TokenType.EOF, "\0", 1, 5),
            Token(TokenType.EOF, "\0", 1, 5),
        ),
        (
            "3.2 / 5.3",
            0,
            Token(TokenType.FLOAT, "3.2", 1, 1),
            Token(TokenType.SLASH, "/", 1, 5),
        ),
        (
            "3.2 / 5.3",
            1,
            Token(TokenType.SLASH, "/", 1, 5),
            Token(TokenType.FLOAT, "5.3", 1, 7),
        ),
        (
            "3.2 / 5.3",
            2,
            Token(TokenType.FLOAT, "5.3", 1, 7),
            Token(TokenType.EOF, "\0", 1, 10),
        ),
        (
            "3.2 / 5.3",
            3,
            Token(TokenType.EOF, "\0", 1, 10),
            Token(TokenType.EOF, "\0", 1, 10),
        ),
    ],
)
def test_parser_advance_token(
    src_code: str,
    times: int,
    cur_token: Token,
    peek_token: Token,
) -> None:
    """advance_token() skips whitespace and settles on EOF at end of input."""
    lexer = Lexer(src_code)
    parser = Parser(lexer)
    for _ in range(times):
        parser.advance_token()
    assert (
        parser.cur_token == cur_token
    ), f"cur_token: expected='{cur_token}', actual='{parser.cur_token}'"
    assert (
        parser.peek_token == peek_token
    ), f"peek_token: expected='{peek_token}', actual='{parser.peek_token}'"

# Our skeleton is now ready and we can start implementing the parser.
# Commit the current code to the Git repository:
$ git add .
$ git commit -m "parser: skeleton"