class Expression:
def __init__(self):
pass
def __repr__(self) -> str:
raise NotImplementedError
def __eq__(self, other: object) -> bool:
raise NotImplementedError
class Program:
def __init__(self, expressions: list[Expression]) -> None:
self.expressions = expressions
def __repr__(self) -> str:
return "\n".join(repr(expr) for expr in self.expressions)
def __str__(self) -> str:
return "\n".join(str(expr) for expr in self.expressions)
class IntegerLiteral(Expression):
def __init__(self, token: Token) -> None:
self.token = token
def __repr__(self) -> str:
return repr(self.token)
def __str__(self) -> str:
return self.token.literal
def __eq__(self, other: object) -> bool:
if not isinstance(other, IntegerLiteral):
return False
return self.token == other.token
class FloatLiteral(Expression):
def __init__(self, token: Token) -> None:
self.token = token
def __repr__(self) -> str:
return repr(self.token)
def __str__(self) -> str:
return self.token.literal
def __eq__(self, other: object) -> bool:
if not isinstance(other, FloatLiteral):
return False
return self.token == other.token
class PrefixExpression(Expression):
def __init__(self, operator: Token, right: Expression) -> None:
self.operator = operator
self.right = right
def __repr__(self) -> str:
return f"{repr(self.operator)} {repr(self.right)}"
def __str__(self) -> str:
return f"{self.operator.literal}{self.right}"
def __eq__(self, other: object) -> bool:
if not isinstance(other, PrefixExpression):
return False
return self.operator == other.operator and self.right == other.right
class InfixExpression(Expression):
def __init__(self, operator: Token, left: Expression, right: Expression) -> None:
self.operator = operator
self.left = left
self.right = right
def __repr__(self) -> str:
return f"({repr(self.operator)} {repr(self.left)} {repr(self.right)})"
def __str__(self) -> str:
return f"({self.left} {self.operator.literal} {self.right})"
def __eq__(self, other: object) -> bool:
if not isinstance(other, InfixExpression):
return False
return (
self.operator == other.operator
and self.left == other.left
and self.right == other.right
)
class GroupedExpression(Expression):
def __init__(self, expression: Expression) -> None:
self.expression = expression
def __repr__(self) -> str:
return f"({repr(self.expression)})"
def __str__(self) -> str:
return f"({self.expression})"
def __eq__(self, other: object) -> bool:
if not isinstance(other, GroupedExpression):
return False
return self.expression == other.expression语法分析:抽象语法树
约 743 字大约 2 分钟
2026-02-17
我们把语法分析器的输出称为抽象语法树(Abstract Syntax Tree,AST)。在 PyEC 这个项目中,每一行输入都应当是一个可以计算的表达式 expression。对于一个表达式来说,我们可以把它看作是一个树结构,树的根节点是表达式的操作符,而叶子节点是操作数。
这里我们需要引入一个概念结合性 associativity。结合性判断的是运算符在相同优先级下先后计算的顺序。加减乘除运算符都是左结合的,这意味着在一个表达式中,它们会从左到右进行计算。而乘方操作符是右结合的,这意味着在一个表达式中,它会从右到左进行计算。譬如,10 - 8 + 7,加法和减法优先级相同,于是等价于(10 - 8) + 7。我们可以用下图来表示。其中叶子节点是数字,非叶子节点是运算符。如果是右结合的乘方运算,那么2 ^ 3 ^ 4等价于2 ^ (3 ^ 4)。
第二个需要引入的概念是优先级 precedence。优先级判断的是运算符在不同优先级下的计算顺序。优先级高的运算符会先被计算。譬如,10 - 8 * 7,乘法的优先级更高,于是等价于10 - (8 * 7)。
当然,我们也可以用括号来改变优先级。譬如,(10 - 8) * 7,括号的优先级最高,于是它的树可以表示为下图。树并不一定要有两个子节点,事实上它可以有多个子节点,只是在编程语言中不太常用。
有了以上的概念,我们可以开始构造抽象语法树了。和之前的词法分析器一样,我们先定义输出类型,然后再尝试用程序来创建它们。在 PyEC 中,一个程序 program,由若干个表达式 expression 组成。每个表达式都占据一行。表达式可以细分为:
- Literal 字面量:整数、浮点数
- Prefix 前缀表达式:由一个操作符
+或-和一个操作数组成,例如-20 - Infix 中缀表达式:由一个操作符和两个操作数组成,例如
20 + 7 - Grouped 特指括号表达式,例如
(20 + 7)
我们采用了和词法分析器类似方式,实现了__repr__()方法和__str__()方法。
我们也创建了测试用例,和词法分析器中的 Token 类型一样。
@pytest.mark.parametrize(
"program, expected_repr, expected_str",
[
(Program([]), "", ""),
(
Program(
[
IntegerLiteral(Token(TokenType.INT, "123", 1, 1)),
]
),
"Token(type=TokenType.INT, literal='123', line=1, column=1)",
"123",
),
(
Program(
[
FloatLiteral(Token(TokenType.FLOAT, "3.2", 1, 1)),
]
),
"Token(type=TokenType.FLOAT, literal='3.2', line=1, column=1)",
"3.2",
),
(
Program(
[
IntegerLiteral(Token(TokenType.INT, "123", 1, 1)),
FloatLiteral(Token(TokenType.FLOAT, "3.2", 2, 1)),
]
),
(
"Token(type=TokenType.INT, literal='123', line=1, column=1)\n"
"Token(type=TokenType.FLOAT, literal='3.2', line=2, column=1)"
),
"123\n3.2",
),
(
Program(
[
PrefixExpression(
Token(TokenType.MINUS, "-", 1, 1),
IntegerLiteral(Token(TokenType.INT, "123", 1, 2)),
),
]
),
(
"Token(type=TokenType.MINUS, literal='-', line=1, column=1) "
"Token(type=TokenType.INT, literal='123', line=1, column=2)"
),
"-123",
),
(
Program(
[
PrefixExpression(
Token(TokenType.MINUS, "-", 1, 1),
FloatLiteral(Token(TokenType.FLOAT, "123.456", 1, 2)),
),
PrefixExpression(
Token(TokenType.PLUS, "+", 2, 1),
IntegerLiteral(Token(TokenType.INT, "5", 2, 2)),
),
]
),
(
"Token(type=TokenType.MINUS, literal='-', line=1, column=1) "
"Token(type=TokenType.FLOAT, literal='123.456', line=1, column=2)\n"
"Token(type=TokenType.PLUS, literal='+', line=2, column=1) "
"Token(type=TokenType.INT, literal='5', line=2, column=2)"
),
"-123.456\n+5",
),
(
Program(
[
InfixExpression(
Token(TokenType.MINUS, "-", 1, 11),
InfixExpression(
Token(TokenType.PLUS, "+", 1, 5),
IntegerLiteral(Token(TokenType.INT, "123", 1, 1)),
IntegerLiteral(Token(TokenType.INT, "456", 1, 7)),
),
IntegerLiteral(Token(TokenType.INT, "789", 1, 13)),
),
]
),
(
"(Token(type=TokenType.MINUS, literal='-', line=1, column=11) "
"(Token(type=TokenType.PLUS, literal='+', line=1, column=5) "
"Token(type=TokenType.INT, literal='123', line=1, column=1) "
"Token(type=TokenType.INT, literal='456', line=1, column=7)) "
"Token(type=TokenType.INT, literal='789', line=1, column=13))"
),
"((123 + 456) - 789)",
),
(
Program(
[
InfixExpression(
Token(TokenType.PLUS, "+", 1, 2),
IntegerLiteral(Token(TokenType.INT, "1", 1, 1)),
InfixExpression(
Token(TokenType.SLASH, "/", 1, 5),
IntegerLiteral(Token(TokenType.INT, "2", 1, 3)),
IntegerLiteral(Token(TokenType.INT, "3", 1, 7)),
),
),
]
),
(
"(Token(type=TokenType.PLUS, literal='+', line=1, column=2) "
"Token(type=TokenType.INT, literal='1', line=1, column=1) "
"(Token(type=TokenType.SLASH, literal='/', line=1, column=5) "
"Token(type=TokenType.INT, literal='2', line=1, column=3) "
"Token(type=TokenType.INT, literal='3', line=1, column=7)))"
),
"(1 + (2 / 3))",
),
(
Program(
[
InfixExpression(
Token(TokenType.PLUS, "+", 1, 9),
GroupedExpression(
InfixExpression(
Token(TokenType.PLUS, "+", 1, 4),
IntegerLiteral(Token(TokenType.INT, "1", 1, 2)),
IntegerLiteral(Token(TokenType.INT, "2", 1, 6)),
),
),
InfixExpression(
Token(TokenType.CARET, "^", 1, 19),
GroupedExpression(
InfixExpression(
Token(TokenType.PLUS, "+", 1, 14),
IntegerLiteral(Token(TokenType.INT, "2", 1, 12)),
IntegerLiteral(Token(TokenType.INT, "3", 1, 16)),
),
),
IntegerLiteral(Token(TokenType.INT, "4", 1, 21)),
),
),
]
),
(
"(Token(type=TokenType.PLUS, literal='+', line=1, column=9) "
"((Token(type=TokenType.PLUS, literal='+', line=1, column=4) "
"Token(type=TokenType.INT, literal='1', line=1, column=2) "
"Token(type=TokenType.INT, literal='2', line=1, column=6))) "
"(Token(type=TokenType.CARET, literal='^', line=1, column=19) "
"((Token(type=TokenType.PLUS, literal='+', line=1, column=14) "
"Token(type=TokenType.INT, literal='2', line=1, column=12) "
"Token(type=TokenType.INT, literal='3', line=1, column=16))) "
"Token(type=TokenType.INT, literal='4', line=1, column=21)))"
),
"(((1 + 2)) + (((2 + 3)) ^ 4))",
),
],
)
def test_ast(
program: Program,
expected_repr: str,
expected_str: str,
) -> None:
assert (
repr(program) == expected_repr
), f"repr: expected='{expected_repr}', actual='{repr(program)}'"
assert (
str(program) == expected_str
), f"str: expected='{expected_str}', actual='{program}'"这一章的两个概念结合性和优先级非常重要。代码层面应该没有什么难度。当然有一些测试的数据比较复杂,需要一些耐心。
$ git add .
$ git commit -m "parser: ast"