From ab8451cb434fbfd279c20609b3ca71cab1a1a488 Mon Sep 17 00:00:00 2001
From: Alessio Stalla
Date: Tue, 22 Jul 2025 14:03:00 +0200
Subject: [PATCH 1/3] #18 Introduce a PylasuANTLRParser class modeled after
 Kolasu.

Not everything was copied from Kolasu, just the bare minimum to have this
usable in a project.
---
 Requirements.txt          |   2 +-
 pylasu/model/model.py     |  10 +--
 pylasu/parsing/antlr.py   | 150 ++++++++++++++++++++++++++++++++++++++
 pylasu/parsing/results.py |  48 +++++-------
 4 files changed, 174 insertions(+), 36 deletions(-)
 create mode 100644 pylasu/parsing/antlr.py

diff --git a/Requirements.txt b/Requirements.txt
index 59e7149..641f396 100644
--- a/Requirements.txt
+++ b/Requirements.txt
@@ -1,2 +1,2 @@
-antlr4-python3-runtime==4.11.1
+antlr4-python3-runtime==4.13.2
 pyecore==0.12.2; extra == 'ecore'
diff --git a/pylasu/model/model.py b/pylasu/model/model.py
index df026a6..c3d5c7c 100644
--- a/pylasu/model/model.py
+++ b/pylasu/model/model.py
@@ -133,14 +133,14 @@ def process_annotated_property(cl: type, name: str, decl_type):
         fields = dataclasses.fields(cl)
     except TypeError:
         fields = tuple()
-    for field in fields:
-        if field.name == name and PYLASU_FEATURE in field.metadata:
-            feature = field.metadata[PYLASU_FEATURE]
+    for fld in fields:
+        if fld.name == name and PYLASU_FEATURE in fld.metadata:
+            feature = fld.metadata[PYLASU_FEATURE]
             feature.name = name
             if isinstance(decl_type, type):
                 feature.type = decl_type
-            elif type(field.type) is str:
-                feature.type = try_to_resolve_string_type(field.type, name, cl)
+            elif type(fld.type) is str:
+                feature.type = try_to_resolve_string_type(fld.type, name, cl)
             return feature
     return compute_feature_from_annotation(cl, name, decl_type)
diff --git a/pylasu/parsing/antlr.py b/pylasu/parsing/antlr.py
new file mode 100644
index 0000000..3764310
--- /dev/null
+++ b/pylasu/parsing/antlr.py
@@ -0,0 +1,150 @@
+import time
+from abc import abstractmethod
+from typing import Optional, List
+
+from antlr4 import CommonTokenStream, InputStream, Lexer, Parser, ParserATNSimulator, ParserRuleContext, \
+    PredictionContextCache, Recognizer, Token, TokenStream
+from antlr4.error.ErrorListener import ErrorListener
+from pylasu.model import walk, Source, Position, Node, Point
+from pylasu.model.processing import assign_parents
+from pylasu.parsing.parse_tree import token_end_point
+from pylasu.parsing.results import ParsingResultWithFirstStage, FirstStageParsingResult
+from pylasu.validation import Issue, IssueType
+
+
+class PylasuANTLRParser:
+    """A complete description of a multi-stage ANTLR-based parser, from source code to AST.
+
+    You should extend this class to implement the parts that are specific to your language.
+
+    Note: instances of this class are thread-safe and they're meant to be reused. Do not create a new PylasuANTLRParser
+    instance every time you need to parse some source code, or performance may suffer."""
+    def __init__(self):
+        self.prediction_context_cache = PredictionContextCache()
+
+    def parse(self, input_stream: InputStream, consider_range: bool = True, measure_lexing_time: bool = False,
+              source: Optional[Source] = None):
+        """Parses source code, returning a result that includes an AST and a collection of parse issues
+        (errors, warnings).
+        The parsing is done in accordance with the StarLasu methodology, i.e., a first-stage parser builds a parse tree
+        which is then mapped onto a higher-level tree called the AST.
+        @param input_stream the source code.
+        @param source an optional Source descriptor, passed on to the AST mapping and to the result.
+        @param consider_range if true (the default), parsed AST nodes record their position in the input text.
+        @param measure_lexing_time if true, the result will include a measurement of the time spent in lexing, i.e.
+        breaking the input stream into tokens."""
+        start = time.time_ns()
+        first_stage = self.parse_first_stage(input_stream, measure_lexing_time)
+        issues = first_stage.issues
+        ast = self.parse_tree_to_ast(first_stage.root, consider_range, issues, source)
+        self.assign_parents(ast)
+        ast = self.post_process_ast(ast, issues) if ast else ast
+        if ast and not consider_range:
+            # Remove parseTreeNodes because they cause the range to be computed
+            for node in walk(ast):
+                node.origin = None
+        now = time.time_ns()
+        return ParsingResultWithFirstStage(
+            issues,
+            ast,
+            input_stream.getText(0, input_stream.index + 1),
+            (now - start) // 1_000_000,
+            first_stage,
+            source,
+        )
+
+    def parse_first_stage(
+            self, input_stream: InputStream, measure_lexing_time: bool = False, source: Source = None
+    ) -> FirstStageParsingResult:
+        """Executes only the first stage of the parser, i.e., the production of a parse tree. Usually, you'll want to
+        use the [parse] method, which returns an AST that is simpler to use and query."""
+        issues = []
+        lexing_time: Optional[int] = None
+        total_time: int = time.time_ns()
+        parser = self.create_parser(input_stream, issues)
+        if measure_lexing_time:
+            token_stream = parser.getInputStream()
+            if isinstance(token_stream, CommonTokenStream):
+                lexing_time = time.time_ns()
+                token_stream.fill()
+                token_stream.seek(0)
+                lexing_time = (time.time_ns() - lexing_time) // 1_000_000
+        root = self.invoke_root_rule(parser)
+        if root:
+            self.verify_parse_tree(parser, issues, root)
+        total_time = (time.time_ns() - total_time) // 1_000_000
+        return FirstStageParsingResult(issues, root, None, total_time, lexing_time, source)
+
+    def create_parser(self, input_stream: InputStream, issues: List[Issue]) -> Parser:
+        """Creates the first-stage parser."""
+        lexer = self.create_antlr_lexer(input_stream)
+        self.attach_listeners(lexer, issues)
+        token_stream = self.create_token_stream(lexer)
+        parser = self.create_antlr_parser(token_stream)
+        # Assign interpreter to avoid caching DFA states indefinitely across executions
+        parser._interp = \
+            ParserATNSimulator(parser, parser.atn, parser._interp.decisionToDFA, self.prediction_context_cache)
+        self.attach_listeners(parser, issues)
+        return parser
+
+    def invoke_root_rule(self, parser: Parser):
+        """Invokes the parser's root rule, i.e., the method which is responsible for parsing the entire input.
+        Usually this is the topmost rule, the one with index 0 (as also assumed by other libraries such as antlr4-c3),
+        so this method invokes that rule. If your grammar/parser is structured differently, or if you're using this to
+        parse only a portion of the input or a subset of the language, you have to override this method to invoke the
+        correct entry point."""
+        return getattr(parser, parser.ruleNames[0])()
+
+    def verify_parse_tree(self, parser: Parser, issues: List[Issue], root: ParserRuleContext):
+        """Checks the parse tree for correctness.
+        If you're concerned about performance, you may want to override this to do nothing."""
+        last_token: Token = parser.getTokenStream().get(parser.getTokenStream().index)
+        if last_token.type != Token.EOF:
+            issues.append(
+                Issue(
+                    IssueType.SYNTACTIC,
+                    "The whole input was not consumed",
+                    position=Position(token_end_point(last_token), token_end_point(last_token))
+                )
+            )
+        # TODO Kolasu also traverses the parse tree searching for exceptions
+
+    def assign_parents(self, ast):
+        if ast:
+            assign_parents(ast)
+
+    def post_process_ast(self, ast, issues):
+        return ast
+
+    def create_token_stream(self, lexer: Lexer) -> TokenStream:
+        return CommonTokenStream(lexer)
+
+    @abstractmethod
+    def create_antlr_lexer(self, input_stream: InputStream):
+        """Creates the lexer."""
+        pass
+
+    @abstractmethod
+    def create_antlr_parser(self, token_stream: TokenStream):
+        """Creates the first-stage parser."""
+        pass
+
+    @abstractmethod
+    def parse_tree_to_ast(self, root, consider_range: bool, issues: List[Issue], source: Source) -> Optional[Node]:
+        pass
+
+    def attach_listeners(self, recognizer: Recognizer, issues: List[Issue]):
+        recognizer.removeErrorListeners()
+        recognizer.addErrorListener(ParserErrorListener(issues))
+
+
+class ParserErrorListener(ErrorListener):
+    def __init__(self, issues: List[Issue]):
+        self.issues = issues
+
+    def syntaxError(self, recognizer, offending_symbol, line, column, msg, e):
+        start_point = Point(line, column)
+        end_point = start_point
+        if isinstance(offending_symbol, Token):
+            end_point = token_end_point(offending_symbol)
+        self.issues.append(Issue(IssueType.SYNTACTIC, msg or "unspecified", position=Position(start_point, end_point)))
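The abstract methods above (`create_antlr_lexer`, `create_antlr_parser`, `parse_tree_to_ast`) are the whole extension surface, and `invoke_root_rule` can be overridden for a non-default entry point, as its docstring explains. A minimal concrete subclass might look like the following sketch, which assumes hypothetical ANTLR-generated `MyLangLexer`/`MyLangParser` modules with an `expression` rule (these names are illustrative, not part of the patch):

```python
from antlr4 import InputStream, TokenStream

from pylasu.parsing.antlr import PylasuANTLRParser

# Hypothetical ANTLR-generated classes, for illustration only.
from mylang.MyLangLexer import MyLangLexer
from mylang.MyLangParser import MyLangParser


class MyLangExpressionParser(PylasuANTLRParser):
    """Parses a single expression rather than a whole compilation unit."""

    def create_antlr_lexer(self, input_stream: InputStream):
        return MyLangLexer(input_stream)

    def create_antlr_parser(self, token_stream: TokenStream):
        return MyLangParser(token_stream)

    def invoke_root_rule(self, parser):
        # The entry point is the hypothetical `expression` rule, not rule index 0.
        return parser.expression()

    def parse_tree_to_ast(self, root, consider_range, issues, source):
        return None  # The parse-tree-to-AST mapping is out of scope for this sketch.
```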
diff --git a/pylasu/parsing/results.py b/pylasu/parsing/results.py
index 17c9fda..3954bb4 100644
--- a/pylasu/parsing/results.py
+++ b/pylasu/parsing/results.py
@@ -1,37 +1,25 @@
-from dataclasses import dataclass
-from typing import List
+from dataclasses import dataclass, field
+from typing import List, Optional
 
-from antlr4 import ParserRuleContext, Token
-
-from pylasu.model import Source
-from pylasu.validation.validation import WithIssues, IssueType, Issue
-
-
-@dataclass
-class FirstStageResult(WithIssues):
-    parse_tree: ParserRuleContext
+from pylasu.model import Source, Node
+from pylasu.validation.validation import Issue
 
 
 @dataclass
-class LexingResult(WithIssues):
-    tokens: List[Token]
+class FirstStageParsingResult:
+    issues: List[Issue]
+    root: Optional[Node]
+    code: Optional[str] = None
+    time: int = None
+    lexing_time: int = None
+    source: Source = None
 
 
 @dataclass
-class IssuesErrorListener:
-    """This Error Listener should be used with ANTLR lexers and parsers to capture issues"""
-    type: IssueType
-    source: Source
-    issues: WithIssues
-
-    def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e):
-        self.issues.append(Issue(type=self.type, message=msg))
-
-    def reportAmbiguity(self, recognizer, dfa, startIndex, stopIndex, exact, ambigAlts, configs):
-        pass
-
-    def reportAttemptingFullContext(self, recognizer, dfa, startIndex, stopIndex, conflictingAlts, configs):
-        pass
-
-    def reportContextSensitivity(self, recognizer, dfa, startIndex, stopIndex, prediction, configs):
-        pass
+class ParsingResultWithFirstStage:
+    issues: List[Issue] = field(default_factory=list)
+    root: Node = None
+    code: str = None
+    time: int = None
+    first_stage: FirstStageParsingResult = None
+    source: Source = None
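When only the raw parse tree is of interest, `parse_first_stage` can be invoked directly and its `FirstStageParsingResult` (defined just above) inspected; continuing the illustrative sketch from before:

```python
from antlr4 import InputStream

# Reuse a single instance, as the PylasuANTLRParser docstring recommends.
parser = MyLangExpressionParser()
first = parser.parse_first_stage(InputStream("1 + 2"), measure_lexing_time=True)

print(first.issues)       # Issues collected by ParserErrorListener, if any
print(first.root)         # The raw ANTLR ParserRuleContext, or None
print(first.time)         # Total first-stage time, in milliseconds
print(first.lexing_time)  # Lexing time in milliseconds, measured on request
```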
From ef208e3f139e88ad21419a169f093a6399698f54 Mon Sep 17 00:00:00 2001
From: Alessio Stalla
Date: Tue, 22 Jul 2025 14:24:28 +0200
Subject: [PATCH 2/3] #18 Add relevant tests from Kolasu, lint

---
 .github/workflows/pythonapp.yml           |  2 +-
 pylasu/parsing/antlr.py                   | 15 ++++---
 tests/SimpleLangParser.g4                 |  2 +-
 tests/generate-test-parsers.sh            |  6 +--
 tests/parsing/__init__.py                 |  0
 tests/parsing/test_pylasu_antlr_parser.py | 53 +++++++++++++++++++++++
 6 files changed, 67 insertions(+), 11 deletions(-)
 create mode 100644 tests/parsing/__init__.py
 create mode 100644 tests/parsing/test_pylasu_antlr_parser.py

diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml
index a618b66..b3b1527 100644
--- a/.github/workflows/pythonapp.yml
+++ b/.github/workflows/pythonapp.yml
@@ -23,7 +23,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install -r Requirements.txt
-        curl -O https://www.antlr.org/download/antlr-4.11.1-complete.jar
+        pip install antlr4-cli
     - name: Lint with flake8
       run: |
         pip install flake8
diff --git a/pylasu/parsing/antlr.py b/pylasu/parsing/antlr.py
index 3764310..e0cb876 100644
--- a/pylasu/parsing/antlr.py
+++ b/pylasu/parsing/antlr.py
@@ -1,6 +1,6 @@
 import time
 from abc import abstractmethod
-from typing import Optional, List
+from typing import Optional, List, Union
 
 from antlr4 import CommonTokenStream, InputStream, Lexer, Parser, ParserATNSimulator, ParserRuleContext, \
     PredictionContextCache, Recognizer, Token, TokenStream
@@ -22,8 +22,8 @@ class PylasuANTLRParser:
     def __init__(self):
         self.prediction_context_cache = PredictionContextCache()
 
-    def parse(self, input_stream: InputStream, consider_range: bool = True, measure_lexing_time: bool = False,
-              source: Optional[Source] = None):
+    def parse(self, input_stream: Union[InputStream, str], consider_range: bool = True,
+              measure_lexing_time: bool = False, source: Optional[Source] = None):
         """Parses source code, returning a result that includes an AST and a collection of parse issues
         (errors, warnings).
         The parsing is done in accordance with the StarLasu methodology, i.e., a first-stage parser builds a parse tree
@@ -34,6 +34,8 @@ def parse(self, input_stream: InputStream, consider_range: bool = True, measure_
         @param measure_lexing_time if true, the result will include a measurement of the time spent in lexing, i.e.
         breaking the input stream into tokens."""
         start = time.time_ns()
+        if type(input_stream) is str:
+            input_stream = InputStream(input_stream)
         first_stage = self.parse_first_stage(input_stream, measure_lexing_time)
         issues = first_stage.issues
         ast = self.parse_tree_to_ast(first_stage.root, consider_range, issues, source)
@@ -120,12 +122,12 @@ def create_token_stream(self, lexer: Lexer) -> TokenStream:
         return CommonTokenStream(lexer)
 
     @abstractmethod
-    def create_antlr_lexer(self, input_stream: InputStream):
+    def create_antlr_lexer(self, input_stream: InputStream) -> Lexer:
         """Creates the lexer."""
         pass
 
     @abstractmethod
-    def create_antlr_parser(self, token_stream: TokenStream):
+    def create_antlr_parser(self, token_stream: TokenStream) -> Parser:
         """Creates the first-stage parser."""
         pass
 
@@ -147,4 +149,5 @@ def syntaxError(self, recognizer, offending_symbol, line, column, msg, e):
         end_point = start_point
         if isinstance(offending_symbol, Token):
             end_point = token_end_point(offending_symbol)
-        self.issues.append(Issue(IssueType.SYNTACTIC, msg or "unspecified", position=Position(start_point, end_point)))
+        msg = (msg or "unspecified").capitalize()
+        self.issues.append(Issue(IssueType.SYNTACTIC, msg, position=Position(start_point, end_point)))
diff --git a/tests/SimpleLangParser.g4 b/tests/SimpleLangParser.g4
index ac85c0f..a643a87 100644
--- a/tests/SimpleLangParser.g4
+++ b/tests/SimpleLangParser.g4
@@ -3,7 +3,7 @@ parser grammar SimpleLangParser;
 options { tokenVocab = SimpleLangLexer; }
 
 compilationUnit:
-    statement+;
+    statement+ EOF;
 
 statement:
     DISPLAY expression #displayStmt
diff --git a/tests/generate-test-parsers.sh b/tests/generate-test-parsers.sh
index 4b3338e..0d6bf2b 100755
--- a/tests/generate-test-parsers.sh
+++ b/tests/generate-test-parsers.sh
@@ -1,3 +1,3 @@
-java -cp ../antlr-4.11.1-complete.jar org.antlr.v4.Tool -Dlanguage=Python3 -visitor -o simple_lang SimpleLangLexer.g4 SimpleLangParser.g4
-java -cp ../antlr-4.11.1-complete.jar org.antlr.v4.Tool -Dlanguage=Python3 -visitor -o antlr_entity AntlrEntityLexer.g4 AntlrEntityParser.g4
-java -cp ../antlr-4.11.1-complete.jar org.antlr.v4.Tool -Dlanguage=Python3 -visitor -o antlr_script AntlrScriptLexer.g4 AntlrScriptParser.g4
+antlr4 -Dlanguage=Python3 -visitor -o simple_lang SimpleLangLexer.g4 SimpleLangParser.g4
+antlr4 -Dlanguage=Python3 -visitor -o antlr_entity AntlrEntityLexer.g4 AntlrEntityParser.g4
+antlr4 -Dlanguage=Python3 -visitor -o antlr_script AntlrScriptLexer.g4 AntlrScriptParser.g4
diff --git a/tests/parsing/__init__.py b/tests/parsing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/parsing/test_pylasu_antlr_parser.py b/tests/parsing/test_pylasu_antlr_parser.py
new file mode 100644
index 0000000..748531b
--- /dev/null
+++ b/tests/parsing/test_pylasu_antlr_parser.py
@@ -0,0 +1,53 @@
+import unittest
+from typing import List, Optional
+
+from antlr4 import TokenStream, InputStream
+
+from pylasu.model import Source, Node, Position, Point
+from pylasu.parsing.antlr import PylasuANTLRParser
+from pylasu.validation import Issue
+from tests.simple_lang.SimpleLangLexer import SimpleLangLexer
+from tests.simple_lang.SimpleLangParser import SimpleLangParser
+
+
+class SimpleLangPylasuParser(PylasuANTLRParser):
+
+    def create_antlr_lexer(self, input_stream: InputStream):
+        return SimpleLangLexer(input_stream)
+
+    def create_antlr_parser(self, token_stream: TokenStream):
+        return SimpleLangParser(token_stream)
+
+    def parse_tree_to_ast(self, root, consider_range: bool, issues: List[Issue], source: Source) -> Optional[Node]:
+        return None
+
+
+class KolasuParserTest(unittest.TestCase):
+    def test_lexing(self):
+        parser = SimpleLangPylasuParser()
+        result = parser.parse("""set a = 10
+set b = ""
+display c
+""")
+        self.assertIsNotNone(result)
+        # TODO we don't have Pylasu Tokens yet
+
+    def test_issues_are_capitalized(self):
+        parser = SimpleLangPylasuParser()
+        result = parser.parse("""set set a = 10
+display c
+""")
+        self.assertTrue(result.issues)
+        self.assertTrue([i for i in result.issues if i.message.startswith("Extraneous input 'set'")])
+        self.assertTrue([i for i in result.issues if i.message.startswith("Mismatched input 'c'")])
+
+    def test_issues_have_not_flat_position(self):
+        parser = SimpleLangPylasuParser()
+        result = parser.parse("""set set a = 10
+display c
+""")
+        self.assertTrue(result.issues)
+        extraneous_input = [i for i in result.issues if i.message.startswith("Extraneous input 'set'")][0]
+        self.assertEqual(Position(Point(1, 4), Point(1, 7)), extraneous_input.position)
+        mismatched_input = [i for i in result.issues if i.message.startswith("Mismatched input 'c'")][0]
+        self.assertEqual(Position(Point(2, 8), Point(2, 9)), mismatched_input.position)

From f4b83d0434bb5cb51168aad35e516df52ebda64b Mon Sep 17 00:00:00 2001
From: Alessio Stalla
Date: Tue, 22 Jul 2025 15:27:05 +0200
Subject: [PATCH 3/3] Update version, changelog, easier imports for non-ANTLR
 parser stuff

---
 CHANGELOG.md               | 5 +++++
 pylasu/__init__.py         | 2 +-
 pylasu/parsing/__init__.py | 1 +
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cebb204..0db7260 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,11 @@ All notable changes to this project from version 0.4.0 upwards are documented in
 this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
+## [0.9.0] – Not yet released
+
+### Added
+- `PylasuANTLRParser` class modeled after the Kolasu equivalent
+
 ## [0.8.1] – 2025-02-21
 
 ### Added
diff --git a/pylasu/__init__.py b/pylasu/__init__.py
index 26df0ed..976684a 100644
--- a/pylasu/__init__.py
+++ b/pylasu/__init__.py
@@ -1 +1 @@
-VERSION = "0.8.1"
+VERSION = "0.9.0"
diff --git a/pylasu/parsing/__init__.py b/pylasu/parsing/__init__.py
index e69de29..335340e 100644
--- a/pylasu/parsing/__init__.py
+++ b/pylasu/parsing/__init__.py
@@ -0,0 +1 @@
+from .results import FirstStageParsingResult, ParsingResultWithFirstStage
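Taken together, the three patches enable the following end-to-end usage, sketched here with the `SimpleLangPylasuParser` subclass from the tests (the input mirrors the test data; passing a plain string works as of PATCH 2/3):

```python
from tests.parsing.test_pylasu_antlr_parser import SimpleLangPylasuParser

parser = SimpleLangPylasuParser()
result = parser.parse("set set a = 10\ndisplay c\n")

# Messages come out capitalized and carry non-flat positions, as the tests verify.
for issue in result.issues:
    print(issue.message, issue.position)

print(result.root)  # Still None here: parse_tree_to_ast is stubbed out in the tests
print(result.time)  # Wall-clock parse time in milliseconds
print(result.first_stage.issues is result.issues)  # The same issue list flows through
```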