# Source code for paxter.core.parser

"""
Recursive descent parser of Paxter language.
"""
from dataclasses import dataclass
from typing import List, Match, Tuple, Union

from paxter.core.data import (
    Fragment, FragmentList, Identifier, Number, Operator, PaxterApply, PaxterPhrase,
    Text, TokenList,
)
from paxter.core.exceptions import PaxterSyntaxError
from paxter.core.lexers import LEXER
from paxter.core.line_col import LineCol
from paxter.core.scope_pattern import (
    EMPTY_SCOPE_PATTERN, GLOBAL_SCOPE_PATTERN, ScopePattern,
)

# Public API: the only name exported by a star-import of this module.
__all__ = ['ParseContext']

# Translation table (code point -> code point) that maps each opening
# bracket character to its matching closing bracket; meant to be passed
# to ``str.translate``.
OPENED_TO_CLOSED_SCOPE_TRANS = {
    ord(opened): ord(closed)
    for opened, closed in zip('([{', ')]}')
}


@dataclass
class ParseContext:
    """
    Implements recursive descent parser for Paxter language.

    Below is how to utilize this class::

        input_text = 'Hello @name'
        tree = ParseContext(input_text).parse()

    Every ``parse_*`` method returns a pair ``(next_pos, node)`` where
    ``next_pos`` is the position in ``input_text`` right after the
    consumed portion and ``node`` is the parsed tree node.
    Failures are reported by raising ``PaxterSyntaxError``.
    """
    #: Document source text
    input_text: str

    def parse(self) -> FragmentList:
        """
        Parses the already provided input text starting from the beginning.
        This method is expensive and should not be called more than once.

        Returns the fragment list node for the global scope.
        Raises ``RuntimeError`` if the global-level parse stops
        before the end of the input text (not expected to happen).
        """
        end_pos, node = self.parse_inner_fragment_list(0, GLOBAL_SCOPE_PATTERN)
        if end_pos != len(self.input_text):  # pragma: no cover
            raise RuntimeError("unexpected error; input text not fully consumed")
        return node

    def parse_inner_fragment_list(
            self, next_pos: int, scope_pattern: ScopePattern,
    ) -> Tuple[int, FragmentList]:
        """
        Parses the input text at the given position
        for the scope of fragment node list.

        This method is called at the start of global-level text
        or the start of the text within the braces pattern.
        It repeatedly matches the recursive break pattern of the given
        scope: each match yields some (possibly empty) inner text followed
        by either an ``@`` (start of a command) or the closing pattern.

        Raises ``PaxterSyntaxError`` when the break pattern cannot be
        matched, i.e. the scope is never closed.
        """
        start_pos = next_pos
        children: List[Fragment] = []

        while True:
            # Tries to match the break pattern
            break_matchobj = scope_pattern.rec_break_re.match(
                self.input_text, next_pos,
            )
            if break_matchobj is None:
                self.cannot_match_closed_pattern(start_pos, scope_pattern)

            # Append non-empty text node to children list
            text_node = Text.from_matchobj(
                break_matchobj, 'inner', EMPTY_SCOPE_PATTERN,
            )
            if text_node.inner:
                children.append(text_node)
            next_pos = break_matchobj.end()

            # Dispatch parsing between the @-command switch
            # and the closed (i.e. right) pattern
            break_char = break_matchobj.group('break')
            if break_char == '@':
                next_pos, result_node = self.parse_command(next_pos)
                children.append(result_node)
            else:
                # The node spans up to the end of the last inner text,
                # whereas parsing resumes after the closing pattern.
                fragment_list_node = FragmentList(
                    start_pos, break_matchobj.end('inner'),
                    children, scope_pattern,
                )
                return next_pos, fragment_list_node

    def parse_command(self, next_pos: int) -> Tuple[int, Fragment]:
        """
        Attempts to parse all kinds of Paxter expressions
        by looking ahead for desired patterns.

        This wraps ``parse_command_2nd_level`` and additionally sets
        ``is_command = True`` on the result when it is
        a ``FragmentList`` or a ``Text`` node.
        """
        next_pos, command_node = self.parse_command_2nd_level(next_pos)
        if isinstance(command_node, (FragmentList, Text)):
            command_node.is_command = True
        return next_pos, command_node

    def parse_command_2nd_level(self, next_pos: int) -> Tuple[int, Fragment]:
        """
        Attempts to parse all kinds of Paxter expressions
        by looking ahead for desired patterns.

        The prefixes are tried in this order: identifier, brace, quote,
        bar, and finally single symbol. The first one that matches
        decides which parsing method takes over.
        Raises ``PaxterSyntaxError`` if none of them matches.
        """
        matchobj = LEXER.id_prefix_re.match(self.input_text, next_pos)
        if matchobj:
            return self.parse_command_after_id(matchobj)

        matchobj = LEXER.brace_prefix_re.match(self.input_text, next_pos)
        if matchobj:
            return self.parse_fragment_list(matchobj)

        matchobj = LEXER.quote_prefix_re.match(self.input_text, next_pos)
        if matchobj:
            return self.parse_text(matchobj)

        matchobj = LEXER.bar_prefix_re.match(self.input_text, next_pos)
        if matchobj:
            return self.parse_normal_phrase(matchobj)

        matchobj = LEXER.symbol_re.match(self.input_text, next_pos)
        if matchobj:
            return self.parse_symbol_phrase(matchobj)

        self.invalid_command(next_pos)

    def parse_command_after_id(
            self, id_prefix_matchobj: Match[str],
    ) -> Tuple[int, Union[PaxterApply, PaxterPhrase]]:
        """
        Continues parsing the command after the identifier section.

        The result of this function could either be
        `PaxterApply` (if the options section
        or the main argument section exists)
        or `PaxterPhrase` (otherwise).
        """
        start_pos = id_prefix_matchobj.start()
        next_pos = id_prefix_matchobj.end()

        # Parse for options section (square brackets)
        bracket_prefix_matchobj = LEXER.bracket_prefix_re.match(
            self.input_text, next_pos,
        )
        if bracket_prefix_matchobj:
            next_pos = bracket_prefix_matchobj.end()
            next_pos, options = self.parse_options(next_pos)
        else:
            options = None

        # Parse for main arguments: a brace-delimited fragment list
        # takes precedence over a quoted text
        brace_prefix_matchobj = LEXER.brace_prefix_re.match(
            self.input_text, next_pos,
        )
        if brace_prefix_matchobj:
            next_pos, main_arg_node = self.parse_fragment_list(
                brace_prefix_matchobj,
            )
        else:
            quote_prefix_matchobj = LEXER.quote_prefix_re.match(
                self.input_text, next_pos,
            )
            if quote_prefix_matchobj:
                next_pos, main_arg_node = self.parse_text(quote_prefix_matchobj)
            else:
                main_arg_node = None

        # Create PaxterPhrase node as a special case
        if options is None and main_arg_node is None:
            result_node = PaxterPhrase.from_matchobj(
                id_prefix_matchobj, 'id', EMPTY_SCOPE_PATTERN,
            )

        # Create PaxterApply node
        else:
            id_node = Identifier.from_matchobj(id_prefix_matchobj, 'id')
            result_node = PaxterApply(
                start_pos, next_pos, id_node, options, main_arg_node,
            )

        return next_pos, result_node

    def parse_fragment_list(
            self, brace_prefix_matchobj: Match[str],
    ) -> Tuple[int, FragmentList]:
        """
        Recursively parses the input text until the closed (i.e. right) pattern
        corresponding to the opened (i.e. left) pattern
        captured by the provided match object is discovered.
        """
        next_pos = brace_prefix_matchobj.end()
        scope_pattern = ScopePattern(opening=brace_prefix_matchobj.group('opened'))
        return self.parse_inner_fragment_list(next_pos, scope_pattern)

    def parse_text(self, quote_prefix_matchobj: Match[str]) -> Tuple[int, Text]:
        """
        Continues parsing the command for `Text`
        following the pattern `@"..."`.

        Raises ``PaxterSyntaxError`` if the closing pattern
        is not found.
        """
        next_pos = quote_prefix_matchobj.end()
        scope_pattern = ScopePattern(opening=quote_prefix_matchobj.group('opened'))

        # Non-recursive: the content is matched in one go
        # up to the closing pattern
        inner_matchobj = scope_pattern.non_rec_break_re.match(
            self.input_text, next_pos,
        )
        if inner_matchobj is None:
            self.cannot_match_closed_pattern(next_pos, scope_pattern)

        text_node = Text.from_matchobj(inner_matchobj, 'inner', scope_pattern)
        return inner_matchobj.end(), text_node

    def parse_normal_phrase(
            self, bar_prefix_matchobj: Match[str],
    ) -> Tuple[int, PaxterPhrase]:
        """
        Continues parsing the command for `PaxterPhrase`
        following the pattern `@|...|`.

        Raises ``PaxterSyntaxError`` if the closing pattern
        is not found.
        """
        next_pos = bar_prefix_matchobj.end()
        scope_pattern = ScopePattern(opening=bar_prefix_matchobj.group('opened'))

        # Non-recursive: the content is matched in one go
        # up to the closing pattern
        inner_matchobj = scope_pattern.non_rec_break_re.match(
            self.input_text, next_pos,
        )
        if inner_matchobj is None:
            self.cannot_match_closed_pattern(next_pos, scope_pattern)

        phrase_node = PaxterPhrase.from_matchobj(
            inner_matchobj, 'inner', scope_pattern,
        )
        return inner_matchobj.end(), phrase_node

    def parse_symbol_phrase(
            self, symbol_matchobj: Match[str],
    ) -> Tuple[int, PaxterPhrase]:
        """
        Continues parsing the command for `PaxterPhrase`
        following the pattern `@_` where `_` is a single-character symbol.
        """
        phrase_node = PaxterPhrase.from_matchobj(
            symbol_matchobj, 'symbol', EMPTY_SCOPE_PATTERN,
        )
        return symbol_matchobj.end(), phrase_node

    def parse_options(self, next_pos: int) -> Tuple[int, TokenList]:
        """
        Parses the options section until reaching the closed square brackets.

        ``next_pos`` is the position right after the opening bracket.
        """
        return self.parse_options_rec(next_pos, '[')

    def parse_options_rec(
            self, next_pos: int, opened_char: str,
    ) -> Tuple[int, TokenList]:
        """
        Recursively parses the options section
        until reaching the given breaking character.

        ``opened_char`` is the opening bracket of the current level;
        the loop ends successfully when its matching closing bracket
        is encountered. Nested ``(``, ``[`` and ``{`` start a sub-level
        token list, and ``@`` starts a nested command.

        Raises ``PaxterSyntaxError`` when the next token cannot be
        matched at all, or when it is neither a recognized token
        nor the expected closing bracket.
        """
        start_pos = next_pos
        expected_closed_char = opened_char.translate(OPENED_TO_CLOSED_SCOPE_TRANS)
        children = []

        while True:
            token_matchobj = LEXER.option_token_re.match(self.input_text, next_pos)
            if token_matchobj is None:
                # Report a syntax error below instead of failing with
                # AttributeError on ``None.end()``.
                break
            next_pos = token_matchobj.end()

            # Attempts to extract identifier node
            if token_matchobj.group('id'):
                id_node = Identifier.from_matchobj(token_matchobj, 'id')
                children.append(id_node)
                continue

            # Attempts to extract operator node
            if token_matchobj.group('op'):
                op_node = Operator.from_matchobj(token_matchobj, 'op')
                children.append(op_node)
                continue

            # Attempts to extract number literal node
            if token_matchobj.group('num'):
                num_node = Number.from_matchobj(token_matchobj, 'num')
                children.append(num_node)
                continue

            # May be None when the 'char' group did not participate
            char = token_matchobj.group('char')

            # Attempts to parse the command
            if char == '@':
                next_pos, command_node = self.parse_command(next_pos)
                children.append(command_node)
                continue

            # Attempts to parse a list of tokens in sub-level.
            # Tuple membership (rather than substring search in '([{')
            # so that an empty or multi-character string never qualifies.
            if char in ('(', '[', '{'):
                next_pos, token_list_node = self.parse_options_rec(next_pos, char)
                children.append(token_list_node)
                continue

            # Asserts that the character matches the expected closed character.
            # Return the token list if this is the case.
            if char == expected_closed_char:
                end_pos = token_matchobj.start()
                return next_pos, TokenList(start_pos, end_pos, children)

            # Anything else is an error; handled after the loop
            break

        # Something was wrong at the parsing,
        # perhaps reaching the end of text or found unmatched parenthesis.
        self.cannot_match_closed_pattern(
            start_pos, ScopePattern(opened_char, expected_closed_char),
        )

    def cannot_match_closed_pattern(self, pos: int, scope_pattern: ScopePattern):
        """
        Raises syntax error for failing to match closed pattern
        to the corresponding opened pattern.

        ``pos`` is a position within the input text; it is wrapped into
        a ``LineCol`` and passed as the ``pos`` keyword argument.
        The ``%(pos)s`` placeholder is intentionally left in the message
        (presumably substituted by ``PaxterSyntaxError`` -- confirm there).
        This method never returns normally.
        """
        raise PaxterSyntaxError(
            f"cannot match closed pattern {scope_pattern.closing!r} "
            f"to the opened pattern {scope_pattern.opening!r} at %(pos)s",
            pos=LineCol(self.input_text, pos),
        )

    def invalid_command(self, pos: int):
        """
        Raises syntax error for failing to parse @-command.

        ``pos`` is a position within the input text; it is wrapped into
        a ``LineCol`` and passed as the ``pos`` keyword argument.
        This method never returns normally.
        """
        # Plain string: there is nothing to interpolate here, and
        # ``%(pos)s`` is left as-is for the exception class.
        raise PaxterSyntaxError(
            "invalid expression after @-command at %(pos)s",
            pos=LineCol(self.input_text, pos),
        )