# Source code for paxter.core.parser

"""
Recursive descent parser of Paxter language.
"""
from dataclasses import dataclass, field
from typing import List, Match, NoReturn, Tuple

from paxter.core.charloc import CharLoc
from paxter.core.data import (
    Command, Fragment, FragmentList, Identifier, Number, Operator, ShortSymbol, Text,
    TokenList,
)
from paxter.core.enclosing import EnclosingPattern, GlobalEnclosingPattern
from paxter.core.exceptions import PaxterSyntaxError
from paxter.core.lexers import LEXER

# Public API of this module: only ``ParseContext`` is exported
# by a wildcard import (``from ... import *``).
__all__ = ['ParseContext']

# Translation table (usable with ``str.translate``) which maps each opening
# bracket character to its closing counterpart:
# '(' -> ')', '[' -> ']' and '{' -> '}'.
OPENED_TO_CLOSED_SCOPE_TRANS = str.maketrans({
    ord('('): ord(')'),
    ord('['): ord(']'),
    ord('{'): ord('}'),
})


@dataclass
class ParseContext:
    """
    Implements a recursive descent parser for Paxter language text input.

    To utilize this class, provide the input text to the constructor,
    and the resulting parsed tree node will be generated upon instantiation
    (parsing is triggered from ``__post_init__``).

    Every ``_parse_*`` method works on absolute character offsets
    into ``input_text`` and returns a pair of
    ``(position at which parsing should resume, parsed node)``.
    Syntax problems are reported by the ``_cannot_match_*`` and
    ``_invalid_cmd`` helpers, all of which always raise
    :class:`PaxterSyntaxError`.
    """
    #: Document source text
    input_text: str

    #: Root node of the parsed tree
    tree: FragmentList = field(init=False)

    def __post_init__(self) -> None:
        # Parse eagerly so that ``tree`` is available right after construction.
        self.tree = self._parse_global_fragment_list()

    def _parse_global_fragment_list(self) -> FragmentList:
        """
        Parses the entirety of the already provided input text
        for the global-level fragment list from the very beginning.

        Raises ``RuntimeError`` if the position returned by the inner
        parser is not the end of the input text.
        """
        end_pos, node = self._inner_parse_fragment_list(0, GlobalEnclosingPattern())
        if end_pos != len(self.input_text):  # pragma: no cover
            raise RuntimeError("unexpected error; input text not fully consumed")
        return node

    def _inner_parse_fragment_list(
        self,
        next_pos: int,
        enclosing: EnclosingPattern,
    ) -> Tuple[int, FragmentList]:
        """
        Subroutinely parses the input expecting a list of fragment nodes
        starting from the given position indicated by ``next_pos``.

        This method is called when parsing the global-level input
        or when parsing under a scope enclosed by braces pattern.

        Returns the position right after the final break match together
        with the resulting :class:`FragmentList` node. Raises
        :class:`PaxterSyntaxError` when the break pattern of ``enclosing``
        cannot be matched.
        """
        start_pos = next_pos
        children: List[Fragment] = []

        while True:
            # Attempts to match the next break pattern
            break_matchobj = enclosing.rec_break_re.match(self.input_text, next_pos)
            if break_matchobj is None:
                self._cannot_match_enclosing(start_pos, enclosing)

            # Append non-empty text node to children list
            text_node = Text.from_matchobj(
                break_matchobj, 'inner',
                enclosing=EnclosingPattern(left=''),
            )
            if text_node.inner:
                children.append(text_node)

            # Dispatch parsing between the @-expression switch
            # and the closing (i.e. right) pattern
            next_pos = break_matchobj.end()
            break_char = break_matchobj.group('break')
            if break_char != '@':
                break
            next_pos, result_node = self._parse_at_expr(next_pos)
            children.append(result_node)

        # The fragment list ends where the last 'inner' group ends,
        # whereas parsing resumes after the whole final break match.
        end_pos = break_matchobj.end('inner')
        fragment_list_node = FragmentList(start_pos, end_pos, children, enclosing)
        return next_pos, fragment_list_node

    def _parse_at_expr(self, next_pos: int) -> Tuple[int, Fragment]:
        """
        Parses @-expressions starting from immediately after @-symbol
        by attempting to dispatch the next step through lookahead patterns.

        The lookahead patterns are tried in this order: identifier,
        left bar pattern, then single symbol. Raises
        :class:`PaxterSyntaxError` if none of them matches.
        """
        matchobj = LEXER.id_re.match(self.input_text, next_pos)
        if matchobj:
            return self._parse_cmd_with_id_starter(matchobj)

        matchobj = LEXER.lbar_re.match(self.input_text, next_pos)
        if matchobj:
            return self._parse_cmd_with_bar_starter(matchobj)

        matchobj = LEXER.symbol_re.match(self.input_text, next_pos)
        if matchobj:
            return self._parse_short_symbol(matchobj)

        # Always raises, hence there is no fall-through return.
        self._invalid_cmd(next_pos)

    def _parse_cmd_with_id_starter(
        self,
        id_matchobj: Match[str],
    ) -> Tuple[int, Command]:
        """
        Continues parsing the starter section of the Command
        by using the identifier name content as the starter section.
        """
        cmd_start_pos, next_pos = id_matchobj.span()
        starter = id_matchobj.group('id')
        # An identifier starter has no surrounding bar pattern.
        starter_enclosing = EnclosingPattern(left='')
        return self._parse_cmd_after_starter(
            next_pos, cmd_start_pos, starter, starter_enclosing,
        )

    def _parse_cmd_with_bar_starter(
        self,
        lbar_matchobj: Match[str],
    ) -> Tuple[int, Command]:
        """
        Continues parsing the starter section of the Command
        which is enclosed by the bar pattern.

        Raises :class:`PaxterSyntaxError` when the non-recursive break
        pattern of the bar enclosing cannot be matched.
        """
        cmd_start_pos, next_pos = lbar_matchobj.span()
        starter_enclosing = EnclosingPattern(left=lbar_matchobj.group('left'))

        inner_matchobj = starter_enclosing.non_rec_break_re.match(
            self.input_text, next_pos,
        )
        if inner_matchobj is None:
            self._cannot_match_enclosing(next_pos, starter_enclosing)

        next_pos = inner_matchobj.end()
        starter = inner_matchobj.group('inner')
        return self._parse_cmd_after_starter(
            next_pos, cmd_start_pos, starter, starter_enclosing,
        )

    def _parse_cmd_after_starter(
        self,
        next_pos: int,
        cmd_start_pos: int,
        starter: str,
        starter_enclosing: EnclosingPattern,
    ) -> Tuple[int, Command]:
        """
        Continues parsing the Command after the starter section.

        An option section (square brackets) is parsed first if present,
        followed by a main argument which is either a fragment list
        (left brace pattern) or a raw text (left quote pattern).
        Sections which are absent are recorded as ``None``
        in the resulting :class:`Command` node.
        """
        # Parses for option section (square brackets)
        lbracket_matchobj = LEXER.lbracket_re.match(self.input_text, next_pos)
        if lbracket_matchobj:
            next_pos = lbracket_matchobj.end()
            next_pos, options = self._parse_option(next_pos)
        else:
            options = None

        # Parses for main argument
        lbrace_matchobj = LEXER.lbrace_re.match(self.input_text, next_pos)
        if lbrace_matchobj:
            next_pos, main_arg_node = self._parse_fragment_list(lbrace_matchobj)
        else:
            lquote_matchobj = LEXER.lquote_re.match(self.input_text, next_pos)
            if lquote_matchobj:
                next_pos, main_arg_node = self._parse_text(lquote_matchobj)
            else:
                main_arg_node = None

        # Construct Command node
        cmd_node = Command(
            cmd_start_pos, next_pos, starter, starter_enclosing,
            options, main_arg_node,
        )
        return next_pos, cmd_node

    def _parse_fragment_list(
        self,
        lbrace_matchobj: Match[str],
    ) -> Tuple[int, FragmentList]:
        """
        Recursively parses the input until the enclosing right pattern
        corresponding to the enclosing left pattern
        (captured by the provided match object) is discovered.
        """
        next_pos = lbrace_matchobj.end()
        enclosing = EnclosingPattern(left=lbrace_matchobj.group('left'))
        return self._inner_parse_fragment_list(next_pos, enclosing)

    def _parse_text(self, lquote_matchobj: Match[str]) -> Tuple[int, Text]:
        """
        Continues parsing the input for raw :class:`Text` node
        until the enclosing right pattern corresponding to the
        enclosing left pattern (captured by the provided match object)
        is discovered.

        Raises :class:`PaxterSyntaxError` when the non-recursive break
        pattern of the quote enclosing cannot be matched.
        """
        next_pos = lquote_matchobj.end()
        enclosing = EnclosingPattern(left=lquote_matchobj.group('left'))

        inner_matchobj = enclosing.non_rec_break_re.match(self.input_text, next_pos)
        if inner_matchobj is None:
            self._cannot_match_enclosing(next_pos, enclosing)

        next_pos = inner_matchobj.end()
        text_node = Text.from_matchobj(inner_matchobj, 'inner', enclosing)
        return next_pos, text_node

    def _parse_short_symbol(
        self,
        symbol_matchobj: Match[str],
    ) -> Tuple[int, ShortSymbol]:
        """
        A special case of @-expression (called a "short symbol")
        where a single-character symbol follows the @-switch character.
        """
        next_pos = symbol_matchobj.end()
        command_node = ShortSymbol.from_matchobj(symbol_matchobj, 'symbol')
        return next_pos, command_node

    def _parse_option(self, next_pos: int) -> Tuple[int, TokenList]:
        """
        Parses the option section until reaching the right square brackets.

        ``next_pos`` is the position right after the left square bracket.
        On each iteration, leading whitespace is skipped and then
        the token kinds are tried in this order: identifier, operator,
        number literal, fragment list, raw text, @-expression,
        nested token list, and finally the closing right square bracket
        (which ends the token list). Raises :class:`PaxterSyntaxError`
        when nothing matches at the current position.
        """
        start_pos = next_pos
        children = []

        while True:
            # Remove leading whitespaces
            # NOTE(review): the result is used without a None check, so this
            # presumably relies on ws_re matching the empty string — confirm.
            ws_matchobj = LEXER.ws_re.match(self.input_text, next_pos)
            next_pos = ws_matchobj.end()

            # Attempts to extract identifier node
            id_matchobj = LEXER.id_re.match(self.input_text, next_pos)
            if id_matchobj:
                next_pos = id_matchobj.end()
                id_node = Identifier.from_matchobj(id_matchobj, 'id')
                children.append(id_node)
                continue

            # Attempts to extract operator node
            op_matchobj = LEXER.op_re.match(self.input_text, next_pos)
            if op_matchobj:
                next_pos = op_matchobj.end()
                op_node = Operator.from_matchobj(op_matchobj, 'op')
                children.append(op_node)
                continue

            # Attempts to extract number literal node
            num_matchobj = LEXER.num_re.match(self.input_text, next_pos)
            if num_matchobj:
                next_pos = num_matchobj.end()
                num_node = Number.from_matchobj(num_matchobj, 'num')
                children.append(num_node)
                continue

            # Attempts to extract fragment list node
            lbrace_matchobj = LEXER.lbrace_re.match(self.input_text, next_pos)
            if lbrace_matchobj:
                next_pos, fragment_list_node = (
                    self._parse_fragment_list(lbrace_matchobj)
                )
                children.append(fragment_list_node)
                continue

            # Attempts to extract text node
            lquote_matchobj = LEXER.lquote_re.match(self.input_text, next_pos)
            if lquote_matchobj:
                next_pos, text_node = self._parse_text(lquote_matchobj)
                children.append(text_node)
                continue

            # Attempts to extract @-expressions
            at_matchobj = LEXER.at_re.match(self.input_text, next_pos)
            if at_matchobj:
                next_pos = at_matchobj.end()
                next_pos, at_expr_node = self._parse_at_expr(next_pos)
                children.append(at_expr_node)
                continue

            # Attempts to parse a sub-level list of tokens
            lbracket_matchobj = LEXER.lbracket_re.match(self.input_text, next_pos)
            if lbracket_matchobj:
                next_pos = lbracket_matchobj.end()
                next_pos, token_list_node = self._parse_option(next_pos)
                children.append(token_list_node)
                continue

            # Attempts to parse the end of token list
            # Return the token list if this is the case
            rbracket_matchobj = LEXER.rbracket_re.match(self.input_text, next_pos)
            if rbracket_matchobj:
                end_pos, next_pos = rbracket_matchobj.span()
                return next_pos, TokenList(start_pos, end_pos, children)

            # Else, something was wrong at the parsing,
            # perhaps reaching the end of text or found unmatched parenthesis.
            self._cannot_match_char(start_pos, '[', ']')

    def _cannot_match_enclosing(
        self,
        pos: int,
        enclosing: EnclosingPattern,
    ) -> NoReturn:
        """
        Raises syntax error for failing to match enclosing right pattern
        to the corresponding enclosing left pattern.

        The reported location is ``pos`` moved back by the length
        of the left pattern. The ``%(pos)s`` placeholder is left
        in the message verbatim and the location is passed separately
        as the ``pos`` keyword argument.
        """
        raise PaxterSyntaxError(
            f"cannot match enclosing right pattern {enclosing.right!r} "
            f"to the left pattern {enclosing.left!r} at %(pos)s",
            pos=CharLoc(self.input_text, pos - len(enclosing.left)),
        )

    def _cannot_match_char(
        self,
        pos: int,
        left_char: str,
        right_char: str,
    ) -> NoReturn:
        """
        Raises syntax error for failing to match enclosing right char
        to the corresponding enclosing left char.

        The reported location is ``pos`` moved back by the length
        of ``left_char``.
        """
        raise PaxterSyntaxError(
            f"cannot match enclosing right character {right_char!r} "
            f"to the left character {left_char!r} at %(pos)s",
            pos=CharLoc(self.input_text, pos - len(left_char)),
        )

    def _invalid_cmd(self, pos: int) -> NoReturn:
        """
        Raises syntax error for failing to parse @-command.
        """
        raise PaxterSyntaxError(
            "invalid expression after @-command at %(pos)s",
            pos=CharLoc(self.input_text, pos),
        )