# Source code for paxter.core.parser

"""
Recursive descent parser of Paxter language.
"""
from dataclasses import dataclass
from typing import List, Match, Tuple, Union

from paxter.core.data import (
    Fragment, FragmentList, Identifier, Number, Operator, PaxterApply, PaxterPhrase,
    Text, TokenList,
)
from paxter.core.exceptions import PaxterSyntaxError
from paxter.core.lexers import LEXER
from paxter.core.line_col import LineCol
from paxter.core.scope_pattern import (
    EMPTY_SCOPE_PATTERN, GLOBAL_SCOPE_PATTERN, ScopePattern,
)

# Public API of this module: only the parser class is exported.
__all__ = ['ParseContext']

# Translation table (for use with ``str.translate``) mapping each opening
# bracket character '(', '[', '{' to its closing counterpart ')', ']', '}'.
OPENED_TO_CLOSED_SCOPE_TRANS = str.maketrans('([{', ')]}')


@dataclass
class ParseContext:
    """
    Implements recursive descent parser for Paxter language.

    Below is how to utilize this class::

        input_text = 'Hello @name'
        tree = ParseContext(input_text).parse()

    Every ``parse_*`` method returns a pair ``(next_pos, node)`` where
    ``next_pos`` is the position in the input text at which parsing
    should resume and ``node`` is the node that was just parsed.
    Syntax problems are reported by raising `PaxterSyntaxError`
    through `cannot_match_closed_pattern` or `invalid_command`.
    """
    #: Document source text
    input_text: str

    def parse(self) -> FragmentList:
        """
        Parses the already provided input text starting from the beginning.
        This method is expensive and should not be called more than once.

        Returns the `FragmentList` node for the whole input text.
        Raises `RuntimeError` if parsing stopped before the end of the input.
        """
        end_pos, node = self.parse_inner_fragment_list(0, GLOBAL_SCOPE_PATTERN)
        if end_pos != len(self.input_text):  # pragma: no cover
            raise RuntimeError("unexpected error; input text not fully consumed")
        return node

    def parse_inner_fragment_list(
            self, next_pos: int, scope_pattern: ScopePattern,
    ) -> Tuple[int, FragmentList]:
        """
        Parses the input text at the given position
        for the scope of fragment node list.
        This method is called at the start of global-level text
        or the start of the text within the braces pattern.

        The loop alternates between consuming raw text up to the next
        break (either an ``@`` or the closed pattern of the scope)
        and parsing the @-command which follows an ``@`` break.
        It ends when the break is anything other than ``@``.
        """
        start_pos = next_pos
        children: List[Fragment] = []

        while True:
            # Tries to match the break pattern
            break_matchobj = scope_pattern.rec_break_re.match(self.input_text, next_pos)
            if break_matchobj is None:
                # Always raises, hence break_matchobj is not None below
                self.cannot_match_closed_pattern(start_pos, scope_pattern)

            # Append non-empty text node to children list
            text_node = Text.from_matchobj(break_matchobj, 'inner', EMPTY_SCOPE_PATTERN)
            if text_node.inner:
                children.append(text_node)
            next_pos = break_matchobj.end()

            # Dispatch parsing between the @-command switch
            # and the closed (i.e. right) pattern
            if break_matchobj.group('break') == '@':
                next_pos, result_node = self.parse_command(next_pos)
                children.append(result_node)
                continue

            # The closed pattern was reached: the node spans from the start
            # of this scope up to the end of the last raw text segment.
            fragment_list_node = FragmentList(
                start_pos, break_matchobj.end('inner'),
                children, scope_pattern,
            )
            return next_pos, fragment_list_node

    def parse_command(self, next_pos: int) -> Tuple[int, Fragment]:
        """
        Parses the @-command located at the given position
        (i.e. right after the ``@`` switch character).

        This is a thin wrapper around `parse_command_2nd_level`
        which additionally sets ``is_command = True`` on the result
        whenever it is a `FragmentList` or a `Text` node.
        """
        next_pos, command_node = self.parse_command_2nd_level(next_pos)
        if isinstance(command_node, (FragmentList, Text)):
            command_node.is_command = True
        return next_pos, command_node

    def parse_command_2nd_level(self, next_pos: int) -> Tuple[int, Fragment]:
        """
        Attempts to parse all kinds of Paxter expressions
        by looking ahead for desired patterns.

        The prefix patterns are tried in a fixed order (identifier, brace,
        quote, bar, single symbol) and the first one that matches decides
        which specialized parsing method is used.
        If none of them matches, a syntax error is raised
        through `invalid_command`.
        """
        # Ordered lookahead table: the first matching prefix wins.
        dispatch_table = (
            (LEXER.id_prefix_re, self.parse_command_after_id),
            (LEXER.brace_prefix_re, self.parse_fragment_list),
            (LEXER.quote_prefix_re, self.parse_text),
            (LEXER.bar_prefix_re, self.parse_normal_phrase),
            (LEXER.symbol_re, self.parse_symbol_phrase),
        )
        for prefix_re, handler in dispatch_table:
            matchobj = prefix_re.match(self.input_text, next_pos)
            if matchobj:
                return handler(matchobj)

        # Always raises
        self.invalid_command(next_pos)

    def parse_command_after_id(
            self, id_prefix_matchobj: Match[str],
    ) -> Tuple[int, Union[PaxterApply, PaxterPhrase]]:
        """
        Continues parsing the command after the identifier section.
        The result of this function could either be `PaxterApply`
        (if the options section or the main argument section exists)
        or `PaxterPhrase` (otherwise).
        """
        start_pos = id_prefix_matchobj.start()
        next_pos = id_prefix_matchobj.end()

        # Parse for options section (square brackets)
        options = None
        bracket_prefix_matchobj = LEXER.bracket_prefix_re.match(
            self.input_text, next_pos,
        )
        if bracket_prefix_matchobj:
            next_pos, options = self.parse_options(bracket_prefix_matchobj.end())

        # Parse for main arguments: a brace-delimited fragment list
        # takes precedence over a quoted text
        main_arg_node = None
        brace_prefix_matchobj = LEXER.brace_prefix_re.match(self.input_text, next_pos)
        if brace_prefix_matchobj:
            next_pos, main_arg_node = self.parse_fragment_list(brace_prefix_matchobj)
        else:
            quote_prefix_matchobj = LEXER.quote_prefix_re.match(
                self.input_text, next_pos,
            )
            if quote_prefix_matchobj:
                next_pos, main_arg_node = self.parse_text(quote_prefix_matchobj)

        # Create PaxterPhrase node as a special case
        # (a bare identifier without options and without main argument)
        if options is None and main_arg_node is None:
            result_node = PaxterPhrase.from_matchobj(
                id_prefix_matchobj, 'id', EMPTY_SCOPE_PATTERN,
            )

        # Create PaxterApply node
        else:
            id_node = Identifier.from_matchobj(id_prefix_matchobj, 'id')
            result_node = PaxterApply(
                start_pos, next_pos, id_node, options, main_arg_node,
            )

        return next_pos, result_node

    def parse_fragment_list(
            self, brace_prefix_matchobj: Match[str],
    ) -> Tuple[int, FragmentList]:
        """
        Recursively parses the input text until the closed (i.e. right)
        pattern corresponding to the opened (i.e. left) pattern
        captured by the provided match object is discovered.
        """
        next_pos = brace_prefix_matchobj.end()
        scope_pattern = ScopePattern(opening=brace_prefix_matchobj.group('opened'))

        return self.parse_inner_fragment_list(next_pos, scope_pattern)

    def _match_non_rec_inner(
            self, prefix_matchobj: Match[str],
    ) -> Tuple[Match[str], ScopePattern]:
        """
        Matches the non-recursive content which follows the opened pattern
        captured by the given prefix match object (group ``'opened'``).

        Returns the match object of the inner content together with
        the scope pattern built from the opened pattern.
        Raises a syntax error through `cannot_match_closed_pattern`
        if the content cannot be matched.
        """
        next_pos = prefix_matchobj.end()
        scope_pattern = ScopePattern(opening=prefix_matchobj.group('opened'))

        inner_matchobj = scope_pattern.non_rec_break_re.match(self.input_text, next_pos)
        if inner_matchobj is None:
            # Always raises
            self.cannot_match_closed_pattern(next_pos, scope_pattern)

        return inner_matchobj, scope_pattern

    def parse_text(self, quote_prefix_matchobj: Match[str]) -> Tuple[int, Text]:
        """
        Continues parsing the command for `Text`
        following the pattern `@"..."`.
        """
        inner_matchobj, scope_pattern = self._match_non_rec_inner(
            quote_prefix_matchobj,
        )
        text_node = Text.from_matchobj(inner_matchobj, 'inner', scope_pattern)
        return inner_matchobj.end(), text_node

    def parse_normal_phrase(
            self, bar_prefix_matchobj: Match[str],
    ) -> Tuple[int, PaxterPhrase]:
        """
        Continues parsing the command for `PaxterPhrase`
        following the pattern `@|...|`.
        """
        inner_matchobj, scope_pattern = self._match_non_rec_inner(
            bar_prefix_matchobj,
        )
        phrase_node = PaxterPhrase.from_matchobj(inner_matchobj, 'inner', scope_pattern)
        return inner_matchobj.end(), phrase_node

    def parse_symbol_phrase(
            self, symbol_matchobj: Match[str],
    ) -> Tuple[int, PaxterPhrase]:
        """
        Continues parsing the command for `PaxterPhrase`
        following the pattern `@_` where `_` is a single-character symbol.
        """
        phrase_node = PaxterPhrase.from_matchobj(
            symbol_matchobj, 'symbol', EMPTY_SCOPE_PATTERN,
        )
        return symbol_matchobj.end(), phrase_node

    def parse_options(self, next_pos: int) -> Tuple[int, TokenList]:
        """
        Parses the options section until reaching the closed square brackets.
        The given position must be right after the opened square bracket.
        """
        return self.parse_options_rec(next_pos, '[')

    def parse_options_rec(
            self, next_pos: int,
            opened_char: str,
    ) -> Tuple[int, TokenList]:
        """
        Recursively parses the options section
        until reaching the given breaking character.

        ``opened_char`` is the bracket character which opened the current
        level; parsing of this level ends at its closing counterpart.
        Nested ``(``, ``[`` and ``{`` start a sub-level token list,
        and ``@`` starts an embedded @-command.
        Anything else raises a syntax error
        through `cannot_match_closed_pattern`.
        """
        start_pos = next_pos
        expected_closed_char = opened_char.translate(OPENED_TO_CLOSED_SCOPE_TRANS)
        children = []

        while True:
            token_matchobj = LEXER.option_token_re.match(self.input_text, next_pos)
            if token_matchobj is None:
                # Defensive guard: the token pattern is defined elsewhere,
                # so report a proper syntax error (instead of failing with
                # AttributeError) should it ever fail to match.
                self.cannot_match_closed_pattern(
                    start_pos, ScopePattern(opened_char, expected_closed_char),
                )
            next_pos = token_matchobj.end()

            # Attempts to extract identifier node
            if token_matchobj.group('id'):
                children.append(Identifier.from_matchobj(token_matchobj, 'id'))
                continue

            # Attempts to extract operator node
            if token_matchobj.group('op'):
                children.append(Operator.from_matchobj(token_matchobj, 'op'))
                continue

            # Attempts to extract number literal node
            if token_matchobj.group('num'):
                children.append(Number.from_matchobj(token_matchobj, 'num'))
                continue

            # May be None when the group did not participate in the match;
            # every comparison below is simply false in that case.
            char = token_matchobj.group('char')

            # Attempts to parse the command
            if char == '@':
                next_pos, command_node = self.parse_command(next_pos)
                children.append(command_node)
                continue

            # Attempts to parse a list of tokens in sub-level.
            # Tuple membership (rather than substring test against '([{')
            # makes sure that an empty string never counts as a bracket.
            if char in ('(', '[', '{'):
                next_pos, token_list_node = self.parse_options_rec(next_pos, char)
                children.append(token_list_node)
                continue

            # Asserts that the character matches the expected closed character.
            # Return the token list if this is the case.
            if char == expected_closed_char:
                end_pos = token_matchobj.start()
                return next_pos, TokenList(start_pos, end_pos, children)

            # Else, something was wrong at the parsing,
            # perhaps reaching the end of text or found unmatched parenthesis.
            self.cannot_match_closed_pattern(
                start_pos, ScopePattern(opened_char, expected_closed_char),
            )

    def cannot_match_closed_pattern(self, pos: int, scope_pattern: ScopePattern):
        """
        Raises syntax error for failing to match closed pattern
        to the corresponding opened pattern.

        This method never returns normally: it always raises
        `PaxterSyntaxError` carrying the line/column of the given position.
        """
        # NOTE(review): '%(pos)s' is deliberately left unformatted here;
        # presumably PaxterSyntaxError interpolates it from the pos keyword.
        raise PaxterSyntaxError(
            f"cannot match closed pattern {scope_pattern.closing!r} "
            f"to the opened pattern {scope_pattern.opening!r} at %(pos)s",
            pos=LineCol(self.input_text, pos),
        )

    def invalid_command(self, pos: int):
        """
        Raises syntax error for failing to parse @-command.

        This method never returns normally: it always raises
        `PaxterSyntaxError` carrying the line/column of the given position.
        """
        raise PaxterSyntaxError(
            "invalid expression after @-command at %(pos)s",
            pos=LineCol(self.input_text, pos),
        )