from __future__ import annotations import contextlib import re from dataclasses import dataclass from typing import Generator, Mapping, NoReturn from .specifiers import Specifier @dataclass class Token: name: str text: str position: int class ParserSyntaxError(Exception): """The provided source text could not be parsed correctly.""" def __init__( self, message: str, *, source: str, span: tuple[int, int], ) -> None: self.span = span self.message = message self.source = source super().__init__() def __str__(self) -> str: marker = " " * self.span[0] + "~" * (self.span[1] - self.span[0]) + "^" return f"{self.message}\n {self.source}\n {marker}" DEFAULT_RULES: dict[str, re.Pattern[str]] = { "LEFT_PARENTHESIS": re.compile(r"\("), "RIGHT_PARENTHESIS": re.compile(r"\)"), "LEFT_BRACKET": re.compile(r"\["), "RIGHT_BRACKET": re.compile(r"\]"), "SEMICOLON": re.compile(r";"), "COMMA": re.compile(r","), "QUOTED_STRING": re.compile( r""" ( ('[^']*') | ("[^"]*") ) """, re.VERBOSE, ), "OP": re.compile(r"(===|==|~=|!=|<=|>=|<|>)"), "BOOLOP": re.compile(r"\b(or|and)\b"), "IN": re.compile(r"\bin\b"), "NOT": re.compile(r"\bnot\b"), "VARIABLE": re.compile( r""" \b( python_version |python_full_version |os[._]name |sys[._]platform |platform_(release|system) |platform[._](version|machine|python_implementation) |python_implementation |implementation_(name|version) |extras? |dependency_groups )\b """, re.VERBOSE, ), "SPECIFIER": re.compile( Specifier._operator_regex_str + Specifier._version_regex_str, re.VERBOSE | re.IGNORECASE, ), "AT": re.compile(r"\@"), "URL": re.compile(r"[^ \t]+"), "IDENTIFIER": re.compile(r"\b[a-zA-Z0-9][a-zA-Z0-9._-]*\b"), "VERSION_PREFIX_TRAIL": re.compile(r"\.\*"), "VERSION_LOCAL_LABEL_TRAIL": re.compile(r"\+[a-z0-9]+(?:[-_\.][a-z0-9]+)*"), "WS": re.compile(r"[ \t]+"), "END": re.compile(r"$"), } class Tokenizer: """Context-sensitive token parsing. Provides methods to examine the input stream to check whether the next token matches. """ def __init__( self, source: str, *, rules: Mapping[str, re.Pattern[str]], ) -> None: self.source = source self.rules = rules self.next_token: Token | None = None self.position = 0 def consume(self, name: str) -> None: """Move beyond provided token name, if at current position.""" if self.check(name): self.read() def check(self, name: str, *, peek: bool = False) -> bool: """Check whether the next token has the provided name. By default, if the check succeeds, the token *must* be read before another check. If `peek` is set to `True`, the token is not loaded and would need to be checked again. """ assert self.next_token is None, ( f"Cannot check for {name!r}, already have {self.next_token!r}" ) assert name in self.rules, f"Unknown token name: {name!r}" expression = self.rules[name] match = expression.match(self.source, self.position) if match is None: return False if not peek: self.next_token = Token(name, match[0], self.position) return True def expect(self, name: str, *, expected: str) -> Token: """Expect a certain token name next, failing with a syntax error otherwise. The token is *not* read. """ if not self.check(name): raise self.raise_syntax_error(f"Expected {expected}") return self.read() def read(self) -> Token: """Consume the next token and return it.""" token = self.next_token assert token is not None self.position += len(token.text) self.next_token = None return token def raise_syntax_error( self, message: str, *, span_start: int | None = None, span_end: int | None = None, ) -> NoReturn: """Raise ParserSyntaxError at the given position.""" span = ( self.position if span_start is None else span_start, self.position if span_end is None else span_end, ) raise ParserSyntaxError( message, source=self.source, span=span, ) @contextlib.contextmanager def enclosing_tokens( self, open_token: str, close_token: str, *, around: str ) -> Generator[None, None, None]: if self.check(open_token): open_position = self.position self.read() else: open_position = None yield if open_position is None: return if not self.check(close_token): self.raise_syntax_error( f"Expected matching {close_token} for {open_token}, after {around}", span_start=open_position, ) self.read()