# ------------------------------------------------------------------------------
# pycparser: c_lexer.py
#
# CLexer class: lexer for the C language
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
# ------------------------------------------------------------------------------
import re
from dataclasses import dataclass
from enum import Enum
from typing import Callable, Dict, List, Optional, Tuple


@dataclass(slots=True)
class _Token:
    type: str
    value: str
    lineno: int
    column: int


class CLexer:
    """A standalone lexer for C.

    Parameters for construction:

    error_func:
        Called with (msg, line, column) on lexing errors.

    on_lbrace_func:
        Called when an LBRACE token is produced (used for scope tracking).

    on_rbrace_func:
        Called when an RBRACE token is produced (used for scope tracking).

    type_lookup_func:
        Called with an identifier name; expected to return True if it is a
        typedef name and should be tokenized as TYPEID.

    Call input(text) to initialize lexing, and then keep calling token() to
    get the next token, until it returns None (at end of input).
    """

    def __init__(
        self,
        error_func: Callable[[str, int, int], None],
        on_lbrace_func: Callable[[], None],
        on_rbrace_func: Callable[[], None],
        type_lookup_func: Callable[[str], bool],
    ) -> None:
        self.error_func = error_func
        self.on_lbrace_func = on_lbrace_func
        self.on_rbrace_func = on_rbrace_func
        self.type_lookup_func = type_lookup_func
        self._init_state()

    def input(self, text: str, filename: str = "") -> None:
        """Initialize the lexer with the given input text.

        filename is an optional name identifying the file from which the
        input comes; the lexer can modify it if #line directives are
        encountered.
        """
        self._init_state()
        self._lexdata = text
        self._filename = filename

    def _init_state(self) -> None:
        self._lexdata = ""
        self._filename = ""
        self._pos = 0
        self._line_start = 0
        self._pending_tok: Optional[_Token] = None
        self._lineno = 1

    @property
    def filename(self) -> str:
        return self._filename

    def token(self) -> Optional[_Token]:
        # Lexing strategy overview:
        #
        # - We maintain a current position (self._pos), line number, and the
        #   index where the current line starts. The lexer is a simple loop
        #   that skips whitespace/newlines and emits one token per call.
        # - A small amount of logic is handled manually before regex
        #   matching:
        #
        #   * Preprocessor-style directives: if we see '#', we check whether
        #     it's a #line or #pragma directive and consume it inline. #line
        #     updates lineno/filename and produces no tokens. #pragma can
        #     yield both PPPRAGMA and PPPRAGMASTR, but token() returns a
        #     single token, so we stash the PPPRAGMASTR as _pending_tok to
        #     return on the next token() call. Any other '#' is returned as
        #     PPHASH.
        #   * Newlines update lineno/line-start tracking so tokens can
        #     record accurate columns.
        #
        # - The bulk of tokens are recognized in _match_token:
        #
        #   * _regex_rules: regex patterns for identifiers, literals, and
        #     other complex tokens (including error-producing patterns). The
        #     lexer uses the combined _regex_master to try all of them at
        #     once.
        #   * _fixed_tokens: exact string matches for operators and
        #     punctuation, resolved by longest match.
        #
        # - Error patterns call the error callback and advance minimally,
        #   which keeps lexing resilient while reporting useful diagnostics.
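        #
        # For example (assuming type_lookup_func knows no typedef names),
        # lexing "x = 1;\n#pragma once" yields ID, EQUALS, INT_CONST_DEC and
        # SEMI, then PPPRAGMA; PPPRAGMASTR("once") is served from
        # _pending_tok on the following token() call.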
        text = self._lexdata
        n = len(text)
        if self._pending_tok is not None:
            tok = self._pending_tok
            self._pending_tok = None
            return tok
        while self._pos < n:
            match text[self._pos]:
                case " " | "\t":
                    self._pos += 1
                case "\n":
                    self._lineno += 1
                    self._pos += 1
                    self._line_start = self._pos
                case "#":
                    if _line_pattern.match(text, self._pos + 1):
                        self._pos += 1
                        self._handle_ppline()
                        continue
                    if _pragma_pattern.match(text, self._pos + 1):
                        self._pos += 1
                        toks = self._handle_pppragma()
                        if len(toks) > 1:
                            self._pending_tok = toks[1]
                        if len(toks) > 0:
                            return toks[0]
                        continue
                    tok = self._make_token("PPHASH", "#", self._pos)
                    self._pos += 1
                    return tok
                case _:
                    if tok := self._match_token():
                        return tok
                    else:
                        continue
        return None

    def _match_token(self) -> Optional[_Token]:
        """Match one token at the current position.

        Returns a Token on success, or None if no token could be matched
        and an error was reported. This method always advances _pos: past
        the matched text on success (or on a matched error pattern), or by
        one character when nothing matched.
        """
        text = self._lexdata
        pos = self._pos
        # We pick the longest match between:
        # - the master regex (identifiers, literals, error patterns, etc.)
        # - fixed operator/punctuator literals from the bucket for text[pos]
        #
        # The longest match is required to ensure we properly lex something
        # like ".123" (a floating-point constant) as a single entity (with
        # FLOAT_CONST), rather than a PERIOD followed by a number.
        #
        # The fixed-literal buckets are already length-sorted, so within
        # that bucket we can take the first match. However, we still compare
        # its length to the regex match because the regex may have matched a
        # longer token that should take precedence.
        best = None
        if m := _regex_master.match(text, pos):
            tok_type = m.lastgroup
            # All master-regex alternatives are named, so lastgroup can't be
            # None here.
            assert tok_type is not None
            value = m.group(tok_type)
            length = len(value)
            action, msg = _regex_actions[tok_type]
            best = (length, tok_type, value, action, msg)
        if bucket := _fixed_tokens_by_first.get(text[pos]):
            for entry in bucket:
                if text.startswith(entry.literal, pos):
                    length = len(entry.literal)
                    if best is None or length > best[0]:
                        best = (
                            length,
                            entry.tok_type,
                            entry.literal,
                            _RegexAction.TOKEN,
                            None,
                        )
                    break
        if best is None:
            msg = f"Illegal character {text[pos]!r}"
            self._error(msg, pos)
            self._pos += 1
            return None

        length, tok_type, value, action, msg = best
        if action == _RegexAction.ERROR:
            if tok_type == "BAD_CHAR_CONST":
                msg = f"Invalid char constant {value}"
            # All other ERROR rules provide a message.
            assert msg is not None
            self._error(msg, pos)
            self._pos += max(1, length)
            return None

        if action == _RegexAction.ID:
            tok_type = _keyword_map.get(value, "ID")
            if tok_type == "ID" and self.type_lookup_func(value):
                tok_type = "TYPEID"
        tok = self._make_token(tok_type, value, pos)
        self._pos += length
        if tok.type == "LBRACE":
            self.on_lbrace_func()
        elif tok.type == "RBRACE":
            self.on_rbrace_func()
        return tok

    def _make_token(self, tok_type: str, value: str, pos: int) -> _Token:
        """Create a Token at an absolute input position.

        Expects tok_type/value and the absolute position pos in the current
        input. Does not advance lexer state; callers manage _pos themselves.
        Returns a Token with lineno/column computed from current line
        tracking.
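
        For example, with _line_start == 10 and _lineno == 3, a token
        starting at pos == 14 gets column 5 (columns are 1-based) and
        lineno 3.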
""" column = pos - self._line_start + 1 tok = _Token(tok_type, value, self._lineno, column) return tok def _error(self, msg: str, pos: int) -> None: column = pos - self._line_start + 1 self.error_func(msg, self._lineno, column) def _handle_ppline(self) -> None: # Since #line directives aren't supposed to return tokens but should # only affect the lexer's state (update line/filename for coords), this # method does a bit of parsing on its own. It doesn't return anything, # but its side effect is to update self._pos past the directive, and # potentially update self._lineno and self._filename, based on the # directive's contents. # # Accepted #line forms from preprocessors: # - "#line 66 \"kwas\\df.h\"" # - "# 9" # - "#line 10 \"include/me.h\" 1 2 3" (extra numeric flags) # - "# 1 \"file.h\" 3" # Errors we must report: # - "#line \"file.h\"" (filename before line number) # - "#line df" (garbage instead of number/string) # # We scan the directive line once (after an optional 'line' keyword), # validating the order: NUMBER, optional STRING, then any NUMBERs. # The NUMBERs tail is only accepted if a filename STRING was present. text = self._lexdata n = len(text) line_end = text.find("\n", self._pos) if line_end == -1: line_end = n line = text[self._pos : line_end] pos = 0 line_len = len(line) def skip_ws() -> None: nonlocal pos while pos < line_len and line[pos] in " \t": pos += 1 skip_ws() if line.startswith("line", pos): pos += 4 def success(pp_line: Optional[str], pp_filename: Optional[str]) -> None: if pp_line is None: self._error("line number missing in #line", self._pos + line_len) else: self._lineno = int(pp_line) if pp_filename is not None: self._filename = pp_filename self._pos = line_end + 1 self._line_start = self._pos def fail(msg: str, offset: int) -> None: self._error(msg, self._pos + offset) self._pos = line_end + 1 self._line_start = self._pos skip_ws() if pos >= line_len: success(None, None) return if line[pos] == '"': fail("filename before line number in #line", pos) return m = re.match(_decimal_constant, line[pos:]) if not m: fail("invalid #line directive", pos) return pp_line = m.group(0) pos += len(pp_line) skip_ws() if pos >= line_len: success(pp_line, None) return if line[pos] != '"': fail("invalid #line directive", pos) return m = re.match(_string_literal, line[pos:]) if not m: fail("invalid #line directive", pos) return pp_filename = m.group(0).lstrip('"').rstrip('"') pos += len(m.group(0)) # Consume arbitrary sequence of numeric flags after the directive while True: skip_ws() if pos >= line_len: break m = re.match(_decimal_constant, line[pos:]) if not m: fail("invalid #line directive", pos) return pos += len(m.group(0)) success(pp_line, pp_filename) def _handle_pppragma(self) -> List[_Token]: # Parse a full #pragma line; returns a list of tokens with 1 or 2 # tokens - PPPRAGMA and an optional PPPRAGMASTR. If an empty list is # returned, it means an error occurred, or we're at the end of input. 
        #
        # Examples:
        # - "#pragma" -> PPPRAGMA only
        # - "#pragma once" -> PPPRAGMA, PPPRAGMASTR("once")
        # - "# pragma omp parallel private(th_id)"
        #     -> PPPRAGMA, PPPRAGMASTR("omp parallel private(th_id)")
        # - "#\tpragma {pack: 2, smack: 3}"
        #     -> PPPRAGMA, PPPRAGMASTR("{pack: 2, smack: 3}")
        text = self._lexdata
        n = len(text)
        pos = self._pos
        while pos < n and text[pos] in " \t":
            pos += 1
        if pos >= n:
            self._pos = pos
            return []
        if not text.startswith("pragma", pos):
            self._error("invalid #pragma directive", pos)
            self._pos = pos + 1
            return []
        pragma_pos = pos
        pos += len("pragma")
        toks = [self._make_token("PPPRAGMA", "pragma", pragma_pos)]
        while pos < n and text[pos] in " \t":
            pos += 1
        start = pos
        while pos < n and text[pos] != "\n":
            pos += 1
        if pos > start:
            toks.append(self._make_token("PPPRAGMASTR", text[start:pos], start))
        if pos < n and text[pos] == "\n":
            self._lineno += 1
            pos += 1
            self._line_start = pos
        self._pos = pos
        return toks


##
## Reserved keywords
##
_keywords: Tuple[str, ...] = (
    "AUTO", "BREAK", "CASE", "CHAR", "CONST", "CONTINUE", "DEFAULT", "DO",
    "DOUBLE", "ELSE", "ENUM", "EXTERN", "FLOAT", "FOR", "GOTO", "IF",
    "INLINE", "INT", "LONG", "REGISTER", "OFFSETOF", "RESTRICT", "RETURN",
    "SHORT", "SIGNED", "SIZEOF", "STATIC", "STRUCT", "SWITCH", "TYPEDEF",
    "UNION", "UNSIGNED", "VOID", "VOLATILE", "WHILE", "__INT128", "_BOOL",
    "_COMPLEX", "_NORETURN", "_THREAD_LOCAL", "_STATIC_ASSERT", "_ATOMIC",
    "_ALIGNOF", "_ALIGNAS", "_PRAGMA",
)

_keyword_map: Dict[str, str] = {}
for keyword in _keywords:
    # Keywords from newer C standards are mixed-case, like _Bool and
    # _Alignas.
    if keyword.startswith("_") and len(keyword) > 1 and keyword[1].isalpha():
        _keyword_map[keyword[:2].upper() + keyword[2:].lower()] = keyword
    else:
        _keyword_map[keyword.lower()] = keyword


##
## Regexes for use in tokens
##

# valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
_identifier = r"[a-zA-Z_$][0-9a-zA-Z_$]*"

_hex_prefix = "0[xX]"
_hex_digits = "[0-9a-fA-F]+"
_bin_prefix = "0[bB]"
_bin_digits = "[01]+"

# integer constants (K&R2: A.2.5.1)
_integer_suffix_opt = (
    r"(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?"
)
_decimal_constant = (
    "(0" + _integer_suffix_opt + ")|([1-9][0-9]*" + _integer_suffix_opt + ")"
)
_octal_constant = "0[0-7]*" + _integer_suffix_opt
_hex_constant = _hex_prefix + _hex_digits + _integer_suffix_opt
_bin_constant = _bin_prefix + _bin_digits + _integer_suffix_opt

_bad_octal_constant = "0[0-7]*[89]"

# comments are not supported
_unsupported_c_style_comment = r"\/\*"
_unsupported_cxx_style_comment = r"\/\/"

# character constants (K&R2: A.2.5.2)
# Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
# directives with Windows paths as filenames (..\..\dir\file). For the same
# reason, decimal_escape allows all digit sequences. We want to parse all
# correct code, even if it means sometimes accepting incorrect code.
#
# The original regexes were taken verbatim from the C syntax definition, and
# were later modified to avoid worst-case exponential running time.
#
#   simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
#   decimal_escape = r"""(\d+)"""
#   hex_escape = r"""(x[0-9a-fA-F]+)"""
#   bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
#
# The following modifications were made to avoid the ambiguity that allowed
# backtracking (https://github.com/eliben/pycparser/issues/61):
#
# - \x was removed from simple_escape; it now only matches there when not
#   followed by a hex digit, avoiding ambiguity with hex_escape
# - hex_escape allows one or more hex characters, but requires that the next
#   character (if any) is not hex
# - decimal_escape allows one or more decimal characters, but requires that
#   the next character (if any) is not a decimal digit
# - bad_escape does not allow any decimal digits (8-9), to avoid conflicting
#   with the permissive decimal_escape
#
# Without this change, Python's `re` module would recursively try parsing
# each ambiguous escape sequence in multiple ways; e.g. `\123` could be
# parsed as `\1`+`23`, `\12`+`3`, or `\123`.
_simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
_decimal_escape = r"""(\d+)(?!\d)"""
_hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
_bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

_escape_sequence = (
    r"""(\\(""" + _simple_escape + "|" + _decimal_escape + "|" + _hex_escape + "))"
)

# The complicated regex with lookahead might be slow when lexing strings, so,
# since every valid escape (including \x) allows 0 or more non-escaped
# characters after the first character,
# simple_escape+decimal_escape+hex_escape is simplified for strings to:
_escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""

_cconst_char = r"""([^'\\\n]|""" + _escape_sequence + ")"
_char_const = "'" + _cconst_char + "'"
_wchar_const = "L" + _char_const
_u8char_const = "u8" + _char_const
_u16char_const = "u" + _char_const
_u32char_const = "U" + _char_const
_multicharacter_constant = "'" + _cconst_char + "{2,4}'"
_unmatched_quote = "('" + _cconst_char + "*\\n)|('" + _cconst_char + "*$)"
_bad_char_const = (
    r"""('"""
    + _cconst_char
    + """[^'\n]+')|('')|('"""
    + _bad_escape
    + r"""[^'\n]*')"""
)

# string literals (K&R2: A.2.6)
_string_char = r"""([^"\\\n]|""" + _escape_sequence_start_in_string + ")"
_string_literal = '"' + _string_char + '*"'
_wstring_literal = "L" + _string_literal
_u8string_literal = "u8" + _string_literal
_u16string_literal = "u" + _string_literal
_u32string_literal = "U" + _string_literal
_bad_string_literal = '"' + _string_char + "*" + _bad_escape + _string_char + '*"'

# floating constants (K&R2: A.2.5.3)
_exponent_part = r"""([eE][-+]?[0-9]+)"""
_fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
_floating_constant = (
    "(((("
    + _fractional_constant
    + ")"
    + _exponent_part
    + "?)|([0-9]+"
    + _exponent_part
    + "))[FfLl]?)"
)
_binary_exponent_part = r"""([pP][+-]?[0-9]+)"""
_hex_fractional_constant = (
    "((("
    + _hex_digits
    + r""")?\."""
    + _hex_digits
    + ")|("
    + _hex_digits
    + r"""\.))"""
)
_hex_floating_constant = (
    "("
    + _hex_prefix
    + "("
    + _hex_digits
    + "|"
    + _hex_fractional_constant
    + ")"
    + _binary_exponent_part
    + "[FfLl]?)"
)


class _RegexAction(Enum):
    TOKEN = 0
    ID = 1
    ERROR = 2


@dataclass(frozen=True)
class _RegexRule:
    # tok_type: name of the token emitted for a match
    # regex_pattern: the raw regex (no anchors) to match at the current
    #   position
    # action: TOKEN for normal tokens, ID for identifiers, ERROR to report
    # error_message: message used for ERROR entries
    tok_type: str
    regex_pattern: str
    action: _RegexAction
    error_message: Optional[str]


_regex_rules: List[_RegexRule] = [
    _RegexRule(
        "UNSUPPORTED_C_STYLE_COMMENT",
        _unsupported_c_style_comment,
        _RegexAction.ERROR,
        "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
    ),
    _RegexRule(
        "UNSUPPORTED_CXX_STYLE_COMMENT",
        _unsupported_cxx_style_comment,
        _RegexAction.ERROR,
        "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
    ),
    _RegexRule(
        "BAD_STRING_LITERAL",
        _bad_string_literal,
        _RegexAction.ERROR,
        "String contains invalid escape code",
    ),
_RegexRule("WSTRING_LITERAL", _wstring_literal, _RegexAction.TOKEN, None), _RegexRule("U8STRING_LITERAL", _u8string_literal, _RegexAction.TOKEN, None), _RegexRule("U16STRING_LITERAL", _u16string_literal, _RegexAction.TOKEN, None), _RegexRule("U32STRING_LITERAL", _u32string_literal, _RegexAction.TOKEN, None), _RegexRule("STRING_LITERAL", _string_literal, _RegexAction.TOKEN, None), _RegexRule("HEX_FLOAT_CONST", _hex_floating_constant, _RegexAction.TOKEN, None), _RegexRule("FLOAT_CONST", _floating_constant, _RegexAction.TOKEN, None), _RegexRule("INT_CONST_HEX", _hex_constant, _RegexAction.TOKEN, None), _RegexRule("INT_CONST_BIN", _bin_constant, _RegexAction.TOKEN, None), _RegexRule( "BAD_CONST_OCT", _bad_octal_constant, _RegexAction.ERROR, "Invalid octal constant", ), _RegexRule("INT_CONST_OCT", _octal_constant, _RegexAction.TOKEN, None), _RegexRule("INT_CONST_DEC", _decimal_constant, _RegexAction.TOKEN, None), _RegexRule("INT_CONST_CHAR", _multicharacter_constant, _RegexAction.TOKEN, None), _RegexRule("CHAR_CONST", _char_const, _RegexAction.TOKEN, None), _RegexRule("WCHAR_CONST", _wchar_const, _RegexAction.TOKEN, None), _RegexRule("U8CHAR_CONST", _u8char_const, _RegexAction.TOKEN, None), _RegexRule("U16CHAR_CONST", _u16char_const, _RegexAction.TOKEN, None), _RegexRule("U32CHAR_CONST", _u32char_const, _RegexAction.TOKEN, None), _RegexRule("UNMATCHED_QUOTE", _unmatched_quote, _RegexAction.ERROR, "Unmatched '"), _RegexRule("BAD_CHAR_CONST", _bad_char_const, _RegexAction.ERROR, None), _RegexRule("ID", _identifier, _RegexAction.ID, None), ] _regex_actions: Dict[str, Tuple[_RegexAction, Optional[str]]] = {} _regex_pattern_parts: List[str] = [] for _rule in _regex_rules: _regex_actions[_rule.tok_type] = (_rule.action, _rule.error_message) _regex_pattern_parts.append(f"(?P<{_rule.tok_type}>{_rule.regex_pattern})") # The master regex is a single alternation of all token patterns, each wrapped # in a named group. We match once at the current position and then use # `lastgroup` to recover which token kind fired; this avoids iterating over all # regexes on every character while keeping the same token-level semantics. 
_regex_master: re.Pattern[str] = re.compile("|".join(_regex_pattern_parts))


@dataclass(frozen=True)
class _FixedToken:
    tok_type: str
    literal: str


_fixed_tokens: List[_FixedToken] = [
    _FixedToken("ELLIPSIS", "..."),
    _FixedToken("LSHIFTEQUAL", "<<="),
    _FixedToken("RSHIFTEQUAL", ">>="),
    _FixedToken("PLUSPLUS", "++"),
    _FixedToken("MINUSMINUS", "--"),
    _FixedToken("ARROW", "->"),
    _FixedToken("LAND", "&&"),
    _FixedToken("LOR", "||"),
    _FixedToken("LSHIFT", "<<"),
    _FixedToken("RSHIFT", ">>"),
    _FixedToken("LE", "<="),
    _FixedToken("GE", ">="),
    _FixedToken("EQ", "=="),
    _FixedToken("NE", "!="),
    _FixedToken("TIMESEQUAL", "*="),
    _FixedToken("DIVEQUAL", "/="),
    _FixedToken("MODEQUAL", "%="),
    _FixedToken("PLUSEQUAL", "+="),
    _FixedToken("MINUSEQUAL", "-="),
    _FixedToken("ANDEQUAL", "&="),
    _FixedToken("OREQUAL", "|="),
    _FixedToken("XOREQUAL", "^="),
    _FixedToken("EQUALS", "="),
    _FixedToken("PLUS", "+"),
    _FixedToken("MINUS", "-"),
    _FixedToken("TIMES", "*"),
    _FixedToken("DIVIDE", "/"),
    _FixedToken("MOD", "%"),
    _FixedToken("OR", "|"),
    _FixedToken("AND", "&"),
    _FixedToken("NOT", "~"),
    _FixedToken("XOR", "^"),
    _FixedToken("LNOT", "!"),
    _FixedToken("LT", "<"),
    _FixedToken("GT", ">"),
    _FixedToken("CONDOP", "?"),
    _FixedToken("LPAREN", "("),
    _FixedToken("RPAREN", ")"),
    _FixedToken("LBRACKET", "["),
    _FixedToken("RBRACKET", "]"),
    _FixedToken("LBRACE", "{"),
    _FixedToken("RBRACE", "}"),
    _FixedToken("COMMA", ","),
    _FixedToken("PERIOD", "."),
    _FixedToken("SEMI", ";"),
    _FixedToken("COLON", ":"),
]

# To avoid scanning all fixed tokens on every character, we bucket them by
# their first character. When matching at position i, we only look at the
# bucket for text[i], and we pre-sort that bucket by token length so the
# first match is also the longest. This preserves longest-match semantics
# (e.g. '>>=' before '>>' before '>') while reducing the number of
# comparisons.
_fixed_tokens_by_first: Dict[str, List[_FixedToken]] = {}
for _entry in _fixed_tokens:
    _fixed_tokens_by_first.setdefault(_entry.literal[0], []).append(_entry)
for _bucket in _fixed_tokens_by_first.values():
    _bucket.sort(key=lambda item: len(item.literal), reverse=True)

_line_pattern: re.Pattern[str] = re.compile(r"([ \t]*line\W)|([ \t]*\d+)")
_pragma_pattern: re.Pattern[str] = re.compile(r"[ \t]*pragma\W")
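

# A minimal usage sketch, runnable as a script (illustrative, not part of
# the lexer API): wire up no-op brace callbacks and a type_lookup_func that
# knows no typedef names, feed a snippet, and drain tokens until None.
if __name__ == "__main__":
    def _demo_error(msg: str, line: int, column: int) -> None:
        print(f"error at {line}:{column}: {msg}")

    _demo_lexer = CLexer(
        error_func=_demo_error,
        on_lbrace_func=lambda: None,
        on_rbrace_func=lambda: None,
        type_lookup_func=lambda name: False,
    )
    _demo_lexer.input("int x = 42;")
    while (tok := _demo_lexer.token()) is not None:
        print(tok)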