# ------------------------------------------------------------------------------ # pycparser: c_parser.py # # Recursive-descent parser for the C language. # # Eli Bendersky [https://eli.thegreenplace.net/] # License: BSD # ------------------------------------------------------------------------------ from dataclasses import dataclass from typing import ( Any, Dict, List, Literal, NoReturn, Optional, Tuple, TypedDict, cast, ) from . import c_ast from .c_lexer import CLexer, _Token from .ast_transforms import fix_switch_cases, fix_atomic_specifiers @dataclass class Coord: """Coordinates of a syntactic element. Consists of: - File name - Line number - Column number """ file: str line: int column: Optional[int] = None def __str__(self) -> str: text = f"{self.file}:{self.line}" if self.column: text += f":{self.column}" return text class ParseError(Exception): pass class CParser: """Recursive-descent C parser. Usage: parser = CParser() ast = parser.parse(text, filename) The `lexer` parameter lets you inject a lexer class (defaults to CLexer). The parameters after `lexer` are accepted for backward compatibility with the old PLY-based parser and are otherwise unused. """ def __init__( self, lex_optimize: bool = True, lexer: type[CLexer] = CLexer, lextab: str = "pycparser.lextab", yacc_optimize: bool = True, yacctab: str = "pycparser.yacctab", yacc_debug: bool = False, taboutputdir: str = "", ) -> None: self.clex: CLexer = lexer( error_func=self._lex_error_func, on_lbrace_func=self._lex_on_lbrace_func, on_rbrace_func=self._lex_on_rbrace_func, type_lookup_func=self._lex_type_lookup_func, ) # Stack of scopes for keeping track of symbols. _scope_stack[-1] is # the current (topmost) scope. Each scope is a dictionary that # specifies whether a name is a type. If _scope_stack[n][name] is # True, 'name' is currently a type in the scope. If it's False, # 'name' is used in the scope but not as a type (for instance, if we # saw: int name; # If 'name' is not a key in _scope_stack[n] then 'name' was not defined # in this scope at all. self._scope_stack: List[Dict[str, bool]] = [dict()] self._tokens: _TokenStream = _TokenStream(self.clex) def parse( self, text: str, filename: str = "", debug: bool = False ) -> c_ast.FileAST: """Parses C code and returns an AST. text: A string containing the C source code filename: Name of the file being parsed (for meaningful error messages) debug: Deprecated debug flag (unused); for backwards compatibility. """ self._scope_stack = [dict()] self.clex.input(text, filename) self._tokens = _TokenStream(self.clex) ast = self._parse_translation_unit_or_empty() tok = self._peek() if tok is not None: self._parse_error(f"before: {tok.value}", self._tok_coord(tok)) return ast # ------------------------------------------------------------------ # Scope and declaration helpers # ------------------------------------------------------------------ def _coord(self, lineno: int, column: Optional[int] = None) -> Coord: return Coord(file=self.clex.filename, line=lineno, column=column) def _parse_error(self, msg: str, coord: Coord | str | None) -> NoReturn: raise ParseError(f"{coord}: {msg}") def _push_scope(self) -> None: self._scope_stack.append(dict()) def _pop_scope(self) -> None: assert len(self._scope_stack) > 1 self._scope_stack.pop() def _add_typedef_name(self, name: str, coord: Optional[Coord]) -> None: """Add a new typedef name (ie a TYPEID) to the current scope""" if not self._scope_stack[-1].get(name, True): self._parse_error( f"Typedef {name!r} previously declared as non-typedef in this scope", coord, ) self._scope_stack[-1][name] = True def _add_identifier(self, name: str, coord: Optional[Coord]) -> None: """Add a new object, function, or enum member name (ie an ID) to the current scope """ if self._scope_stack[-1].get(name, False): self._parse_error( f"Non-typedef {name!r} previously declared as typedef in this scope", coord, ) self._scope_stack[-1][name] = False def _is_type_in_scope(self, name: str) -> bool: """Is *name* a typedef-name in the current scope?""" for scope in reversed(self._scope_stack): # If name is an identifier in this scope it shadows typedefs in # higher scopes. in_scope = scope.get(name) if in_scope is not None: return in_scope return False def _lex_error_func(self, msg: str, line: int, column: int) -> None: self._parse_error(msg, self._coord(line, column)) def _lex_on_lbrace_func(self) -> None: self._push_scope() def _lex_on_rbrace_func(self) -> None: self._pop_scope() def _lex_type_lookup_func(self, name: str) -> bool: """Looks up types that were previously defined with typedef. Passed to the lexer for recognizing identifiers that are types. """ return self._is_type_in_scope(name) # To understand what's going on here, read sections A.8.5 and # A.8.6 of K&R2 very carefully. # # A C type consists of a basic type declaration, with a list # of modifiers. For example: # # int *c[5]; # # The basic declaration here is 'int c', and the pointer and # the array are the modifiers. # # Basic declarations are represented by TypeDecl (from module c_ast) and the # modifiers are FuncDecl, PtrDecl and ArrayDecl. # # The standard states that whenever a new modifier is parsed, it should be # added to the end of the list of modifiers. For example: # # K&R2 A.8.6.2: Array Declarators # # In a declaration T D where D has the form # D1 [constant-expression-opt] # and the type of the identifier in the declaration T D1 is # "type-modifier T", the type of the # identifier of D is "type-modifier array of T" # # This is what this method does. The declarator it receives # can be a list of declarators ending with TypeDecl. It # tacks the modifier to the end of this list, just before # the TypeDecl. # # Additionally, the modifier may be a list itself. This is # useful for pointers, that can come as a chain from the rule # p_pointer. In this case, the whole modifier list is spliced # into the new location. def _type_modify_decl(self, decl: Any, modifier: Any) -> c_ast.Node: """Tacks a type modifier on a declarator, and returns the modified declarator. Note: the declarator and modifier may be modified """ modifier_head = modifier modifier_tail = modifier # The modifier may be a nested list. Reach its tail. while modifier_tail.type: modifier_tail = modifier_tail.type # If the decl is a basic type, just tack the modifier onto it. if isinstance(decl, c_ast.TypeDecl): modifier_tail.type = decl return modifier else: # Otherwise, the decl is a list of modifiers. Reach # its tail and splice the modifier onto the tail, # pointing to the underlying basic type. decl_tail = decl while not isinstance(decl_tail.type, c_ast.TypeDecl): decl_tail = decl_tail.type modifier_tail.type = decl_tail.type decl_tail.type = modifier_head return decl # Due to the order in which declarators are constructed, # they have to be fixed in order to look like a normal AST. # # When a declaration arrives from syntax construction, it has # these problems: # * The innermost TypeDecl has no type (because the basic # type is only known at the uppermost declaration level) # * The declaration has no variable name, since that is saved # in the innermost TypeDecl # * The typename of the declaration is a list of type # specifiers, and not a node. Here, basic identifier types # should be separated from more complex types like enums # and structs. # # This method fixes these problems. def _fix_decl_name_type( self, decl: c_ast.Decl | c_ast.Typedef | c_ast.Typename, typename: List[Any], ) -> c_ast.Decl | c_ast.Typedef | c_ast.Typename: """Fixes a declaration. Modifies decl.""" # Reach the underlying basic type typ = decl while not isinstance(typ, c_ast.TypeDecl): typ = typ.type decl.name = typ.declname typ.quals = decl.quals[:] # The typename is a list of types. If any type in this # list isn't an IdentifierType, it must be the only # type in the list (it's illegal to declare "int enum ..") # If all the types are basic, they're collected in the # IdentifierType holder. for tn in typename: if not isinstance(tn, c_ast.IdentifierType): if len(typename) > 1: self._parse_error("Invalid multiple types specified", tn.coord) else: typ.type = tn return decl if not typename: # Functions default to returning int if not isinstance(decl.type, c_ast.FuncDecl): self._parse_error("Missing type in declaration", decl.coord) typ.type = c_ast.IdentifierType(["int"], coord=decl.coord) else: # At this point, we know that typename is a list of IdentifierType # nodes. Concatenate all the names into a single list. typ.type = c_ast.IdentifierType( [name for id in typename for name in id.names], coord=typename[0].coord ) return decl def _add_declaration_specifier( self, declspec: Optional["_DeclSpec"], newspec: Any, kind: "_DeclSpecKind", append: bool = False, ) -> "_DeclSpec": """See _DeclSpec for the specifier dictionary layout.""" if declspec is None: spec: _DeclSpec = dict( qual=[], storage=[], type=[], function=[], alignment=[] ) else: spec = declspec if append: spec[kind].append(newspec) else: spec[kind].insert(0, newspec) return spec def _build_declarations( self, spec: "_DeclSpec", decls: List["_DeclInfo"], typedef_namespace: bool = False, ) -> List[c_ast.Node]: """Builds a list of declarations all sharing the given specifiers. If typedef_namespace is true, each declared name is added to the "typedef namespace", which also includes objects, functions, and enum constants. """ is_typedef = "typedef" in spec["storage"] declarations = [] # Bit-fields are allowed to be unnamed. if decls[0].get("bitsize") is None: # When redeclaring typedef names as identifiers in inner scopes, a # problem can occur where the identifier gets grouped into # spec['type'], leaving decl as None. This can only occur for the # first declarator. if decls[0]["decl"] is None: if ( len(spec["type"]) < 2 or len(spec["type"][-1].names) != 1 or not self._is_type_in_scope(spec["type"][-1].names[0]) ): coord = "?" for t in spec["type"]: if hasattr(t, "coord"): coord = t.coord break self._parse_error("Invalid declaration", coord) # Make this look as if it came from "direct_declarator:ID" decls[0]["decl"] = c_ast.TypeDecl( declname=spec["type"][-1].names[0], type=None, quals=None, align=spec["alignment"], coord=spec["type"][-1].coord, ) # Remove the "new" type's name from the end of spec['type'] del spec["type"][-1] # A similar problem can occur where the declaration ends up # looking like an abstract declarator. Give it a name if this is # the case. elif not isinstance( decls[0]["decl"], (c_ast.Enum, c_ast.Struct, c_ast.Union, c_ast.IdentifierType), ): decls_0_tail = cast(Any, decls[0]["decl"]) while not isinstance(decls_0_tail, c_ast.TypeDecl): decls_0_tail = decls_0_tail.type if decls_0_tail.declname is None: decls_0_tail.declname = spec["type"][-1].names[0] del spec["type"][-1] for decl in decls: assert decl["decl"] is not None if is_typedef: declaration = c_ast.Typedef( name=None, quals=spec["qual"], storage=spec["storage"], type=decl["decl"], coord=decl["decl"].coord, ) else: declaration = c_ast.Decl( name=None, quals=spec["qual"], align=spec["alignment"], storage=spec["storage"], funcspec=spec["function"], type=decl["decl"], init=decl.get("init"), bitsize=decl.get("bitsize"), coord=decl["decl"].coord, ) if isinstance( declaration.type, (c_ast.Enum, c_ast.Struct, c_ast.Union, c_ast.IdentifierType), ): fixed_decl = declaration else: fixed_decl = self._fix_decl_name_type(declaration, spec["type"]) # Add the type name defined by typedef to a # symbol table (for usage in the lexer) if typedef_namespace: if is_typedef: self._add_typedef_name(fixed_decl.name, fixed_decl.coord) else: self._add_identifier(fixed_decl.name, fixed_decl.coord) fixed_decl = fix_atomic_specifiers( cast(c_ast.Decl | c_ast.Typedef, fixed_decl) ) declarations.append(fixed_decl) return declarations def _build_function_definition( self, spec: "_DeclSpec", decl: c_ast.Node, param_decls: Optional[List[c_ast.Node]], body: c_ast.Node, ) -> c_ast.Node: """Builds a function definition.""" if "typedef" in spec["storage"]: self._parse_error("Invalid typedef", decl.coord) declaration = self._build_declarations( spec=spec, decls=[dict(decl=decl, init=None, bitsize=None)], typedef_namespace=True, )[0] return c_ast.FuncDef( decl=declaration, param_decls=param_decls, body=body, coord=decl.coord ) def _select_struct_union_class(self, token: str) -> type: """Given a token (either STRUCT or UNION), selects the appropriate AST class. """ if token == "struct": return c_ast.Struct else: return c_ast.Union # ------------------------------------------------------------------ # Token helpers # ------------------------------------------------------------------ def _peek(self, k: int = 1) -> Optional[_Token]: """Return the k-th next token without consuming it (1-based).""" return self._tokens.peek(k) def _peek_type(self, k: int = 1) -> Optional[str]: """Return the type of the k-th next token, or None if absent (1-based).""" tok = self._peek(k) return tok.type if tok is not None else None def _advance(self) -> _Token: tok = self._tokens.next() if tok is None: self._parse_error("At end of input", self.clex.filename) else: return tok def _accept(self, token_type: str) -> Optional[_Token]: """Conditionally consume next token, only if it's of token_type. If it is of the expected type, consume and return it. Otherwise, leaves the token intact and returns None. """ tok = self._peek() if tok is not None and tok.type == token_type: return self._advance() return None def _expect(self, token_type: str) -> _Token: tok = self._advance() if tok.type != token_type: self._parse_error(f"before: {tok.value}", self._tok_coord(tok)) return tok def _mark(self) -> int: return self._tokens.mark() def _reset(self, mark: int) -> None: self._tokens.reset(mark) def _tok_coord(self, tok: _Token) -> Coord: return self._coord(tok.lineno, tok.column) def _starts_declaration(self, tok: Optional[_Token] = None) -> bool: tok = tok or self._peek() if tok is None: return False return tok.type in _DECL_START def _starts_expression(self, tok: Optional[_Token] = None) -> bool: tok = tok or self._peek() if tok is None: return False return tok.type in _STARTS_EXPRESSION def _starts_statement(self) -> bool: tok_type = self._peek_type() if tok_type is None: return False if tok_type in _STARTS_STATEMENT: return True return self._starts_expression() def _starts_declarator(self, id_only: bool = False) -> bool: tok_type = self._peek_type() if tok_type is None: return False if tok_type in {"TIMES", "LPAREN"}: return True if id_only: return tok_type == "ID" return tok_type in {"ID", "TYPEID"} def _peek_declarator_name_info(self) -> Tuple[Optional[str], bool]: mark = self._mark() tok_type, saw_paren = self._scan_declarator_name_info() self._reset(mark) return tok_type, saw_paren def _parse_any_declarator( self, allow_abstract: bool = False, typeid_paren_as_abstract: bool = False ) -> Tuple[Optional[c_ast.Node], bool]: # C declarators are ambiguous without lookahead. For example: # int foo(int (aa)); -> aa is a name (ID) # typedef char TT; # int bar(int (TT)); -> TT is a type (TYPEID) in parens name_type, saw_paren = self._peek_declarator_name_info() if name_type is None or ( typeid_paren_as_abstract and name_type == "TYPEID" and saw_paren ): if not allow_abstract: tok = self._peek() coord = self._tok_coord(tok) if tok is not None else self.clex.filename self._parse_error("Invalid declarator", coord) decl = self._parse_abstract_declarator_opt() return decl, False if name_type == "TYPEID": if typeid_paren_as_abstract: decl = self._parse_typeid_noparen_declarator() else: decl = self._parse_typeid_declarator() else: decl = self._parse_id_declarator() return decl, True def _scan_declarator_name_info(self) -> Tuple[Optional[str], bool]: saw_paren = False while self._accept("TIMES"): while self._peek_type() in _TYPE_QUALIFIER: self._advance() tok = self._peek() if tok is None: return None, saw_paren if tok.type in {"ID", "TYPEID"}: self._advance() return tok.type, saw_paren if tok.type == "LPAREN": saw_paren = True self._advance() tok_type, nested_paren = self._scan_declarator_name_info() if nested_paren: saw_paren = True depth = 1 while True: tok = self._peek() if tok is None: return None, saw_paren if tok.type == "LPAREN": depth += 1 elif tok.type == "RPAREN": depth -= 1 self._advance() if depth == 0: break continue self._advance() return tok_type, saw_paren return None, saw_paren def _starts_direct_abstract_declarator(self) -> bool: return self._peek_type() in {"LPAREN", "LBRACKET"} def _is_assignment_op(self) -> bool: tok = self._peek() return tok is not None and tok.type in _ASSIGNMENT_OPS def _try_parse_paren_type_name( self, ) -> Optional[Tuple[c_ast.Typename, int, _Token]]: """Parse and return a parenthesized type name if present. Returns (typ, mark, lparen_tok) when the next tokens look like '(' type_name ')', where typ is the parsed type name, mark is the token-stream position before parsing, and lparen_tok is the LPAREN token. Returns None if no parenthesized type name is present. """ mark = self._mark() lparen_tok = self._accept("LPAREN") if lparen_tok is None: return None if not self._starts_declaration(): self._reset(mark) return None typ = self._parse_type_name() if self._accept("RPAREN") is None: self._reset(mark) return None return typ, mark, lparen_tok # ------------------------------------------------------------------ # Top-level # ------------------------------------------------------------------ # BNF: translation_unit_or_empty : translation_unit | empty def _parse_translation_unit_or_empty(self) -> c_ast.FileAST: if self._peek() is None: return c_ast.FileAST([]) return c_ast.FileAST(self._parse_translation_unit()) # BNF: translation_unit : external_declaration+ def _parse_translation_unit(self) -> List[c_ast.Node]: ext = [] while self._peek() is not None: ext.extend(self._parse_external_declaration()) return ext # BNF: external_declaration : function_definition # | declaration # | pp_directive # | pppragma_directive # | static_assert # | ';' def _parse_external_declaration(self) -> List[c_ast.Node]: tok = self._peek() if tok is None: return [] if tok.type == "PPHASH": self._parse_pp_directive() return [] if tok.type in {"PPPRAGMA", "_PRAGMA"}: return [self._parse_pppragma_directive()] if self._accept("SEMI"): return [] if tok.type == "_STATIC_ASSERT": return self._parse_static_assert() if not self._starts_declaration(tok): # Special handling for old-style function definitions that have an # implicit return type, e.g. # # foo() { # return 5; # } # # These get an implicit 'int' return type. decl = self._parse_id_declarator() param_decls = None if self._peek_type() != "LBRACE": self._parse_error("Invalid function definition", decl.coord) spec: _DeclSpec = dict( qual=[], alignment=[], storage=[], type=[c_ast.IdentifierType(["int"], coord=decl.coord)], function=[], ) func = self._build_function_definition( spec=spec, decl=decl, param_decls=param_decls, body=self._parse_compound_statement(), ) return [func] # From here on, parsing a standard declatation/definition. spec, saw_type, spec_coord = self._parse_declaration_specifiers( allow_no_type=True ) name_type, _ = self._peek_declarator_name_info() if name_type != "ID": decls = self._parse_decl_body_with_spec(spec, saw_type) self._expect("SEMI") return decls decl = self._parse_id_declarator() if self._peek_type() == "LBRACE" or self._starts_declaration(): param_decls = None if self._starts_declaration(): param_decls = self._parse_declaration_list() if self._peek_type() != "LBRACE": self._parse_error("Invalid function definition", decl.coord) if not spec["type"]: spec["type"] = [c_ast.IdentifierType(["int"], coord=spec_coord)] func = self._build_function_definition( spec=spec, decl=decl, param_decls=param_decls, body=self._parse_compound_statement(), ) return [func] decl_dict: "_DeclInfo" = dict(decl=decl, init=None, bitsize=None) if self._accept("EQUALS"): decl_dict["init"] = self._parse_initializer() decls = self._parse_init_declarator_list(first=decl_dict) decls = self._build_declarations(spec=spec, decls=decls, typedef_namespace=True) self._expect("SEMI") return decls # ------------------------------------------------------------------ # Declarations # # Declarations always come as lists (because they can be several in one # line). When returning parsed declarations, a list is always returned - # even if it contains a single element. # ------------------------------------------------------------------ def _parse_declaration(self) -> List[c_ast.Node]: decls = self._parse_decl_body() self._expect("SEMI") return decls # BNF: decl_body : declaration_specifiers decl_body_with_spec def _parse_decl_body(self) -> List[c_ast.Node]: spec, saw_type, _ = self._parse_declaration_specifiers(allow_no_type=True) return self._parse_decl_body_with_spec(spec, saw_type) # BNF: decl_body_with_spec : init_declarator_list # | struct_or_union_or_enum_only def _parse_decl_body_with_spec( self, spec: "_DeclSpec", saw_type: bool ) -> List[c_ast.Node]: decls = None if saw_type: if self._starts_declarator(): decls = self._parse_init_declarator_list() else: if self._starts_declarator(id_only=True): decls = self._parse_init_declarator_list(id_only=True) if decls is None: ty = spec["type"] s_u_or_e = (c_ast.Struct, c_ast.Union, c_ast.Enum) if len(ty) == 1 and isinstance(ty[0], s_u_or_e): decls = [ c_ast.Decl( name=None, quals=spec["qual"], align=spec["alignment"], storage=spec["storage"], funcspec=spec["function"], type=ty[0], init=None, bitsize=None, coord=ty[0].coord, ) ] else: decls = self._build_declarations( spec=spec, decls=[dict(decl=None, init=None, bitsize=None)], typedef_namespace=True, ) else: decls = self._build_declarations( spec=spec, decls=decls, typedef_namespace=True ) return decls # BNF: declaration_list : declaration+ def _parse_declaration_list(self) -> List[c_ast.Node]: decls = [] while self._starts_declaration(): decls.extend(self._parse_declaration()) return decls # BNF: declaration_specifiers : (storage_class_specifier # | type_specifier # | type_qualifier # | function_specifier # | alignment_specifier)+ def _parse_declaration_specifiers( self, allow_no_type: bool = False ) -> Tuple["_DeclSpec", bool, Optional[Coord]]: """Parse declaration-specifier sequence. allow_no_type: If True, allow a missing type specifier without error. Returns: (spec, saw_type, first_coord) where spec is a dict with qual/storage/type/function/alignment entries, saw_type is True if a type specifier was consumed, and first_coord is the coord of the first specifier token (used for diagnostics). """ spec = None saw_type = False first_coord = None while True: tok = self._peek() if tok is None: break if tok.type == "_ALIGNAS": if first_coord is None: first_coord = self._tok_coord(tok) spec = self._add_declaration_specifier( spec, self._parse_alignment_specifier(), "alignment", append=True ) continue if tok.type == "_ATOMIC" and self._peek_type(2) == "LPAREN": if first_coord is None: first_coord = self._tok_coord(tok) spec = self._add_declaration_specifier( spec, self._parse_atomic_specifier(), "type", append=True ) saw_type = True continue if tok.type in _TYPE_QUALIFIER: if first_coord is None: first_coord = self._tok_coord(tok) spec = self._add_declaration_specifier( spec, self._advance().value, "qual", append=True ) continue if tok.type in _STORAGE_CLASS: if first_coord is None: first_coord = self._tok_coord(tok) spec = self._add_declaration_specifier( spec, self._advance().value, "storage", append=True ) continue if tok.type in _FUNCTION_SPEC: if first_coord is None: first_coord = self._tok_coord(tok) spec = self._add_declaration_specifier( spec, self._advance().value, "function", append=True ) continue if tok.type in _TYPE_SPEC_SIMPLE: if first_coord is None: first_coord = self._tok_coord(tok) tok = self._advance() spec = self._add_declaration_specifier( spec, c_ast.IdentifierType([tok.value], coord=self._tok_coord(tok)), "type", append=True, ) saw_type = True continue if tok.type == "TYPEID": if saw_type: break if first_coord is None: first_coord = self._tok_coord(tok) tok = self._advance() spec = self._add_declaration_specifier( spec, c_ast.IdentifierType([tok.value], coord=self._tok_coord(tok)), "type", append=True, ) saw_type = True continue if tok.type in {"STRUCT", "UNION"}: if first_coord is None: first_coord = self._tok_coord(tok) spec = self._add_declaration_specifier( spec, self._parse_struct_or_union_specifier(), "type", append=True ) saw_type = True continue if tok.type == "ENUM": if first_coord is None: first_coord = self._tok_coord(tok) spec = self._add_declaration_specifier( spec, self._parse_enum_specifier(), "type", append=True ) saw_type = True continue break if spec is None: self._parse_error("Invalid declaration", self.clex.filename) if not saw_type and not allow_no_type: self._parse_error("Missing type in declaration", first_coord) return spec, saw_type, first_coord # BNF: specifier_qualifier_list : (type_specifier # | type_qualifier # | alignment_specifier)+ def _parse_specifier_qualifier_list(self) -> "_DeclSpec": spec = None saw_type = False saw_alignment = False first_coord = None while True: tok = self._peek() if tok is None: break if tok.type == "_ALIGNAS": if first_coord is None: first_coord = self._tok_coord(tok) spec = self._add_declaration_specifier( spec, self._parse_alignment_specifier(), "alignment", append=True ) saw_alignment = True continue if tok.type == "_ATOMIC" and self._peek_type(2) == "LPAREN": if first_coord is None: first_coord = self._tok_coord(tok) spec = self._add_declaration_specifier( spec, self._parse_atomic_specifier(), "type", append=True ) saw_type = True continue if tok.type in _TYPE_QUALIFIER: if first_coord is None: first_coord = self._tok_coord(tok) spec = self._add_declaration_specifier( spec, self._advance().value, "qual", append=True ) continue if tok.type in _TYPE_SPEC_SIMPLE: if first_coord is None: first_coord = self._tok_coord(tok) tok = self._advance() spec = self._add_declaration_specifier( spec, c_ast.IdentifierType([tok.value], coord=self._tok_coord(tok)), "type", append=True, ) saw_type = True continue if tok.type == "TYPEID": if saw_type: break if first_coord is None: first_coord = self._tok_coord(tok) tok = self._advance() spec = self._add_declaration_specifier( spec, c_ast.IdentifierType([tok.value], coord=self._tok_coord(tok)), "type", append=True, ) saw_type = True continue if tok.type in {"STRUCT", "UNION"}: if first_coord is None: first_coord = self._tok_coord(tok) spec = self._add_declaration_specifier( spec, self._parse_struct_or_union_specifier(), "type", append=True ) saw_type = True continue if tok.type == "ENUM": if first_coord is None: first_coord = self._tok_coord(tok) spec = self._add_declaration_specifier( spec, self._parse_enum_specifier(), "type", append=True ) saw_type = True continue break if spec is None: self._parse_error("Invalid specifier list", self.clex.filename) if not saw_type and not saw_alignment: self._parse_error("Missing type in declaration", first_coord) if spec.get("storage") is None: spec["storage"] = [] if spec.get("function") is None: spec["function"] = [] return spec # BNF: type_qualifier_list : type_qualifier+ def _parse_type_qualifier_list(self) -> List[str]: quals = [] while self._peek_type() in _TYPE_QUALIFIER: quals.append(self._advance().value) return quals # BNF: alignment_specifier : _ALIGNAS '(' type_name | constant_expression ')' def _parse_alignment_specifier(self) -> c_ast.Node: tok = self._expect("_ALIGNAS") self._expect("LPAREN") if self._starts_declaration(): typ = self._parse_type_name() self._expect("RPAREN") return c_ast.Alignas(typ, self._tok_coord(tok)) expr = self._parse_constant_expression() self._expect("RPAREN") return c_ast.Alignas(expr, self._tok_coord(tok)) # BNF: atomic_specifier : _ATOMIC '(' type_name ')' def _parse_atomic_specifier(self) -> c_ast.Node: self._expect("_ATOMIC") self._expect("LPAREN") typ = self._parse_type_name() self._expect("RPAREN") typ.quals.append("_Atomic") return typ # BNF: init_declarator_list : init_declarator (',' init_declarator)* def _parse_init_declarator_list( self, first: Optional["_DeclInfo"] = None, id_only: bool = False ) -> List["_DeclInfo"]: decls = ( [first] if first is not None else [self._parse_init_declarator(id_only=id_only)] ) while self._accept("COMMA"): decls.append(self._parse_init_declarator(id_only=id_only)) return decls # BNF: init_declarator : declarator ('=' initializer)? def _parse_init_declarator(self, id_only: bool = False) -> "_DeclInfo": decl = self._parse_id_declarator() if id_only else self._parse_declarator() init = None if self._accept("EQUALS"): init = self._parse_initializer() return dict(decl=decl, init=init, bitsize=None) # ------------------------------------------------------------------ # Structs/unions/enums # ------------------------------------------------------------------ # BNF: struct_or_union_specifier : struct_or_union ID? '{' struct_declaration_list? '}' # | struct_or_union ID def _parse_struct_or_union_specifier(self) -> c_ast.Node: tok = self._advance() klass = self._select_struct_union_class(tok.value) if self._peek_type() in {"ID", "TYPEID"}: name_tok = self._advance() if self._peek_type() == "LBRACE": self._advance() if self._accept("RBRACE"): return klass( name=name_tok.value, decls=[], coord=self._tok_coord(name_tok) ) decls = self._parse_struct_declaration_list() self._expect("RBRACE") return klass( name=name_tok.value, decls=decls, coord=self._tok_coord(name_tok) ) return klass( name=name_tok.value, decls=None, coord=self._tok_coord(name_tok) ) if self._peek_type() == "LBRACE": brace_tok = self._advance() if self._accept("RBRACE"): return klass(name=None, decls=[], coord=self._tok_coord(brace_tok)) decls = self._parse_struct_declaration_list() self._expect("RBRACE") return klass(name=None, decls=decls, coord=self._tok_coord(brace_tok)) self._parse_error("Invalid struct/union declaration", self._tok_coord(tok)) # BNF: struct_declaration_list : struct_declaration+ def _parse_struct_declaration_list(self) -> List[c_ast.Node]: decls = [] while self._peek_type() not in {None, "RBRACE"}: items = self._parse_struct_declaration() if items is None: continue decls.extend(items) return decls # BNF: struct_declaration : specifier_qualifier_list struct_declarator_list? ';' # | static_assert # | pppragma_directive def _parse_struct_declaration(self) -> Optional[List[c_ast.Node]]: if self._peek_type() == "SEMI": self._advance() return None if self._peek_type() in {"PPPRAGMA", "_PRAGMA"}: return [self._parse_pppragma_directive()] spec = self._parse_specifier_qualifier_list() assert "typedef" not in spec.get("storage", []) decls = None if self._starts_declarator() or self._peek_type() == "COLON": decls = self._parse_struct_declarator_list() if decls is not None: self._expect("SEMI") return self._build_declarations(spec=spec, decls=decls) if len(spec["type"]) == 1: node = spec["type"][0] if isinstance(node, c_ast.Node): decl_type = node else: decl_type = c_ast.IdentifierType(node) self._expect("SEMI") return self._build_declarations( spec=spec, decls=[dict(decl=decl_type, init=None, bitsize=None)] ) self._expect("SEMI") return self._build_declarations( spec=spec, decls=[dict(decl=None, init=None, bitsize=None)] ) # BNF: struct_declarator_list : struct_declarator (',' struct_declarator)* def _parse_struct_declarator_list(self) -> List["_DeclInfo"]: decls = [self._parse_struct_declarator()] while self._accept("COMMA"): decls.append(self._parse_struct_declarator()) return decls # BNF: struct_declarator : declarator? ':' constant_expression # | declarator (':' constant_expression)? def _parse_struct_declarator(self) -> "_DeclInfo": if self._accept("COLON"): bitsize = self._parse_constant_expression() return { "decl": c_ast.TypeDecl(None, None, None, None), "init": None, "bitsize": bitsize, } decl = self._parse_declarator() if self._accept("COLON"): bitsize = self._parse_constant_expression() return {"decl": decl, "init": None, "bitsize": bitsize} return {"decl": decl, "init": None, "bitsize": None} # BNF: enum_specifier : ENUM ID? '{' enumerator_list? '}' # | ENUM ID def _parse_enum_specifier(self) -> c_ast.Node: tok = self._expect("ENUM") if self._peek_type() in {"ID", "TYPEID"}: name_tok = self._advance() if self._peek_type() == "LBRACE": self._advance() enums = self._parse_enumerator_list() self._expect("RBRACE") return c_ast.Enum(name_tok.value, enums, self._tok_coord(tok)) return c_ast.Enum(name_tok.value, None, self._tok_coord(tok)) self._expect("LBRACE") enums = self._parse_enumerator_list() self._expect("RBRACE") return c_ast.Enum(None, enums, self._tok_coord(tok)) # BNF: enumerator_list : enumerator (',' enumerator)* ','? def _parse_enumerator_list(self) -> c_ast.Node: enum = self._parse_enumerator() enum_list = c_ast.EnumeratorList([enum], enum.coord) while self._accept("COMMA"): if self._peek_type() == "RBRACE": break enum = self._parse_enumerator() enum_list.enumerators.append(enum) return enum_list # BNF: enumerator : ID ('=' constant_expression)? def _parse_enumerator(self) -> c_ast.Node: name_tok = self._expect("ID") if self._accept("EQUALS"): value = self._parse_constant_expression() else: value = None enum = c_ast.Enumerator(name_tok.value, value, self._tok_coord(name_tok)) self._add_identifier(enum.name, enum.coord) return enum # ------------------------------------------------------------------ # Declarators # ------------------------------------------------------------------ # BNF: declarator : pointer? direct_declarator def _parse_declarator(self) -> c_ast.Node: decl, _ = self._parse_any_declarator( allow_abstract=False, typeid_paren_as_abstract=False ) assert decl is not None return decl # BNF: id_declarator : declarator with ID name def _parse_id_declarator(self) -> c_ast.Node: return self._parse_declarator_kind(kind="id", allow_paren=True) # BNF: typeid_declarator : declarator with TYPEID name def _parse_typeid_declarator(self) -> c_ast.Node: return self._parse_declarator_kind(kind="typeid", allow_paren=True) # BNF: typeid_noparen_declarator : declarator without parenthesized name def _parse_typeid_noparen_declarator(self) -> c_ast.Node: return self._parse_declarator_kind(kind="typeid", allow_paren=False) # BNF: declarator_kind : pointer? direct_declarator(kind) def _parse_declarator_kind(self, kind: str, allow_paren: bool) -> c_ast.Node: ptr = None if self._peek_type() == "TIMES": ptr = self._parse_pointer() direct = self._parse_direct_declarator(kind, allow_paren=allow_paren) if ptr is not None: return self._type_modify_decl(direct, ptr) return direct # BNF: direct_declarator : ID | TYPEID | '(' declarator ')' # | direct_declarator '[' ... ']' # | direct_declarator '(' ... ')' def _parse_direct_declarator( self, kind: str, allow_paren: bool = True ) -> c_ast.Node: if allow_paren and self._accept("LPAREN"): decl = self._parse_declarator_kind(kind, allow_paren=True) self._expect("RPAREN") else: if kind == "id": name_tok = self._expect("ID") else: name_tok = self._expect("TYPEID") decl = c_ast.TypeDecl( declname=name_tok.value, type=None, quals=None, align=None, coord=self._tok_coord(name_tok), ) return self._parse_decl_suffixes(decl) def _parse_decl_suffixes(self, decl: c_ast.Node) -> c_ast.Node: """Parse a chain of array/function suffixes and attach them to decl.""" while True: if self._peek_type() == "LBRACKET": decl = self._type_modify_decl(decl, self._parse_array_decl(decl)) continue if self._peek_type() == "LPAREN": func = self._parse_function_decl(decl) decl = self._type_modify_decl(decl, func) continue break return decl # BNF: array_decl : '[' array_specifiers? assignment_expression? ']' def _parse_array_decl(self, base_decl: c_ast.Node) -> c_ast.Node: return self._parse_array_decl_common(base_type=None, coord=base_decl.coord) def _parse_array_decl_common( self, base_type: Optional[c_ast.Node], coord: Optional[Coord] = None ) -> c_ast.Node: """Parse an array declarator suffix and return an ArrayDecl node. base_type: Base declarator node to attach (None for direct-declarator parsing, TypeDecl for abstract declarators). coord: Coordinate to use for the ArrayDecl. If None, uses the '[' token. """ lbrack_tok = self._expect("LBRACKET") if coord is None: coord = self._tok_coord(lbrack_tok) def make_array_decl(dim, dim_quals): return c_ast.ArrayDecl( type=base_type, dim=dim, dim_quals=dim_quals, coord=coord ) if self._accept("STATIC"): dim_quals = ["static"] + (self._parse_type_qualifier_list() or []) dim = self._parse_assignment_expression() self._expect("RBRACKET") return make_array_decl(dim, dim_quals) if self._peek_type() in _TYPE_QUALIFIER: dim_quals = self._parse_type_qualifier_list() or [] if self._accept("STATIC"): dim_quals = dim_quals + ["static"] dim = self._parse_assignment_expression() self._expect("RBRACKET") return make_array_decl(dim, dim_quals) times_tok = self._accept("TIMES") if times_tok: self._expect("RBRACKET") dim = c_ast.ID(times_tok.value, self._tok_coord(times_tok)) return make_array_decl(dim, dim_quals) dim = None if self._starts_expression(): dim = self._parse_assignment_expression() self._expect("RBRACKET") return make_array_decl(dim, dim_quals) times_tok = self._accept("TIMES") if times_tok: self._expect("RBRACKET") dim = c_ast.ID(times_tok.value, self._tok_coord(times_tok)) return make_array_decl(dim, []) dim = None if self._starts_expression(): dim = self._parse_assignment_expression() self._expect("RBRACKET") return make_array_decl(dim, []) # BNF: function_decl : '(' parameter_type_list_opt | identifier_list_opt ')' def _parse_function_decl(self, base_decl: c_ast.Node) -> c_ast.Node: self._expect("LPAREN") if self._accept("RPAREN"): args = None else: args = ( self._parse_parameter_type_list() if self._starts_declaration() else self._parse_identifier_list_opt() ) self._expect("RPAREN") func = c_ast.FuncDecl(args=args, type=None, coord=base_decl.coord) if self._peek_type() == "LBRACE": if func.args is not None: for param in func.args.params: if isinstance(param, c_ast.EllipsisParam): break name = getattr(param, "name", None) if name: self._add_identifier(name, param.coord) return func # BNF: pointer : '*' type_qualifier_list? pointer? def _parse_pointer(self) -> Optional[c_ast.Node]: stars = [] times_tok = self._accept("TIMES") while times_tok: quals = self._parse_type_qualifier_list() or [] stars.append((quals, self._tok_coord(times_tok))) times_tok = self._accept("TIMES") if not stars: return None ptr = None for quals, coord in stars: ptr = c_ast.PtrDecl(quals=quals, type=ptr, coord=coord) return ptr # BNF: parameter_type_list : parameter_list (',' ELLIPSIS)? def _parse_parameter_type_list(self) -> c_ast.ParamList: params = self._parse_parameter_list() if self._peek_type() == "COMMA" and self._peek_type(2) == "ELLIPSIS": self._advance() ell_tok = self._advance() params.params.append(c_ast.EllipsisParam(self._tok_coord(ell_tok))) return params # BNF: parameter_list : parameter_declaration (',' parameter_declaration)* def _parse_parameter_list(self) -> c_ast.ParamList: first = self._parse_parameter_declaration() params = c_ast.ParamList([first], first.coord) while self._peek_type() == "COMMA" and self._peek_type(2) != "ELLIPSIS": self._advance() params.params.append(self._parse_parameter_declaration()) return params # BNF: parameter_declaration : declaration_specifiers declarator? # | declaration_specifiers abstract_declarator_opt def _parse_parameter_declaration(self) -> c_ast.Node: spec, _, spec_coord = self._parse_declaration_specifiers(allow_no_type=True) if not spec["type"]: spec["type"] = [c_ast.IdentifierType(["int"], coord=spec_coord)] if self._starts_declarator(): decl, is_named = self._parse_any_declarator( allow_abstract=True, typeid_paren_as_abstract=True ) if is_named: return self._build_declarations( spec=spec, decls=[dict(decl=decl, init=None, bitsize=None)] )[0] return self._build_parameter_declaration(spec, decl, spec_coord) decl = self._parse_abstract_declarator_opt() return self._build_parameter_declaration(spec, decl, spec_coord) def _build_parameter_declaration( self, spec: "_DeclSpec", decl: Optional[c_ast.Node], spec_coord: Optional[Coord] ) -> c_ast.Node: if ( len(spec["type"]) > 1 and len(spec["type"][-1].names) == 1 and self._is_type_in_scope(spec["type"][-1].names[0]) ): return self._build_declarations( spec=spec, decls=[dict(decl=decl, init=None, bitsize=None)] )[0] decl = c_ast.Typename( name="", quals=spec["qual"], align=None, type=decl or c_ast.TypeDecl(None, None, None, None), coord=spec_coord, ) return self._fix_decl_name_type(decl, spec["type"]) # BNF: identifier_list_opt : identifier_list | empty def _parse_identifier_list_opt(self) -> Optional[c_ast.Node]: if self._peek_type() == "RPAREN": return None return self._parse_identifier_list() # BNF: identifier_list : identifier (',' identifier)* def _parse_identifier_list(self) -> c_ast.Node: first = self._parse_identifier() params = c_ast.ParamList([first], first.coord) while self._accept("COMMA"): params.params.append(self._parse_identifier()) return params # ------------------------------------------------------------------ # Abstract declarators # ------------------------------------------------------------------ # BNF: type_name : specifier_qualifier_list abstract_declarator_opt def _parse_type_name(self) -> c_ast.Typename: spec = self._parse_specifier_qualifier_list() decl = self._parse_abstract_declarator_opt() coord = None if decl is not None: coord = decl.coord elif spec["type"]: coord = spec["type"][0].coord typename = c_ast.Typename( name="", quals=spec["qual"][:], align=None, type=decl or c_ast.TypeDecl(None, None, None, None), coord=coord, ) return cast(c_ast.Typename, self._fix_decl_name_type(typename, spec["type"])) # BNF: abstract_declarator_opt : pointer? direct_abstract_declarator? def _parse_abstract_declarator_opt(self) -> Optional[c_ast.Node]: if self._peek_type() == "TIMES": ptr = self._parse_pointer() if self._starts_direct_abstract_declarator(): decl = self._parse_direct_abstract_declarator() else: decl = c_ast.TypeDecl(None, None, None, None) assert ptr is not None return self._type_modify_decl(decl, ptr) if self._starts_direct_abstract_declarator(): return self._parse_direct_abstract_declarator() return None # BNF: direct_abstract_declarator : '(' parameter_type_list_opt ')' # | '(' abstract_declarator ')' # | '[' ... ']' def _parse_direct_abstract_declarator(self) -> c_ast.Node: lparen_tok = self._accept("LPAREN") if lparen_tok: if self._starts_declaration() or self._peek_type() == "RPAREN": params = self._parse_parameter_type_list_opt() self._expect("RPAREN") decl = c_ast.FuncDecl( args=params, type=c_ast.TypeDecl(None, None, None, None), coord=self._tok_coord(lparen_tok), ) else: decl = self._parse_abstract_declarator_opt() self._expect("RPAREN") assert decl is not None elif self._peek_type() == "LBRACKET": decl = self._parse_abstract_array_base() else: self._parse_error("Invalid abstract declarator", self.clex.filename) return self._parse_decl_suffixes(decl) # BNF: parameter_type_list_opt : parameter_type_list | empty def _parse_parameter_type_list_opt(self) -> Optional[c_ast.ParamList]: if self._peek_type() == "RPAREN": return None return self._parse_parameter_type_list() # BNF: abstract_array_base : '[' array_specifiers? assignment_expression? ']' def _parse_abstract_array_base(self) -> c_ast.Node: return self._parse_array_decl_common( base_type=c_ast.TypeDecl(None, None, None, None), coord=None ) # ------------------------------------------------------------------ # Statements # ------------------------------------------------------------------ # BNF: statement : labeled_statement | compound_statement # | selection_statement | iteration_statement # | jump_statement | expression_statement # | static_assert | pppragma_directive def _parse_statement(self) -> c_ast.Node | List[c_ast.Node]: tok_type = self._peek_type() match tok_type: case "CASE" | "DEFAULT": return self._parse_labeled_statement() case "ID" if self._peek_type(2) == "COLON": return self._parse_labeled_statement() case "LBRACE": return self._parse_compound_statement() case "IF" | "SWITCH": return self._parse_selection_statement() case "WHILE" | "DO" | "FOR": return self._parse_iteration_statement() case "GOTO" | "BREAK" | "CONTINUE" | "RETURN": return self._parse_jump_statement() case "PPPRAGMA" | "_PRAGMA": return self._parse_pppragma_directive() case "_STATIC_ASSERT": return self._parse_static_assert() case _: return self._parse_expression_statement() # BNF: pragmacomp_or_statement : pppragma_directive* statement def _parse_pragmacomp_or_statement(self) -> c_ast.Node | List[c_ast.Node]: if self._peek_type() in {"PPPRAGMA", "_PRAGMA"}: pragmas = self._parse_pppragma_directive_list() stmt = self._parse_statement() return c_ast.Compound(block_items=pragmas + [stmt], coord=pragmas[0].coord) return self._parse_statement() # BNF: block_item : declaration | statement def _parse_block_item(self) -> c_ast.Node | List[c_ast.Node]: if self._starts_declaration(): return self._parse_declaration() return self._parse_statement() # BNF: block_item_list : block_item+ def _parse_block_item_list(self) -> List[c_ast.Node]: items = [] while self._peek_type() not in {"RBRACE", None}: item = self._parse_block_item() if isinstance(item, list): if item == [None]: continue items.extend(item) else: items.append(item) return items # BNF: compound_statement : '{' block_item_list? '}' def _parse_compound_statement(self) -> c_ast.Node: lbrace_tok = self._expect("LBRACE") if self._accept("RBRACE"): return c_ast.Compound(block_items=None, coord=self._tok_coord(lbrace_tok)) block_items = self._parse_block_item_list() self._expect("RBRACE") return c_ast.Compound( block_items=block_items, coord=self._tok_coord(lbrace_tok) ) # BNF: labeled_statement : ID ':' statement # | CASE constant_expression ':' statement # | DEFAULT ':' statement def _parse_labeled_statement(self) -> c_ast.Node: tok_type = self._peek_type() match tok_type: case "ID": name_tok = self._advance() self._expect("COLON") if self._starts_statement(): stmt = self._parse_pragmacomp_or_statement() else: stmt = c_ast.EmptyStatement(self._tok_coord(name_tok)) return c_ast.Label(name_tok.value, stmt, self._tok_coord(name_tok)) case "CASE": case_tok = self._advance() expr = self._parse_constant_expression() self._expect("COLON") if self._starts_statement(): stmt = self._parse_pragmacomp_or_statement() else: stmt = c_ast.EmptyStatement(self._tok_coord(case_tok)) return c_ast.Case(expr, [stmt], self._tok_coord(case_tok)) case "DEFAULT": def_tok = self._advance() self._expect("COLON") if self._starts_statement(): stmt = self._parse_pragmacomp_or_statement() else: stmt = c_ast.EmptyStatement(self._tok_coord(def_tok)) return c_ast.Default([stmt], self._tok_coord(def_tok)) case _: self._parse_error("Invalid labeled statement", self.clex.filename) # BNF: selection_statement : IF '(' expression ')' statement (ELSE statement)? # | SWITCH '(' expression ')' statement def _parse_selection_statement(self) -> c_ast.Node: tok = self._advance() match tok.type: case "IF": self._expect("LPAREN") cond = self._parse_expression() self._expect("RPAREN") then_stmt = self._parse_pragmacomp_or_statement() if self._accept("ELSE"): else_stmt = self._parse_pragmacomp_or_statement() return c_ast.If(cond, then_stmt, else_stmt, self._tok_coord(tok)) return c_ast.If(cond, then_stmt, None, self._tok_coord(tok)) case "SWITCH": self._expect("LPAREN") expr = self._parse_expression() self._expect("RPAREN") stmt = self._parse_pragmacomp_or_statement() return fix_switch_cases(c_ast.Switch(expr, stmt, self._tok_coord(tok))) case _: self._parse_error("Invalid selection statement", self._tok_coord(tok)) # BNF: iteration_statement : WHILE '(' expression ')' statement # | DO statement WHILE '(' expression ')' ';' # | FOR '(' (declaration | expression_opt) ';' # expression_opt ';' expression_opt ')' statement def _parse_iteration_statement(self) -> c_ast.Node: tok = self._advance() match tok.type: case "WHILE": self._expect("LPAREN") cond = self._parse_expression() self._expect("RPAREN") stmt = self._parse_pragmacomp_or_statement() return c_ast.While(cond, stmt, self._tok_coord(tok)) case "DO": stmt = self._parse_pragmacomp_or_statement() self._expect("WHILE") self._expect("LPAREN") cond = self._parse_expression() self._expect("RPAREN") self._expect("SEMI") return c_ast.DoWhile(cond, stmt, self._tok_coord(tok)) case "FOR": self._expect("LPAREN") if self._starts_declaration(): decls = self._parse_declaration() init = c_ast.DeclList(decls, self._tok_coord(tok)) cond = self._parse_expression_opt() self._expect("SEMI") next_expr = self._parse_expression_opt() self._expect("RPAREN") stmt = self._parse_pragmacomp_or_statement() return c_ast.For(init, cond, next_expr, stmt, self._tok_coord(tok)) init = self._parse_expression_opt() self._expect("SEMI") cond = self._parse_expression_opt() self._expect("SEMI") next_expr = self._parse_expression_opt() self._expect("RPAREN") stmt = self._parse_pragmacomp_or_statement() return c_ast.For(init, cond, next_expr, stmt, self._tok_coord(tok)) case _: self._parse_error("Invalid iteration statement", self._tok_coord(tok)) # BNF: jump_statement : GOTO ID ';' | BREAK ';' | CONTINUE ';' # | RETURN expression? ';' def _parse_jump_statement(self) -> c_ast.Node: tok = self._advance() match tok.type: case "GOTO": name_tok = self._expect("ID") self._expect("SEMI") return c_ast.Goto(name_tok.value, self._tok_coord(tok)) case "BREAK": self._expect("SEMI") return c_ast.Break(self._tok_coord(tok)) case "CONTINUE": self._expect("SEMI") return c_ast.Continue(self._tok_coord(tok)) case "RETURN": if self._accept("SEMI"): return c_ast.Return(None, self._tok_coord(tok)) expr = self._parse_expression() self._expect("SEMI") return c_ast.Return(expr, self._tok_coord(tok)) case _: self._parse_error("Invalid jump statement", self._tok_coord(tok)) # BNF: expression_statement : expression_opt ';' def _parse_expression_statement(self) -> c_ast.Node: expr = self._parse_expression_opt() semi_tok = self._expect("SEMI") if expr is None: return c_ast.EmptyStatement(self._tok_coord(semi_tok)) return expr # ------------------------------------------------------------------ # Expressions # ------------------------------------------------------------------ # BNF: expression_opt : expression | empty def _parse_expression_opt(self) -> Optional[c_ast.Node]: if self._starts_expression(): return self._parse_expression() return None # BNF: expression : assignment_expression (',' assignment_expression)* def _parse_expression(self) -> c_ast.Node: expr = self._parse_assignment_expression() if not self._accept("COMMA"): return expr exprs = [expr, self._parse_assignment_expression()] while self._accept("COMMA"): exprs.append(self._parse_assignment_expression()) return c_ast.ExprList(exprs, expr.coord) # BNF: assignment_expression : conditional_expression # | unary_expression assignment_op assignment_expression def _parse_assignment_expression(self) -> c_ast.Node: if self._peek_type() == "LPAREN" and self._peek_type(2) == "LBRACE": self._advance() comp = self._parse_compound_statement() self._expect("RPAREN") return comp expr = self._parse_conditional_expression() if self._is_assignment_op(): op = self._advance().value rhs = self._parse_assignment_expression() return c_ast.Assignment(op, expr, rhs, expr.coord) return expr # BNF: conditional_expression : binary_expression # | binary_expression '?' expression ':' conditional_expression def _parse_conditional_expression(self) -> c_ast.Node: expr = self._parse_binary_expression() if self._accept("CONDOP"): iftrue = self._parse_expression() self._expect("COLON") iffalse = self._parse_conditional_expression() return c_ast.TernaryOp(expr, iftrue, iffalse, expr.coord) return expr # BNF: binary_expression : cast_expression (binary_op cast_expression)* def _parse_binary_expression( self, min_prec: int = 0, lhs: Optional[c_ast.Node] = None ) -> c_ast.Node: if lhs is None: lhs = self._parse_cast_expression() while True: tok = self._peek() if tok is None or tok.type not in _BINARY_PRECEDENCE: break prec = _BINARY_PRECEDENCE[tok.type] if prec < min_prec: break op = tok.value self._advance() rhs = self._parse_cast_expression() while True: next_tok = self._peek() if next_tok is None or next_tok.type not in _BINARY_PRECEDENCE: break next_prec = _BINARY_PRECEDENCE[next_tok.type] if next_prec > prec: rhs = self._parse_binary_expression(next_prec, rhs) else: break lhs = c_ast.BinaryOp(op, lhs, rhs, lhs.coord) return lhs # BNF: cast_expression : '(' type_name ')' cast_expression # | unary_expression def _parse_cast_expression(self) -> c_ast.Node: result = self._try_parse_paren_type_name() if result is not None: typ, mark, lparen_tok = result if self._peek_type() == "LBRACE": # (type){...} is a compound literal, not a cast. Examples: # (int){1} -> compound literal, handled in postfix # (int) x -> cast, handled below self._reset(mark) else: expr = self._parse_cast_expression() return c_ast.Cast(typ, expr, self._tok_coord(lparen_tok)) return self._parse_unary_expression() # BNF: unary_expression : postfix_expression # | '++' unary_expression # | '--' unary_expression # | unary_op cast_expression # | 'sizeof' unary_expression # | 'sizeof' '(' type_name ')' # | '_Alignof' '(' type_name ')' def _parse_unary_expression(self) -> c_ast.Node: tok_type = self._peek_type() if tok_type in {"PLUSPLUS", "MINUSMINUS"}: tok = self._advance() expr = self._parse_unary_expression() return c_ast.UnaryOp(tok.value, expr, expr.coord) if tok_type in {"AND", "TIMES", "PLUS", "MINUS", "NOT", "LNOT"}: tok = self._advance() expr = self._parse_cast_expression() return c_ast.UnaryOp(tok.value, expr, expr.coord) if tok_type == "SIZEOF": tok = self._advance() result = self._try_parse_paren_type_name() if result is not None: typ, _, _ = result return c_ast.UnaryOp(tok.value, typ, self._tok_coord(tok)) expr = self._parse_unary_expression() return c_ast.UnaryOp(tok.value, expr, self._tok_coord(tok)) if tok_type == "_ALIGNOF": tok = self._advance() self._expect("LPAREN") typ = self._parse_type_name() self._expect("RPAREN") return c_ast.UnaryOp(tok.value, typ, self._tok_coord(tok)) return self._parse_postfix_expression() # BNF: postfix_expression : primary_expression postfix_suffix* # | '(' type_name ')' '{' initializer_list ','? '}' def _parse_postfix_expression(self) -> c_ast.Node: result = self._try_parse_paren_type_name() if result is not None: typ, mark, _ = result # Disambiguate between casts and compound literals: # (int) x -> cast # (int) {1} -> compound literal if self._accept("LBRACE"): init = self._parse_initializer_list() self._accept("COMMA") self._expect("RBRACE") return c_ast.CompoundLiteral(typ, init) else: self._reset(mark) expr = self._parse_primary_expression() while True: if self._accept("LBRACKET"): sub = self._parse_expression() self._expect("RBRACKET") expr = c_ast.ArrayRef(expr, sub, expr.coord) continue if self._accept("LPAREN"): if self._peek_type() == "RPAREN": self._advance() args = None else: args = self._parse_argument_expression_list() self._expect("RPAREN") expr = c_ast.FuncCall(expr, args, expr.coord) continue if self._peek_type() in {"PERIOD", "ARROW"}: op_tok = self._advance() name_tok = self._advance() if name_tok.type not in {"ID", "TYPEID"}: self._parse_error( "Invalid struct reference", self._tok_coord(name_tok) ) field = c_ast.ID(name_tok.value, self._tok_coord(name_tok)) expr = c_ast.StructRef(expr, op_tok.value, field, expr.coord) continue if self._peek_type() in {"PLUSPLUS", "MINUSMINUS"}: tok = self._advance() expr = c_ast.UnaryOp("p" + tok.value, expr, expr.coord) continue break return expr # BNF: primary_expression : ID | constant | string_literal # | '(' expression ')' | offsetof def _parse_primary_expression(self) -> c_ast.Node: tok_type = self._peek_type() if tok_type == "ID": return self._parse_identifier() if ( tok_type in _INT_CONST or tok_type in _FLOAT_CONST or tok_type in _CHAR_CONST ): return self._parse_constant() if tok_type in _STRING_LITERAL: return self._parse_unified_string_literal() if tok_type in _WSTR_LITERAL: return self._parse_unified_wstring_literal() if tok_type == "LPAREN": self._advance() expr = self._parse_expression() self._expect("RPAREN") return expr if tok_type == "OFFSETOF": off_tok = self._advance() self._expect("LPAREN") typ = self._parse_type_name() self._expect("COMMA") designator = self._parse_offsetof_member_designator() self._expect("RPAREN") coord = self._tok_coord(off_tok) return c_ast.FuncCall( c_ast.ID(off_tok.value, coord), c_ast.ExprList([typ, designator], coord), coord, ) self._parse_error("Invalid expression", self.clex.filename) # BNF: offsetof_member_designator : identifier_or_typeid # ('.' identifier_or_typeid | '[' expression ']')* def _parse_offsetof_member_designator(self) -> c_ast.Node: node = self._parse_identifier_or_typeid() while True: if self._accept("PERIOD"): field = self._parse_identifier_or_typeid() node = c_ast.StructRef(node, ".", field, node.coord) continue if self._accept("LBRACKET"): expr = self._parse_expression() self._expect("RBRACKET") node = c_ast.ArrayRef(node, expr, node.coord) continue break return node # BNF: argument_expression_list : assignment_expression (',' assignment_expression)* def _parse_argument_expression_list(self) -> c_ast.Node: expr = self._parse_assignment_expression() exprs = [expr] while self._accept("COMMA"): exprs.append(self._parse_assignment_expression()) return c_ast.ExprList(exprs, expr.coord) # BNF: constant_expression : conditional_expression def _parse_constant_expression(self) -> c_ast.Node: return self._parse_conditional_expression() # ------------------------------------------------------------------ # Terminals # ------------------------------------------------------------------ # BNF: identifier : ID def _parse_identifier(self) -> c_ast.Node: tok = self._expect("ID") return c_ast.ID(tok.value, self._tok_coord(tok)) # BNF: identifier_or_typeid : ID | TYPEID def _parse_identifier_or_typeid(self) -> c_ast.Node: tok = self._advance() if tok.type not in {"ID", "TYPEID"}: self._parse_error("Expected identifier", self._tok_coord(tok)) return c_ast.ID(tok.value, self._tok_coord(tok)) # BNF: constant : INT_CONST | FLOAT_CONST | CHAR_CONST def _parse_constant(self) -> c_ast.Node: tok = self._advance() if tok.type in _INT_CONST: u_count = 0 l_count = 0 for ch in tok.value[-3:]: if ch in ("l", "L"): l_count += 1 elif ch in ("u", "U"): u_count += 1 if u_count > 1: raise ValueError("Constant cannot have more than one u/U suffix.") if l_count > 2: raise ValueError("Constant cannot have more than two l/L suffix.") prefix = "unsigned " * u_count + "long " * l_count return c_ast.Constant(prefix + "int", tok.value, self._tok_coord(tok)) if tok.type in _FLOAT_CONST: if tok.value[-1] in ("f", "F"): t = "float" elif tok.value[-1] in ("l", "L"): t = "long double" else: t = "double" return c_ast.Constant(t, tok.value, self._tok_coord(tok)) if tok.type in _CHAR_CONST: return c_ast.Constant("char", tok.value, self._tok_coord(tok)) self._parse_error("Invalid constant", self._tok_coord(tok)) # BNF: unified_string_literal : STRING_LITERAL+ def _parse_unified_string_literal(self) -> c_ast.Node: tok = self._expect("STRING_LITERAL") node = c_ast.Constant("string", tok.value, self._tok_coord(tok)) while self._peek_type() == "STRING_LITERAL": tok2 = self._advance() node.value = node.value[:-1] + tok2.value[1:] return node # BNF: unified_wstring_literal : WSTRING_LITERAL+ def _parse_unified_wstring_literal(self) -> c_ast.Node: tok = self._advance() if tok.type not in _WSTR_LITERAL: self._parse_error("Invalid string literal", self._tok_coord(tok)) node = c_ast.Constant("string", tok.value, self._tok_coord(tok)) while self._peek_type() in _WSTR_LITERAL: tok2 = self._advance() node.value = node.value.rstrip()[:-1] + tok2.value[2:] return node # ------------------------------------------------------------------ # Initializers # ------------------------------------------------------------------ # BNF: initializer : assignment_expression # | '{' initializer_list ','? '}' # | '{' '}' def _parse_initializer(self) -> c_ast.Node: lbrace_tok = self._accept("LBRACE") if lbrace_tok: if self._accept("RBRACE"): return c_ast.InitList([], self._tok_coord(lbrace_tok)) init_list = self._parse_initializer_list() self._accept("COMMA") self._expect("RBRACE") return init_list return self._parse_assignment_expression() # BNF: initializer_list : initializer_item (',' initializer_item)* ','? def _parse_initializer_list(self) -> c_ast.Node: items = [self._parse_initializer_item()] while self._accept("COMMA"): if self._peek_type() == "RBRACE": break items.append(self._parse_initializer_item()) return c_ast.InitList(items, items[0].coord) # BNF: initializer_item : designation? initializer def _parse_initializer_item(self) -> c_ast.Node: designation = None if self._peek_type() in {"LBRACKET", "PERIOD"}: designation = self._parse_designation() init = self._parse_initializer() if designation is not None: return c_ast.NamedInitializer(designation, init) return init # BNF: designation : designator_list '=' def _parse_designation(self) -> List[c_ast.Node]: designators = self._parse_designator_list() self._expect("EQUALS") return designators # BNF: designator_list : designator+ def _parse_designator_list(self) -> List[c_ast.Node]: designators = [] while self._peek_type() in {"LBRACKET", "PERIOD"}: designators.append(self._parse_designator()) return designators # BNF: designator : '[' constant_expression ']' # | '.' identifier_or_typeid def _parse_designator(self) -> c_ast.Node: if self._accept("LBRACKET"): expr = self._parse_constant_expression() self._expect("RBRACKET") return expr if self._accept("PERIOD"): return self._parse_identifier_or_typeid() self._parse_error("Invalid designator", self.clex.filename) # ------------------------------------------------------------------ # Preprocessor-like directives # ------------------------------------------------------------------ # BNF: pp_directive : '#' ... (unsupported) def _parse_pp_directive(self) -> NoReturn: tok = self._expect("PPHASH") self._parse_error("Directives not supported yet", self._tok_coord(tok)) # BNF: pppragma_directive : PPPRAGMA PPPRAGMASTR? # | _PRAGMA '(' string_literal ')' def _parse_pppragma_directive(self) -> c_ast.Node: if self._peek_type() == "PPPRAGMA": tok = self._advance() if self._peek_type() == "PPPRAGMASTR": str_tok = self._advance() return c_ast.Pragma(str_tok.value, self._tok_coord(str_tok)) return c_ast.Pragma("", self._tok_coord(tok)) if self._peek_type() == "_PRAGMA": tok = self._advance() lparen = self._expect("LPAREN") literal = self._parse_unified_string_literal() self._expect("RPAREN") return c_ast.Pragma(literal, self._tok_coord(lparen)) self._parse_error("Invalid pragma", self.clex.filename) # BNF: pppragma_directive_list : pppragma_directive+ def _parse_pppragma_directive_list(self) -> List[c_ast.Node]: pragmas = [] while self._peek_type() in {"PPPRAGMA", "_PRAGMA"}: pragmas.append(self._parse_pppragma_directive()) return pragmas # BNF: static_assert : _STATIC_ASSERT '(' constant_expression (',' string_literal)? ')' def _parse_static_assert(self) -> List[c_ast.Node]: tok = self._expect("_STATIC_ASSERT") self._expect("LPAREN") cond = self._parse_constant_expression() msg = None if self._accept("COMMA"): msg = self._parse_unified_string_literal() self._expect("RPAREN") return [c_ast.StaticAssert(cond, msg, self._tok_coord(tok))] _ASSIGNMENT_OPS = { "EQUALS", "XOREQUAL", "TIMESEQUAL", "DIVEQUAL", "MODEQUAL", "PLUSEQUAL", "MINUSEQUAL", "LSHIFTEQUAL", "RSHIFTEQUAL", "ANDEQUAL", "OREQUAL", } # Precedence of operators (lower number = weather binding) # If this changes, c_generator.CGenerator.precedence_map needs to change as # well _BINARY_PRECEDENCE = { "LOR": 0, "LAND": 1, "OR": 2, "XOR": 3, "AND": 4, "EQ": 5, "NE": 5, "GT": 6, "GE": 6, "LT": 6, "LE": 6, "RSHIFT": 7, "LSHIFT": 7, "PLUS": 8, "MINUS": 8, "TIMES": 9, "DIVIDE": 9, "MOD": 9, } _STORAGE_CLASS = {"AUTO", "REGISTER", "STATIC", "EXTERN", "TYPEDEF", "_THREAD_LOCAL"} _FUNCTION_SPEC = {"INLINE", "_NORETURN"} _TYPE_QUALIFIER = {"CONST", "RESTRICT", "VOLATILE", "_ATOMIC"} _TYPE_SPEC_SIMPLE = { "VOID", "_BOOL", "CHAR", "SHORT", "INT", "LONG", "FLOAT", "DOUBLE", "_COMPLEX", "SIGNED", "UNSIGNED", "__INT128", } _DECL_START = ( _STORAGE_CLASS | _FUNCTION_SPEC | _TYPE_QUALIFIER | _TYPE_SPEC_SIMPLE | {"TYPEID", "STRUCT", "UNION", "ENUM", "_ALIGNAS", "_ATOMIC"} ) _EXPR_START = { "ID", "LPAREN", "PLUSPLUS", "MINUSMINUS", "PLUS", "MINUS", "TIMES", "AND", "NOT", "LNOT", "SIZEOF", "_ALIGNOF", "OFFSETOF", } _INT_CONST = { "INT_CONST_DEC", "INT_CONST_OCT", "INT_CONST_HEX", "INT_CONST_BIN", "INT_CONST_CHAR", } _FLOAT_CONST = {"FLOAT_CONST", "HEX_FLOAT_CONST"} _CHAR_CONST = { "CHAR_CONST", "WCHAR_CONST", "U8CHAR_CONST", "U16CHAR_CONST", "U32CHAR_CONST", } _STRING_LITERAL = {"STRING_LITERAL"} _WSTR_LITERAL = { "WSTRING_LITERAL", "U8STRING_LITERAL", "U16STRING_LITERAL", "U32STRING_LITERAL", } _STARTS_EXPRESSION = ( _EXPR_START | _INT_CONST | _FLOAT_CONST | _CHAR_CONST | _STRING_LITERAL | _WSTR_LITERAL ) _STARTS_STATEMENT = { "LBRACE", "IF", "SWITCH", "WHILE", "DO", "FOR", "GOTO", "BREAK", "CONTINUE", "RETURN", "CASE", "DEFAULT", "PPPRAGMA", "_PRAGMA", "_STATIC_ASSERT", "SEMI", } class _TokenStream: """Wraps a lexer to provide convenient, buffered access to the underlying token stream. The lexer is expected to be initialized with the input string already. """ def __init__(self, lexer: CLexer) -> None: self._lexer = lexer self._buffer: List[Optional[_Token]] = [] self._index = 0 def peek(self, k: int = 1) -> Optional[_Token]: """Peek at the k-th next token in the stream, without consuming it. Examples: k=1 returns the immediate next token. k=2 returns the token after that. """ if k <= 0: return None self._fill(k) return self._buffer[self._index + k - 1] def next(self) -> Optional[_Token]: """Consume a single token and return it.""" self._fill(1) tok = self._buffer[self._index] self._index += 1 return tok # The 'mark' and 'reset' methods are useful for speculative parsing with # backtracking; when the parser needs to examine a sequence of tokens # and potentially decide to try a different path on the same sequence, it # can call 'mark' to obtain the current token position, and if the first # path fails restore the position with `reset(pos)`. def mark(self) -> int: return self._index def reset(self, mark: int) -> None: self._index = mark def _fill(self, n: int) -> None: while len(self._buffer) < self._index + n: tok = self._lexer.token() self._buffer.append(tok) if tok is None: break # Declaration specifiers are represented by a dictionary with entries: # - qual: a list of type qualifiers # - storage: a list of storage class specifiers # - type: a list of type specifiers # - function: a list of function specifiers # - alignment: a list of alignment specifiers class _DeclSpec(TypedDict): qual: List[Any] storage: List[Any] type: List[Any] function: List[Any] alignment: List[Any] _DeclSpecKind = Literal["qual", "storage", "type", "function", "alignment"] class _DeclInfo(TypedDict): # Declarator payloads used by declaration/initializer parsing: # - decl: the declarator node (may be None for abstract/implicit cases) # - init: optional initializer expression # - bitsize: optional bit-field width expression (for struct declarators) decl: Optional[c_ast.Node] init: Optional[c_ast.Node] bitsize: Optional[c_ast.Node]