// SPDX-License-Identifier: GPL-2.0-only /* 64-bit system call dispatch */ #include #include #include #include #include #include #include #define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *); #define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *); #include #ifdef CONFIG_X86_X32_ABI #include #endif #undef __SYSCALL #undef __SYSCALL_NORETURN #define __SYSCALL_NORETURN __SYSCALL /* * The sys_call_table[] is no longer used for system calls, but * kernel/trace/trace_syscalls.c still wants to know the system * call address. */ #define __SYSCALL(nr, sym) __x64_##sym, const sys_call_ptr_t sys_call_table[] = { #include }; #undef __SYSCALL #define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs); long x64_sys_call(const struct pt_regs *regs, unsigned int nr) { switch (nr) { #include default: return __x64_sys_ni_syscall(regs); } } #ifdef CONFIG_X86_X32_ABI long x32_sys_call(const struct pt_regs *regs, unsigned int nr) { switch (nr) { #include default: return __x64_sys_ni_syscall(regs); } } #endif static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) { /* * Convert negative numbers to very high and thus out of range * numbers for comparisons. */ unsigned int unr = nr; if (likely(unr < NR_syscalls)) { unr = array_index_nospec(unr, NR_syscalls); regs->ax = x64_sys_call(regs, unr); return true; } return false; } static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) { /* * Adjust the starting offset of the table, and convert numbers * < __X32_SYSCALL_BIT to very high and thus out of range * numbers for comparisons. */ unsigned int xnr = nr - __X32_SYSCALL_BIT; if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) { xnr = array_index_nospec(xnr, X32_NR_syscalls); regs->ax = x32_sys_call(regs, xnr); return true; } return false; } /* Returns true to return using SYSRET, or false to use IRET */ __visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr) { add_random_kstack_offset(); nr = syscall_enter_from_user_mode(regs, nr); instrumentation_begin(); if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) { /* Invalid system call, but still a system call. */ regs->ax = __x64_sys_ni_syscall(regs); } instrumentation_end(); syscall_exit_to_user_mode(regs); /* * Check that the register state is valid for using SYSRET to exit * to userspace. Otherwise use the slower but fully capable IRET * exit path. */ /* XEN PV guests always use the IRET path */ if (cpu_feature_enabled(X86_FEATURE_XENPV)) return false; /* SYSRET requires RCX == RIP and R11 == EFLAGS */ if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags)) return false; /* CS and SS must match the values set in MSR_STAR */ if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS)) return false; /* * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP * in kernel space. This essentially lets the user take over * the kernel, since userspace controls RSP. * * TASK_SIZE_MAX covers all user-accessible addresses other than * the deprecated vsyscall page. */ if (unlikely(regs->ip >= TASK_SIZE_MAX)) return false; /* * SYSRET cannot restore RF. It can restore TF, but unlike IRET, * restoring TF results in a trap from userspace immediately after * SYSRET. */ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF))) return false; /* Use SYSRET to exit to userspace */ return true; }