// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2024, Microsoft Corporation.
 *
 * The main part of the mshv_root module, providing APIs to create
 * and manage guest partitions.
 *
 * Authors: Microsoft Linux virtualization team
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "mshv_eventfd.h"
#include "mshv.h"
#include "mshv_root.h"

MODULE_AUTHOR("Microsoft");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");

/* TODO move this to mshyperv.h when needed outside driver */
static inline bool hv_parent_partition(void)
{
	return hv_root_partition();
}

/* TODO move this to another file when debugfs code is added */
enum hv_stats_vp_counters {	/* HV_THREAD_COUNTER */
#if defined(CONFIG_X86)
	VpRootDispatchThreadBlocked	= 201,
#elif defined(CONFIG_ARM64)
	VpRootDispatchThreadBlocked	= 94,
#endif
	VpStatsMaxCounter
};

struct hv_stats_page {
	union {
		u64 vp_cntrs[VpStatsMaxCounter];	/* VP counters */
		u8 data[HV_HYP_PAGE_SIZE];
	};
} __packed;

struct mshv_root mshv_root;

enum hv_scheduler_type hv_scheduler_type;

/* Once we implement the fast extended hypercall ABI they can go away. */
static void * __percpu *root_scheduler_input;
static void * __percpu *root_scheduler_output;

static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
			   unsigned long arg);
static int mshv_dev_open(struct inode *inode, struct file *filp);
static int mshv_dev_release(struct inode *inode, struct file *filp);
static int mshv_vp_release(struct inode *inode, struct file *filp);
static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl,
			  unsigned long arg);
static int mshv_partition_release(struct inode *inode, struct file *filp);
static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl,
				 unsigned long arg);
static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma);
static vm_fault_t mshv_vp_fault(struct vm_fault *vmf);
static int mshv_init_async_handler(struct mshv_partition *partition);
static void mshv_async_hvcall_handler(void *data, u64 *status);

static const union hv_input_vtl input_vtl_zero;
static const union hv_input_vtl input_vtl_normal = {
	.target_vtl = HV_NORMAL_VTL,
	.use_target_vtl = 1,
};

static const struct vm_operations_struct mshv_vp_vm_ops = {
	.fault = mshv_vp_fault,
};

static const struct file_operations mshv_vp_fops = {
	.owner = THIS_MODULE,
	.release = mshv_vp_release,
	.unlocked_ioctl = mshv_vp_ioctl,
	.llseek = noop_llseek,
	.mmap = mshv_vp_mmap,
};

static const struct file_operations mshv_partition_fops = {
	.owner = THIS_MODULE,
	.release = mshv_partition_release,
	.unlocked_ioctl = mshv_partition_ioctl,
	.llseek = noop_llseek,
};

static const struct file_operations mshv_dev_fops = {
	.owner = THIS_MODULE,
	.open = mshv_dev_open,
	.release = mshv_dev_release,
	.unlocked_ioctl = mshv_dev_ioctl,
	.llseek = noop_llseek,
};

static struct miscdevice mshv_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "mshv",
	.fops = &mshv_dev_fops,
	.mode = 0600,
};

/*
 * Only allow hypercalls that have a u64 partition id as the first member of
 * the input structure.
 * These are sorted by value.
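 *
 * For example, the HVCALL_GET_VP_REGISTERS input (struct
 * hv_input_get_vp_registers) starts with a u64 partition_id field;
 * mshv_ioctl_passthru_hvcall() relies on this layout to overwrite that first
 * field with the caller's partition id before issuing the hypercall.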
*/ static u16 mshv_passthru_hvcalls[] = { HVCALL_GET_PARTITION_PROPERTY, HVCALL_SET_PARTITION_PROPERTY, HVCALL_INSTALL_INTERCEPT, HVCALL_GET_VP_REGISTERS, HVCALL_SET_VP_REGISTERS, HVCALL_TRANSLATE_VIRTUAL_ADDRESS, HVCALL_CLEAR_VIRTUAL_INTERRUPT, HVCALL_REGISTER_INTERCEPT_RESULT, HVCALL_ASSERT_VIRTUAL_INTERRUPT, HVCALL_GET_GPA_PAGES_ACCESS_STATES, HVCALL_SIGNAL_EVENT_DIRECT, HVCALL_POST_MESSAGE_DIRECT, HVCALL_GET_VP_CPUID_VALUES, }; static bool mshv_hvcall_is_async(u16 code) { switch (code) { case HVCALL_SET_PARTITION_PROPERTY: return true; default: break; } return false; } static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, bool partition_locked, void __user *user_args) { u64 status; int ret = 0, i; bool is_async; struct mshv_root_hvcall args; struct page *page; unsigned int pages_order; void *input_pg = NULL; void *output_pg = NULL; if (copy_from_user(&args, user_args, sizeof(args))) return -EFAULT; if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) || mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE) return -EINVAL; if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE)) return -EINVAL; for (i = 0; i < ARRAY_SIZE(mshv_passthru_hvcalls); ++i) if (args.code == mshv_passthru_hvcalls[i]) break; if (i >= ARRAY_SIZE(mshv_passthru_hvcalls)) return -EINVAL; is_async = mshv_hvcall_is_async(args.code); if (is_async) { /* async hypercalls can only be called from partition fd */ if (!partition_locked) return -EINVAL; ret = mshv_init_async_handler(partition); if (ret) return ret; } pages_order = args.out_ptr ? 1 : 0; page = alloc_pages(GFP_KERNEL, pages_order); if (!page) return -ENOMEM; input_pg = page_address(page); if (args.out_ptr) output_pg = (char *)input_pg + PAGE_SIZE; else output_pg = NULL; if (copy_from_user(input_pg, (void __user *)args.in_ptr, args.in_sz)) { ret = -EFAULT; goto free_pages_out; } /* * NOTE: This only works because all the allowed hypercalls' input * structs begin with a u64 partition_id field. */ *(u64 *)input_pg = partition->pt_id; if (args.reps) status = hv_do_rep_hypercall(args.code, args.reps, 0, input_pg, output_pg); else status = hv_do_hypercall(args.code, input_pg, output_pg); if (hv_result(status) == HV_STATUS_CALL_PENDING) { if (is_async) { mshv_async_hvcall_handler(partition, &status); } else { /* Paranoia check. This shouldn't happen! */ ret = -EBADFD; goto free_pages_out; } } if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) { ret = hv_call_deposit_pages(NUMA_NO_NODE, partition->pt_id, 1); if (!ret) ret = -EAGAIN; } else if (!hv_result_success(status)) { ret = hv_result_to_errno(status); } /* * Always return the status and output data regardless of result. * The VMM may need it to determine how to proceed. E.g. the status may * contain the number of reps completed if a rep hypercall partially * succeeded. */ args.status = hv_result(status); args.reps = args.reps ? 
hv_repcomp(status) : 0;

	if (copy_to_user(user_args, &args, sizeof(args)))
		ret = -EFAULT;

	if (output_pg &&
	    copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
		ret = -EFAULT;

free_pages_out:
	free_pages((unsigned long)input_pg, pages_order);

	return ret;
}

static inline bool is_ghcb_mapping_available(void)
{
#if IS_ENABLED(CONFIG_X86_64)
	return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE;
#else
	return false;
#endif
}

static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_get_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}

static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_set_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}

/*
 * Explicit guest vCPU suspend is asynchronous by nature (as it is requested
 * by a dom0 vCPU for a guest vCPU) and thus can race with "intercept"
 * suspend, done by the hypervisor.
 * "Intercept" suspend leads to asynchronous message delivery to dom0 which
 * should be awaited to keep the VP loop consistent (i.e. no message pending
 * upon VP resume).
 * VP intercept suspend can't be done when the VP is explicitly suspended
 * already, so there are only two possible race scenarios:
 *   1. implicit suspend bit set -> explicit suspend bit set -> message sent
 *   2. implicit suspend bit set -> message sent -> explicit suspend bit set
 * Checking for the implicit suspend bit after the explicit suspend request
 * has succeeded covers either case and lets us reliably determine whether
 * there is a message to receive and deliver to the VMM.
 */
static int mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
{
	struct hv_register_assoc explicit_suspend = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND
	};
	struct hv_register_assoc intercept_suspend = {
		.name = HV_REGISTER_INTERCEPT_SUSPEND
	};
	union hv_explicit_suspend_register *es =
		&explicit_suspend.value.explicit_suspend;
	union hv_intercept_suspend_register *is =
		&intercept_suspend.value.intercept_suspend;
	int ret;

	es->suspended = 1;

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &explicit_suspend);
	if (ret) {
		vp_err(vp, "Failed to explicitly suspend vCPU\n");
		return ret;
	}

	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &intercept_suspend);
	if (ret) {
		vp_err(vp, "Failed to get intercept suspend state\n");
		return ret;
	}

	*message_in_flight = is->suspended;

	return 0;
}

/*
 * This function is used when VPs are scheduled by the hypervisor's
 * scheduler.
 *
 * Caller has to make sure the registers contain cleared
 * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers
 * exactly in this order (the hypervisor clears them sequentially) to avoid
 * potentially clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND after
 * the VP is released from HV_REGISTER_EXPLICIT_SUSPEND in case of the
 * opposite order.
 */
static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp)
{
	long ret;
	struct hv_register_assoc suspend_regs[2] = {
		{ .name = HV_REGISTER_INTERCEPT_SUSPEND },
		{ .name = HV_REGISTER_EXPLICIT_SUSPEND }
	};
	size_t count = ARRAY_SIZE(suspend_regs);

	/* Resume VP execution */
	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    count, suspend_regs);
	if (ret) {
		vp_err(vp, "Failed to resume vp execution.
%lx\n", ret); return ret; } ret = wait_event_interruptible(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1); if (ret) { bool message_in_flight; /* * Otherwise the waiting was interrupted by a signal: suspend * the vCPU explicitly and copy message in flight (if any). */ ret = mshv_suspend_vp(vp, &message_in_flight); if (ret) return ret; /* Return if no message in flight */ if (!message_in_flight) return -EINTR; /* Wait for the message in flight. */ wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1); } /* * Reset the flag to make the wait_event call above work * next time. */ vp->run.kicked_by_hv = 0; return 0; } static int mshv_vp_dispatch(struct mshv_vp *vp, u32 flags, struct hv_output_dispatch_vp *res) { struct hv_input_dispatch_vp *input; struct hv_output_dispatch_vp *output; u64 status; preempt_disable(); input = *this_cpu_ptr(root_scheduler_input); output = *this_cpu_ptr(root_scheduler_output); memset(input, 0, sizeof(*input)); memset(output, 0, sizeof(*output)); input->partition_id = vp->vp_partition->pt_id; input->vp_index = vp->vp_index; input->time_slice = 0; /* Run forever until something happens */ input->spec_ctrl = 0; /* TODO: set sensible flags */ input->flags = flags; vp->run.flags.root_sched_dispatched = 1; status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output); vp->run.flags.root_sched_dispatched = 0; *res = *output; preempt_enable(); if (!hv_result_success(status)) vp_err(vp, "%s: status %s\n", __func__, hv_result_to_string(status)); return hv_result_to_errno(status); } static int mshv_vp_clear_explicit_suspend(struct mshv_vp *vp) { struct hv_register_assoc explicit_suspend = { .name = HV_REGISTER_EXPLICIT_SUSPEND, .value.explicit_suspend.suspended = 0, }; int ret; ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, 1, &explicit_suspend); if (ret) vp_err(vp, "Failed to unsuspend\n"); return ret; } #if IS_ENABLED(CONFIG_X86_64) static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp) { if (!vp->vp_register_page) return 0; return vp->vp_register_page->interrupt_vectors.as_uint64; } #else static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp) { return 0; } #endif static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp) { struct hv_stats_page **stats = vp->vp_stats_pages; u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->vp_cntrs; u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->vp_cntrs; if (self_vp_cntrs[VpRootDispatchThreadBlocked]) return self_vp_cntrs[VpRootDispatchThreadBlocked]; return parent_vp_cntrs[VpRootDispatchThreadBlocked]; } static int mshv_vp_wait_for_hv_kick(struct mshv_vp *vp) { int ret; ret = wait_event_interruptible(vp->run.vp_suspend_queue, (vp->run.kicked_by_hv == 1 && !mshv_vp_dispatch_thread_blocked(vp)) || mshv_vp_interrupt_pending(vp)); if (ret) return -EINTR; vp->run.flags.root_sched_blocked = 0; vp->run.kicked_by_hv = 0; return 0; } static int mshv_pre_guest_mode_work(struct mshv_vp *vp) { const ulong work_flags = _TIF_NOTIFY_SIGNAL | _TIF_SIGPENDING | _TIF_NEED_RESCHED | _TIF_NOTIFY_RESUME; ulong th_flags; th_flags = read_thread_flags(); while (th_flags & work_flags) { int ret; /* nb: following will call schedule */ ret = mshv_do_pre_guest_mode_work(th_flags); if (ret) return ret; th_flags = read_thread_flags(); } return 0; } /* Must be called with interrupts enabled */ static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp) { long ret; if (vp->run.flags.root_sched_blocked) { /* * Dispatch state of this VP is blocked. 
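		 * (root_sched_blocked is set further down in this function
		 * when a dispatch returns HV_VP_DISPATCH_STATE_BLOCKED
		 * without an explicit suspend event, and is cleared again in
		 * mshv_vp_wait_for_hv_kick().)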
Need to wait * for the hypervisor to clear the blocked state before * dispatching it. */ ret = mshv_vp_wait_for_hv_kick(vp); if (ret) return ret; } do { u32 flags = 0; struct hv_output_dispatch_vp output; ret = mshv_pre_guest_mode_work(vp); if (ret) break; if (vp->run.flags.intercept_suspend) flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND; if (mshv_vp_interrupt_pending(vp)) flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION; ret = mshv_vp_dispatch(vp, flags, &output); if (ret) break; vp->run.flags.intercept_suspend = 0; if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) { if (output.dispatch_event == HV_VP_DISPATCH_EVENT_SUSPEND) { /* * TODO: remove the warning once VP canceling * is supported */ WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count), "%s: vp#%d: unexpected explicit suspend\n", __func__, vp->vp_index); /* * Need to clear explicit suspend before * dispatching. * Explicit suspend is either: * - set right after the first VP dispatch or * - set explicitly via hypercall * Since the latter case is not yet supported, * simply clear it here. */ ret = mshv_vp_clear_explicit_suspend(vp); if (ret) break; ret = mshv_vp_wait_for_hv_kick(vp); if (ret) break; } else { vp->run.flags.root_sched_blocked = 1; ret = mshv_vp_wait_for_hv_kick(vp); if (ret) break; } } else { /* HV_VP_DISPATCH_STATE_READY */ if (output.dispatch_event == HV_VP_DISPATCH_EVENT_INTERCEPT) vp->run.flags.intercept_suspend = 1; } } while (!vp->run.flags.intercept_suspend); return ret; } static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ, "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ"); static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg) { long rc; if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) rc = mshv_run_vp_with_root_scheduler(vp); else rc = mshv_run_vp_with_hyp_scheduler(vp); if (rc) return rc; if (copy_to_user(ret_msg, vp->vp_intercept_msg_page, sizeof(struct hv_message))) rc = -EFAULT; return rc; } static int mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp, struct hv_vp_state_data state_data, unsigned long user_pfn, size_t page_count, bool is_set) { int completed, ret = 0; unsigned long check; struct page **pages; if (page_count > INT_MAX) return -EINVAL; /* * Check the arithmetic for wraparound/overflow. 
* The last page address in the buffer is: * (user_pfn + (page_count - 1)) * PAGE_SIZE */ if (check_add_overflow(user_pfn, (page_count - 1), &check)) return -EOVERFLOW; if (check_mul_overflow(check, PAGE_SIZE, &check)) return -EOVERFLOW; /* Pin user pages so hypervisor can copy directly to them */ pages = kcalloc(page_count, sizeof(struct page *), GFP_KERNEL); if (!pages) return -ENOMEM; for (completed = 0; completed < page_count; completed += ret) { unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE; int remaining = page_count - completed; ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE, &pages[completed]); if (ret < 0) { vp_err(vp, "%s: Failed to pin user pages error %i\n", __func__, ret); goto unpin_pages; } } if (is_set) ret = hv_call_set_vp_state(vp->vp_index, vp->vp_partition->pt_id, state_data, page_count, pages, 0, NULL); else ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id, state_data, page_count, pages, NULL); unpin_pages: unpin_user_pages(pages, completed); kfree(pages); return ret; } static long mshv_vp_ioctl_get_set_state(struct mshv_vp *vp, struct mshv_get_set_vp_state __user *user_args, bool is_set) { struct mshv_get_set_vp_state args; long ret = 0; union hv_output_get_vp_state vp_state; u32 data_sz; struct hv_vp_state_data state_data = {}; if (copy_from_user(&args, user_args, sizeof(args))) return -EFAULT; if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) || !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) || !PAGE_ALIGNED(args.buf_ptr)) return -EINVAL; if (!access_ok((void __user *)args.buf_ptr, args.buf_sz)) return -EFAULT; switch (args.type) { case MSHV_VP_STATE_LAPIC: state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE; data_sz = HV_HYP_PAGE_SIZE; break; case MSHV_VP_STATE_XSAVE: { u64 data_sz_64; ret = hv_call_get_partition_property(vp->vp_partition->pt_id, HV_PARTITION_PROPERTY_XSAVE_STATES, &state_data.xsave.states.as_uint64); if (ret) return ret; ret = hv_call_get_partition_property(vp->vp_partition->pt_id, HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE, &data_sz_64); if (ret) return ret; data_sz = (u32)data_sz_64; state_data.xsave.flags = 0; /* Always request legacy states */ state_data.xsave.states.legacy_x87 = 1; state_data.xsave.states.legacy_sse = 1; state_data.type = HV_GET_SET_VP_STATE_XSAVE; break; } case MSHV_VP_STATE_SIMP: state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE; data_sz = HV_HYP_PAGE_SIZE; break; case MSHV_VP_STATE_SIEFP: state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE; data_sz = HV_HYP_PAGE_SIZE; break; case MSHV_VP_STATE_SYNTHETIC_TIMERS: state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS; data_sz = sizeof(vp_state.synthetic_timers_state); break; default: return -EINVAL; } if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz))) return -EFAULT; if (data_sz > args.buf_sz) return -EINVAL; /* If the data is transmitted via pfns, delegate to helper */ if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) { unsigned long user_pfn = PFN_DOWN(args.buf_ptr); size_t page_count = PFN_DOWN(args.buf_sz); return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn, page_count, is_set); } /* Paranoia check - this shouldn't happen! 
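	 * For the remaining (non-PFN) state types, data_sz was set above to
	 * a value that is expected to fit in union hv_output_get_vp_state;
	 * a larger value would mean the switch above and the union
	 * definition are out of sync.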
*/ if (data_sz > sizeof(vp_state)) { vp_err(vp, "Invalid vp state data size!\n"); return -EINVAL; } if (is_set) { if (copy_from_user(&vp_state, (__user void *)args.buf_ptr, data_sz)) return -EFAULT; return hv_call_set_vp_state(vp->vp_index, vp->vp_partition->pt_id, state_data, 0, NULL, sizeof(vp_state), (u8 *)&vp_state); } ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id, state_data, 0, NULL, &vp_state); if (ret) return ret; if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz)) return -EFAULT; return 0; } static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct mshv_vp *vp = filp->private_data; long r = -ENOTTY; if (mutex_lock_killable(&vp->vp_mutex)) return -EINTR; switch (ioctl) { case MSHV_RUN_VP: r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg); break; case MSHV_GET_VP_STATE: r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false); break; case MSHV_SET_VP_STATE: r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true); break; case MSHV_ROOT_HVCALL: r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false, (void __user *)arg); break; default: vp_warn(vp, "Invalid ioctl: %#x\n", ioctl); break; } mutex_unlock(&vp->vp_mutex); return r; } static vm_fault_t mshv_vp_fault(struct vm_fault *vmf) { struct mshv_vp *vp = vmf->vma->vm_file->private_data; switch (vmf->vma->vm_pgoff) { case MSHV_VP_MMAP_OFFSET_REGISTERS: vmf->page = virt_to_page(vp->vp_register_page); break; case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: vmf->page = virt_to_page(vp->vp_intercept_msg_page); break; case MSHV_VP_MMAP_OFFSET_GHCB: vmf->page = virt_to_page(vp->vp_ghcb_page); break; default: return VM_FAULT_SIGBUS; } get_page(vmf->page); return 0; } static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma) { struct mshv_vp *vp = file->private_data; switch (vma->vm_pgoff) { case MSHV_VP_MMAP_OFFSET_REGISTERS: if (!vp->vp_register_page) return -ENODEV; break; case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: if (!vp->vp_intercept_msg_page) return -ENODEV; break; case MSHV_VP_MMAP_OFFSET_GHCB: if (!vp->vp_ghcb_page) return -ENODEV; break; default: return -EINVAL; } vma->vm_ops = &mshv_vp_vm_ops; return 0; } static int mshv_vp_release(struct inode *inode, struct file *filp) { struct mshv_vp *vp = filp->private_data; /* Rest of VP cleanup happens in destroy_partition() */ mshv_partition_put(vp->vp_partition); return 0; } static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index) { union hv_stats_object_identity identity = { .vp.partition_id = partition_id, .vp.vp_index = vp_index, }; identity.vp.stats_area_type = HV_STATS_AREA_SELF; hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); identity.vp.stats_area_type = HV_STATS_AREA_PARENT; hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); } static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, void *stats_pages[]) { union hv_stats_object_identity identity = { .vp.partition_id = partition_id, .vp.vp_index = vp_index, }; int err; identity.vp.stats_area_type = HV_STATS_AREA_SELF; err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity, &stats_pages[HV_STATS_AREA_SELF]); if (err) return err; identity.vp.stats_area_type = HV_STATS_AREA_PARENT; err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity, &stats_pages[HV_STATS_AREA_PARENT]); if (err) goto unmap_self; return 0; unmap_self: identity.vp.stats_area_type = HV_STATS_AREA_SELF; hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); return err; } static long mshv_partition_ioctl_create_vp(struct mshv_partition *partition, void 
__user *arg)
{
	struct mshv_create_vp args;
	struct mshv_vp *vp;
	struct page *intercept_message_page, *register_page, *ghcb_page;
	void *stats_pages[2];
	long ret;

	if (copy_from_user(&args, arg, sizeof(args)))
		return -EFAULT;

	if (args.vp_index >= MSHV_MAX_VPS)
		return -EINVAL;

	if (partition->pt_vp_array[args.vp_index])
		return -EEXIST;

	ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index,
				0 /* Only valid for root partition VPs */);
	if (ret)
		return ret;

	ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
					HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
					input_vtl_zero,
					&intercept_message_page);
	if (ret)
		goto destroy_vp;

	if (!mshv_partition_encrypted(partition)) {
		ret = hv_call_map_vp_state_page(partition->pt_id,
						args.vp_index,
						HV_VP_STATE_PAGE_REGISTERS,
						input_vtl_zero,
						&register_page);
		if (ret)
			goto unmap_intercept_message_page;
	}

	if (mshv_partition_encrypted(partition) &&
	    is_ghcb_mapping_available()) {
		ret = hv_call_map_vp_state_page(partition->pt_id,
						args.vp_index,
						HV_VP_STATE_PAGE_GHCB,
						input_vtl_normal,
						&ghcb_page);
		if (ret)
			goto unmap_register_page;
	}

	if (hv_parent_partition()) {
		ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
					stats_pages);
		if (ret)
			goto unmap_ghcb_page;
	}

	vp = kzalloc(sizeof(*vp), GFP_KERNEL);
	if (!vp) {
		ret = -ENOMEM;
		goto unmap_stats_pages;
	}

	vp->vp_partition = mshv_partition_get(partition);
	if (!vp->vp_partition) {
		ret = -EBADF;
		goto free_vp;
	}

	mutex_init(&vp->vp_mutex);
	init_waitqueue_head(&vp->run.vp_suspend_queue);
	atomic64_set(&vp->run.vp_signaled_count, 0);

	vp->vp_index = args.vp_index;
	vp->vp_intercept_msg_page = page_to_virt(intercept_message_page);
	if (!mshv_partition_encrypted(partition))
		vp->vp_register_page = page_to_virt(register_page);

	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
		vp->vp_ghcb_page = page_to_virt(ghcb_page);

	if (hv_parent_partition())
		memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));

	/*
	 * Keep anon_inode_getfd last: it installs fd in the file struct and
	 * thus makes the state accessible in user space.
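	 * Up to this point every failure can still be unwound through the
	 * error labels below; once the fd is installed, the reference is
	 * owned by user space and cleanup happens via mshv_vp_release() and
	 * partition teardown.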
*/ ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp, O_RDWR | O_CLOEXEC); if (ret < 0) goto put_partition; /* already exclusive with the partition mutex for all ioctls */ partition->pt_vp_count++; partition->pt_vp_array[args.vp_index] = vp; return ret; put_partition: mshv_partition_put(partition); free_vp: kfree(vp); unmap_stats_pages: if (hv_parent_partition()) mshv_vp_stats_unmap(partition->pt_id, args.vp_index); unmap_ghcb_page: if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) { hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, HV_VP_STATE_PAGE_GHCB, input_vtl_normal); } unmap_register_page: if (!mshv_partition_encrypted(partition)) { hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, HV_VP_STATE_PAGE_REGISTERS, input_vtl_zero); } unmap_intercept_message_page: hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, input_vtl_zero); destroy_vp: hv_call_delete_vp(partition->pt_id, args.vp_index); return ret; } static int mshv_init_async_handler(struct mshv_partition *partition) { if (completion_done(&partition->async_hypercall)) { pt_err(partition, "Cannot issue async hypercall while another one in progress!\n"); return -EPERM; } reinit_completion(&partition->async_hypercall); return 0; } static void mshv_async_hvcall_handler(void *data, u64 *status) { struct mshv_partition *partition = data; wait_for_completion(&partition->async_hypercall); pt_dbg(partition, "Async hypercall completed!\n"); *status = partition->async_hypercall_status; } static int mshv_partition_region_share(struct mshv_mem_region *region) { u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED; if (region->flags.large_pages) flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; return hv_call_modify_spa_host_access(region->partition->pt_id, region->pages, region->nr_pages, HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE, flags, true); } static int mshv_partition_region_unshare(struct mshv_mem_region *region) { u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE; if (region->flags.large_pages) flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; return hv_call_modify_spa_host_access(region->partition->pt_id, region->pages, region->nr_pages, 0, flags, false); } static int mshv_region_remap_pages(struct mshv_mem_region *region, u32 map_flags, u64 page_offset, u64 page_count) { if (page_offset + page_count > region->nr_pages) return -EINVAL; if (region->flags.large_pages) map_flags |= HV_MAP_GPA_LARGE_PAGE; /* ask the hypervisor to map guest ram */ return hv_call_map_gpa_pages(region->partition->pt_id, region->start_gfn + page_offset, page_count, map_flags, region->pages + page_offset); } static int mshv_region_map(struct mshv_mem_region *region) { u32 map_flags = region->hv_map_flags; return mshv_region_remap_pages(region, map_flags, 0, region->nr_pages); } static void mshv_region_evict_pages(struct mshv_mem_region *region, u64 page_offset, u64 page_count) { if (region->flags.range_pinned) unpin_user_pages(region->pages + page_offset, page_count); memset(region->pages + page_offset, 0, page_count * sizeof(struct page *)); } static void mshv_region_evict(struct mshv_mem_region *region) { mshv_region_evict_pages(region, 0, region->nr_pages); } static int mshv_region_populate_pages(struct mshv_mem_region *region, u64 page_offset, u64 page_count) { u64 done_count, nr_pages; struct page **pages; __u64 userspace_addr; int ret; if (page_offset + page_count > region->nr_pages) return -EINVAL; for (done_count = 0; done_count < page_count; 
done_count += ret) { pages = region->pages + page_offset + done_count; userspace_addr = region->start_uaddr + (page_offset + done_count) * HV_HYP_PAGE_SIZE; nr_pages = min(page_count - done_count, MSHV_PIN_PAGES_BATCH_SIZE); /* * Pinning assuming 4k pages works for large pages too. * All page structs within the large page are returned. * * Pin requests are batched because pin_user_pages_fast * with the FOLL_LONGTERM flag does a large temporary * allocation of contiguous memory. */ if (region->flags.range_pinned) ret = pin_user_pages_fast(userspace_addr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages); else ret = -EOPNOTSUPP; if (ret < 0) goto release_pages; } if (PageHuge(region->pages[page_offset])) region->flags.large_pages = true; return 0; release_pages: mshv_region_evict_pages(region, page_offset, done_count); return ret; } static int mshv_region_populate(struct mshv_mem_region *region) { return mshv_region_populate_pages(region, 0, region->nr_pages); } static struct mshv_mem_region * mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn) { struct mshv_mem_region *region; hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { if (gfn >= region->start_gfn && gfn < region->start_gfn + region->nr_pages) return region; } return NULL; } static struct mshv_mem_region * mshv_partition_region_by_uaddr(struct mshv_partition *partition, u64 uaddr) { struct mshv_mem_region *region; hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { if (uaddr >= region->start_uaddr && uaddr < region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT)) return region; } return NULL; } /* * NB: caller checks and makes sure mem->size is page aligned * Returns: 0 with regionpp updated on success, or -errno */ static int mshv_partition_create_region(struct mshv_partition *partition, struct mshv_user_mem_region *mem, struct mshv_mem_region **regionpp, bool is_mmio) { struct mshv_mem_region *region; u64 nr_pages = HVPFN_DOWN(mem->size); /* Reject overlapping regions */ if (mshv_partition_region_by_gfn(partition, mem->guest_pfn) || mshv_partition_region_by_gfn(partition, mem->guest_pfn + nr_pages - 1) || mshv_partition_region_by_uaddr(partition, mem->userspace_addr) || mshv_partition_region_by_uaddr(partition, mem->userspace_addr + mem->size - 1)) return -EEXIST; region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages); if (!region) return -ENOMEM; region->nr_pages = nr_pages; region->start_gfn = mem->guest_pfn; region->start_uaddr = mem->userspace_addr; region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE; if (mem->flags & BIT(MSHV_SET_MEM_BIT_WRITABLE)) region->hv_map_flags |= HV_MAP_GPA_WRITABLE; if (mem->flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE)) region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE; /* Note: large_pages flag populated when we pin the pages */ if (!is_mmio) region->flags.range_pinned = true; region->partition = partition; *regionpp = region; return 0; } /* * Map guest ram. if snp, make sure to release that from the host first * Side Effects: In case of failure, pages are unpinned when feasible. 
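 *
 * Rough sequence: populate (pin) the pages, then for encrypted (SNP)
 * partitions unshare them from the host, then map them into the guest's GPA
 * space. If the GPA map fails, the pages are shared back to the host first;
 * if even that fails they are deliberately left pinned, because the host no
 * longer maps them and unpinning would not be safe.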
*/ static int mshv_partition_mem_region_map(struct mshv_mem_region *region) { struct mshv_partition *partition = region->partition; int ret; ret = mshv_region_populate(region); if (ret) { pt_err(partition, "Failed to populate memory region: %d\n", ret); goto err_out; } /* * For an SNP partition it is a requirement that for every memory region * that we are going to map for this partition we should make sure that * host access to that region is released. This is ensured by doing an * additional hypercall which will update the SLAT to release host * access to guest memory regions. */ if (mshv_partition_encrypted(partition)) { ret = mshv_partition_region_unshare(region); if (ret) { pt_err(partition, "Failed to unshare memory region (guest_pfn: %llu): %d\n", region->start_gfn, ret); goto evict_region; } } ret = mshv_region_map(region); if (ret && mshv_partition_encrypted(partition)) { int shrc; shrc = mshv_partition_region_share(region); if (!shrc) goto evict_region; pt_err(partition, "Failed to share memory region (guest_pfn: %llu): %d\n", region->start_gfn, shrc); /* * Don't unpin if marking shared failed because pages are no * longer mapped in the host, ie root, anymore. */ goto err_out; } return 0; evict_region: mshv_region_evict(region); err_out: return ret; } /* * This maps two things: guest RAM and for pci passthru mmio space. * * mmio: * - vfio overloads vm_pgoff to store the mmio start pfn/spa. * - Two things need to happen for mapping mmio range: * 1. mapped in the uaddr so VMM can access it. * 2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it. * * This function takes care of the second. The first one is managed by vfio, * and hence is taken care of via vfio_pci_mmap_fault(). */ static long mshv_map_user_memory(struct mshv_partition *partition, struct mshv_user_mem_region mem) { struct mshv_mem_region *region; struct vm_area_struct *vma; bool is_mmio; ulong mmio_pfn; long ret; if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP) || !access_ok((const void *)mem.userspace_addr, mem.size)) return -EINVAL; mmap_read_lock(current->mm); vma = vma_lookup(current->mm, mem.userspace_addr); is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0; mmio_pfn = is_mmio ? 
vma->vm_pgoff : 0;
	mmap_read_unlock(current->mm);

	if (!vma)
		return -EINVAL;

	ret = mshv_partition_create_region(partition, &mem, &region,
					   is_mmio);
	if (ret)
		return ret;

	if (is_mmio)
		ret = hv_call_map_mmio_pages(partition->pt_id, mem.guest_pfn,
					     mmio_pfn, HVPFN_DOWN(mem.size));
	else
		ret = mshv_partition_mem_region_map(region);

	if (ret)
		goto errout;

	/* Install the new region */
	hlist_add_head(&region->hnode, &partition->pt_mem_regions);

	return 0;

errout:
	vfree(region);
	return ret;
}

/* Called for unmapping both the guest ram and the mmio space */
static long mshv_unmap_user_memory(struct mshv_partition *partition,
				   struct mshv_user_mem_region mem)
{
	struct mshv_mem_region *region;
	u32 unmap_flags = 0;

	if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
		return -EINVAL;

	region = mshv_partition_region_by_gfn(partition, mem.guest_pfn);
	if (!region)
		return -EINVAL;

	/* Paranoia check */
	if (region->start_uaddr != mem.userspace_addr ||
	    region->start_gfn != mem.guest_pfn ||
	    region->nr_pages != HVPFN_DOWN(mem.size))
		return -EINVAL;

	hlist_del(&region->hnode);

	if (region->flags.large_pages)
		unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;

	/* ignore unmap failures and continue as process may be exiting */
	hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn,
				region->nr_pages, unmap_flags);

	mshv_region_evict(region);

	vfree(region);
	return 0;
}

static long
mshv_partition_ioctl_set_memory(struct mshv_partition *partition,
				struct mshv_user_mem_region __user *user_mem)
{
	struct mshv_user_mem_region mem;

	if (copy_from_user(&mem, user_mem, sizeof(mem)))
		return -EFAULT;

	if (!mem.size ||
	    !PAGE_ALIGNED(mem.size) ||
	    !PAGE_ALIGNED(mem.userspace_addr) ||
	    (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) ||
	    mshv_field_nonzero(mem, rsvd))
		return -EINVAL;

	if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))
		return mshv_unmap_user_memory(partition, mem);

	return mshv_map_user_memory(partition, mem);
}

static long mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition,
					   void __user *user_args)
{
	struct mshv_user_ioeventfd args;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	return mshv_set_unset_ioeventfd(partition, &args);
}

static long mshv_partition_ioctl_irqfd(struct mshv_partition *partition,
				       void __user *user_args)
{
	struct mshv_user_irqfd args;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	return mshv_set_unset_irqfd(partition, &args);
}

static long
mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition,
					     void __user *user_args)
{
	struct mshv_gpap_access_bitmap args;
	union hv_gpa_page_access_state *states;
	long ret, i;
	union hv_gpa_page_access_state_flags hv_flags = {};
	u8 hv_type_mask;
	ulong bitmap_buf_sz, states_buf_sz;
	int written = 0;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT ||
	    args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT ||
	    mshv_field_nonzero(args, rsvd) || !args.page_count ||
	    !args.bitmap_ptr)
		return -EINVAL;

	if (check_mul_overflow(args.page_count, sizeof(*states),
			       &states_buf_sz))
		return -E2BIG;

	/* Num bytes needed to store bitmap; one bit per page rounded up */
	bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8);

	/* Sanity check */
	if (bitmap_buf_sz > states_buf_sz)
		return -EBADFD;

	switch (args.access_type) {
	case MSHV_GPAP_ACCESS_TYPE_ACCESSED:
		hv_type_mask = 1;
		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
			hv_flags.clear_accessed = 1;
			/* not accessed implies not dirty */
			hv_flags.clear_dirty = 1;
		} else { /* MSHV_GPAP_ACCESS_OP_SET */
			hv_flags.set_accessed = 1;
		}
		break;
	case
MSHV_GPAP_ACCESS_TYPE_DIRTY: hv_type_mask = 2; if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) { hv_flags.clear_dirty = 1; } else { /* MSHV_GPAP_ACCESS_OP_SET */ hv_flags.set_dirty = 1; /* dirty implies accessed */ hv_flags.set_accessed = 1; } break; } states = vzalloc(states_buf_sz); if (!states) return -ENOMEM; ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count, args.gpap_base, hv_flags, &written, states); if (ret) goto free_return; /* * Overwrite states buffer with bitmap - the bits in hv_type_mask * correspond to bitfields in hv_gpa_page_access_state */ for (i = 0; i < written; ++i) __assign_bit(i, (ulong *)states, states[i].as_uint8 & hv_type_mask); /* zero the unused bits in the last byte(s) of the returned bitmap */ for (i = written; i < bitmap_buf_sz * 8; ++i) __clear_bit(i, (ulong *)states); if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz)) ret = -EFAULT; free_return: vfree(states); return ret; } static long mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition, void __user *user_args) { struct mshv_user_irq_entry *entries = NULL; struct mshv_user_irq_table args; long ret; if (copy_from_user(&args, user_args, sizeof(args))) return -EFAULT; if (args.nr > MSHV_MAX_GUEST_IRQS || mshv_field_nonzero(args, rsvd)) return -EINVAL; if (args.nr) { struct mshv_user_irq_table __user *urouting = user_args; entries = vmemdup_user(urouting->entries, array_size(sizeof(*entries), args.nr)); if (IS_ERR(entries)) return PTR_ERR(entries); } ret = mshv_update_routing_table(partition, entries, args.nr); kvfree(entries); return ret; } static long mshv_partition_ioctl_initialize(struct mshv_partition *partition) { long ret; if (partition->pt_initialized) return 0; ret = hv_call_initialize_partition(partition->pt_id); if (ret) goto withdraw_mem; partition->pt_initialized = true; return 0; withdraw_mem: hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); return ret; } static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct mshv_partition *partition = filp->private_data; long ret; void __user *uarg = (void __user *)arg; if (mutex_lock_killable(&partition->pt_mutex)) return -EINTR; switch (ioctl) { case MSHV_INITIALIZE_PARTITION: ret = mshv_partition_ioctl_initialize(partition); break; case MSHV_SET_GUEST_MEMORY: ret = mshv_partition_ioctl_set_memory(partition, uarg); break; case MSHV_CREATE_VP: ret = mshv_partition_ioctl_create_vp(partition, uarg); break; case MSHV_IRQFD: ret = mshv_partition_ioctl_irqfd(partition, uarg); break; case MSHV_IOEVENTFD: ret = mshv_partition_ioctl_ioeventfd(partition, uarg); break; case MSHV_SET_MSI_ROUTING: ret = mshv_partition_ioctl_set_msi_routing(partition, uarg); break; case MSHV_GET_GPAP_ACCESS_BITMAP: ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition, uarg); break; case MSHV_ROOT_HVCALL: ret = mshv_ioctl_passthru_hvcall(partition, true, uarg); break; default: ret = -ENOTTY; } mutex_unlock(&partition->pt_mutex); return ret; } static int disable_vp_dispatch(struct mshv_vp *vp) { int ret; struct hv_register_assoc dispatch_suspend = { .name = HV_REGISTER_DISPATCH_SUSPEND, .value.dispatch_suspend.suspended = 1, }; ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, 1, &dispatch_suspend); if (ret) vp_err(vp, "failed to suspend\n"); return ret; } static int get_vp_signaled_count(struct mshv_vp *vp, u64 *count) { int ret; struct hv_register_assoc root_signal_count = { .name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT, }; ret = 
mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id, 1, &root_signal_count); if (ret) { vp_err(vp, "Failed to get root signal count"); *count = 0; return ret; } *count = root_signal_count.value.reg64; return ret; } static void drain_vp_signals(struct mshv_vp *vp) { u64 hv_signal_count; u64 vp_signal_count; get_vp_signaled_count(vp, &hv_signal_count); vp_signal_count = atomic64_read(&vp->run.vp_signaled_count); /* * There should be at most 1 outstanding notification, but be extra * careful anyway. */ while (hv_signal_count != vp_signal_count) { WARN_ON(hv_signal_count - vp_signal_count != 1); if (wait_event_interruptible(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1)) break; vp->run.kicked_by_hv = 0; vp_signal_count = atomic64_read(&vp->run.vp_signaled_count); } } static void drain_all_vps(const struct mshv_partition *partition) { int i; struct mshv_vp *vp; /* * VPs are reachable from ISR. It is safe to not take the partition * lock because nobody else can enter this function and drop the * partition from the list. */ for (i = 0; i < MSHV_MAX_VPS; i++) { vp = partition->pt_vp_array[i]; if (!vp) continue; /* * Disable dispatching of the VP in the hypervisor. After this * the hypervisor guarantees it won't generate any signals for * the VP and the hypervisor's VP signal count won't change. */ disable_vp_dispatch(vp); drain_vp_signals(vp); } } static void remove_partition(struct mshv_partition *partition) { spin_lock(&mshv_root.pt_ht_lock); hlist_del_rcu(&partition->pt_hnode); spin_unlock(&mshv_root.pt_ht_lock); synchronize_rcu(); } /* * Tear down a partition and remove it from the list. * Partition's refcount must be 0 */ static void destroy_partition(struct mshv_partition *partition) { struct mshv_vp *vp; struct mshv_mem_region *region; int i, ret; struct hlist_node *n; if (refcount_read(&partition->pt_ref_count)) { pt_err(partition, "Attempt to destroy partition but refcount > 0\n"); return; } if (partition->pt_initialized) { /* * We only need to drain signals for root scheduler. This should be * done before removing the partition from the partition list. 
		 */
		if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
			drain_all_vps(partition);

		/* Remove vps */
		for (i = 0; i < MSHV_MAX_VPS; ++i) {
			vp = partition->pt_vp_array[i];
			if (!vp)
				continue;

			if (hv_parent_partition())
				mshv_vp_stats_unmap(partition->pt_id, vp->vp_index);

			if (vp->vp_register_page) {
				(void)hv_call_unmap_vp_state_page(partition->pt_id,
								  vp->vp_index,
								  HV_VP_STATE_PAGE_REGISTERS,
								  input_vtl_zero);
				vp->vp_register_page = NULL;
			}

			(void)hv_call_unmap_vp_state_page(partition->pt_id,
							  vp->vp_index,
							  HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
							  input_vtl_zero);
			vp->vp_intercept_msg_page = NULL;

			if (vp->vp_ghcb_page) {
				(void)hv_call_unmap_vp_state_page(partition->pt_id,
								  vp->vp_index,
								  HV_VP_STATE_PAGE_GHCB,
								  input_vtl_normal);
				vp->vp_ghcb_page = NULL;
			}

			kfree(vp);
			partition->pt_vp_array[i] = NULL;
		}

		/* Deallocates and unmaps everything including vcpus, GPA mappings etc */
		hv_call_finalize_partition(partition->pt_id);

		partition->pt_initialized = false;
	}

	remove_partition(partition);

	/* Remove regions, regain access to the memory and unpin the pages */
	hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
				  hnode) {
		hlist_del(&region->hnode);

		if (mshv_partition_encrypted(partition)) {
			ret = mshv_partition_region_share(region);
			if (ret) {
				pt_err(partition,
				       "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n",
				       ret);
				return;
			}
		}

		mshv_region_evict(region);

		vfree(region);
	}

	/* Withdraw and free all pages we deposited */
	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
	hv_call_delete_partition(partition->pt_id);

	mshv_free_routing_table(partition);

	kfree(partition);
}

struct mshv_partition *mshv_partition_get(struct mshv_partition *partition)
{
	if (refcount_inc_not_zero(&partition->pt_ref_count))
		return partition;
	return NULL;
}

struct mshv_partition *mshv_partition_find(u64 partition_id)
	__must_hold(RCU)
{
	struct mshv_partition *p;

	hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode,
				   partition_id)
		if (p->pt_id == partition_id)
			return p;

	return NULL;
}

void mshv_partition_put(struct mshv_partition *partition)
{
	if (refcount_dec_and_test(&partition->pt_ref_count))
		destroy_partition(partition);
}

static int mshv_partition_release(struct inode *inode, struct file *filp)
{
	struct mshv_partition *partition = filp->private_data;

	mshv_eventfd_release(partition);

	cleanup_srcu_struct(&partition->pt_irq_srcu);

	mshv_partition_put(partition);

	return 0;
}

static int add_partition(struct mshv_partition *partition)
{
	spin_lock(&mshv_root.pt_ht_lock);

	hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode,
		     partition->pt_id);

	spin_unlock(&mshv_root.pt_ht_lock);

	return 0;
}

static long
mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
{
	struct mshv_create_partition args;
	u64 creation_flags;
	struct hv_partition_creation_properties creation_properties = {};
	union hv_partition_isolation_properties isolation_properties = {};
	struct mshv_partition *partition;
	struct file *file;
	int fd;
	long ret;

	if (copy_from_user(&args, user_arg, sizeof(args)))
		return -EFAULT;

	if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
	    args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
		return -EINVAL;

	/* Only support EXO partitions */
	creation_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
			 HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;

	if (args.pt_flags & BIT(MSHV_PT_BIT_LAPIC))
		creation_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
	if (args.pt_flags & BIT(MSHV_PT_BIT_X2APIC))
		creation_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
	if
(args.pt_flags & BIT(MSHV_PT_BIT_GPA_SUPER_PAGES)) creation_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; switch (args.pt_isolation) { case MSHV_PT_ISOLATION_NONE: isolation_properties.isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE; break; } partition = kzalloc(sizeof(*partition), GFP_KERNEL); if (!partition) return -ENOMEM; partition->pt_module_dev = module_dev; partition->isolation_type = isolation_properties.isolation_type; refcount_set(&partition->pt_ref_count, 1); mutex_init(&partition->pt_mutex); mutex_init(&partition->pt_irq_lock); init_completion(&partition->async_hypercall); INIT_HLIST_HEAD(&partition->irq_ack_notifier_list); INIT_HLIST_HEAD(&partition->pt_devices); INIT_HLIST_HEAD(&partition->pt_mem_regions); mshv_eventfd_init(partition); ret = init_srcu_struct(&partition->pt_irq_srcu); if (ret) goto free_partition; ret = hv_call_create_partition(creation_flags, creation_properties, isolation_properties, &partition->pt_id); if (ret) goto cleanup_irq_srcu; ret = add_partition(partition); if (ret) goto delete_partition; ret = mshv_init_async_handler(partition); if (ret) goto remove_partition; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) { ret = fd; goto remove_partition; } file = anon_inode_getfile("mshv_partition", &mshv_partition_fops, partition, O_RDWR); if (IS_ERR(file)) { ret = PTR_ERR(file); goto put_fd; } fd_install(fd, file); return fd; put_fd: put_unused_fd(fd); remove_partition: remove_partition(partition); delete_partition: hv_call_delete_partition(partition->pt_id); cleanup_irq_srcu: cleanup_srcu_struct(&partition->pt_irq_srcu); free_partition: kfree(partition); return ret; } static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct miscdevice *misc = filp->private_data; switch (ioctl) { case MSHV_CREATE_PARTITION: return mshv_ioctl_create_partition((void __user *)arg, misc->this_device); } return -ENOTTY; } static int mshv_dev_open(struct inode *inode, struct file *filp) { return 0; } static int mshv_dev_release(struct inode *inode, struct file *filp) { return 0; } static int mshv_cpuhp_online; static int mshv_root_sched_online; static const char *scheduler_type_to_string(enum hv_scheduler_type type) { switch (type) { case HV_SCHEDULER_TYPE_LP: return "classic scheduler without SMT"; case HV_SCHEDULER_TYPE_LP_SMT: return "classic scheduler with SMT"; case HV_SCHEDULER_TYPE_CORE_SMT: return "core scheduler"; case HV_SCHEDULER_TYPE_ROOT: return "root scheduler"; default: return "unknown scheduler"; }; } /* TODO move this to hv_common.c when needed outside */ static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out) { struct hv_input_get_system_property *input; struct hv_output_get_system_property *output; unsigned long flags; u64 status; local_irq_save(flags); input = *this_cpu_ptr(hyperv_pcpu_input_arg); output = *this_cpu_ptr(hyperv_pcpu_output_arg); memset(input, 0, sizeof(*input)); memset(output, 0, sizeof(*output)); input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE; status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output); if (!hv_result_success(status)) { local_irq_restore(flags); pr_err("%s: %s\n", __func__, hv_result_to_string(status)); return hv_result_to_errno(status); } *out = output->scheduler_type; local_irq_restore(flags); return 0; } /* Retrieve and stash the supported scheduler type */ static int __init mshv_retrieve_scheduler_type(struct device *dev) { int ret; ret = hv_retrieve_scheduler_type(&hv_scheduler_type); if (ret) return ret; dev_info(dev, "Hypervisor using 
%s\n", scheduler_type_to_string(hv_scheduler_type)); switch (hv_scheduler_type) { case HV_SCHEDULER_TYPE_CORE_SMT: case HV_SCHEDULER_TYPE_LP_SMT: case HV_SCHEDULER_TYPE_ROOT: case HV_SCHEDULER_TYPE_LP: /* Supported scheduler, nothing to do */ break; default: dev_err(dev, "unsupported scheduler 0x%x, bailing.\n", hv_scheduler_type); return -EOPNOTSUPP; } return 0; } static int mshv_root_scheduler_init(unsigned int cpu) { void **inputarg, **outputarg, *p; inputarg = (void **)this_cpu_ptr(root_scheduler_input); outputarg = (void **)this_cpu_ptr(root_scheduler_output); /* Allocate two consecutive pages. One for input, one for output. */ p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL); if (!p) return -ENOMEM; *inputarg = p; *outputarg = (char *)p + HV_HYP_PAGE_SIZE; return 0; } static int mshv_root_scheduler_cleanup(unsigned int cpu) { void *p, **inputarg, **outputarg; inputarg = (void **)this_cpu_ptr(root_scheduler_input); outputarg = (void **)this_cpu_ptr(root_scheduler_output); p = *inputarg; *inputarg = NULL; *outputarg = NULL; kfree(p); return 0; } /* Must be called after retrieving the scheduler type */ static int root_scheduler_init(struct device *dev) { int ret; if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) return 0; root_scheduler_input = alloc_percpu(void *); root_scheduler_output = alloc_percpu(void *); if (!root_scheduler_input || !root_scheduler_output) { dev_err(dev, "Failed to allocate root scheduler buffers\n"); ret = -ENOMEM; goto out; } ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched", mshv_root_scheduler_init, mshv_root_scheduler_cleanup); if (ret < 0) { dev_err(dev, "Failed to setup root scheduler state: %i\n", ret); goto out; } mshv_root_sched_online = ret; return 0; out: free_percpu(root_scheduler_input); free_percpu(root_scheduler_output); return ret; } static void root_scheduler_deinit(void) { if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) return; cpuhp_remove_state(mshv_root_sched_online); free_percpu(root_scheduler_input); free_percpu(root_scheduler_output); } static int mshv_reboot_notify(struct notifier_block *nb, unsigned long code, void *unused) { cpuhp_remove_state(mshv_cpuhp_online); return 0; } struct notifier_block mshv_reboot_nb = { .notifier_call = mshv_reboot_notify, }; static void mshv_root_partition_exit(void) { unregister_reboot_notifier(&mshv_reboot_nb); root_scheduler_deinit(); } static int __init mshv_root_partition_init(struct device *dev) { int err; if (mshv_retrieve_scheduler_type(dev)) return -ENODEV; err = root_scheduler_init(dev); if (err) return err; err = register_reboot_notifier(&mshv_reboot_nb); if (err) goto root_sched_deinit; return 0; root_sched_deinit: root_scheduler_deinit(); return err; } static int __init mshv_parent_partition_init(void) { int ret; struct device *dev; union hv_hypervisor_version_info version_info; if (!hv_root_partition() || is_kdump_kernel()) return -ENODEV; if (hv_get_hypervisor_version(&version_info)) return -ENODEV; ret = misc_register(&mshv_dev); if (ret) return ret; dev = mshv_dev.this_device; if (version_info.build_number < MSHV_HV_MIN_VERSION || version_info.build_number > MSHV_HV_MAX_VERSION) { dev_err(dev, "Running on unvalidated Hyper-V version\n"); dev_err(dev, "Versions: current: %u min: %u max: %u\n", version_info.build_number, MSHV_HV_MIN_VERSION, MSHV_HV_MAX_VERSION); } mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages); if (!mshv_root.synic_pages) { dev_err(dev, "Failed to allocate percpu synic page\n"); ret = -ENOMEM; goto device_deregister; } ret = 
cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic", mshv_synic_init, mshv_synic_cleanup); if (ret < 0) { dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret); goto free_synic_pages; } mshv_cpuhp_online = ret; ret = mshv_root_partition_init(dev); if (ret) goto remove_cpu_state; ret = mshv_irqfd_wq_init(); if (ret) goto exit_partition; spin_lock_init(&mshv_root.pt_ht_lock); hash_init(mshv_root.pt_htable); hv_setup_mshv_handler(mshv_isr); return 0; exit_partition: if (hv_root_partition()) mshv_root_partition_exit(); remove_cpu_state: cpuhp_remove_state(mshv_cpuhp_online); free_synic_pages: free_percpu(mshv_root.synic_pages); device_deregister: misc_deregister(&mshv_dev); return ret; } static void __exit mshv_parent_partition_exit(void) { hv_setup_mshv_handler(NULL); mshv_port_table_fini(); misc_deregister(&mshv_dev); mshv_irqfd_wq_cleanup(); if (hv_root_partition()) mshv_root_partition_exit(); cpuhp_remove_state(mshv_cpuhp_online); free_percpu(mshv_root.synic_pages); } module_init(mshv_parent_partition_init); module_exit(mshv_parent_partition_exit);