// SPDX-License-Identifier: GPL-2.0-or-later /* * Virtual PTP 1588 clock for use with LM-safe VMclock device. * * Copyright © 2024 Amazon.com, Inc. or its affiliates. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef CONFIG_X86 #include #include #endif #ifdef CONFIG_KVM_GUEST #define SUPPORT_KVMCLOCK #endif static DEFINE_IDA(vmclock_ida); ACPI_MODULE_NAME("vmclock"); struct vmclock_state { struct resource res; struct vmclock_abi *clk; struct miscdevice miscdev; struct ptp_clock_info ptp_clock_info; struct ptp_clock *ptp_clock; enum clocksource_ids cs_id, sys_cs_id; int index; char *name; }; #define VMCLOCK_MAX_WAIT ms_to_ktime(100) /* Require at least the flags field to be present. All else can be optional. */ #define VMCLOCK_MIN_SIZE offsetof(struct vmclock_abi, pad) #define VMCLOCK_FIELD_PRESENT(_c, _f) \ (le32_to_cpu((_c)->size) >= (offsetof(struct vmclock_abi, _f) + \ sizeof((_c)->_f))) /* * Multiply a 64-bit count by a 64-bit tick 'period' in units of seconds >> 64 * and add the fractional second part of the reference time. * * The result is a 128-bit value, the top 64 bits of which are seconds, and * the low 64 bits are (seconds >> 64). */ static uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta, uint64_t period, uint8_t shift, uint64_t frac_sec) { unsigned __int128 res = (unsigned __int128)delta * period; res >>= shift; res += frac_sec; *res_hi = res >> 64; return (uint64_t)res; } static bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec) { if (likely(clk->time_type == VMCLOCK_TIME_UTC)) return true; if (clk->time_type == VMCLOCK_TIME_TAI && (le64_to_cpu(clk->flags) & VMCLOCK_FLAG_TAI_OFFSET_VALID)) { if (sec) *sec += (int16_t)le16_to_cpu(clk->tai_offset_sec); return true; } return false; } static int vmclock_get_crosststamp(struct vmclock_state *st, struct ptp_system_timestamp *sts, struct system_counterval_t *system_counter, struct timespec64 *tspec) { ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT); struct system_time_snapshot systime_snapshot; uint64_t cycle, delta, seq, frac_sec; #ifdef CONFIG_X86 /* * We'd expect the hypervisor to know this and to report the clock * status as VMCLOCK_STATUS_UNRELIABLE. But be paranoid. */ if (check_tsc_unstable()) return -EINVAL; #endif while (1) { seq = le32_to_cpu(st->clk->seq_count) & ~1ULL; /* * This pairs with a write barrier in the hypervisor * which populates this structure. */ virt_rmb(); if (st->clk->clock_status == VMCLOCK_STATUS_UNRELIABLE) return -EINVAL; /* * When invoked for gettimex64(), fill in the pre/post system * times. The simple case is when system time is based on the * same counter as st->cs_id, in which case all three times * will be derived from the *same* counter value. * * If the system isn't using the same counter, then the value * from ktime_get_snapshot() will still be used as pre_ts, and * ptp_read_system_postts() is called to populate postts after * calling get_cycles(). * * The conversion to timespec64 happens further down, outside * the seq_count loop. */ if (sts) { ktime_get_snapshot(&systime_snapshot); if (systime_snapshot.cs_id == st->cs_id) { cycle = systime_snapshot.cycles; } else { cycle = get_cycles(); ptp_read_system_postts(sts); } } else { cycle = get_cycles(); } delta = cycle - le64_to_cpu(st->clk->counter_value); frac_sec = mul_u64_u64_shr_add_u64(&tspec->tv_sec, delta, le64_to_cpu(st->clk->counter_period_frac_sec), st->clk->counter_period_shift, le64_to_cpu(st->clk->time_frac_sec)); tspec->tv_nsec = mul_u64_u64_shr(frac_sec, NSEC_PER_SEC, 64); tspec->tv_sec += le64_to_cpu(st->clk->time_sec); if (!tai_adjust(st->clk, &tspec->tv_sec)) return -EINVAL; /* * This pairs with a write barrier in the hypervisor * which populates this structure. */ virt_rmb(); if (seq == le32_to_cpu(st->clk->seq_count)) break; if (ktime_after(ktime_get(), deadline)) return -ETIMEDOUT; } if (system_counter) { system_counter->cycles = cycle; system_counter->cs_id = st->cs_id; } if (sts) { sts->pre_ts = ktime_to_timespec64(systime_snapshot.real); if (systime_snapshot.cs_id == st->cs_id) sts->post_ts = sts->pre_ts; } return 0; } #ifdef SUPPORT_KVMCLOCK /* * In the case where the system is using the KVM clock for timekeeping, convert * the TSC value into a KVM clock time in order to return a paired reading that * get_device_system_crosststamp() can cope with. */ static int vmclock_get_crosststamp_kvmclock(struct vmclock_state *st, struct ptp_system_timestamp *sts, struct system_counterval_t *system_counter, struct timespec64 *tspec) { struct pvclock_vcpu_time_info *pvti = this_cpu_pvti(); unsigned int pvti_ver; int ret; preempt_disable_notrace(); do { pvti_ver = pvclock_read_begin(pvti); ret = vmclock_get_crosststamp(st, sts, system_counter, tspec); if (ret) break; system_counter->cycles = __pvclock_read_cycles(pvti, system_counter->cycles); system_counter->cs_id = CSID_X86_KVM_CLK; /* * This retry should never really happen; if the TSC is * stable and reliable enough across vCPUS that it is sane * for the hypervisor to expose a VMCLOCK device which uses * it as the reference counter, then the KVM clock sohuld be * in 'master clock mode' and basically never changed. But * the KVM clock is a fickle and often broken thing, so do * it "properly" just in case. */ } while (pvclock_read_retry(pvti, pvti_ver)); preempt_enable_notrace(); return ret; } #endif static int ptp_vmclock_get_time_fn(ktime_t *device_time, struct system_counterval_t *system_counter, void *ctx) { struct vmclock_state *st = ctx; struct timespec64 tspec; int ret; #ifdef SUPPORT_KVMCLOCK if (READ_ONCE(st->sys_cs_id) == CSID_X86_KVM_CLK) ret = vmclock_get_crosststamp_kvmclock(st, NULL, system_counter, &tspec); else #endif ret = vmclock_get_crosststamp(st, NULL, system_counter, &tspec); if (!ret) *device_time = timespec64_to_ktime(tspec); return ret; } static int ptp_vmclock_getcrosststamp(struct ptp_clock_info *ptp, struct system_device_crosststamp *xtstamp) { struct vmclock_state *st = container_of(ptp, struct vmclock_state, ptp_clock_info); int ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn, st, NULL, xtstamp); #ifdef SUPPORT_KVMCLOCK /* * On x86, the KVM clock may be used for the system time. We can * actually convert a TSC reading to that, and return a paired * timestamp that get_device_system_crosststamp() *can* handle. */ if (ret == -ENODEV) { struct system_time_snapshot systime_snapshot; ktime_get_snapshot(&systime_snapshot); if (systime_snapshot.cs_id == CSID_X86_TSC || systime_snapshot.cs_id == CSID_X86_KVM_CLK) { WRITE_ONCE(st->sys_cs_id, systime_snapshot.cs_id); ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn, st, NULL, xtstamp); } } #endif return ret; } /* * PTP clock operations */ static int ptp_vmclock_adjfine(struct ptp_clock_info *ptp, long delta) { return -EOPNOTSUPP; } static int ptp_vmclock_adjtime(struct ptp_clock_info *ptp, s64 delta) { return -EOPNOTSUPP; } static int ptp_vmclock_settime(struct ptp_clock_info *ptp, const struct timespec64 *ts) { return -EOPNOTSUPP; } static int ptp_vmclock_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts, struct ptp_system_timestamp *sts) { struct vmclock_state *st = container_of(ptp, struct vmclock_state, ptp_clock_info); return vmclock_get_crosststamp(st, sts, NULL, ts); } static int ptp_vmclock_enable(struct ptp_clock_info *ptp, struct ptp_clock_request *rq, int on) { return -EOPNOTSUPP; } static const struct ptp_clock_info ptp_vmclock_info = { .owner = THIS_MODULE, .max_adj = 0, .n_ext_ts = 0, .n_pins = 0, .pps = 0, .adjfine = ptp_vmclock_adjfine, .adjtime = ptp_vmclock_adjtime, .gettimex64 = ptp_vmclock_gettimex, .settime64 = ptp_vmclock_settime, .enable = ptp_vmclock_enable, .getcrosststamp = ptp_vmclock_getcrosststamp, }; static struct ptp_clock *vmclock_ptp_register(struct device *dev, struct vmclock_state *st) { enum clocksource_ids cs_id; if (IS_ENABLED(CONFIG_ARM64) && st->clk->counter_id == VMCLOCK_COUNTER_ARM_VCNT) { /* Can we check it's the virtual counter? */ cs_id = CSID_ARM_ARCH_COUNTER; } else if (IS_ENABLED(CONFIG_X86) && st->clk->counter_id == VMCLOCK_COUNTER_X86_TSC) { cs_id = CSID_X86_TSC; } else { return NULL; } /* Only UTC, or TAI with offset */ if (!tai_adjust(st->clk, NULL)) { dev_info(dev, "vmclock does not provide unambiguous UTC\n"); return NULL; } st->sys_cs_id = cs_id; st->cs_id = cs_id; st->ptp_clock_info = ptp_vmclock_info; strscpy(st->ptp_clock_info.name, st->name); return ptp_clock_register(&st->ptp_clock_info, dev); } static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma) { struct vmclock_state *st = container_of(fp->private_data, struct vmclock_state, miscdev); if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ) return -EROFS; if (vma->vm_end - vma->vm_start != PAGE_SIZE || vma->vm_pgoff) return -EINVAL; if (io_remap_pfn_range(vma, vma->vm_start, st->res.start >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; return 0; } static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos) { struct vmclock_state *st = container_of(fp->private_data, struct vmclock_state, miscdev); ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT); size_t max_count; uint32_t seq; if (*ppos >= PAGE_SIZE) return 0; max_count = PAGE_SIZE - *ppos; if (count > max_count) count = max_count; while (1) { seq = le32_to_cpu(st->clk->seq_count) & ~1U; /* Pairs with hypervisor wmb */ virt_rmb(); if (copy_to_user(buf, ((char *)st->clk) + *ppos, count)) return -EFAULT; /* Pairs with hypervisor wmb */ virt_rmb(); if (seq == le32_to_cpu(st->clk->seq_count)) break; if (ktime_after(ktime_get(), deadline)) return -ETIMEDOUT; } *ppos += count; return count; } static const struct file_operations vmclock_miscdev_fops = { .mmap = vmclock_miscdev_mmap, .read = vmclock_miscdev_read, }; /* module operations */ static void vmclock_remove(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct vmclock_state *st = dev_get_drvdata(dev); if (st->ptp_clock) ptp_clock_unregister(st->ptp_clock); if (st->miscdev.minor != MISC_DYNAMIC_MINOR) misc_deregister(&st->miscdev); } static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data) { struct vmclock_state *st = data; struct resource_win win; struct resource *res = &win.res; if (ares->type == ACPI_RESOURCE_TYPE_END_TAG) return AE_OK; /* There can be only one */ if (resource_type(&st->res) == IORESOURCE_MEM) return AE_ERROR; if (acpi_dev_resource_memory(ares, res) || acpi_dev_resource_address_space(ares, &win)) { if (resource_type(res) != IORESOURCE_MEM || resource_size(res) < sizeof(st->clk)) return AE_ERROR; st->res = *res; return AE_OK; } return AE_ERROR; } static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) { struct acpi_device *adev = ACPI_COMPANION(dev); acpi_status status; /* * This should never happen as this function is only called when * has_acpi_companion(dev) is true, but the logic is sufficiently * complex that Coverity can't see the tautology. */ if (!adev) return -ENODEV; status = acpi_walk_resources(adev->handle, METHOD_NAME__CRS, vmclock_acpi_resources, st); if (ACPI_FAILURE(status) || resource_type(&st->res) != IORESOURCE_MEM) { dev_err(dev, "failed to get resources\n"); return -ENODEV; } return 0; } static void vmclock_put_idx(void *data) { struct vmclock_state *st = data; ida_free(&vmclock_ida, st->index); } static int vmclock_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct vmclock_state *st; int ret; st = devm_kzalloc(dev, sizeof(*st), GFP_KERNEL); if (!st) return -ENOMEM; if (has_acpi_companion(dev)) ret = vmclock_probe_acpi(dev, st); else ret = -EINVAL; /* Only ACPI for now */ if (ret) { dev_info(dev, "Failed to obtain physical address: %d\n", ret); goto out; } if (resource_size(&st->res) < VMCLOCK_MIN_SIZE) { dev_info(dev, "Region too small (0x%llx)\n", resource_size(&st->res)); ret = -EINVAL; goto out; } st->clk = devm_memremap(dev, st->res.start, resource_size(&st->res), MEMREMAP_WB | MEMREMAP_DEC); if (IS_ERR(st->clk)) { ret = PTR_ERR(st->clk); dev_info(dev, "failed to map shared memory\n"); st->clk = NULL; goto out; } if (le32_to_cpu(st->clk->magic) != VMCLOCK_MAGIC || le32_to_cpu(st->clk->size) > resource_size(&st->res) || le16_to_cpu(st->clk->version) != 1) { dev_info(dev, "vmclock magic fields invalid\n"); ret = -EINVAL; goto out; } ret = ida_alloc(&vmclock_ida, GFP_KERNEL); if (ret < 0) goto out; st->index = ret; ret = devm_add_action_or_reset(&pdev->dev, vmclock_put_idx, st); if (ret) goto out; st->name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "vmclock%d", st->index); if (!st->name) { ret = -ENOMEM; goto out; } /* * If the structure is big enough, it can be mapped to userspace. * Theoretically a guest OS even using larger pages could still * use 4KiB PTEs to map smaller MMIO regions like this, but let's * cross that bridge if/when we come to it. */ if (le32_to_cpu(st->clk->size) >= PAGE_SIZE) { st->miscdev.minor = MISC_DYNAMIC_MINOR; st->miscdev.fops = &vmclock_miscdev_fops; st->miscdev.name = st->name; ret = misc_register(&st->miscdev); if (ret) goto out; } /* If there is valid clock information, register a PTP clock */ if (VMCLOCK_FIELD_PRESENT(st->clk, time_frac_sec)) { /* Can return a silent NULL, or an error. */ st->ptp_clock = vmclock_ptp_register(dev, st); if (IS_ERR(st->ptp_clock)) { ret = PTR_ERR(st->ptp_clock); st->ptp_clock = NULL; vmclock_remove(pdev); goto out; } } if (!st->miscdev.minor && !st->ptp_clock) { /* Neither miscdev nor PTP registered */ dev_info(dev, "vmclock: Neither miscdev nor PTP available; not registering\n"); ret = -ENODEV; goto out; } dev_info(dev, "%s: registered %s%s%s\n", st->name, st->miscdev.minor ? "miscdev" : "", (st->miscdev.minor && st->ptp_clock) ? ", " : "", st->ptp_clock ? "PTP" : ""); dev_set_drvdata(dev, st); out: return ret; } static const struct acpi_device_id vmclock_acpi_ids[] = { { "AMZNC10C", 0 }, {} }; MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids); static struct platform_driver vmclock_platform_driver = { .probe = vmclock_probe, .remove = vmclock_remove, .driver = { .name = "vmclock", .acpi_match_table = vmclock_acpi_ids, }, }; module_platform_driver(vmclock_platform_driver) MODULE_AUTHOR("David Woodhouse "); MODULE_DESCRIPTION("PTP clock using VMCLOCK"); MODULE_LICENSE("GPL");