/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Benchmarking code execution time inside the kernel
 *
 * Copyright (C) 2014, Red Hat, Inc., Jesper Dangaard Brouer
 *  for licensing details see kernel-base/COPYING
 */
#ifndef _LINUX_TIME_BENCH_H
#define _LINUX_TIME_BENCH_H

/* Main structure used for recording a benchmark run */
struct time_bench_record {
	uint32_t version_abi;
	uint32_t loops;		/* Requested loop invocations */
	uint32_t step;		/* option for e.g. bulk invocations */

	uint32_t flags;		/* Measurement types enabled */
#define TIME_BENCH_LOOP		BIT(0)
#define TIME_BENCH_TSC		BIT(1)
#define TIME_BENCH_WALLCLOCK	BIT(2)
#define TIME_BENCH_PMU		BIT(3)

	uint32_t cpu;		/* Used when embedded in time_bench_cpu */

	/* Records */
	uint64_t invoked_cnt;	/* Returned actual invocations */
	uint64_t tsc_start;
	uint64_t tsc_stop;
	struct timespec64 ts_start;
	struct timespec64 ts_stop;
	/* PMU counters for instructions and cycles
	 * instructions counter including pipelined instructions
	 */
	uint64_t pmc_inst_start;
	uint64_t pmc_inst_stop;
	/* CPU unhalted clock counter */
	uint64_t pmc_clk_start;
	uint64_t pmc_clk_stop;

	/* Result records */
	uint64_t tsc_interval;
	uint64_t time_start, time_stop, time_interval; /* in nanosec */
	uint64_t pmc_inst, pmc_clk;

	/* Derived result records */
	uint64_t tsc_cycles; // +decimal?
	uint64_t ns_per_call_quotient, ns_per_call_decimal;
	uint64_t time_sec;
	uint32_t time_sec_remainder;
	uint64_t pmc_ipc_quotient, pmc_ipc_decimal; /* inst per cycle */
};

/* For synchronizing parallel CPUs to run concurrently */
struct time_bench_sync {
	atomic_t nr_tests_running;
	struct completion start_event;
};

/* Keep track of CPUs executing our bench function.
 *
 * Embed a time_bench_record for storing info per cpu
 */
struct time_bench_cpu {
	struct time_bench_record rec;
	struct time_bench_sync *sync; /* back ptr */
	struct task_struct *task;
	/* "data" opaque could have been placed in time_bench_sync,
	 * but to avoid any false sharing, place it per CPU
	 */
	void *data;
	/* Support masking out some CPUs, mark if it ran */
	bool did_bench_run;
	/* int cpu; // note CPU stored in time_bench_record */
	int (*bench_func)(struct time_bench_record *record, void *data);
};

/*
 * The TSC assembler code below is not compatible with other archs, and
 * can also fail on guests if cpu-flags are not correct.
 *
 * The way TSC reading is used here, over many iterations, does not
 * require as high accuracy as described below (in Intel Doc #324264).
 *
 * Consider changing to use get_cycles() (#include <linux/timex.h>).
 */
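/*
 * Hypothetical sketch (not part of this API) of what a get_cycles() based
 * reader could look like, shown only to illustrate the portable alternative
 * mentioned above.  get_cycles() returns the arch cycle counter (cycles_t);
 * the function name below is an assumption:
 *
 *	static __always_inline uint64_t cycles_read_clock(void)
 *	{
 *		return (uint64_t)get_cycles();
 *	}
 */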
/** TSC (Time-Stamp Counter) based **
 * Recommended reading, to understand the details of reading the TSC accurately:
 *  Intel Doc #324264, "How to Benchmark Code Execution Times on Intel"
 *
 * Consider getting exclusive ownership of the CPU by using:
 *   unsigned long flags;
 *   preempt_disable();
 *   raw_local_irq_save(flags);
 *   _your_code_
 *   raw_local_irq_restore(flags);
 *   preempt_enable();
 *
 * Clobbered registers: "%rax", "%rbx", "%rcx", "%rdx"
 *  RDTSC only changes "%rax" and "%rdx", but
 *  CPUID clears the high 32-bits of all (rax/rbx/rcx/rdx)
 */
static __always_inline uint64_t tsc_start_clock(void)
{
	/* See: Intel Doc #324264 */
	unsigned int hi, lo;

	asm volatile("CPUID\n\t"
		     "RDTSC\n\t"
		     "mov %%edx, %0\n\t"
		     "mov %%eax, %1\n\t"
		     : "=r"(hi), "=r"(lo)::"%rax", "%rbx", "%rcx", "%rdx");
	//FIXME: on 32bit use clobbered %eax + %edx
	return ((uint64_t)lo) | (((uint64_t)hi) << 32);
}

static __always_inline uint64_t tsc_stop_clock(void)
{
	/* See: Intel Doc #324264 */
	unsigned int hi, lo;

	asm volatile("RDTSCP\n\t"
		     "mov %%edx, %0\n\t"
		     "mov %%eax, %1\n\t"
		     "CPUID\n\t"
		     : "=r"(hi), "=r"(lo)::"%rax", "%rbx", "%rcx", "%rdx");
	return ((uint64_t)lo) | (((uint64_t)hi) << 32);
}

/** Wall-clock based **
 *
 * use: getnstimeofday()
 *   getnstimeofday(&rec->ts_start);
 *   getnstimeofday(&rec->ts_stop);
 *
 * API changed, see: Documentation/core-api/timekeeping.rst
 *  https://www.kernel.org/doc/html/latest/core-api/timekeeping.html#c.getnstimeofday
 *
 * We should instead use ktime_get_real_ts64(), which is a direct
 *  replacement, but consider using monotonic time (ktime_get_ts64())
 *  and/or a ktime_t based interface (ktime_get()/ktime_get_real()).
 */

/** PMU (Performance Monitor Unit) based **
 *
 * Needed for calculating: Instructions Per Cycle (IPC)
 * - The IPC number tells how efficiently the CPU pipeline was utilized
 */
//lookup: perf_event_create_kernel_counter()

bool time_bench_PMU_config(bool enable);

/* Raw reading via rdpmc() using fixed counters
 *
 * From: https://github.com/andikleen/simple-pmu
 */
enum {
	FIXED_SELECT = (1U << 30), /* == 0x40000000 */
	FIXED_INST_RETIRED_ANY = 0,
	FIXED_CPU_CLK_UNHALTED_CORE = 1,
	FIXED_CPU_CLK_UNHALTED_REF = 2,
};

static __always_inline unsigned long long p_rdpmc(unsigned int in)
{
	unsigned int d, a;

	asm volatile("rdpmc" : "=d"(d), "=a"(a) : "c"(in) : "memory");
	return ((unsigned long long)d << 32) | a;
}

/* These PMU counters need to be enabled, but I don't have the
 * configuration code implemented.  My current hack is running:
 *  sudo perf stat -e cycles:k -e instructions:k insmod lib/ring_queue_test.ko
 */

/* Reading all pipelined instructions */
static __always_inline unsigned long long pmc_inst(void)
{
	return p_rdpmc(FIXED_SELECT | FIXED_INST_RETIRED_ANY);
}

/* Reading CPU clock cycles */
static __always_inline unsigned long long pmc_clk(void)
{
	return p_rdpmc(FIXED_SELECT | FIXED_CPU_CLK_UNHALTED_CORE);
}
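/*
 * Sketch of how the fixed-counter deltas above feed the derived IPC fields
 * in struct time_bench_record.  The helper name is hypothetical and not
 * part of this API; the 64-bit division is kept plain for brevity (32-bit
 * builds would need div_u64()):
 *
 *	static inline void example_calc_ipc(struct time_bench_record *rec)
 *	{
 *		uint64_t inst = rec->pmc_inst_stop - rec->pmc_inst_start;
 *		uint64_t clk  = rec->pmc_clk_stop  - rec->pmc_clk_start;
 *
 *		if (!clk)
 *			return;
 *		rec->pmc_ipc_quotient = inst / clk;
 *		rec->pmc_ipc_decimal  = ((inst * 100) / clk) % 100;
 *	}
 */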
/* Raw reading via MSR rdmsr() is likely wrong
 * FIXME: How can I know which raw MSR registers are conf for what?
 */
#define MSR_IA32_PCM0 0x400000C1 /* PERFCTR0 */
#define MSR_IA32_PCM1 0x400000C2 /* PERFCTR1 */
#define MSR_IA32_PCM2 0x400000C3
static inline uint64_t msr_inst(unsigned long long *msr_result)
{
	return rdmsrq_safe(MSR_IA32_PCM0, msr_result);
}

/** Generic functions **
 */
bool time_bench_loop(uint32_t loops, int step, char *txt, void *data,
		     int (*func)(struct time_bench_record *rec, void *data));
bool time_bench_calc_stats(struct time_bench_record *rec);

void time_bench_run_concurrent(uint32_t loops, int step, void *data,
			       const struct cpumask *mask, /* Support masking out some CPUs */
			       struct time_bench_sync *sync,
			       struct time_bench_cpu *cpu_tasks,
			       int (*func)(struct time_bench_record *record,
					   void *data));
void time_bench_print_stats_cpumask(const char *desc,
				    struct time_bench_cpu *cpu_tasks,
				    const struct cpumask *mask);

//FIXME: use rec->flags to select measurement, should be MACRO
static __always_inline void time_bench_start(struct time_bench_record *rec)
{
	//getnstimeofday(&rec->ts_start);
	ktime_get_real_ts64(&rec->ts_start);
	if (rec->flags & TIME_BENCH_PMU) {
		rec->pmc_inst_start = pmc_inst();
		rec->pmc_clk_start = pmc_clk();
	}
	rec->tsc_start = tsc_start_clock();
}

static __always_inline void time_bench_stop(struct time_bench_record *rec,
					    uint64_t invoked_cnt)
{
	rec->tsc_stop = tsc_stop_clock();
	if (rec->flags & TIME_BENCH_PMU) {
		rec->pmc_inst_stop = pmc_inst();
		rec->pmc_clk_stop = pmc_clk();
	}
	//getnstimeofday(&rec->ts_stop);
	ktime_get_real_ts64(&rec->ts_stop);
	rec->invoked_cnt = invoked_cnt;
}

#endif /* _LINUX_TIME_BENCH_H */
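/*
 * Usage sketch (illustrative only): a minimal bench_func callback and a
 * call to time_bench_loop().  The measured body (an atomic increment) and
 * all names below are assumptions, not part of this header:
 *
 *	static int example_bench(struct time_bench_record *rec, void *data)
 *	{
 *		atomic_t cnt;
 *		uint32_t i;
 *
 *		atomic_set(&cnt, 0);
 *		time_bench_start(rec);
 *		for (i = 0; i < rec->loops; i++)
 *			atomic_inc(&cnt);
 *		time_bench_stop(rec, i);
 *		return i;
 *	}
 *
 *	Invoked from e.g. a module_init() function:
 *		time_bench_loop(1000000, 0, "example_atomic_inc", NULL,
 *				example_bench);
 */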