// SPDX-License-Identifier: GPL-2.0-only
/*
 * Benchmarking code execution time inside the kernel
 *
 * Copyright (C) 2014, Red Hat, Inc., Jesper Dangaard Brouer
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/time.h>
#include <linux/perf_event.h> /* perf_event_create_kernel_counter() */

/* For concurrency testing */
#include <linux/completion.h>
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>

#include "time_bench.h"

static int verbose = 1;

/** TSC (Time-Stamp Counter) based **
 * See: linux/time_bench.h
 *  tsc_start_clock() and tsc_stop_clock()
 */

/** Wall-clock based **
 */

/** PMU (Performance Monitor Unit) based **
 */
#define PERF_FORMAT                                                            \
	(PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | \
	 PERF_FORMAT_TOTAL_TIME_RUNNING)

struct raw_perf_event {
	uint64_t config;  /* event */
	uint64_t config1; /* umask */
	struct perf_event *save;
	char *desc;
};

/* If HT is enabled, a maximum of 4 events (5 if one is instructions
 * retired) can be specified; if HT is disabled, a maximum of 8 (9 if
 * one is instructions retired) can be specified.
 *
 * From Table 19-1. Architectural Performance Events
 * Intel Architectures Software Developer's Manual Volume 3: System
 * Programming Guide
 */
struct raw_perf_event perf_events[] = {
	{ 0x3c, 0x00, NULL, "Unhalted CPU Cycles" },
	{ 0xc0, 0x00, NULL, "Instruction Retired" }
};

#define NUM_EVTS (ARRAY_SIZE(perf_events))

/* WARNING: PMU config is currently broken!
 */
bool time_bench_PMU_config(bool enable)
{
	int i;
	struct perf_event_attr perf_conf;
	struct perf_event *perf_event;
	int cpu;

	preempt_disable();
	cpu = smp_processor_id();
	pr_info("DEBUG: cpu:%d\n", cpu);
	preempt_enable();

	memset(&perf_conf, 0, sizeof(struct perf_event_attr));
	perf_conf.type = PERF_TYPE_RAW;
	perf_conf.size = sizeof(struct perf_event_attr);
	perf_conf.read_format = PERF_FORMAT;
	perf_conf.pinned = 1;
	perf_conf.exclude_user = 1; /* No userspace events */
	perf_conf.exclude_kernel = 0; /* Only kernel events */

	for (i = 0; i < NUM_EVTS; i++) {
		perf_conf.disabled = enable;
		//perf_conf.disabled = (i == 0) ? 1 : 0;
		perf_conf.config = perf_events[i].config;
		perf_conf.config1 = perf_events[i].config1;
		if (verbose)
			pr_info("%s() enable PMU counter: %s\n",
				__func__, perf_events[i].desc);
		perf_event = perf_event_create_kernel_counter(&perf_conf, cpu,
							      NULL /* task */,
							      NULL /* overflow_handler */,
							      NULL /* context */);
		/* Note: the create call returns ERR_PTR() on failure, not NULL */
		if (!IS_ERR_OR_NULL(perf_event)) {
			perf_events[i].save = perf_event;
			pr_info("%s():DEBUG perf_event success\n", __func__);

			perf_event_enable(perf_event);
		} else {
			pr_info("%s():DEBUG perf_event create failed (%ld)\n",
				__func__, PTR_ERR(perf_event));
		}
	}

	return true;
}
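
/*
 * Reading back the counters is not implemented in this file.  A minimal
 * sketch, assuming the counters created above are later sampled from kernel
 * code via perf_event_read_value() (the loop below is hypothetical and not
 * part of the original source):
 *
 *	u64 enabled, running, count;
 *	int i;
 *
 *	for (i = 0; i < NUM_EVTS; i++) {
 *		if (!perf_events[i].save)
 *			continue;
 *		count = perf_event_read_value(perf_events[i].save,
 *					      &enabled, &running);
 *		pr_info("%s: %llu\n", perf_events[i].desc, count);
 *	}
 */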

/** Generic functions **
 */

/* Calculate stats, store results in record */
bool time_bench_calc_stats(struct time_bench_record *rec)
{
#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
	uint64_t ns_per_call_tmp_rem = 0;
	uint32_t ns_per_call_remainder = 0;
	uint64_t pmc_ipc_tmp_rem = 0;
	uint32_t pmc_ipc_remainder = 0;
	uint32_t pmc_ipc_div = 0;
	uint32_t invoked_cnt_precision = 0;
	uint32_t invoked_cnt = 0; /* 32-bit due to div_u64_rem() */

	if (rec->flags & TIME_BENCH_LOOP) {
		if (rec->invoked_cnt < 1000) {
			pr_err("ERR: need more(>1000) loops(%llu) for timing\n",
			       rec->invoked_cnt);
			return false;
		}
		if (rec->invoked_cnt > ((1ULL << 32) - 1)) {
			/* div_u64_rem() can only support a 32-bit divisor */
			pr_err("ERR: Invoke cnt(%llu) too big, overflows 32bit\n",
			       rec->invoked_cnt);
			return false;
		}
		invoked_cnt = (uint32_t)rec->invoked_cnt;
	}

	/* TSC (Time-Stamp Counter) records */
	if (rec->flags & TIME_BENCH_TSC) {
		rec->tsc_interval = rec->tsc_stop - rec->tsc_start;
		if (rec->tsc_interval == 0) {
			pr_err("ABORT: timing took ZERO TSC time\n");
			return false;
		}
		/* Calculate stats */
		if (rec->flags & TIME_BENCH_LOOP)
			rec->tsc_cycles = rec->tsc_interval / invoked_cnt;
		else
			rec->tsc_cycles = rec->tsc_interval;
	}

	/* Wall-clock time calc */
	if (rec->flags & TIME_BENCH_WALLCLOCK) {
		rec->time_start = rec->ts_start.tv_nsec +
				  (NANOSEC_PER_SEC * rec->ts_start.tv_sec);
		rec->time_stop = rec->ts_stop.tv_nsec +
				 (NANOSEC_PER_SEC * rec->ts_stop.tv_sec);
		rec->time_interval = rec->time_stop - rec->time_start;
		if (rec->time_interval == 0) {
			pr_err("ABORT: timing took ZERO wallclock time\n");
			return false;
		}

		/* Calculate stats */
		/*** Division in the kernel is tricky ***/
		/* Orig: time_sec = (time_interval / NANOSEC_PER_SEC); */
		/* remainder only correct because NANOSEC_PER_SEC is 10^9 */
		rec->time_sec = div_u64_rem(rec->time_interval, NANOSEC_PER_SEC,
					    &rec->time_sec_remainder);
		//TODO: use existing struct timespec records instead of div?

		if (rec->flags & TIME_BENCH_LOOP) {
			/*** Division in the kernel is tricky ***/
			/* Orig: ns = ((double)time_interval / invoked_cnt); */
			/* First get quotient */
			rec->ns_per_call_quotient =
				div_u64_rem(rec->time_interval, invoked_cnt,
					    &ns_per_call_remainder);
			/* Now get decimals .xxx precision (incorrect roundup) */
			ns_per_call_tmp_rem = ns_per_call_remainder;
			invoked_cnt_precision = invoked_cnt / 1000;
			if (invoked_cnt_precision > 0) {
				rec->ns_per_call_decimal =
					div_u64_rem(ns_per_call_tmp_rem,
						    invoked_cnt_precision,
						    &ns_per_call_remainder);
			}
		}
	}

	/* Performance Monitor Unit (PMU) counters */
	if (rec->flags & TIME_BENCH_PMU) {
		//FIXME: Overflow handling???
		rec->pmc_inst = rec->pmc_inst_stop - rec->pmc_inst_start;
		rec->pmc_clk = rec->pmc_clk_stop - rec->pmc_clk_start;

		/* Calc Instructions Per Cycle (IPC) */
		/* First get quotient */
		rec->pmc_ipc_quotient = div_u64_rem(rec->pmc_inst, rec->pmc_clk,
						    &pmc_ipc_remainder);
		/* Now get decimals .xxx precision (incorrect roundup) */
		pmc_ipc_tmp_rem = pmc_ipc_remainder;
		pmc_ipc_div = rec->pmc_clk / 1000;
		if (pmc_ipc_div > 0) {
			rec->pmc_ipc_decimal = div_u64_rem(pmc_ipc_tmp_rem,
							   pmc_ipc_div,
							   &pmc_ipc_remainder);
		}
	}
	return true;
}
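
/*
 * Illustrative example of the fixed-point arithmetic above (not from the
 * original source): with time_interval = 12,345,678 ns and
 * invoked_cnt = 1,000,000, div_u64_rem() yields quotient 12 and remainder
 * 345,678.  Dividing that remainder by invoked_cnt / 1000 = 1000 gives the
 * three decimal digits 345, so the result is reported as "12.345 ns" per
 * call without using floating point in the kernel.
 */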

/* Generic function for invoking a loop function and calculating
 * execution time stats.  The function being called/timed is assumed
 * to perform a tight loop, and to update the timing record struct.
 */
bool time_bench_loop(uint32_t loops, int step, char *txt, void *data,
		     int (*func)(struct time_bench_record *record, void *data))
{
	struct time_bench_record rec;

	/* Setup record */
	memset(&rec, 0, sizeof(rec)); /* zero, func might not update all */
	rec.version_abi = 1;
	rec.loops = loops;
	rec.step = step;
	rec.flags = (TIME_BENCH_LOOP | TIME_BENCH_TSC | TIME_BENCH_WALLCLOCK);

	/*** Loop function being timed ***/
	if (!func(&rec, data)) {
		pr_err("ABORT: function being timed failed\n");
		return false;
	}

	if (rec.invoked_cnt < loops)
		pr_warn("WARNING: Invoke count(%llu) smaller than loops(%d)\n",
			rec.invoked_cnt, loops);

	/* Calculate stats */
	time_bench_calc_stats(&rec);

	pr_info("Type:%s Per elem: %llu cycles(tsc) %llu.%03llu ns (step:%d) - (measurement period time:%llu.%09u sec time_interval:%llu) - (invoke count:%llu tsc_interval:%llu)\n",
		txt, rec.tsc_cycles, rec.ns_per_call_quotient,
		rec.ns_per_call_decimal, rec.step, rec.time_sec,
		rec.time_sec_remainder, rec.time_interval, rec.invoked_cnt,
		rec.tsc_interval);
	if (rec.flags & TIME_BENCH_PMU)
		pr_info("Type:%s PMU inst/clock %llu/%llu = %llu.%03llu IPC (inst per cycle)\n",
			txt, rec.pmc_inst, rec.pmc_clk, rec.pmc_ipc_quotient,
			rec.pmc_ipc_decimal);
	return true;
}
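
/*
 * Usage sketch (illustrative only, not part of the original source): a loop
 * callback is expected to run rec->loops iterations of the code under test
 * and fill in the record's timing fields.  The helpers time_bench_start()
 * and time_bench_stop() are assumed to be provided by time_bench.h, and
 * time_example_func is a hypothetical callback:
 *
 *	static int time_example_func(struct time_bench_record *rec, void *data)
 *	{
 *		uint64_t loops_cnt = 0;
 *		int i;
 *
 *		time_bench_start(rec);
 *		for (i = 0; i < rec->loops; i++) {
 *			loops_cnt++;
 *			barrier(); // code under test goes here
 *		}
 *		time_bench_stop(rec, loops_cnt);
 *		return loops_cnt;
 *	}
 *
 *	time_bench_loop(1000000, 0, "example", NULL, time_example_func);
 */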

/* Function getting invoked by kthread */
static int invoke_test_on_cpu_func(void *private)
{
	struct time_bench_cpu *cpu = private;
	struct time_bench_sync *sync = cpu->sync;
	cpumask_t newmask = CPU_MASK_NONE;
	void *data = cpu->data;

	/* Restrict CPU */
	cpumask_set_cpu(cpu->rec.cpu, &newmask);
	set_cpus_allowed_ptr(current, &newmask);

	/* Synchronize start of concurrency test */
	atomic_inc(&sync->nr_tests_running);
	wait_for_completion(&sync->start_event);

	/* Start benchmark function */
	if (!cpu->bench_func(&cpu->rec, data)) {
		pr_err("ERROR: function being timed failed on CPU:%d(%d)\n",
		       cpu->rec.cpu, smp_processor_id());
	} else {
		if (verbose)
			pr_info("SUCCESS: ran on CPU:%d(%d)\n", cpu->rec.cpu,
				smp_processor_id());
	}
	cpu->did_bench_run = true;

	/* End test */
	atomic_dec(&sync->nr_tests_running);
	/* Wait for kthread_stop() telling us to stop */
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

void time_bench_print_stats_cpumask(const char *desc,
				    struct time_bench_cpu *cpu_tasks,
				    const struct cpumask *mask)
{
	uint64_t average = 0;
	int cpu;
	int step = 0;
	struct sum {
		uint64_t tsc_cycles;
		int records;
	} sum = { 0 };

	/* Get stats */
	for_each_cpu(cpu, mask) {
		struct time_bench_cpu *c = &cpu_tasks[cpu];
		struct time_bench_record *rec = &c->rec;

		/* Calculate stats */
		time_bench_calc_stats(rec);

		pr_info("Type:%s CPU(%d) %llu cycles(tsc) %llu.%03llu ns (step:%d) - (measurement period time:%llu.%09u sec time_interval:%llu) - (invoke count:%llu tsc_interval:%llu)\n",
			desc, cpu, rec->tsc_cycles, rec->ns_per_call_quotient,
			rec->ns_per_call_decimal, rec->step, rec->time_sec,
			rec->time_sec_remainder, rec->time_interval,
			rec->invoked_cnt, rec->tsc_interval);

		/* Collect average */
		sum.records++;
		sum.tsc_cycles += rec->tsc_cycles;
		step = rec->step;
	}
	if (sum.records) /* avoid div-by-zero */
		average = sum.tsc_cycles / sum.records;
	pr_info("Sum Type:%s Average: %llu cycles(tsc) CPUs:%d step:%d\n", desc,
		average, sum.records, step);
}

void time_bench_run_concurrent(uint32_t loops, int step, void *data,
			       const struct cpumask *mask, /* Support masking out some CPUs */
			       struct time_bench_sync *sync,
			       struct time_bench_cpu *cpu_tasks,
			       int (*func)(struct time_bench_record *record,
					   void *data))
{
	int cpu, running = 0;

	if (verbose) // DEBUG
		pr_warn("%s() Started on CPU:%d\n", __func__,
			smp_processor_id());

	/* Reset sync conditions */
	atomic_set(&sync->nr_tests_running, 0);
	init_completion(&sync->start_event);

	/* Spawn off jobs on all CPUs */
	for_each_cpu(cpu, mask) {
		struct time_bench_cpu *c = &cpu_tasks[cpu];

		running++;
		c->sync = sync; /* Send sync variable along */
		c->data = data; /* Send opaque data along */

		/* Init benchmark record */
		memset(&c->rec, 0, sizeof(struct time_bench_record));
		c->rec.version_abi = 1;
		c->rec.loops = loops;
		c->rec.step = step;
		c->rec.flags = (TIME_BENCH_LOOP | TIME_BENCH_TSC |
				TIME_BENCH_WALLCLOCK);
		c->rec.cpu = cpu;
		c->bench_func = func;
		c->task = kthread_run(invoke_test_on_cpu_func, c,
				      "time_bench%d", cpu);
		if (IS_ERR(c->task)) {
			pr_err("%s(): Failed to start test func\n", __func__);
			return; /* Argh, what about cleanup?! */
		}
	}

	/* Wait until all processes are running */
	while (atomic_read(&sync->nr_tests_running) < running) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule_timeout(10);
	}
	/* Kick off all CPUs concurrently on completion event */
	complete_all(&sync->start_event);

	/* Wait for CPUs to finish */
	while (atomic_read(&sync->nr_tests_running)) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule_timeout(10);
	}

	/* Stop the kthreads */
	for_each_cpu(cpu, mask) {
		struct time_bench_cpu *c = &cpu_tasks[cpu];

		kthread_stop(c->task);
	}

	if (verbose) // DEBUG - happens often, we finish on another CPU
		pr_warn("%s() Finished on CPU:%d\n", __func__,
			smp_processor_id());
}
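
/*
 * Usage sketch (illustrative only, not part of the original source): a
 * caller typically allocates one struct time_bench_cpu per possible CPU
 * plus a shared sync object, runs the benchmark across a cpumask, and then
 * prints the per-CPU stats.  time_example_func is the hypothetical loop
 * callback sketched earlier in this file.
 *
 *	struct time_bench_sync sync;
 *	struct time_bench_cpu *cpu_tasks;
 *
 *	cpu_tasks = kcalloc(nr_cpu_ids, sizeof(*cpu_tasks), GFP_KERNEL);
 *	if (!cpu_tasks)
 *		return -ENOMEM;
 *	time_bench_run_concurrent(1000000, 0, NULL, cpu_online_mask,
 *				  &sync, cpu_tasks, time_example_func);
 *	time_bench_print_stats_cpumask("example_concurrent", cpu_tasks,
 *				       cpu_online_mask);
 *	kfree(cpu_tasks);
 */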