author     Vlastimil Babka <vbabka@suse.cz>    2024-02-12 21:36:33 +0100
committer  Vlastimil Babka <vbabka@suse.cz>    2025-03-13 12:12:02 +0100
commit     df4891c13ee02fedadd66a990dd65c911f6b7ada (patch)
tree       a25dd941894c525b6509d39d2978ecd803f5d47d
parent     856a00a80a1b2da777f49298047fef3969c6209d (diff)
download   linux-locking-bench.tar.gz

lib/test: add locking-benchmark module (branch: locking-bench)

-rw-r--r--  arch/x86/lib/cmpxchg16b_emu.S |   2
-rw-r--r--  lib/Kconfig.debug             |   9
-rw-r--r--  lib/Makefile                  |   1
-rw-r--r--  lib/locking-benchmark.c       | 427
4 files changed, 439 insertions, 0 deletions
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
index 4fb44894ad8759..d722d6df71f2b5 100644
--- a/arch/x86/lib/cmpxchg16b_emu.S
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -52,3 +52,5 @@ SYM_FUNC_START(this_cpu_cmpxchg16b_emu)
RET
SYM_FUNC_END(this_cpu_cmpxchg16b_emu)
+
+EXPORT_SYMBOL(this_cpu_cmpxchg16b_emu)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 35796c290ca351..f67f132bd78ddc 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2574,6 +2574,15 @@ config FIND_BIT_BENCHMARK
If unsure, say N.
+config LOCKING_BENCHMARK
+ tristate "Benchmark (pcp) locking primitives"
+ help
+	  This builds the "locking-benchmark" module that measures the
+	  performance of various primitives and schemes for locking or
+	  atomically updating pcp (per-cpu) structures.
+
+ If unsure, say N.
+
config TEST_FIRMWARE
tristate "Test firmware loading via userspace interface"
depends on FW_LOADER
diff --git a/lib/Makefile b/lib/Makefile
index d5cfc7afbbb821..7f111fb94157f6 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -138,6 +138,7 @@ obj-$(CONFIG_GENERIC_IOMAP) += iomap.o
obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o
obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
+obj-$(CONFIG_LOCKING_BENCHMARK) += locking-benchmark.o
lib-y += logic_pio.o
diff --git a/lib/locking-benchmark.c b/lib/locking-benchmark.c
new file mode 100644
index 00000000000000..2e699eefca5b89
--- /dev/null
+++ b/lib/locking-benchmark.c
@@ -0,0 +1,427 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/rcupdate.h>
+#include <linux/percpu_counter.h>
+#include <asm/tsc.h>
+
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+/*
+ * On SMP, spin_trylock is sufficient protection.
+ * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
+ */
+#define pcp_trylock_prepare(flags) do { } while (0)
+#define pcp_trylock_finish(flag) do { } while (0)
+#else
+
+/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
+#define pcp_trylock_prepare(flags) local_irq_save(flags)
+#define pcp_trylock_finish(flags) local_irq_restore(flags)
+#endif
+
+/*
+ * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
+ * a migration causing the wrong PCP to be locked and remote memory being
+ * potentially allocated, pin the task to the CPU for the lookup+lock.
+ * preempt_disable is used on !RT because it is faster than migrate_disable.
+ * migrate_disable is used on RT because otherwise RT spinlock usage is
+ * interfered with and a high priority task cannot preempt the allocator.
+ */
+#ifndef CONFIG_PREEMPT_RT
+#define pcpu_task_pin() preempt_disable()
+#define pcpu_task_unpin() preempt_enable()
+#else
+#define pcpu_task_pin() migrate_disable()
+#define pcpu_task_unpin() migrate_enable()
+#endif
+
+/*
+ * Generic helper to look up a per-cpu variable with an embedded spinlock.
+ * Return value should be used with equivalent unlock helper.
+ */
+#define pcpu_spin_lock(type, member, ptr) \
+({ \
+ type *_ret; \
+ pcpu_task_pin(); \
+ _ret = this_cpu_ptr(ptr); \
+ spin_lock(&_ret->member); \
+ _ret; \
+})
+
+#define pcpu_spin_trylock(type, member, ptr) \
+({ \
+ type *_ret; \
+ pcpu_task_pin(); \
+ _ret = this_cpu_ptr(ptr); \
+ if (!spin_trylock(&_ret->member)) { \
+ pcpu_task_unpin(); \
+ _ret = NULL; \
+ } \
+ _ret; \
+})
+
+#define pcpu_spin_unlock(member, ptr) \
+({ \
+ spin_unlock(&ptr->member); \
+ pcpu_task_unpin(); \
+})
+
+/* struct per_cpu_pages specific helpers as in mm/page_alloc.c (unused here). */
+#define pcp_spin_lock(ptr) \
+ pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
+
+#define pcp_spin_trylock(ptr) \
+ pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)
+
+#define pcp_spin_unlock(ptr) \
+ pcpu_spin_unlock(lock, ptr)
+
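+/* A counter and pointer packed together so both fit in one 128-bit cmpxchg. */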
+typedef union {
+ struct {
+ unsigned long counter;
+ void *dummy;
+ };
+ u128 full;
+} counter_ptr_t;
+
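+/*
+ * Per-cpu test structure: one instance of each lock variant being compared,
+ * plus the counters they protect.
+ */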
+struct test_pcp {
+ local_lock_t llock;
+ localtry_lock_t ltlock;
+ spinlock_t slock;
+ unsigned long counter;
+ counter_ptr_t counter_ptr;
+};
+
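+/* Sink for loop results so the measured operations cannot be optimized away. */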
+static bool __dummy;
+
+static DEFINE_PER_CPU(struct test_pcp, test_pcps) = {
+ .llock = INIT_LOCAL_LOCK(llock),
+ .ltlock = INIT_LOCALTRY_LOCK(ltlock),
+ .slock = __SPIN_LOCK_UNLOCKED(slock),
+};
+
+static counter_ptr_t counter_ptr;
+
+struct test_bsl {
+ unsigned long page_flags;
+ unsigned long counter;
+};
+
+static struct test_bsl bsl = {};
+
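+/*
+ * Each primitive is exercised TIMING_ITERATIONS times between two
+ * rdtsc_ordered() reads; print_result() reports the cycle delta using the
+ * before/after variables of the calling scope.
+ */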
+#define TIMING_ITERATIONS 1000000000
+
+#define print_result(name) \
+ pr_info("%-35s %12llu cycles\n", name, after - before)
+
+static int __init locking_bench(void)
+{
+ unsigned long long before, after;
+ unsigned long __maybe_unused UP_flags;
+ struct test_pcp *pcp;
+ struct percpu_counter pcpc;
+ unsigned long flags;
+
+ percpu_counter_init(&pcpc, 0, GFP_KERNEL);
+
+ before = rdtsc_ordered();
+
+ for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+ if (this_cpu_inc_return(test_pcps.counter) == 0)
+ __dummy = true;
+ }
+
+ after = rdtsc_ordered();
+
+ cond_resched();
+ print_result("this_cpu_inc_return");
+
+ before = rdtsc_ordered();
+
+ for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+ unsigned long old, new;
+ do {
+ old = this_cpu_read(test_pcps.counter);
+ new = old + 1;
+ } while (!this_cpu_try_cmpxchg(test_pcps.counter, &old, new));
+ }
+
+ after = rdtsc_ordered();
+
+ cond_resched();
+ print_result("this_cpu_try_cmpxchg");
+
+ before = rdtsc_ordered();
+
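+	/* As above, but the initial read uses raw_cpu_read() instead. */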
+ for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+ unsigned long old, new;
+ do {
+ old = raw_cpu_read(test_pcps.counter);
+ new = old + 1;
+ } while (!this_cpu_try_cmpxchg(test_pcps.counter, &old, new));
+ }
+
+ after = rdtsc_ordered();
+
+ cond_resched();
+ print_result("raw+this_cpu_try_cmpxchg");
+
+ before = rdtsc_ordered();
+
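+	/* Update the counter and pointer pair with a 128-bit per-cpu cmpxchg. */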
+ for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+ counter_ptr_t old, new;
+ do {
+ struct test_pcp *pcp = raw_cpu_ptr(&test_pcps);
+ old.full = pcp->counter_ptr.full;
+ new.counter = old.counter + 1;
+ new.dummy = old.dummy;
+ } while (!this_cpu_try_cmpxchg128(test_pcps.counter_ptr.full,
+ &old.full, new.full));
+ }
+
+ after = rdtsc_ordered();
+
+ cond_resched();
+ print_result("this_cpu_try_cmpxchg128");
+
+ before = rdtsc_ordered();
+
+ for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+ counter_ptr_t *test;
+ counter_ptr_t old, new;
+ do {
+ test = &counter_ptr;
+ old.full = test->full;
+ new.counter = old.counter + 1;
+ new.dummy = old.dummy;
+ } while (!try_cmpxchg128(&test->full,
+ &old.full, new.full));
+ }
+
+ after = rdtsc_ordered();
+
+ cond_resched();
+ print_result("try_cmpxchg128");
+
+ before = rdtsc_ordered();
+
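+	/* bit_spin_lock() on the PG_locked bit of a page->flags style word. */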
+ for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+ struct test_bsl *test = &bsl;
+
+ bit_spin_lock(PG_locked, &test->page_flags);
+ test->counter++;
+ bit_spin_unlock(PG_locked, &test->page_flags);
+ }
+
+ after = rdtsc_ordered();
+
+ cond_resched();
+ print_result("bit_spin_lock");
+
+ before = rdtsc_ordered();
+
+ for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+ percpu_counter_inc(&pcpc);
+ }
+
+ after = rdtsc_ordered();
+
+ cond_resched();
+ print_result("percpu_counter_inc");
+
+ before = rdtsc_ordered();
+
+ for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+ percpu_counter_add_local(&pcpc, 1);
+ }
+
+ after = rdtsc_ordered();
+
+ cond_resched();
+	print_result("percpu_counter_add_local");
+
+ before = rdtsc_ordered();
+
+ for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+ local_lock(&test_pcps.llock);
+
+ pcp = this_cpu_ptr(&test_pcps);
+
+ pcp->counter++;
+
+ local_unlock(&test_pcps.llock);
+ }
+
+ after = rdtsc_ordered();
+
+ cond_resched();
+ print_result("local_lock");
+
+ before = rdtsc_ordered();
+
+ for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+ local_lock_irq(&test_pcps.llock);
+
+ pcp = this_cpu_ptr(&test_pcps);
+
+ pcp->counter++;
+
+ local_unlock_irq(&test_pcps.llock);
+ }
+
+ after = rdtsc_ordered();
+
+ cond_resched();
+ print_result("local_lock_irq");
+
+ before = rdtsc_ordered();
+
+ for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+ local_lock_irqsave(&test_pcps.llock, flags);
+
+ pcp = this_cpu_ptr(&test_pcps);
+
+ pcp->counter++;
+
+ local_unlock_irqrestore(&test_pcps.llock, flags);
+ }
+
+ after = rdtsc_ordered();
+
+ cond_resched();
+ print_result("local_lock_irqsave");
+
+
+ before = rdtsc_ordered();
+
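+	/* local_lock_irqsave() with IRQs already disabled, in ten batches. */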
+ for (unsigned j = 0; j < 10; j++) {
+
+ local_irq_disable();
+
+ for (unsigned long i = 0; i < TIMING_ITERATIONS/10; i++) {
+ local_lock_irqsave(&test_pcps.llock, flags);
+
+ pcp = this_cpu_ptr(&test_pcps);
+
+ pcp->counter++;
+
+ local_unlock_irqrestore(&test_pcps.llock, flags);
+ }
+
+ local_irq_enable();
+ }
+
+ after = rdtsc_ordered();
+
+ cond_resched();
+ print_result("irq_dis(local_lock_irqsave)");
+
+ before = rdtsc_ordered();
+
+ for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+ localtry_lock(&test_pcps.ltlock);
+
+ pcp = this_cpu_ptr(&test_pcps);
+
+ pcp->counter++;
+
+ localtry_unlock(&test_pcps.ltlock);
+ }
+
+ after = rdtsc_ordered();
+
+ cond_resched();
+ print_result("localtry_lock");
+
+ before = rdtsc_ordered();
+
+ for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+ localtry_lock_irq(&test_pcps.ltlock);
+
+ pcp = this_cpu_ptr(&test_pcps);
+
+ pcp->counter++;
+
+ localtry_unlock_irq(&test_pcps.ltlock);
+ }
+
+ after = rdtsc_ordered();
+
+ cond_resched();
+ print_result("localtry_lock_irq");
+
+ before = rdtsc_ordered();
+
+ for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+ localtry_lock_irqsave(&test_pcps.ltlock, flags);
+
+ pcp = this_cpu_ptr(&test_pcps);
+
+ pcp->counter++;
+
+ localtry_unlock_irqrestore(&test_pcps.ltlock, flags);
+ }
+
+ after = rdtsc_ordered();
+
+ cond_resched();
+ print_result("localtry_lock_irqsave");
+
+ before = rdtsc_ordered();
+
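+	/* Same as above for localtry_lock_irqsave() with IRQs already disabled. */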
+ for (unsigned j = 0; j < 10; j++) {
+
+ local_irq_disable();
+
+ for (unsigned long i = 0; i < TIMING_ITERATIONS/10; i++) {
+ localtry_lock_irqsave(&test_pcps.ltlock, flags);
+
+ pcp = this_cpu_ptr(&test_pcps);
+
+ pcp->counter++;
+
+ localtry_unlock_irqrestore(&test_pcps.ltlock, flags);
+ }
+
+ local_irq_enable();
+ }
+
+ after = rdtsc_ordered();
+
+ cond_resched();
+ print_result("irq_dis(localtry_lock_irqsave)");
+
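+	/*
+	 * Pin the task and trylock the embedded per-cpu spinlock. The benchmark
+	 * is uncontended, so the trylock is expected to always succeed.
+	 */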
+ before = rdtsc_ordered();
+
+ for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+
+ pcp_trylock_prepare(UP_flags);
+
+		pcp = pcpu_spin_trylock(struct test_pcp, slock, &test_pcps);
+
+ pcp->counter++;
+
+ pcpu_spin_unlock(slock, pcp);
+ pcp_trylock_finish(UP_flags);
+ }
+
+ after = rdtsc_ordered();
+
+ cond_resched();
+ print_result("pcpu_spin_trylock");
+
+ percpu_counter_destroy(&pcpc);
+
+ /*
+	 * Everything is OK. Return an error anyway so the benchmark can be run
+	 * again without having to rmmod the module first.
+ */
+ return -EINVAL;
+}
+
+module_init(locking_bench);
+
+MODULE_DESCRIPTION("Benchmark for (pcp) locking schemes");
+MODULE_LICENSE("GPL");