aboutsummaryrefslogtreecommitdiffstats
path: root/lib
diff options
authorFeng Tang <feng.tang@linux.alibaba.com>2026-05-21 11:03:36 +0800
committerAndrew Morton <akpm@linux-foundation.org>2026-05-28 21:24:59 -0700
commitc60ffec33ddf24577f6f4da18fe825b2058c5f78 (patch)
tree2ae24292770c97ca37efefdc1bb54ddb74bbdaaa /lib
parent685568777c5a18fbc40bd0b64527fd9444c255be (diff)
downloadlinux-next-history-c60ffec33ddf24577f6f4da18fe825b2058c5f78.tar.gz
lib/nmi_backtrace: print out the CPUs which fail to respond to NMI
When debugging RCU stall cases, usually all CPUs will respond to the NMI and print out the backtrace. But in some nasty or hardware-related cases, some CPUs may fail to respond in 10 seconds, and very likely this is a sign of severe issues. Paul McKenney has implemented the NMI backtrace stall check for x86, and for other architectures, it should also be helpful to at least print out those CPUs which failed to respond to the NMI, so that users can get an early heads-up for a possible CPU hard stall. [feng.tang@linux.alibaba.com: avoid hard-coding "10" in two places and in a comment] Link: https://lore.kernel.org/ag-1ciG0FSomBf7q@U-2FWC9VHC-2323.local [akpm@linux-foundation.org: use __stringify()] Link: https://lore.kernel.org/20260521030336.92172-1-feng.tang@linux.alibaba.com Signed-off-by: Feng Tang <feng.tang@linux.alibaba.com> Reviewed-by: Petr Mladek <pmladek@suse.com> Cc: "Paul E . McKenney" <paulmck@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'lib')
-rw-r--r--lib/nmi_backtrace.c15
1 files changed, 12 insertions, 3 deletions
diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c
index 33c154264bfe2..a3bfa9360b23d 100644
--- a/lib/nmi_backtrace.c
+++ b/lib/nmi_backtrace.c
@@ -16,6 +16,7 @@
#include <linux/cpumask.h>
#include <linux/delay.h>
#include <linux/kprobes.h>
+#include <linux/stringify.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/sched/debug.h>
@@ -27,6 +28,8 @@ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
/* "in progress" flag of arch_trigger_cpumask_backtrace */
static unsigned long backtrace_flag;
+#define NMI_BT_TIMEOUT_SEC 10
+
/*
* When raise() is called it will be passed a pointer to the
* backtrace_mask. Architectures that call nmi_cpu_backtrace()
@@ -68,14 +71,20 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
raise(to_cpumask(backtrace_mask));
}
- /* Wait for up to 10 seconds for all CPUs to do the backtrace */
- for (i = 0; i < 10 * 1000; i++) {
+ /* Wait for up to NMI_BT_TIMEOUT_SEC seconds for all CPUs to do the backtrace */
+ for (i = 0; i < NMI_BT_TIMEOUT_SEC * 1000; i++) {
if (cpumask_empty(to_cpumask(backtrace_mask)))
break;
mdelay(1);
touch_softlockup_watchdog();
}
- nmi_backtrace_stall_check(to_cpumask(backtrace_mask));
+
+ if (!cpumask_empty(to_cpumask(backtrace_mask))) {
+ pr_warn("After " __stringify(NMI_BT_TIMEOUT_SEC) " seconds, these CPUS still haven't responded to the NMI: %*pbl\n",
+ cpumask_pr_args(to_cpumask(backtrace_mask)));
+
+ nmi_backtrace_stall_check(to_cpumask(backtrace_mask));
+ }
/*
* Force flush any remote buffers that might be stuck in IRQ context