From 846fb984b506135917c2862d2e4607005d6afdeb Mon Sep 17 00:00:00 2001 From: Andrew Cooper Date: Tue, 2 Apr 2024 16:20:09 +0200 Subject: [PATCH 65/67] x86/boot: Improve the boot watchdog determination of stuck cpus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Right now, check_nmi_watchdog() has two processing loops over all online CPUs using prev_nmi_count as storage. Use a cpumask_t instead (1/32nd as much initdata) and have wait_for_nmis() make the determination of whether it is stuck, rather than having both functions needing to agree on how many ticks mean stuck. More importantly though, it means we can use the standard cpumask infrastructure, including turning this: (XEN) Brought up 512 CPUs (XEN) Testing NMI watchdog on all CPUs: {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,32
9,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511} stuck into the rather more manageable: (XEN) Brought up 512 CPUs (XEN) Testing NMI watchdog on all CPUs: {0-511} stuck Signed-off-by: Andrew Cooper Reviewed-by: Roger Pau Monné master commit: 9e18f339830c828798aef465556d4029d83476a0 master date: 2024-03-19 18:29:37 +0000 --- xen/arch/x86/nmi.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/xen/arch/x86/nmi.c b/xen/arch/x86/nmi.c index 7c9591b65e..dd31034ac8 100644 --- a/xen/arch/x86/nmi.c +++ b/xen/arch/x86/nmi.c @@ -150,6 +150,8 @@ int nmi_active; static void __init cf_check wait_for_nmis(void *p) { + cpumask_t *stuck_cpus = p; + unsigned int cpu = smp_processor_id(); unsigned int start_count = this_cpu(nmi_count); unsigned long ticks = 10 * 1000 * cpu_khz / nmi_hz; unsigned long s, e; @@ -158,42 +160,35 @@ static void __init cf_check wait_for_nmis(void *p) do { cpu_relax(); if ( this_cpu(nmi_count) >= start_count + 2 ) - break; + return; + e = rdtsc(); - } while( e - s < ticks ); + } while ( e - s < ticks ); + + /* Timeout. Mark ourselves as stuck. 
*/ + cpumask_set_cpu(cpu, stuck_cpus); } void __init check_nmi_watchdog(void) { - static unsigned int __initdata prev_nmi_count[NR_CPUS]; - int cpu; - bool ok = true; + static cpumask_t __initdata stuck_cpus; if ( nmi_watchdog == NMI_NONE ) return; printk("Testing NMI watchdog on all CPUs:"); - for_each_online_cpu ( cpu ) - prev_nmi_count[cpu] = per_cpu(nmi_count, cpu); - /* * Wait at most 10 ticks for 2 watchdog NMIs on each CPU. * Busy-wait on all CPUs: the LAPIC counter that the NMI watchdog * uses only runs while the core's not halted */ - on_selected_cpus(&cpu_online_map, wait_for_nmis, NULL, 1); - - for_each_online_cpu ( cpu ) - { - if ( per_cpu(nmi_count, cpu) - prev_nmi_count[cpu] < 2 ) - { - printk(" %d", cpu); - ok = false; - } - } + on_selected_cpus(&cpu_online_map, wait_for_nmis, &stuck_cpus, 1); - printk(" %s\n", ok ? "ok" : "stuck"); + if ( cpumask_empty(&stuck_cpus) ) + printk("ok\n"); + else + printk("{%*pbl} stuck\n", CPUMASK_PR(&stuck_cpus)); /* * Now that we know it works we can reduce NMI frequency to -- 2.44.0