0051-locking-attempt-to-ensure-lock-wrappers-are-always-i.patch


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405

From 2cc5e57be680a516aa5cdef4281856d09b9d0ea6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
Date: Mon, 4 Mar 2024 14:29:36 +0100
Subject: [PATCH 51/67] locking: attempt to ensure lock wrappers are always
 inline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In order to prevent the locking speculation barriers from being inside of
`call`ed functions that could be speculatively bypassed.

While there also add an extra locking barrier to _mm_write_lock() in the branch
taken when the lock is already held.

Note some functions are switched to use the unsafe variants (without speculation
barrier) of the locking primitives, but a speculation barrier is always added
to the exposed public lock wrapping helper.  That's the case with
sched_spin_lock_double() or pcidevs_lock() for example.

This is part of XSA-453 / CVE-2024-2193

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
(cherry picked from commit 197ecd838a2aaf959a469df3696d4559c4f8b762)
---
 xen/arch/x86/hvm/vpt.c         | 10 +++++++---
 xen/arch/x86/include/asm/irq.h |  1 +
 xen/arch/x86/mm/mm-locks.h     | 28 +++++++++++++++-------------
 xen/arch/x86/mm/p2m-pod.c      |  2 +-
 xen/common/event_channel.c     |  5 +++--
 xen/common/grant_table.c       |  6 +++---
 xen/common/sched/core.c        | 19 ++++++++++++-------
 xen/common/sched/private.h     | 26 ++++++++++++++++++++++++--
 xen/common/timer.c             |  8 +++++---
 xen/drivers/passthrough/pci.c  |  5 +++--
 xen/include/xen/event.h        |  4 ++--
 xen/include/xen/pci.h          |  8 ++++++--
 12 files changed, 82 insertions(+), 40 deletions(-)

diff --git a/xen/arch/x86/hvm/vpt.c b/xen/arch/x86/hvm/vpt.c
index cb1d81bf9e..66f1095245 100644
--- a/xen/arch/x86/hvm/vpt.c
+++ b/xen/arch/x86/hvm/vpt.c
@@ -161,7 +161,7 @@ static int pt_irq_masked(struct periodic_time *pt)
  * pt->vcpu field, because another thread holding the pt_migrate lock
  * may already be spinning waiting for your vcpu lock.
  */
-static void pt_vcpu_lock(struct vcpu *v)
+static always_inline void pt_vcpu_lock(struct vcpu *v)
 {
     spin_lock(&v->arch.hvm.tm_lock);
 }
@@ -180,9 +180,13 @@ static void pt_vcpu_unlock(struct vcpu *v)
  * need to take an additional lock that protects against pt->vcpu
  * changing.
  */
-static void pt_lock(struct periodic_time *pt)
+static always_inline void pt_lock(struct periodic_time *pt)
 {
-    read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
+    /*
+     * Use the speculation unsafe variant for the first lock, as the following
+     * lock taking helper already includes a speculation barrier.
+     */
+    _read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
     spin_lock(&pt->vcpu->arch.hvm.tm_lock);
 }
 
diff --git a/xen/arch/x86/include/asm/irq.h b/xen/arch/x86/include/asm/irq.h
index f6a0207a80..823d627fd0 100644
--- a/xen/arch/x86/include/asm/irq.h
+++ b/xen/arch/x86/include/asm/irq.h
@@ -178,6 +178,7 @@ void cf_check irq_complete_move(struct irq_desc *);
 
 extern struct irq_desc *irq_desc;
 
+/* Not speculation safe, only used for AP bringup. */
 void lock_vector_lock(void);
 void unlock_vector_lock(void);
 
diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h
index c1523aeccf..265239c49f 100644
--- a/xen/arch/x86/mm/mm-locks.h
+++ b/xen/arch/x86/mm/mm-locks.h
@@ -86,8 +86,8 @@ static inline void _set_lock_level(int l)
     this_cpu(mm_lock_level) = l;
 }
 
-static inline void _mm_lock(const struct domain *d, mm_lock_t *l,
-                            const char *func, int level, int rec)
+static always_inline void _mm_lock(const struct domain *d, mm_lock_t *l,
+                                   const char *func, int level, int rec)
 {
     if ( !((mm_locked_by_me(l)) && rec) )
         _check_lock_level(d, level);
@@ -137,8 +137,8 @@ static inline int mm_write_locked_by_me(mm_rwlock_t *l)
     return (l->locker == get_processor_id());
 }
 
-static inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l,
-                                  const char *func, int level)
+static always_inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l,
+                                         const char *func, int level)
 {
     if ( !mm_write_locked_by_me(l) )
     {
@@ -149,6 +149,8 @@ static inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l,
         l->unlock_level = _get_lock_level();
         _set_lock_level(_lock_level(d, level));
     }
+    else
+        block_speculation();
     l->recurse_count++;
 }
 
@@ -162,8 +164,8 @@ static inline void mm_write_unlock(mm_rwlock_t *l)
     percpu_write_unlock(p2m_percpu_rwlock, &l->lock);
 }
 
-static inline void _mm_read_lock(const struct domain *d, mm_rwlock_t *l,
-                                 int level)
+static always_inline void _mm_read_lock(const struct domain *d, mm_rwlock_t *l,
+                                        int level)
 {
     _check_lock_level(d, level);
     percpu_read_lock(p2m_percpu_rwlock, &l->lock);
@@ -178,15 +180,15 @@ static inline void mm_read_unlock(mm_rwlock_t *l)
 
 /* This wrapper uses the line number to express the locking order below */
 #define declare_mm_lock(name)                                                 \
-    static inline void mm_lock_##name(const struct domain *d, mm_lock_t *l,   \
-                                      const char *func, int rec)              \
+    static always_inline void mm_lock_##name(                                 \
+        const struct domain *d, mm_lock_t *l, const char *func, int rec)      \
     { _mm_lock(d, l, func, MM_LOCK_ORDER_##name, rec); }
 #define declare_mm_rwlock(name)                                               \
-    static inline void mm_write_lock_##name(const struct domain *d,           \
-                                            mm_rwlock_t *l, const char *func) \
+    static always_inline void mm_write_lock_##name(                           \
+        const struct domain *d, mm_rwlock_t *l, const char *func)             \
     { _mm_write_lock(d, l, func, MM_LOCK_ORDER_##name); }                     \
-    static inline void mm_read_lock_##name(const struct domain *d,            \
-                                           mm_rwlock_t *l)                    \
+    static always_inline void mm_read_lock_##name(const struct domain *d,     \
+                                                  mm_rwlock_t *l)             \
     { _mm_read_lock(d, l, MM_LOCK_ORDER_##name); }
 /* These capture the name of the calling function */
 #define mm_lock(name, d, l) mm_lock_##name(d, l, __func__, 0)
@@ -321,7 +323,7 @@ declare_mm_lock(altp2mlist)
 #define MM_LOCK_ORDER_altp2m                 40
 declare_mm_rwlock(altp2m);
 
-static inline void p2m_lock(struct p2m_domain *p)
+static always_inline void p2m_lock(struct p2m_domain *p)
 {
     if ( p2m_is_altp2m(p) )
         mm_write_lock(altp2m, p->domain, &p->lock);
diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c
index fc110506dc..99dbcb3101 100644
--- a/xen/arch/x86/mm/p2m-pod.c
+++ b/xen/arch/x86/mm/p2m-pod.c
@@ -36,7 +36,7 @@
 #define superpage_aligned(_x)  (((_x)&(SUPERPAGE_PAGES-1))==0)
 
 /* Enforce lock ordering when grabbing the "external" page_alloc lock */
-static inline void lock_page_alloc(struct p2m_domain *p2m)
+static always_inline void lock_page_alloc(struct p2m_domain *p2m)
 {
     page_alloc_mm_pre_lock(p2m->domain);
     spin_lock(&(p2m->domain->page_alloc_lock));
diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c
index f5e0b12d15..dada9f15f5 100644
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -62,7 +62,7 @@
  * just assume the event channel is free or unbound at the moment when the
  * evtchn_read_trylock() returns false.
  */
-static inline void evtchn_write_lock(struct evtchn *evtchn)
+static always_inline void evtchn_write_lock(struct evtchn *evtchn)
 {
     write_lock(&evtchn->lock);
 
@@ -364,7 +364,8 @@ int evtchn_alloc_unbound(evtchn_alloc_unbound_t *alloc, evtchn_port_t port)
     return rc;
 }
 
-static void double_evtchn_lock(struct evtchn *lchn, struct evtchn *rchn)
+static always_inline void double_evtchn_lock(struct evtchn *lchn,
+                                             struct evtchn *rchn)
 {
     ASSERT(lchn != rchn);
 
diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
index ee7cc496b8..62a8685cd5 100644
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -410,7 +410,7 @@ static inline void act_set_gfn(struct active_grant_entry *act, gfn_t gfn)
 
 static DEFINE_PERCPU_RWLOCK_GLOBAL(grant_rwlock);
 
-static inline void grant_read_lock(struct grant_table *gt)
+static always_inline void grant_read_lock(struct grant_table *gt)
 {
     percpu_read_lock(grant_rwlock, &gt->lock);
 }
@@ -420,7 +420,7 @@ static inline void grant_read_unlock(struct grant_table *gt)
     percpu_read_unlock(grant_rwlock, &gt->lock);
 }
 
-static inline void grant_write_lock(struct grant_table *gt)
+static always_inline void grant_write_lock(struct grant_table *gt)
 {
     percpu_write_lock(grant_rwlock, &gt->lock);
 }
@@ -457,7 +457,7 @@ nr_active_grant_frames(struct grant_table *gt)
     return num_act_frames_from_sha_frames(nr_grant_frames(gt));
 }
 
-static inline struct active_grant_entry *
+static always_inline struct active_grant_entry *
 active_entry_acquire(struct grant_table *t, grant_ref_t e)
 {
     struct active_grant_entry *act;
diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
index 078beb1adb..29bbab5ac6 100644
--- a/xen/common/sched/core.c
+++ b/xen/common/sched/core.c
@@ -348,23 +348,28 @@ uint64_t get_cpu_idle_time(unsigned int cpu)
  * This avoids dead- or live-locks when this code is running on both
  * cpus at the same time.
  */
-static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2,
-                                   unsigned long *flags)
+static always_inline void sched_spin_lock_double(
+    spinlock_t *lock1, spinlock_t *lock2, unsigned long *flags)
 {
+    /*
+     * In order to avoid extra overhead, use the locking primitives without the
+     * speculation barrier, and introduce a single barrier here.
+     */
     if ( lock1 == lock2 )
     {
-        spin_lock_irqsave(lock1, *flags);
+        *flags = _spin_lock_irqsave(lock1);
     }
     else if ( lock1 < lock2 )
     {
-        spin_lock_irqsave(lock1, *flags);
-        spin_lock(lock2);
+        *flags = _spin_lock_irqsave(lock1);
+        _spin_lock(lock2);
     }
     else
     {
-        spin_lock_irqsave(lock2, *flags);
-        spin_lock(lock1);
+        *flags = _spin_lock_irqsave(lock2);
+        _spin_lock(lock1);
     }
+    block_lock_speculation();
 }
 
 static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2,
diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h
index 0527a8c70d..24a93dd0c1 100644
--- a/xen/common/sched/private.h
+++ b/xen/common/sched/private.h
@@ -207,8 +207,24 @@ DECLARE_PER_CPU(cpumask_t, cpumask_scratch);
 #define cpumask_scratch        (&this_cpu(cpumask_scratch))
 #define cpumask_scratch_cpu(c) (&per_cpu(cpumask_scratch, c))
 
+/*
+ * Deal with _spin_lock_irqsave() returning the flags value instead of storing
+ * it in a passed parameter.
+ */
+#define _sched_spinlock0(lock, irq) _spin_lock##irq(lock)
+#define _sched_spinlock1(lock, irq, arg) ({ \
+    BUILD_BUG_ON(sizeof(arg) != sizeof(unsigned long)); \
+    (arg) = _spin_lock##irq(lock); \
+})
+
+#define _sched_spinlock__(nr) _sched_spinlock ## nr
+#define _sched_spinlock_(nr)  _sched_spinlock__(nr)
+#define _sched_spinlock(lock, irq, args...) \
+    _sched_spinlock_(count_args(args))(lock, irq, ## args)
+
 #define sched_lock(kind, param, cpu, irq, arg...) \
-static inline spinlock_t *kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \
+static always_inline spinlock_t \
+*kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \
 { \
     for ( ; ; ) \
     { \
@@ -220,10 +236,16 @@ static inline spinlock_t *kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \
          * \
          * It may also be the case that v->processor may change but the \
          * lock may be the same; this will succeed in that case. \
+         * \
+         * Use the speculation unsafe locking helper, there's a speculation \
+         * barrier before returning to the caller. \
          */ \
-        spin_lock##irq(lock, ## arg); \
+        _sched_spinlock(lock, irq, ## arg); \
         if ( likely(lock == get_sched_res(cpu)->schedule_lock) ) \
+        { \
+            block_lock_speculation(); \
             return lock; \
+        } \
         spin_unlock##irq(lock, ## arg); \
     } \
 }
diff --git a/xen/common/timer.c b/xen/common/timer.c
index 9b5016d5ed..459668d417 100644
--- a/xen/common/timer.c
+++ b/xen/common/timer.c
@@ -240,7 +240,7 @@ static inline void deactivate_timer(struct timer *timer)
     list_add(&timer->inactive, &per_cpu(timers, timer->cpu).inactive);
 }
 
-static inline bool_t timer_lock(struct timer *timer)
+static inline bool_t timer_lock_unsafe(struct timer *timer)
 {
     unsigned int cpu;
 
@@ -254,7 +254,8 @@ static inline bool_t timer_lock(struct timer *timer)
             rcu_read_unlock(&timer_cpu_read_lock);
             return 0;
         }
-        spin_lock(&per_cpu(timers, cpu).lock);
+        /* Use the speculation unsafe variant, the wrapper has the barrier. */
+        _spin_lock(&per_cpu(timers, cpu).lock);
         if ( likely(timer->cpu == cpu) )
             break;
         spin_unlock(&per_cpu(timers, cpu).lock);
@@ -267,8 +268,9 @@ static inline bool_t timer_lock(struct timer *timer)
 #define timer_lock_irqsave(t, flags) ({         \
     bool_t __x;                                 \
     local_irq_save(flags);                      \
-    if ( !(__x = timer_lock(t)) )               \
+    if ( !(__x = timer_lock_unsafe(t)) )        \
         local_irq_restore(flags);               \
+    block_lock_speculation();                   \
     __x;                                        \
 })
 
diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 8c62b14d19..1b3d285166 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -52,9 +52,10 @@ struct pci_seg {
 
 static spinlock_t _pcidevs_lock = SPIN_LOCK_UNLOCKED;
 
-void pcidevs_lock(void)
+/* Do not use, as it has no speculation barrier, use pcidevs_lock() instead. */
+void pcidevs_lock_unsafe(void)
 {
-    spin_lock_recursive(&_pcidevs_lock);
+    _spin_lock_recursive(&_pcidevs_lock);
 }
 
 void pcidevs_unlock(void)
diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h
index 8eae9984a9..dd96e84c69 100644
--- a/xen/include/xen/event.h
+++ b/xen/include/xen/event.h
@@ -114,12 +114,12 @@ void notify_via_xen_event_channel(struct domain *ld, int lport);
 #define bucket_from_port(d, p) \
     ((group_from_port(d, p))[((p) % EVTCHNS_PER_GROUP) / EVTCHNS_PER_BUCKET])
 
-static inline void evtchn_read_lock(struct evtchn *evtchn)
+static always_inline void evtchn_read_lock(struct evtchn *evtchn)
 {
     read_lock(&evtchn->lock);
 }
 
-static inline bool evtchn_read_trylock(struct evtchn *evtchn)
+static always_inline bool evtchn_read_trylock(struct evtchn *evtchn)
 {
     return read_trylock(&evtchn->lock);
 }
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index 5975ca2f30..b373f139d1 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -155,8 +155,12 @@ struct pci_dev {
  * devices, it also sync the access to the msi capability that is not
  * interrupt handling related (the mask bit register).
  */
-
-void pcidevs_lock(void);
+void pcidevs_lock_unsafe(void);
+static always_inline void pcidevs_lock(void)
+{
+    pcidevs_lock_unsafe();
+    block_lock_speculation();
+}
 void pcidevs_unlock(void);
 bool_t __must_check pcidevs_locked(void);
 
-- 
2.44.0