author | Florian Schmaus <flow@gentoo.org> | 2022-07-01 20:20:49 +0200
---|---|---
committer | Florian Schmaus <flow@gentoo.org> | 2022-07-01 20:26:39 +0200
commit | a16128cfac11b2f6462bbbc993cced2636abb312
tree | 19b04c0f36afd341ba8a0ffbedf91ada49a332f1
Xen 4.16.2-pre-patchset-0
Signed-off-by: Florian Schmaus <flow@gentoo.org>
35 files changed, 3613 insertions, 0 deletions
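Each file below is a `git format-patch`-style patch (note the `From <hash> Mon Sep 17 00:00:00 2001` headers), numbered so the series applies in order on top of the Xen 4.16 stable line; patch 0001 bumps `XEN_EXTRAVERSION` to `.2-pre`. As a rough, hypothetical sketch of consuming the queue by hand — the base tag and paths here are assumptions, not part of this commit:

```sh
# Hypothetical usage sketch, not part of the patchset itself.
# Assumes the series is based on the RELEASE-4.16.1 tag; the ebuild that
# consumes this repository may use a different base or mechanism.
git clone https://xenbits.xen.org/git-http/xen.git
cd xen
git checkout RELEASE-4.16.1
git am ../xen-upstream-patches/*.patch   # numbered patches apply in order
make -C xen xenversion                   # should now report 4.16.2-pre
```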
diff --git a/0001-update-Xen-version-to-4.16.2-pre.patch b/0001-update-Xen-version-to-4.16.2-pre.patch new file mode 100644 index 0000000..30411de --- /dev/null +++ b/0001-update-Xen-version-to-4.16.2-pre.patch @@ -0,0 +1,25 @@ +From 5be9edb482ab20cf3e7acb05b511465294d1e19b Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 7 Jun 2022 13:55:17 +0200 +Subject: [PATCH 01/32] update Xen version to 4.16.2-pre + +--- + xen/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/Makefile b/xen/Makefile +index 8abc71cf73aa..90a29782dbf4 100644 +--- a/xen/Makefile ++++ b/xen/Makefile +@@ -2,7 +2,7 @@ + # All other places this is stored (eg. compile.h) should be autogenerated. + export XEN_VERSION = 4 + export XEN_SUBVERSION = 16 +-export XEN_EXTRAVERSION ?= .1$(XEN_VENDORVERSION) ++export XEN_EXTRAVERSION ?= .2-pre$(XEN_VENDORVERSION) + export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION) + -include xen-version + +-- +2.35.1 + diff --git a/0002-x86-irq-skip-unmap_domain_pirq-XSM-during-destructio.patch b/0002-x86-irq-skip-unmap_domain_pirq-XSM-during-destructio.patch new file mode 100644 index 0000000..fc6c2e1 --- /dev/null +++ b/0002-x86-irq-skip-unmap_domain_pirq-XSM-during-destructio.patch @@ -0,0 +1,50 @@ +From b58fb6e81bd55b6bd946abc3070770f7994c9ef9 Mon Sep 17 00:00:00 2001 +From: Jason Andryuk <jandryuk@gmail.com> +Date: Tue, 7 Jun 2022 13:55:39 +0200 +Subject: [PATCH 02/32] x86/irq: skip unmap_domain_pirq XSM during destruction +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +xsm_unmap_domain_irq was seen denying unmap_domain_pirq when called from +complete_domain_destroy as an RCU callback. The source context was an +unexpected, random domain. Since this is a xen-internal operation, +going through the XSM hook is inappropriate. + +Check d->is_dying and skip the XSM hook when set since this is a cleanup +operation for a domain being destroyed. + +Suggested-by: Roger Pau Monné <roger.pau@citrix.com> +Signed-off-by: Jason Andryuk <jandryuk@gmail.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: 2e6f95a942d1927a53f077c301db0b799c54c05a +master date: 2022-04-08 14:51:52 +0200 +--- + xen/arch/x86/irq.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c +index 67cbf6b979dc..47b86af5dce9 100644 +--- a/xen/arch/x86/irq.c ++++ b/xen/arch/x86/irq.c +@@ -2342,8 +2342,14 @@ int unmap_domain_pirq(struct domain *d, int pirq) + nr = msi_desc->msi.nvec; + } + +- ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq, +- msi_desc ? msi_desc->dev : NULL); ++ /* ++ * When called by complete_domain_destroy via RCU, current is a random ++ * domain. Skip the XSM check since this is a Xen-initiated action. ++ */ ++ if ( !d->is_dying ) ++ ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq, ++ msi_desc ? 
msi_desc->dev : NULL); ++ + if ( ret ) + goto done; + +-- +2.35.1 + diff --git a/0003-xen-fix-XEN_DOMCTL_gdbsx_guestmemio-crash.patch b/0003-xen-fix-XEN_DOMCTL_gdbsx_guestmemio-crash.patch new file mode 100644 index 0000000..905993b --- /dev/null +++ b/0003-xen-fix-XEN_DOMCTL_gdbsx_guestmemio-crash.patch @@ -0,0 +1,63 @@ +From 6c6bbfdff9374ef41f84c4ebed7b8a7a40767ef6 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 7 Jun 2022 13:56:54 +0200 +Subject: [PATCH 03/32] xen: fix XEN_DOMCTL_gdbsx_guestmemio crash + +A hypervisor built without CONFIG_GDBSX will crash in case the +XEN_DOMCTL_gdbsx_guestmemio domctl is being called, as the call will +end up in iommu_do_domctl() with d == NULL: + + (XEN) CPU: 6 + (XEN) RIP: e008:[<ffff82d040269984>] iommu_do_domctl+0x4/0x30 + (XEN) RFLAGS: 0000000000010202 CONTEXT: hypervisor (d0v0) + (XEN) rax: 00000000000003e8 rbx: ffff830856277ef8 rcx: ffff830856277fff + ... + (XEN) Xen call trace: + (XEN) [<ffff82d040269984>] R iommu_do_domctl+0x4/0x30 + (XEN) [<ffff82d04035cd5f>] S arch_do_domctl+0x7f/0x2330 + (XEN) [<ffff82d040239e46>] S do_domctl+0xe56/0x1930 + (XEN) [<ffff82d040238ff0>] S do_domctl+0/0x1930 + (XEN) [<ffff82d0402f8c59>] S pv_hypercall+0x99/0x110 + (XEN) [<ffff82d0402f5161>] S arch/x86/pv/domain.c#_toggle_guest_pt+0x11/0x90 + (XEN) [<ffff82d040366288>] S lstar_enter+0x128/0x130 + (XEN) + (XEN) Pagetable walk from 0000000000000144: + (XEN) L4[0x000] = 0000000000000000 ffffffffffffffff + (XEN) + (XEN) **************************************** + (XEN) Panic on CPU 6: + (XEN) FATAL PAGE FAULT + (XEN) [error_code=0000] + (XEN) Faulting linear address: 0000000000000144 + (XEN) **************************************** + +It used to be permitted to pass DOMID_IDLE to dbg_rw_mem(), which is why the +special case skipping the domid checks exists. Now that it is only permitted +to pass proper domids, remove the special case, making 'd' always valid. + +Reported-by: Cheyenne Wills <cheyenne.wills@gmail.com> +Fixes: e726a82ca0dc ("xen: make gdbsx support configurable") +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: f00daf1fb3213a9b0335d9dcd90fe9cb5c02b7a9 +master date: 2022-04-19 17:07:08 +0100 +--- + xen/common/domctl.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/xen/common/domctl.c b/xen/common/domctl.c +index 271862ae587f..419e4070f59d 100644 +--- a/xen/common/domctl.c ++++ b/xen/common/domctl.c +@@ -304,7 +304,6 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) + if ( op->domain == DOMID_INVALID ) + { + case XEN_DOMCTL_createdomain: +- case XEN_DOMCTL_gdbsx_guestmemio: + d = NULL; + break; + } +-- +2.35.1 + diff --git a/0004-VT-d-refuse-to-use-IOMMU-with-reserved-CAP.ND-value.patch b/0004-VT-d-refuse-to-use-IOMMU-with-reserved-CAP.ND-value.patch new file mode 100644 index 0000000..c566888 --- /dev/null +++ b/0004-VT-d-refuse-to-use-IOMMU-with-reserved-CAP.ND-value.patch @@ -0,0 +1,49 @@ +From b378ee56c7e0bb5eeb35dcc55b3d29e5f50eb566 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 7 Jun 2022 13:58:16 +0200 +Subject: [PATCH 04/32] VT-d: refuse to use IOMMU with reserved CAP.ND value +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The field taking the value 7 (resulting in 18-bit DIDs when using the +calculation in cap_ndoms(), when the DID fields are only 16 bits wide) +is reserved. 
Instead of misbehaving in case we would encounter such an +IOMMU, refuse to use it. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Kevin Tian <kevin.tian@intel.com> +master commit: a1545fbf45c689aff39ce76a6eaa609d32ef72a7 +master date: 2022-04-20 10:54:26 +0200 +--- + xen/drivers/passthrough/vtd/iommu.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index 93dd8aa643aa..8975c1de61bc 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -1279,8 +1279,11 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd) + + quirk_iommu_caps(iommu); + ++ nr_dom = cap_ndoms(iommu->cap); ++ + if ( cap_fault_reg_offset(iommu->cap) + + cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE || ++ ((nr_dom - 1) >> 16) /* I.e. cap.nd > 6 */ || + ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE ) + { + printk(XENLOG_ERR VTDPREFIX "IOMMU: unsupported\n"); +@@ -1305,7 +1308,6 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd) + vtd_ops.sync_cache = sync_cache; + + /* allocate domain id bitmap */ +- nr_dom = cap_ndoms(iommu->cap); + iommu->domid_bitmap = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_dom)); + if ( !iommu->domid_bitmap ) + return -ENOMEM; +-- +2.35.1 + diff --git a/0005-x86-mm-avoid-inadvertently-degrading-a-TLB-flush-to-.patch b/0005-x86-mm-avoid-inadvertently-degrading-a-TLB-flush-to-.patch new file mode 100644 index 0000000..6410aaa --- /dev/null +++ b/0005-x86-mm-avoid-inadvertently-degrading-a-TLB-flush-to-.patch @@ -0,0 +1,116 @@ +From 7c003ab4a398ff4ddd54d15d4158cffb463134cc Mon Sep 17 00:00:00 2001 +From: David Vrabel <dvrabel@amazon.co.uk> +Date: Tue, 7 Jun 2022 13:59:31 +0200 +Subject: [PATCH 05/32] x86/mm: avoid inadvertently degrading a TLB flush to + local only +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +If the direct map is incorrectly modified with interrupts disabled, +the required TLB flushes are degraded to flushing the local CPU only. + +This could lead to very hard to diagnose problems as different CPUs will +end up with different views of memory. Although, no such issues have yet +been identified. + +Change the check in the flush_area() macro to look at system_state +instead. This defers the switch from local to all later in the boot +(see xen/arch/x86/setup.c:__start_xen()). This is fine because +additional PCPUs are not brought up until after the system state is +SYS_STATE_smp_boot. + +Signed-off-by: David Vrabel <dvrabel@amazon.co.uk> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +x86/flushtlb: remove flush_area check on system state + +Booting with Shadow Stacks leads to the following assert on a debug +hypervisor: + +Assertion 'local_irq_is_enabled()' failed at arch/x86/smp.c:265 +----[ Xen-4.17.0-10.24-d x86_64 debug=y Not tainted ]---- +CPU: 0 +RIP: e008:[<ffff82d040345300>] flush_area_mask+0x40/0x13e +[...] 
+Xen call trace: + [<ffff82d040345300>] R flush_area_mask+0x40/0x13e + [<ffff82d040338a40>] F modify_xen_mappings+0xc5/0x958 + [<ffff82d0404474f9>] F arch/x86/alternative.c#_alternative_instructions+0xb7/0xb9 + [<ffff82d0404476cc>] F alternative_branches+0xf/0x12 + [<ffff82d04044e37d>] F __start_xen+0x1ef4/0x2776 + [<ffff82d040203344>] F __high_start+0x94/0xa0 + +This is due to SYS_STATE_smp_boot being set before calling +alternative_branches(), and the flush in modify_xen_mappings() then +using flush_area_all() with interrupts disabled. Note that +alternative_branches() is called before APs are started, so the flush +must be a local one (and indeed the cpumask passed to +flush_area_mask() just contains one CPU). + +Take the opportunity to simplify a bit the logic and make flush_area() +an alias of flush_area_all() in mm.c, taking into account that +cpu_online_map just contains the BSP before APs are started. This +requires widening the assert in flush_area_mask() to allow being +called with interrupts disabled as long as it's strictly a local only +flush. + +The overall result is that a conditional can be removed from +flush_area(). + +While there also introduce an ASSERT to check that a vCPU state flush +is not issued for the local CPU only. + +Fixes: 78e072bc37 ('x86/mm: avoid inadvertently degrading a TLB flush to local only') +Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com> +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: 78e072bc375043e81691a59454e09f0b38241ddd +master date: 2022-04-20 10:55:01 +0200 +master commit: 9f735ee4903f1b9f1966bb4ba5b5616b03ae08b5 +master date: 2022-05-25 11:09:46 +0200 +--- + xen/arch/x86/mm.c | 10 ++-------- + xen/arch/x86/smp.c | 5 ++++- + 2 files changed, 6 insertions(+), 9 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 4d799032dc82..e222d9aa98ee 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -5051,14 +5051,8 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v) + #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f)) + #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f)) + +-/* +- * map_pages_to_xen() can be called with interrupts disabled during +- * early bootstrap. In this case it is safe to use flush_area_local() +- * and avoid locking because only the local CPU is online. +- */ +-#define flush_area(v,f) (!local_irq_is_enabled() ? \ +- flush_area_local((const void *)v, f) : \ +- flush_area_all((const void *)v, f)) ++/* flush_area_all() can be used prior to any other CPU being online. */ ++#define flush_area(v, f) flush_area_all((const void *)(v), f) + + #define L3T_INIT(page) (page) = ZERO_BLOCK_PTR + +diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c +index eef0f9c6cbf4..3556ec116608 100644 +--- a/xen/arch/x86/smp.c ++++ b/xen/arch/x86/smp.c +@@ -262,7 +262,10 @@ void flush_area_mask(const cpumask_t *mask, const void *va, unsigned int flags) + { + unsigned int cpu = smp_processor_id(); + +- ASSERT(local_irq_is_enabled()); ++ /* Local flushes can be performed with interrupts disabled. */ ++ ASSERT(local_irq_is_enabled() || cpumask_subset(mask, cpumask_of(cpu))); ++ /* Exclude use of FLUSH_VCPU_STATE for the local CPU. 
*/ ++ ASSERT(!cpumask_test_cpu(cpu, mask) || !(flags & FLUSH_VCPU_STATE)); + + if ( (flags & ~(FLUSH_VCPU_STATE | FLUSH_ORDER_MASK)) && + cpumask_test_cpu(cpu, mask) ) +-- +2.35.1 + diff --git a/0006-xen-build-Fix-dependency-for-the-MAP-rule.patch b/0006-xen-build-Fix-dependency-for-the-MAP-rule.patch new file mode 100644 index 0000000..6489cba --- /dev/null +++ b/0006-xen-build-Fix-dependency-for-the-MAP-rule.patch @@ -0,0 +1,29 @@ +From 4bb8c34ba4241c2bf7845cd8b80c17530dbfb085 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Tue, 7 Jun 2022 14:00:09 +0200 +Subject: [PATCH 06/32] xen/build: Fix dependency for the MAP rule + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Acked-by: Jan Beulich <jbeulich@suse.com> +master commit: e1e72198213b80b7a82bdc90f96ed05ae4f53e20 +master date: 2022-04-20 19:10:59 +0100 +--- + xen/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/Makefile b/xen/Makefile +index 90a29782dbf4..ce4eca3ee4d7 100644 +--- a/xen/Makefile ++++ b/xen/Makefile +@@ -507,7 +507,7 @@ cscope: + cscope -k -b -q + + .PHONY: _MAP +-_MAP: ++_MAP: $(TARGET) + $(NM) -n $(TARGET)-syms | grep -v '\(compiled\)\|\(\.o$$\)\|\( [aUw] \)\|\(\.\.ng$$\)\|\(LASH[RL]DI\)' > System.map + + %.o %.i %.s: %.c FORCE +-- +2.35.1 + diff --git a/0007-tools-libs-evtchn-don-t-set-errno-to-negative-values.patch b/0007-tools-libs-evtchn-don-t-set-errno-to-negative-values.patch new file mode 100644 index 0000000..2f02fcc --- /dev/null +++ b/0007-tools-libs-evtchn-don-t-set-errno-to-negative-values.patch @@ -0,0 +1,74 @@ +From 13a29f3756bc4cab96c59f46c3875b483553fb8f Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 7 Jun 2022 14:00:31 +0200 +Subject: [PATCH 07/32] tools/libs/evtchn: don't set errno to negative values + +Setting errno to a negative value makes no sense. 
+ +Fixes: 6b6500b3cbaa ("tools/libs/evtchn: Add support for restricting a handle") +Signed-off-by: Juergen Gross <jgross@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: 60245b71c1cd001686fa7b7a26869cbcb80d074c +master date: 2022-04-22 20:39:34 +0100 +--- + tools/libs/evtchn/freebsd.c | 2 +- + tools/libs/evtchn/minios.c | 2 +- + tools/libs/evtchn/netbsd.c | 2 +- + tools/libs/evtchn/solaris.c | 2 +- + 4 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/tools/libs/evtchn/freebsd.c b/tools/libs/evtchn/freebsd.c +index 7427ab240860..fa17a0f8dbb5 100644 +--- a/tools/libs/evtchn/freebsd.c ++++ b/tools/libs/evtchn/freebsd.c +@@ -58,7 +58,7 @@ int osdep_evtchn_close(xenevtchn_handle *xce) + + int osdep_evtchn_restrict(xenevtchn_handle *xce, domid_t domid) + { +- errno = -EOPNOTSUPP; ++ errno = EOPNOTSUPP; + + return -1; + } +diff --git a/tools/libs/evtchn/minios.c b/tools/libs/evtchn/minios.c +index e5dfdc5ef52e..c0bd5429eea2 100644 +--- a/tools/libs/evtchn/minios.c ++++ b/tools/libs/evtchn/minios.c +@@ -97,7 +97,7 @@ int osdep_evtchn_close(xenevtchn_handle *xce) + + int osdep_evtchn_restrict(xenevtchn_handle *xce, domid_t domid) + { +- errno = -EOPNOTSUPP; ++ errno = EOPNOTSUPP; + + return -1; + } +diff --git a/tools/libs/evtchn/netbsd.c b/tools/libs/evtchn/netbsd.c +index 1cebc21ffce0..56409513bc23 100644 +--- a/tools/libs/evtchn/netbsd.c ++++ b/tools/libs/evtchn/netbsd.c +@@ -53,7 +53,7 @@ int osdep_evtchn_close(xenevtchn_handle *xce) + + int osdep_evtchn_restrict(xenevtchn_handle *xce, domid_t domid) + { +- errno = -EOPNOTSUPP; ++ errno = EOPNOTSUPP; + + return -1; + } +diff --git a/tools/libs/evtchn/solaris.c b/tools/libs/evtchn/solaris.c +index df9579df1778..beaa7721425f 100644 +--- a/tools/libs/evtchn/solaris.c ++++ b/tools/libs/evtchn/solaris.c +@@ -53,7 +53,7 @@ int osdep_evtchn_close(xenevtchn_handle *xce) + + int osdep_evtchn_restrict(xenevtchn_handle *xce, domid_t domid) + { +- errno = -EOPNOTSUPP; ++ errno = EOPNOTSUPP; + return -1; + } + +-- +2.35.1 + diff --git a/0008-tools-libs-ctrl-don-t-set-errno-to-a-negative-value.patch b/0008-tools-libs-ctrl-don-t-set-errno-to-a-negative-value.patch new file mode 100644 index 0000000..acd7955 --- /dev/null +++ b/0008-tools-libs-ctrl-don-t-set-errno-to-a-negative-value.patch @@ -0,0 +1,36 @@ +From ba62afdbc31a8cfe897191efd25ed4449d9acd94 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 7 Jun 2022 14:01:03 +0200 +Subject: [PATCH 08/32] tools/libs/ctrl: don't set errno to a negative value + +The claimed reason for setting errno to -1 is wrong. On x86 +xc_domain_pod_target() will set errno to a sane value in the error +case. + +Fixes: ff1745d5882b ("tools: libxl: do not set the PoD target on ARM") +Signed-off-by: Juergen Gross <jgross@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: a0fb7e0e73483ed042d5ca34861a891a51ad337b +master date: 2022-04-22 20:39:34 +0100 +--- + tools/libs/ctrl/xc_domain.c | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +diff --git a/tools/libs/ctrl/xc_domain.c b/tools/libs/ctrl/xc_domain.c +index b155d6afd2ef..9d675c8f21e1 100644 +--- a/tools/libs/ctrl/xc_domain.c ++++ b/tools/libs/ctrl/xc_domain.c +@@ -1297,9 +1297,7 @@ int xc_domain_get_pod_target(xc_interface *xch, + uint64_t *pod_cache_pages, + uint64_t *pod_entries) + { +- /* On x86 (above) xc_domain_pod_target will incorrectly return -1 +- * with errno==-1 on error. Do the same for least surprise. 
*/ +- errno = -1; ++ errno = EOPNOTSUPP; + return -1; + } + #endif +-- +2.35.1 + diff --git a/0009-tools-libs-guest-don-t-set-errno-to-a-negative-value.patch b/0009-tools-libs-guest-don-t-set-errno-to-a-negative-value.patch new file mode 100644 index 0000000..41eb1f1 --- /dev/null +++ b/0009-tools-libs-guest-don-t-set-errno-to-a-negative-value.patch @@ -0,0 +1,32 @@ +From a2cf30eec08db5df974a9e8bb7366fee8fc7fcd9 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 7 Jun 2022 14:01:27 +0200 +Subject: [PATCH 09/32] tools/libs/guest: don't set errno to a negative value + +Setting errno to a negative error value makes no sense. + +Fixes: cb99a64029c9 ("libxc: arm: allow passing a device tree blob to the guest") +Signed-off-by: Juergen Gross <jgross@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: 438e96ab479495a932391a22e219ee62fa8c4f47 +master date: 2022-04-22 20:39:34 +0100 +--- + tools/libs/guest/xg_dom_core.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/libs/guest/xg_dom_core.c b/tools/libs/guest/xg_dom_core.c +index 2e4c1330ea6b..65975a75da37 100644 +--- a/tools/libs/guest/xg_dom_core.c ++++ b/tools/libs/guest/xg_dom_core.c +@@ -856,7 +856,7 @@ int xc_dom_devicetree_file(struct xc_dom_image *dom, const char *filename) + return -1; + return 0; + #else +- errno = -EINVAL; ++ errno = EINVAL; + return -1; + #endif + } +-- +2.35.1 + diff --git a/0010-tools-libs-light-don-t-set-errno-to-a-negative-value.patch b/0010-tools-libs-light-don-t-set-errno-to-a-negative-value.patch new file mode 100644 index 0000000..a83e1cc --- /dev/null +++ b/0010-tools-libs-light-don-t-set-errno-to-a-negative-value.patch @@ -0,0 +1,32 @@ +From 15391de8e2bb6153eadd483154c53044ab53d98d Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 7 Jun 2022 14:01:44 +0200 +Subject: [PATCH 10/32] tools/libs/light: don't set errno to a negative value + +Setting errno to a negative value makes no sense. + +Fixes: e78e8b9bb649 ("libxl: Add interface for querying hypervisor about PCI topology") +Signed-off-by: Juergen Gross <jgross@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: 2419a159fb943c24a6f2439604b9fdb1478fcd08 +master date: 2022-04-22 20:39:34 +0100 +--- + tools/libs/light/libxl_linux.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/libs/light/libxl_linux.c b/tools/libs/light/libxl_linux.c +index 8d62dfd255cb..27f2bce71837 100644 +--- a/tools/libs/light/libxl_linux.c ++++ b/tools/libs/light/libxl_linux.c +@@ -288,7 +288,7 @@ int libxl__pci_topology_init(libxl__gc *gc, + if (i == num_devs) { + LOG(ERROR, "Too many devices"); + err = ERROR_FAIL; +- errno = -ENOSPC; ++ errno = ENOSPC; + goto out; + } + +-- +2.35.1 + diff --git a/0011-xen-iommu-cleanup-iommu-related-domctl-handling.patch b/0011-xen-iommu-cleanup-iommu-related-domctl-handling.patch new file mode 100644 index 0000000..b62ae9b --- /dev/null +++ b/0011-xen-iommu-cleanup-iommu-related-domctl-handling.patch @@ -0,0 +1,112 @@ +From a6c32abd144ec6443c6a433b5a2ac00e2615aa86 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 7 Jun 2022 14:02:08 +0200 +Subject: [PATCH 11/32] xen/iommu: cleanup iommu related domctl handling + +Today iommu_do_domctl() is being called from arch_do_domctl() in the +"default:" case of a switch statement. This has led already to crashes +due to unvalidated parameters. 
+ +Fix that by moving the call of iommu_do_domctl() to the main switch +statement of do_domctl(). + +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> # Arm +master commit: 9cd7e31b3f584e97a138a770cfb031a91a867936 +master date: 2022-04-26 10:23:58 +0200 +--- + xen/arch/arm/domctl.c | 11 +---------- + xen/arch/x86/domctl.c | 2 +- + xen/common/domctl.c | 7 +++++++ + xen/include/xen/iommu.h | 12 +++++++++--- + 4 files changed, 18 insertions(+), 14 deletions(-) + +diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c +index 6245af6d0bab..1baf25c3d98b 100644 +--- a/xen/arch/arm/domctl.c ++++ b/xen/arch/arm/domctl.c +@@ -176,16 +176,7 @@ long arch_do_domctl(struct xen_domctl *domctl, struct domain *d, + return rc; + } + default: +- { +- int rc; +- +- rc = subarch_do_domctl(domctl, d, u_domctl); +- +- if ( rc == -ENOSYS ) +- rc = iommu_do_domctl(domctl, d, u_domctl); +- +- return rc; +- } ++ return subarch_do_domctl(domctl, d, u_domctl); + } + } + +diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c +index 7d102e0647ec..0fa51f2ebd10 100644 +--- a/xen/arch/x86/domctl.c ++++ b/xen/arch/x86/domctl.c +@@ -1380,7 +1380,7 @@ long arch_do_domctl( + break; + + default: +- ret = iommu_do_domctl(domctl, d, u_domctl); ++ ret = -ENOSYS; + break; + } + +diff --git a/xen/common/domctl.c b/xen/common/domctl.c +index 419e4070f59d..65d2a4588b71 100644 +--- a/xen/common/domctl.c ++++ b/xen/common/domctl.c +@@ -870,6 +870,13 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) + copyback = 1; + break; + ++ case XEN_DOMCTL_assign_device: ++ case XEN_DOMCTL_test_assign_device: ++ case XEN_DOMCTL_deassign_device: ++ case XEN_DOMCTL_get_device_group: ++ ret = iommu_do_domctl(op, d, u_domctl); ++ break; ++ + default: + ret = arch_do_domctl(op, d, u_domctl); + break; +diff --git a/xen/include/xen/iommu.h b/xen/include/xen/iommu.h +index 92b2d23f0ba2..861579562e8a 100644 +--- a/xen/include/xen/iommu.h ++++ b/xen/include/xen/iommu.h +@@ -342,8 +342,17 @@ struct domain_iommu { + /* Does the IOMMU pagetable need to be kept synchronized with the P2M */ + #ifdef CONFIG_HAS_PASSTHROUGH + #define need_iommu_pt_sync(d) (dom_iommu(d)->need_sync) ++ ++int iommu_do_domctl(struct xen_domctl *domctl, struct domain *d, ++ XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl); + #else + #define need_iommu_pt_sync(d) ({ (void)(d); false; }) ++ ++static inline int iommu_do_domctl(struct xen_domctl *domctl, struct domain *d, ++ XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) ++{ ++ return -ENOSYS; ++} + #endif + + int __must_check iommu_suspend(void); +@@ -357,9 +366,6 @@ int iommu_do_pci_domctl(struct xen_domctl *, struct domain *d, + XEN_GUEST_HANDLE_PARAM(xen_domctl_t)); + #endif + +-int iommu_do_domctl(struct xen_domctl *, struct domain *d, +- XEN_GUEST_HANDLE_PARAM(xen_domctl_t)); +- + void iommu_dev_iotlb_flush_timeout(struct domain *d, struct pci_dev *pdev); + + /* +-- +2.35.1 + diff --git a/0012-IOMMU-make-domctl-handler-tolerate-NULL-domain.patch b/0012-IOMMU-make-domctl-handler-tolerate-NULL-domain.patch new file mode 100644 index 0000000..ff26651 --- /dev/null +++ b/0012-IOMMU-make-domctl-handler-tolerate-NULL-domain.patch @@ -0,0 +1,36 @@ +From 4cf9a7c7bdb9d544fbac81105bbc1059ba3dd932 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 7 Jun 2022 14:02:30 +0200 +Subject: [PATCH 12/32] IOMMU: make domctl handler tolerate NULL domain + +Besides the reporter's issue of hitting a 
NULL deref when !CONFIG_GDBSX, +XEN_DOMCTL_test_assign_device can legitimately end up having NULL passed +here, when the domctl was passed DOMID_INVALID. + +Fixes: 71e617a6b8f6 ("use is_iommu_enabled() where appropriate...") +Reported-by: Cheyenne Wills <cheyenne.wills@gmail.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Paul Durrant <paul@xen.org> +Reviewed-by: Juergen Gross <jgross@suse.com> +master commit: fa4d84e6dd3c3bfd23a525b75a5483d4ce15adbb +master date: 2022-04-26 10:25:54 +0200 +--- + xen/drivers/passthrough/iommu.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c +index caaba62c8865..287f63fc736f 100644 +--- a/xen/drivers/passthrough/iommu.c ++++ b/xen/drivers/passthrough/iommu.c +@@ -535,7 +535,7 @@ int iommu_do_domctl( + { + int ret = -ENODEV; + +- if ( !is_iommu_enabled(d) ) ++ if ( !(d ? is_iommu_enabled(d) : iommu_enabled) ) + return -EOPNOTSUPP; + + #ifdef CONFIG_HAS_PCI +-- +2.35.1 + diff --git a/0013-IOMMU-x86-disallow-device-assignment-to-PoD-guests.patch b/0013-IOMMU-x86-disallow-device-assignment-to-PoD-guests.patch new file mode 100644 index 0000000..efadef6 --- /dev/null +++ b/0013-IOMMU-x86-disallow-device-assignment-to-PoD-guests.patch @@ -0,0 +1,229 @@ +From 838f6c211f7f05f107e1acdfb0977ab61ec0bf2e Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 7 Jun 2022 14:03:20 +0200 +Subject: [PATCH 13/32] IOMMU/x86: disallow device assignment to PoD guests +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +While it is okay for IOMMU page tables to be set up for guests starting +in PoD mode, actual device assignment may only occur once all PoD +entries have been removed from the P2M. So far this was enforced only +for boot-time assignment, and only in the tool stack. + +Also use the new function to replace p2m_pod_entry_count(): Its unlocked +access to p2m->pod.entry_count wasn't really okay (irrespective of the +result being stale by the time the caller gets to see it). Nor was the +use of that function in line with the immediately preceding comment: A +PoD guest isn't just one with a non-zero entry count, but also one with +a non-empty cache (e.g. prior to actually launching the guest). + +To allow the tool stack to see a consistent snapshot of PoD state, move +the tail of XENMEM_{get,set}_pod_target handling into a function, adding +proper locking there. + +In libxl take the liberty to use the new local variable r also for a +pre-existing call into libxc. 
+ +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: ad4312d764e8b40a1e45b64aac6d840a60c59f13 +master date: 2022-05-02 08:48:02 +0200 +--- + xen/arch/x86/mm.c | 6 +--- + xen/arch/x86/mm/p2m-pod.c | 43 ++++++++++++++++++++++++++++- + xen/common/vm_event.c | 2 +- + xen/drivers/passthrough/x86/iommu.c | 3 +- + xen/include/asm-x86/p2m.h | 21 +++++++------- + 5 files changed, 57 insertions(+), 18 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index e222d9aa98ee..4ee2de11051d 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -4777,7 +4777,6 @@ long arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg) + { + xen_pod_target_t target; + struct domain *d; +- struct p2m_domain *p2m; + + if ( copy_from_guest(&target, arg, 1) ) + return -EFAULT; +@@ -4812,10 +4811,7 @@ long arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg) + } + else if ( rc >= 0 ) + { +- p2m = p2m_get_hostp2m(d); +- target.tot_pages = domain_tot_pages(d); +- target.pod_cache_pages = p2m->pod.count; +- target.pod_entries = p2m->pod.entry_count; ++ p2m_pod_get_mem_target(d, &target); + + if ( __copy_to_guest(arg, &target, 1) ) + { +diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c +index d8d1a0ce7ed7..a3c9d8a97423 100644 +--- a/xen/arch/x86/mm/p2m-pod.c ++++ b/xen/arch/x86/mm/p2m-pod.c +@@ -20,6 +20,7 @@ + */ + + #include <xen/event.h> ++#include <xen/iocap.h> + #include <xen/ioreq.h> + #include <xen/mm.h> + #include <xen/sched.h> +@@ -362,7 +363,10 @@ p2m_pod_set_mem_target(struct domain *d, unsigned long target) + + ASSERT( pod_target >= p2m->pod.count ); + +- ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/); ++ if ( has_arch_pdevs(d) || cache_flush_permitted(d) ) ++ ret = -ENOTEMPTY; ++ else ++ ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/); + + out: + pod_unlock(p2m); +@@ -370,6 +374,23 @@ out: + return ret; + } + ++void p2m_pod_get_mem_target(const struct domain *d, xen_pod_target_t *target) ++{ ++ struct p2m_domain *p2m = p2m_get_hostp2m(d); ++ ++ ASSERT(is_hvm_domain(d)); ++ ++ pod_lock(p2m); ++ lock_page_alloc(p2m); ++ ++ target->tot_pages = domain_tot_pages(d); ++ target->pod_cache_pages = p2m->pod.count; ++ target->pod_entries = p2m->pod.entry_count; ++ ++ unlock_page_alloc(p2m); ++ pod_unlock(p2m); ++} ++ + int p2m_pod_empty_cache(struct domain *d) + { + struct p2m_domain *p2m = p2m_get_hostp2m(d); +@@ -1387,6 +1408,9 @@ guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn, + if ( !paging_mode_translate(d) ) + return -EINVAL; + ++ if ( has_arch_pdevs(d) || cache_flush_permitted(d) ) ++ return -ENOTEMPTY; ++ + do { + rc = mark_populate_on_demand(d, gfn, chunk_order); + +@@ -1408,3 +1432,20 @@ void p2m_pod_init(struct p2m_domain *p2m) + for ( i = 0; i < ARRAY_SIZE(p2m->pod.mrp.list); ++i ) + p2m->pod.mrp.list[i] = gfn_x(INVALID_GFN); + } ++ ++bool p2m_pod_active(const struct domain *d) ++{ ++ struct p2m_domain *p2m; ++ bool res; ++ ++ if ( !is_hvm_domain(d) ) ++ return false; ++ ++ p2m = p2m_get_hostp2m(d); ++ ++ pod_lock(p2m); ++ res = p2m->pod.entry_count | p2m->pod.count; ++ pod_unlock(p2m); ++ ++ return res; ++} +diff --git a/xen/common/vm_event.c b/xen/common/vm_event.c +index 70ab3ba406ff..21d2f0edf727 100644 +--- a/xen/common/vm_event.c ++++ b/xen/common/vm_event.c +@@ -639,7 +639,7 @@ int vm_event_domctl(struct domain *d, struct xen_domctl_vm_event_op *vec) + + rc = -EXDEV; + /* Disallow paging in a PoD guest */ +- if ( 
p2m_pod_entry_count(p2m_get_hostp2m(d)) ) ++ if ( p2m_pod_active(d) ) + break; + + /* domain_pause() not required here, see XSA-99 */ +diff --git a/xen/drivers/passthrough/x86/iommu.c b/xen/drivers/passthrough/x86/iommu.c +index a36a6bd4b249..dc9936e16930 100644 +--- a/xen/drivers/passthrough/x86/iommu.c ++++ b/xen/drivers/passthrough/x86/iommu.c +@@ -502,11 +502,12 @@ bool arch_iommu_use_permitted(const struct domain *d) + { + /* + * Prevent device assign if mem paging, mem sharing or log-dirty +- * have been enabled for this domain. ++ * have been enabled for this domain, or if PoD is still in active use. + */ + return d == dom_io || + (likely(!mem_sharing_enabled(d)) && + likely(!mem_paging_enabled(d)) && ++ likely(!p2m_pod_active(d)) && + likely(!p2m_get_hostp2m(d)->global_logdirty)); + } + +diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h +index 357a8087481e..f2af7a746ced 100644 +--- a/xen/include/asm-x86/p2m.h ++++ b/xen/include/asm-x86/p2m.h +@@ -661,6 +661,12 @@ int p2m_pod_empty_cache(struct domain *d); + * domain matches target */ + int p2m_pod_set_mem_target(struct domain *d, unsigned long target); + ++/* Obtain a consistent snapshot of PoD related domain state. */ ++void p2m_pod_get_mem_target(const struct domain *d, xen_pod_target_t *target); ++ ++/* Check whether PoD is (still) active in a domain. */ ++bool p2m_pod_active(const struct domain *d); ++ + /* Scan pod cache when offline/broken page triggered */ + int + p2m_pod_offline_or_broken_hit(struct page_info *p); +@@ -669,11 +675,6 @@ p2m_pod_offline_or_broken_hit(struct page_info *p); + void + p2m_pod_offline_or_broken_replace(struct page_info *p); + +-static inline long p2m_pod_entry_count(const struct p2m_domain *p2m) +-{ +- return p2m->pod.entry_count; +-} +- + void p2m_pod_init(struct p2m_domain *p2m); + + #else +@@ -689,6 +690,11 @@ static inline int p2m_pod_empty_cache(struct domain *d) + return 0; + } + ++static inline bool p2m_pod_active(const struct domain *d) ++{ ++ return false; ++} ++ + static inline int p2m_pod_offline_or_broken_hit(struct page_info *p) + { + return 0; +@@ -699,11 +705,6 @@ static inline void p2m_pod_offline_or_broken_replace(struct page_info *p) + ASSERT_UNREACHABLE(); + } + +-static inline long p2m_pod_entry_count(const struct p2m_domain *p2m) +-{ +- return 0; +-} +- + static inline void p2m_pod_init(struct p2m_domain *p2m) {} + + #endif +-- +2.35.1 + diff --git a/0014-x86-msr-handle-reads-to-MSR_P5_MC_-ADDR-TYPE.patch b/0014-x86-msr-handle-reads-to-MSR_P5_MC_-ADDR-TYPE.patch new file mode 100644 index 0000000..09f56f5 --- /dev/null +++ b/0014-x86-msr-handle-reads-to-MSR_P5_MC_-ADDR-TYPE.patch @@ -0,0 +1,121 @@ +From 9ebe2ba83644ec6cd33a93c68dab5f551adcbea0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Date: Tue, 7 Jun 2022 14:04:16 +0200 +Subject: [PATCH 14/32] x86/msr: handle reads to MSR_P5_MC_{ADDR,TYPE} +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Windows Server 2019 Essentials will unconditionally attempt to read +P5_MC_ADDR MSR at boot and throw a BSOD if injected a #GP. + +Fix this by mapping MSR_P5_MC_{ADDR,TYPE} to +MSR_IA32_MCi_{ADDR,STATUS}, as reported also done by hardware in Intel +SDM "Mapping of the Pentium Processor Machine-Check Errors to the +Machine-Check Architecture" section. 
+ +Reported-by: Steffen Einsle <einsle@phptrix.de> +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: ce59e472b581e4923f6892172dde62b88c8aa8b7 +master date: 2022-05-02 08:49:12 +0200 +--- + xen/arch/x86/cpu/mcheck/mce.h | 6 ++++++ + xen/arch/x86/cpu/mcheck/mce_intel.c | 19 +++++++++++++++++++ + xen/arch/x86/cpu/mcheck/vmce.c | 2 ++ + xen/arch/x86/msr.c | 2 ++ + xen/include/asm-x86/msr-index.h | 3 +++ + 5 files changed, 32 insertions(+) + +diff --git a/xen/arch/x86/cpu/mcheck/mce.h b/xen/arch/x86/cpu/mcheck/mce.h +index 195362691904..192315ecfa3d 100644 +--- a/xen/arch/x86/cpu/mcheck/mce.h ++++ b/xen/arch/x86/cpu/mcheck/mce.h +@@ -169,6 +169,12 @@ static inline int mce_vendor_bank_msr(const struct vcpu *v, uint32_t msr) + if (msr >= MSR_IA32_MC0_CTL2 && + msr < MSR_IA32_MCx_CTL2(v->arch.vmce.mcg_cap & MCG_CAP_COUNT) ) + return 1; ++ fallthrough; ++ ++ case X86_VENDOR_CENTAUR: ++ case X86_VENDOR_SHANGHAI: ++ if (msr == MSR_P5_MC_ADDR || msr == MSR_P5_MC_TYPE) ++ return 1; + break; + + case X86_VENDOR_AMD: +diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c b/xen/arch/x86/cpu/mcheck/mce_intel.c +index bb9f3a3ff795..d364e9bf5ad1 100644 +--- a/xen/arch/x86/cpu/mcheck/mce_intel.c ++++ b/xen/arch/x86/cpu/mcheck/mce_intel.c +@@ -1001,8 +1001,27 @@ int vmce_intel_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val) + + int vmce_intel_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val) + { ++ const struct cpuid_policy *cp = v->domain->arch.cpuid; + unsigned int bank = msr - MSR_IA32_MC0_CTL2; + ++ switch ( msr ) ++ { ++ case MSR_P5_MC_ADDR: ++ /* ++ * Bank 0 is used for the 'bank 0 quirk' on older processors. ++ * See vcpu_fill_mc_msrs() for reference. ++ */ ++ *val = v->arch.vmce.bank[1].mci_addr; ++ return 1; ++ ++ case MSR_P5_MC_TYPE: ++ *val = v->arch.vmce.bank[1].mci_status; ++ return 1; ++ } ++ ++ if ( !(cp->x86_vendor & X86_VENDOR_INTEL) ) ++ return 0; ++ + if ( bank < GUEST_MC_BANK_NUM ) + { + *val = v->arch.vmce.bank[bank].mci_ctl2; +diff --git a/xen/arch/x86/cpu/mcheck/vmce.c b/xen/arch/x86/cpu/mcheck/vmce.c +index eb6434a3ba20..0899df58bcbf 100644 +--- a/xen/arch/x86/cpu/mcheck/vmce.c ++++ b/xen/arch/x86/cpu/mcheck/vmce.c +@@ -150,6 +150,8 @@ static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val) + default: + switch ( boot_cpu_data.x86_vendor ) + { ++ case X86_VENDOR_CENTAUR: ++ case X86_VENDOR_SHANGHAI: + case X86_VENDOR_INTEL: + ret = vmce_intel_rdmsr(v, msr, val); + break; +diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c +index aaedb2c31287..da305c7aa4c9 100644 +--- a/xen/arch/x86/msr.c ++++ b/xen/arch/x86/msr.c +@@ -282,6 +282,8 @@ int guest_rdmsr(struct vcpu *v, uint32_t msr, uint64_t *val) + *val = msrs->misc_features_enables.raw; + break; + ++ case MSR_P5_MC_ADDR: ++ case MSR_P5_MC_TYPE: + case MSR_IA32_MCG_CAP ... MSR_IA32_MCG_CTL: /* 0x179 -> 0x17b */ + case MSR_IA32_MCx_CTL2(0) ... MSR_IA32_MCx_CTL2(31): /* 0x280 -> 0x29f */ + case MSR_IA32_MCx_CTL(0) ... MSR_IA32_MCx_MISC(31): /* 0x400 -> 0x47f */ +diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h +index 3e038db618ff..31964b88af7a 100644 +--- a/xen/include/asm-x86/msr-index.h ++++ b/xen/include/asm-x86/msr-index.h +@@ -15,6 +15,9 @@ + * abbreviated name. Exceptions will be considered on a case-by-case basis. 
+ */ + ++#define MSR_P5_MC_ADDR 0 ++#define MSR_P5_MC_TYPE 0x00000001 ++ + #define MSR_APIC_BASE 0x0000001b + #define APIC_BASE_BSP (_AC(1, ULL) << 8) + #define APIC_BASE_EXTD (_AC(1, ULL) << 10) +-- +2.35.1 + diff --git a/0015-kconfig-detect-LD-implementation.patch b/0015-kconfig-detect-LD-implementation.patch new file mode 100644 index 0000000..f2fc24a --- /dev/null +++ b/0015-kconfig-detect-LD-implementation.patch @@ -0,0 +1,46 @@ +From 3754bd128d1a6b3d5864d1a3ee5d27b67d35387a Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Date: Tue, 7 Jun 2022 14:05:06 +0200 +Subject: [PATCH 15/32] kconfig: detect LD implementation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Detect GNU and LLVM ld implementations. This is required for further +patches that will introduce diverging behaviour depending on the +linker implementation in use. + +Note that LLVM ld returns "compatible with GNU linkers" as part of the +version string, so be on the safe side and use '^' to only match at +the start of the line in case LLVM ever decides to change the text to +use "compatible with GNU ld" instead. + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Michal Orzel <michal.orzel@arm.com> +Acked-by: Julien Grall <jgrall@amazon.com> +master commit: c70c4b624f85f7d4e28c70a804a0a3f20d73092b +master date: 2022-05-02 08:50:39 +0200 +--- + xen/Kconfig | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/xen/Kconfig b/xen/Kconfig +index bcbd2758e5d3..0c89afd50fcf 100644 +--- a/xen/Kconfig ++++ b/xen/Kconfig +@@ -23,6 +23,12 @@ config CLANG_VERSION + int + default $(shell,$(BASEDIR)/scripts/clang-version.sh $(CC)) + ++config LD_IS_GNU ++ def_bool $(success,$(LD) --version | head -n 1 | grep -q "^GNU ld") ++ ++config LD_IS_LLVM ++ def_bool $(success,$(LD) --version | head -n 1 | grep -q "^LLD") ++ + # -fvisibility=hidden reduces -fpic cost, if it's available + config CC_HAS_VISIBILITY_ATTRIBUTE + def_bool $(cc-option,-fvisibility=hidden) +-- +2.35.1 + diff --git a/0016-linker-lld-do-not-generate-quoted-section-names.patch b/0016-linker-lld-do-not-generate-quoted-section-names.patch new file mode 100644 index 0000000..a42083e --- /dev/null +++ b/0016-linker-lld-do-not-generate-quoted-section-names.patch @@ -0,0 +1,54 @@ +From 88b653f73928117461dc250acd1e830a47a14c2b Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Date: Tue, 7 Jun 2022 14:05:24 +0200 +Subject: [PATCH 16/32] linker/lld: do not generate quoted section names +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +LLVM LD doesn't strip the quotes from the section names, and so the +resulting binary ends up with section names like: + + [ 1] ".text" PROGBITS ffff82d040200000 00008000 + 000000000018cbc1 0000000000000000 AX 0 0 4096 + +This confuses some tools (like gdb) and prevents proper parsing of the +binary. + +The issue has already been reported and is being fixed in LLD. In +order to workaround this issue and keep the GNU ld support define +different DECL_SECTION macros depending on the used ld +implementation. + +Drop the quotes from the definitions of the debug sections in +DECL_DEBUG{2}, as those quotes are not required for GNU ld either. 
+ +Fixes: 6254920587c3 ('x86: quote section names when defining them in linker script') +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: 702c9a800eb3ecd4b8595998d37a769d470c5bb0 +master date: 2022-05-02 08:51:45 +0200 +--- + xen/arch/x86/xen.lds.S | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S +index 4c58f3209c3d..bc9b9651b192 100644 +--- a/xen/arch/x86/xen.lds.S ++++ b/xen/arch/x86/xen.lds.S +@@ -18,7 +18,11 @@ ENTRY(efi_start) + #else /* !EFI */ + + #define FORMAT "elf64-x86-64" +-#define DECL_SECTION(x) #x : AT(ADDR(#x) - __XEN_VIRT_START) ++#ifdef CONFIG_LD_IS_GNU ++# define DECL_SECTION(x) x : AT(ADDR(#x) - __XEN_VIRT_START) ++#else ++# define DECL_SECTION(x) x : AT(ADDR(x) - __XEN_VIRT_START) ++#endif + + ENTRY(start_pa) + +-- +2.35.1 + diff --git a/0017-xen-io-Fix-race-between-sending-an-I-O-and-domain-sh.patch b/0017-xen-io-Fix-race-between-sending-an-I-O-and-domain-sh.patch new file mode 100644 index 0000000..d226e97 --- /dev/null +++ b/0017-xen-io-Fix-race-between-sending-an-I-O-and-domain-sh.patch @@ -0,0 +1,142 @@ +From 982a314bd3000a16c3128afadb36a8ff41029adc Mon Sep 17 00:00:00 2001 +From: Julien Grall <jgrall@amazon.com> +Date: Tue, 7 Jun 2022 14:06:11 +0200 +Subject: [PATCH 17/32] xen: io: Fix race between sending an I/O and domain + shutdown + +Xen provides hypercalls to shutdown (SCHEDOP_shutdown{,_code}) and +resume a domain (XEN_DOMCTL_resumedomain). They can be used for checkpointing, +where the expectation is the domain should continue as if nothing happened +afterwards. + +hvmemul_do_io() and handle_pio() will act differently if the return +code of hvm_send_ioreq() (resp. hvmemul_do_pio_buffer()) is X86EMUL_RETRY. + +In this case, the I/O state will be reset to STATE_IOREQ_NONE (i.e +no I/O is pending) and/or the PC will not be advanced. + +If the shutdown request happens right after the I/O was sent to the +IOREQ, then emulation code will end up re-executing the instruction +and therefore forwarding the same I/O again (at least when reading an IO port). + +This would be a problem if the access has a side-effect. A dumb example +is a device implementing a counter which is incremented by one for every +access. When running shutdown/resume in a loop, the value read by the +OS may not be the old value + 1. + +Add an extra boolean in the structure hvm_vcpu_io to indicate whether +the I/O was suspended. This is then used in place of checking the domain +is shutting down in hvmemul_do_io() and handle_pio() as they should +act on suspend (i.e. vcpu_start_shutdown_deferral() returns false) rather +than shutdown. 
+ +Signed-off-by: Julien Grall <jgrall@amazon.com> +Reviewed-by: Paul Durrant <paul@xen.org> +master commit: b7e0d8978810b534725e94a321736496928f00a5 +master date: 2022-05-06 17:16:22 +0100 +--- + xen/arch/arm/ioreq.c | 3 ++- + xen/arch/x86/hvm/emulate.c | 3 ++- + xen/arch/x86/hvm/io.c | 7 ++++--- + xen/common/ioreq.c | 4 ++++ + xen/include/xen/sched.h | 5 +++++ + 5 files changed, 17 insertions(+), 5 deletions(-) + +diff --git a/xen/arch/arm/ioreq.c b/xen/arch/arm/ioreq.c +index 308650b40051..fbccef212bf1 100644 +--- a/xen/arch/arm/ioreq.c ++++ b/xen/arch/arm/ioreq.c +@@ -80,9 +80,10 @@ enum io_state try_fwd_ioserv(struct cpu_user_regs *regs, + return IO_ABORT; + + vio->req = p; ++ vio->suspended = false; + + rc = ioreq_send(s, &p, 0); +- if ( rc != IO_RETRY || v->domain->is_shutting_down ) ++ if ( rc != IO_RETRY || vio->suspended ) + vio->req.state = STATE_IOREQ_NONE; + else if ( !ioreq_needs_completion(&vio->req) ) + rc = IO_HANDLED; +diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c +index 76a2ccfafe23..7da348b5d486 100644 +--- a/xen/arch/x86/hvm/emulate.c ++++ b/xen/arch/x86/hvm/emulate.c +@@ -239,6 +239,7 @@ static int hvmemul_do_io( + ASSERT(p.count); + + vio->req = p; ++ vio->suspended = false; + + rc = hvm_io_intercept(&p); + +@@ -334,7 +335,7 @@ static int hvmemul_do_io( + else + { + rc = ioreq_send(s, &p, 0); +- if ( rc != X86EMUL_RETRY || currd->is_shutting_down ) ++ if ( rc != X86EMUL_RETRY || vio->suspended ) + vio->req.state = STATE_IOREQ_NONE; + else if ( !ioreq_needs_completion(&vio->req) ) + rc = X86EMUL_OKAY; +diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c +index 93f1d1503fa6..80915f27e488 100644 +--- a/xen/arch/x86/hvm/io.c ++++ b/xen/arch/x86/hvm/io.c +@@ -138,10 +138,11 @@ bool handle_pio(uint16_t port, unsigned int size, int dir) + + case X86EMUL_RETRY: + /* +- * We should not advance RIP/EIP if the domain is shutting down or +- * if X86EMUL_RETRY has been returned by an internal handler. ++ * We should not advance RIP/EIP if the vio was suspended (e.g. ++ * because the domain is shutting down) or if X86EMUL_RETRY has ++ * been returned by an internal handler. + */ +- if ( curr->domain->is_shutting_down || !vcpu_ioreq_pending(curr) ) ++ if ( vio->suspended || !vcpu_ioreq_pending(curr) ) + return false; + break; + +diff --git a/xen/common/ioreq.c b/xen/common/ioreq.c +index d732dc045df9..42414b750bef 100644 +--- a/xen/common/ioreq.c ++++ b/xen/common/ioreq.c +@@ -1256,6 +1256,7 @@ int ioreq_send(struct ioreq_server *s, ioreq_t *proto_p, + struct vcpu *curr = current; + struct domain *d = curr->domain; + struct ioreq_vcpu *sv; ++ struct vcpu_io *vio = &curr->io; + + ASSERT(s); + +@@ -1263,7 +1264,10 @@ int ioreq_send(struct ioreq_server *s, ioreq_t *proto_p, + return ioreq_send_buffered(s, proto_p); + + if ( unlikely(!vcpu_start_shutdown_deferral(curr)) ) ++ { ++ vio->suspended = true; + return IOREQ_STATUS_RETRY; ++ } + + list_for_each_entry ( sv, + &s->ioreq_vcpu_list, +diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h +index 28146ee404e6..9671062360ac 100644 +--- a/xen/include/xen/sched.h ++++ b/xen/include/xen/sched.h +@@ -159,6 +159,11 @@ enum vio_completion { + struct vcpu_io { + /* I/O request in flight to device model. */ + enum vio_completion completion; ++ /* ++ * Indicate whether the I/O was not handled because the domain ++ * is about to be paused. 
++ */ ++ bool suspended; + ioreq_t req; + }; + +-- +2.35.1 + diff --git a/0018-build-suppress-GNU-ld-warning-about-RWX-load-segment.patch b/0018-build-suppress-GNU-ld-warning-about-RWX-load-segment.patch new file mode 100644 index 0000000..87a0873 --- /dev/null +++ b/0018-build-suppress-GNU-ld-warning-about-RWX-load-segment.patch @@ -0,0 +1,35 @@ +From 4890031d224262a6cf43d3bef1af4a16c13db306 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 7 Jun 2022 14:06:51 +0200 +Subject: [PATCH 18/32] build: suppress GNU ld warning about RWX load segments + +We cannot really avoid such and we're also not really at risk because of +them, as we control page table permissions ourselves rather than relying +on a loader of some sort. Present GNU ld master started warning about +such, and hence 2.39 is anticipated to have this warning. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +Acked-by: Julien Grall <jgrall@amazon.com> +master commit: 68f5aac012b9ae36ce9b65d9ca9cc9f232191ad3 +master date: 2022-05-18 11:17:19 +0200 +--- + xen/Makefile | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/xen/Makefile b/xen/Makefile +index ce4eca3ee4d7..4d9abe704628 100644 +--- a/xen/Makefile ++++ b/xen/Makefile +@@ -260,6 +260,8 @@ endif + + AFLAGS += -D__ASSEMBLY__ + ++LDFLAGS-$(call ld-option,--warn-rwx-segments) += --no-warn-rwx-segments ++ + CFLAGS += $(CFLAGS-y) + # allow extra CFLAGS externally via EXTRA_CFLAGS_XEN_CORE + CFLAGS += $(EXTRA_CFLAGS_XEN_CORE) +-- +2.35.1 + diff --git a/0019-build-silence-GNU-ld-warning-about-executable-stacks.patch b/0019-build-silence-GNU-ld-warning-about-executable-stacks.patch new file mode 100644 index 0000000..75e9f7e --- /dev/null +++ b/0019-build-silence-GNU-ld-warning-about-executable-stacks.patch @@ -0,0 +1,35 @@ +From 1bc669a568a9f4bdab9e9ddb95823ba370dc0baf Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 7 Jun 2022 14:07:11 +0200 +Subject: [PATCH 19/32] build: silence GNU ld warning about executable stacks + +While for C files the compiler is supposed to arrange for emitting +respective information, for assembly sources we're responsible ourselves. +Present GNU ld master started warning about such, and hence 2.39 is +anticipated to have this warning. 
+ +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +Acked-by: Julien Grall <jgrall@amazon.com> +master commit: 62d22296a95d259c934ca2f39ac511d729cfbb68 +master date: 2022-05-18 11:18:45 +0200 +--- + xen/Makefile | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/xen/Makefile b/xen/Makefile +index 4d9abe704628..971028eda240 100644 +--- a/xen/Makefile ++++ b/xen/Makefile +@@ -260,6 +260,8 @@ endif + + AFLAGS += -D__ASSEMBLY__ + ++$(call cc-option-add,AFLAGS,CC,-Wa$(comma)--noexecstack) ++ + LDFLAGS-$(call ld-option,--warn-rwx-segments) += --no-warn-rwx-segments + + CFLAGS += $(CFLAGS-y) +-- +2.35.1 + diff --git a/0020-ns16550-use-poll-mode-if-INTERRUPT_LINE-is-0xff.patch b/0020-ns16550-use-poll-mode-if-INTERRUPT_LINE-is-0xff.patch new file mode 100644 index 0000000..b83be9a --- /dev/null +++ b/0020-ns16550-use-poll-mode-if-INTERRUPT_LINE-is-0xff.patch @@ -0,0 +1,50 @@ +From f1be0b62a03b90a40a03e21f965e4cbb89809bb1 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= + <marmarek@invisiblethingslab.com> +Date: Tue, 7 Jun 2022 14:07:34 +0200 +Subject: [PATCH 20/32] ns16550: use poll mode if INTERRUPT_LINE is 0xff +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Intel LPSS has INTERRUPT_LINE set to 0xff by default, that is declared +by the PCI Local Bus Specification Revision 3.0 (from 2004) as +"unknown"/"no connection". Fallback to poll mode in this case. +The 0xff handling is x86-specific, the surrounding code is guarded with +CONFIG_X86 anyway. + +Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: 6a2ea1a2370a0c8a0210accac0ae62e68c185134 +master date: 2022-05-20 12:19:45 +0200 +--- + xen/drivers/char/ns16550.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/xen/drivers/char/ns16550.c b/xen/drivers/char/ns16550.c +index 30596d60d4ed..2d2bd2a02469 100644 +--- a/xen/drivers/char/ns16550.c ++++ b/xen/drivers/char/ns16550.c +@@ -1221,6 +1221,19 @@ pci_uart_config(struct ns16550 *uart, bool_t skip_amt, unsigned int idx) + pci_conf_read8(PCI_SBDF(0, b, d, f), + PCI_INTERRUPT_LINE) : 0; + ++#ifdef CONFIG_X86 ++ /* ++ * PCI Local Bus Specification Revision 3.0 defines 0xff value ++ * as special only for X86. ++ */ ++ if ( uart->irq == 0xff ) ++ uart->irq = 0; ++#endif ++ if ( !uart->irq ) ++ printk(XENLOG_INFO ++ "ns16550: %pp: no legacy IRQ, using poll mode\n", ++ &PCI_SBDF(0, b, d, f)); ++ + return 0; + } + } +-- +2.35.1 + diff --git a/0021-PCI-don-t-allow-pci-phantom-to-mark-real-devices-as-.patch b/0021-PCI-don-t-allow-pci-phantom-to-mark-real-devices-as-.patch new file mode 100644 index 0000000..1264578 --- /dev/null +++ b/0021-PCI-don-t-allow-pci-phantom-to-mark-real-devices-as-.patch @@ -0,0 +1,56 @@ +From 8e11ec8fbf6f933f8854f4bc54226653316903f2 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 7 Jun 2022 14:08:06 +0200 +Subject: [PATCH 21/32] PCI: don't allow "pci-phantom=" to mark real devices as + phantom functions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +IOMMU code mapping / unmapping devices and interrupts will misbehave if +a wrong command line option declared a function "phantom" when there's a +real device at that position. Warn about this and adjust the specified +stride (in the worst case ignoring the option altogether). 
+ +Requested-by: Andrew Cooper <andrew.cooper3@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: 444b555dc9e09fa3ce90f066e0c88dec9b47f422 +master date: 2022-05-20 12:20:35 +0200 +--- + xen/drivers/passthrough/pci.c | 19 ++++++++++++++++++- + 1 file changed, 18 insertions(+), 1 deletion(-) + +diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c +index 395958698e6a..e0491c908f10 100644 +--- a/xen/drivers/passthrough/pci.c ++++ b/xen/drivers/passthrough/pci.c +@@ -382,7 +382,24 @@ static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn) + phantom_devs[i].slot == PCI_SLOT(devfn) && + phantom_devs[i].stride > PCI_FUNC(devfn) ) + { +- pdev->phantom_stride = phantom_devs[i].stride; ++ pci_sbdf_t sbdf = pdev->sbdf; ++ unsigned int stride = phantom_devs[i].stride; ++ ++ while ( (sbdf.fn += stride) > PCI_FUNC(devfn) ) ++ { ++ if ( pci_conf_read16(sbdf, PCI_VENDOR_ID) == 0xffff && ++ pci_conf_read16(sbdf, PCI_DEVICE_ID) == 0xffff ) ++ continue; ++ stride <<= 1; ++ printk(XENLOG_WARNING ++ "%pp looks to be a real device; bumping %04x:%02x:%02x stride to %u\n", ++ &sbdf, phantom_devs[i].seg, ++ phantom_devs[i].bus, phantom_devs[i].slot, ++ stride); ++ sbdf = pdev->sbdf; ++ } ++ if ( PCI_FUNC(stride) ) ++ pdev->phantom_stride = stride; + break; + } + } +-- +2.35.1 + diff --git a/0022-x86-pv-Clean-up-_get_page_type.patch b/0022-x86-pv-Clean-up-_get_page_type.patch new file mode 100644 index 0000000..a6008b0 --- /dev/null +++ b/0022-x86-pv-Clean-up-_get_page_type.patch @@ -0,0 +1,180 @@ +From b152dfbc3ad71a788996440b18174d995c3bffc9 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Thu, 9 Jun 2022 15:27:19 +0200 +Subject: [PATCH 22/32] x86/pv: Clean up _get_page_type() + +Various fixes for clarity, ahead of making complicated changes. + + * Split the overflow check out of the if/else chain for type handling, as + it's somewhat unrelated. + * Comment the main if/else chain to explain what is going on. Adjust one + ASSERT() and state the bit layout for validate-locked and partial states. + * Correct the comment about TLB flushing, as it's backwards. The problem + case is when writeable mappings are retained to a page becoming read-only, + as it allows the guest to bypass Xen's safety checks for updates. + * Reduce the scope of 'y'. It is an artefact of the cmpxchg loop and not + valid for use by subsequent logic. Switch to using ACCESS_ONCE() to treat + all reads as explicitly volatile. The only thing preventing the validated + wait-loop being infinite is the compiler barrier hidden in cpu_relax(). + * Replace one page_get_owner(page) with the already-calculated 'd' already in + scope. + +No functional change. + +This is part of XSA-401 / CVE-2022-26362. 
+ +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: George Dunlap <george.dunlap@citrix.com> +master commit: 9186e96b199e4f7e52e033b238f9fe869afb69c7 +master date: 2022-06-09 14:20:36 +0200 +--- + xen/arch/x86/mm.c | 72 +++++++++++++++++++++++++++++++++++++++-------- + 1 file changed, 61 insertions(+), 11 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 4ee2de11051d..79ad7fdd2b82 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -2906,16 +2906,17 @@ static int _put_page_type(struct page_info *page, unsigned int flags, + static int _get_page_type(struct page_info *page, unsigned long type, + bool preemptible) + { +- unsigned long nx, x, y = page->u.inuse.type_info; ++ unsigned long nx, x; + int rc = 0; + + ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2))); + ASSERT(!in_irq()); + +- for ( ; ; ) ++ for ( unsigned long y = ACCESS_ONCE(page->u.inuse.type_info); ; ) + { + x = y; + nx = x + 1; ++ + if ( unlikely((nx & PGT_count_mask) == 0) ) + { + gdprintk(XENLOG_WARNING, +@@ -2923,8 +2924,15 @@ static int _get_page_type(struct page_info *page, unsigned long type, + mfn_x(page_to_mfn(page))); + return -EINVAL; + } +- else if ( unlikely((x & PGT_count_mask) == 0) ) ++ ++ if ( unlikely((x & PGT_count_mask) == 0) ) + { ++ /* ++ * Typeref 0 -> 1. ++ * ++ * Type changes are permitted when the typeref is 0. If the type ++ * actually changes, the page needs re-validating. ++ */ + struct domain *d = page_get_owner(page); + + if ( d && shadow_mode_enabled(d) ) +@@ -2935,8 +2943,8 @@ static int _get_page_type(struct page_info *page, unsigned long type, + { + /* + * On type change we check to flush stale TLB entries. It is +- * vital that no other CPUs are left with mappings of a frame +- * which is about to become writeable to the guest. ++ * vital that no other CPUs are left with writeable mappings ++ * to a frame which is intending to become pgtable/segdesc. + */ + cpumask_t *mask = this_cpu(scratch_cpumask); + +@@ -2948,7 +2956,7 @@ static int _get_page_type(struct page_info *page, unsigned long type, + + if ( unlikely(!cpumask_empty(mask)) && + /* Shadow mode: track only writable pages. */ +- (!shadow_mode_enabled(page_get_owner(page)) || ++ (!shadow_mode_enabled(d) || + ((nx & PGT_type_mask) == PGT_writable_page)) ) + { + perfc_incr(need_flush_tlb_flush); +@@ -2979,7 +2987,14 @@ static int _get_page_type(struct page_info *page, unsigned long type, + } + else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) ) + { +- /* Don't log failure if it could be a recursive-mapping attempt. */ ++ /* ++ * else, we're trying to take a new reference, of the wrong type. ++ * ++ * This (being able to prohibit use of the wrong type) is what the ++ * typeref system exists for, but skip printing the failure if it ++ * looks like a recursive mapping, as subsequent logic might ++ * ultimately permit the attempt. ++ */ + if ( ((x & PGT_type_mask) == PGT_l2_page_table) && + (type == PGT_l1_page_table) ) + return -EINVAL; +@@ -2998,18 +3013,46 @@ static int _get_page_type(struct page_info *page, unsigned long type, + } + else if ( unlikely(!(x & PGT_validated)) ) + { ++ /* ++ * else, the count is non-zero, and we're grabbing the right type; ++ * but the page hasn't been validated yet. ++ * ++ * The page is in one of two states (depending on PGT_partial), ++ * and should have exactly one reference. 
++             */
++            ASSERT((x & (PGT_type_mask | PGT_count_mask)) == (type | 1));
++
+             if ( !(x & PGT_partial) )
+             {
+-                /* Someone else is updating validation of this page. Wait... */
++                /*
++                 * The page has been left in the "validate locked" state
++                 * (i.e. PGT_[type] | 1) which means that a concurrent caller
++                 * of _get_page_type() is in the middle of validation.
++                 *
++                 * Spin waiting for the concurrent user to complete (partial
++                 * or fully validated), then restart our attempt to acquire a
++                 * type reference.
++                 */
+                 do {
+                     if ( preemptible && hypercall_preempt_check() )
+                         return -EINTR;
+                     cpu_relax();
+-                } while ( (y = page->u.inuse.type_info) == x );
++                } while ( (y = ACCESS_ONCE(page->u.inuse.type_info)) == x );
+                 continue;
+             }
+-            /* Type ref count was left at 1 when PGT_partial got set. */
+-            ASSERT((x & PGT_count_mask) == 1);
++
++            /*
++             * The page has been left in the "partial" state
++             * (i.e., PGT_[type] | PGT_partial | 1).
++             *
++             * Rather than bumping the type count, we need to try to grab the
++             * validation lock; if we succeed, we need to validate the page,
++             * then drop the general ref associated with the PGT_partial bit.
++             *
++             * We grab the validation lock by setting nx to (PGT_[type] | 1)
++             * (i.e., non-zero type count, neither PGT_validated nor
++             * PGT_partial set).
++             */
+             nx = x & ~PGT_partial;
+         }
+ 
+@@ -3058,6 +3101,13 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+     }
+ 
+  out:
++    /*
++     * Did we drop the PGT_partial bit when acquiring the typeref?  If so,
++     * drop the general reference that went along with it.
++     *
++     * N.B. validate_page() may have have re-set PGT_partial, not reflected in
++     * nx, but will have taken an extra ref when doing so.
++     */
+     if ( (x & PGT_partial) && !(nx & PGT_partial) )
+         put_page(page);
+ 
+--
+2.35.1
+
diff --git a/0023-x86-pv-Fix-ABAC-cmpxchg-race-in-_get_page_type.patch b/0023-x86-pv-Fix-ABAC-cmpxchg-race-in-_get_page_type.patch
new file mode 100644
index 0000000..2f4b734
--- /dev/null
+++ b/0023-x86-pv-Fix-ABAC-cmpxchg-race-in-_get_page_type.patch
@@ -0,0 +1,201 @@
+From 8dab3f79b122e69cbcdebca72cdc14f004ee2193 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Thu, 9 Jun 2022 15:27:37 +0200
+Subject: [PATCH 23/32] x86/pv: Fix ABAC cmpxchg() race in _get_page_type()
+
+_get_page_type() suffers from a race condition where it incorrectly assumes
+that because 'x' was read and a subsequent cmpxchg() succeeds, the type
+cannot have changed in-between.  Consider:
+
+CPU A:
+  1. Creates an L2e referencing pg
+     `-> _get_page_type(pg, PGT_l1_page_table), sees count 0, type PGT_writable_page
+  2. Issues flush_tlb_mask()
+CPU B:
+  3. Creates a writeable mapping of pg
+     `-> _get_page_type(pg, PGT_writable_page), count increases to 1
+  4. Writes into new mapping, creating a TLB entry for pg
+  5. Removes the writeable mapping of pg
+     `-> _put_page_type(pg), count goes back down to 0
+CPU A:
+  6. Issues cmpxchg(), setting count 1, type PGT_l1_page_table
+
+CPU B now has a writeable mapping to pg, which Xen believes is a pagetable and
+suitably protected (i.e. read-only).  The TLB flush in step 2 must be deferred
+until after the guest is prohibited from creating new writeable mappings,
+which is after step 6.
+
+Defer all safety actions until after the cmpxchg() has successfully taken the
+intended typeref, because that is what prevents concurrent users from using
+the old type.
+
+Also remove the early validation for writeable and shared pages.
This removes +race conditions where one half of a parallel mapping attempt can return +successfully before: + * The IOMMU pagetables are in sync with the new page type + * Writeable mappings to shared pages have been torn down + +This is part of XSA-401 / CVE-2022-26362. + +Reported-by: Jann Horn <jannh@google.com> +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: George Dunlap <george.dunlap@citrix.com> +master commit: 8cc5036bc385112a82f1faff27a0970e6440dfed +master date: 2022-06-09 14:21:04 +0200 +--- + xen/arch/x86/mm.c | 116 ++++++++++++++++++++++++++-------------------- + 1 file changed, 67 insertions(+), 49 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 79ad7fdd2b82..c6429b0f749a 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -2933,56 +2933,12 @@ static int _get_page_type(struct page_info *page, unsigned long type, + * Type changes are permitted when the typeref is 0. If the type + * actually changes, the page needs re-validating. + */ +- struct domain *d = page_get_owner(page); +- +- if ( d && shadow_mode_enabled(d) ) +- shadow_prepare_page_type_change(d, page, type); + + ASSERT(!(x & PGT_pae_xen_l2)); + if ( (x & PGT_type_mask) != type ) + { +- /* +- * On type change we check to flush stale TLB entries. It is +- * vital that no other CPUs are left with writeable mappings +- * to a frame which is intending to become pgtable/segdesc. +- */ +- cpumask_t *mask = this_cpu(scratch_cpumask); +- +- BUG_ON(in_irq()); +- cpumask_copy(mask, d->dirty_cpumask); +- +- /* Don't flush if the timestamp is old enough */ +- tlbflush_filter(mask, page->tlbflush_timestamp); +- +- if ( unlikely(!cpumask_empty(mask)) && +- /* Shadow mode: track only writable pages. */ +- (!shadow_mode_enabled(d) || +- ((nx & PGT_type_mask) == PGT_writable_page)) ) +- { +- perfc_incr(need_flush_tlb_flush); +- /* +- * If page was a page table make sure the flush is +- * performed using an IPI in order to avoid changing the +- * type of a page table page under the feet of +- * spurious_page_fault(). +- */ +- flush_mask(mask, +- (x & PGT_type_mask) && +- (x & PGT_type_mask) <= PGT_root_page_table +- ? FLUSH_TLB | FLUSH_FORCE_IPI +- : FLUSH_TLB); +- } +- +- /* We lose existing type and validity. */ + nx &= ~(PGT_type_mask | PGT_validated); + nx |= type; +- +- /* +- * No special validation needed for writable pages. +- * Page tables and GDT/LDT need to be scanned for validity. +- */ +- if ( type == PGT_writable_page || type == PGT_shared_page ) +- nx |= PGT_validated; + } + } + else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) ) +@@ -3063,6 +3019,56 @@ static int _get_page_type(struct page_info *page, unsigned long type, + return -EINTR; + } + ++ /* ++ * One typeref has been taken and is now globally visible. ++ * ++ * The page is either in the "validate locked" state (PGT_[type] | 1) or ++ * fully validated (PGT_[type] | PGT_validated | >0). ++ */ ++ ++ if ( unlikely((x & PGT_count_mask) == 0) ) ++ { ++ struct domain *d = page_get_owner(page); ++ ++ if ( d && shadow_mode_enabled(d) ) ++ shadow_prepare_page_type_change(d, page, type); ++ ++ if ( (x & PGT_type_mask) != type ) ++ { ++ /* ++ * On type change we check to flush stale TLB entries. It is ++ * vital that no other CPUs are left with writeable mappings ++ * to a frame which is intending to become pgtable/segdesc. 
++ */ ++ cpumask_t *mask = this_cpu(scratch_cpumask); ++ ++ BUG_ON(in_irq()); ++ cpumask_copy(mask, d->dirty_cpumask); ++ ++ /* Don't flush if the timestamp is old enough */ ++ tlbflush_filter(mask, page->tlbflush_timestamp); ++ ++ if ( unlikely(!cpumask_empty(mask)) && ++ /* Shadow mode: track only writable pages. */ ++ (!shadow_mode_enabled(d) || ++ ((nx & PGT_type_mask) == PGT_writable_page)) ) ++ { ++ perfc_incr(need_flush_tlb_flush); ++ /* ++ * If page was a page table make sure the flush is ++ * performed using an IPI in order to avoid changing the ++ * type of a page table page under the feet of ++ * spurious_page_fault(). ++ */ ++ flush_mask(mask, ++ (x & PGT_type_mask) && ++ (x & PGT_type_mask) <= PGT_root_page_table ++ ? FLUSH_TLB | FLUSH_FORCE_IPI ++ : FLUSH_TLB); ++ } ++ } ++ } ++ + if ( unlikely(((x & PGT_type_mask) == PGT_writable_page) != + (type == PGT_writable_page)) ) + { +@@ -3091,13 +3097,25 @@ static int _get_page_type(struct page_info *page, unsigned long type, + + if ( unlikely(!(nx & PGT_validated)) ) + { +- if ( !(x & PGT_partial) ) ++ /* ++ * No special validation needed for writable or shared pages. Page ++ * tables and GDT/LDT need to have their contents audited. ++ * ++ * per validate_page(), non-atomic updates are fine here. ++ */ ++ if ( type == PGT_writable_page || type == PGT_shared_page ) ++ page->u.inuse.type_info |= PGT_validated; ++ else + { +- page->nr_validated_ptes = 0; +- page->partial_flags = 0; +- page->linear_pt_count = 0; ++ if ( !(x & PGT_partial) ) ++ { ++ page->nr_validated_ptes = 0; ++ page->partial_flags = 0; ++ page->linear_pt_count = 0; ++ } ++ ++ rc = validate_page(page, type, preemptible); + } +- rc = validate_page(page, type, preemptible); + } + + out: +-- +2.35.1 + diff --git a/0024-x86-page-Introduce-_PAGE_-constants-for-memory-types.patch b/0024-x86-page-Introduce-_PAGE_-constants-for-memory-types.patch new file mode 100644 index 0000000..c8c2dda --- /dev/null +++ b/0024-x86-page-Introduce-_PAGE_-constants-for-memory-types.patch @@ -0,0 +1,53 @@ +From 9cfd796ae05421ded8e4f70b2c55352491cfa841 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Thu, 9 Jun 2022 15:27:53 +0200 +Subject: [PATCH 24/32] x86/page: Introduce _PAGE_* constants for memory types + +... rather than opencoding the PAT/PCD/PWT attributes in __PAGE_HYPERVISOR_* +constants. These are going to be needed by forthcoming logic. + +No functional change. + +This is part of XSA-402. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: 1be8707c75bf4ba68447c74e1618b521dd432499 +master date: 2022-06-09 14:21:38 +0200 +--- + xen/include/asm-x86/page.h | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h +index 1d080cffbe84..2e542050f65a 100644 +--- a/xen/include/asm-x86/page.h ++++ b/xen/include/asm-x86/page.h +@@ -331,6 +331,14 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t); + + #define PAGE_CACHE_ATTRS (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) + ++/* Memory types, encoded under Xen's choice of MSR_PAT. */ ++#define _PAGE_WB ( 0) ++#define _PAGE_WT ( _PAGE_PWT) ++#define _PAGE_UCM ( _PAGE_PCD ) ++#define _PAGE_UC ( _PAGE_PCD | _PAGE_PWT) ++#define _PAGE_WC (_PAGE_PAT ) ++#define _PAGE_WP (_PAGE_PAT | _PAGE_PWT) ++ + /* + * Debug option: Ensure that granted mappings are not implicitly unmapped. 
+ * WARNING: This will need to be disabled to run OSes that use the spare PTE +@@ -349,8 +357,8 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t); + #define __PAGE_HYPERVISOR_RX (_PAGE_PRESENT | _PAGE_ACCESSED) + #define __PAGE_HYPERVISOR (__PAGE_HYPERVISOR_RX | \ + _PAGE_DIRTY | _PAGE_RW) +-#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_PCD) +-#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_PCD | _PAGE_PWT) ++#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_UCM) ++#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_UC) + #define __PAGE_HYPERVISOR_SHSTK (__PAGE_HYPERVISOR_RO | _PAGE_DIRTY) + + #define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */ +-- +2.35.1 + diff --git a/0025-x86-Don-t-change-the-cacheability-of-the-directmap.patch b/0025-x86-Don-t-change-the-cacheability-of-the-directmap.patch new file mode 100644 index 0000000..582fc74 --- /dev/null +++ b/0025-x86-Don-t-change-the-cacheability-of-the-directmap.patch @@ -0,0 +1,223 @@ +From 74193f4292d9cfc2874866e941d9939d8f33fcef Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Thu, 9 Jun 2022 15:28:23 +0200 +Subject: [PATCH 25/32] x86: Don't change the cacheability of the directmap + +Changeset 55f97f49b7ce ("x86: Change cache attributes of Xen 1:1 page mappings +in response to guest mapping requests") attempted to keep the cacheability +consistent between different mappings of the same page. + +The reason wasn't described in the changelog, but it is understood to be in +regards to a concern over machine check exceptions, owing to errata when using +mixed cacheabilities. It did this primarily by updating Xen's mapping of the +page in the direct map when the guest mapped a page with reduced cacheability. + +Unfortunately, the logic didn't actually prevent mixed cacheability from +occurring: + * A guest could map a page normally, and then map the same page with + different cacheability; nothing prevented this. + * The cacheability of the directmap was always latest-takes-precedence in + terms of guest requests. + * Grant-mapped frames with lesser cacheability didn't adjust the page's + cacheattr settings. + * The map_domain_page() function still unconditionally created WB mappings, + irrespective of the page's cacheattr settings. + +Additionally, update_xen_mappings() had a bug where the alias calculation was +wrong for mfn's which were .init content, which should have been treated as +fully guest pages, not Xen pages. + +Worse yet, the logic introduced a vulnerability whereby necessary +pagetable/segdesc adjustments made by Xen in the validation logic could become +non-coherent between the cache and main memory. The CPU could subsequently +operate on the stale value in the cache, rather than the safe value in main +memory. + +The directmap contains primarily mappings of RAM. PAT/MTRR conflict +resolution is asymmetric, and generally for MTRR=WB ranges, PAT of lesser +cacheability resolves to being coherent. The special case is WC mappings, +which are non-coherent against MTRR=WB regions (except for fully-coherent +CPUs). + +Xen must not have any WC cacheability in the directmap, to prevent Xen's +actions from creating non-coherency. (Guest actions creating non-coherency is +dealt with in subsequent patches.) As all memory types for MTRR=WB ranges +inter-operate coherently, so leave Xen's directmap mappings as WB. 
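
As a rough illustration of how the _PAGE_* constants introduced in the
previous patch get used, a memory-type test on PTE flags reduces to a masked
comparison (a hypothetical helper, not code from this series; PAGE_CACHE_ATTRS
is the existing _PAGE_PAT | _PAGE_PCD | _PAGE_PWT mask):

    /* Sketch: does l1f encode a Write-Combining mapping under Xen's PAT? */
    static bool pte_is_wc(unsigned int l1f)
    {
        return (l1f & PAGE_CACHE_ATTRS) == _PAGE_WC;
    }

A later patch in this series performs exactly this kind of check (combined
with _PAGE_RW) to spot writeable WC mappings of RAM.
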
+ +Only PV guests with access to devices can use reduced-cacheability mappings to +begin with, and they're trusted not to mount DoSs against the system anyway. + +Drop PGC_cacheattr_{base,mask} entirely, and the logic to manipulate them. +Shift the later PGC_* constants up, to gain 3 extra bits in the main reference +count. Retain the check in get_page_from_l1e() for special_pages() because a +guest has no business using reduced cacheability on these. + +This reverts changeset 55f97f49b7ce6c3520c555d19caac6cf3f9a5df0 + +This is CVE-2022-26363, part of XSA-402. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: George Dunlap <george.dunlap@citrix.com> +master commit: ae09597da34aee6bc5b76475c5eea6994457e854 +master date: 2022-06-09 14:22:08 +0200 +--- + xen/arch/x86/mm.c | 84 ++++------------------------------------ + xen/include/asm-x86/mm.h | 23 +++++------ + 2 files changed, 17 insertions(+), 90 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index c6429b0f749a..ab32d13a1a0d 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -783,28 +783,6 @@ bool is_iomem_page(mfn_t mfn) + return (page_get_owner(page) == dom_io); + } + +-static int update_xen_mappings(unsigned long mfn, unsigned int cacheattr) +-{ +- int err = 0; +- bool alias = mfn >= PFN_DOWN(xen_phys_start) && +- mfn < PFN_UP(xen_phys_start + xen_virt_end - XEN_VIRT_START); +- unsigned long xen_va = +- XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT); +- +- if ( boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) ) +- return 0; +- +- if ( unlikely(alias) && cacheattr ) +- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, 0); +- if ( !err ) +- err = map_pages_to_xen((unsigned long)mfn_to_virt(mfn), _mfn(mfn), 1, +- PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr)); +- if ( unlikely(alias) && !cacheattr && !err ) +- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, PAGE_HYPERVISOR); +- +- return err; +-} +- + #ifndef NDEBUG + struct mmio_emul_range_ctxt { + const struct domain *d; +@@ -1009,47 +987,14 @@ get_page_from_l1e( + goto could_not_pin; + } + +- if ( pte_flags_to_cacheattr(l1f) != +- ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) ) ++ if ( (l1f & PAGE_CACHE_ATTRS) != _PAGE_WB && is_special_page(page) ) + { +- unsigned long x, nx, y = page->count_info; +- unsigned long cacheattr = pte_flags_to_cacheattr(l1f); +- int err; +- +- if ( is_special_page(page) ) +- { +- if ( write ) +- put_page_type(page); +- put_page(page); +- gdprintk(XENLOG_WARNING, +- "Attempt to change cache attributes of Xen heap page\n"); +- return -EACCES; +- } +- +- do { +- x = y; +- nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base); +- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x ); +- +- err = update_xen_mappings(mfn, cacheattr); +- if ( unlikely(err) ) +- { +- cacheattr = y & PGC_cacheattr_mask; +- do { +- x = y; +- nx = (x & ~PGC_cacheattr_mask) | cacheattr; +- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x ); +- +- if ( write ) +- put_page_type(page); +- put_page(page); +- +- gdprintk(XENLOG_WARNING, "Error updating mappings for mfn %" PRI_mfn +- " (pfn %" PRI_pfn ", from L1 entry %" PRIpte ") for d%d\n", +- mfn, get_gpfn_from_mfn(mfn), +- l1e_get_intpte(l1e), l1e_owner->domain_id); +- return err; +- } ++ if ( write ) ++ put_page_type(page); ++ put_page(page); ++ gdprintk(XENLOG_WARNING, ++ "Attempt to change cache attributes of Xen heap page\n"); ++ return -EACCES; + } + + return 0; +@@ -2467,24 +2412,9 @@ static int mod_l4_entry(l4_pgentry_t *pl4e, + 
*/ + static int cleanup_page_mappings(struct page_info *page) + { +- unsigned int cacheattr = +- (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base; + int rc = 0; + unsigned long mfn = mfn_x(page_to_mfn(page)); + +- /* +- * If we've modified xen mappings as a result of guest cache +- * attributes, restore them to the "normal" state. +- */ +- if ( unlikely(cacheattr) ) +- { +- page->count_info &= ~PGC_cacheattr_mask; +- +- BUG_ON(is_special_page(page)); +- +- rc = update_xen_mappings(mfn, 0); +- } +- + /* + * If this may be in a PV domain's IOMMU, remove it. + * +diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h +index cb9052749963..8a9a43bb0a9d 100644 +--- a/xen/include/asm-x86/mm.h ++++ b/xen/include/asm-x86/mm.h +@@ -69,25 +69,22 @@ + /* Set when is using a page as a page table */ + #define _PGC_page_table PG_shift(3) + #define PGC_page_table PG_mask(1, 3) +- /* 3-bit PAT/PCD/PWT cache-attribute hint. */ +-#define PGC_cacheattr_base PG_shift(6) +-#define PGC_cacheattr_mask PG_mask(7, 6) + /* Page is broken? */ +-#define _PGC_broken PG_shift(7) +-#define PGC_broken PG_mask(1, 7) ++#define _PGC_broken PG_shift(4) ++#define PGC_broken PG_mask(1, 4) + /* Mutually-exclusive page states: { inuse, offlining, offlined, free }. */ +-#define PGC_state PG_mask(3, 9) +-#define PGC_state_inuse PG_mask(0, 9) +-#define PGC_state_offlining PG_mask(1, 9) +-#define PGC_state_offlined PG_mask(2, 9) +-#define PGC_state_free PG_mask(3, 9) ++#define PGC_state PG_mask(3, 6) ++#define PGC_state_inuse PG_mask(0, 6) ++#define PGC_state_offlining PG_mask(1, 6) ++#define PGC_state_offlined PG_mask(2, 6) ++#define PGC_state_free PG_mask(3, 6) + #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st) + /* Page is not reference counted (see below for caveats) */ +-#define _PGC_extra PG_shift(10) +-#define PGC_extra PG_mask(1, 10) ++#define _PGC_extra PG_shift(7) ++#define PGC_extra PG_mask(1, 7) + + /* Count of references to this frame. */ +-#define PGC_count_width PG_shift(10) ++#define PGC_count_width PG_shift(7) + #define PGC_count_mask ((1UL<<PGC_count_width)-1) + + /* +-- +2.35.1 + diff --git a/0026-x86-Split-cache_flush-out-of-cache_writeback.patch b/0026-x86-Split-cache_flush-out-of-cache_writeback.patch new file mode 100644 index 0000000..ffd8d7c --- /dev/null +++ b/0026-x86-Split-cache_flush-out-of-cache_writeback.patch @@ -0,0 +1,294 @@ +From 8eafa2d871ae51d461256e4a14175e24df330c70 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Thu, 9 Jun 2022 15:28:48 +0200 +Subject: [PATCH 26/32] x86: Split cache_flush() out of cache_writeback() + +Subsequent changes will want a fully flushing version. + +Use the new helper rather than opencoding it in flush_area_local(). This +resolves an outstanding issue where the conditional sfence is on the wrong +side of the clflushopt loop. clflushopt is ordered with respect to older +stores, not to younger stores. + +Rename gnttab_cache_flush()'s helper to avoid colliding in name. +grant_table.c can see the prototype from cache.h so the build fails +otherwise. + +This is part of XSA-402. 
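
The ordering issue being fixed can be seen in a reduced sketch of the new
cache_flush() (assumptions: 64-byte cache lines and CLFLUSHOPT present; the
real code sizes the loop from x86_clflush_size and dispatches through the
alternatives framework):

    /* Sketch: flush a range, fencing *after* the loop. */
    static void cache_flush_sketch(const void *addr, unsigned int size)
    {
        const void *end = addr + size;

        for ( ; addr < end; addr += 64 )
            asm volatile ( "clflushopt %0" :: "m" (*(const char *)addr) );

        /*
         * CLFLUSHOPT is only ordered with respect to older stores, so the
         * SFENCE must follow the loop to order the flushes against younger
         * ones, not precede it.
         */
        asm volatile ( "sfence" ::: "memory" );
    }
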
+ +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: 9a67ffee3371506e1cbfdfff5b90658d4828f6a2 +master date: 2022-06-09 14:22:38 +0200 +--- + xen/arch/x86/flushtlb.c | 84 ++++++++++++++++++++++++--- + xen/common/grant_table.c | 4 +- + xen/drivers/passthrough/vtd/extern.h | 1 - + xen/drivers/passthrough/vtd/iommu.c | 53 +---------------- + xen/drivers/passthrough/vtd/x86/vtd.c | 5 -- + xen/include/asm-x86/cache.h | 7 +++ + 6 files changed, 88 insertions(+), 66 deletions(-) + +diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c +index 25798df50f54..0c912b8669f8 100644 +--- a/xen/arch/x86/flushtlb.c ++++ b/xen/arch/x86/flushtlb.c +@@ -234,7 +234,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags) + if ( flags & FLUSH_CACHE ) + { + const struct cpuinfo_x86 *c = ¤t_cpu_data; +- unsigned long i, sz = 0; ++ unsigned long sz = 0; + + if ( order < (BITS_PER_LONG - PAGE_SHIFT) ) + sz = 1UL << (order + PAGE_SHIFT); +@@ -244,13 +244,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags) + c->x86_clflush_size && c->x86_cache_size && sz && + ((sz >> 10) < c->x86_cache_size) ) + { +- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT); +- for ( i = 0; i < sz; i += c->x86_clflush_size ) +- alternative_input(".byte " __stringify(NOP_DS_PREFIX) ";" +- " clflush %0", +- "data16 clflush %0", /* clflushopt */ +- X86_FEATURE_CLFLUSHOPT, +- "m" (((const char *)va)[i])); ++ cache_flush(va, sz); + flags &= ~FLUSH_CACHE; + } + else +@@ -265,6 +259,80 @@ unsigned int flush_area_local(const void *va, unsigned int flags) + return flags; + } + ++void cache_flush(const void *addr, unsigned int size) ++{ ++ /* ++ * This function may be called before current_cpu_data is established. ++ * Hence a fallback is needed to prevent the loop below becoming infinite. ++ */ ++ unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16; ++ const void *end = addr + size; ++ ++ addr -= (unsigned long)addr & (clflush_size - 1); ++ for ( ; addr < end; addr += clflush_size ) ++ { ++ /* ++ * Note regarding the "ds" prefix use: it's faster to do a clflush ++ * + prefix than a clflush + nop, and hence the prefix is added instead ++ * of letting the alternative framework fill the gap by appending nops. ++ */ ++ alternative_io("ds; clflush %[p]", ++ "data16 clflush %[p]", /* clflushopt */ ++ X86_FEATURE_CLFLUSHOPT, ++ /* no outputs */, ++ [p] "m" (*(const char *)(addr))); ++ } ++ ++ alternative("", "sfence", X86_FEATURE_CLFLUSHOPT); ++} ++ ++void cache_writeback(const void *addr, unsigned int size) ++{ ++ unsigned int clflush_size; ++ const void *end = addr + size; ++ ++ /* Fall back to CLFLUSH{,OPT} when CLWB isn't available. */ ++ if ( !boot_cpu_has(X86_FEATURE_CLWB) ) ++ return cache_flush(addr, size); ++ ++ /* ++ * This function may be called before current_cpu_data is established. ++ * Hence a fallback is needed to prevent the loop below becoming infinite. ++ */ ++ clflush_size = current_cpu_data.x86_clflush_size ?: 16; ++ addr -= (unsigned long)addr & (clflush_size - 1); ++ for ( ; addr < end; addr += clflush_size ) ++ { ++/* ++ * The arguments to a macro must not include preprocessor directives. Doing so ++ * results in undefined behavior, so we have to create some defines here in ++ * order to avoid it. 
++ */ ++#if defined(HAVE_AS_CLWB) ++# define CLWB_ENCODING "clwb %[p]" ++#elif defined(HAVE_AS_XSAVEOPT) ++# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */ ++#else ++# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */ ++#endif ++ ++#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr)) ++#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT) ++# define INPUT BASE_INPUT ++#else ++# define INPUT(addr) "a" (addr), BASE_INPUT(addr) ++#endif ++ ++ asm volatile (CLWB_ENCODING :: INPUT(addr)); ++ ++#undef INPUT ++#undef BASE_INPUT ++#undef CLWB_ENCODING ++ } ++ ++ asm volatile ("sfence" ::: "memory"); ++} ++ + unsigned int guest_flush_tlb_flags(const struct domain *d) + { + bool shadow = paging_mode_shadow(d); +diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c +index 66f8ce71741c..4c742cd8fe81 100644 +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -3431,7 +3431,7 @@ gnttab_swap_grant_ref(XEN_GUEST_HANDLE_PARAM(gnttab_swap_grant_ref_t) uop, + return 0; + } + +-static int cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref) ++static int _cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref) + { + struct domain *d, *owner; + struct page_info *page; +@@ -3525,7 +3525,7 @@ gnttab_cache_flush(XEN_GUEST_HANDLE_PARAM(gnttab_cache_flush_t) uop, + return -EFAULT; + for ( ; ; ) + { +- int ret = cache_flush(&op, cur_ref); ++ int ret = _cache_flush(&op, cur_ref); + + if ( ret < 0 ) + return ret; +diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h +index 01e010a10d61..401079299725 100644 +--- a/xen/drivers/passthrough/vtd/extern.h ++++ b/xen/drivers/passthrough/vtd/extern.h +@@ -76,7 +76,6 @@ int __must_check qinval_device_iotlb_sync(struct vtd_iommu *iommu, + struct pci_dev *pdev, + u16 did, u16 size, u64 addr); + +-unsigned int get_cache_line_size(void); + void flush_all_cache(void); + + uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node); +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index 8975c1de61bc..bc377c9bcfa4 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -31,6 +31,7 @@ + #include <xen/pci.h> + #include <xen/pci_regs.h> + #include <xen/keyhandler.h> ++#include <asm/cache.h> + #include <asm/msi.h> + #include <asm/nops.h> + #include <asm/irq.h> +@@ -206,54 +207,6 @@ static void check_cleanup_domid_map(const struct domain *d, + } + } + +-static void sync_cache(const void *addr, unsigned int size) +-{ +- static unsigned long clflush_size = 0; +- const void *end = addr + size; +- +- if ( clflush_size == 0 ) +- clflush_size = get_cache_line_size(); +- +- addr -= (unsigned long)addr & (clflush_size - 1); +- for ( ; addr < end; addr += clflush_size ) +-/* +- * The arguments to a macro must not include preprocessor directives. Doing so +- * results in undefined behavior, so we have to create some defines here in +- * order to avoid it. 
+- */ +-#if defined(HAVE_AS_CLWB) +-# define CLWB_ENCODING "clwb %[p]" +-#elif defined(HAVE_AS_XSAVEOPT) +-# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */ +-#else +-# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */ +-#endif +- +-#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr)) +-#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT) +-# define INPUT BASE_INPUT +-#else +-# define INPUT(addr) "a" (addr), BASE_INPUT(addr) +-#endif +- /* +- * Note regarding the use of NOP_DS_PREFIX: it's faster to do a clflush +- * + prefix than a clflush + nop, and hence the prefix is added instead +- * of letting the alternative framework fill the gap by appending nops. +- */ +- alternative_io_2(".byte " __stringify(NOP_DS_PREFIX) "; clflush %[p]", +- "data16 clflush %[p]", /* clflushopt */ +- X86_FEATURE_CLFLUSHOPT, +- CLWB_ENCODING, +- X86_FEATURE_CLWB, /* no outputs */, +- INPUT(addr)); +-#undef INPUT +-#undef BASE_INPUT +-#undef CLWB_ENCODING +- +- alternative_2("", "sfence", X86_FEATURE_CLFLUSHOPT, +- "sfence", X86_FEATURE_CLWB); +-} +- + /* Allocate page table, return its machine address */ + uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node) + { +@@ -273,7 +226,7 @@ uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node) + clear_page(vaddr); + + if ( (iommu_ops.init ? &iommu_ops : &vtd_ops)->sync_cache ) +- sync_cache(vaddr, PAGE_SIZE); ++ cache_writeback(vaddr, PAGE_SIZE); + unmap_domain_page(vaddr); + cur_pg++; + } +@@ -1305,7 +1258,7 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd) + iommu->nr_pt_levels = agaw_to_level(agaw); + + if ( !ecap_coherent(iommu->ecap) ) +- vtd_ops.sync_cache = sync_cache; ++ vtd_ops.sync_cache = cache_writeback; + + /* allocate domain id bitmap */ + iommu->domid_bitmap = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_dom)); +diff --git a/xen/drivers/passthrough/vtd/x86/vtd.c b/xen/drivers/passthrough/vtd/x86/vtd.c +index 6681dccd6970..55f0faa521cb 100644 +--- a/xen/drivers/passthrough/vtd/x86/vtd.c ++++ b/xen/drivers/passthrough/vtd/x86/vtd.c +@@ -47,11 +47,6 @@ void unmap_vtd_domain_page(const void *va) + unmap_domain_page(va); + } + +-unsigned int get_cache_line_size(void) +-{ +- return ((cpuid_ebx(1) >> 8) & 0xff) * 8; +-} +- + void flush_all_cache() + { + wbinvd(); +diff --git a/xen/include/asm-x86/cache.h b/xen/include/asm-x86/cache.h +index 1f7173d8c72c..e4770efb22b9 100644 +--- a/xen/include/asm-x86/cache.h ++++ b/xen/include/asm-x86/cache.h +@@ -11,4 +11,11 @@ + + #define __read_mostly __section(".data.read_mostly") + ++#ifndef __ASSEMBLY__ ++ ++void cache_flush(const void *addr, unsigned int size); ++void cache_writeback(const void *addr, unsigned int size); ++ ++#endif ++ + #endif +-- +2.35.1 + diff --git a/0027-x86-amd-Work-around-CLFLUSH-ordering-on-older-parts.patch b/0027-x86-amd-Work-around-CLFLUSH-ordering-on-older-parts.patch new file mode 100644 index 0000000..a3ab379 --- /dev/null +++ b/0027-x86-amd-Work-around-CLFLUSH-ordering-on-older-parts.patch @@ -0,0 +1,95 @@ +From c4815be949aae6583a9a22897beb96b095b4f1a2 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Thu, 9 Jun 2022 15:29:13 +0200 +Subject: [PATCH 27/32] x86/amd: Work around CLFLUSH ordering on older parts + +On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakely ordered with everything, +including reads and writes to the address, and LFENCE/SFENCE instructions. + +This creates a multitude of problematic corner cases, laid out in the manual. 
+Arrange to use MFENCE on both sides of the CLFLUSH to force proper ordering. + +This is part of XSA-402. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: 062868a5a8b428b85db589fa9a6d6e43969ffeb9 +master date: 2022-06-09 14:23:07 +0200 +--- + xen/arch/x86/cpu/amd.c | 8 ++++++++ + xen/arch/x86/flushtlb.c | 13 ++++++++++++- + xen/include/asm-x86/cpufeatures.h | 1 + + 3 files changed, 21 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c +index a8e37dbb1f5c..b3b9a0df5fed 100644 +--- a/xen/arch/x86/cpu/amd.c ++++ b/xen/arch/x86/cpu/amd.c +@@ -812,6 +812,14 @@ static void init_amd(struct cpuinfo_x86 *c) + if (!cpu_has_lfence_dispatch) + __set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability); + ++ /* ++ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with ++ * everything, including reads and writes to address, and ++ * LFENCE/SFENCE instructions. ++ */ ++ if (!cpu_has_clflushopt) ++ setup_force_cpu_cap(X86_BUG_CLFLUSH_MFENCE); ++ + switch(c->x86) + { + case 0xf ... 0x11: +diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c +index 0c912b8669f8..dcbb4064012e 100644 +--- a/xen/arch/x86/flushtlb.c ++++ b/xen/arch/x86/flushtlb.c +@@ -259,6 +259,13 @@ unsigned int flush_area_local(const void *va, unsigned int flags) + return flags; + } + ++/* ++ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything, ++ * including reads and writes to address, and LFENCE/SFENCE instructions. ++ * ++ * This function only works safely after alternatives have run. Luckily, at ++ * the time of writing, we don't flush the caches that early. ++ */ + void cache_flush(const void *addr, unsigned int size) + { + /* +@@ -268,6 +275,8 @@ void cache_flush(const void *addr, unsigned int size) + unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16; + const void *end = addr + size; + ++ alternative("", "mfence", X86_BUG_CLFLUSH_MFENCE); ++ + addr -= (unsigned long)addr & (clflush_size - 1); + for ( ; addr < end; addr += clflush_size ) + { +@@ -283,7 +292,9 @@ void cache_flush(const void *addr, unsigned int size) + [p] "m" (*(const char *)(addr))); + } + +- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT); ++ alternative_2("", ++ "sfence", X86_FEATURE_CLFLUSHOPT, ++ "mfence", X86_BUG_CLFLUSH_MFENCE); + } + + void cache_writeback(const void *addr, unsigned int size) +diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h +index 7413febd7ad8..ff3157d52d13 100644 +--- a/xen/include/asm-x86/cpufeatures.h ++++ b/xen/include/asm-x86/cpufeatures.h +@@ -47,6 +47,7 @@ XEN_CPUFEATURE(XEN_IBT, X86_SYNTH(27)) /* Xen uses CET Indirect Branch + + #define X86_BUG_FPU_PTRS X86_BUG( 0) /* (F)X{SAVE,RSTOR} doesn't save/restore FOP/FIP/FDP. */ + #define X86_BUG_NULL_SEG X86_BUG( 1) /* NULL-ing a selector preserves the base and limit. */ ++#define X86_BUG_CLFLUSH_MFENCE X86_BUG( 2) /* MFENCE needed to serialise CLFLUSH */ + + /* Total number of capability words, inc synth and bug words. 
*/
+ #define NCAPINTS (FSCAPINTS + X86_NR_SYNTH + X86_NR_BUG) /* N 32-bit words worth of info */
+--
+2.35.1
+
diff --git a/0028-x86-pv-Track-and-flush-non-coherent-mappings-of-RAM.patch b/0028-x86-pv-Track-and-flush-non-coherent-mappings-of-RAM.patch
new file mode 100644
index 0000000..66cd741
--- /dev/null
+++ b/0028-x86-pv-Track-and-flush-non-coherent-mappings-of-RAM.patch
@@ -0,0 +1,160 @@
+From dc020d8d1ba420e2dd0e7a40f5045db897f3c4f4 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Thu, 9 Jun 2022 15:29:38 +0200
+Subject: [PATCH 28/32] x86/pv: Track and flush non-coherent mappings of RAM
+
+There are legitimate uses of WC mappings of RAM, e.g. for DMA buffers with
+devices that make non-coherent writes.  The Linux sound subsystem makes
+extensive use of this technique.
+
+For such usecases, the guest's DMA buffer is mapped and consistently used as
+WC, and Xen doesn't interact with the buffer.
+
+However, a mischievous guest can use WC mappings to deliberately create
+non-coherency between the cache and RAM, and use this to trick Xen into
+validating a pagetable which isn't actually safe.
+
+Allocate a new PGT_non_coherent to track the non-coherency of mappings.  Set
+it whenever a non-coherent writeable mapping is created.  If the page is used
+as anything other than PGT_writable_page, force a cache flush before
+validation.  Also force a cache flush before the page is returned to the heap.
+
+This is CVE-2022-26364, part of XSA-402.
+
+Reported-by: Jann Horn <jannh@google.com>
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: c1c9cae3a9633054b177c5de21ad7268162b2f2c
+master date: 2022-06-09 14:23:37 +0200
+---
+ xen/arch/x86/mm.c             | 38 +++++++++++++++++++++++++++++++++++
+ xen/arch/x86/pv/grant_table.c | 21 +++++++++++++++++++
+ xen/include/asm-x86/mm.h      |  6 +++++-
+ 3 files changed, 64 insertions(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index ab32d13a1a0d..bab9624fabb7 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -997,6 +997,15 @@ get_page_from_l1e(
+         return -EACCES;
+     }
+ 
++    /*
++     * Track writeable non-coherent mappings to RAM pages, to trigger a cache
++     * flush later if the target is used as anything but a PGT_writeable page.
++     * We care about all writeable mappings, including foreign mappings.
++     */
++    if ( !boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) &&
++         (l1f & (PAGE_CACHE_ATTRS | _PAGE_RW)) == (_PAGE_WC | _PAGE_RW) )
++        set_bit(_PGT_non_coherent, &page->u.inuse.type_info);
++
+     return 0;
+ 
+  could_not_pin:
+@@ -2454,6 +2463,19 @@ static int cleanup_page_mappings(struct page_info *page)
+         }
+     }
+ 
++    /*
++     * Flush the cache if there were previously non-coherent writeable
++     * mappings of this page.  This forces the page to be coherent before it
++     * is freed back to the heap.
++     */
++    if ( __test_and_clear_bit(_PGT_non_coherent, &page->u.inuse.type_info) )
++    {
++        void *addr = __map_domain_page(page);
++
++        cache_flush(addr, PAGE_SIZE);
++        unmap_domain_page(addr);
++    }
++
+     return rc;
+ }
+ 
+@@ -3027,6 +3049,22 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+ 
+     if ( unlikely(!(nx & PGT_validated)) )
+     {
++        /*
++         * Flush the cache if there were previously non-coherent mappings of
++         * this page, and we're trying to use it as anything other than a
++         * writeable page.  This forces the page to be coherent before we
++         * validate its contents for safety.
++ */ ++ if ( (nx & PGT_non_coherent) && type != PGT_writable_page ) ++ { ++ void *addr = __map_domain_page(page); ++ ++ cache_flush(addr, PAGE_SIZE); ++ unmap_domain_page(addr); ++ ++ page->u.inuse.type_info &= ~PGT_non_coherent; ++ } ++ + /* + * No special validation needed for writable or shared pages. Page + * tables and GDT/LDT need to have their contents audited. +diff --git a/xen/arch/x86/pv/grant_table.c b/xen/arch/x86/pv/grant_table.c +index 0325618c9883..81c72e61ed55 100644 +--- a/xen/arch/x86/pv/grant_table.c ++++ b/xen/arch/x86/pv/grant_table.c +@@ -109,7 +109,17 @@ int create_grant_pv_mapping(uint64_t addr, mfn_t frame, + + ol1e = *pl1e; + if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) ) ++ { ++ /* ++ * We always create mappings in this path. However, our caller, ++ * map_grant_ref(), only passes potentially non-zero cache_flags for ++ * MMIO frames, so this path doesn't create non-coherent mappings of ++ * RAM frames and there's no need to calculate PGT_non_coherent. ++ */ ++ ASSERT(!cache_flags || is_iomem_page(frame)); ++ + rc = GNTST_okay; ++ } + + out_unlock: + page_unlock(page); +@@ -294,7 +304,18 @@ int replace_grant_pv_mapping(uint64_t addr, mfn_t frame, + l1e_get_flags(ol1e), addr, grant_pte_flags); + + if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) ) ++ { ++ /* ++ * Generally, replace_grant_pv_mapping() is used to destroy mappings ++ * (n1le = l1e_empty()), but it can be a present mapping on the ++ * GNTABOP_unmap_and_replace path. ++ * ++ * In such cases, the PTE is fully transplanted from its old location ++ * via steal_linear_addr(), so we need not perform PGT_non_coherent ++ * checking here. ++ */ + rc = GNTST_okay; ++ } + + out_unlock: + page_unlock(page); +diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h +index 8a9a43bb0a9d..7464167ae192 100644 +--- a/xen/include/asm-x86/mm.h ++++ b/xen/include/asm-x86/mm.h +@@ -53,8 +53,12 @@ + #define _PGT_partial PG_shift(8) + #define PGT_partial PG_mask(1, 8) + ++/* Has this page been mapped writeable with a non-coherent memory type? */ ++#define _PGT_non_coherent PG_shift(9) ++#define PGT_non_coherent PG_mask(1, 9) ++ + /* Count of uses of this frame as its current type. */ +-#define PGT_count_width PG_shift(8) ++#define PGT_count_width PG_shift(9) + #define PGT_count_mask ((1UL<<PGT_count_width)-1) + + /* Are the 'type mask' bits identical? */ +-- +2.35.1 + diff --git a/0029-x86-mm-account-for-PGT_pae_xen_l2-in-recently-added-.patch b/0029-x86-mm-account-for-PGT_pae_xen_l2-in-recently-added-.patch new file mode 100644 index 0000000..0076984 --- /dev/null +++ b/0029-x86-mm-account-for-PGT_pae_xen_l2-in-recently-added-.patch @@ -0,0 +1,37 @@ +From 0b4e62847c5af1a59eea8d17093feccd550d1c26 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Fri, 10 Jun 2022 10:28:28 +0200 +Subject: [PATCH 29/32] x86/mm: account for PGT_pae_xen_l2 in recently added + assertion + +While PGT_pae_xen_l2 will be zapped once the type refcount of an L2 page +reaches zero, it'll be retained as long as the type refcount is non- +zero. Hence any checking against the requested type needs to either zap +the bit from the type or include it in the used mask. 
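
Concretely, the two correct forms the above describes are (illustrative only;
the hunk below adopts the second):

    /* Zap the bit from the requested type before comparing ... */
    ASSERT((x & (PGT_type_mask | PGT_count_mask)) ==
           ((type & ~PGT_pae_xen_l2) | 1));

    /* ... or include the bit in the mask applied to the observed word. */
    ASSERT((x & (PGT_type_mask | PGT_pae_xen_l2 | PGT_count_mask)) ==
           (type | 1));
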
+ +Fixes: 9186e96b199e ("x86/pv: Clean up _get_page_type()") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: c2095ac76be0f4a1940346c9ffb49fb967345060 +master date: 2022-06-10 10:21:06 +0200 +--- + xen/arch/x86/mm.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index bab9624fabb7..c1b9a3bb102a 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -2928,7 +2928,8 @@ static int _get_page_type(struct page_info *page, unsigned long type, + * The page is in one of two states (depending on PGT_partial), + * and should have exactly one reference. + */ +- ASSERT((x & (PGT_type_mask | PGT_count_mask)) == (type | 1)); ++ ASSERT((x & (PGT_type_mask | PGT_pae_xen_l2 | PGT_count_mask)) == ++ (type | 1)); + + if ( !(x & PGT_partial) ) + { +-- +2.35.1 + diff --git a/0030-x86-spec-ctrl-Make-VERW-flushing-runtime-conditional.patch b/0030-x86-spec-ctrl-Make-VERW-flushing-runtime-conditional.patch new file mode 100644 index 0000000..8556452 --- /dev/null +++ b/0030-x86-spec-ctrl-Make-VERW-flushing-runtime-conditional.patch @@ -0,0 +1,258 @@ +From 0e80f9f61168d4e4f008da75762cee0118f802ed Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Mon, 13 Jun 2022 16:19:01 +0100 +Subject: [PATCH 30/32] x86/spec-ctrl: Make VERW flushing runtime conditional +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Currently, VERW flushing to mitigate MDS is boot time conditional per domain +type. However, to provide mitigations for DRPW (CVE-2022-21166), we need to +conditionally use VERW based on the trustworthiness of the guest, and the +devices passed through. + +Remove the PV/HVM alternatives and instead issue a VERW on the return-to-guest +path depending on the SCF_verw bit in cpuinfo spec_ctrl_flags. + +Introduce spec_ctrl_init_domain() and d->arch.verw to calculate the VERW +disposition at domain creation time, and context switch the SCF_verw bit. + +For now, VERW flushing is used and controlled exactly as before, but later +patches will add per-domain cases too. + +No change in behaviour. + +This is part of XSA-404. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +(cherry picked from commit e06b95c1d44ab80da255219fc9f1e2fc423edcb6) +--- + docs/misc/xen-command-line.pandoc | 5 ++--- + xen/arch/x86/domain.c | 12 ++++++++++-- + xen/arch/x86/hvm/vmx/entry.S | 2 +- + xen/arch/x86/spec_ctrl.c | 30 +++++++++++++++++------------ + xen/include/asm-x86/cpufeatures.h | 3 +-- + xen/include/asm-x86/domain.h | 3 +++ + xen/include/asm-x86/spec_ctrl.h | 2 ++ + xen/include/asm-x86/spec_ctrl_asm.h | 16 +++++++++++++-- + 8 files changed, 51 insertions(+), 22 deletions(-) + +diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc +index 1d08fb7e9aa6..d5cb09f86541 100644 +--- a/docs/misc/xen-command-line.pandoc ++++ b/docs/misc/xen-command-line.pandoc +@@ -2258,9 +2258,8 @@ in place for guests to use. + Use of a positive boolean value for either of these options is invalid. + + The booleans `pv=`, `hvm=`, `msr-sc=`, `rsb=` and `md-clear=` offer fine +-grained control over the alternative blocks used by Xen. These impact Xen's +-ability to protect itself, and Xen's ability to virtualise support for guests +-to use. ++grained control over the primitives by Xen. 
These impact Xen's ability to ++protect itself, and Xen's ability to virtualise support for guests to use. + + * `pv=` and `hvm=` offer control over all suboptions for PV and HVM guests + respectively. +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index ef1812dc1402..1fe6644a71ae 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -863,6 +863,8 @@ int arch_domain_create(struct domain *d, + + d->arch.msr_relaxed = config->arch.misc_flags & XEN_X86_MSR_RELAXED; + ++ spec_ctrl_init_domain(d); ++ + return 0; + + fail: +@@ -2017,14 +2019,15 @@ static void __context_switch(void) + void context_switch(struct vcpu *prev, struct vcpu *next) + { + unsigned int cpu = smp_processor_id(); ++ struct cpu_info *info = get_cpu_info(); + const struct domain *prevd = prev->domain, *nextd = next->domain; + unsigned int dirty_cpu = read_atomic(&next->dirty_cpu); + + ASSERT(prev != next); + ASSERT(local_irq_is_enabled()); + +- get_cpu_info()->use_pv_cr3 = false; +- get_cpu_info()->xen_cr3 = 0; ++ info->use_pv_cr3 = false; ++ info->xen_cr3 = 0; + + if ( unlikely(dirty_cpu != cpu) && dirty_cpu != VCPU_CPU_CLEAN ) + { +@@ -2088,6 +2091,11 @@ void context_switch(struct vcpu *prev, struct vcpu *next) + *last_id = next_id; + } + } ++ ++ /* Update the top-of-stack block with the VERW disposition. */ ++ info->spec_ctrl_flags &= ~SCF_verw; ++ if ( nextd->arch.verw ) ++ info->spec_ctrl_flags |= SCF_verw; + } + + sched_context_switched(prev, next); +diff --git a/xen/arch/x86/hvm/vmx/entry.S b/xen/arch/x86/hvm/vmx/entry.S +index 49651f3c435a..5f5de45a1309 100644 +--- a/xen/arch/x86/hvm/vmx/entry.S ++++ b/xen/arch/x86/hvm/vmx/entry.S +@@ -87,7 +87,7 @@ UNLIKELY_END(realmode) + + /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ + /* SPEC_CTRL_EXIT_TO_VMX Req: %rsp=regs/cpuinfo Clob: */ +- ALTERNATIVE "", __stringify(verw CPUINFO_verw_sel(%rsp)), X86_FEATURE_SC_VERW_HVM ++ DO_SPEC_CTRL_COND_VERW + + mov VCPU_hvm_guest_cr2(%rbx),%rax + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index c19464da70ce..21730aa03071 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -36,8 +36,8 @@ static bool __initdata opt_msr_sc_pv = true; + static bool __initdata opt_msr_sc_hvm = true; + static int8_t __initdata opt_rsb_pv = -1; + static bool __initdata opt_rsb_hvm = true; +-static int8_t __initdata opt_md_clear_pv = -1; +-static int8_t __initdata opt_md_clear_hvm = -1; ++static int8_t __read_mostly opt_md_clear_pv = -1; ++static int8_t __read_mostly opt_md_clear_hvm = -1; + + /* Cmdline controls for Xen's speculative settings. */ + static enum ind_thunk { +@@ -932,6 +932,13 @@ static __init void mds_calculations(uint64_t caps) + } + } + ++void spec_ctrl_init_domain(struct domain *d) ++{ ++ bool pv = is_pv_domain(d); ++ ++ d->arch.verw = pv ? opt_md_clear_pv : opt_md_clear_hvm; ++} ++ + void __init init_speculation_mitigations(void) + { + enum ind_thunk thunk = THUNK_DEFAULT; +@@ -1196,21 +1203,20 @@ void __init init_speculation_mitigations(void) + boot_cpu_has(X86_FEATURE_MD_CLEAR)); + + /* +- * Enable MDS defences as applicable. The PV blocks need using all the +- * time, and the Idle blocks need using if either PV or HVM defences are +- * used. ++ * Enable MDS defences as applicable. The Idle blocks need using if ++ * either PV or HVM defences are used. + * + * HVM is more complicated. The MD_CLEAR microcode extends L1D_FLUSH with +- * equivelent semantics to avoid needing to perform both flushes on the +- * HVM path. 
The HVM blocks don't need activating if our hypervisor told +- * us it was handling L1D_FLUSH, or we are using L1D_FLUSH ourselves. ++ * equivalent semantics to avoid needing to perform both flushes on the ++ * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH. ++ * ++ * After calculating the appropriate idle setting, simplify ++ * opt_md_clear_hvm to mean just "should we VERW on the way into HVM ++ * guests", so spec_ctrl_init_domain() can calculate suitable settings. + */ +- if ( opt_md_clear_pv ) +- setup_force_cpu_cap(X86_FEATURE_SC_VERW_PV); + if ( opt_md_clear_pv || opt_md_clear_hvm ) + setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE); +- if ( opt_md_clear_hvm && !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush ) +- setup_force_cpu_cap(X86_FEATURE_SC_VERW_HVM); ++ opt_md_clear_hvm &= !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush; + + /* + * Warn the user if they are on MLPDS/MFBDS-vulnerable hardware with HT +diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h +index ff3157d52d13..bd45a144ee78 100644 +--- a/xen/include/asm-x86/cpufeatures.h ++++ b/xen/include/asm-x86/cpufeatures.h +@@ -35,8 +35,7 @@ XEN_CPUFEATURE(SC_RSB_HVM, X86_SYNTH(19)) /* RSB overwrite needed for HVM + XEN_CPUFEATURE(XEN_SELFSNOOP, X86_SYNTH(20)) /* SELFSNOOP gets used by Xen itself */ + XEN_CPUFEATURE(SC_MSR_IDLE, X86_SYNTH(21)) /* (SC_MSR_PV || SC_MSR_HVM) && default_xen_spec_ctrl */ + XEN_CPUFEATURE(XEN_LBR, X86_SYNTH(22)) /* Xen uses MSR_DEBUGCTL.LBR */ +-XEN_CPUFEATURE(SC_VERW_PV, X86_SYNTH(23)) /* VERW used by Xen for PV */ +-XEN_CPUFEATURE(SC_VERW_HVM, X86_SYNTH(24)) /* VERW used by Xen for HVM */ ++/* Bits 23,24 unused. */ + XEN_CPUFEATURE(SC_VERW_IDLE, X86_SYNTH(25)) /* VERW used by Xen for idle */ + XEN_CPUFEATURE(XEN_SHSTK, X86_SYNTH(26)) /* Xen uses CET Shadow Stacks */ + XEN_CPUFEATURE(XEN_IBT, X86_SYNTH(27)) /* Xen uses CET Indirect Branch Tracking */ +diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h +index 92d54de0b9a1..2398a1d99da9 100644 +--- a/xen/include/asm-x86/domain.h ++++ b/xen/include/asm-x86/domain.h +@@ -319,6 +319,9 @@ struct arch_domain + uint32_t pci_cf8; + uint8_t cmos_idx; + ++ /* Use VERW on return-to-guest for its flushing side effect. */ ++ bool verw; ++ + union { + struct pv_domain pv; + struct hvm_domain hvm; +diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h +index f76029523610..751355f471f4 100644 +--- a/xen/include/asm-x86/spec_ctrl.h ++++ b/xen/include/asm-x86/spec_ctrl.h +@@ -24,6 +24,7 @@ + #define SCF_use_shadow (1 << 0) + #define SCF_ist_wrmsr (1 << 1) + #define SCF_ist_rsb (1 << 2) ++#define SCF_verw (1 << 3) + + #ifndef __ASSEMBLY__ + +@@ -32,6 +33,7 @@ + #include <asm/msr-index.h> + + void init_speculation_mitigations(void); ++void spec_ctrl_init_domain(struct domain *d); + + extern bool opt_ibpb; + extern bool opt_ssbd; +diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h +index 02b3b18ce69f..5a590bac44aa 100644 +--- a/xen/include/asm-x86/spec_ctrl_asm.h ++++ b/xen/include/asm-x86/spec_ctrl_asm.h +@@ -136,6 +136,19 @@ + #endif + .endm + ++.macro DO_SPEC_CTRL_COND_VERW ++/* ++ * Requires %rsp=cpuinfo ++ * ++ * Issue a VERW for its flushing side effect, if indicated. This is a Spectre ++ * v1 gadget, but the IRET/VMEntry is serialising. 
++ */ ++ testb $SCF_verw, CPUINFO_spec_ctrl_flags(%rsp) ++ jz .L\@_verw_skip ++ verw CPUINFO_verw_sel(%rsp) ++.L\@_verw_skip: ++.endm ++ + .macro DO_SPEC_CTRL_ENTRY maybexen:req + /* + * Requires %rsp=regs (also cpuinfo if !maybexen) +@@ -231,8 +244,7 @@ + #define SPEC_CTRL_EXIT_TO_PV \ + ALTERNATIVE "", \ + DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV; \ +- ALTERNATIVE "", __stringify(verw CPUINFO_verw_sel(%rsp)), \ +- X86_FEATURE_SC_VERW_PV ++ DO_SPEC_CTRL_COND_VERW + + /* + * Use in IST interrupt/exception context. May interrupt Xen or PV context. +-- +2.35.1 + diff --git a/0031-x86-spec-ctrl-Enumeration-for-MMIO-Stale-Data-contro.patch b/0031-x86-spec-ctrl-Enumeration-for-MMIO-Stale-Data-contro.patch new file mode 100644 index 0000000..6934800 --- /dev/null +++ b/0031-x86-spec-ctrl-Enumeration-for-MMIO-Stale-Data-contro.patch @@ -0,0 +1,98 @@ +From a83108736db0ddaa5855f5abda6dcc8ae4fe25e9 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Mon, 20 Sep 2021 18:47:49 +0100 +Subject: [PATCH 31/32] x86/spec-ctrl: Enumeration for MMIO Stale Data controls +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The three *_NO bits indicate non-susceptibility to the SSDP, FBSDP and PSDP +data movement primitives. + +FB_CLEAR indicates that the VERW instruction has re-gained it's Fill Buffer +flushing side effect. This is only enumerated on parts where VERW had +previously lost it's flushing side effect due to the MDS/TAA vulnerabilities +being fixed in hardware. + +FB_CLEAR_CTRL is available on a subset of FB_CLEAR parts where the Fill Buffer +clearing side effect of VERW can be turned off for performance reasons. + +This is part of XSA-404. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +(cherry picked from commit 2ebe8fe9b7e0d36e9ec3cfe4552b2b197ef0dcec) +--- + xen/arch/x86/spec_ctrl.c | 11 ++++++++--- + xen/include/asm-x86/msr-index.h | 6 ++++++ + 2 files changed, 14 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 21730aa03071..d285538bde9f 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -323,7 +323,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + * Hardware read-only information, stating immunity to certain issues, or + * suggestions of which mitigation to use. + */ +- printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s\n", ++ printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "", + (caps & ARCH_CAPS_IBRS_ALL) ? " IBRS_ALL" : "", + (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", +@@ -332,13 +332,16 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "", + (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "", + (caps & ARCH_CAPS_TAA_NO) ? " TAA_NO" : "", ++ (caps & ARCH_CAPS_SBDR_SSDP_NO) ? " SBDR_SSDP_NO" : "", ++ (caps & ARCH_CAPS_FBSDP_NO) ? " FBSDP_NO" : "", ++ (caps & ARCH_CAPS_PSDP_NO) ? " PSDP_NO" : "", + (e8b & cpufeat_mask(X86_FEATURE_IBRS_ALWAYS)) ? " IBRS_ALWAYS" : "", + (e8b & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS)) ? " STIBP_ALWAYS" : "", + (e8b & cpufeat_mask(X86_FEATURE_IBRS_FAST)) ? " IBRS_FAST" : "", + (e8b & cpufeat_mask(X86_FEATURE_IBRS_SAME_MODE)) ? " IBRS_SAME_MODE" : ""); + + /* Hardware features which need driving to mitigate issues. 
*/ +- printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s\n", ++ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n", + (e8b & cpufeat_mask(X86_FEATURE_IBPB)) || + (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBPB" : "", + (e8b & cpufeat_mask(X86_FEATURE_IBRS)) || +@@ -353,7 +356,9 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + (_7d0 & cpufeat_mask(X86_FEATURE_MD_CLEAR)) ? " MD_CLEAR" : "", + (_7d0 & cpufeat_mask(X86_FEATURE_SRBDS_CTRL)) ? " SRBDS_CTRL" : "", + (e8b & cpufeat_mask(X86_FEATURE_VIRT_SSBD)) ? " VIRT_SSBD" : "", +- (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : ""); ++ (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "", ++ (caps & ARCH_CAPS_FB_CLEAR) ? " FB_CLEAR" : "", ++ (caps & ARCH_CAPS_FB_CLEAR_CTRL) ? " FB_CLEAR_CTRL" : ""); + + /* Compiled-in support which pertains to mitigations. */ + if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) ) +diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h +index 31964b88af7a..72bc32ba04ff 100644 +--- a/xen/include/asm-x86/msr-index.h ++++ b/xen/include/asm-x86/msr-index.h +@@ -66,6 +66,11 @@ + #define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6) + #define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7) + #define ARCH_CAPS_TAA_NO (_AC(1, ULL) << 8) ++#define ARCH_CAPS_SBDR_SSDP_NO (_AC(1, ULL) << 13) ++#define ARCH_CAPS_FBSDP_NO (_AC(1, ULL) << 14) ++#define ARCH_CAPS_PSDP_NO (_AC(1, ULL) << 15) ++#define ARCH_CAPS_FB_CLEAR (_AC(1, ULL) << 17) ++#define ARCH_CAPS_FB_CLEAR_CTRL (_AC(1, ULL) << 18) + + #define MSR_FLUSH_CMD 0x0000010b + #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) +@@ -83,6 +88,7 @@ + #define MCU_OPT_CTRL_RNGDS_MITG_DIS (_AC(1, ULL) << 0) + #define MCU_OPT_CTRL_RTM_ALLOW (_AC(1, ULL) << 1) + #define MCU_OPT_CTRL_RTM_LOCKED (_AC(1, ULL) << 2) ++#define MCU_OPT_CTRL_FB_CLEAR_DIS (_AC(1, ULL) << 3) + + #define MSR_RTIT_OUTPUT_BASE 0x00000560 + #define MSR_RTIT_OUTPUT_MASK 0x00000561 +-- +2.35.1 + diff --git a/0032-x86-spec-ctrl-Add-spec-ctrl-unpriv-mmio.patch b/0032-x86-spec-ctrl-Add-spec-ctrl-unpriv-mmio.patch new file mode 100644 index 0000000..a5ac3e9 --- /dev/null +++ b/0032-x86-spec-ctrl-Add-spec-ctrl-unpriv-mmio.patch @@ -0,0 +1,187 @@ +From 2e82446cb252f6c8ac697e81f4155872c69afde4 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Mon, 13 Jun 2022 19:18:32 +0100 +Subject: [PATCH 32/32] x86/spec-ctrl: Add spec-ctrl=unpriv-mmio +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Per Xen's support statement, PCI passthrough should be to trusted domains +because the overall system security depends on factors outside of Xen's +control. + +As such, Xen, in a supported configuration, is not vulnerable to DRPW/SBDR. + +However, users who have risk assessed their configuration may be happy with +the risk of DoS, but unhappy with the risk of cross-domain data leakage. Such +users should enable this option. + +On CPUs vulnerable to MDS, the existing mitigations are the best we can do to +mitigate MMIO cross-domain data leakage. + +On CPUs fixed to MDS but vulnerable MMIO stale data leakage, this option: + + * On CPUs susceptible to FBSDP, mitigates cross-domain fill buffer leakage + using FB_CLEAR. + * On CPUs susceptible to SBDR, mitigates RNG data recovery by engaging the + srb-lock, previously used to mitigate SRBDS. + +Both mitigations require microcode from IPU 2022.1, May 2022. + +This is part of XSA-404. 
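
As a usage sketch, an administrator who grants device MMIO access to
less-than-fully-trusted guests might boot Xen with something like the
following (a hypothetical command line; the surrounding options depend
entirely on the deployment):

    xen.gz ... spec-ctrl=unpriv-mmio ...

On FB_CLEAR-capable hardware with the May 2022 microcode, this causes Xen to
issue VERW on the return-to-guest path and to engage the SRB lock as
described above.
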
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+(cherry picked from commit 8c24b70fedcb52633b2370f834d8a2be3f7fa38e)
+---
+ docs/misc/xen-command-line.pandoc | 14 +++++++--
+ xen/arch/x86/spec_ctrl.c | 48 ++++++++++++++++++++++++-------
+ 2 files changed, 48 insertions(+), 14 deletions(-)
+
+diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
+index d5cb09f86541..a642e43476a2 100644
+--- a/docs/misc/xen-command-line.pandoc
++++ b/docs/misc/xen-command-line.pandoc
+@@ -2235,7 +2235,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`).
+ ### spec-ctrl (x86)
+ > `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb,md-clear}=<bool>,
+ > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu,
+-> l1d-flush,branch-harden,srb-lock}=<bool> ]`
++> l1d-flush,branch-harden,srb-lock,unpriv-mmio}=<bool> ]`
+
+ Controls for speculative execution sidechannel mitigations. By default, Xen
+ will pick the most appropriate mitigations based on compiled in support,
+@@ -2314,8 +2314,16 @@ Xen will enable this mitigation.
+ On hardware supporting SRBDS_CTRL, the `srb-lock=` option can be used to force
+ or prevent Xen from protecting the Special Register Buffer from leaking stale
+ data. By default, Xen will enable this mitigation, except on parts where MDS
+-is fixed and TAA is fixed/mitigated (in which case, there is believed to be no
+-way for an attacker to obtain the stale data).
++is fixed and TAA is fixed/mitigated and there are no unprivileged MMIO
++mappings (in which case, there is believed to be no way for an attacker to
++obtain stale data).
++
++The `unpriv-mmio=` boolean indicates whether the system has (or will have)
++less than fully privileged domains granted access to MMIO devices. By
++default, this option is disabled. If enabled, Xen will use the `FB_CLEAR`
++and/or `SRBDS_CTRL` functionality available in the Intel May 2022 microcode
++release to mitigate cross-domain leakage of data via the MMIO Stale Data
++vulnerabilities.
+
+ ### sync_console
+ > `= <boolean>`
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index d285538bde9f..099113ba41e6 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -67,6 +67,8 @@ static bool __initdata cpu_has_bug_msbds_only; /* => minimal HT impact. */
+ static bool __initdata cpu_has_bug_mds; /* Any other M{LP,SB,FB}DS combination. */
+
+ static int8_t __initdata opt_srb_lock = -1;
++static bool __initdata opt_unpriv_mmio;
++static bool __read_mostly opt_fb_clear_mmio;
+
+ static int __init parse_spec_ctrl(const char *s)
+ {
+@@ -184,6 +186,8 @@ static int __init parse_spec_ctrl(const char *s)
+ opt_branch_harden = val;
+ else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 )
+ opt_srb_lock = val;
++ else if ( (val = parse_boolean("unpriv-mmio", s, ss)) >= 0 )
++ opt_unpriv_mmio = val;
+ else
+ rc = -EINVAL;
+
+@@ -392,7 +396,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ opt_srb_lock ? " SRB_LOCK+" : " SRB_LOCK-",
+ opt_ibpb ? " IBPB" : "",
+ opt_l1d_flush ? " L1D_FLUSH" : "",
+- opt_md_clear_pv || opt_md_clear_hvm ? " VERW" : "",
++ opt_md_clear_pv || opt_md_clear_hvm ||
++ opt_fb_clear_mmio ? " VERW" : "",
+ opt_branch_harden ? " BRANCH_HARDEN" : "");
+
+ /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */
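
The per-domain consequence appears in the next hunk: VERW on the way into a
guest is now wanted either for the existing MDS reasons, or because the
FB_CLEAR-based MMIO mitigation applies and the domain has device access. A
standalone rendering, where wants_verw() and its parameters are illustrative
stand-ins for the Xen globals:

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-in for the d->arch.verw calculation in the next hunk. A domain
     * gets VERW on guest entry if the relevant MDS option is set, or if the
     * FB_CLEAR-based MMIO mitigation applies and the domain has device
     * access (approximated, as in the patch, by the IOMMU being enabled
     * for it). */
    static bool wants_verw(bool pv, bool opt_md_clear_pv,
                           bool opt_md_clear_hvm,
                           bool opt_fb_clear_mmio, bool iommu_enabled)
    {
        return (pv ? opt_md_clear_pv : opt_md_clear_hvm) ||
               (opt_fb_clear_mmio && iommu_enabled);
    }

    int main(void)
    {
        /* HVM domain with passthrough on an FB_CLEAR part: VERW is wanted
         * even though the HVM MDS option is off. Prints 1. */
        printf("%d\n", wants_verw(false, false, false, true, true));
        return 0;
    }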
+@@ -941,7 +946,9 @@ void spec_ctrl_init_domain(struct domain *d)
+ {
+ bool pv = is_pv_domain(d);
+
+- d->arch.verw = pv ? opt_md_clear_pv : opt_md_clear_hvm;
++ d->arch.verw =
++ (pv ? opt_md_clear_pv : opt_md_clear_hvm) ||
++ (opt_fb_clear_mmio && is_iommu_enabled(d));
+ }
+
+ void __init init_speculation_mitigations(void)
+@@ -1195,6 +1202,18 @@ void __init init_speculation_mitigations(void)
+
+ mds_calculations(caps);
+
++ /*
++ * Parts which enumerate FB_CLEAR are those which are post-MDS_NO and have
++ * reintroduced the VERW fill buffer flushing side effect because of a
++ * susceptibility to FBSDP.
++ *
++ * If unprivileged guests have (or will have) MMIO mappings, we can
++ * mitigate cross-domain leakage of fill buffer data by issuing VERW on
++ * the return-to-guest path.
++ */
++ if ( opt_unpriv_mmio )
++ opt_fb_clear_mmio = caps & ARCH_CAPS_FB_CLEAR;
++
+ /*
+ * By default, enable PV and HVM mitigations on MDS-vulnerable hardware.
+ * This will only be a token effort for MLPDS/MFBDS when HT is enabled,
+@@ -1208,18 +1227,20 @@ void __init init_speculation_mitigations(void)
+ boot_cpu_has(X86_FEATURE_MD_CLEAR));
+
+ /*
+- * Enable MDS defences as applicable. The Idle blocks need using if
+- * either PV or HVM defences are used.
++ * Enable MDS/MMIO defences as applicable. The Idle blocks need using if
++ * either the PV or HVM MDS defences are used, or if we may give MMIO
++ * access to untrusted guests.
+ *
+ * HVM is more complicated. The MD_CLEAR microcode extends L1D_FLUSH with
+ * equivalent semantics to avoid needing to perform both flushes on the
+- * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH.
++ * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH (for
++ * MDS mitigations. L1D_FLUSH is not safe for MMIO mitigations.)
+ *
+ * After calculating the appropriate idle setting, simplify
+ * opt_md_clear_hvm to mean just "should we VERW on the way into HVM
+ * guests", so spec_ctrl_init_domain() can calculate suitable settings.
+ */
+- if ( opt_md_clear_pv || opt_md_clear_hvm )
++ if ( opt_md_clear_pv || opt_md_clear_hvm || opt_fb_clear_mmio )
+ setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE);
+ opt_md_clear_hvm &= !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush;
+
+@@ -1284,14 +1305,19 @@ void __init init_speculation_mitigations(void)
+ * On some SRBDS-affected hardware, it may be safe to relax srb-lock by
+ * default.
+ *
+- * On parts which enumerate MDS_NO and not TAA_NO, TSX is the only known
+- * way to access the Fill Buffer. If TSX isn't available (inc. SKU
+- * reasons on some models), or TSX is explicitly disabled, then there is
+- * no need for the extra overhead to protect RDRAND/RDSEED.
++ * All parts with SRBDS_CTRL suffer SSDP, the mechanism by which stale RNG
++ * data becomes available to other contexts. To recover the data, an
++ * attacker needs to use:
++ * - SBDS (MDS or TAA to sample the core's fill buffer)
++ * - SBDR (Architecturally retrieve stale transaction buffer contents)
++ * - DRPW (Architecturally latch stale fill buffer data)
++ *
++ * On MDS_NO parts, with TAA_NO or TSX unavailable/disabled, and with no
++ * unprivileged MMIO access, the RNG data doesn't need protecting.
+ */ + if ( cpu_has_srbds_ctrl ) + { +- if ( opt_srb_lock == -1 && ++ if ( opt_srb_lock == -1 && !opt_unpriv_mmio && + (caps & (ARCH_CAPS_MDS_NO|ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO && + (!cpu_has_hle || ((caps & ARCH_CAPS_TSX_CTRL) && rtm_disabled)) ) + opt_srb_lock = 0; +-- +2.35.1 + @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. 
The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. 
Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/create-patches b/create-patches
new file mode 100755
index 0000000..8e8c9fa
--- /dev/null
+++ b/create-patches
@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+cd "${SCRIPT_DIR}"
+
+if [[ ! -v XEN_REPO_DIR ]]; then
+    XEN_REPO_DIR="${HOME}/repos/xen"
+fi
+
+XEN_VERSION="${1}"
+OUR_PATCHES_VERSION="${2}"
+
+XEN_VER_COMPONENTS=( ${XEN_VERSION//./ } )
+XEN_MAJOR_MINOR_VERSION="${XEN_VER_COMPONENTS[0]}.${XEN_VER_COMPONENTS[1]}"
+
+
+git -C "${XEN_REPO_DIR}" fetch origin
+
+readarray -d '' CURRENT_PATCHES < <(find . -maxdepth 1 -type f -name "*.patch" -print0)
+if [[ ${#CURRENT_PATCHES[@]} -gt 0 ]]; then
+    git rm -f *.patch
+fi
+
+PATCH_RANGE_START="RELEASE-${XEN_VERSION}"
+PATCH_RANGE_END="staging-${XEN_MAJOR_MINOR_VERSION}"
+git -C "${XEN_REPO_DIR}" format-patch \
+    -o "${SCRIPT_DIR}" \
+    ${PATCH_RANGE_START}..origin/${PATCH_RANGE_END}
+
+XEN_NEXT_PATCHLEVEL=$((XEN_VER_COMPONENTS[2]+1))
+XEN_NEXT_VERSION="${XEN_MAJOR_MINOR_VERSION}.${XEN_NEXT_PATCHLEVEL}"
+
+PATCH_RANGE_START_ID=$(git -C "${XEN_REPO_DIR}" rev-parse ${PATCH_RANGE_START})
+PATCH_RANGE_END_ID=$(git -C "${XEN_REPO_DIR}" rev-parse origin/${PATCH_RANGE_END})
+
+cat <<EOF > "info.txt"
+Xen upstream patchset #${OUR_PATCHES_VERSION} for ${XEN_NEXT_VERSION}-pre
+
+Containing patches from
+$PATCH_RANGE_START ($PATCH_RANGE_START_ID)
+to
+$PATCH_RANGE_END ($PATCH_RANGE_END_ID)
+EOF
+
+git add \
+    info.txt \
+    *.patch
+
+TAG="${XEN_NEXT_VERSION}-pre-patchset-${OUR_PATCHES_VERSION}"
+DESCRIPTION="Xen ${TAG}"
+
+git commit \
+    --signoff \
+    -m "${DESCRIPTION}"
+
+git tag \
+    -s \
+    -m "${DESCRIPTION}" \
+    "${TAG}"
diff --git a/info.txt b/info.txt
new file mode 100644
index 0000000..2310ace
--- /dev/null
+++ b/info.txt
@@ -0,0 +1,6 @@
+Xen Upstream Patchset #0 for 4.16.2-pre
+
+Containing patches from
+RELEASE-4.16.1 (13fee86475f3831d7a1ecf6d7e0acbc2ac779f7e)
+to
+staging-4.16 (2e82446cb252f6c8ac697e81f4155872c69afde4)
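
As a usage example: with XEN_REPO_DIR pointing at a Xen git checkout
(defaulting to ~/repos/xen), an invocation along the lines of
`./create-patches 4.16.1 0` (the released base version, then the patchset
number) regenerates the *.patch files from the
RELEASE-4.16.1..origin/staging-4.16 range, rewrites info.txt, commits the
result, and creates the signed tag 4.16.2-pre-patchset-0. The argument
values here are inferred from the info.txt above, not recorded anywhere in
the repository.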