author    | omni <omni+alpine@hack.org> | 2022-04-20 20:57:50 +0000
committer | omni <omni+alpine@hack.org> | 2022-04-22 09:30:08 +0000
commit    | e5a8c3fe7051ee091719ddc4d39feb0c3bb53abb (patch)
tree      | 9b77af7abd9fa8fcb83e2c90717a3ca78f91c57a
parent    | 5a2d38340aca4e112aa285a1ec604b66622733d8 (diff)
main/xen: add upstream XSA patches
Given my limitations in knowledge and time, I chose to create a single .patch
file containing all commits to the upstream stable-4.13 branch, from the 4.13.4
release up to and including the currently latest commit:
git format-patch b4bb02d^..fe97133 --stdout > xen-stable-4.13_git20220408.patch
This includes CVE/XSA patches, prerequisites and additional fixes.
https://xenbits.xen.org/gitweb/?p=xen.git;a=shortlog;h=refs/heads/stable-4.13
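
The patch range and its checksum can be reproduced locally. A minimal sketch, not part of the original commit message, assuming the usual xenbits clone URL and that b4bb02d/fe97133 are reachable on the stable-4.13 branch:

    # Sketch: regenerate and verify the bundled patch (clone URL is an assumption)
    git clone https://xenbits.xen.org/git-http/xen.git
    cd xen
    git checkout stable-4.13
    # Same range as quoted above: everything after b4bb02d^ up to and including fe97133
    git format-patch b4bb02d^..fe97133 --stdout > xen-stable-4.13_git20220408.patch
    # The digest should match the sha512sums entry added to main/xen/APKBUILD below
    sha512sum xen-stable-4.13_git20220408.patch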
-rw-r--r-- | main/xen/APKBUILD                          |   26
-rw-r--r-- | main/xen/xen-stable-4.13_git20220408.patch | 5414
-rw-r--r-- | main/xen/xsa386.patch                      |   29
-rw-r--r-- | main/xen/xsa388-4.14-1.patch               |  174
-rw-r--r-- | main/xen/xsa388-4.14-2.patch               |   36
-rw-r--r-- | main/xen/xsa389-4.13.patch                 |  180
6 files changed, 5430 insertions, 429 deletions
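
For context, a hedged sketch of how an APKBUILD change like the one below is typically validated on the packager's side (abuild is Alpine's build tool; the exact invocation is an assumption, not taken from this commit):

    # Assumed local validation workflow for the APKBUILD change
    cd aports/main/xen
    abuild checksum   # recompute sha512sums= for the added/removed source files
    abuild -r         # build in a clean chroot to confirm the combined patch applies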
diff --git a/main/xen/APKBUILD b/main/xen/APKBUILD index 8e071e260e..eef4058947 100644 --- a/main/xen/APKBUILD +++ b/main/xen/APKBUILD @@ -2,7 +2,7 @@ # Maintainer: Natanael Copa <ncopa@alpinelinux.org> pkgname=xen pkgver=4.13.4 -pkgrel=2 +pkgrel=3 pkgdesc="Xen hypervisor" url="https://www.xenproject.org/" arch="x86_64 armhf aarch64" # enable armv7 when builds with gcc8 @@ -241,6 +241,19 @@ options="!strip" # - CVE-2021-28708 XSA-388 # - CVE-2021-28705 XSA-389 # - CVE-2021-28706 XSA-389 +# 4.13.4-r3: +# - CVE-2021-28706 XSA-385 +# - CVE-2022-23033 XSA-393 +# - CVE-2022-23034 XSA-394 +# - CVE-2022-23035 XSA-395 +# - CVE-2022-26356 XSA-397 +# - CVE-2022-23960 XSA-398 +# - CVE-2022-26401 XSA-398 +# - CVE-2022-26357 XSA-399 +# - CVE-2022-26358 XSA-400 +# - CVE-2022-26359 XSA-400 +# - CVE-2022-26360 XSA-400 +# - CVE-2022-26361 XSA-400 case "$CARCH" in @@ -307,11 +320,7 @@ source="https://downloads.xenproject.org/release/xen/$pkgver/xen-$pkgver.tar.gz drop-test.py.patch py3-compat.patch - xsa386.patch - - xsa388-4.14-1.patch - xsa388-4.14-2.patch - xsa389-4.13.patch + xen-stable-4.13_git20220408.patch xenstored.initd xenstored.confd @@ -560,10 +569,7 @@ e76816c6ad0e91dc5f81947f266da3429b20e6d976c3e8c41202c6179532eec878a3f0913921ef3a 8c9cfc6afca325df1d8026e21ed03fa8cd2c7e1a21a56cc1968301c5ab634bfe849951899e75d328951d7a41273d1e49a2448edbadec0029ed410c43c0549812 hotplug-Linux-iscsi-block-handle-lun-1.patch 61f66bab603778fb41bfe8e85320c15f2bf3e5d8583e077b56a93784dbdb9b2c7c5e55ce18f06b87501429086f8410d102d3ed5f2a77d54bcfa328bc07681f4d drop-test.py.patch 8cb12dbfc05a53898a97d47d71ab6b8a6f81c5e5579fd765b37303faea95c645cb8dedc05e3d064bdf070e93814e00bf8939767acc1127513375bab0fe2f4436 py3-compat.patch -77811232c5cf199d24fb8e4a5367a56d56e61ad218397913fa22bd89d0dffabe92acfded246aa731d450f80dcffee84268b27e73e60f19eec15d0ada988a0574 xsa386.patch -5e8165695a7e5a7fdc332de0d4ee31626eb72c8765f12855543592cb86f0eb4f98ea49cae31c8fc356a0645f6a2fe05ddf2b38f9f2bb04196bb4b9efc204dc26 xsa388-4.14-1.patch -9e7b5f66480d3c0898cc080d0506dddbe35a814ccd72619abb82e8241b8cddc726e7bb38ce818335451b56ba549ed9ea1743f46fb9f0fd81ac1310ec6e94fea4 xsa388-4.14-2.patch -bd18e7f61a28ebd99f8d7fe33b6130646493489bd4a21fa9febb81860b3c4a6c20aaf51f1cfa7c19340dbd21333c2e6859f852868f8de29e2862bd93e02040ba xsa389-4.13.patch +f02f939fc9f788e99c7363e1e385e83acaa5725594eb4b37597b824ec8f853ba0f91ee0d17ebcf59c3ae4ed08eaa4ae79e3572602a67d51ed46ed900a63054e1 xen-stable-4.13_git20220408.patch 52c43beb2596d645934d0f909f2d21f7587b6898ed5e5e7046799a8ed6d58f7a09c5809e1634fa26152f3fd4f3e7cfa07da7076f01b4a20cc8f5df8b9cb77e50 xenstored.initd 093f7fbd43faf0a16a226486a0776bade5dc1681d281c5946a3191c32d74f9699c6bf5d0ab8de9d1195a2461165d1660788e92a3156c9b3c7054d7b2d52d7ff0 xenstored.confd 3c86ed48fbee0af4051c65c4a3893f131fa66e47bf083caf20c9b6aa4b63fdead8832f84a58d0e27964bc49ec8397251b34e5be5c212c139f556916dc8da9523 xenconsoled.initd diff --git a/main/xen/xen-stable-4.13_git20220408.patch b/main/xen/xen-stable-4.13_git20220408.patch new file mode 100644 index 0000000000..33c1738ac2 --- /dev/null +++ b/main/xen/xen-stable-4.13_git20220408.patch @@ -0,0 +1,5414 @@ +From b4bb02d5999a56c93f0733b589b717e7cece9c09 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Fri, 1 Oct 2021 15:05:42 +0200 +Subject: [PATCH 01/32] VT-d: fix deassign of device with RMRR + +Ignoring a specific error code here was not meant to short circuit +deassign to _just_ the unmapping of RMRRs. 
This bug was previously +hidden by the bogus (potentially indefinite) looping in +pci_release_devices(), until f591755823a7 ("IOMMU/PCI: don't let domain +cleanup continue when device de-assignment failed") fixed that loop. + +This is CVE-2021-28702 / XSA-386. + +Fixes: 8b99f4400b69 ("VT-d: fix RMRR related error handling") +Reported-by: Ivan Kardykov <kardykov@tabit.pro> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Tested-by: Ivan Kardykov <kardykov@tabit.pro> +(cherry picked from commit 24ebe875a77833696bbe5c9372e9e1590a7e7101) +--- + xen/drivers/passthrough/vtd/iommu.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index 98787ce3a8..af8b9ca0e4 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -2393,7 +2393,7 @@ static int reassign_device_ownership( + ret = iommu_identity_mapping(source, p2m_access_x, + rmrr->base_address, + rmrr->end_address, 0); +- if ( ret != -ENOENT ) ++ if ( ret && ret != -ENOENT ) + return ret; + } + } +-- +2.35.2 + + +From 0b28069aa7c26288376040e6ee9ca145245db39e Mon Sep 17 00:00:00 2001 +From: Julien Grall <jgrall@amazon.com> +Date: Tue, 23 Nov 2021 13:32:26 +0100 +Subject: [PATCH 02/32] xen/page_alloc: Harden assign_pages() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +domain_tot_pages() and d->max_pages are 32-bit values. While the order +should always be quite small, it would still be possible to overflow +if domain_tot_pages() is near to (2^32 - 1). + +As this code may be called by a guest via XENMEM_increase_reservation +and XENMEM_populate_physmap, we want to make sure the guest is not going +to be able to allocate more than it is allowed. + +Rework the allocation check to avoid any possible overflow. While the +check domain_tot_pages() < d->max_pages should technically not be +necessary, it is probably best to have it to catch any possible +inconsistencies in the future. + +This is CVE-2021-28706 / part of XSA-385. + +Signed-off-by: Julien Grall <jgrall@amazon.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: 143501861d48e1bfef495849fd68584baac05849 +master date: 2021-11-22 11:11:05 +0000 +--- + xen/common/grant_table.c | 7 ++++--- + xen/common/page_alloc.c | 19 ++++++++++++++----- + 2 files changed, 18 insertions(+), 8 deletions(-) + +diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c +index d2853a664a..7b775a8c35 100644 +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -2286,7 +2286,8 @@ gnttab_transfer( + * pages when it is dying. 
+ */ + if ( unlikely(e->is_dying) || +- unlikely(e->tot_pages >= e->max_pages) ) ++ unlikely(e->tot_pages >= e->max_pages) || ++ unlikely(!(e->tot_pages + 1)) ) + { + spin_unlock(&e->page_alloc_lock); + +@@ -2295,8 +2296,8 @@ gnttab_transfer( + e->domain_id); + else + gdprintk(XENLOG_INFO, +- "Transferee d%d has no headroom (tot %u, max %u)\n", +- e->domain_id, e->tot_pages, e->max_pages); ++ "Transferee %pd has no headroom (tot %u, max %u)\n", ++ e, e->tot_pages, e->max_pages); + + gop.status = GNTST_general_error; + goto unlock_and_copyback; +diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c +index 1563188f4f..0976bf6489 100644 +--- a/xen/common/page_alloc.c ++++ b/xen/common/page_alloc.c +@@ -2276,16 +2276,25 @@ int assign_pages( + + if ( !(memflags & MEMF_no_refcount) ) + { +- if ( unlikely((d->tot_pages + (1 << order)) > d->max_pages) ) ++ unsigned int nr = 1u << order; ++ ++ if ( unlikely(d->tot_pages > d->max_pages) ) ++ { ++ gprintk(XENLOG_INFO, "Inconsistent allocation for %pd: %u > %u\n", ++ d, d->tot_pages, d->max_pages); ++ rc = -EPERM; ++ goto out; ++ } ++ ++ if ( unlikely(nr > d->max_pages - d->tot_pages) ) + { +- gprintk(XENLOG_INFO, "Over-allocation for domain %u: " +- "%u > %u\n", d->domain_id, +- d->tot_pages + (1 << order), d->max_pages); ++ gprintk(XENLOG_INFO, "Over-allocation for %pd: %Lu > %u\n", ++ d, d->tot_pages + 0ull + nr, d->max_pages); + rc = -E2BIG; + goto out; + } + +- if ( unlikely(domain_adjust_tot_pages(d, 1 << order) == (1 << order)) ) ++ if ( unlikely(domain_adjust_tot_pages(d, nr) == nr) ) + get_knownalive_domain(d); + } + +-- +2.35.2 + + +From d94d006ed36084914c2931641b724ae262e3fb80 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 23 Nov 2021 13:32:54 +0100 +Subject: [PATCH 03/32] x86/PoD: deal with misaligned GFNs +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Users of XENMEM_decrease_reservation and XENMEM_populate_physmap aren't +required to pass in order-aligned GFN values. (While I consider this +bogus, I don't think we can fix this there, as that might break existing +code, e.g Linux'es swiotlb, which - while affecting PV only - until +recently had been enforcing only page alignment on the original +allocation.) Only non-PoD code paths (guest_physmap_{add,remove}_page(), +p2m_set_entry()) look to be dealing with this properly (in part by being +implemented inefficiently, handling every 4k page separately). + +Introduce wrappers taking care of splitting the incoming request into +aligned chunks, without putting much effort in trying to determine the +largest possible chunk at every iteration. + +Also "handle" p2m_set_entry() failure for non-order-0 requests by +crashing the domain in one more place. Alongside putting a log message +there, also add one to the other similar path. + +Note regarding locking: This is left in the actual worker functions on +the assumption that callers aren't guaranteed atomicity wrt acting on +multiple pages at a time. For mis-aligned GFNs gfn_lock() wouldn't have +locked the correct GFN range anyway, if it didn't simply resolve to +p2m_lock(), and for well-behaved callers there continues to be only a +single iteration, i.e. behavior is unchanged for them. (FTAOD pulling +out just pod_lock() into p2m_pod_decrease_reservation() would result in +a lock order violation.) + +This is CVE-2021-28704 and CVE-2021-28707 / part of XSA-388. 
+ +Fixes: 3c352011c0d3 ("x86/PoD: shorten certain operations on higher order ranges") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: 182c737b9ba540ebceb1433f3940fbed6eac4ea9 +master date: 2021-11-22 12:27:30 +0000 +--- + xen/arch/x86/mm/p2m-pod.c | 75 ++++++++++++++++++++++++++++++++------- + 1 file changed, 63 insertions(+), 12 deletions(-) + +diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c +index 007cdd87d0..c14801f5ff 100644 +--- a/xen/arch/x86/mm/p2m-pod.c ++++ b/xen/arch/x86/mm/p2m-pod.c +@@ -495,7 +495,7 @@ p2m_pod_zero_check_superpage(struct p2m_domain *p2m, gfn_t gfn); + + + /* +- * This function is needed for two reasons: ++ * This pair of functions is needed for two reasons: + * + To properly handle clearing of PoD entries + * + To "steal back" memory being freed for the PoD cache, rather than + * releasing it. +@@ -503,8 +503,8 @@ p2m_pod_zero_check_superpage(struct p2m_domain *p2m, gfn_t gfn); + * Once both of these functions have been completed, we can return and + * allow decrease_reservation() to handle everything else. + */ +-unsigned long +-p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order) ++static unsigned long ++decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order) + { + unsigned long ret = 0, i, n; + struct p2m_domain *p2m = p2m_get_hostp2m(d); +@@ -557,8 +557,10 @@ p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order) + * All PoD: Mark the whole region invalid and tell caller + * we're done. + */ +- if ( p2m_set_entry(p2m, gfn, INVALID_MFN, order, p2m_invalid, +- p2m->default_access) ) ++ int rc = p2m_set_entry(p2m, gfn, INVALID_MFN, order, p2m_invalid, ++ p2m->default_access); ++ ++ if ( rc ) + { + /* + * If this fails, we can't tell how much of the range was changed. +@@ -566,7 +568,12 @@ p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order) + * impossible. 
+ */ + if ( order != 0 ) ++ { ++ printk(XENLOG_G_ERR ++ "%pd: marking GFN %#lx (order %u) as non-PoD failed: %d\n", ++ d, gfn_x(gfn), order, rc); + domain_crash(d); ++ } + goto out_unlock; + } + ret = 1UL << order; +@@ -674,6 +681,22 @@ out_unlock: + return ret; + } + ++unsigned long ++p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order) ++{ ++ unsigned long left = 1UL << order, ret = 0; ++ unsigned int chunk_order = find_first_set_bit(gfn_x(gfn) | left); ++ ++ do { ++ ret += decrease_reservation(d, gfn, chunk_order); ++ ++ left -= 1UL << chunk_order; ++ gfn = gfn_add(gfn, 1UL << chunk_order); ++ } while ( left ); ++ ++ return ret; ++} ++ + void p2m_pod_dump_data(struct domain *d) + { + struct p2m_domain *p2m = p2m_get_hostp2m(d); +@@ -1269,19 +1292,15 @@ remap_and_retry: + return true; + } + +- +-int +-guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn_l, +- unsigned int order) ++static int ++mark_populate_on_demand(struct domain *d, unsigned long gfn_l, ++ unsigned int order) + { + struct p2m_domain *p2m = p2m_get_hostp2m(d); + gfn_t gfn = _gfn(gfn_l); + unsigned long i, n, pod_count = 0; + int rc = 0; + +- if ( !paging_mode_translate(d) ) +- return -EINVAL; +- + gfn_lock(p2m, gfn, order); + + P2M_DEBUG("mark pod gfn=%#lx\n", gfn_l); +@@ -1319,6 +1338,17 @@ guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn_l, + BUG_ON(p2m->pod.entry_count < 0); + pod_unlock(p2m); + } ++ else if ( order ) ++ { ++ /* ++ * If this failed, we can't tell how much of the range was changed. ++ * Best to crash the domain. ++ */ ++ printk(XENLOG_G_ERR ++ "%pd: marking GFN %#lx (order %u) as PoD failed: %d\n", ++ d, gfn_l, order, rc); ++ domain_crash(d); ++ } + + out: + gfn_unlock(p2m, gfn, order); +@@ -1326,6 +1356,27 @@ out: + return rc; + } + ++int ++guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn, ++ unsigned int order) ++{ ++ unsigned long left = 1UL << order; ++ unsigned int chunk_order = find_first_set_bit(gfn | left); ++ int rc; ++ ++ if ( !paging_mode_translate(d) ) ++ return -EINVAL; ++ ++ do { ++ rc = mark_populate_on_demand(d, gfn, chunk_order); ++ ++ left -= 1UL << chunk_order; ++ gfn += 1UL << chunk_order; ++ } while ( !rc && left ); ++ ++ return rc; ++} ++ + void p2m_pod_init(struct p2m_domain *p2m) + { + unsigned int i; +-- +2.35.2 + + +From d3cfb4b3a680d3e2ddd36f18201d48441f36aea0 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 23 Nov 2021 13:33:14 +0100 +Subject: [PATCH 04/32] x86/PoD: handle intermediate page orders in + p2m_pod_cache_add() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +p2m_pod_decrease_reservation() may pass pages to the function which +aren't 4k, 2M, or 1G. Handle all intermediate orders as well, to avoid +hitting the BUG() at the switch() statement's "default" case. + +This is CVE-2021-28708 / part of XSA-388. 
+ +Fixes: 3c352011c0d3 ("x86/PoD: shorten certain operations on higher order ranges") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: 8ec13f68e0b026863d23e7f44f252d06478bc809 +master date: 2021-11-22 12:27:30 +0000 +--- + xen/arch/x86/mm/p2m-pod.c | 12 +++++------- + 1 file changed, 5 insertions(+), 7 deletions(-) + +diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c +index c14801f5ff..c981200087 100644 +--- a/xen/arch/x86/mm/p2m-pod.c ++++ b/xen/arch/x86/mm/p2m-pod.c +@@ -111,15 +111,13 @@ p2m_pod_cache_add(struct p2m_domain *p2m, + /* Then add to the appropriate populate-on-demand list. */ + switch ( order ) + { +- case PAGE_ORDER_1G: +- for ( i = 0; i < (1UL << PAGE_ORDER_1G); i += 1UL << PAGE_ORDER_2M ) ++ case PAGE_ORDER_2M ... PAGE_ORDER_1G: ++ for ( i = 0; i < (1UL << order); i += 1UL << PAGE_ORDER_2M ) + page_list_add_tail(page + i, &p2m->pod.super); + break; +- case PAGE_ORDER_2M: +- page_list_add_tail(page, &p2m->pod.super); +- break; +- case PAGE_ORDER_4K: +- page_list_add_tail(page, &p2m->pod.single); ++ case PAGE_ORDER_4K ... PAGE_ORDER_2M - 1: ++ for ( i = 0; i < (1UL << order); i += 1UL << PAGE_ORDER_4K ) ++ page_list_add_tail(page + i, &p2m->pod.single); + break; + default: + BUG(); +-- +2.35.2 + + +From d3c2319ea1657f31ae3899713afc23789b771c10 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 23 Nov 2021 13:33:33 +0100 +Subject: [PATCH 05/32] x86/P2M: deal with partial success of p2m_set_entry() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +M2P and PoD stats need to remain in sync with P2M; if an update succeeds +only partially, respective adjustments need to be made. If updates get +made before the call, they may also need undoing upon complete failure +(i.e. including the single-page case). + +Log-dirty state would better also be kept in sync. + +Note that the change to set_typed_p2m_entry() may not be strictly +necessary (due to the order restriction enforced near the top of the +function), but is being kept here to be on the safe side. + +This is CVE-2021-28705 and CVE-2021-28709 / XSA-389. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: 74a11c43fd7e074b1f77631b446dd2115eacb9e8 +master date: 2021-11-22 12:27:30 +0000 +--- + xen/arch/x86/mm/p2m.c | 116 +++++++++++++++++++++++++++++++++++++----- + 1 file changed, 102 insertions(+), 14 deletions(-) + +diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c +index a68b4fe526..a6bfda010a 100644 +--- a/xen/arch/x86/mm/p2m.c ++++ b/xen/arch/x86/mm/p2m.c +@@ -781,6 +781,7 @@ p2m_remove_page(struct p2m_domain *p2m, unsigned long gfn_l, unsigned long mfn, + gfn_t gfn = _gfn(gfn_l); + p2m_type_t t; + p2m_access_t a; ++ int rc; + + /* IOMMU for PV guests is handled in get_page_type() and put_page(). */ + if ( !paging_mode_translate(p2m->domain) ) +@@ -812,8 +813,27 @@ p2m_remove_page(struct p2m_domain *p2m, unsigned long gfn_l, unsigned long mfn, + set_gpfn_from_mfn(mfn+i, INVALID_M2P_ENTRY); + } + } +- return p2m_set_entry(p2m, gfn, INVALID_MFN, page_order, p2m_invalid, +- p2m->default_access); ++ rc = p2m_set_entry(p2m, gfn, INVALID_MFN, page_order, p2m_invalid, ++ p2m->default_access); ++ if ( likely(!rc) || !mfn_valid(_mfn(mfn)) ) ++ return rc; ++ ++ /* ++ * The operation may have partially succeeded. 
For the failed part we need ++ * to undo the M2P update and, out of precaution, mark the pages dirty ++ * again. ++ */ ++ for ( i = 0; i < (1UL << page_order); ++i ) ++ { ++ p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0, NULL, NULL); ++ if ( !p2m_is_hole(t) && !p2m_is_special(t) && !p2m_is_shared(t) ) ++ { ++ set_gpfn_from_mfn(mfn + i, gfn_l + i); ++ paging_mark_pfn_dirty(p2m->domain, _pfn(gfn_l + i)); ++ } ++ } ++ ++ return rc; + } + + int +@@ -1002,13 +1022,8 @@ guest_physmap_add_entry(struct domain *d, gfn_t gfn, mfn_t mfn, + + /* Now, actually do the two-way mapping */ + rc = p2m_set_entry(p2m, gfn, mfn, page_order, t, p2m->default_access); +- if ( rc == 0 ) ++ if ( likely(!rc) ) + { +- pod_lock(p2m); +- p2m->pod.entry_count -= pod_count; +- BUG_ON(p2m->pod.entry_count < 0); +- pod_unlock(p2m); +- + if ( !p2m_is_grant(t) ) + { + for ( i = 0; i < (1UL << page_order); i++ ) +@@ -1016,6 +1031,42 @@ guest_physmap_add_entry(struct domain *d, gfn_t gfn, mfn_t mfn, + gfn_x(gfn_add(gfn, i))); + } + } ++ else ++ { ++ /* ++ * The operation may have partially succeeded. For the successful part ++ * we need to update M2P and dirty state, while for the failed part we ++ * may need to adjust PoD stats as well as undo the earlier M2P update. ++ */ ++ for ( i = 0; i < (1UL << page_order); ++i ) ++ { ++ omfn = p2m->get_entry(p2m, gfn_add(gfn, i), &ot, &a, 0, NULL, NULL); ++ if ( p2m_is_pod(ot) ) ++ { ++ BUG_ON(!pod_count); ++ --pod_count; ++ } ++ else if ( mfn_eq(omfn, mfn_add(mfn, i)) && ot == t && ++ a == p2m->default_access && !p2m_is_grant(t) ) ++ { ++ set_gpfn_from_mfn(mfn_x(omfn), gfn_x(gfn) + i); ++ paging_mark_pfn_dirty(d, _pfn(gfn_x(gfn) + i)); ++ } ++ else if ( p2m_is_ram(ot) && !p2m_is_paged(ot) ) ++ { ++ ASSERT(mfn_valid(omfn)); ++ set_gpfn_from_mfn(mfn_x(omfn), gfn_x(gfn) + i); ++ } ++ } ++ } ++ ++ if ( pod_count ) ++ { ++ pod_lock(p2m); ++ p2m->pod.entry_count -= pod_count; ++ BUG_ON(p2m->pod.entry_count < 0); ++ pod_unlock(p2m); ++ } + + out: + p2m_unlock(p2m); +@@ -1307,6 +1358,49 @@ static int set_typed_p2m_entry(struct domain *d, unsigned long gfn_l, + return 0; + } + } ++ ++ P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn_l, mfn_x(mfn)); ++ rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access); ++ if ( unlikely(rc) ) ++ { ++ gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n", ++ gfn_l, order, rc, mfn_x(mfn)); ++ ++ /* ++ * The operation may have partially succeeded. For the successful part ++ * we need to update PoD stats, M2P, and dirty state. 
++ */ ++ if ( order != PAGE_ORDER_4K ) ++ { ++ unsigned long i; ++ ++ for ( i = 0; i < (1UL << order); ++i ) ++ { ++ p2m_type_t t; ++ mfn_t cmfn = p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0, ++ NULL, NULL); ++ ++ if ( !mfn_eq(cmfn, mfn_add(mfn, i)) || t != gfn_p2mt || ++ a != access ) ++ continue; ++ ++ if ( p2m_is_ram(ot) ) ++ { ++ ASSERT(mfn_valid(mfn_add(omfn, i))); ++ set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY); ++ } ++#ifdef CONFIG_HVM ++ else if ( p2m_is_pod(ot) ) ++ { ++ pod_lock(p2m); ++ BUG_ON(!p2m->pod.entry_count); ++ --p2m->pod.entry_count; ++ pod_unlock(p2m); ++ } ++#endif ++ } ++ } ++ } + else if ( p2m_is_ram(ot) ) + { + unsigned long i; +@@ -1317,12 +1411,6 @@ static int set_typed_p2m_entry(struct domain *d, unsigned long gfn_l, + set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY); + } + } +- +- P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn_l, mfn_x(mfn)); +- rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access); +- if ( rc ) +- gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n", +- gfn_l, order, rc, mfn_x(mfn)); + #ifdef CONFIG_HVM + else if ( p2m_is_pod(ot) ) + { +-- +2.35.2 + + +From d0e2c2762b981abd984af66a844ac12d8bf8f813 Mon Sep 17 00:00:00 2001 +From: Ian Jackson <iwj@xenproject.org> +Date: Mon, 6 Dec 2021 14:40:24 +0000 +Subject: [PATCH 06/32] MAINTAINERS: Resign from tools stable branch + maintainership + +Signed-off-by: Ian Jackson <iwj@xenproject.org> +Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com> +(cherry picked from commit c623a84c2a4fda1cd25f5347a6298706218eb5fb) +(cherry picked from commit c4cf5388652e8434652e30c73aa79635b4253675) +--- + MAINTAINERS | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/MAINTAINERS b/MAINTAINERS +index 806e02b4f8..bdd885ddff 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -60,7 +60,7 @@ The maintainer for this branch is: + + Tools backport requests should also be copied to: + +- Ian Jackson <Ian.Jackson@eu.citrix.com> ++ TODO - Loooking for new tools stable maintainer + + + Unstable Subsystem Maintainers +-- +2.35.2 + + +From 2d601a5ca15e02820d08232ad64add8b8374b81c Mon Sep 17 00:00:00 2001 +From: Julien Grall <jgrall@amazon.com> +Date: Tue, 25 Jan 2022 14:44:21 +0100 +Subject: [PATCH 07/32] xen/arm: p2m: Always clear the P2M entry when the + mapping is removed + +Commit 2148a125b73b ("xen/arm: Track page accessed between batch of +Set/Way operations") allowed an entry to be invalid from the CPU PoV +(lpae_is_valid()) but valid for Xen (p2m_is_valid()). This is useful +to track which page is accessed and only perform an action on them +(e.g. clean & invalidate the cache after a set/way instruction). + +Unfortunately, __p2m_set_entry() is only zeroing the P2M entry when +lpae_is_valid() returns true. This means the entry will not be zeroed +if the entry was valid from Xen PoV but invalid from the CPU PoV for +tracking purpose. + +As a consequence, this will allow a domain to continue to access the +page after it was removed. + +Resolve the issue by always zeroing the entry if it the LPAE bit is +set or the entry is about to be removed. + +This is CVE-2022-23033 / XSA-393. 
+ +Reported-by: Dmytro Firsov <Dmytro_Firsov@epam.com> +Fixes: 2148a125b73b ("xen/arm: Track page accessed between batch of Set/Way operations") +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +Signed-off-by: Julien Grall <jgrall@amazon.com> +master commit: a428b913a002eb2b7425b48029c20a52eeee1b5a +master date: 2022-01-25 13:25:01 +0100 +--- + xen/arch/arm/p2m.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c +index ce59f2b503..993fe4ded2 100644 +--- a/xen/arch/arm/p2m.c ++++ b/xen/arch/arm/p2m.c +@@ -1012,7 +1012,7 @@ static int __p2m_set_entry(struct p2m_domain *p2m, + * sequence when updating the translation table (D4.7.1 in ARM DDI + * 0487A.j). + */ +- if ( lpae_is_valid(orig_pte) ) ++ if ( lpae_is_valid(orig_pte) || removing_mapping ) + p2m_remove_pte(entry, p2m->clean_pte); + + if ( removing_mapping ) +-- +2.35.2 + + +From e48c7878e54a5f970c00abed2cfd747858f0d592 Mon Sep 17 00:00:00 2001 +From: Julien Grall <jgrall@amazon.com> +Date: Tue, 25 Jan 2022 14:44:47 +0100 +Subject: [PATCH 08/32] xen/grant-table: Only decrement the refcounter when + grant is fully unmapped + +The grant unmapping hypercall (GNTTABOP_unmap_grant_ref) is not a +simple revert of the changes done by the grant mapping hypercall +(GNTTABOP_map_grant_ref). + +Instead, it is possible to partially (or even not) clear some flags. +This will leave the grant is mapped until a future call where all +the flags would be cleared. + +XSA-380 introduced a refcounting that is meant to only be dropped +when the grant is fully unmapped. Unfortunately, unmap_common() will +decrement the refcount for every successful call. + +A consequence is a domain would be able to underflow the refcount +and trigger a BUG(). + +Looking at the code, it is not clear to me why a domain would +want to partially clear some flags in the grant-table. But as +this is part of the ABI, it is better to not change the behavior +for now. + +Fix it by checking if the maptrack handle has been released before +decrementing the refcounting. + +This is CVE-2022-23034 / XSA-394. + +Fixes: 9781b51efde2 ("gnttab: replace mapkind()") +Signed-off-by: Julien Grall <jgrall@amazon.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: 975a8fb45ca186b3476e5656c6ad5dad1122dbfd +master date: 2022-01-25 13:25:49 +0100 +--- + xen/common/grant_table.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c +index 7b775a8c35..cbb2ce17c0 100644 +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -1438,8 +1438,15 @@ unmap_common( + if ( put_handle ) + put_maptrack_handle(lgt, op->handle); + +- /* See the respective comment in map_grant_ref(). */ +- if ( rc == GNTST_okay && ld != rd && gnttab_need_iommu_mapping(ld) ) ++ /* ++ * map_grant_ref() will only increment the refcount (and update the ++ * IOMMU) once per mapping. So we only want to decrement it once the ++ * maptrack handle has been put, alongside the further IOMMU update. ++ * ++ * For the second and third check, see the respective comment in ++ * map_grant_ref(). 
++ */ ++ if ( put_handle && ld != rd && gnttab_need_iommu_mapping(ld) ) + { + void **slot; + union maptrack_node node; +-- +2.35.2 + + +From ce49a1d6d819f4587436b4ff73334d3676c1aab6 Mon Sep 17 00:00:00 2001 +From: Julien Grall <jgrall@amazon.com> +Date: Tue, 25 Jan 2022 14:45:07 +0100 +Subject: [PATCH 09/32] passthrough/x86: stop pirq iteration immediately in + case of error +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +pt_pirq_iterate() will iterate in batch over all the PIRQs. The outer +loop will bail out if 'rc' is non-zero but the inner loop will continue. + +This means 'rc' will get clobbered and we may miss any errors (such as +-ERESTART in the case of the callback pci_clean_dpci_irq()). + +This is CVE-2022-23035 / XSA-395. + +Fixes: c24536b636f2 ("replace d->nr_pirqs sized arrays with radix tree") +Fixes: f6dd295381f4 ("dpci: replace tasklet with softirq") +Signed-off-by: Julien Grall <jgrall@amazon.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: 9480a1a519cf016623f657dc544cb372a82b5708 +master date: 2022-01-25 13:27:02 +0100 +--- + xen/drivers/passthrough/io.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c +index 71eaf2c17e..b6e88ebc86 100644 +--- a/xen/drivers/passthrough/io.c ++++ b/xen/drivers/passthrough/io.c +@@ -810,7 +810,11 @@ int pt_pirq_iterate(struct domain *d, + + pirq = pirqs[i]->pirq; + if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) ) ++ { + rc = cb(d, pirq_dpci, arg); ++ if ( rc ) ++ break; ++ } + } + } while ( !rc && ++pirq < d->nr_pirqs && n == ARRAY_SIZE(pirqs) ); + +-- +2.35.2 + + +From 9a8804a92fed77f77afe9fc525c6891bb60f68d3 Mon Sep 17 00:00:00 2001 +From: Bertrand Marquis <bertrand.marquis@arm.com> +Date: Tue, 15 Feb 2022 10:37:51 +0000 +Subject: [PATCH 10/32] xen/arm: Introduce new Arm processors + +Add some new processor identifiers in processor.h and sync Xen +definitions with status of Linux 5.17 (declared in +arch/arm64/include/asm/cputype.h). + +This is part of XSA-398 / CVE-2022-23960. 
+ +Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com> +Acked-by: Julien Grall <julien@xen.org> +(cherry picked from commit 35d1b85a6b43483f6bd007d48757434e54743e98) +--- + xen/include/asm-arm/processor.h | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) + +diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h +index 87c8136022..17cc5cf486 100644 +--- a/xen/include/asm-arm/processor.h ++++ b/xen/include/asm-arm/processor.h +@@ -53,6 +53,7 @@ + #define ARM_CPU_PART_CORTEX_A17 0xC0E + #define ARM_CPU_PART_CORTEX_A15 0xC0F + #define ARM_CPU_PART_CORTEX_A53 0xD03 ++#define ARM_CPU_PART_CORTEX_A35 0xD04 + #define ARM_CPU_PART_CORTEX_A55 0xD05 + #define ARM_CPU_PART_CORTEX_A57 0xD07 + #define ARM_CPU_PART_CORTEX_A72 0xD08 +@@ -60,11 +61,20 @@ + #define ARM_CPU_PART_CORTEX_A75 0xD0A + #define ARM_CPU_PART_CORTEX_A76 0xD0B + #define ARM_CPU_PART_NEOVERSE_N1 0xD0C ++#define ARM_CPU_PART_CORTEX_A77 0xD0D ++#define ARM_CPU_PART_NEOVERSE_V1 0xD40 ++#define ARM_CPU_PART_CORTEX_A78 0xD41 ++#define ARM_CPU_PART_CORTEX_X1 0xD44 ++#define ARM_CPU_PART_CORTEX_A710 0xD47 ++#define ARM_CPU_PART_CORTEX_X2 0xD48 ++#define ARM_CPU_PART_NEOVERSE_N2 0xD49 ++#define ARM_CPU_PART_CORTEX_A78C 0xD4B + + #define MIDR_CORTEX_A12 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A12) + #define MIDR_CORTEX_A17 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A17) + #define MIDR_CORTEX_A15 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A15) + #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) ++#define MIDR_CORTEX_A35 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A35) + #define MIDR_CORTEX_A55 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A55) + #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) + #define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) +@@ -72,6 +82,14 @@ + #define MIDR_CORTEX_A75 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A75) + #define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76) + #define MIDR_NEOVERSE_N1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N1) ++#define MIDR_CORTEX_A77 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A77) ++#define MIDR_NEOVERSE_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V1) ++#define MIDR_CORTEX_A78 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78) ++#define MIDR_CORTEX_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1) ++#define MIDR_CORTEX_A710 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A710) ++#define MIDR_CORTEX_X2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X2) ++#define MIDR_NEOVERSE_N2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N2) ++#define MIDR_CORTEX_A78C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78C) + + /* MPIDR Multiprocessor Affinity Register */ + #define _MPIDR_UP (30) +-- +2.35.2 + + +From 03db21387b8653d663e8da89c964d611ba509130 Mon Sep 17 00:00:00 2001 +From: Bertrand Marquis <bertrand.marquis@arm.com> +Date: Tue, 15 Feb 2022 10:39:47 +0000 +Subject: [PATCH 11/32] xen/arm: move errata CSV2 check earlier + +CSV2 availability check is done after printing to the user that +workaround 1 will be used. Move the check before to prevent saying to the +user that workaround 1 is used when it is not because it is not needed. +This will also allow to reuse install_bp_hardening_vec function for +other use cases. 
+ +Code previously returning "true", now returns "0" to conform to +enable_smccc_arch_workaround_1 returning an int and surrounding code +doing a "return 0" if workaround is not needed. + +This is part of XSA-398 / CVE-2022-23960. + +Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com> +Reviewed-by: Julien Grall <julien@xen.org> +(cherry picked from commit 599616d70eb886b9ad0ef9d6b51693ce790504ba) +--- + xen/arch/arm/cpuerrata.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +diff --git a/xen/arch/arm/cpuerrata.c b/xen/arch/arm/cpuerrata.c +index f94bcf74cc..79620889b4 100644 +--- a/xen/arch/arm/cpuerrata.c ++++ b/xen/arch/arm/cpuerrata.c +@@ -102,13 +102,6 @@ install_bp_hardening_vec(const struct arm_cpu_capabilities *entry, + printk(XENLOG_INFO "CPU%u will %s on exception entry\n", + smp_processor_id(), desc); + +- /* +- * No need to install hardened vector when the processor has +- * ID_AA64PRF0_EL1.CSV2 set. +- */ +- if ( cpu_data[smp_processor_id()].pfr64.csv2 ) +- return true; +- + spin_lock(&bp_lock); + + /* +@@ -167,6 +160,13 @@ static int enable_smccc_arch_workaround_1(void *data) + if ( !entry->matches(entry) ) + return 0; + ++ /* ++ * No need to install hardened vector when the processor has ++ * ID_AA64PRF0_EL1.CSV2 set. ++ */ ++ if ( cpu_data[smp_processor_id()].pfr64.csv2 ) ++ return 0; ++ + if ( smccc_ver < SMCCC_VERSION(1, 1) ) + goto warn; + +-- +2.35.2 + + +From d99df7d50d366c7a8dc71f5bdc3454f469b00a00 Mon Sep 17 00:00:00 2001 +From: Bertrand Marquis <bertrand.marquis@arm.com> +Date: Wed, 23 Feb 2022 09:42:18 +0000 +Subject: [PATCH 12/32] xen/arm: Add ECBHB and CLEARBHB ID fields + +Introduce ID coprocessor register ID_AA64ISAR2_EL1. +Add definitions in cpufeature and sysregs of ECBHB field in mmfr1 and +CLEARBHB in isar2 ID coprocessor registers. + +This is part of XSA-398 / CVE-2022-23960. 
+ +Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com> +Acked-by: Julien Grall <julien@xen.org> +(cherry picked from commit 4b68d12d98b8790d8002fcc2c25a9d713374a4d7) +--- + xen/arch/arm/cpufeature.c | 1 + + xen/include/asm-arm/arm64/sysregs.h | 4 ++++ + xen/include/asm-arm/cpufeature.h | 20 +++++++++++++++++--- + 3 files changed, 22 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/arm/cpufeature.c b/xen/arch/arm/cpufeature.c +index 44126dbf07..13dac7ccaf 100644 +--- a/xen/arch/arm/cpufeature.c ++++ b/xen/arch/arm/cpufeature.c +@@ -117,6 +117,7 @@ void identify_cpu(struct cpuinfo_arm *c) + + c->isa64.bits[0] = READ_SYSREG64(ID_AA64ISAR0_EL1); + c->isa64.bits[1] = READ_SYSREG64(ID_AA64ISAR1_EL1); ++ c->isa64.bits[2] = READ_SYSREG64(ID_AA64ISAR2_EL1); + #endif + + c->pfr32.bits[0] = READ_SYSREG32(ID_PFR0_EL1); +diff --git a/xen/include/asm-arm/arm64/sysregs.h b/xen/include/asm-arm/arm64/sysregs.h +index c60029d38f..cfd2e1d486 100644 +--- a/xen/include/asm-arm/arm64/sysregs.h ++++ b/xen/include/asm-arm/arm64/sysregs.h +@@ -57,6 +57,10 @@ + #define ICH_AP1R2_EL2 __AP1Rx_EL2(2) + #define ICH_AP1R3_EL2 __AP1Rx_EL2(3) + ++#ifndef ID_AA64ISAR2_EL1 ++#define ID_AA64ISAR2_EL1 S3_0_C0_C6_2 ++#endif ++ + /* Access to system registers */ + + #define READ_SYSREG32(name) ((uint32_t)READ_SYSREG64(name)) +diff --git a/xen/include/asm-arm/cpufeature.h b/xen/include/asm-arm/cpufeature.h +index 29753fee78..8519d2987b 100644 +--- a/xen/include/asm-arm/cpufeature.h ++++ b/xen/include/asm-arm/cpufeature.h +@@ -183,12 +183,26 @@ struct cpuinfo_arm { + unsigned long lo:4; + unsigned long pan:4; + unsigned long __res1:8; +- unsigned long __res2:32; ++ unsigned long __res2:28; ++ unsigned long ecbhb:4; + }; + } mm64; + +- struct { +- uint64_t bits[2]; ++ union { ++ uint64_t bits[3]; ++ struct { ++ /* ISAR0 */ ++ unsigned long __res0:64; ++ ++ /* ISAR1 */ ++ unsigned long __res1:64; ++ ++ /* ISAR2 */ ++ unsigned long __res3:28; ++ unsigned long clearbhb:4; ++ ++ unsigned long __res4:32; ++ }; + } isa64; + + #endif +-- +2.35.2 + + +From 47125f5fb2073abb9d5d3f65824cd066e7ec62f1 Mon Sep 17 00:00:00 2001 +From: Rahul Singh <rahul.singh@arm.com> +Date: Mon, 14 Feb 2022 18:47:32 +0000 +Subject: [PATCH 13/32] xen/arm: Add Spectre BHB handling + +This commit is adding Spectre BHB handling to Xen on Arm. +The commit is introducing new alternative code to be executed during +exception entry: +- SMCC workaround 3 call +- loop workaround (with 8, 24 or 32 iterations) +- use of new clearbhb instruction + +Cpuerrata is modified by this patch to apply the required workaround for +CPU affected by Spectre BHB when CONFIG_ARM64_HARDEN_BRANCH_PREDICTOR is +enabled. + +To do this the system previously used to apply smcc workaround 1 is +reused and new alternative code to be copied in the exception handler is +introduced. + +To define the type of workaround required by a processor, 4 new cpu +capabilities are introduced (for each number of loop and for smcc +workaround 3). + +When a processor is affected, enable_spectre_bhb_workaround is called +and if the processor does not have CSV2 set to 3 or ECBHB feature (which +would mean that the processor is doing what is required in hardware), +the proper code is enabled at exception entry. + +In the case where workaround 3 is not supported by the firmware, we +enable workaround 1 when possible as it will also mitigate Spectre BHB +on systems without CSV2. + +This is part of XSA-398 / CVE-2022-23960. 
+ +Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com> +Signed-off-by: Rahul Singh <rahul.singh@arm.com> +Acked-by: Julien Grall <julien@xen.org> +(cherry picked from commit 62c91eb66a2904eefb1d1d9642e3697a1e3c3a3c) +--- + xen/arch/arm/arm64/bpi.S | 32 +++++- + xen/arch/arm/cpuerrata.c | 170 +++++++++++++++++++++++++++-- + xen/include/asm-arm/arm64/macros.h | 5 + + xen/include/asm-arm/cpufeature.h | 6 +- + xen/include/asm-arm/smccc.h | 6 + + 5 files changed, 207 insertions(+), 12 deletions(-) + +diff --git a/xen/arch/arm/arm64/bpi.S b/xen/arch/arm/arm64/bpi.S +index d8743d955c..4e63825220 100644 +--- a/xen/arch/arm/arm64/bpi.S ++++ b/xen/arch/arm/arm64/bpi.S +@@ -58,16 +58,42 @@ ENTRY(__bp_harden_hyp_vecs_start) + .endr + ENTRY(__bp_harden_hyp_vecs_end) + +-ENTRY(__smccc_workaround_1_smc_start) ++.macro mitigate_spectre_bhb_loop count ++ENTRY(__mitigate_spectre_bhb_loop_start_\count) ++ stp x0, x1, [sp, #-16]! ++ mov x0, \count ++.Lspectre_bhb_loop\@: ++ b . + 4 ++ subs x0, x0, #1 ++ b.ne .Lspectre_bhb_loop\@ ++ sb ++ ldp x0, x1, [sp], #16 ++ENTRY(__mitigate_spectre_bhb_loop_end_\count) ++.endm ++ ++.macro smccc_workaround num smcc_id ++ENTRY(__smccc_workaround_smc_start_\num) + sub sp, sp, #(8 * 4) + stp x0, x1, [sp, #(8 * 2)] + stp x2, x3, [sp, #(8 * 0)] +- mov w0, #ARM_SMCCC_ARCH_WORKAROUND_1_FID ++ mov w0, \smcc_id + smc #0 + ldp x2, x3, [sp, #(8 * 0)] + ldp x0, x1, [sp, #(8 * 2)] + add sp, sp, #(8 * 4) +-ENTRY(__smccc_workaround_1_smc_end) ++ENTRY(__smccc_workaround_smc_end_\num) ++.endm ++ ++ENTRY(__mitigate_spectre_bhb_clear_insn_start) ++ clearbhb ++ isb ++ENTRY(__mitigate_spectre_bhb_clear_insn_end) ++ ++mitigate_spectre_bhb_loop 8 ++mitigate_spectre_bhb_loop 24 ++mitigate_spectre_bhb_loop 32 ++smccc_workaround 1, #ARM_SMCCC_ARCH_WORKAROUND_1_FID ++smccc_workaround 3, #ARM_SMCCC_ARCH_WORKAROUND_3_FID + + /* + * Local variables: +diff --git a/xen/arch/arm/cpuerrata.c b/xen/arch/arm/cpuerrata.c +index 79620889b4..8d9e977b77 100644 +--- a/xen/arch/arm/cpuerrata.c ++++ b/xen/arch/arm/cpuerrata.c +@@ -144,7 +144,16 @@ install_bp_hardening_vec(const struct arm_cpu_capabilities *entry, + return ret; + } + +-extern char __smccc_workaround_1_smc_start[], __smccc_workaround_1_smc_end[]; ++extern char __smccc_workaround_smc_start_1[], __smccc_workaround_smc_end_1[]; ++extern char __smccc_workaround_smc_start_3[], __smccc_workaround_smc_end_3[]; ++extern char __mitigate_spectre_bhb_clear_insn_start[], ++ __mitigate_spectre_bhb_clear_insn_end[]; ++extern char __mitigate_spectre_bhb_loop_start_8[], ++ __mitigate_spectre_bhb_loop_end_8[]; ++extern char __mitigate_spectre_bhb_loop_start_24[], ++ __mitigate_spectre_bhb_loop_end_24[]; ++extern char __mitigate_spectre_bhb_loop_start_32[], ++ __mitigate_spectre_bhb_loop_end_32[]; + + static int enable_smccc_arch_workaround_1(void *data) + { +@@ -176,8 +185,8 @@ static int enable_smccc_arch_workaround_1(void *data) + if ( (int)res.a0 < 0 ) + goto warn; + +- return !install_bp_hardening_vec(entry,__smccc_workaround_1_smc_start, +- __smccc_workaround_1_smc_end, ++ return !install_bp_hardening_vec(entry,__smccc_workaround_smc_start_1, ++ __smccc_workaround_smc_end_1, + "call ARM_SMCCC_ARCH_WORKAROUND_1"); + + warn: +@@ -192,6 +201,93 @@ warn: + return 0; + } + ++/* ++ * Spectre BHB Mitigation ++ * ++ * CPU is either: ++ * - Having CVS2.3 so it is not affected. ++ * - Having ECBHB and is clearing the branch history buffer when an exception ++ * to a different exception level is happening so no mitigation is needed. 
++ * - Mitigating using a loop on exception entry (number of loop depending on ++ * the CPU). ++ * - Mitigating using the firmware. ++ */ ++static int enable_spectre_bhb_workaround(void *data) ++{ ++ const struct arm_cpu_capabilities *entry = data; ++ ++ /* ++ * Enable callbacks are called on every CPU based on the capabilities, so ++ * double-check whether the CPU matches the entry. ++ */ ++ if ( !entry->matches(entry) ) ++ return 0; ++ ++ if ( cpu_data[smp_processor_id()].pfr64.csv2 == 3 ) ++ return 0; ++ ++ if ( cpu_data[smp_processor_id()].mm64.ecbhb ) ++ return 0; ++ ++ if ( cpu_data[smp_processor_id()].isa64.clearbhb ) ++ return !install_bp_hardening_vec(entry, ++ __mitigate_spectre_bhb_clear_insn_start, ++ __mitigate_spectre_bhb_clear_insn_end, ++ "use clearBHB instruction"); ++ ++ /* Apply solution depending on hwcaps set on arm_errata */ ++ if ( cpus_have_cap(ARM_WORKAROUND_BHB_LOOP_8) ) ++ return !install_bp_hardening_vec(entry, ++ __mitigate_spectre_bhb_loop_start_8, ++ __mitigate_spectre_bhb_loop_end_8, ++ "use 8 loops workaround"); ++ ++ if ( cpus_have_cap(ARM_WORKAROUND_BHB_LOOP_24) ) ++ return !install_bp_hardening_vec(entry, ++ __mitigate_spectre_bhb_loop_start_24, ++ __mitigate_spectre_bhb_loop_end_24, ++ "use 24 loops workaround"); ++ ++ if ( cpus_have_cap(ARM_WORKAROUND_BHB_LOOP_32) ) ++ return !install_bp_hardening_vec(entry, ++ __mitigate_spectre_bhb_loop_start_32, ++ __mitigate_spectre_bhb_loop_end_32, ++ "use 32 loops workaround"); ++ ++ if ( cpus_have_cap(ARM_WORKAROUND_BHB_SMCC_3) ) ++ { ++ struct arm_smccc_res res; ++ ++ if ( smccc_ver < SMCCC_VERSION(1, 1) ) ++ goto warn; ++ ++ arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FID, ++ ARM_SMCCC_ARCH_WORKAROUND_3_FID, &res); ++ /* The return value is in the lower 32-bits. */ ++ if ( (int)res.a0 < 0 ) ++ { ++ /* ++ * On processor affected with CSV2=0, workaround 1 will mitigate ++ * both Spectre v2 and BHB so use it when available ++ */ ++ if ( enable_smccc_arch_workaround_1(data) ) ++ return 1; ++ ++ goto warn; ++ } ++ ++ return !install_bp_hardening_vec(entry,__smccc_workaround_smc_start_3, ++ __smccc_workaround_smc_end_3, ++ "call ARM_SMCCC_ARCH_WORKAROUND_3"); ++ } ++ ++warn: ++ printk_once("**** No support for any spectre BHB workaround. ****\n" ++ "**** Please update your firmware. 
****\n"); ++ ++ return 0; ++} ++ + #endif /* CONFIG_ARM64_HARDEN_BRANCH_PREDICTOR */ + + /* Hardening Branch predictor code for Arm32 */ +@@ -437,19 +533,77 @@ static const struct arm_cpu_capabilities arm_errata[] = { + }, + { + .capability = ARM_HARDEN_BRANCH_PREDICTOR, +- MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), ++ MIDR_RANGE(MIDR_CORTEX_A72, 0, 1 << MIDR_VARIANT_SHIFT), + .enable = enable_smccc_arch_workaround_1, + }, + { +- .capability = ARM_HARDEN_BRANCH_PREDICTOR, ++ .capability = ARM_WORKAROUND_BHB_SMCC_3, + MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), +- .enable = enable_smccc_arch_workaround_1, ++ .enable = enable_spectre_bhb_workaround, + }, + { +- .capability = ARM_HARDEN_BRANCH_PREDICTOR, ++ .capability = ARM_WORKAROUND_BHB_SMCC_3, + MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), +- .enable = enable_smccc_arch_workaround_1, ++ .enable = enable_spectre_bhb_workaround, ++ }, ++ /* spectre BHB */ ++ { ++ .capability = ARM_WORKAROUND_BHB_LOOP_8, ++ MIDR_RANGE(MIDR_CORTEX_A72, 1 << MIDR_VARIANT_SHIFT, ++ (MIDR_VARIANT_MASK | MIDR_REVISION_MASK)), ++ .enable = enable_spectre_bhb_workaround, ++ }, ++ { ++ .capability = ARM_WORKAROUND_BHB_LOOP_24, ++ MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), ++ .enable = enable_spectre_bhb_workaround, ++ }, ++ { ++ .capability = ARM_WORKAROUND_BHB_LOOP_24, ++ MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), ++ .enable = enable_spectre_bhb_workaround, ++ }, ++ { ++ .capability = ARM_WORKAROUND_BHB_LOOP_32, ++ MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), ++ .enable = enable_spectre_bhb_workaround, ++ }, ++ { ++ .capability = ARM_WORKAROUND_BHB_LOOP_32, ++ MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), ++ .enable = enable_spectre_bhb_workaround, ++ }, ++ { ++ .capability = ARM_WORKAROUND_BHB_LOOP_32, ++ MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), ++ .enable = enable_spectre_bhb_workaround, ++ }, ++ { ++ .capability = ARM_WORKAROUND_BHB_LOOP_32, ++ MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), ++ .enable = enable_spectre_bhb_workaround, ++ }, ++ { ++ .capability = ARM_WORKAROUND_BHB_LOOP_32, ++ MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), ++ .enable = enable_spectre_bhb_workaround, + }, ++ { ++ .capability = ARM_WORKAROUND_BHB_LOOP_24, ++ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), ++ .enable = enable_spectre_bhb_workaround, ++ }, ++ { ++ .capability = ARM_WORKAROUND_BHB_LOOP_32, ++ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), ++ .enable = enable_spectre_bhb_workaround, ++ }, ++ { ++ .capability = ARM_WORKAROUND_BHB_LOOP_32, ++ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), ++ .enable = enable_spectre_bhb_workaround, ++ }, ++ + #endif + #ifdef CONFIG_ARM32_HARDEN_BRANCH_PREDICTOR + { +diff --git a/xen/include/asm-arm/arm64/macros.h b/xen/include/asm-arm/arm64/macros.h +index f981b4f43e..5100aed6e3 100644 +--- a/xen/include/asm-arm/arm64/macros.h ++++ b/xen/include/asm-arm/arm64/macros.h +@@ -21,6 +21,11 @@ + ldr \dst, [\dst, \tmp] + .endm + ++ /* clearbhb instruction clearing the branch history */ ++ .macro clearbhb ++ hint #22 ++ .endm ++ + /* + * Register aliases. 
+ */ +diff --git a/xen/include/asm-arm/cpufeature.h b/xen/include/asm-arm/cpufeature.h +index 8519d2987b..a1fa3bc1cf 100644 +--- a/xen/include/asm-arm/cpufeature.h ++++ b/xen/include/asm-arm/cpufeature.h +@@ -46,8 +46,12 @@ + #define ARM_SMCCC_1_1 8 + #define ARM64_WORKAROUND_AT_SPECULATE 9 + #define ARM_WORKAROUND_858921 10 ++#define ARM_WORKAROUND_BHB_LOOP_8 11 ++#define ARM_WORKAROUND_BHB_LOOP_24 12 ++#define ARM_WORKAROUND_BHB_LOOP_32 13 ++#define ARM_WORKAROUND_BHB_SMCC_3 14 + +-#define ARM_NCAPS 11 ++#define ARM_NCAPS 15 + + #ifndef __ASSEMBLY__ + +diff --git a/xen/include/asm-arm/smccc.h b/xen/include/asm-arm/smccc.h +index 126399dd70..2abbffc3bd 100644 +--- a/xen/include/asm-arm/smccc.h ++++ b/xen/include/asm-arm/smccc.h +@@ -334,6 +334,12 @@ void __arm_smccc_1_0_smc(register_t a0, register_t a1, register_t a2, + ARM_SMCCC_OWNER_ARCH, \ + 0x7FFF) + ++#define ARM_SMCCC_ARCH_WORKAROUND_3_FID \ ++ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ ++ ARM_SMCCC_CONV_32, \ ++ ARM_SMCCC_OWNER_ARCH, \ ++ 0x3FFF) ++ + /* SMCCC error codes */ + #define ARM_SMCCC_NOT_REQUIRED (-2) + #define ARM_SMCCC_ERR_UNKNOWN_FUNCTION (-1) +-- +2.35.2 + + +From fbabb62dd9e57180400f145a8756624c82de888f Mon Sep 17 00:00:00 2001 +From: Bertrand Marquis <bertrand.marquis@arm.com> +Date: Thu, 17 Feb 2022 14:52:54 +0000 +Subject: [PATCH 14/32] xen/arm: Allow to discover and use + SMCCC_ARCH_WORKAROUND_3 + +Allow guest to discover whether or not SMCCC_ARCH_WORKAROUND_3 is +supported and create a fastpath in the code to handle guests request to +do the workaround. + +The function SMCCC_ARCH_WORKAROUND_3 will be called by the guest for +flushing the branch history. So we want the handling to be as fast as +possible. + +As the mitigation is applied on every guest exit, we can check for the +call before saving all context and return very early. + +This is part of XSA-398 / CVE-2022-23960. + +Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com> +Reviewed-by: Julien Grall <julien@xen.org> +(cherry picked from commit c0a56ea0fd92ecb471936b7355ddbecbaea3707c) +--- + xen/arch/arm/arm64/entry.S | 21 ++++++++++++++------- + xen/arch/arm/vsmc.c | 5 +++++ + 2 files changed, 19 insertions(+), 7 deletions(-) + +diff --git a/xen/arch/arm/arm64/entry.S b/xen/arch/arm/arm64/entry.S +index 175ea2981e..a8c2145067 100644 +--- a/xen/arch/arm/arm64/entry.S ++++ b/xen/arch/arm/arm64/entry.S +@@ -338,16 +338,26 @@ guest_sync: + cbnz x1, guest_sync_slowpath /* should be 0 for HVC #0 */ + + /* +- * Fastest path possible for ARM_SMCCC_ARCH_WORKAROUND_1. +- * The workaround has already been applied on the exception ++ * Fastest path possible for ARM_SMCCC_ARCH_WORKAROUND_1 and ++ * ARM_SMCCC_ARCH_WORKAROUND_3. ++ * The workaround needed has already been applied on the exception + * entry from the guest, so let's quickly get back to the guest. + * + * Note that eor is used because the function identifier cannot + * be encoded as an immediate for cmp. + */ + eor w0, w0, #ARM_SMCCC_ARCH_WORKAROUND_1_FID +- cbnz w0, check_wa2 ++ cbz w0, fastpath_out_workaround + ++ /* ARM_SMCCC_ARCH_WORKAROUND_2 handling */ ++ eor w0, w0, #(ARM_SMCCC_ARCH_WORKAROUND_1_FID ^ ARM_SMCCC_ARCH_WORKAROUND_2_FID) ++ cbz w0, wa2_ssbd ++ ++ /* Fastpath out for ARM_SMCCC_ARCH_WORKAROUND_3 */ ++ eor w0, w0, #(ARM_SMCCC_ARCH_WORKAROUND_2_FID ^ ARM_SMCCC_ARCH_WORKAROUND_3_FID) ++ cbnz w0, guest_sync_slowpath ++ ++fastpath_out_workaround: + /* + * Clobber both x0 and x1 to prevent leakage. Note that thanks + * the eor, x0 = 0. 
+@@ -356,10 +366,7 @@ guest_sync: + eret + sb + +-check_wa2: +- /* ARM_SMCCC_ARCH_WORKAROUND_2 handling */ +- eor w0, w0, #(ARM_SMCCC_ARCH_WORKAROUND_1_FID ^ ARM_SMCCC_ARCH_WORKAROUND_2_FID) +- cbnz w0, guest_sync_slowpath ++wa2_ssbd: + #ifdef CONFIG_ARM_SSBD + alternative_cb arm_enable_wa2_handling + b wa2_end +diff --git a/xen/arch/arm/vsmc.c b/xen/arch/arm/vsmc.c +index a36db15fff..b633ff2fe8 100644 +--- a/xen/arch/arm/vsmc.c ++++ b/xen/arch/arm/vsmc.c +@@ -124,6 +124,10 @@ static bool handle_arch(struct cpu_user_regs *regs) + break; + } + break; ++ case ARM_SMCCC_ARCH_WORKAROUND_3_FID: ++ if ( cpus_have_cap(ARM_WORKAROUND_BHB_SMCC_3) ) ++ ret = 0; ++ break; + } + + set_user_reg(regs, 0, ret); +@@ -132,6 +136,7 @@ static bool handle_arch(struct cpu_user_regs *regs) + } + + case ARM_SMCCC_ARCH_WORKAROUND_1_FID: ++ case ARM_SMCCC_ARCH_WORKAROUND_3_FID: + /* No return value */ + return true; + +-- +2.35.2 + + +From 7b9814b250a5a28277bd0866d341a5cfc0f4c1ac Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Mon, 7 Mar 2022 16:35:52 +0000 +Subject: [PATCH 15/32] x86/spec-ctrl: Cease using thunk=lfence on AMD + +AMD have updated their Spectre v2 guidance, and lfence/jmp is no longer +considered safe. AMD are recommending using retpoline everywhere. + +Update the default heuristics to never select THUNK_LFENCE. + +This is part of XSA-398 / CVE-2021-26401. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +(cherry picked from commit 8d03080d2a339840d3a59e0932a94f804e45110d) +--- + docs/misc/xen-command-line.pandoc | 6 +++--- + xen/arch/x86/spec_ctrl.c | 10 ++-------- + 2 files changed, 5 insertions(+), 11 deletions(-) + +diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc +index cf9dea62db..eead69ada2 100644 +--- a/docs/misc/xen-command-line.pandoc ++++ b/docs/misc/xen-command-line.pandoc +@@ -2077,9 +2077,9 @@ to use. + + If Xen was compiled with INDIRECT_THUNK support, `bti-thunk=` can be used to + select which of the thunks gets patched into the `__x86_indirect_thunk_%reg` +-locations. The default thunk is `retpoline` (generally preferred for Intel +-hardware), with the alternatives being `jmp` (a `jmp *%reg` gadget, minimal +-overhead), and `lfence` (an `lfence; jmp *%reg` gadget, preferred for AMD). ++locations. The default thunk is `retpoline` (generally preferred), with the ++alternatives being `jmp` (a `jmp *%reg` gadget, minimal overhead), and ++`lfence` (an `lfence; jmp *%reg` gadget). + + On hardware supporting IBRS (Indirect Branch Restricted Speculation), the + `ibrs=` option can be used to force or prevent Xen using the feature itself. +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 1cfd02d7d7..7447d4a8e5 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -908,16 +908,10 @@ void __init init_speculation_mitigations(void) + if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) ) + { + /* +- * AMD's recommended mitigation is to set lfence as being dispatch +- * serialising, and to use IND_THUNK_LFENCE. +- */ +- if ( cpu_has_lfence_dispatch ) +- thunk = THUNK_LFENCE; +- /* +- * On Intel hardware, we'd like to use retpoline in preference to ++ * On all hardware, we'd like to use retpoline in preference to + * IBRS, but only if it is safe on this hardware. 
+ */ +- else if ( retpoline_safe(caps) ) ++ if ( retpoline_safe(caps) ) + thunk = THUNK_RETPOLINE; + else if ( boot_cpu_has(X86_FEATURE_IBRSB) ) + ibrs = true; +-- +2.35.2 + + +From 8ed46cc1ef14fb8463cc847d82cbd2491054547a Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 5 Apr 2022 15:19:40 +0200 +Subject: [PATCH 16/32] VT-d: split domid map cleanup check into a function +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This logic will want invoking from elsewhere. + +No functional change intended. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Kevin Tian <kevin.tian@intel.com> +master commit: 9fdc10abe9457e4c9879a266f82372cb08e88ffb +master date: 2021-11-24 11:06:20 +0100 +--- + xen/drivers/passthrough/vtd/iommu.c | 70 +++++++++++++++++++---------- + 1 file changed, 47 insertions(+), 23 deletions(-) + +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index af8b9ca0e4..234a4fbae5 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -147,6 +147,51 @@ static void cleanup_domid_map(struct domain *domain, struct vtd_iommu *iommu) + } + } + ++static bool any_pdev_behind_iommu(const struct domain *d, ++ const struct pci_dev *exclude, ++ const struct vtd_iommu *iommu) ++{ ++ const struct pci_dev *pdev; ++ ++ for_each_pdev ( d, pdev ) ++ { ++ const struct acpi_drhd_unit *drhd; ++ ++ if ( pdev == exclude ) ++ continue; ++ ++ drhd = acpi_find_matched_drhd_unit(pdev); ++ if ( drhd && drhd->iommu == iommu ) ++ return true; ++ } ++ ++ return false; ++} ++ ++/* ++ * If no other devices under the same iommu owned by this domain, ++ * clear iommu in iommu_bitmap and clear domain_id in domid_bitmap. ++ */ ++static void check_cleanup_domid_map(struct domain *d, ++ const struct pci_dev *exclude, ++ struct vtd_iommu *iommu) ++{ ++ bool found = any_pdev_behind_iommu(d, exclude, iommu); ++ ++ /* ++ * Hidden devices are associated with DomXEN but usable by the hardware ++ * domain. Hence they need considering here as well. 
++ */ ++ if ( !found && is_hardware_domain(d) ) ++ found = any_pdev_behind_iommu(dom_xen, exclude, iommu); ++ ++ if ( !found ) ++ { ++ clear_bit(iommu->index, &dom_iommu(d)->arch.iommu_bitmap); ++ cleanup_domid_map(d, iommu); ++ } ++} ++ + static int iommus_incoherent; + + static void sync_cache(const void *addr, unsigned int size) +@@ -1679,7 +1724,6 @@ static int domain_context_unmap(struct domain *domain, u8 devfn, + struct vtd_iommu *iommu; + int ret = 0; + u8 seg = pdev->seg, bus = pdev->bus, tmp_bus, tmp_devfn, secbus; +- int found = 0; + + drhd = acpi_find_matched_drhd_unit(pdev); + if ( !drhd ) +@@ -1763,28 +1807,8 @@ static int domain_context_unmap(struct domain *domain, u8 devfn, + if ( ret ) + goto out; + +- /* +- * if no other devices under the same iommu owned by this domain, +- * clear iommu in iommu_bitmap and clear domain_id in domid_bitmp +- */ +- for_each_pdev ( domain, pdev ) +- { +- if ( pdev->seg == seg && pdev->bus == bus && pdev->devfn == devfn ) +- continue; +- +- drhd = acpi_find_matched_drhd_unit(pdev); +- if ( drhd && drhd->iommu == iommu ) +- { +- found = 1; +- break; +- } +- } +- +- if ( found == 0 ) +- { +- clear_bit(iommu->index, &dom_iommu(domain)->arch.iommu_bitmap); +- cleanup_domid_map(domain, iommu); +- } ++ if ( !ret ) ++ check_cleanup_domid_map(domain, pdev, iommu); + + out: + return ret; +-- +2.35.2 + + +From 2ce2aec8c148a0a291eae2a0631802e0ffb42133 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Date: Tue, 5 Apr 2022 15:20:10 +0200 +Subject: [PATCH 17/32] x86/hap: do not switch on log dirty for VRAM tracking +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +XEN_DMOP_track_dirty_vram possibly calls into paging_log_dirty_enable +when using HAP mode, and it can interact badly with other ongoing +paging domctls, as XEN_DMOP_track_dirty_vram is not holding the domctl +lock. + +This was detected as a result of the following assert triggering when +doing repeated migrations of a HAP HVM domain with a stubdom: + +Assertion 'd->arch.paging.log_dirty.allocs == 0' failed at paging.c:198 +----[ Xen-4.17-unstable x86_64 debug=y Not tainted ]---- +CPU: 34 +RIP: e008:[<ffff82d040314b3b>] arch/x86/mm/paging.c#paging_free_log_dirty_bitmap+0x606/0x6 +RFLAGS: 0000000000010206 CONTEXT: hypervisor (d0v23) +[...] +Xen call trace: + [<ffff82d040314b3b>] R arch/x86/mm/paging.c#paging_free_log_dirty_bitmap+0x606/0x63a + [<ffff82d040279f96>] S xsm/flask/hooks.c#domain_has_perm+0x5a/0x67 + [<ffff82d04031577f>] F paging_domctl+0x251/0xd41 + [<ffff82d04031640c>] F paging_domctl_continuation+0x19d/0x202 + [<ffff82d0403202fa>] F pv_hypercall+0x150/0x2a7 + [<ffff82d0403a729d>] F lstar_enter+0x12d/0x140 + +Such assert triggered because the stubdom used +XEN_DMOP_track_dirty_vram while dom0 was in the middle of executing +XEN_DOMCTL_SHADOW_OP_OFF, and so log dirty become enabled while +retiring the old structures, thus leading to new entries being +populated in already clear slots. + +Fix this by not enabling log dirty for VRAM tracking, similar to what +is done when using shadow instead of HAP. Call +p2m_enable_hardware_log_dirty when enabling VRAM tracking in order to +get some hardware assistance if available. As a side effect the memory +pressure on the p2m pool should go down if only VRAM tracking is +enabled, as the dirty bitmap is no longer allocated. 
+ +Note that paging_log_dirty_range (used to get the dirty bitmap for +VRAM tracking) doesn't use the log dirty bitmap, and instead relies on +checking whether each gfn on the range has been switched from +p2m_ram_logdirty to p2m_ram_rw in order to account for dirty pages. + +This is CVE-2022-26356 / XSA-397. + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: 4f4db53784d912c4f409a451c36ebfd4754e0a42 +master date: 2022-04-05 14:11:30 +0200 +--- + xen/arch/x86/mm/hap/hap.c | 11 ++++------- + xen/arch/x86/mm/paging.c | 2 +- + xen/include/asm-x86/paging.h | 3 --- + 3 files changed, 5 insertions(+), 11 deletions(-) + +diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c +index 3d93f3451c..9aac006d65 100644 +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -69,13 +69,6 @@ int hap_track_dirty_vram(struct domain *d, + { + int size = (nr + BITS_PER_BYTE - 1) / BITS_PER_BYTE; + +- if ( !paging_mode_log_dirty(d) ) +- { +- rc = paging_log_dirty_enable(d, false); +- if ( rc ) +- goto out; +- } +- + rc = -ENOMEM; + dirty_bitmap = vzalloc(size); + if ( !dirty_bitmap ) +@@ -107,6 +100,10 @@ int hap_track_dirty_vram(struct domain *d, + + paging_unlock(d); + ++ domain_pause(d); ++ p2m_enable_hardware_log_dirty(d); ++ domain_unpause(d); ++ + if ( oend > ostart ) + p2m_change_type_range(d, ostart, oend, + p2m_ram_logdirty, p2m_ram_rw); +diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c +index 469bb76429..8596e52458 100644 +--- a/xen/arch/x86/mm/paging.c ++++ b/xen/arch/x86/mm/paging.c +@@ -209,7 +209,7 @@ static int paging_free_log_dirty_bitmap(struct domain *d, int rc) + return rc; + } + +-int paging_log_dirty_enable(struct domain *d, bool log_global) ++static int paging_log_dirty_enable(struct domain *d, bool log_global) + { + int ret; + +diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h +index 7544f73121..a16929eaa7 100644 +--- a/xen/include/asm-x86/paging.h ++++ b/xen/include/asm-x86/paging.h +@@ -156,9 +156,6 @@ void paging_log_dirty_range(struct domain *d, + unsigned long nr, + uint8_t *dirty_bitmap); + +-/* enable log dirty */ +-int paging_log_dirty_enable(struct domain *d, bool log_global); +- + /* log dirty initialization */ + void paging_log_dirty_init(struct domain *d, const struct log_dirty_ops *ops); + +-- +2.35.2 + + +From 920e93df4e16c03811665e459c414feced6bc9b6 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 5 Apr 2022 15:20:42 +0200 +Subject: [PATCH 18/32] VT-d: correct ordering of operations in + cleanup_domid_map() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The function may be called without any locks held (leaving aside the +domctl one, which we surely don't want to depend on here), so needs to +play safe wrt other accesses to domid_map[] and domid_bitmap[]. This is +to avoid context_set_domain_id()'s writing of domid_map[] to be reset to +zero right away in the case of it racing the freeing of a DID. + +For the interaction with context_set_domain_id() and did_to_domain_id() +see the code comment. + +{check_,}cleanup_domid_map() are called with pcidevs_lock held or during +domain cleanup only (and pcidevs_lock is also held around +context_set_domain_id()), i.e. racing calls with the same (dom, iommu) +tuple cannot occur. 
+ +domain_iommu_domid(), besides its use by cleanup_domid_map(), has its +result used only to control flushing, and hence a stale result would +only lead to a stray extra flush. + +This is CVE-2022-26357 / XSA-399. + +Fixes: b9c20c78789f ("VT-d: per-iommu domain-id") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: d9eca7bb6c6636eb87bb17b08ba7de270f47ecd0 +master date: 2022-04-05 14:12:27 +0200 +--- + xen/drivers/passthrough/vtd/iommu.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index 234a4fbae5..68f9a524b8 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -142,8 +142,14 @@ static void cleanup_domid_map(struct domain *domain, struct vtd_iommu *iommu) + + if ( iommu_domid >= 0 ) + { ++ /* ++ * Update domid_map[] /before/ domid_bitmap[] to avoid a race with ++ * context_set_domain_id(), setting the slot to DOMID_INVALID for ++ * ->domid_map[] reads to produce a suitable value while the bit is ++ * still set. ++ */ ++ iommu->domid_map[iommu_domid] = DOMID_INVALID; + clear_bit(iommu_domid, iommu->domid_bitmap); +- iommu->domid_map[iommu_domid] = 0; + } + } + +-- +2.35.2 + + +From 650b888c8a0a03d796632597e6adfd0075f13954 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 5 Apr 2022 15:21:21 +0200 +Subject: [PATCH 19/32] VT-d: fix (de)assign ordering when RMRRs are in use +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +In the event that the RMRR mappings are essential for device operation, +they should be established before updating the device's context entry, +while they should be torn down only after the device's context entry was +successfully updated. + +Also adjust a related log message. + +This is CVE-2022-26358 / part of XSA-400. + +Fixes: 8b99f4400b69 ("VT-d: fix RMRR related error handling") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Paul Durrant <paul@xen.org> +Reviewed-by: Kevin Tian <kevin.tian@intel.com> +master commit: 78a40f8b5dfa1a3aec43528663f39473d4429101 +master date: 2022-04-05 14:15:33 +0200 +--- + xen/drivers/passthrough/vtd/iommu.c | 56 ++++++++++++++++++----------- + 1 file changed, 36 insertions(+), 20 deletions(-) + +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index 68f9a524b8..50e21bf1d9 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -2392,6 +2392,10 @@ static int reassign_device_ownership( + { + int ret; + ++ ret = domain_context_unmap(source, devfn, pdev); ++ if ( ret ) ++ return ret; ++ + /* + * Devices assigned to untrusted domains (here assumed to be any domU) + * can attempt to send arbitrary LAPIC/MSI messages. 
We are unprotected +@@ -2428,10 +2432,6 @@ static int reassign_device_ownership( + } + } + +- ret = domain_context_unmap(source, devfn, pdev); +- if ( ret ) +- return ret; +- + if ( devfn == pdev->devfn && pdev->domain != dom_io ) + { + list_move(&pdev->domain_list, &dom_io->pdev_list); +@@ -2508,9 +2508,8 @@ static int intel_iommu_assign_device( + } + } + +- ret = reassign_device_ownership(s, d, devfn, pdev); +- if ( ret || d == dom_io ) +- return ret; ++ if ( d == dom_io ) ++ return reassign_device_ownership(s, d, devfn, pdev); + + /* Setup rmrr identity mapping */ + for_each_rmrr_device( rmrr, bdf, i ) +@@ -2523,20 +2522,37 @@ static int intel_iommu_assign_device( + rmrr->end_address, flag); + if ( ret ) + { +- int rc; +- +- rc = reassign_device_ownership(d, s, devfn, pdev); + printk(XENLOG_G_ERR VTDPREFIX +- " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n", +- rmrr->base_address, rmrr->end_address, +- d->domain_id, ret); +- if ( rc ) +- { +- printk(XENLOG_ERR VTDPREFIX +- " failed to reclaim %04x:%02x:%02x.%u from %pd (%d)\n", +- seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d, rc); +- domain_crash(d); +- } ++ "%pd: cannot map reserved region [%"PRIx64",%"PRIx64"]: %d\n", ++ d, rmrr->base_address, rmrr->end_address, ret); ++ break; ++ } ++ } ++ } ++ ++ if ( !ret ) ++ ret = reassign_device_ownership(s, d, devfn, pdev); ++ ++ /* See reassign_device_ownership() for the hwdom aspect. */ ++ if ( !ret || is_hardware_domain(d) ) ++ return ret; ++ ++ for_each_rmrr_device( rmrr, bdf, i ) ++ { ++ if ( rmrr->segment == seg && ++ PCI_BUS(bdf) == bus && ++ PCI_DEVFN2(bdf) == devfn ) ++ { ++ int rc = iommu_identity_mapping(d, p2m_access_x, ++ rmrr->base_address, ++ rmrr->end_address, 0); ++ ++ if ( rc && rc != -ENOENT ) ++ { ++ printk(XENLOG_ERR VTDPREFIX ++ "%pd: cannot unmap reserved region [%"PRIx64",%"PRIx64"]: %d\n", ++ d, rmrr->base_address, rmrr->end_address, rc); ++ domain_crash(d); + break; + } + } +-- +2.35.2 + + +From 81918cead1a5c2c3fb6648b078501af81f520849 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 5 Apr 2022 15:22:31 +0200 +Subject: [PATCH 20/32] VT-d: fix add/remove ordering when RMRRs are in use +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +In the event that the RMRR mappings are essential for device operation, +they should be established before updating the device's context entry, +while they should be torn down only after the device's context entry was +successfully cleared. + +Also switch to %pd in related log messages. 
+ +Fixes: fa88cfadf918 ("vt-d: Map RMRR in intel_iommu_add_device() if the device has RMRR") +Fixes: 8b99f4400b69 ("VT-d: fix RMRR related error handling") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Kevin Tian <kevin.tian@intel.com> +master commit: 3221f270cf2eba0a22fd4f92319d664eacb92889 +master date: 2022-04-05 14:16:10 +0200 +--- + xen/drivers/passthrough/vtd/iommu.c | 27 ++++++++++++++------------- + 1 file changed, 14 insertions(+), 13 deletions(-) + +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index 50e21bf1d9..f7d40414ef 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -1993,14 +1993,6 @@ static int intel_iommu_add_device(u8 devfn, struct pci_dev *pdev) + if ( !pdev->domain ) + return -EINVAL; + +- ret = domain_context_mapping(pdev->domain, devfn, pdev); +- if ( ret ) +- { +- dprintk(XENLOG_ERR VTDPREFIX, "d%d: context mapping failed\n", +- pdev->domain->domain_id); +- return ret; +- } +- + for_each_rmrr_device ( rmrr, bdf, i ) + { + if ( rmrr->segment == pdev->seg && +@@ -2017,12 +2009,17 @@ static int intel_iommu_add_device(u8 devfn, struct pci_dev *pdev) + rmrr->base_address, rmrr->end_address, + 0); + if ( ret ) +- dprintk(XENLOG_ERR VTDPREFIX, "d%d: RMRR mapping failed\n", +- pdev->domain->domain_id); ++ dprintk(XENLOG_ERR VTDPREFIX, "%pd: RMRR mapping failed\n", ++ pdev->domain); + } + } + +- return 0; ++ ret = domain_context_mapping(pdev->domain, devfn, pdev); ++ if ( ret ) ++ dprintk(XENLOG_ERR VTDPREFIX, "%pd: context mapping failed\n", ++ pdev->domain); ++ ++ return ret; + } + + static int intel_iommu_enable_device(struct pci_dev *pdev) +@@ -2044,11 +2041,15 @@ static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev) + { + struct acpi_rmrr_unit *rmrr; + u16 bdf; +- int i; ++ int ret, i; + + if ( !pdev->domain ) + return -EINVAL; + ++ ret = domain_context_unmap(pdev->domain, devfn, pdev); ++ if ( ret ) ++ return ret; ++ + for_each_rmrr_device ( rmrr, bdf, i ) + { + if ( rmrr->segment != pdev->seg || +@@ -2064,7 +2065,7 @@ static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev) + rmrr->end_address, 0); + } + +- return domain_context_unmap(pdev->domain, devfn, pdev); ++ return 0; + } + + static int __hwdom_init setup_hwdom_device(u8 devfn, struct pci_dev *pdev) +-- +2.35.2 + + +From 33c13654cb6d7d2a5731614f55aace4866c93d97 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 5 Apr 2022 15:23:26 +0200 +Subject: [PATCH 21/32] VT-d: drop ownership checking from + domain_context_mapping_one() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Despite putting in quite a bit of effort it was not possible to +establish why exactly this code exists (beyond possibly sanity +checking). Instead of a subsequent change further complicating this +logic, simply get rid of it. + +Take the opportunity and move the respective unmap_vtd_domain_page() out +of the locked region. 
+ +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Paul Durrant <paul@xen.org> +Reviewed-by: Kevin Tian <kevin.tian@intel.com> +master commit: a680b8134b2d1828bbbf443a97feea66e8a85c75 +master date: 2022-04-05 14:17:21 +0200 +--- + xen/drivers/passthrough/vtd/iommu.c | 66 +---------------------------- + 1 file changed, 2 insertions(+), 64 deletions(-) + +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index f7d40414ef..b729ae173a 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -114,28 +114,6 @@ static int context_set_domain_id(struct context_entry *context, + return 0; + } + +-static int context_get_domain_id(struct context_entry *context, +- struct vtd_iommu *iommu) +-{ +- unsigned long dom_index, nr_dom; +- int domid = -1; +- +- if (iommu && context) +- { +- nr_dom = cap_ndoms(iommu->cap); +- +- dom_index = context_domain_id(*context); +- +- if ( dom_index < nr_dom && iommu->domid_map ) +- domid = iommu->domid_map[dom_index]; +- else +- dprintk(XENLOG_DEBUG VTDPREFIX, +- "dom_index %lu exceeds nr_dom %lu or iommu has no domid_map\n", +- dom_index, nr_dom); +- } +- return domid; +-} +- + static void cleanup_domid_map(struct domain *domain, struct vtd_iommu *iommu) + { + int iommu_domid = domain_iommu_domid(domain, iommu); +@@ -1392,49 +1370,9 @@ int domain_context_mapping_one( + + if ( context_present(*context) ) + { +- int res = 0; +- +- /* Try to get domain ownership from device structure. If that's +- * not available, try to read it from the context itself. */ +- if ( pdev ) +- { +- if ( pdev->domain != domain ) +- { +- printk(XENLOG_G_INFO VTDPREFIX +- "d%d: %04x:%02x:%02x.%u owned by d%d!", +- domain->domain_id, +- seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), +- pdev->domain ? pdev->domain->domain_id : -1); +- res = -EINVAL; +- } +- } +- else +- { +- int cdomain; +- cdomain = context_get_domain_id(context, iommu); +- +- if ( cdomain < 0 ) +- { +- printk(XENLOG_G_WARNING VTDPREFIX +- "d%d: %04x:%02x:%02x.%u mapped, but can't find owner!\n", +- domain->domain_id, +- seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); +- res = -EINVAL; +- } +- else if ( cdomain != domain->domain_id ) +- { +- printk(XENLOG_G_INFO VTDPREFIX +- "d%d: %04x:%02x:%02x.%u already mapped to d%d!", +- domain->domain_id, +- seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), +- cdomain); +- res = -EINVAL; +- } +- } +- +- unmap_vtd_domain_page(context_entries); + spin_unlock(&iommu->lock); +- return res; ++ unmap_vtd_domain_page(context_entries); ++ return 0; + } + + if ( iommu_hwdom_passthrough && is_hardware_domain(domain) ) +-- +2.35.2 + + +From 235aa158e0f71ee2bf20155ce6b0b429acf59d37 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 5 Apr 2022 15:23:57 +0200 +Subject: [PATCH 22/32] VT-d: re-assign devices directly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Devices with RMRRs, due to it being unspecified how/when the specified +memory regions may get accessed, may not be left disconnected from their +respective mappings (as long as it's not certain that the device has +been fully quiesced). Hence rather than unmapping the old context and +then mapping the new one, re-assignment needs to be done in a single +step. + +This is CVE-2022-26359 / part of XSA-400. 
+ +Reported-by: Roger Pau Monné <roger.pau@citrix.com> + +Similarly quarantining scratch-page mode relies on page tables to be +continuously wired up. + +To avoid complicating things more than necessary, treat all devices +mostly equally, i.e. regardless of their association with any RMRRs. The +main difference is when it comes to updating context entries, which need +to be atomic when there are RMRRs. Yet atomicity can only be achieved +with CMPXCHG16B, availability of which we can't take for given. + +The seemingly complicated choice of non-negative return values for +domain_context_mapping_one() is to limit code churn: This way callers +passing NULL for pdev don't need fiddling with. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Kevin Tian <kevin.tian@intel.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: 8f41e481b4852173909363b88c1ab3da747d3a05 +master date: 2022-04-05 14:17:42 +0200 +--- + xen/drivers/passthrough/vtd/extern.h | 7 +- + xen/drivers/passthrough/vtd/iommu.c | 268 +++++++++++++++++++++------ + xen/drivers/passthrough/vtd/iommu.h | 8 +- + xen/drivers/passthrough/vtd/quirks.c | 14 +- + xen/drivers/passthrough/vtd/vtd.h | 10 +- + 5 files changed, 233 insertions(+), 74 deletions(-) + +diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h +index 1cac22a02f..f51f8aae0d 100644 +--- a/xen/drivers/passthrough/vtd/extern.h ++++ b/xen/drivers/passthrough/vtd/extern.h +@@ -85,7 +85,8 @@ void free_pgtable_maddr(u64 maddr); + void *map_vtd_domain_page(u64 maddr); + void unmap_vtd_domain_page(void *va); + int domain_context_mapping_one(struct domain *domain, struct vtd_iommu *iommu, +- u8 bus, u8 devfn, const struct pci_dev *); ++ uint8_t bus, uint8_t devfn, ++ const struct pci_dev *pdev, unsigned int mode); + int domain_context_unmap_one(struct domain *domain, struct vtd_iommu *iommu, + u8 bus, u8 devfn); + int intel_iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt); +@@ -105,8 +106,8 @@ int is_igd_vt_enabled_quirk(void); + void platform_quirks_init(void); + void vtd_ops_preamble_quirk(struct vtd_iommu *iommu); + void vtd_ops_postamble_quirk(struct vtd_iommu *iommu); +-int __must_check me_wifi_quirk(struct domain *domain, +- u8 bus, u8 devfn, int map); ++int __must_check me_wifi_quirk(struct domain *domain, uint8_t bus, ++ uint8_t devfn, unsigned int mode); + void pci_vtd_quirk(const struct pci_dev *); + void quirk_iommu_caps(struct vtd_iommu *iommu); + +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index b729ae173a..17deda92d8 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -110,6 +110,7 @@ static int context_set_domain_id(struct context_entry *context, + } + + set_bit(i, iommu->domid_bitmap); ++ context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET); + context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET; + return 0; + } +@@ -1350,15 +1351,27 @@ static void __hwdom_init intel_iommu_hwdom_init(struct domain *d) + } + } + ++/* ++ * This function returns ++ * - a negative errno value upon error, ++ * - zero upon success when previously the entry was non-present, or this isn't ++ * the "main" request for a device (pdev == NULL), or for no-op quarantining ++ * assignments, ++ * - positive (one) upon success when previously the entry was present and this ++ * is the "main" request for a device (pdev != NULL). 
++ */ + int domain_context_mapping_one( + struct domain *domain, + struct vtd_iommu *iommu, +- u8 bus, u8 devfn, const struct pci_dev *pdev) ++ uint8_t bus, uint8_t devfn, const struct pci_dev *pdev, ++ unsigned int mode) + { + struct domain_iommu *hd = dom_iommu(domain); +- struct context_entry *context, *context_entries; ++ struct context_entry *context, *context_entries, lctxt; ++ __uint128_t old; + u64 maddr, pgd_maddr; +- u16 seg = iommu->drhd->segment; ++ uint16_t seg = iommu->drhd->segment, prev_did = 0; ++ struct domain *prev_dom = NULL; + int agaw, rc, ret; + bool_t flush_dev_iotlb; + +@@ -1367,17 +1380,32 @@ int domain_context_mapping_one( + maddr = bus_to_context_maddr(iommu, bus); + context_entries = (struct context_entry *)map_vtd_domain_page(maddr); + context = &context_entries[devfn]; ++ old = (lctxt = *context).full; + +- if ( context_present(*context) ) ++ if ( context_present(lctxt) ) + { +- spin_unlock(&iommu->lock); +- unmap_vtd_domain_page(context_entries); +- return 0; ++ domid_t domid; ++ ++ prev_did = context_domain_id(lctxt); ++ domid = iommu->domid_map[prev_did]; ++ if ( domid < DOMID_FIRST_RESERVED ) ++ prev_dom = rcu_lock_domain_by_id(domid); ++ else if ( domid == DOMID_IO ) ++ prev_dom = rcu_lock_domain(dom_io); ++ if ( !prev_dom ) ++ { ++ spin_unlock(&iommu->lock); ++ unmap_vtd_domain_page(context_entries); ++ dprintk(XENLOG_DEBUG VTDPREFIX, ++ "no domain for did %u (nr_dom %u)\n", ++ prev_did, cap_ndoms(iommu->cap)); ++ return -ESRCH; ++ } + } + + if ( iommu_hwdom_passthrough && is_hardware_domain(domain) ) + { +- context_set_translation_type(*context, CONTEXT_TT_PASS_THRU); ++ context_set_translation_type(lctxt, CONTEXT_TT_PASS_THRU); + agaw = level_to_agaw(iommu->nr_pt_levels); + } + else +@@ -1394,6 +1422,8 @@ int domain_context_mapping_one( + spin_unlock(&hd->arch.mapping_lock); + spin_unlock(&iommu->lock); + unmap_vtd_domain_page(context_entries); ++ if ( prev_dom ) ++ rcu_unlock_domain(prev_dom); + return -ENOMEM; + } + } +@@ -1411,33 +1441,102 @@ int domain_context_mapping_one( + goto nomem; + } + +- context_set_address_root(*context, pgd_maddr); ++ context_set_address_root(lctxt, pgd_maddr); + if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) ) +- context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB); ++ context_set_translation_type(lctxt, CONTEXT_TT_DEV_IOTLB); + else +- context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL); ++ context_set_translation_type(lctxt, CONTEXT_TT_MULTI_LEVEL); + + spin_unlock(&hd->arch.mapping_lock); + } + +- if ( context_set_domain_id(context, domain, iommu) ) ++ rc = context_set_domain_id(&lctxt, domain, iommu); ++ if ( rc ) + { ++ unlock: + spin_unlock(&iommu->lock); + unmap_vtd_domain_page(context_entries); +- return -EFAULT; ++ if ( prev_dom ) ++ rcu_unlock_domain(prev_dom); ++ return rc; ++ } ++ ++ if ( !prev_dom ) ++ { ++ context_set_address_width(lctxt, agaw); ++ context_set_fault_enable(lctxt); ++ context_set_present(lctxt); ++ } ++ else if ( prev_dom == domain ) ++ { ++ ASSERT(lctxt.full == context->full); ++ rc = !!pdev; ++ goto unlock; ++ } ++ else ++ { ++ ASSERT(context_address_width(lctxt) == agaw); ++ ASSERT(!context_fault_disable(lctxt)); ++ } ++ ++ if ( cpu_has_cx16 ) ++ { ++ __uint128_t res = cmpxchg16b(context, &old, &lctxt.full); ++ ++ /* ++ * Hardware does not update the context entry behind our backs, ++ * so the return value should match "old". 
++ */ ++ if ( res != old ) ++ { ++ if ( pdev ) ++ check_cleanup_domid_map(domain, pdev, iommu); ++ printk(XENLOG_ERR ++ "%04x:%02x:%02x.%u: unexpected context entry %016lx_%016lx (expected %016lx_%016lx)\n", ++ pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn), ++ (uint64_t)(res >> 64), (uint64_t)res, ++ (uint64_t)(old >> 64), (uint64_t)old); ++ rc = -EILSEQ; ++ goto unlock; ++ } ++ } ++ else if ( !prev_dom || !(mode & MAP_WITH_RMRR) ) ++ { ++ context_clear_present(*context); ++ iommu_sync_cache(context, sizeof(*context)); ++ ++ write_atomic(&context->hi, lctxt.hi); ++ /* No barrier should be needed between these two. */ ++ write_atomic(&context->lo, lctxt.lo); ++ } ++ else /* Best effort, updating DID last. */ ++ { ++ /* ++ * By non-atomically updating the context entry's DID field last, ++ * during a short window in time TLB entries with the old domain ID ++ * but the new page tables may be inserted. This could affect I/O ++ * of other devices using this same (old) domain ID. Such updating ++ * therefore is not a problem if this was the only device associated ++ * with the old domain ID. Diverting I/O of any of a dying domain's ++ * devices to the quarantine page tables is intended anyway. ++ */ ++ if ( !(mode & (MAP_OWNER_DYING | MAP_SINGLE_DEVICE)) ) ++ printk(XENLOG_WARNING VTDPREFIX ++ " %04x:%02x:%02x.%u: reassignment may cause %pd data corruption\n", ++ seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), prev_dom); ++ ++ write_atomic(&context->lo, lctxt.lo); ++ /* No barrier should be needed between these two. */ ++ write_atomic(&context->hi, lctxt.hi); + } + +- context_set_address_width(*context, agaw); +- context_set_fault_enable(*context); +- context_set_present(*context); + iommu_sync_cache(context, sizeof(struct context_entry)); + spin_unlock(&iommu->lock); + +- /* Context entry was previously non-present (with domid 0). */ +- rc = iommu_flush_context_device(iommu, 0, PCI_BDF2(bus, devfn), +- DMA_CCMD_MASK_NOBIT, 1); ++ rc = iommu_flush_context_device(iommu, prev_did, PCI_BDF2(bus, devfn), ++ DMA_CCMD_MASK_NOBIT, !prev_dom); + flush_dev_iotlb = !!find_ats_dev_drhd(iommu); +- ret = iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb); ++ ret = iommu_flush_iotlb_dsi(iommu, prev_did, !prev_dom, flush_dev_iotlb); + + /* + * The current logic for returns: +@@ -1458,12 +1557,21 @@ int domain_context_mapping_one( + unmap_vtd_domain_page(context_entries); + + if ( !seg && !rc ) +- rc = me_wifi_quirk(domain, bus, devfn, MAP_ME_PHANTOM_FUNC); ++ rc = me_wifi_quirk(domain, bus, devfn, mode); + + if ( rc ) +- domain_context_unmap_one(domain, iommu, bus, devfn); ++ { ++ if ( !prev_dom ) ++ domain_context_unmap_one(domain, iommu, bus, devfn); ++ else if ( prev_dom != domain ) /* Avoid infinite recursion. 
*/ ++ domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev, ++ mode & MAP_WITH_RMRR); ++ } + +- return rc; ++ if ( prev_dom ) ++ rcu_unlock_domain(prev_dom); ++ ++ return rc ?: pdev && prev_dom; + } + + static int domain_context_unmap(struct domain *d, uint8_t devfn, +@@ -1473,8 +1581,11 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + struct pci_dev *pdev) + { + struct acpi_drhd_unit *drhd; ++ const struct acpi_rmrr_unit *rmrr; + int ret = 0; +- u8 seg = pdev->seg, bus = pdev->bus, secbus; ++ unsigned int i, mode = 0; ++ uint16_t seg = pdev->seg, bdf; ++ uint8_t bus = pdev->bus, secbus; + + drhd = acpi_find_matched_drhd_unit(pdev); + if ( !drhd ) +@@ -1493,8 +1604,29 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + + ASSERT(pcidevs_locked()); + ++ for_each_rmrr_device( rmrr, bdf, i ) ++ { ++ if ( rmrr->segment != pdev->seg || bdf != pdev->sbdf.bdf ) ++ continue; ++ ++ mode |= MAP_WITH_RMRR; ++ break; ++ } ++ ++ if ( domain != pdev->domain ) ++ { ++ if ( pdev->domain->is_dying ) ++ mode |= MAP_OWNER_DYING; ++ else if ( drhd && ++ !any_pdev_behind_iommu(pdev->domain, pdev, drhd->iommu) && ++ !pdev->phantom_stride ) ++ mode |= MAP_SINGLE_DEVICE; ++ } ++ + switch ( pdev->type ) + { ++ bool prev_present; ++ + case DEV_TYPE_PCI_HOST_BRIDGE: + if ( iommu_debug ) + printk(VTDPREFIX "d%d:Hostbridge: skip %04x:%02x:%02x.%u map\n", +@@ -1515,7 +1647,9 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + domain->domain_id, seg, bus, + PCI_SLOT(devfn), PCI_FUNC(devfn)); + ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, +- pdev); ++ pdev, mode); ++ if ( ret > 0 ) ++ ret = 0; + if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 ) + enable_ats_device(pdev, &drhd->iommu->ats_devices); + +@@ -1528,9 +1662,10 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + PCI_SLOT(devfn), PCI_FUNC(devfn)); + + ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, +- pdev); +- if ( ret ) ++ pdev, mode); ++ if ( ret < 0 ) + break; ++ prev_present = ret; + + if ( (ret = find_upstream_bridge(seg, &bus, &devfn, &secbus)) < 1 ) + { +@@ -1538,6 +1673,15 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + break; + ret = -ENXIO; + } ++ /* ++ * Strictly speaking if the device is the only one behind this bridge ++ * and the only one with this (secbus,0,0) tuple, it could be allowed ++ * to be re-assigned regardless of RMRR presence. But let's deal with ++ * that case only if it is actually found in the wild. ++ */ ++ else if ( prev_present && (mode & MAP_WITH_RMRR) && ++ domain != pdev->domain ) ++ ret = -EOPNOTSUPP; + + /* + * Mapping a bridge should, if anything, pass the struct pci_dev of +@@ -1546,7 +1690,7 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + */ + if ( ret >= 0 ) + ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, +- NULL); ++ NULL, mode); + + /* + * Devices behind PCIe-to-PCI/PCIx bridge may generate different +@@ -1561,10 +1705,15 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE && + (secbus != pdev->bus || pdev->devfn != 0) ) + ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0, +- NULL); ++ NULL, mode); + + if ( ret ) +- domain_context_unmap(domain, devfn, pdev); ++ { ++ if ( !prev_present ) ++ domain_context_unmap(domain, devfn, pdev); ++ else if ( pdev->domain != domain ) /* Avoid infinite recursion. 
*/ ++ domain_context_mapping(pdev->domain, devfn, pdev); ++ } + + break; + +@@ -2331,9 +2480,8 @@ static int reassign_device_ownership( + { + int ret; + +- ret = domain_context_unmap(source, devfn, pdev); +- if ( ret ) +- return ret; ++ if ( !has_arch_pdevs(target) ) ++ vmx_pi_hooks_assign(target); + + /* + * Devices assigned to untrusted domains (here assumed to be any domU) +@@ -2343,6 +2491,31 @@ static int reassign_device_ownership( + if ( (target != hardware_domain) && !iommu_intremap ) + untrusted_msi = true; + ++ ret = domain_context_mapping(target, devfn, pdev); ++ if ( ret ) ++ { ++ if ( !has_arch_pdevs(target) ) ++ vmx_pi_hooks_deassign(target); ++ return ret; ++ } ++ ++ if ( pdev->devfn == devfn ) ++ { ++ const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); ++ ++ if ( drhd ) ++ check_cleanup_domid_map(source, pdev, drhd->iommu); ++ } ++ ++ if ( devfn == pdev->devfn && pdev->domain != target ) ++ { ++ list_move(&pdev->domain_list, &target->pdev_list); ++ pdev->domain = target; ++ } ++ ++ if ( !has_arch_pdevs(source) ) ++ vmx_pi_hooks_deassign(source); ++ + /* + * If the device belongs to the hardware domain, and it has RMRR, don't + * remove it from the hardware domain, because BIOS may use RMRR at +@@ -2371,34 +2544,7 @@ static int reassign_device_ownership( + } + } + +- if ( devfn == pdev->devfn && pdev->domain != dom_io ) +- { +- list_move(&pdev->domain_list, &dom_io->pdev_list); +- pdev->domain = dom_io; +- } +- +- if ( !has_arch_pdevs(source) ) +- vmx_pi_hooks_deassign(source); +- +- if ( !has_arch_pdevs(target) ) +- vmx_pi_hooks_assign(target); +- +- ret = domain_context_mapping(target, devfn, pdev); +- if ( ret ) +- { +- if ( !has_arch_pdevs(target) ) +- vmx_pi_hooks_deassign(target); +- +- return ret; +- } +- +- if ( devfn == pdev->devfn && pdev->domain != target ) +- { +- list_move(&pdev->domain_list, &target->pdev_list); +- pdev->domain = target; +- } +- +- return ret; ++ return 0; + } + + static int intel_iommu_assign_device( +diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h +index 32b39c606a..503b07ffb7 100644 +--- a/xen/drivers/passthrough/vtd/iommu.h ++++ b/xen/drivers/passthrough/vtd/iommu.h +@@ -202,8 +202,12 @@ struct root_entry { + do {(root).val |= ((value) & PAGE_MASK_4K);} while(0) + + struct context_entry { +- u64 lo; +- u64 hi; ++ union { ++ struct { ++ uint64_t lo, hi; ++ }; ++ __uint128_t full; ++ }; + }; + #define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry)) + #define context_present(c) ((c).lo & 1) +diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c +index 435e449ca3..99e159b4e9 100644 +--- a/xen/drivers/passthrough/vtd/quirks.c ++++ b/xen/drivers/passthrough/vtd/quirks.c +@@ -343,7 +343,8 @@ void __init platform_quirks_init(void) + */ + + static int __must_check map_me_phantom_function(struct domain *domain, +- u32 dev, int map) ++ unsigned int dev, ++ unsigned int mode) + { + struct acpi_drhd_unit *drhd; + struct pci_dev *pdev; +@@ -354,9 +355,9 @@ static int __must_check map_me_phantom_function(struct domain *domain, + drhd = acpi_find_matched_drhd_unit(pdev); + + /* map or unmap ME phantom function */ +- if ( map ) ++ if ( !(mode & UNMAP_ME_PHANTOM_FUNC) ) + rc = domain_context_mapping_one(domain, drhd->iommu, 0, +- PCI_DEVFN(dev, 7), NULL); ++ PCI_DEVFN(dev, 7), NULL, mode); + else + rc = domain_context_unmap_one(domain, drhd->iommu, 0, + PCI_DEVFN(dev, 7)); +@@ -364,7 +365,8 @@ static int __must_check map_me_phantom_function(struct domain 
*domain, + return rc; + } + +-int me_wifi_quirk(struct domain *domain, u8 bus, u8 devfn, int map) ++int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn, ++ unsigned int mode) + { + u32 id; + int rc = 0; +@@ -388,7 +390,7 @@ int me_wifi_quirk(struct domain *domain, u8 bus, u8 devfn, int map) + case 0x423b8086: + case 0x423c8086: + case 0x423d8086: +- rc = map_me_phantom_function(domain, 3, map); ++ rc = map_me_phantom_function(domain, 3, mode); + break; + default: + break; +@@ -414,7 +416,7 @@ int me_wifi_quirk(struct domain *domain, u8 bus, u8 devfn, int map) + case 0x42388086: /* Puma Peak */ + case 0x422b8086: + case 0x422c8086: +- rc = map_me_phantom_function(domain, 22, map); ++ rc = map_me_phantom_function(domain, 22, mode); + break; + default: + break; +diff --git a/xen/drivers/passthrough/vtd/vtd.h b/xen/drivers/passthrough/vtd/vtd.h +index bb8889f350..e4ab242fee 100644 +--- a/xen/drivers/passthrough/vtd/vtd.h ++++ b/xen/drivers/passthrough/vtd/vtd.h +@@ -22,8 +22,14 @@ + + #include <xen/iommu.h> + +-#define MAP_ME_PHANTOM_FUNC 1 +-#define UNMAP_ME_PHANTOM_FUNC 0 ++/* ++ * Values for domain_context_mapping_one()'s and me_wifi_quirk()'s "mode" ++ * parameters. ++ */ ++#define MAP_WITH_RMRR (1u << 0) ++#define MAP_OWNER_DYING (1u << 1) ++#define MAP_SINGLE_DEVICE (1u << 2) ++#define UNMAP_ME_PHANTOM_FUNC (1u << 3) + + /* Allow for both IOAPIC and IOSAPIC. */ + #define IO_xAPIC_route_entry IO_APIC_route_entry +-- +2.35.2 + + +From 73e25ecaef14d4df521235b6dbe5ceaaa3f02e8a Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 5 Apr 2022 15:24:23 +0200 +Subject: [PATCH 23/32] AMD/IOMMU: re-assign devices directly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Devices with unity map ranges, due to it being unspecified how/when +these memory ranges may get accessed, may not be left disconnected from +their unity mappings (as long as it's not certain that the device has +been fully quiesced). Hence rather than tearing down the old root page +table pointer and then establishing the new one, re-assignment needs to +be done in a single step. + +This is CVE-2022-26360 / part of XSA-400. + +Reported-by: Roger Pau Monné <roger.pau@citrix.com> + +Similarly quarantining scratch-page mode relies on page tables to be +continuously wired up. + +To avoid complicating things more than necessary, treat all devices +mostly equally, i.e. regardless of their association with any unity map +ranges. The main difference is when it comes to updating DTEs, which need +to be atomic when there are unity mappings. Yet atomicity can only be +achieved with CMPXCHG16B, availability of which we can't take for given. 
+ +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Paul Durrant <paul@xen.org> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: 1fa6e9aa36233fe9c29a204fcb2697e985b8345f +master date: 2022-04-05 14:18:04 +0200 +--- + xen/drivers/passthrough/amd/iommu_map.c | 67 ++++++- + xen/drivers/passthrough/amd/pci_amd_iommu.c | 180 +++++++++++++----- + xen/include/asm-x86/hvm/svm/amd-iommu-proto.h | 10 +- + 3 files changed, 200 insertions(+), 57 deletions(-) + +diff --git a/xen/drivers/passthrough/amd/iommu_map.c b/xen/drivers/passthrough/amd/iommu_map.c +index ac30cac05b..45559f9678 100644 +--- a/xen/drivers/passthrough/amd/iommu_map.c ++++ b/xen/drivers/passthrough/amd/iommu_map.c +@@ -103,10 +103,69 @@ static unsigned int set_iommu_pte_present(unsigned long pt_mfn, + return flush_flags; + } + +-void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte, +- uint64_t root_ptr, uint16_t domain_id, +- uint8_t paging_mode, bool valid) ++/* ++ * This function returns ++ * - -errno for errors, ++ * - 0 for a successful update, atomic when necessary ++ * - 1 for a successful but non-atomic update, which may need to be warned ++ * about by the caller. ++ */ ++int amd_iommu_set_root_page_table(struct amd_iommu_dte *dte, ++ uint64_t root_ptr, uint16_t domain_id, ++ uint8_t paging_mode, unsigned int flags) + { ++ bool valid = flags & SET_ROOT_VALID; ++ ++ if ( dte->v && dte->tv && ++ (cpu_has_cx16 || (flags & SET_ROOT_WITH_UNITY_MAP)) ) ++ { ++ union { ++ struct amd_iommu_dte dte; ++ uint64_t raw64[4]; ++ __uint128_t raw128[2]; ++ } ldte = { .dte = *dte }; ++ __uint128_t old = ldte.raw128[0]; ++ int ret = 0; ++ ++ ldte.dte.domain_id = domain_id; ++ ldte.dte.pt_root = paddr_to_pfn(root_ptr); ++ ldte.dte.iw = true; ++ ldte.dte.ir = true; ++ ldte.dte.paging_mode = paging_mode; ++ ldte.dte.v = valid; ++ ++ if ( cpu_has_cx16 ) ++ { ++ __uint128_t res = cmpxchg16b(dte, &old, &ldte.raw128[0]); ++ ++ /* ++ * Hardware does not update the DTE behind our backs, so the ++ * return value should match "old". ++ */ ++ if ( res != old ) ++ { ++ printk(XENLOG_ERR ++ "Dom%d: unexpected DTE %016lx_%016lx (expected %016lx_%016lx)\n", ++ domain_id, ++ (uint64_t)(res >> 64), (uint64_t)res, ++ (uint64_t)(old >> 64), (uint64_t)old); ++ ret = -EILSEQ; ++ } ++ } ++ else /* Best effort, updating domain_id last. */ ++ { ++ uint64_t *ptr = (void *)dte; ++ ++ write_atomic(ptr + 0, ldte.raw64[0]); ++ /* No barrier should be needed between these two. 
*/ ++ write_atomic(ptr + 1, ldte.raw64[1]); ++ ++ ret = 1; ++ } ++ ++ return ret; ++ } ++ + if ( valid || dte->v ) + { + dte->tv = false; +@@ -121,6 +180,8 @@ void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte, + smp_wmb(); + dte->tv = true; + dte->v = valid; ++ ++ return 0; + } + + void amd_iommu_set_intremap_table( +diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c +index beafb0171d..14483e85ae 100644 +--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c ++++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c +@@ -85,40 +85,81 @@ int get_dma_requestor_id(uint16_t seg, uint16_t bdf) + return req_id; + } + +-static void amd_iommu_setup_domain_device( ++static int __must_check allocate_domain_resources(struct domain_iommu *hd) ++{ ++ int rc; ++ ++ spin_lock(&hd->arch.mapping_lock); ++ rc = amd_iommu_alloc_root(hd); ++ spin_unlock(&hd->arch.mapping_lock); ++ ++ return rc; ++} ++ ++static bool any_pdev_behind_iommu(const struct domain *d, ++ const struct pci_dev *exclude, ++ const struct amd_iommu *iommu) ++{ ++ const struct pci_dev *pdev; ++ ++ for_each_pdev ( d, pdev ) ++ { ++ if ( pdev == exclude ) ++ continue; ++ ++ if ( find_iommu_for_device(pdev->seg, pdev->sbdf.bdf) == iommu ) ++ return true; ++ } ++ ++ return false; ++} ++ ++static int __must_check amd_iommu_setup_domain_device( + struct domain *domain, struct amd_iommu *iommu, + uint8_t devfn, struct pci_dev *pdev) + { + struct amd_iommu_dte *table, *dte; + unsigned long flags; +- int req_id, valid = 1; ++ unsigned int req_id, sr_flags; ++ int rc; + u8 bus = pdev->bus; +- const struct domain_iommu *hd = dom_iommu(domain); ++ struct domain_iommu *hd = dom_iommu(domain); ++ const struct ivrs_mappings *ivrs_dev; ++ ++ BUG_ON(!hd->arch.paging_mode || !iommu->dev_table.buffer); + +- BUG_ON( !hd->arch.root_table || !hd->arch.paging_mode || +- !iommu->dev_table.buffer ); ++ rc = allocate_domain_resources(hd); ++ if ( rc ) ++ return rc; + +- if ( iommu_hwdom_passthrough && is_hardware_domain(domain) ) +- valid = 0; ++ req_id = get_dma_requestor_id(iommu->seg, pdev->sbdf.bdf); ++ ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id]; ++ sr_flags = (iommu_hwdom_passthrough && is_hardware_domain(domain) ++ ? 0 : SET_ROOT_VALID) ++ | (ivrs_dev->unity_map ? SET_ROOT_WITH_UNITY_MAP : 0); + + /* get device-table entry */ + req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn)); + table = iommu->dev_table.buffer; + dte = &table[req_id]; ++ ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id]; + + spin_lock_irqsave(&iommu->lock, flags); + + if ( !dte->v || !dte->tv ) + { +- const struct ivrs_mappings *ivrs_dev; +- + /* bind DTE to domain page-tables */ +- amd_iommu_set_root_page_table( +- dte, page_to_maddr(hd->arch.root_table), domain->domain_id, +- hd->arch.paging_mode, valid); ++ rc = amd_iommu_set_root_page_table( ++ dte, page_to_maddr(hd->arch.root_table), ++ domain->domain_id, hd->arch.paging_mode, sr_flags); ++ if ( rc ) ++ { ++ ASSERT(rc < 0); ++ spin_unlock_irqrestore(&iommu->lock, flags); ++ return rc; ++ } + + /* Undo what amd_iommu_disable_domain_device() may have done. 
*/ +- ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id]; + if ( dte->it_root ) + { + dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED; +@@ -133,17 +174,74 @@ static void amd_iommu_setup_domain_device( + dte->i = ats_enabled; + + amd_iommu_flush_device(iommu, req_id); ++ } ++ else if ( dte->pt_root != mfn_x(page_to_mfn(hd->arch.root_table)) ) ++ { ++ /* ++ * Strictly speaking if the device is the only one with this requestor ++ * ID, it could be allowed to be re-assigned regardless of unity map ++ * presence. But let's deal with that case only if it is actually ++ * found in the wild. ++ */ ++ if ( req_id != PCI_BDF2(bus, devfn) && ++ (sr_flags & SET_ROOT_WITH_UNITY_MAP) ) ++ rc = -EOPNOTSUPP; ++ else ++ rc = amd_iommu_set_root_page_table( ++ dte, page_to_maddr(hd->arch.root_table), ++ domain->domain_id, hd->arch.paging_mode, sr_flags); ++ if ( rc < 0 ) ++ { ++ spin_unlock_irqrestore(&iommu->lock, flags); ++ return rc; ++ } ++ if ( rc && ++ domain != pdev->domain && ++ /* ++ * By non-atomically updating the DTE's domain ID field last, ++ * during a short window in time TLB entries with the old domain ++ * ID but the new page tables may have been inserted. This could ++ * affect I/O of other devices using this same (old) domain ID. ++ * Such updating therefore is not a problem if this was the only ++ * device associated with the old domain ID. Diverting I/O of any ++ * of a dying domain's devices to the quarantine page tables is ++ * intended anyway. ++ */ ++ !pdev->domain->is_dying && ++ (any_pdev_behind_iommu(pdev->domain, pdev, iommu) || ++ pdev->phantom_stride) ) ++ printk(" %04x:%02x:%02x.%u: reassignment may cause %pd data corruption\n", ++ pdev->seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), ++ pdev->domain); ++ ++ /* ++ * Check remaining settings are still in place from an earlier call ++ * here. They're all independent of the domain, so should not have ++ * changed. 
++ */ ++ if ( dte->it_root ) ++ ASSERT(dte->int_ctl == IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED); ++ ASSERT(dte->iv == iommu_intremap); ++ ASSERT(dte->ex == ivrs_dev->dte_allow_exclusion); ++ ASSERT(dte->sys_mgt == MASK_EXTR(ivrs_dev->device_flags, ++ ACPI_IVHD_SYSTEM_MGMT)); + +- AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, " +- "root table = %#"PRIx64", " +- "domain = %d, paging mode = %d\n", +- req_id, pdev->type, +- page_to_maddr(hd->arch.root_table), +- domain->domain_id, hd->arch.paging_mode); ++ if ( pci_ats_device(iommu->seg, bus, pdev->devfn) && ++ iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) ) ++ ASSERT(dte->i == ats_enabled); ++ ++ amd_iommu_flush_device(iommu, req_id); + } + + spin_unlock_irqrestore(&iommu->lock, flags); + ++ AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, " ++ "root table = %#"PRIx64", " ++ "domain = %d, paging mode = %d\n", ++ req_id, pdev->type, ++ page_to_maddr(hd->arch.root_table), ++ domain->domain_id, hd->arch.paging_mode); ++ + ASSERT(pcidevs_locked()); + + if ( pci_ats_device(iommu->seg, bus, pdev->devfn) && +@@ -154,6 +252,8 @@ static void amd_iommu_setup_domain_device( + + amd_iommu_flush_iotlb(devfn, pdev, INV_IOMMU_ALL_PAGES_ADDRESS, 0); + } ++ ++ return 0; + } + + int __init acpi_ivrs_init(void) +@@ -223,17 +323,6 @@ int amd_iommu_alloc_root(struct domain_iommu *hd) + return 0; + } + +-static int __must_check allocate_domain_resources(struct domain_iommu *hd) +-{ +- int rc; +- +- spin_lock(&hd->arch.mapping_lock); +- rc = amd_iommu_alloc_root(hd); +- spin_unlock(&hd->arch.mapping_lock); +- +- return rc; +-} +- + int __read_mostly amd_iommu_min_paging_mode = 1; + + static int amd_iommu_domain_init(struct domain *d) +@@ -333,7 +422,6 @@ static int reassign_device(struct domain *source, struct domain *target, + { + struct amd_iommu *iommu; + int bdf, rc; +- struct domain_iommu *t = dom_iommu(target); + const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg); + + bdf = PCI_BDF2(pdev->bus, pdev->devfn); +@@ -347,7 +435,15 @@ static int reassign_device(struct domain *source, struct domain *target, + return -ENODEV; + } + +- amd_iommu_disable_domain_device(source, iommu, devfn, pdev); ++ rc = amd_iommu_setup_domain_device(target, iommu, devfn, pdev); ++ if ( rc ) ++ return rc; ++ ++ if ( devfn == pdev->devfn && pdev->domain != target ) ++ { ++ list_move(&pdev->domain_list, &target->pdev_list); ++ pdev->domain = target; ++ } + + /* + * If the device belongs to the hardware domain, and it has a unity mapping, +@@ -363,27 +459,10 @@ static int reassign_device(struct domain *source, struct domain *target, + return rc; + } + +- if ( devfn == pdev->devfn && pdev->domain != dom_io ) +- { +- list_move(&pdev->domain_list, &dom_io->pdev_list); +- pdev->domain = dom_io; +- } +- +- rc = allocate_domain_resources(t); +- if ( rc ) +- return rc; +- +- amd_iommu_setup_domain_device(target, iommu, devfn, pdev); + AMD_IOMMU_DEBUG("Re-assign %04x:%02x:%02x.%u from dom%d to dom%d\n", + pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn), + source->domain_id, target->domain_id); + +- if ( devfn == pdev->devfn && pdev->domain != target ) +- { +- list_move(&pdev->domain_list, &target->pdev_list); +- pdev->domain = target; +- } +- + return 0; + } + +@@ -547,8 +626,7 @@ static int amd_iommu_add_device(u8 devfn, struct pci_dev *pdev) + spin_unlock_irqrestore(&iommu->lock, flags); + } + +- amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev); +- return 0; ++ return amd_iommu_setup_domain_device(pdev->domain, 
iommu, devfn, pdev); + } + + static int amd_iommu_remove_device(u8 devfn, struct pci_dev *pdev) +diff --git a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h +index 3983293540..52c889ade0 100644 +--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h ++++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h +@@ -79,9 +79,13 @@ void amd_iommu_set_intremap_table(struct amd_iommu_dte *dte, + const void *ptr, + const struct amd_iommu *iommu, + bool valid); +-void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte, +- uint64_t root_ptr, uint16_t domain_id, +- uint8_t paging_mode, bool valid); ++#define SET_ROOT_VALID (1u << 0) ++#define SET_ROOT_WITH_UNITY_MAP (1u << 1) ++int __must_check amd_iommu_set_root_page_table(struct amd_iommu_dte *dte, ++ uint64_t root_ptr, ++ uint16_t domain_id, ++ uint8_t paging_mode, ++ unsigned int flags); + void iommu_dte_add_device_entry(struct amd_iommu_dte *dte, + const struct ivrs_mappings *ivrs_dev); + void iommu_dte_set_guest_cr3(struct amd_iommu_dte *dte, uint16_t dom_id, +-- +2.35.2 + + +From 92acf6b23154d65066ec4702fdca5cf232856d90 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 5 Apr 2022 15:24:54 +0200 +Subject: [PATCH 24/32] VT-d: prepare for per-device quarantine page tables + (part I) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Arrange for domain ID and page table root to be passed around, the latter in +particular to domain_pgd_maddr() such that taking it from the per-domain +fields can be overridden. + +No functional change intended. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Paul Durrant <paul@xen.org> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Kevin Tian <kevin.tian@intel.com> +master commit: eb19326a328d49a6a4dc3930391b340f3bcd8948 +master date: 2022-04-05 14:18:26 +0200 +--- + xen/drivers/passthrough/vtd/extern.h | 8 ++-- + xen/drivers/passthrough/vtd/iommu.c | 62 ++++++++++++++++++---------- + xen/drivers/passthrough/vtd/quirks.c | 13 +++--- + 3 files changed, 54 insertions(+), 29 deletions(-) + +diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h +index f51f8aae0d..897dcff9ff 100644 +--- a/xen/drivers/passthrough/vtd/extern.h ++++ b/xen/drivers/passthrough/vtd/extern.h +@@ -86,9 +86,10 @@ void *map_vtd_domain_page(u64 maddr); + void unmap_vtd_domain_page(void *va); + int domain_context_mapping_one(struct domain *domain, struct vtd_iommu *iommu, + uint8_t bus, uint8_t devfn, +- const struct pci_dev *pdev, unsigned int mode); ++ const struct pci_dev *pdev, domid_t domid, ++ paddr_t pgd_maddr, unsigned int mode); + int domain_context_unmap_one(struct domain *domain, struct vtd_iommu *iommu, +- u8 bus, u8 devfn); ++ uint8_t bus, uint8_t devfn, domid_t domid); + int intel_iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt); + + unsigned int io_apic_read_remap_rte(unsigned int apic, unsigned int reg); +@@ -107,7 +108,8 @@ void platform_quirks_init(void); + void vtd_ops_preamble_quirk(struct vtd_iommu *iommu); + void vtd_ops_postamble_quirk(struct vtd_iommu *iommu); + int __must_check me_wifi_quirk(struct domain *domain, uint8_t bus, +- uint8_t devfn, unsigned int mode); ++ uint8_t devfn, domid_t domid, paddr_t pgd_maddr, ++ unsigned int mode); + void pci_vtd_quirk(const struct pci_dev *); + void quirk_iommu_caps(struct vtd_iommu *iommu); + +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index 
17deda92d8..ac2c73e32a 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -1364,12 +1364,12 @@ int domain_context_mapping_one( + struct domain *domain, + struct vtd_iommu *iommu, + uint8_t bus, uint8_t devfn, const struct pci_dev *pdev, +- unsigned int mode) ++ domid_t domid, paddr_t pgd_maddr, unsigned int mode) + { + struct domain_iommu *hd = dom_iommu(domain); + struct context_entry *context, *context_entries, lctxt; + __uint128_t old; +- u64 maddr, pgd_maddr; ++ uint64_t maddr; + uint16_t seg = iommu->drhd->segment, prev_did = 0; + struct domain *prev_dom = NULL; + int agaw, rc, ret; +@@ -1410,10 +1410,12 @@ int domain_context_mapping_one( + } + else + { ++ paddr_t root = pgd_maddr; ++ + spin_lock(&hd->arch.mapping_lock); + + /* Ensure we have pagetables allocated down to leaf PTE. */ +- if ( hd->arch.pgd_maddr == 0 ) ++ if ( !root ) + { + addr_to_dma_page_maddr(domain, 0, 1); + if ( hd->arch.pgd_maddr == 0 ) +@@ -1426,22 +1428,24 @@ int domain_context_mapping_one( + rcu_unlock_domain(prev_dom); + return -ENOMEM; + } ++ ++ root = hd->arch.pgd_maddr; + } + + /* Skip top levels of page tables for 2- and 3-level DRHDs. */ +- pgd_maddr = hd->arch.pgd_maddr; + for ( agaw = level_to_agaw(4); + agaw != level_to_agaw(iommu->nr_pt_levels); + agaw-- ) + { +- struct dma_pte *p = map_vtd_domain_page(pgd_maddr); +- pgd_maddr = dma_pte_addr(*p); ++ struct dma_pte *p = map_vtd_domain_page(root); ++ ++ root = dma_pte_addr(*p); + unmap_vtd_domain_page(p); +- if ( pgd_maddr == 0 ) ++ if ( !root ) + goto nomem; + } + +- context_set_address_root(lctxt, pgd_maddr); ++ context_set_address_root(lctxt, root); + if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) ) + context_set_translation_type(lctxt, CONTEXT_TT_DEV_IOTLB); + else +@@ -1557,15 +1561,21 @@ int domain_context_mapping_one( + unmap_vtd_domain_page(context_entries); + + if ( !seg && !rc ) +- rc = me_wifi_quirk(domain, bus, devfn, mode); ++ rc = me_wifi_quirk(domain, bus, devfn, domid, pgd_maddr, mode); + + if ( rc ) + { + if ( !prev_dom ) +- domain_context_unmap_one(domain, iommu, bus, devfn); ++ domain_context_unmap_one(domain, iommu, bus, devfn, ++ domain->domain_id); + else if ( prev_dom != domain ) /* Avoid infinite recursion. 
*/ ++ { ++ hd = dom_iommu(prev_dom); + domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev, ++ domain->domain_id, ++ hd->arch.pgd_maddr, + mode & MAP_WITH_RMRR); ++ } + } + + if ( prev_dom ) +@@ -1582,6 +1592,7 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + { + struct acpi_drhd_unit *drhd; + const struct acpi_rmrr_unit *rmrr; ++ paddr_t pgd_maddr = dom_iommu(domain)->arch.pgd_maddr; + int ret = 0; + unsigned int i, mode = 0; + uint16_t seg = pdev->seg, bdf; +@@ -1647,7 +1658,8 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + domain->domain_id, seg, bus, + PCI_SLOT(devfn), PCI_FUNC(devfn)); + ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, +- pdev, mode); ++ pdev, domain->domain_id, pgd_maddr, ++ mode); + if ( ret > 0 ) + ret = 0; + if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 ) +@@ -1662,7 +1674,8 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + PCI_SLOT(devfn), PCI_FUNC(devfn)); + + ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, +- pdev, mode); ++ pdev, domain->domain_id, pgd_maddr, ++ mode); + if ( ret < 0 ) + break; + prev_present = ret; +@@ -1690,7 +1703,8 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + */ + if ( ret >= 0 ) + ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, +- NULL, mode); ++ NULL, domain->domain_id, pgd_maddr, ++ mode); + + /* + * Devices behind PCIe-to-PCI/PCIx bridge may generate different +@@ -1705,7 +1719,8 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE && + (secbus != pdev->bus || pdev->devfn != 0) ) + ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0, +- NULL, mode); ++ NULL, domain->domain_id, pgd_maddr, ++ mode); + + if ( ret ) + { +@@ -1734,7 +1749,7 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + int domain_context_unmap_one( + struct domain *domain, + struct vtd_iommu *iommu, +- u8 bus, u8 devfn) ++ uint8_t bus, uint8_t devfn, domid_t domid) + { + struct context_entry *context, *context_entries; + u64 maddr; +@@ -1792,7 +1807,7 @@ int domain_context_unmap_one( + unmap_vtd_domain_page(context_entries); + + if ( !iommu->drhd->segment && !rc ) +- rc = me_wifi_quirk(domain, bus, devfn, UNMAP_ME_PHANTOM_FUNC); ++ rc = me_wifi_quirk(domain, bus, devfn, domid, 0, UNMAP_ME_PHANTOM_FUNC); + + if ( rc && !is_hardware_domain(domain) && domain != dom_io ) + { +@@ -1844,7 +1859,8 @@ static int domain_context_unmap(struct domain *domain, u8 devfn, + printk(VTDPREFIX "d%d:PCIe: unmap %04x:%02x:%02x.%u\n", + domain->domain_id, seg, bus, + PCI_SLOT(devfn), PCI_FUNC(devfn)); +- ret = domain_context_unmap_one(domain, iommu, bus, devfn); ++ ret = domain_context_unmap_one(domain, iommu, bus, devfn, ++ domain->domain_id); + if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 ) + disable_ats_device(pdev); + +@@ -1854,7 +1870,8 @@ static int domain_context_unmap(struct domain *domain, u8 devfn, + if ( iommu_debug ) + printk(VTDPREFIX "d%d:PCI: unmap %04x:%02x:%02x.%u\n", + domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); +- ret = domain_context_unmap_one(domain, iommu, bus, devfn); ++ ret = domain_context_unmap_one(domain, iommu, bus, devfn, ++ domain->domain_id); + if ( ret ) + break; + +@@ -1880,12 +1897,15 @@ static int domain_context_unmap(struct domain *domain, u8 devfn, + /* PCIe to PCI/PCIx bridge */ + if ( pdev_type(seg, tmp_bus, tmp_devfn) == 
DEV_TYPE_PCIe2PCI_BRIDGE ) + { +- ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn); ++ ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn, ++ domain->domain_id); + if ( !ret ) +- ret = domain_context_unmap_one(domain, iommu, secbus, 0); ++ ret = domain_context_unmap_one(domain, iommu, secbus, 0, ++ domain->domain_id); + } + else /* Legacy PCI bridge */ +- ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn); ++ ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn, ++ domain->domain_id); + + break; + +diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c +index 99e159b4e9..4d54c21136 100644 +--- a/xen/drivers/passthrough/vtd/quirks.c ++++ b/xen/drivers/passthrough/vtd/quirks.c +@@ -344,6 +344,8 @@ void __init platform_quirks_init(void) + + static int __must_check map_me_phantom_function(struct domain *domain, + unsigned int dev, ++ domid_t domid, ++ paddr_t pgd_maddr, + unsigned int mode) + { + struct acpi_drhd_unit *drhd; +@@ -357,16 +359,17 @@ static int __must_check map_me_phantom_function(struct domain *domain, + /* map or unmap ME phantom function */ + if ( !(mode & UNMAP_ME_PHANTOM_FUNC) ) + rc = domain_context_mapping_one(domain, drhd->iommu, 0, +- PCI_DEVFN(dev, 7), NULL, mode); ++ PCI_DEVFN(dev, 7), NULL, ++ domid, pgd_maddr, mode); + else + rc = domain_context_unmap_one(domain, drhd->iommu, 0, +- PCI_DEVFN(dev, 7)); ++ PCI_DEVFN(dev, 7), domid); + + return rc; + } + + int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn, +- unsigned int mode) ++ domid_t domid, paddr_t pgd_maddr, unsigned int mode) + { + u32 id; + int rc = 0; +@@ -390,7 +393,7 @@ int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn, + case 0x423b8086: + case 0x423c8086: + case 0x423d8086: +- rc = map_me_phantom_function(domain, 3, mode); ++ rc = map_me_phantom_function(domain, 3, domid, pgd_maddr, mode); + break; + default: + break; +@@ -416,7 +419,7 @@ int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn, + case 0x42388086: /* Puma Peak */ + case 0x422b8086: + case 0x422c8086: +- rc = map_me_phantom_function(domain, 22, mode); ++ rc = map_me_phantom_function(domain, 22, domid, pgd_maddr, mode); + break; + default: + break; +-- +2.35.2 + + +From ab37463eec5724036059d7df027ca13d66368211 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 5 Apr 2022 15:25:26 +0200 +Subject: [PATCH 25/32] VT-d: prepare for per-device quarantine page tables + (part II) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Replace the passing of struct domain * by domid_t in preparation of +per-device quarantine page tables also requiring per-device pseudo +domain IDs, which aren't going to be associated with any struct domain +instances. + +No functional change intended (except for slightly adjusted log message +text). 
+ +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Paul Durrant <paul@xen.org> +Reviewed-by: Kevin Tian <kevin.tian@intel.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: 7131163c4806e3c7de24873164d1a003d2a27dee +master date: 2022-04-05 14:18:48 +0200 +--- + xen/drivers/passthrough/vtd/iommu.c | 33 ++++++++++++++--------------- + 1 file changed, 16 insertions(+), 17 deletions(-) + +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index ac2c73e32a..6388d97d26 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -52,8 +52,8 @@ static struct tasklet vtd_fault_tasklet; + static int setup_hwdom_device(u8 devfn, struct pci_dev *); + static void setup_hwdom_rmrr(struct domain *d); + +-static int domain_iommu_domid(struct domain *d, +- struct vtd_iommu *iommu) ++static int get_iommu_did(domid_t domid, const struct vtd_iommu *iommu, ++ bool warn) + { + unsigned long nr_dom, i; + +@@ -61,16 +61,16 @@ static int domain_iommu_domid(struct domain *d, + i = find_first_bit(iommu->domid_bitmap, nr_dom); + while ( i < nr_dom ) + { +- if ( iommu->domid_map[i] == d->domain_id ) ++ if ( iommu->domid_map[i] == domid ) + return i; + + i = find_next_bit(iommu->domid_bitmap, nr_dom, i+1); + } + +- if ( !d->is_dying ) ++ if ( warn ) + dprintk(XENLOG_ERR VTDPREFIX, +- "Cannot get valid iommu %u domid: %pd\n", +- iommu->index, d); ++ "No valid iommu %u domid for Dom%d\n", ++ iommu->index, domid); + + return -1; + } +@@ -78,8 +78,7 @@ static int domain_iommu_domid(struct domain *d, + #define DID_FIELD_WIDTH 16 + #define DID_HIGH_OFFSET 8 + static int context_set_domain_id(struct context_entry *context, +- struct domain *d, +- struct vtd_iommu *iommu) ++ domid_t domid, struct vtd_iommu *iommu) + { + unsigned long nr_dom, i; + int found = 0; +@@ -90,7 +89,7 @@ static int context_set_domain_id(struct context_entry *context, + i = find_first_bit(iommu->domid_bitmap, nr_dom); + while ( i < nr_dom ) + { +- if ( iommu->domid_map[i] == d->domain_id ) ++ if ( iommu->domid_map[i] == domid ) + { + found = 1; + break; +@@ -106,7 +105,7 @@ static int context_set_domain_id(struct context_entry *context, + dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no free domain ids\n"); + return -EFAULT; + } +- iommu->domid_map[i] = d->domain_id; ++ iommu->domid_map[i] = domid; + } + + set_bit(i, iommu->domid_bitmap); +@@ -115,9 +114,9 @@ static int context_set_domain_id(struct context_entry *context, + return 0; + } + +-static void cleanup_domid_map(struct domain *domain, struct vtd_iommu *iommu) ++static void cleanup_domid_map(domid_t domid, struct vtd_iommu *iommu) + { +- int iommu_domid = domain_iommu_domid(domain, iommu); ++ int iommu_domid = get_iommu_did(domid, iommu, false); + + if ( iommu_domid >= 0 ) + { +@@ -173,7 +172,7 @@ static void check_cleanup_domid_map(struct domain *d, + if ( !found ) + { + clear_bit(iommu->index, &dom_iommu(d)->arch.iommu_bitmap); +- cleanup_domid_map(d, iommu); ++ cleanup_domid_map(d->domain_id, iommu); + } + } + +@@ -630,7 +629,7 @@ static int __must_check iommu_flush_iotlb(struct domain *d, dfn_t dfn, + continue; + + flush_dev_iotlb = !!find_ats_dev_drhd(iommu); +- iommu_domid= domain_iommu_domid(d, iommu); ++ iommu_domid = get_iommu_did(d->domain_id, iommu, !d->is_dying); + if ( iommu_domid == -1 ) + continue; + +@@ -1454,7 +1453,7 @@ int domain_context_mapping_one( + spin_unlock(&hd->arch.mapping_lock); + } + +- rc = context_set_domain_id(&lctxt, domain, iommu); ++ rc = 
context_set_domain_id(&lctxt, domid, iommu); + if ( rc ) + { + unlock: +@@ -1774,7 +1773,7 @@ int domain_context_unmap_one( + context_clear_entry(*context); + iommu_sync_cache(context, sizeof(struct context_entry)); + +- iommu_domid= domain_iommu_domid(domain, iommu); ++ iommu_domid = get_iommu_did(domid, iommu, !domain->is_dying); + if ( iommu_domid == -1 ) + { + spin_unlock(&iommu->lock); +@@ -1948,7 +1947,7 @@ static void iommu_domain_teardown(struct domain *d) + spin_unlock(&hd->arch.mapping_lock); + + for_each_drhd_unit ( drhd ) +- cleanup_domid_map(d, drhd->iommu); ++ cleanup_domid_map(d->domain_id, drhd->iommu); + } + + static int __must_check intel_iommu_map_page(struct domain *d, dfn_t dfn, +-- +2.35.2 + + +From 7cfe3570b1c0b4b19317145fbe4c776f09768fd5 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 5 Apr 2022 15:25:54 +0200 +Subject: [PATCH 26/32] IOMMU/x86: maintain a per-device pseudo domain ID +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +In order to subsequently enable per-device quarantine page tables, we'll +need domain-ID-like identifiers to be inserted in the respective device +(AMD) or context (Intel) table entries alongside the per-device page +table root addresses. + +Make use of "real" domain IDs occupying only half of the value range +coverable by domid_t. + +Note that in VT-d's iommu_alloc() I didn't want to introduce new memory +leaks in case of error, but existing ones don't get plugged - that'll be +the subject of a later change. + +The VT-d changes are slightly asymmetric, but this way we can avoid +assigning pseudo domain IDs to devices which would never be mapped while +still avoiding to add a new parameter to domain_context_unmap(). + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Paul Durrant <paul@xen.org> +Reviewed-by: Kevin Tian <kevin.tian@intel.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: 97af062b89d52c0ecf7af254b53345c97d438e33 +master date: 2022-04-05 14:19:10 +0200 +--- + xen/drivers/passthrough/amd/iommu_detect.c | 8 +++ + xen/drivers/passthrough/amd/pci_amd_iommu.c | 22 ++++++- + xen/drivers/passthrough/pci.c | 11 +++- + xen/drivers/passthrough/vtd/iommu.c | 69 +++++++++++++++++---- + xen/drivers/passthrough/vtd/iommu.h | 1 + + xen/drivers/passthrough/x86/iommu.c | 47 ++++++++++++++ + xen/include/asm-x86/amd-iommu.h | 1 + + xen/include/asm-x86/iommu.h | 4 ++ + xen/include/asm-x86/pci.h | 6 ++ + xen/include/public/xen.h | 3 + + 10 files changed, 156 insertions(+), 16 deletions(-) + +diff --git a/xen/drivers/passthrough/amd/iommu_detect.c b/xen/drivers/passthrough/amd/iommu_detect.c +index d782e66eee..0df10f25b0 100644 +--- a/xen/drivers/passthrough/amd/iommu_detect.c ++++ b/xen/drivers/passthrough/amd/iommu_detect.c +@@ -183,6 +183,11 @@ int __init amd_iommu_detect_one_acpi( + if ( rt ) + goto out; + ++ iommu->domid_map = iommu_init_domid(); ++ rt = -ENOMEM; ++ if ( !iommu->domid_map ) ++ goto out; ++ + rt = pci_ro_device(iommu->seg, bus, PCI_DEVFN(dev, func)); + if ( rt ) + printk(XENLOG_ERR +@@ -194,7 +199,10 @@ int __init amd_iommu_detect_one_acpi( + + out: + if ( rt ) ++ { ++ xfree(iommu->domid_map); + xfree(iommu); ++ } + + return rt; + } +diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c +index 14483e85ae..b07091e71e 100644 +--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c ++++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c +@@ -563,6 +563,8 @@ static int 
amd_iommu_add_device(u8 devfn, struct pci_dev *pdev) + struct amd_iommu *iommu; + u16 bdf; + struct ivrs_mappings *ivrs_mappings; ++ bool fresh_domid = false; ++ int ret; + + if ( !pdev->domain ) + return -EINVAL; +@@ -626,7 +628,22 @@ static int amd_iommu_add_device(u8 devfn, struct pci_dev *pdev) + spin_unlock_irqrestore(&iommu->lock, flags); + } + +- return amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev); ++ if ( iommu_quarantine && pdev->arch.pseudo_domid == DOMID_INVALID ) ++ { ++ pdev->arch.pseudo_domid = iommu_alloc_domid(iommu->domid_map); ++ if ( pdev->arch.pseudo_domid == DOMID_INVALID ) ++ return -ENOSPC; ++ fresh_domid = true; ++ } ++ ++ ret = amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev); ++ if ( ret && fresh_domid ) ++ { ++ iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map); ++ pdev->arch.pseudo_domid = DOMID_INVALID; ++ } ++ ++ return ret; + } + + static int amd_iommu_remove_device(u8 devfn, struct pci_dev *pdev) +@@ -651,6 +668,9 @@ static int amd_iommu_remove_device(u8 devfn, struct pci_dev *pdev) + + amd_iommu_disable_domain_device(pdev->domain, iommu, devfn, pdev); + ++ iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map); ++ pdev->arch.pseudo_domid = DOMID_INVALID; ++ + ivrs_mappings = get_ivrs_mappings(pdev->seg); + bdf = PCI_BDF2(pdev->bus, devfn); + if ( amd_iommu_perdev_intremap && +diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c +index 32510351cf..97e42261eb 100644 +--- a/xen/drivers/passthrough/pci.c ++++ b/xen/drivers/passthrough/pci.c +@@ -338,6 +338,7 @@ static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn) + *((u8*) &pdev->bus) = bus; + *((u8*) &pdev->devfn) = devfn; + pdev->domain = NULL; ++ pdev->arch.pseudo_domid = DOMID_INVALID; + INIT_LIST_HEAD(&pdev->msi_list); + + pos = pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), +@@ -1353,9 +1354,13 @@ static int _dump_pci_devices(struct pci_seg *pseg, void *arg) + + list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list ) + { +- printk("%04x:%02x:%02x.%u - %pd - node %-3d - MSIs < ", +- pseg->nr, pdev->bus, +- PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn), pdev->domain, ++ printk("%04x:%02x:%02x.%u - ", pseg->nr, pdev->bus, ++ PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); ++ if ( pdev->domain == dom_io ) ++ printk("DomIO:%x", pdev->arch.pseudo_domid); ++ else ++ printk("%pd", pdev->domain); ++ printk(" - node %-3d - MSIs < ", + (pdev->node != NUMA_NO_NODE) ? 
pdev->node : -1); + list_for_each_entry ( msi, &pdev->msi_list, list ) + printk("%d ", msi->irq); +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index 6388d97d26..fc89f3e4c5 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -22,6 +22,7 @@ + #include <xen/sched.h> + #include <xen/xmalloc.h> + #include <xen/domain_page.h> ++#include <xen/err.h> + #include <xen/iocap.h> + #include <xen/iommu.h> + #include <xen/numa.h> +@@ -1192,7 +1193,7 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd) + { + struct vtd_iommu *iommu; + unsigned long sagaw, nr_dom; +- int agaw; ++ int agaw, rc; + + if ( nr_iommus >= MAX_IOMMUS ) + { +@@ -1285,7 +1286,16 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd) + if ( !iommu->domid_map ) + return -ENOMEM; + ++ iommu->pseudo_domid_map = iommu_init_domid(); ++ rc = -ENOMEM; ++ if ( !iommu->pseudo_domid_map ) ++ goto free; ++ + return 0; ++ ++ free: ++ iommu_free(drhd); ++ return rc; + } + + void __init iommu_free(struct acpi_drhd_unit *drhd) +@@ -1308,6 +1318,7 @@ void __init iommu_free(struct acpi_drhd_unit *drhd) + + xfree(iommu->domid_bitmap); + xfree(iommu->domid_map); ++ xfree(iommu->pseudo_domid_map); + + if ( iommu->msi.irq >= 0 ) + destroy_irq(iommu->msi.irq); +@@ -1583,8 +1594,8 @@ int domain_context_mapping_one( + return rc ?: pdev && prev_dom; + } + +-static int domain_context_unmap(struct domain *d, uint8_t devfn, +- struct pci_dev *pdev); ++static const struct acpi_drhd_unit *domain_context_unmap( ++ struct domain *d, uint8_t devfn, struct pci_dev *pdev); + + static int domain_context_mapping(struct domain *domain, u8 devfn, + struct pci_dev *pdev) +@@ -1592,6 +1603,7 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + struct acpi_drhd_unit *drhd; + const struct acpi_rmrr_unit *rmrr; + paddr_t pgd_maddr = dom_iommu(domain)->arch.pgd_maddr; ++ domid_t orig_domid = pdev->arch.pseudo_domid; + int ret = 0; + unsigned int i, mode = 0; + uint16_t seg = pdev->seg, bdf; +@@ -1652,6 +1664,14 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + break; + + case DEV_TYPE_PCIe_ENDPOINT: ++ if ( iommu_quarantine && orig_domid == DOMID_INVALID ) ++ { ++ pdev->arch.pseudo_domid = ++ iommu_alloc_domid(drhd->iommu->pseudo_domid_map); ++ if ( pdev->arch.pseudo_domid == DOMID_INVALID ) ++ return -ENOSPC; ++ } ++ + if ( iommu_debug ) + printk(VTDPREFIX "d%d:PCIe: map %04x:%02x:%02x.%u\n", + domain->domain_id, seg, bus, +@@ -1667,6 +1687,14 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + break; + + case DEV_TYPE_PCI: ++ if ( iommu_quarantine && orig_domid == DOMID_INVALID ) ++ { ++ pdev->arch.pseudo_domid = ++ iommu_alloc_domid(drhd->iommu->pseudo_domid_map); ++ if ( pdev->arch.pseudo_domid == DOMID_INVALID ) ++ return -ENOSPC; ++ } ++ + if ( iommu_debug ) + printk(VTDPREFIX "d%d:PCI: map %04x:%02x:%02x.%u\n", + domain->domain_id, seg, bus, +@@ -1742,6 +1770,13 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + if ( !ret && devfn == pdev->devfn ) + pci_vtd_quirk(pdev); + ++ if ( ret && drhd && orig_domid == DOMID_INVALID ) ++ { ++ iommu_free_domid(pdev->arch.pseudo_domid, ++ drhd->iommu->pseudo_domid_map); ++ pdev->arch.pseudo_domid = DOMID_INVALID; ++ } ++ + return ret; + } + +@@ -1824,8 +1859,10 @@ int domain_context_unmap_one( + return rc; + } + +-static int domain_context_unmap(struct domain *domain, u8 devfn, +- struct pci_dev *pdev) ++static const struct acpi_drhd_unit *domain_context_unmap( ++ 
struct domain *domain, ++ uint8_t devfn, ++ struct pci_dev *pdev) + { + struct acpi_drhd_unit *drhd; + struct vtd_iommu *iommu; +@@ -1834,7 +1871,7 @@ static int domain_context_unmap(struct domain *domain, u8 devfn, + + drhd = acpi_find_matched_drhd_unit(pdev); + if ( !drhd ) +- return -ENODEV; ++ return ERR_PTR(-ENODEV); + iommu = drhd->iommu; + + switch ( pdev->type ) +@@ -1845,7 +1882,7 @@ static int domain_context_unmap(struct domain *domain, u8 devfn, + domain->domain_id, seg, bus, + PCI_SLOT(devfn), PCI_FUNC(devfn)); + if ( !is_hardware_domain(domain) ) +- return -EPERM; ++ return ERR_PTR(-EPERM); + goto out; + + case DEV_TYPE_PCIe_BRIDGE: +@@ -1923,7 +1960,7 @@ static int domain_context_unmap(struct domain *domain, u8 devfn, + check_cleanup_domid_map(domain, pdev, iommu); + + out: +- return ret; ++ return ret ? ERR_PTR(ret) : drhd; + } + + static void iommu_domain_teardown(struct domain *d) +@@ -2145,16 +2182,17 @@ static int intel_iommu_enable_device(struct pci_dev *pdev) + + static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev) + { ++ const struct acpi_drhd_unit *drhd; + struct acpi_rmrr_unit *rmrr; + u16 bdf; +- int ret, i; ++ unsigned int i; + + if ( !pdev->domain ) + return -EINVAL; + +- ret = domain_context_unmap(pdev->domain, devfn, pdev); +- if ( ret ) +- return ret; ++ drhd = domain_context_unmap(pdev->domain, devfn, pdev); ++ if ( IS_ERR(drhd) ) ++ return PTR_ERR(drhd); + + for_each_rmrr_device ( rmrr, bdf, i ) + { +@@ -2171,6 +2209,13 @@ static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev) + rmrr->end_address, 0); + } + ++ if ( drhd ) ++ { ++ iommu_free_domid(pdev->arch.pseudo_domid, ++ drhd->iommu->pseudo_domid_map); ++ pdev->arch.pseudo_domid = DOMID_INVALID; ++ } ++ + return 0; + } + +diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h +index 503b07ffb7..be44fc017f 100644 +--- a/xen/drivers/passthrough/vtd/iommu.h ++++ b/xen/drivers/passthrough/vtd/iommu.h +@@ -535,6 +535,7 @@ struct vtd_iommu { + } flush; + + struct list_head ats_devices; ++ unsigned long *pseudo_domid_map; /* "pseudo" domain id bitmap */ + unsigned long *domid_bitmap; /* domain id bitmap */ + u16 *domid_map; /* domain id mapping array */ + uint32_t version; +diff --git a/xen/drivers/passthrough/x86/iommu.c b/xen/drivers/passthrough/x86/iommu.c +index 818d28f770..f900bff60b 100644 +--- a/xen/drivers/passthrough/x86/iommu.c ++++ b/xen/drivers/passthrough/x86/iommu.c +@@ -346,6 +346,53 @@ void __hwdom_init arch_iommu_hwdom_init(struct domain *d) + return; + } + ++unsigned long *__init iommu_init_domid(void) ++{ ++ if ( !iommu_quarantine ) ++ return ZERO_BLOCK_PTR; ++ ++ BUILD_BUG_ON(DOMID_MASK * 2U >= UINT16_MAX); ++ ++ return xzalloc_array(unsigned long, ++ BITS_TO_LONGS(UINT16_MAX - DOMID_MASK)); ++} ++ ++domid_t iommu_alloc_domid(unsigned long *map) ++{ ++ /* ++ * This is used uniformly across all IOMMUs, such that on typical ++ * systems we wouldn't re-use the same ID very quickly (perhaps never). 
++ */ ++ static unsigned int start; ++ unsigned int idx = find_next_zero_bit(map, UINT16_MAX - DOMID_MASK, start); ++ ++ ASSERT(pcidevs_locked()); ++ ++ if ( idx >= UINT16_MAX - DOMID_MASK ) ++ idx = find_first_zero_bit(map, UINT16_MAX - DOMID_MASK); ++ if ( idx >= UINT16_MAX - DOMID_MASK ) ++ return DOMID_INVALID; ++ ++ __set_bit(idx, map); ++ ++ start = idx + 1; ++ ++ return idx | (DOMID_MASK + 1); ++} ++ ++void iommu_free_domid(domid_t domid, unsigned long *map) ++{ ++ ASSERT(pcidevs_locked()); ++ ++ if ( domid == DOMID_INVALID ) ++ return; ++ ++ ASSERT(domid > DOMID_MASK); ++ ++ if ( !__test_and_clear_bit(domid & DOMID_MASK, map) ) ++ BUG(); ++} ++ + /* + * Local variables: + * mode: C +diff --git a/xen/include/asm-x86/amd-iommu.h b/xen/include/asm-x86/amd-iommu.h +index 829e1b1755..452ce97c02 100644 +--- a/xen/include/asm-x86/amd-iommu.h ++++ b/xen/include/asm-x86/amd-iommu.h +@@ -94,6 +94,7 @@ struct amd_iommu { + struct ring_buffer cmd_buffer; + struct ring_buffer event_log; + struct ring_buffer ppr_log; ++ unsigned long *domid_map; + + int exclusion_enable; + int exclusion_allow_all; +diff --git a/xen/include/asm-x86/iommu.h b/xen/include/asm-x86/iommu.h +index aaf9455b8e..389417d198 100644 +--- a/xen/include/asm-x86/iommu.h ++++ b/xen/include/asm-x86/iommu.h +@@ -130,6 +130,10 @@ int pi_update_irte(const struct pi_desc *pi_desc, const struct pirq *pirq, + iommu_vcall(ops, sync_cache, addr, size); \ + }) + ++unsigned long *iommu_init_domid(void); ++domid_t iommu_alloc_domid(unsigned long *map); ++void iommu_free_domid(domid_t domid, unsigned long *map); ++ + #endif /* !__ARCH_X86_IOMMU_H__ */ + /* + * Local variables: +diff --git a/xen/include/asm-x86/pci.h b/xen/include/asm-x86/pci.h +index cc05045e9c..70ed48e309 100644 +--- a/xen/include/asm-x86/pci.h ++++ b/xen/include/asm-x86/pci.h +@@ -15,6 +15,12 @@ + + struct arch_pci_dev { + vmask_t used_vectors; ++ /* ++ * These fields are (de)initialized under pcidevs-lock. Other uses of ++ * them don't race (de)initialization and hence don't strictly need any ++ * locking. ++ */ ++ domid_t pseudo_domid; + }; + + int pci_conf_write_intercept(unsigned int seg, unsigned int bdf, +diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h +index d2198dffad..75b1619d0d 100644 +--- a/xen/include/public/xen.h ++++ b/xen/include/public/xen.h +@@ -614,6 +614,9 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t); + /* Idle domain. */ + #define DOMID_IDLE xen_mk_uint(0x7FFF) + ++/* Mask for valid domain id values */ ++#define DOMID_MASK xen_mk_uint(0x7FFF) ++ + #ifndef __ASSEMBLY__ + + typedef uint16_t domid_t; +-- +2.35.2 + + +From e6d6b5ba030a8d2d81bf902e4bc2a8530b3576ae Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 5 Apr 2022 15:26:41 +0200 +Subject: [PATCH 27/32] IOMMU/x86: drop TLB flushes from quarantine_init() + hooks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The page tables just created aren't hooked up yet anywhere, so there's +nothing that could be present in any TLB, and hence nothing to flush. +Dropping this flush is, at least on the VT-d side, a prereq to per- +device domain ID use when quarantining devices, as dom_io isn't going +to be assigned a DID anymore: The warning in get_iommu_did() would +trigger. 
+ +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Paul Durrant <paul@xen.org> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Kevin Tian <kevin.tian@intel.com> +master commit: 54c5cef49239e2f27ec3b3fc8804bf57aa4bf46d +master date: 2022-04-05 14:19:42 +0200 +--- + xen/drivers/passthrough/amd/iommu_map.c | 2 -- + xen/drivers/passthrough/vtd/iommu.c | 5 +---- + 2 files changed, 1 insertion(+), 6 deletions(-) + +diff --git a/xen/drivers/passthrough/amd/iommu_map.c b/xen/drivers/passthrough/amd/iommu_map.c +index 45559f9678..3c7cd7ed9e 100644 +--- a/xen/drivers/passthrough/amd/iommu_map.c ++++ b/xen/drivers/passthrough/amd/iommu_map.c +@@ -595,8 +595,6 @@ int __init amd_iommu_quarantine_init(struct domain *d) + out: + spin_unlock(&hd->arch.mapping_lock); + +- amd_iommu_flush_all_pages(d); +- + /* Pages leaked in failure case */ + return level ? -ENOMEM : 0; + } +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index fc89f3e4c5..e5c50429d2 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -2894,7 +2894,6 @@ static int __init intel_iommu_quarantine_init(struct domain *d) + struct dma_pte *parent; + unsigned int agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH); + unsigned int level = agaw_to_level(agaw); +- int rc; + + if ( hd->arch.pgd_maddr ) + { +@@ -2941,10 +2940,8 @@ static int __init intel_iommu_quarantine_init(struct domain *d) + out: + spin_unlock(&hd->arch.mapping_lock); + +- rc = iommu_flush_iotlb_all(d); +- + /* Pages leaked in failure case */ +- return level ? -ENOMEM : rc; ++ return level ? -ENOMEM : 0; + } + + const struct iommu_ops __initconstrel intel_iommu_ops = { +-- +2.35.2 + + +From 454d5351a93d2438778630843cf3e77da0772167 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 5 Apr 2022 15:27:09 +0200 +Subject: [PATCH 28/32] AMD/IOMMU: abstract maximum number of page table levels + +We will want to use the constant elsewhere. 
+ +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Paul Durrant <paul@xen.org> +master commit: a038b514c1e970a8dc32229cbd31f6769ee61ad5 +master date: 2022-04-05 14:20:04 +0200 +--- + xen/drivers/passthrough/amd/iommu_map.c | 2 +- + xen/include/asm-x86/hvm/svm/amd-iommu-defs.h | 1 + + xen/include/asm-x86/hvm/svm/amd-iommu-proto.h | 2 +- + 3 files changed, 3 insertions(+), 2 deletions(-) + +diff --git a/xen/drivers/passthrough/amd/iommu_map.c b/xen/drivers/passthrough/amd/iommu_map.c +index 3c7cd7ed9e..db396dd1d4 100644 +--- a/xen/drivers/passthrough/amd/iommu_map.c ++++ b/xen/drivers/passthrough/amd/iommu_map.c +@@ -260,7 +260,7 @@ static int iommu_pde_from_dfn(struct domain *d, unsigned long dfn, + table = hd->arch.root_table; + level = hd->arch.paging_mode; + +- BUG_ON( table == NULL || level < 1 || level > 6 ); ++ BUG_ON( table == NULL || level < 1 || level > IOMMU_MAX_PT_LEVELS ); + + /* + * A frame number past what the current page tables can represent can't +diff --git a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h +index a54d6e9fc6..c46247cb24 100644 +--- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h ++++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h +@@ -110,6 +110,7 @@ struct amd_iommu_dte { + bool tv:1; + unsigned int :5; + unsigned int had:2; ++#define IOMMU_MAX_PT_LEVELS 6 + unsigned int paging_mode:3; + uint64_t pt_root:40; + bool ppr:1; +diff --git a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h +index 52c889ade0..2a3bc47ab5 100644 +--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h ++++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h +@@ -193,7 +193,7 @@ static inline int amd_iommu_get_paging_mode(unsigned long max_frames) + while ( max_frames > PTE_PER_TABLE_SIZE ) + { + max_frames = PTE_PER_TABLE_ALIGN(max_frames) >> PTE_PER_TABLE_SHIFT; +- if ( ++level > 6 ) ++ if ( ++level > IOMMU_MAX_PT_LEVELS ) + return -ENOMEM; + } + +-- +2.35.2 + + +From 169a2834ef5d723091f187a5d6493ae77825757a Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 5 Apr 2022 15:27:36 +0200 +Subject: [PATCH 29/32] IOMMU/x86: use per-device page tables for quarantining +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Devices with RMRRs / unity mapped regions, due to it being unspecified +how/when these memory regions may be accessed, may not be left +disconnected from the mappings of these regions (as long as it's not +certain that the device has been fully quiesced). Hence even the page +tables used when quarantining such devices need to have mappings of +those regions. This implies installing page tables in the first place +even when not in scratch-page quarantining mode. + +This is CVE-2022-26361 / part of XSA-400. + +While for the purpose here it would be sufficient to have devices with +RMRRs / unity mapped regions use per-device page tables, extend this to +all devices (in scratch-page quarantining mode). This allows the leaf +pages to be mapped r/w, thus covering also memory writes (rather than +just reads) issued by non-quiescent devices. + +Set up quarantine page tables as late as possible, yet early enough to +not encounter failure during de-assign. This means setup generally +happens in assign_device(), while (for now) the one in deassign_device() +is there mainly to be on the safe side. 
+ +In VT-d's DID allocation function don't require the IOMMU lock to be +held anymore: All involved code paths hold pcidevs_lock, so this way we +avoid the need to acquire the IOMMU lock around the new call to +context_set_domain_id(). + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Paul Durrant <paul@xen.org> +Reviewed-by: Kevin Tian <kevin.tian@intel.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: 14dd241aad8af447680ac73e8579990e2c09c1e7 +master date: 2022-04-05 14:24:18 +0200 +--- + xen/arch/x86/mm/p2m.c | 2 +- + xen/drivers/passthrough/amd/iommu_map.c | 155 ++++++++--- + xen/drivers/passthrough/amd/pci_amd_iommu.c | 35 ++- + xen/drivers/passthrough/iommu.c | 18 +- + xen/drivers/passthrough/pci.c | 20 +- + xen/drivers/passthrough/vtd/iommu.c | 247 +++++++++++++----- + xen/drivers/passthrough/vtd/iommu.h | 2 +- + xen/include/asm-x86/hvm/svm/amd-iommu-proto.h | 3 +- + xen/include/asm-x86/pci.h | 13 + + xen/include/xen/iommu.h | 3 +- + 10 files changed, 363 insertions(+), 135 deletions(-) + +diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c +index a6bfda010a..91f7b7760c 100644 +--- a/xen/arch/x86/mm/p2m.c ++++ b/xen/arch/x86/mm/p2m.c +@@ -1453,7 +1453,7 @@ int set_identity_p2m_entry(struct domain *d, unsigned long gfn_l, + struct p2m_domain *p2m = p2m_get_hostp2m(d); + int ret; + +- if ( !paging_mode_translate(p2m->domain) ) ++ if ( !paging_mode_translate(d) ) + { + if ( !is_iommu_enabled(d) ) + return 0; +diff --git a/xen/drivers/passthrough/amd/iommu_map.c b/xen/drivers/passthrough/amd/iommu_map.c +index db396dd1d4..8b7d5b7c7b 100644 +--- a/xen/drivers/passthrough/amd/iommu_map.c ++++ b/xen/drivers/passthrough/amd/iommu_map.c +@@ -539,64 +539,137 @@ int amd_iommu_reserve_domain_unity_unmap(struct domain *d, + return rc; + } + +-int __init amd_iommu_quarantine_init(struct domain *d) ++static int fill_qpt(union amd_iommu_pte *this, unsigned int level, ++ struct page_info *pgs[IOMMU_MAX_PT_LEVELS], ++ struct pci_dev *pdev) + { +- struct domain_iommu *hd = dom_iommu(d); ++ unsigned int i; ++ int rc = 0; ++ ++ for ( i = 0; !rc && i < PTE_PER_TABLE_SIZE; ++i ) ++ { ++ union amd_iommu_pte *pte = &this[i], *next; ++ ++ if ( !pte->pr ) ++ { ++ if ( !pgs[level] ) ++ { ++ /* ++ * The pgtable allocator is fine for the leaf page, as well as ++ * page table pages, and the resulting allocations are always ++ * zeroed. ++ */ ++ pgs[level] = alloc_amd_iommu_pgtable(); ++ if ( !pgs[level] ) ++ { ++ rc = -ENOMEM; ++ break; ++ } ++ ++ page_list_add(pgs[level], &pdev->arch.pgtables_list); ++ ++ if ( level ) ++ { ++ next = __map_domain_page(pgs[level]); ++ rc = fill_qpt(next, level - 1, pgs, pdev); ++ unmap_domain_page(next); ++ } ++ } ++ ++ /* ++ * PDEs are essentially a subset of PTEs, so this function ++ * is fine to use even at the leaf. 
++ */ ++ set_iommu_pde_present(pte, mfn_x(page_to_mfn(pgs[level])), level, ++ true, true); ++ } ++ else if ( level && pte->next_level ) ++ { ++ page_list_add(mfn_to_page(_mfn(pte->mfn)), ++ &pdev->arch.pgtables_list); ++ next = map_domain_page(_mfn(pte->mfn)); ++ rc = fill_qpt(next, level - 1, pgs, pdev); ++ unmap_domain_page(next); ++ } ++ } ++ ++ return rc; ++} ++ ++int amd_iommu_quarantine_init(struct pci_dev *pdev) ++{ ++ struct domain_iommu *hd = dom_iommu(dom_io); + unsigned long end_gfn = + 1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT); + unsigned int level = amd_iommu_get_paging_mode(end_gfn); +- union amd_iommu_pte *table; ++ unsigned int req_id = get_dma_requestor_id(pdev->seg, pdev->sbdf.bdf); ++ const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg); ++ int rc; ++ ++ ASSERT(pcidevs_locked()); ++ ASSERT(!hd->arch.root_table); + +- if ( hd->arch.root_table ) ++ ASSERT(pdev->arch.pseudo_domid != DOMID_INVALID); ++ ++ if ( pdev->arch.amd.root_table ) + { +- ASSERT_UNREACHABLE(); ++ clear_domain_page(pdev->arch.leaf_mfn); + return 0; + } + +- spin_lock(&hd->arch.mapping_lock); ++ pdev->arch.amd.root_table = alloc_amd_iommu_pgtable(); ++ if ( !pdev->arch.amd.root_table ) ++ return -ENOMEM; + +- hd->arch.root_table = alloc_amd_iommu_pgtable(); +- if ( !hd->arch.root_table ) +- goto out; ++ /* Transiently install the root into DomIO, for iommu_identity_mapping(). */ ++ hd->arch.root_table = pdev->arch.amd.root_table; ++ ++ rc = amd_iommu_reserve_domain_unity_map(dom_io, ++ ivrs_mappings[req_id].unity_map, ++ 0); + +- table = __map_domain_page(hd->arch.root_table); +- while ( level ) ++ iommu_identity_map_teardown(dom_io); ++ hd->arch.root_table = NULL; ++ ++ if ( rc ) ++ printk("%04x:%02x:%02x.%u: quarantine unity mapping failed\n", ++ pdev->seg, pdev->bus, ++ PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); ++ else + { +- struct page_info *pg; +- unsigned int i; +- +- /* +- * The pgtable allocator is fine for the leaf page, as well as +- * page table pages, and the resulting allocations are always +- * zeroed. +- */ +- pg = alloc_amd_iommu_pgtable(); +- if ( !pg ) +- break; +- +- for ( i = 0; i < PTE_PER_TABLE_SIZE; i++ ) +- { +- union amd_iommu_pte *pde = &table[i]; ++ union amd_iommu_pte *root; ++ struct page_info *pgs[IOMMU_MAX_PT_LEVELS] = {}; + +- /* +- * PDEs are essentially a subset of PTEs, so this function +- * is fine to use even at the leaf. +- */ +- set_iommu_pde_present(pde, mfn_x(page_to_mfn(pg)), level - 1, +- false, true); +- } ++ spin_lock(&hd->arch.mapping_lock); + +- unmap_domain_page(table); +- table = __map_domain_page(pg); +- level--; ++ root = __map_domain_page(pdev->arch.amd.root_table); ++ rc = fill_qpt(root, level - 1, pgs, pdev); ++ unmap_domain_page(root); ++ ++ pdev->arch.leaf_mfn = page_to_mfn(pgs[0]); ++ ++ spin_unlock(&hd->arch.mapping_lock); + } +- unmap_domain_page(table); + +- out: +- spin_unlock(&hd->arch.mapping_lock); ++ if ( rc ) ++ amd_iommu_quarantine_teardown(pdev); ++ ++ return rc; ++} ++ ++void amd_iommu_quarantine_teardown(struct pci_dev *pdev) ++{ ++ struct page_info *pg; ++ ++ ASSERT(pcidevs_locked()); ++ ++ if ( !pdev->arch.amd.root_table ) ++ return; ++ ++ while ( (pg = page_list_remove_head(&pdev->arch.pgtables_list)) ) ++ free_amd_iommu_pgtable(pg); + +- /* Pages leaked in failure case */ +- return level ? 
-ENOMEM : 0; ++ pdev->arch.amd.root_table = NULL; + } + + /* +diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c +index b07091e71e..e5c02ca710 100644 +--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c ++++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c +@@ -125,6 +125,8 @@ static int __must_check amd_iommu_setup_domain_device( + u8 bus = pdev->bus; + struct domain_iommu *hd = dom_iommu(domain); + const struct ivrs_mappings *ivrs_dev; ++ const struct page_info *root_pg; ++ domid_t domid; + + BUG_ON(!hd->arch.paging_mode || !iommu->dev_table.buffer); + +@@ -144,14 +146,25 @@ static int __must_check amd_iommu_setup_domain_device( + dte = &table[req_id]; + ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id]; + ++ if ( domain != dom_io ) ++ { ++ root_pg = hd->arch.root_table; ++ domid = domain->domain_id; ++ } ++ else ++ { ++ root_pg = pdev->arch.amd.root_table; ++ domid = pdev->arch.pseudo_domid; ++ } ++ + spin_lock_irqsave(&iommu->lock, flags); + + if ( !dte->v || !dte->tv ) + { + /* bind DTE to domain page-tables */ + rc = amd_iommu_set_root_page_table( +- dte, page_to_maddr(hd->arch.root_table), +- domain->domain_id, hd->arch.paging_mode, sr_flags); ++ dte, page_to_maddr(root_pg), domid, ++ hd->arch.paging_mode, sr_flags); + if ( rc ) + { + ASSERT(rc < 0); +@@ -175,7 +188,7 @@ static int __must_check amd_iommu_setup_domain_device( + + amd_iommu_flush_device(iommu, req_id); + } +- else if ( dte->pt_root != mfn_x(page_to_mfn(hd->arch.root_table)) ) ++ else if ( dte->pt_root != mfn_x(page_to_mfn(root_pg)) ) + { + /* + * Strictly speaking if the device is the only one with this requestor +@@ -188,8 +201,8 @@ static int __must_check amd_iommu_setup_domain_device( + rc = -EOPNOTSUPP; + else + rc = amd_iommu_set_root_page_table( +- dte, page_to_maddr(hd->arch.root_table), +- domain->domain_id, hd->arch.paging_mode, sr_flags); ++ dte, page_to_maddr(root_pg), domid, ++ hd->arch.paging_mode, sr_flags); + if ( rc < 0 ) + { + spin_unlock_irqrestore(&iommu->lock, flags); +@@ -208,6 +221,7 @@ static int __must_check amd_iommu_setup_domain_device( + * intended anyway. 
+ */ + !pdev->domain->is_dying && ++ pdev->domain != dom_io && + (any_pdev_behind_iommu(pdev->domain, pdev, iommu) || + pdev->phantom_stride) ) + printk(" %04x:%02x:%02x.%u: reassignment may cause %pd data corruption\n", +@@ -238,9 +252,8 @@ static int __must_check amd_iommu_setup_domain_device( + AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, " + "root table = %#"PRIx64", " + "domain = %d, paging mode = %d\n", +- req_id, pdev->type, +- page_to_maddr(hd->arch.root_table), +- domain->domain_id, hd->arch.paging_mode); ++ req_id, pdev->type, page_to_maddr(root_pg), ++ domid, hd->arch.paging_mode); + + ASSERT(pcidevs_locked()); + +@@ -313,7 +326,7 @@ static int iov_enable_xt(void) + + int amd_iommu_alloc_root(struct domain_iommu *hd) + { +- if ( unlikely(!hd->arch.root_table) ) ++ if ( unlikely(!hd->arch.root_table) && hd != dom_iommu(dom_io) ) + { + hd->arch.root_table = alloc_amd_iommu_pgtable(); + if ( !hd->arch.root_table ) +@@ -404,7 +417,7 @@ static void amd_iommu_disable_domain_device(const struct domain *domain, + + AMD_IOMMU_DEBUG("Disable: device id = %#x, " + "domain = %d, paging mode = %d\n", +- req_id, domain->domain_id, ++ req_id, dte->domain_id, + dom_iommu(domain)->arch.paging_mode); + } + spin_unlock_irqrestore(&iommu->lock, flags); +@@ -668,6 +681,8 @@ static int amd_iommu_remove_device(u8 devfn, struct pci_dev *pdev) + + amd_iommu_disable_domain_device(pdev->domain, iommu, devfn, pdev); + ++ amd_iommu_quarantine_teardown(pdev); ++ + iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map); + pdev->arch.pseudo_domid = DOMID_INVALID; + +diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c +index 93d4377978..9aef696d90 100644 +--- a/xen/drivers/passthrough/iommu.c ++++ b/xen/drivers/passthrough/iommu.c +@@ -450,21 +450,21 @@ int iommu_iotlb_flush_all(struct domain *d, unsigned int flush_flags) + return rc; + } + +-static int __init iommu_quarantine_init(void) ++int iommu_quarantine_dev_init(device_t *dev) + { + const struct domain_iommu *hd = dom_iommu(dom_io); +- int rc; + +- dom_io->options |= XEN_DOMCTL_CDF_iommu; ++ if ( !iommu_quarantine || !hd->platform_ops->quarantine_init ) ++ return 0; + +- rc = iommu_domain_init(dom_io, 0); +- if ( rc ) +- return rc; ++ return iommu_call(hd->platform_ops, quarantine_init, dev); ++} + +- if ( !hd->platform_ops->quarantine_init ) +- return 0; ++static int __init iommu_quarantine_init(void) ++{ ++ dom_io->options |= XEN_DOMCTL_CDF_iommu; + +- return hd->platform_ops->quarantine_init(dom_io); ++ return iommu_domain_init(dom_io, 0); + } + + int __init iommu_setup(void) +diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c +index 97e42261eb..1a1a387458 100644 +--- a/xen/drivers/passthrough/pci.c ++++ b/xen/drivers/passthrough/pci.c +@@ -929,9 +929,16 @@ static int deassign_device(struct domain *d, uint16_t seg, uint8_t bus, + return -ENODEV; + + /* De-assignment from dom_io should de-quarantine the device */ +- target = ((pdev->quarantine || iommu_quarantine) && +- pdev->domain != dom_io) ? 
+- dom_io : hardware_domain; ++ if ( (pdev->quarantine || iommu_quarantine) && pdev->domain != dom_io ) ++ { ++ ret = iommu_quarantine_dev_init(pci_to_dev(pdev)); ++ if ( ret ) ++ return ret; ++ ++ target = dom_io; ++ } ++ else ++ target = hardware_domain; + + while ( pdev->phantom_stride ) + { +@@ -1547,6 +1554,13 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) + msixtbl_init(d); + } + ++ if ( pdev->domain != dom_io ) ++ { ++ rc = iommu_quarantine_dev_init(pci_to_dev(pdev)); ++ if ( rc ) ++ goto done; ++ } ++ + pdev->fault.count = 0; + + if ( (rc = hd->platform_ops->assign_device(d, devfn, pci_to_dev(pdev), flag)) ) +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index e5c50429d2..6571b5dde4 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -43,6 +43,12 @@ + #include "vtd.h" + #include "../ats.h" + ++#define DEVICE_DOMID(d, pdev) ((d) != dom_io ? (d)->domain_id \ ++ : (pdev)->arch.pseudo_domid) ++#define DEVICE_PGTABLE(d, pdev) ((d) != dom_io \ ++ ? dom_iommu(d)->arch.pgd_maddr \ ++ : (pdev)->arch.vtd.pgd_maddr) ++ + /* Possible unfiltered LAPIC/MSI messages from untrusted sources? */ + bool __read_mostly untrusted_msi; + +@@ -78,13 +84,18 @@ static int get_iommu_did(domid_t domid, const struct vtd_iommu *iommu, + + #define DID_FIELD_WIDTH 16 + #define DID_HIGH_OFFSET 8 ++ ++/* ++ * This function may have "context" passed as NULL, to merely obtain a DID ++ * for "domid". ++ */ + static int context_set_domain_id(struct context_entry *context, + domid_t domid, struct vtd_iommu *iommu) + { + unsigned long nr_dom, i; + int found = 0; + +- ASSERT(spin_is_locked(&iommu->lock)); ++ ASSERT(pcidevs_locked()); + + nr_dom = cap_ndoms(iommu->cap); + i = find_first_bit(iommu->domid_bitmap, nr_dom); +@@ -110,8 +121,13 @@ static int context_set_domain_id(struct context_entry *context, + } + + set_bit(i, iommu->domid_bitmap); +- context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET); +- context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET; ++ ++ if ( context ) ++ { ++ context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET); ++ context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET; ++ } ++ + return 0; + } + +@@ -161,8 +177,12 @@ static void check_cleanup_domid_map(struct domain *d, + const struct pci_dev *exclude, + struct vtd_iommu *iommu) + { +- bool found = any_pdev_behind_iommu(d, exclude, iommu); ++ bool found; ++ ++ if ( d == dom_io ) ++ return; + ++ found = any_pdev_behind_iommu(d, exclude, iommu); + /* + * Hidden devices are associated with DomXEN but usable by the hardware + * domain. Hence they need considering here as well. +@@ -1400,7 +1420,7 @@ int domain_context_mapping_one( + domid = iommu->domid_map[prev_did]; + if ( domid < DOMID_FIRST_RESERVED ) + prev_dom = rcu_lock_domain_by_id(domid); +- else if ( domid == DOMID_IO ) ++ else if ( pdev ? domid == pdev->arch.pseudo_domid : domid > DOMID_MASK ) + prev_dom = rcu_lock_domain(dom_io); + if ( !prev_dom ) + { +@@ -1577,15 +1597,12 @@ int domain_context_mapping_one( + { + if ( !prev_dom ) + domain_context_unmap_one(domain, iommu, bus, devfn, +- domain->domain_id); ++ DEVICE_DOMID(domain, pdev)); + else if ( prev_dom != domain ) /* Avoid infinite recursion. 
*/ +- { +- hd = dom_iommu(prev_dom); + domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev, +- domain->domain_id, +- hd->arch.pgd_maddr, ++ DEVICE_DOMID(prev_dom, pdev), ++ DEVICE_PGTABLE(prev_dom, pdev), + mode & MAP_WITH_RMRR); +- } + } + + if ( prev_dom ) +@@ -1602,7 +1619,7 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + { + struct acpi_drhd_unit *drhd; + const struct acpi_rmrr_unit *rmrr; +- paddr_t pgd_maddr = dom_iommu(domain)->arch.pgd_maddr; ++ paddr_t pgd_maddr = DEVICE_PGTABLE(domain, pdev); + domid_t orig_domid = pdev->arch.pseudo_domid; + int ret = 0; + unsigned int i, mode = 0; +@@ -1635,7 +1652,7 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + break; + } + +- if ( domain != pdev->domain ) ++ if ( domain != pdev->domain && pdev->domain != dom_io ) + { + if ( pdev->domain->is_dying ) + mode |= MAP_OWNER_DYING; +@@ -1676,8 +1693,8 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + printk(VTDPREFIX "d%d:PCIe: map %04x:%02x:%02x.%u\n", + domain->domain_id, seg, bus, + PCI_SLOT(devfn), PCI_FUNC(devfn)); +- ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, +- pdev, domain->domain_id, pgd_maddr, ++ ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, pdev, ++ DEVICE_DOMID(domain, pdev), pgd_maddr, + mode); + if ( ret > 0 ) + ret = 0; +@@ -1701,8 +1718,8 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + PCI_SLOT(devfn), PCI_FUNC(devfn)); + + ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, +- pdev, domain->domain_id, pgd_maddr, +- mode); ++ pdev, DEVICE_DOMID(domain, pdev), ++ pgd_maddr, mode); + if ( ret < 0 ) + break; + prev_present = ret; +@@ -1730,8 +1747,8 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + */ + if ( ret >= 0 ) + ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, +- NULL, domain->domain_id, pgd_maddr, +- mode); ++ NULL, DEVICE_DOMID(domain, pdev), ++ pgd_maddr, mode); + + /* + * Devices behind PCIe-to-PCI/PCIx bridge may generate different +@@ -1746,8 +1763,8 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE && + (secbus != pdev->bus || pdev->devfn != 0) ) + ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0, +- NULL, domain->domain_id, pgd_maddr, +- mode); ++ NULL, DEVICE_DOMID(domain, pdev), ++ pgd_maddr, mode); + + if ( ret ) + { +@@ -1896,7 +1913,7 @@ static const struct acpi_drhd_unit *domain_context_unmap( + domain->domain_id, seg, bus, + PCI_SLOT(devfn), PCI_FUNC(devfn)); + ret = domain_context_unmap_one(domain, iommu, bus, devfn, +- domain->domain_id); ++ DEVICE_DOMID(domain, pdev)); + if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 ) + disable_ats_device(pdev); + +@@ -1907,7 +1924,7 @@ static const struct acpi_drhd_unit *domain_context_unmap( + printk(VTDPREFIX "d%d:PCI: unmap %04x:%02x:%02x.%u\n", + domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + ret = domain_context_unmap_one(domain, iommu, bus, devfn, +- domain->domain_id); ++ DEVICE_DOMID(domain, pdev)); + if ( ret ) + break; + +@@ -1930,18 +1947,12 @@ static const struct acpi_drhd_unit *domain_context_unmap( + break; + } + ++ ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn, ++ DEVICE_DOMID(domain, pdev)); + /* PCIe to PCI/PCIx bridge */ +- if ( pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE ) +- { +- ret = domain_context_unmap_one(domain, iommu, tmp_bus, 
tmp_devfn, +- domain->domain_id); +- if ( !ret ) +- ret = domain_context_unmap_one(domain, iommu, secbus, 0, +- domain->domain_id); +- } +- else /* Legacy PCI bridge */ +- ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn, +- domain->domain_id); ++ if ( !ret && pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE ) ++ ret = domain_context_unmap_one(domain, iommu, secbus, 0, ++ DEVICE_DOMID(domain, pdev)); + + break; + +@@ -1987,6 +1998,25 @@ static void iommu_domain_teardown(struct domain *d) + cleanup_domid_map(d->domain_id, drhd->iommu); + } + ++static void quarantine_teardown(struct pci_dev *pdev, ++ const struct acpi_drhd_unit *drhd) ++{ ++ struct page_info *pg; ++ ++ ASSERT(pcidevs_locked()); ++ ++ if ( !pdev->arch.vtd.pgd_maddr ) ++ return; ++ ++ while ( (pg = page_list_remove_head(&pdev->arch.pgtables_list)) ) ++ free_domheap_page(pg); ++ ++ pdev->arch.vtd.pgd_maddr = 0; ++ ++ if ( drhd ) ++ cleanup_domid_map(pdev->arch.pseudo_domid, drhd->iommu); ++} ++ + static int __must_check intel_iommu_map_page(struct domain *d, dfn_t dfn, + mfn_t mfn, unsigned int flags, + unsigned int *flush_flags) +@@ -2209,6 +2239,8 @@ static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev) + rmrr->end_address, 0); + } + ++ quarantine_teardown(pdev, drhd); ++ + if ( drhd ) + { + iommu_free_domid(pdev->arch.pseudo_domid, +@@ -2888,60 +2920,139 @@ static void vtd_dump_p2m_table(struct domain *d) + vtd_dump_p2m_table_level(hd->arch.pgd_maddr, agaw_to_level(hd->arch.agaw), 0, 0); + } + +-static int __init intel_iommu_quarantine_init(struct domain *d) ++static int fill_qpt(struct dma_pte *this, unsigned int level, ++ paddr_t maddrs[6], struct pci_dev *pdev) + { +- struct domain_iommu *hd = dom_iommu(d); +- struct dma_pte *parent; ++ struct domain_iommu *hd = dom_iommu(dom_io); ++ unsigned int i; ++ int rc = 0; ++ ++ for ( i = 0; !rc && i < PTE_NUM; ++i ) ++ { ++ struct dma_pte *pte = &this[i], *next; ++ ++ if ( !dma_pte_present(*pte) ) ++ { ++ if ( !maddrs[level] ) ++ { ++ /* ++ * The pgtable allocator is fine for the leaf page, as well as ++ * page table pages, and the resulting allocations are always ++ * zeroed. 
++ */ ++ maddrs[level] = alloc_pgtable_maddr(1, hd->node); ++ if ( !maddrs[level] ) ++ { ++ rc = -ENOMEM; ++ break; ++ } ++ ++ page_list_add(maddr_to_page(maddrs[level]), ++ &pdev->arch.pgtables_list); ++ ++ if ( level ) ++ { ++ next = map_vtd_domain_page(maddrs[level]); ++ rc = fill_qpt(next, level - 1, maddrs, pdev); ++ unmap_vtd_domain_page(next); ++ } ++ } ++ ++ dma_set_pte_addr(*pte, maddrs[level]); ++ dma_set_pte_readable(*pte); ++ dma_set_pte_writable(*pte); ++ } ++ else if ( level && !dma_pte_superpage(*pte) ) ++ { ++ page_list_add(maddr_to_page(dma_pte_addr(*pte)), ++ &pdev->arch.pgtables_list); ++ next = map_vtd_domain_page(dma_pte_addr(*pte)); ++ rc = fill_qpt(next, level - 1, maddrs, pdev); ++ unmap_vtd_domain_page(next); ++ } ++ } ++ ++ return rc; ++} ++ ++static int intel_iommu_quarantine_init(struct pci_dev *pdev) ++{ ++ struct domain_iommu *hd = dom_iommu(dom_io); ++ paddr_t maddr; + unsigned int agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH); + unsigned int level = agaw_to_level(agaw); ++ const struct acpi_drhd_unit *drhd; ++ const struct acpi_rmrr_unit *rmrr; ++ unsigned int i, bdf; ++ bool rmrr_found = false; ++ int rc; + +- if ( hd->arch.pgd_maddr ) ++ ASSERT(pcidevs_locked()); ++ ASSERT(!hd->arch.pgd_maddr); ++ ++ if ( pdev->arch.vtd.pgd_maddr ) + { +- ASSERT_UNREACHABLE(); ++ clear_domain_page(pdev->arch.leaf_mfn); + return 0; + } + +- spin_lock(&hd->arch.mapping_lock); ++ drhd = acpi_find_matched_drhd_unit(pdev); ++ if ( !drhd ) ++ return -ENODEV; + +- hd->arch.pgd_maddr = alloc_pgtable_maddr(1, hd->node); +- if ( !hd->arch.pgd_maddr ) +- goto out; ++ maddr = alloc_pgtable_maddr(1, hd->node); ++ if ( !maddr ) ++ return -ENOMEM; + +- parent = map_vtd_domain_page(hd->arch.pgd_maddr); +- while ( level ) +- { +- uint64_t maddr; +- unsigned int offset; ++ rc = context_set_domain_id(NULL, pdev->arch.pseudo_domid, drhd->iommu); + +- /* +- * The pgtable allocator is fine for the leaf page, as well as +- * page table pages, and the resulting allocations are always +- * zeroed. +- */ +- maddr = alloc_pgtable_maddr(1, hd->node); +- if ( !maddr ) ++ /* Transiently install the root into DomIO, for iommu_identity_mapping(). 
*/ ++ hd->arch.pgd_maddr = maddr; ++ ++ for_each_rmrr_device ( rmrr, bdf, i ) ++ { ++ if ( rc ) + break; + +- for ( offset = 0; offset < PTE_NUM; offset++ ) ++ if ( rmrr->segment == pdev->seg && bdf == pdev->sbdf.bdf ) + { +- struct dma_pte *pte = &parent[offset]; ++ rmrr_found = true; + +- dma_set_pte_addr(*pte, maddr); +- dma_set_pte_readable(*pte); ++ rc = iommu_identity_mapping(dom_io, p2m_access_rw, ++ rmrr->base_address, rmrr->end_address, ++ 0); ++ if ( rc ) ++ printk(XENLOG_ERR VTDPREFIX ++ "%04x:%02x:%02x.%u: RMRR quarantine mapping failed\n", ++ pdev->seg, pdev->bus, ++ PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); + } +- iommu_sync_cache(parent, PAGE_SIZE); ++ } + +- unmap_vtd_domain_page(parent); +- parent = map_vtd_domain_page(maddr); +- level--; ++ iommu_identity_map_teardown(dom_io); ++ hd->arch.pgd_maddr = 0; ++ pdev->arch.vtd.pgd_maddr = maddr; ++ ++ if ( !rc ) ++ { ++ struct dma_pte *root; ++ paddr_t maddrs[6] = {}; ++ ++ spin_lock(&hd->arch.mapping_lock); ++ ++ root = map_vtd_domain_page(maddr); ++ rc = fill_qpt(root, level - 1, maddrs, pdev); ++ unmap_vtd_domain_page(root); ++ ++ pdev->arch.leaf_mfn = maddr_to_mfn(maddrs[0]); ++ ++ spin_unlock(&hd->arch.mapping_lock); + } +- unmap_vtd_domain_page(parent); + +- out: +- spin_unlock(&hd->arch.mapping_lock); ++ if ( rc ) ++ quarantine_teardown(pdev, drhd); + +- /* Pages leaked in failure case */ +- return level ? -ENOMEM : 0; ++ return rc; + } + + const struct iommu_ops __initconstrel intel_iommu_ops = { +diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h +index be44fc017f..c67adb9b41 100644 +--- a/xen/drivers/passthrough/vtd/iommu.h ++++ b/xen/drivers/passthrough/vtd/iommu.h +@@ -509,7 +509,7 @@ struct vtd_iommu { + u32 nr_pt_levels; + u64 cap; + u64 ecap; +- spinlock_t lock; /* protect context, domain ids */ ++ spinlock_t lock; /* protect context */ + spinlock_t register_lock; /* protect iommu register handling */ + u64 root_maddr; /* root entry machine address */ + nodeid_t node; +diff --git a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h +index 2a3bc47ab5..961182ac0f 100644 +--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h ++++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h +@@ -54,7 +54,8 @@ int amd_iommu_init_late(void); + int amd_iommu_update_ivrs_mapping_acpi(void); + int iov_adjust_irq_affinities(void); + +-int amd_iommu_quarantine_init(struct domain *d); ++int amd_iommu_quarantine_init(struct pci_dev *pdev); ++void amd_iommu_quarantine_teardown(struct pci_dev *pdev); + + /* mapping functions */ + int __must_check amd_iommu_map_page(struct domain *d, dfn_t dfn, +diff --git a/xen/include/asm-x86/pci.h b/xen/include/asm-x86/pci.h +index 70ed48e309..0c79acb1ed 100644 +--- a/xen/include/asm-x86/pci.h ++++ b/xen/include/asm-x86/pci.h +@@ -1,6 +1,8 @@ + #ifndef __X86_PCI_H__ + #define __X86_PCI_H__ + ++#include <xen/mm.h> ++ + #define CF8_BDF(cf8) ( ((cf8) & 0x00ffff00) >> 8) + #define CF8_ADDR_LO(cf8) ( (cf8) & 0x000000fc) + #define CF8_ADDR_HI(cf8) ( ((cf8) & 0x0f000000) >> 16) +@@ -20,7 +22,18 @@ struct arch_pci_dev { + * them don't race (de)initialization and hence don't strictly need any + * locking. + */ ++ union { ++ /* Subset of struct arch_iommu's fields, to be used in dom_io. 
*/ ++ struct { ++ uint64_t pgd_maddr; ++ } vtd; ++ struct { ++ struct page_info *root_table; ++ } amd; ++ }; + domid_t pseudo_domid; ++ mfn_t leaf_mfn; ++ struct page_list_head pgtables_list; + }; + + int pci_conf_write_intercept(unsigned int seg, unsigned int bdf, +diff --git a/xen/include/xen/iommu.h b/xen/include/xen/iommu.h +index 041a7cf5e5..c94579fe92 100644 +--- a/xen/include/xen/iommu.h ++++ b/xen/include/xen/iommu.h +@@ -211,7 +211,7 @@ typedef int iommu_grdm_t(xen_pfn_t start, xen_ulong_t nr, u32 id, void *ctxt); + struct iommu_ops { + int (*init)(struct domain *d); + void (*hwdom_init)(struct domain *d); +- int (*quarantine_init)(struct domain *d); ++ int (*quarantine_init)(device_t *dev); + int (*add_device)(u8 devfn, device_t *dev); + int (*enable_device)(device_t *dev); + int (*remove_device)(u8 devfn, device_t *dev); +@@ -331,6 +331,7 @@ int __must_check iommu_suspend(void); + void iommu_resume(void); + void iommu_crash_shutdown(void); + int iommu_get_reserved_device_memory(iommu_grdm_t *, void *); ++int iommu_quarantine_dev_init(device_t *dev); + + void iommu_share_p2m_table(struct domain *d); + +-- +2.35.2 + + +From a6902a65160aac72a1889a268fd5f3cebb159d8e Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Fri, 8 Apr 2022 15:20:21 +0200 +Subject: [PATCH 30/32] VT-d: don't needlessly look up DID +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +If get_iommu_domid() in domain_context_unmap_one() fails, we better +wouldn't clear the context entry in the first place, as we're then unable +to issue the corresponding flush. However, we have no need to look up the +DID in the first place: What needs flushing is very specifically the DID +that was in the context entry before our clearing of it. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: 445ab9852d69d8957467f0036098ebec75fec092 +master date: 2022-04-07 12:29:03 +0200 +--- + xen/drivers/passthrough/vtd/iommu.c | 10 ++-------- + 1 file changed, 2 insertions(+), 8 deletions(-) + +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index 6571b5dde4..4b0d6a873c 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -1821,18 +1821,12 @@ int domain_context_unmap_one( + return 0; + } + ++ iommu_domid = context_domain_id(*context); ++ + context_clear_present(*context); + context_clear_entry(*context); + iommu_sync_cache(context, sizeof(struct context_entry)); + +- iommu_domid = get_iommu_did(domid, iommu, !domain->is_dying); +- if ( iommu_domid == -1 ) +- { +- spin_unlock(&iommu->lock); +- unmap_vtd_domain_page(context_entries); +- return -EINVAL; +- } +- + rc = iommu_flush_context_device(iommu, iommu_domid, + PCI_BDF2(bus, devfn), + DMA_CCMD_MASK_NOBIT, 0); +-- +2.35.2 + + +From d64d46685c776b39d5c640a0ad2727fa0938273c Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Fri, 8 Apr 2022 15:21:33 +0200 +Subject: [PATCH 31/32] VT-d: avoid NULL deref on domain_context_mapping_one() + error paths +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +First there's a printk() which actually wrongly uses pdev in the first +place: We want to log the coordinates of the (perhaps fake) device +acted upon, which may not be pdev. 
+
+Then it was quite pointless for eb19326a328d ("VT-d: prepare for per-
+device quarantine page tables (part I)") to add a domid_t parameter to
+domain_context_unmap_one(): It's only used to pass back here via
+me_wifi_quirk() -> map_me_phantom_function(). Drop the parameter again.
+
+Finally there's the invocation of domain_context_mapping_one(), which
+needs to be passed the correct domain ID. Avoid taking that path when
+pdev is NULL and the quarantine state is what would need restoring to.
+This means we can't security-support non-PCI-Express devices with RMRRs
+(if such exist in practice) any longer; note that as of the 1st of the
+two commits referenced below assigning them to DomU-s is unsupported
+anyway.
+
+Fixes: 8f41e481b485 ("VT-d: re-assign devices directly")
+Fixes: 14dd241aad8a ("IOMMU/x86: use per-device page tables for quarantining")
+Coverity ID: 1503784
+Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 608394b906e71587f02e6662597bc985bad33a5a
+master date: 2022-04-07 12:30:19 +0200
+---
+ xen/drivers/passthrough/vtd/extern.h | 2 +-
+ xen/drivers/passthrough/vtd/iommu.c | 34 ++++++++++++++++------------
+ xen/drivers/passthrough/vtd/quirks.c | 2 +-
+ 3 files changed, 21 insertions(+), 17 deletions(-)
+
+diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
+index 897dcff9ff..fbe951b2fa 100644
+--- a/xen/drivers/passthrough/vtd/extern.h
++++ b/xen/drivers/passthrough/vtd/extern.h
+@@ -89,7 +89,7 @@ int domain_context_mapping_one(struct domain *domain, struct vtd_iommu *iommu,
+ const struct pci_dev *pdev, domid_t domid,
+ paddr_t pgd_maddr, unsigned int mode);
+ int domain_context_unmap_one(struct domain *domain, struct vtd_iommu *iommu,
+- uint8_t bus, uint8_t devfn, domid_t domid);
++ uint8_t bus, uint8_t devfn);
+ int intel_iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt);
+ 
+ unsigned int io_apic_read_remap_rte(unsigned int apic, unsigned int reg);
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index 4b0d6a873c..cb3ba3e409 100644
+--- a/xen/drivers/passthrough/vtd/iommu.c
++++ b/xen/drivers/passthrough/vtd/iommu.c
+@@ -1527,7 +1527,7 @@ int domain_context_mapping_one(
+ check_cleanup_domid_map(domain, pdev, iommu);
+ printk(XENLOG_ERR
+ "%04x:%02x:%02x.%u: unexpected context entry %016lx_%016lx (expected %016lx_%016lx)\n",
+- pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
++ seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+ (uint64_t)(res >> 64), (uint64_t)res,
+ (uint64_t)(old >> 64), (uint64_t)old);
+ rc = -EILSEQ;
+@@ -1595,9 +1595,14 @@ int domain_context_mapping_one(
+ 
+ if ( rc )
+ {
+- if ( !prev_dom )
+- domain_context_unmap_one(domain, iommu, bus, devfn,
+- DEVICE_DOMID(domain, pdev));
++ if ( !prev_dom ||
++ /*
++ * Unmapping here means DEV_TYPE_PCI devices with RMRRs (if such
++ * exist) would cause problems if such a region was actually
++ * accessed.
++ */
++ (prev_dom == dom_io && !pdev) )
++ domain_context_unmap_one(domain, iommu, bus, devfn);
+ else if ( prev_dom != domain ) /* Avoid infinite recursion. 
*/ + domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev, + DEVICE_DOMID(prev_dom, pdev), +@@ -1734,7 +1739,9 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + * Strictly speaking if the device is the only one behind this bridge + * and the only one with this (secbus,0,0) tuple, it could be allowed + * to be re-assigned regardless of RMRR presence. But let's deal with +- * that case only if it is actually found in the wild. ++ * that case only if it is actually found in the wild. Note that ++ * dealing with this just here would still not render the operation ++ * secure. + */ + else if ( prev_present && (mode & MAP_WITH_RMRR) && + domain != pdev->domain ) +@@ -1800,7 +1807,7 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, + int domain_context_unmap_one( + struct domain *domain, + struct vtd_iommu *iommu, +- uint8_t bus, uint8_t devfn, domid_t domid) ++ uint8_t bus, uint8_t devfn) + { + struct context_entry *context, *context_entries; + u64 maddr; +@@ -1852,7 +1859,8 @@ int domain_context_unmap_one( + unmap_vtd_domain_page(context_entries); + + if ( !iommu->drhd->segment && !rc ) +- rc = me_wifi_quirk(domain, bus, devfn, domid, 0, UNMAP_ME_PHANTOM_FUNC); ++ rc = me_wifi_quirk(domain, bus, devfn, DOMID_INVALID, 0, ++ UNMAP_ME_PHANTOM_FUNC); + + if ( rc && !is_hardware_domain(domain) && domain != dom_io ) + { +@@ -1906,8 +1914,7 @@ static const struct acpi_drhd_unit *domain_context_unmap( + printk(VTDPREFIX "d%d:PCIe: unmap %04x:%02x:%02x.%u\n", + domain->domain_id, seg, bus, + PCI_SLOT(devfn), PCI_FUNC(devfn)); +- ret = domain_context_unmap_one(domain, iommu, bus, devfn, +- DEVICE_DOMID(domain, pdev)); ++ ret = domain_context_unmap_one(domain, iommu, bus, devfn); + if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 ) + disable_ats_device(pdev); + +@@ -1917,8 +1924,7 @@ static const struct acpi_drhd_unit *domain_context_unmap( + if ( iommu_debug ) + printk(VTDPREFIX "d%d:PCI: unmap %04x:%02x:%02x.%u\n", + domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); +- ret = domain_context_unmap_one(domain, iommu, bus, devfn, +- DEVICE_DOMID(domain, pdev)); ++ ret = domain_context_unmap_one(domain, iommu, bus, devfn); + if ( ret ) + break; + +@@ -1941,12 +1947,10 @@ static const struct acpi_drhd_unit *domain_context_unmap( + break; + } + +- ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn, +- DEVICE_DOMID(domain, pdev)); ++ ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn); + /* PCIe to PCI/PCIx bridge */ + if ( !ret && pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE ) +- ret = domain_context_unmap_one(domain, iommu, secbus, 0, +- DEVICE_DOMID(domain, pdev)); ++ ret = domain_context_unmap_one(domain, iommu, secbus, 0); + + break; + +diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c +index 4d54c21136..2b8a2bd9c6 100644 +--- a/xen/drivers/passthrough/vtd/quirks.c ++++ b/xen/drivers/passthrough/vtd/quirks.c +@@ -363,7 +363,7 @@ static int __must_check map_me_phantom_function(struct domain *domain, + domid, pgd_maddr, mode); + else + rc = domain_context_unmap_one(domain, drhd->iommu, 0, +- PCI_DEVFN(dev, 7), domid); ++ PCI_DEVFN(dev, 7)); + + return rc; + } +-- +2.35.2 + + +From fe97133b5deef58bd1422f4d87821131c66b1d0e Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Fri, 8 Apr 2022 15:22:49 +0200 +Subject: [PATCH 32/32] VT-d: avoid infinite recursion on + domain_context_mapping_one() error path +MIME-Version: 1.0 
+Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Despite the comment there infinite recursion was still possible, by +flip-flopping between two domains. This is because prev_dom is derived +from the DID found in the context entry, which was already updated by +the time error recovery is invoked. Simply introduce yet another mode +flag to prevent rolling back an in-progress roll-back of a prior +mapping attempt. + +Also drop the existing recursion prevention for having been dead anyway: +Earlier in the function we already bail when prev_dom == domain. + +Fixes: 8f41e481b485 ("VT-d: re-assign devices directly") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: 99d829dba1390b98a3ca07b365713e62182ee7ca +master date: 2022-04-07 12:31:16 +0200 +--- + xen/drivers/passthrough/vtd/iommu.c | 7 ++++--- + xen/drivers/passthrough/vtd/vtd.h | 3 ++- + 2 files changed, 6 insertions(+), 4 deletions(-) + +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index cb3ba3e409..f051a55764 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -1593,7 +1593,7 @@ int domain_context_mapping_one( + if ( !seg && !rc ) + rc = me_wifi_quirk(domain, bus, devfn, domid, pgd_maddr, mode); + +- if ( rc ) ++ if ( rc && !(mode & MAP_ERROR_RECOVERY) ) + { + if ( !prev_dom || + /* +@@ -1603,11 +1603,12 @@ int domain_context_mapping_one( + */ + (prev_dom == dom_io && !pdev) ) + domain_context_unmap_one(domain, iommu, bus, devfn); +- else if ( prev_dom != domain ) /* Avoid infinite recursion. */ ++ else + domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev, + DEVICE_DOMID(prev_dom, pdev), + DEVICE_PGTABLE(prev_dom, pdev), +- mode & MAP_WITH_RMRR); ++ (mode & MAP_WITH_RMRR) ++ | MAP_ERROR_RECOVERY); + } + + if ( prev_dom ) +diff --git a/xen/drivers/passthrough/vtd/vtd.h b/xen/drivers/passthrough/vtd/vtd.h +index e4ab242fee..cb2df76eed 100644 +--- a/xen/drivers/passthrough/vtd/vtd.h ++++ b/xen/drivers/passthrough/vtd/vtd.h +@@ -29,7 +29,8 @@ + #define MAP_WITH_RMRR (1u << 0) + #define MAP_OWNER_DYING (1u << 1) + #define MAP_SINGLE_DEVICE (1u << 2) +-#define UNMAP_ME_PHANTOM_FUNC (1u << 3) ++#define MAP_ERROR_RECOVERY (1u << 3) ++#define UNMAP_ME_PHANTOM_FUNC (1u << 4) + + /* Allow for both IOAPIC and IOSAPIC. */ + #define IO_xAPIC_route_entry IO_APIC_route_entry +-- +2.35.2 + diff --git a/main/xen/xsa386.patch b/main/xen/xsa386.patch deleted file mode 100644 index 83f24d30d5..0000000000 --- a/main/xen/xsa386.patch +++ /dev/null @@ -1,29 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: VT-d: fix deassign of device with RMRR -Date: Fri, 1 Oct 2021 15:05:42 +0200 - -Ignoring a specific error code here was not meant to short circuit -deassign to _just_ the unmapping of RMRRs. This bug was previously -hidden by the bogus (potentially indefinite) looping in -pci_release_devices(), until f591755823a7 ("IOMMU/PCI: don't let domain -cleanup continue when device de-assignment failed") fixed that loop. - -This is CVE-2021-28702 / XSA-386. 
- -Fixes: 8b99f4400b69 ("VT-d: fix RMRR related error handling") -Reported-by: Ivan Kardykov <kardykov@tabit.pro> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Tested-by: Ivan Kardykov <kardykov@tabit.pro> - ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -2409,7 +2409,7 @@ static int reassign_device_ownership( - ret = iommu_identity_mapping(source, p2m_access_x, - rmrr->base_address, - rmrr->end_address, 0); -- if ( ret != -ENOENT ) -+ if ( ret && ret != -ENOENT ) - return ret; - } - } - diff --git a/main/xen/xsa388-4.14-1.patch b/main/xen/xsa388-4.14-1.patch deleted file mode 100644 index f76f2d56b6..0000000000 --- a/main/xen/xsa388-4.14-1.patch +++ /dev/null @@ -1,174 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: x86/PoD: deal with misaligned GFNs - -Users of XENMEM_decrease_reservation and XENMEM_populate_physmap aren't -required to pass in order-aligned GFN values. (While I consider this -bogus, I don't think we can fix this there, as that might break existing -code, e.g Linux'es swiotlb, which - while affecting PV only - until -recently had been enforcing only page alignment on the original -allocation.) Only non-PoD code paths (guest_physmap_{add,remove}_page(), -p2m_set_entry()) look to be dealing with this properly (in part by being -implemented inefficiently, handling every 4k page separately). - -Introduce wrappers taking care of splitting the incoming request into -aligned chunks, without putting much effort in trying to determine the -largest possible chunk at every iteration. - -Also "handle" p2m_set_entry() failure for non-order-0 requests by -crashing the domain in one more place. Alongside putting a log message -there, also add one to the other similar path. - -Note regarding locking: This is left in the actual worker functions on -the assumption that callers aren't guaranteed atomicity wrt acting on -multiple pages at a time. For mis-aligned GFNs gfn_lock() wouldn't have -locked the correct GFN range anyway, if it didn't simply resolve to -p2m_lock(), and for well-behaved callers there continues to be only a -single iteration, i.e. behavior is unchanged for them. (FTAOD pulling -out just pod_lock() into p2m_pod_decrease_reservation() would result in -a lock order violation.) - -This is CVE-2021-28704 and CVE-2021-28707 / part of XSA-388. - -Fixes: 3c352011c0d3 ("x86/PoD: shorten certain operations on higher order ranges") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> - ---- a/xen/arch/x86/mm/p2m-pod.c -+++ b/xen/arch/x86/mm/p2m-pod.c -@@ -495,7 +495,7 @@ p2m_pod_zero_check_superpage(struct p2m_ - - - /* -- * This function is needed for two reasons: -+ * This pair of functions is needed for two reasons: - * + To properly handle clearing of PoD entries - * + To "steal back" memory being freed for the PoD cache, rather than - * releasing it. -@@ -503,8 +503,8 @@ p2m_pod_zero_check_superpage(struct p2m_ - * Once both of these functions have been completed, we can return and - * allow decrease_reservation() to handle everything else. - */ --unsigned long --p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order) -+static unsigned long -+decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order) - { - unsigned long ret = 0, i, n; - struct p2m_domain *p2m = p2m_get_hostp2m(d); -@@ -551,8 +551,10 @@ p2m_pod_decrease_reservation(struct doma - * All PoD: Mark the whole region invalid and tell caller - * we're done. 
- */ -- if ( p2m_set_entry(p2m, gfn, INVALID_MFN, order, p2m_invalid, -- p2m->default_access) ) -+ int rc = p2m_set_entry(p2m, gfn, INVALID_MFN, order, p2m_invalid, -+ p2m->default_access); -+ -+ if ( rc ) - { - /* - * If this fails, we can't tell how much of the range was changed. -@@ -560,7 +562,12 @@ p2m_pod_decrease_reservation(struct doma - * impossible. - */ - if ( order != 0 ) -+ { -+ printk(XENLOG_G_ERR -+ "%pd: marking GFN %#lx (order %u) as non-PoD failed: %d\n", -+ d, gfn_x(gfn), order, rc); - domain_crash(d); -+ } - goto out_unlock; - } - ret = 1UL << order; -@@ -667,6 +674,22 @@ out_unlock: - return ret; - } - -+unsigned long -+p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order) -+{ -+ unsigned long left = 1UL << order, ret = 0; -+ unsigned int chunk_order = find_first_set_bit(gfn_x(gfn) | left); -+ -+ do { -+ ret += decrease_reservation(d, gfn, chunk_order); -+ -+ left -= 1UL << chunk_order; -+ gfn = gfn_add(gfn, 1UL << chunk_order); -+ } while ( left ); -+ -+ return ret; -+} -+ - void p2m_pod_dump_data(struct domain *d) - { - struct p2m_domain *p2m = p2m_get_hostp2m(d); -@@ -1266,19 +1289,15 @@ remap_and_retry: - return true; - } - -- --int --guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn_l, -- unsigned int order) -+static int -+mark_populate_on_demand(struct domain *d, unsigned long gfn_l, -+ unsigned int order) - { - struct p2m_domain *p2m = p2m_get_hostp2m(d); - gfn_t gfn = _gfn(gfn_l); - unsigned long i, n, pod_count = 0; - int rc = 0; - -- if ( !paging_mode_translate(d) ) -- return -EINVAL; -- - gfn_lock(p2m, gfn, order); - - P2M_DEBUG("mark pod gfn=%#lx\n", gfn_l); -@@ -1316,12 +1335,44 @@ guest_physmap_mark_populate_on_demand(st - BUG_ON(p2m->pod.entry_count < 0); - pod_unlock(p2m); - } -+ else if ( order ) -+ { -+ /* -+ * If this failed, we can't tell how much of the range was changed. -+ * Best to crash the domain. -+ */ -+ printk(XENLOG_G_ERR -+ "%pd: marking GFN %#lx (order %u) as PoD failed: %d\n", -+ d, gfn_l, order, rc); -+ domain_crash(d); -+ } - - out: - gfn_unlock(p2m, gfn, order); - - return rc; - } -+ -+int -+guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn, -+ unsigned int order) -+{ -+ unsigned long left = 1UL << order; -+ unsigned int chunk_order = find_first_set_bit(gfn | left); -+ int rc; -+ -+ if ( !paging_mode_translate(d) ) -+ return -EINVAL; -+ -+ do { -+ rc = mark_populate_on_demand(d, gfn, chunk_order); -+ -+ left -= 1UL << chunk_order; -+ gfn += 1UL << chunk_order; -+ } while ( !rc && left ); -+ -+ return rc; -+} - - void p2m_pod_init(struct p2m_domain *p2m) - { diff --git a/main/xen/xsa388-4.14-2.patch b/main/xen/xsa388-4.14-2.patch deleted file mode 100644 index 2f8cc881f0..0000000000 --- a/main/xen/xsa388-4.14-2.patch +++ /dev/null @@ -1,36 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: x86/PoD: handle intermediate page orders in p2m_pod_cache_add() - -p2m_pod_decrease_reservation() may pass pages to the function which -aren't 4k, 2M, or 1G. Handle all intermediate orders as well, to avoid -hitting the BUG() at the switch() statement's "default" case. - -This is CVE-2021-28708 / part of XSA-388. 
- -Fixes: 3c352011c0d3 ("x86/PoD: shorten certain operations on higher order ranges") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> - ---- a/xen/arch/x86/mm/p2m-pod.c -+++ b/xen/arch/x86/mm/p2m-pod.c -@@ -111,15 +111,13 @@ p2m_pod_cache_add(struct p2m_domain *p2m - /* Then add to the appropriate populate-on-demand list. */ - switch ( order ) - { -- case PAGE_ORDER_1G: -- for ( i = 0; i < (1UL << PAGE_ORDER_1G); i += 1UL << PAGE_ORDER_2M ) -+ case PAGE_ORDER_2M ... PAGE_ORDER_1G: -+ for ( i = 0; i < (1UL << order); i += 1UL << PAGE_ORDER_2M ) - page_list_add_tail(page + i, &p2m->pod.super); - break; -- case PAGE_ORDER_2M: -- page_list_add_tail(page, &p2m->pod.super); -- break; -- case PAGE_ORDER_4K: -- page_list_add_tail(page, &p2m->pod.single); -+ case PAGE_ORDER_4K ... PAGE_ORDER_2M - 1: -+ for ( i = 0; i < (1UL << order); i += 1UL << PAGE_ORDER_4K ) -+ page_list_add_tail(page + i, &p2m->pod.single); - break; - default: - BUG(); diff --git a/main/xen/xsa389-4.13.patch b/main/xen/xsa389-4.13.patch deleted file mode 100644 index 10a8a9b9ed..0000000000 --- a/main/xen/xsa389-4.13.patch +++ /dev/null @@ -1,180 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: x86/P2M: deal with partial success of p2m_set_entry() - -M2P and PoD stats need to remain in sync with P2M; if an update succeeds -only partially, respective adjustments need to be made. If updates get -made before the call, they may also need undoing upon complete failure -(i.e. including the single-page case). - -Log-dirty state would better also be kept in sync. - -Note that the change to set_typed_p2m_entry() may not be strictly -necessary (due to the order restriction enforced near the top of the -function), but is being kept here to be on the safe side. - -This is CVE-2021-28705 and CVE-2021-28709 / XSA-389. - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> - ---- a/xen/arch/x86/mm/p2m.c -+++ b/xen/arch/x86/mm/p2m.c -@@ -781,6 +781,7 @@ p2m_remove_page(struct p2m_domain *p2m, - gfn_t gfn = _gfn(gfn_l); - p2m_type_t t; - p2m_access_t a; -+ int rc; - - /* IOMMU for PV guests is handled in get_page_type() and put_page(). */ - if ( !paging_mode_translate(p2m->domain) ) -@@ -812,8 +813,27 @@ p2m_remove_page(struct p2m_domain *p2m, - set_gpfn_from_mfn(mfn+i, INVALID_M2P_ENTRY); - } - } -- return p2m_set_entry(p2m, gfn, INVALID_MFN, page_order, p2m_invalid, -- p2m->default_access); -+ rc = p2m_set_entry(p2m, gfn, INVALID_MFN, page_order, p2m_invalid, -+ p2m->default_access); -+ if ( likely(!rc) || !mfn_valid(_mfn(mfn)) ) -+ return rc; -+ -+ /* -+ * The operation may have partially succeeded. For the failed part we need -+ * to undo the M2P update and, out of precaution, mark the pages dirty -+ * again. 
-+ */ -+ for ( i = 0; i < (1UL << page_order); ++i ) -+ { -+ p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0, NULL, NULL); -+ if ( !p2m_is_hole(t) && !p2m_is_special(t) && !p2m_is_shared(t) ) -+ { -+ set_gpfn_from_mfn(mfn + i, gfn_l + i); -+ paging_mark_pfn_dirty(p2m->domain, _pfn(gfn_l + i)); -+ } -+ } -+ -+ return rc; - } - - int -@@ -1002,13 +1022,8 @@ guest_physmap_add_entry(struct domain *d - - /* Now, actually do the two-way mapping */ - rc = p2m_set_entry(p2m, gfn, mfn, page_order, t, p2m->default_access); -- if ( rc == 0 ) -+ if ( likely(!rc) ) - { -- pod_lock(p2m); -- p2m->pod.entry_count -= pod_count; -- BUG_ON(p2m->pod.entry_count < 0); -- pod_unlock(p2m); -- - if ( !p2m_is_grant(t) ) - { - for ( i = 0; i < (1UL << page_order); i++ ) -@@ -1016,6 +1031,42 @@ guest_physmap_add_entry(struct domain *d - gfn_x(gfn_add(gfn, i))); - } - } -+ else -+ { -+ /* -+ * The operation may have partially succeeded. For the successful part -+ * we need to update M2P and dirty state, while for the failed part we -+ * may need to adjust PoD stats as well as undo the earlier M2P update. -+ */ -+ for ( i = 0; i < (1UL << page_order); ++i ) -+ { -+ omfn = p2m->get_entry(p2m, gfn_add(gfn, i), &ot, &a, 0, NULL, NULL); -+ if ( p2m_is_pod(ot) ) -+ { -+ BUG_ON(!pod_count); -+ --pod_count; -+ } -+ else if ( mfn_eq(omfn, mfn_add(mfn, i)) && ot == t && -+ a == p2m->default_access && !p2m_is_grant(t) ) -+ { -+ set_gpfn_from_mfn(mfn_x(omfn), gfn_x(gfn) + i); -+ paging_mark_pfn_dirty(d, _pfn(gfn_x(gfn) + i)); -+ } -+ else if ( p2m_is_ram(ot) && !p2m_is_paged(ot) ) -+ { -+ ASSERT(mfn_valid(omfn)); -+ set_gpfn_from_mfn(mfn_x(omfn), gfn_x(gfn) + i); -+ } -+ } -+ } -+ -+ if ( pod_count ) -+ { -+ pod_lock(p2m); -+ p2m->pod.entry_count -= pod_count; -+ BUG_ON(p2m->pod.entry_count < 0); -+ pod_unlock(p2m); -+ } - - out: - p2m_unlock(p2m); -@@ -1307,6 +1358,49 @@ static int set_typed_p2m_entry(struct do - return 0; - } - } -+ -+ P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn_l, mfn_x(mfn)); -+ rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access); -+ if ( unlikely(rc) ) -+ { -+ gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n", -+ gfn_l, order, rc, mfn_x(mfn)); -+ -+ /* -+ * The operation may have partially succeeded. For the successful part -+ * we need to update PoD stats, M2P, and dirty state. -+ */ -+ if ( order != PAGE_ORDER_4K ) -+ { -+ unsigned long i; -+ -+ for ( i = 0; i < (1UL << order); ++i ) -+ { -+ p2m_type_t t; -+ mfn_t cmfn = p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0, -+ NULL, NULL); -+ -+ if ( !mfn_eq(cmfn, mfn_add(mfn, i)) || t != gfn_p2mt || -+ a != access ) -+ continue; -+ -+ if ( p2m_is_ram(ot) ) -+ { -+ ASSERT(mfn_valid(mfn_add(omfn, i))); -+ set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY); -+ } -+#ifdef CONFIG_HVM -+ else if ( p2m_is_pod(ot) ) -+ { -+ pod_lock(p2m); -+ BUG_ON(!p2m->pod.entry_count); -+ --p2m->pod.entry_count; -+ pod_unlock(p2m); -+ } -+#endif -+ } -+ } -+ } - else if ( p2m_is_ram(ot) ) - { - unsigned long i; -@@ -1317,12 +1411,6 @@ static int set_typed_p2m_entry(struct do - set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY); - } - } -- -- P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn_l, mfn_x(mfn)); -- rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access); -- if ( rc ) -- gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n", -- gfn_l, order, rc, mfn_x(mfn)); - #ifdef CONFIG_HVM - else if ( p2m_is_pod(ot) ) - { |