author     omni <omni+alpine@hack.org>    2022-04-20 20:57:50 +0000
committer  omni <omni+alpine@hack.org>    2022-04-22 09:30:08 +0000
commit     e5a8c3fe7051ee091719ddc4d39feb0c3bb53abb (patch)
tree       9b77af7abd9fa8fcb83e2c90717a3ca78f91c57a
parent     5a2d38340aca4e112aa285a1ec604b66622733d8 (diff)
main/xen: add upstream XSA patches
With my limitations in knowledge and time, I chose to create a single .patch file containing all commits on the upstream stable-4.13 branch, from the 4.13.4 release up to and including the currently latest commit:

    git format-patch b4bb02d^..fe97133 --stdout > xen-stable-4.13_git20220408.patch

This includes the CVE/XSA patches, their prerequisites, and additional fixes.

https://xenbits.xen.org/gitweb/?p=xen.git;a=shortlog;h=refs/heads/stable-4.13
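A minimal sketch of how such a combined patch can be reproduced from upstream, assuming an anonymous clone of xen.git (the clone URL below is an assumption; the gitweb link above is the authoritative reference):

    # Sketch only: clone the upstream stable-4.13 branch and export the commit
    # range as one patch file.
    # NOTE: the clone URL is an assumption; use whichever xen.git mirror you trust.
    git clone --branch stable-4.13 https://xenbits.xen.org/git-http/xen.git
    cd xen
    # b4bb02d is the first commit in the range (patch 01/32 below),
    # fe97133 the stable-4.13 tip as of 2022-04-08.
    git format-patch b4bb02d^..fe97133 --stdout > xen-stable-4.13_git20220408.patch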
-rw-r--r--  main/xen/APKBUILD                            |   26
-rw-r--r--  main/xen/xen-stable-4.13_git20220408.patch   | 5414
-rw-r--r--  main/xen/xsa386.patch                        |   29
-rw-r--r--  main/xen/xsa388-4.14-1.patch                 |  174
-rw-r--r--  main/xen/xsa388-4.14-2.patch                 |   36
-rw-r--r--  main/xen/xsa389-4.13.patch                   |  180
6 files changed, 5430 insertions(+), 429 deletions(-)
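The APKBUILD hunks below drop the four standalone xsa*.patch files in favour of the combined patch, so the sha512sums= block changes as well. A minimal sketch of regenerating those checksums, assuming a local aports checkout with Alpine's abuild tooling installed:

    # Sketch only: from the package directory, after editing source= in the
    # APKBUILD and placing the new patch file next to it.
    cd aports/main/xen
    abuild checksum   # rewrites the sha512sums= block for the current source= list
    abuild -r         # optional: build the package to confirm the patch applies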
diff --git a/main/xen/APKBUILD b/main/xen/APKBUILD
index 8e071e260e..eef4058947 100644
--- a/main/xen/APKBUILD
+++ b/main/xen/APKBUILD
@@ -2,7 +2,7 @@
# Maintainer: Natanael Copa <ncopa@alpinelinux.org>
pkgname=xen
pkgver=4.13.4
-pkgrel=2
+pkgrel=3
pkgdesc="Xen hypervisor"
url="https://www.xenproject.org/"
arch="x86_64 armhf aarch64" # enable armv7 when builds with gcc8
@@ -241,6 +241,19 @@ options="!strip"
# - CVE-2021-28708 XSA-388
# - CVE-2021-28705 XSA-389
# - CVE-2021-28706 XSA-389
+# 4.13.4-r3:
+# - CVE-2021-28706 XSA-385
+# - CVE-2022-23033 XSA-393
+# - CVE-2022-23034 XSA-394
+# - CVE-2022-23035 XSA-395
+# - CVE-2022-26356 XSA-397
+# - CVE-2022-23960 XSA-398
+# - CVE-2021-26401 XSA-398
+# - CVE-2022-26357 XSA-399
+# - CVE-2022-26358 XSA-400
+# - CVE-2022-26359 XSA-400
+# - CVE-2022-26360 XSA-400
+# - CVE-2022-26361 XSA-400
case "$CARCH" in
@@ -307,11 +320,7 @@ source="https://downloads.xenproject.org/release/xen/$pkgver/xen-$pkgver.tar.gz
drop-test.py.patch
py3-compat.patch
- xsa386.patch
-
- xsa388-4.14-1.patch
- xsa388-4.14-2.patch
- xsa389-4.13.patch
+ xen-stable-4.13_git20220408.patch
xenstored.initd
xenstored.confd
@@ -560,10 +569,7 @@ e76816c6ad0e91dc5f81947f266da3429b20e6d976c3e8c41202c6179532eec878a3f0913921ef3a
8c9cfc6afca325df1d8026e21ed03fa8cd2c7e1a21a56cc1968301c5ab634bfe849951899e75d328951d7a41273d1e49a2448edbadec0029ed410c43c0549812 hotplug-Linux-iscsi-block-handle-lun-1.patch
61f66bab603778fb41bfe8e85320c15f2bf3e5d8583e077b56a93784dbdb9b2c7c5e55ce18f06b87501429086f8410d102d3ed5f2a77d54bcfa328bc07681f4d drop-test.py.patch
8cb12dbfc05a53898a97d47d71ab6b8a6f81c5e5579fd765b37303faea95c645cb8dedc05e3d064bdf070e93814e00bf8939767acc1127513375bab0fe2f4436 py3-compat.patch
-77811232c5cf199d24fb8e4a5367a56d56e61ad218397913fa22bd89d0dffabe92acfded246aa731d450f80dcffee84268b27e73e60f19eec15d0ada988a0574 xsa386.patch
-5e8165695a7e5a7fdc332de0d4ee31626eb72c8765f12855543592cb86f0eb4f98ea49cae31c8fc356a0645f6a2fe05ddf2b38f9f2bb04196bb4b9efc204dc26 xsa388-4.14-1.patch
-9e7b5f66480d3c0898cc080d0506dddbe35a814ccd72619abb82e8241b8cddc726e7bb38ce818335451b56ba549ed9ea1743f46fb9f0fd81ac1310ec6e94fea4 xsa388-4.14-2.patch
-bd18e7f61a28ebd99f8d7fe33b6130646493489bd4a21fa9febb81860b3c4a6c20aaf51f1cfa7c19340dbd21333c2e6859f852868f8de29e2862bd93e02040ba xsa389-4.13.patch
+f02f939fc9f788e99c7363e1e385e83acaa5725594eb4b37597b824ec8f853ba0f91ee0d17ebcf59c3ae4ed08eaa4ae79e3572602a67d51ed46ed900a63054e1 xen-stable-4.13_git20220408.patch
52c43beb2596d645934d0f909f2d21f7587b6898ed5e5e7046799a8ed6d58f7a09c5809e1634fa26152f3fd4f3e7cfa07da7076f01b4a20cc8f5df8b9cb77e50 xenstored.initd
093f7fbd43faf0a16a226486a0776bade5dc1681d281c5946a3191c32d74f9699c6bf5d0ab8de9d1195a2461165d1660788e92a3156c9b3c7054d7b2d52d7ff0 xenstored.confd
3c86ed48fbee0af4051c65c4a3893f131fa66e47bf083caf20c9b6aa4b63fdead8832f84a58d0e27964bc49ec8397251b34e5be5c212c139f556916dc8da9523 xenconsoled.initd
diff --git a/main/xen/xen-stable-4.13_git20220408.patch b/main/xen/xen-stable-4.13_git20220408.patch
new file mode 100644
index 0000000000..33c1738ac2
--- /dev/null
+++ b/main/xen/xen-stable-4.13_git20220408.patch
@@ -0,0 +1,5414 @@
+From b4bb02d5999a56c93f0733b589b717e7cece9c09 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Fri, 1 Oct 2021 15:05:42 +0200
+Subject: [PATCH 01/32] VT-d: fix deassign of device with RMRR
+
+Ignoring a specific error code here was not meant to short circuit
+deassign to _just_ the unmapping of RMRRs. This bug was previously
+hidden by the bogus (potentially indefinite) looping in
+pci_release_devices(), until f591755823a7 ("IOMMU/PCI: don't let domain
+cleanup continue when device de-assignment failed") fixed that loop.
+
+This is CVE-2021-28702 / XSA-386.
+
+Fixes: 8b99f4400b69 ("VT-d: fix RMRR related error handling")
+Reported-by: Ivan Kardykov <kardykov@tabit.pro>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Tested-by: Ivan Kardykov <kardykov@tabit.pro>
+(cherry picked from commit 24ebe875a77833696bbe5c9372e9e1590a7e7101)
+---
+ xen/drivers/passthrough/vtd/iommu.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index 98787ce3a8..af8b9ca0e4 100644
+--- a/xen/drivers/passthrough/vtd/iommu.c
++++ b/xen/drivers/passthrough/vtd/iommu.c
+@@ -2393,7 +2393,7 @@ static int reassign_device_ownership(
+ ret = iommu_identity_mapping(source, p2m_access_x,
+ rmrr->base_address,
+ rmrr->end_address, 0);
+- if ( ret != -ENOENT )
++ if ( ret && ret != -ENOENT )
+ return ret;
+ }
+ }
+--
+2.35.2
+
+
+From 0b28069aa7c26288376040e6ee9ca145245db39e Mon Sep 17 00:00:00 2001
+From: Julien Grall <jgrall@amazon.com>
+Date: Tue, 23 Nov 2021 13:32:26 +0100
+Subject: [PATCH 02/32] xen/page_alloc: Harden assign_pages()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+domain_tot_pages() and d->max_pages are 32-bit values. While the order
+should always be quite small, it would still be possible to overflow
+if domain_tot_pages() is near to (2^32 - 1).
+
+As this code may be called by a guest via XENMEM_increase_reservation
+and XENMEM_populate_physmap, we want to make sure the guest is not going
+to be able to allocate more than it is allowed.
+
+Rework the allocation check to avoid any possible overflow. While the
+check domain_tot_pages() < d->max_pages should technically not be
+necessary, it is probably best to have it to catch any possible
+inconsistencies in the future.
+
+This is CVE-2021-28706 / part of XSA-385.
+
+Signed-off-by: Julien Grall <jgrall@amazon.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 143501861d48e1bfef495849fd68584baac05849
+master date: 2021-11-22 11:11:05 +0000
+---
+ xen/common/grant_table.c | 7 ++++---
+ xen/common/page_alloc.c | 19 ++++++++++++++-----
+ 2 files changed, 18 insertions(+), 8 deletions(-)
+
+diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
+index d2853a664a..7b775a8c35 100644
+--- a/xen/common/grant_table.c
++++ b/xen/common/grant_table.c
+@@ -2286,7 +2286,8 @@ gnttab_transfer(
+ * pages when it is dying.
+ */
+ if ( unlikely(e->is_dying) ||
+- unlikely(e->tot_pages >= e->max_pages) )
++ unlikely(e->tot_pages >= e->max_pages) ||
++ unlikely(!(e->tot_pages + 1)) )
+ {
+ spin_unlock(&e->page_alloc_lock);
+
+@@ -2295,8 +2296,8 @@ gnttab_transfer(
+ e->domain_id);
+ else
+ gdprintk(XENLOG_INFO,
+- "Transferee d%d has no headroom (tot %u, max %u)\n",
+- e->domain_id, e->tot_pages, e->max_pages);
++ "Transferee %pd has no headroom (tot %u, max %u)\n",
++ e, e->tot_pages, e->max_pages);
+
+ gop.status = GNTST_general_error;
+ goto unlock_and_copyback;
+diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
+index 1563188f4f..0976bf6489 100644
+--- a/xen/common/page_alloc.c
++++ b/xen/common/page_alloc.c
+@@ -2276,16 +2276,25 @@ int assign_pages(
+
+ if ( !(memflags & MEMF_no_refcount) )
+ {
+- if ( unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
++ unsigned int nr = 1u << order;
++
++ if ( unlikely(d->tot_pages > d->max_pages) )
++ {
++ gprintk(XENLOG_INFO, "Inconsistent allocation for %pd: %u > %u\n",
++ d, d->tot_pages, d->max_pages);
++ rc = -EPERM;
++ goto out;
++ }
++
++ if ( unlikely(nr > d->max_pages - d->tot_pages) )
+ {
+- gprintk(XENLOG_INFO, "Over-allocation for domain %u: "
+- "%u > %u\n", d->domain_id,
+- d->tot_pages + (1 << order), d->max_pages);
++ gprintk(XENLOG_INFO, "Over-allocation for %pd: %Lu > %u\n",
++ d, d->tot_pages + 0ull + nr, d->max_pages);
+ rc = -E2BIG;
+ goto out;
+ }
+
+- if ( unlikely(domain_adjust_tot_pages(d, 1 << order) == (1 << order)) )
++ if ( unlikely(domain_adjust_tot_pages(d, nr) == nr) )
+ get_knownalive_domain(d);
+ }
+
+--
+2.35.2
+
+
+From d94d006ed36084914c2931641b724ae262e3fb80 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 23 Nov 2021 13:32:54 +0100
+Subject: [PATCH 03/32] x86/PoD: deal with misaligned GFNs
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Users of XENMEM_decrease_reservation and XENMEM_populate_physmap aren't
+required to pass in order-aligned GFN values. (While I consider this
+bogus, I don't think we can fix this there, as that might break existing
+code, e.g Linux'es swiotlb, which - while affecting PV only - until
+recently had been enforcing only page alignment on the original
+allocation.) Only non-PoD code paths (guest_physmap_{add,remove}_page(),
+p2m_set_entry()) look to be dealing with this properly (in part by being
+implemented inefficiently, handling every 4k page separately).
+
+Introduce wrappers taking care of splitting the incoming request into
+aligned chunks, without putting much effort in trying to determine the
+largest possible chunk at every iteration.
+
+Also "handle" p2m_set_entry() failure for non-order-0 requests by
+crashing the domain in one more place. Alongside putting a log message
+there, also add one to the other similar path.
+
+Note regarding locking: This is left in the actual worker functions on
+the assumption that callers aren't guaranteed atomicity wrt acting on
+multiple pages at a time. For mis-aligned GFNs gfn_lock() wouldn't have
+locked the correct GFN range anyway, if it didn't simply resolve to
+p2m_lock(), and for well-behaved callers there continues to be only a
+single iteration, i.e. behavior is unchanged for them. (FTAOD pulling
+out just pod_lock() into p2m_pod_decrease_reservation() would result in
+a lock order violation.)
+
+This is CVE-2021-28704 and CVE-2021-28707 / part of XSA-388.
+
+Fixes: 3c352011c0d3 ("x86/PoD: shorten certain operations on higher order ranges")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 182c737b9ba540ebceb1433f3940fbed6eac4ea9
+master date: 2021-11-22 12:27:30 +0000
+---
+ xen/arch/x86/mm/p2m-pod.c | 75 ++++++++++++++++++++++++++++++++-------
+ 1 file changed, 63 insertions(+), 12 deletions(-)
+
+diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c
+index 007cdd87d0..c14801f5ff 100644
+--- a/xen/arch/x86/mm/p2m-pod.c
++++ b/xen/arch/x86/mm/p2m-pod.c
+@@ -495,7 +495,7 @@ p2m_pod_zero_check_superpage(struct p2m_domain *p2m, gfn_t gfn);
+
+
+ /*
+- * This function is needed for two reasons:
++ * This pair of functions is needed for two reasons:
+ * + To properly handle clearing of PoD entries
+ * + To "steal back" memory being freed for the PoD cache, rather than
+ * releasing it.
+@@ -503,8 +503,8 @@ p2m_pod_zero_check_superpage(struct p2m_domain *p2m, gfn_t gfn);
+ * Once both of these functions have been completed, we can return and
+ * allow decrease_reservation() to handle everything else.
+ */
+-unsigned long
+-p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order)
++static unsigned long
++decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order)
+ {
+ unsigned long ret = 0, i, n;
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+@@ -557,8 +557,10 @@ p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order)
+ * All PoD: Mark the whole region invalid and tell caller
+ * we're done.
+ */
+- if ( p2m_set_entry(p2m, gfn, INVALID_MFN, order, p2m_invalid,
+- p2m->default_access) )
++ int rc = p2m_set_entry(p2m, gfn, INVALID_MFN, order, p2m_invalid,
++ p2m->default_access);
++
++ if ( rc )
+ {
+ /*
+ * If this fails, we can't tell how much of the range was changed.
+@@ -566,7 +568,12 @@ p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order)
+ * impossible.
+ */
+ if ( order != 0 )
++ {
++ printk(XENLOG_G_ERR
++ "%pd: marking GFN %#lx (order %u) as non-PoD failed: %d\n",
++ d, gfn_x(gfn), order, rc);
+ domain_crash(d);
++ }
+ goto out_unlock;
+ }
+ ret = 1UL << order;
+@@ -674,6 +681,22 @@ out_unlock:
+ return ret;
+ }
+
++unsigned long
++p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order)
++{
++ unsigned long left = 1UL << order, ret = 0;
++ unsigned int chunk_order = find_first_set_bit(gfn_x(gfn) | left);
++
++ do {
++ ret += decrease_reservation(d, gfn, chunk_order);
++
++ left -= 1UL << chunk_order;
++ gfn = gfn_add(gfn, 1UL << chunk_order);
++ } while ( left );
++
++ return ret;
++}
++
+ void p2m_pod_dump_data(struct domain *d)
+ {
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+@@ -1269,19 +1292,15 @@ remap_and_retry:
+ return true;
+ }
+
+-
+-int
+-guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn_l,
+- unsigned int order)
++static int
++mark_populate_on_demand(struct domain *d, unsigned long gfn_l,
++ unsigned int order)
+ {
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+ gfn_t gfn = _gfn(gfn_l);
+ unsigned long i, n, pod_count = 0;
+ int rc = 0;
+
+- if ( !paging_mode_translate(d) )
+- return -EINVAL;
+-
+ gfn_lock(p2m, gfn, order);
+
+ P2M_DEBUG("mark pod gfn=%#lx\n", gfn_l);
+@@ -1319,6 +1338,17 @@ guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn_l,
+ BUG_ON(p2m->pod.entry_count < 0);
+ pod_unlock(p2m);
+ }
++ else if ( order )
++ {
++ /*
++ * If this failed, we can't tell how much of the range was changed.
++ * Best to crash the domain.
++ */
++ printk(XENLOG_G_ERR
++ "%pd: marking GFN %#lx (order %u) as PoD failed: %d\n",
++ d, gfn_l, order, rc);
++ domain_crash(d);
++ }
+
+ out:
+ gfn_unlock(p2m, gfn, order);
+@@ -1326,6 +1356,27 @@ out:
+ return rc;
+ }
+
++int
++guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
++ unsigned int order)
++{
++ unsigned long left = 1UL << order;
++ unsigned int chunk_order = find_first_set_bit(gfn | left);
++ int rc;
++
++ if ( !paging_mode_translate(d) )
++ return -EINVAL;
++
++ do {
++ rc = mark_populate_on_demand(d, gfn, chunk_order);
++
++ left -= 1UL << chunk_order;
++ gfn += 1UL << chunk_order;
++ } while ( !rc && left );
++
++ return rc;
++}
++
+ void p2m_pod_init(struct p2m_domain *p2m)
+ {
+ unsigned int i;
+--
+2.35.2
+
+
+From d3cfb4b3a680d3e2ddd36f18201d48441f36aea0 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 23 Nov 2021 13:33:14 +0100
+Subject: [PATCH 04/32] x86/PoD: handle intermediate page orders in
+ p2m_pod_cache_add()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+p2m_pod_decrease_reservation() may pass pages to the function which
+aren't 4k, 2M, or 1G. Handle all intermediate orders as well, to avoid
+hitting the BUG() at the switch() statement's "default" case.
+
+This is CVE-2021-28708 / part of XSA-388.
+
+Fixes: 3c352011c0d3 ("x86/PoD: shorten certain operations on higher order ranges")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 8ec13f68e0b026863d23e7f44f252d06478bc809
+master date: 2021-11-22 12:27:30 +0000
+---
+ xen/arch/x86/mm/p2m-pod.c | 12 +++++-------
+ 1 file changed, 5 insertions(+), 7 deletions(-)
+
+diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c
+index c14801f5ff..c981200087 100644
+--- a/xen/arch/x86/mm/p2m-pod.c
++++ b/xen/arch/x86/mm/p2m-pod.c
+@@ -111,15 +111,13 @@ p2m_pod_cache_add(struct p2m_domain *p2m,
+ /* Then add to the appropriate populate-on-demand list. */
+ switch ( order )
+ {
+- case PAGE_ORDER_1G:
+- for ( i = 0; i < (1UL << PAGE_ORDER_1G); i += 1UL << PAGE_ORDER_2M )
++ case PAGE_ORDER_2M ... PAGE_ORDER_1G:
++ for ( i = 0; i < (1UL << order); i += 1UL << PAGE_ORDER_2M )
+ page_list_add_tail(page + i, &p2m->pod.super);
+ break;
+- case PAGE_ORDER_2M:
+- page_list_add_tail(page, &p2m->pod.super);
+- break;
+- case PAGE_ORDER_4K:
+- page_list_add_tail(page, &p2m->pod.single);
++ case PAGE_ORDER_4K ... PAGE_ORDER_2M - 1:
++ for ( i = 0; i < (1UL << order); i += 1UL << PAGE_ORDER_4K )
++ page_list_add_tail(page + i, &p2m->pod.single);
+ break;
+ default:
+ BUG();
+--
+2.35.2
+
+
+From d3c2319ea1657f31ae3899713afc23789b771c10 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 23 Nov 2021 13:33:33 +0100
+Subject: [PATCH 05/32] x86/P2M: deal with partial success of p2m_set_entry()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+M2P and PoD stats need to remain in sync with P2M; if an update succeeds
+only partially, respective adjustments need to be made. If updates get
+made before the call, they may also need undoing upon complete failure
+(i.e. including the single-page case).
+
+Log-dirty state would better also be kept in sync.
+
+Note that the change to set_typed_p2m_entry() may not be strictly
+necessary (due to the order restriction enforced near the top of the
+function), but is being kept here to be on the safe side.
+
+This is CVE-2021-28705 and CVE-2021-28709 / XSA-389.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 74a11c43fd7e074b1f77631b446dd2115eacb9e8
+master date: 2021-11-22 12:27:30 +0000
+---
+ xen/arch/x86/mm/p2m.c | 116 +++++++++++++++++++++++++++++++++++++-----
+ 1 file changed, 102 insertions(+), 14 deletions(-)
+
+diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
+index a68b4fe526..a6bfda010a 100644
+--- a/xen/arch/x86/mm/p2m.c
++++ b/xen/arch/x86/mm/p2m.c
+@@ -781,6 +781,7 @@ p2m_remove_page(struct p2m_domain *p2m, unsigned long gfn_l, unsigned long mfn,
+ gfn_t gfn = _gfn(gfn_l);
+ p2m_type_t t;
+ p2m_access_t a;
++ int rc;
+
+ /* IOMMU for PV guests is handled in get_page_type() and put_page(). */
+ if ( !paging_mode_translate(p2m->domain) )
+@@ -812,8 +813,27 @@ p2m_remove_page(struct p2m_domain *p2m, unsigned long gfn_l, unsigned long mfn,
+ set_gpfn_from_mfn(mfn+i, INVALID_M2P_ENTRY);
+ }
+ }
+- return p2m_set_entry(p2m, gfn, INVALID_MFN, page_order, p2m_invalid,
+- p2m->default_access);
++ rc = p2m_set_entry(p2m, gfn, INVALID_MFN, page_order, p2m_invalid,
++ p2m->default_access);
++ if ( likely(!rc) || !mfn_valid(_mfn(mfn)) )
++ return rc;
++
++ /*
++ * The operation may have partially succeeded. For the failed part we need
++ * to undo the M2P update and, out of precaution, mark the pages dirty
++ * again.
++ */
++ for ( i = 0; i < (1UL << page_order); ++i )
++ {
++ p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0, NULL, NULL);
++ if ( !p2m_is_hole(t) && !p2m_is_special(t) && !p2m_is_shared(t) )
++ {
++ set_gpfn_from_mfn(mfn + i, gfn_l + i);
++ paging_mark_pfn_dirty(p2m->domain, _pfn(gfn_l + i));
++ }
++ }
++
++ return rc;
+ }
+
+ int
+@@ -1002,13 +1022,8 @@ guest_physmap_add_entry(struct domain *d, gfn_t gfn, mfn_t mfn,
+
+ /* Now, actually do the two-way mapping */
+ rc = p2m_set_entry(p2m, gfn, mfn, page_order, t, p2m->default_access);
+- if ( rc == 0 )
++ if ( likely(!rc) )
+ {
+- pod_lock(p2m);
+- p2m->pod.entry_count -= pod_count;
+- BUG_ON(p2m->pod.entry_count < 0);
+- pod_unlock(p2m);
+-
+ if ( !p2m_is_grant(t) )
+ {
+ for ( i = 0; i < (1UL << page_order); i++ )
+@@ -1016,6 +1031,42 @@ guest_physmap_add_entry(struct domain *d, gfn_t gfn, mfn_t mfn,
+ gfn_x(gfn_add(gfn, i)));
+ }
+ }
++ else
++ {
++ /*
++ * The operation may have partially succeeded. For the successful part
++ * we need to update M2P and dirty state, while for the failed part we
++ * may need to adjust PoD stats as well as undo the earlier M2P update.
++ */
++ for ( i = 0; i < (1UL << page_order); ++i )
++ {
++ omfn = p2m->get_entry(p2m, gfn_add(gfn, i), &ot, &a, 0, NULL, NULL);
++ if ( p2m_is_pod(ot) )
++ {
++ BUG_ON(!pod_count);
++ --pod_count;
++ }
++ else if ( mfn_eq(omfn, mfn_add(mfn, i)) && ot == t &&
++ a == p2m->default_access && !p2m_is_grant(t) )
++ {
++ set_gpfn_from_mfn(mfn_x(omfn), gfn_x(gfn) + i);
++ paging_mark_pfn_dirty(d, _pfn(gfn_x(gfn) + i));
++ }
++ else if ( p2m_is_ram(ot) && !p2m_is_paged(ot) )
++ {
++ ASSERT(mfn_valid(omfn));
++ set_gpfn_from_mfn(mfn_x(omfn), gfn_x(gfn) + i);
++ }
++ }
++ }
++
++ if ( pod_count )
++ {
++ pod_lock(p2m);
++ p2m->pod.entry_count -= pod_count;
++ BUG_ON(p2m->pod.entry_count < 0);
++ pod_unlock(p2m);
++ }
+
+ out:
+ p2m_unlock(p2m);
+@@ -1307,6 +1358,49 @@ static int set_typed_p2m_entry(struct domain *d, unsigned long gfn_l,
+ return 0;
+ }
+ }
++
++ P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn_l, mfn_x(mfn));
++ rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access);
++ if ( unlikely(rc) )
++ {
++ gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n",
++ gfn_l, order, rc, mfn_x(mfn));
++
++ /*
++ * The operation may have partially succeeded. For the successful part
++ * we need to update PoD stats, M2P, and dirty state.
++ */
++ if ( order != PAGE_ORDER_4K )
++ {
++ unsigned long i;
++
++ for ( i = 0; i < (1UL << order); ++i )
++ {
++ p2m_type_t t;
++ mfn_t cmfn = p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0,
++ NULL, NULL);
++
++ if ( !mfn_eq(cmfn, mfn_add(mfn, i)) || t != gfn_p2mt ||
++ a != access )
++ continue;
++
++ if ( p2m_is_ram(ot) )
++ {
++ ASSERT(mfn_valid(mfn_add(omfn, i)));
++ set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY);
++ }
++#ifdef CONFIG_HVM
++ else if ( p2m_is_pod(ot) )
++ {
++ pod_lock(p2m);
++ BUG_ON(!p2m->pod.entry_count);
++ --p2m->pod.entry_count;
++ pod_unlock(p2m);
++ }
++#endif
++ }
++ }
++ }
+ else if ( p2m_is_ram(ot) )
+ {
+ unsigned long i;
+@@ -1317,12 +1411,6 @@ static int set_typed_p2m_entry(struct domain *d, unsigned long gfn_l,
+ set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY);
+ }
+ }
+-
+- P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn_l, mfn_x(mfn));
+- rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access);
+- if ( rc )
+- gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n",
+- gfn_l, order, rc, mfn_x(mfn));
+ #ifdef CONFIG_HVM
+ else if ( p2m_is_pod(ot) )
+ {
+--
+2.35.2
+
+
+From d0e2c2762b981abd984af66a844ac12d8bf8f813 Mon Sep 17 00:00:00 2001
+From: Ian Jackson <iwj@xenproject.org>
+Date: Mon, 6 Dec 2021 14:40:24 +0000
+Subject: [PATCH 06/32] MAINTAINERS: Resign from tools stable branch
+ maintainership
+
+Signed-off-by: Ian Jackson <iwj@xenproject.org>
+Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
+(cherry picked from commit c623a84c2a4fda1cd25f5347a6298706218eb5fb)
+(cherry picked from commit c4cf5388652e8434652e30c73aa79635b4253675)
+---
+ MAINTAINERS | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/MAINTAINERS b/MAINTAINERS
+index 806e02b4f8..bdd885ddff 100644
+--- a/MAINTAINERS
++++ b/MAINTAINERS
+@@ -60,7 +60,7 @@ The maintainer for this branch is:
+
+ Tools backport requests should also be copied to:
+
+- Ian Jackson <Ian.Jackson@eu.citrix.com>
++ TODO - Loooking for new tools stable maintainer
+
+
+ Unstable Subsystem Maintainers
+--
+2.35.2
+
+
+From 2d601a5ca15e02820d08232ad64add8b8374b81c Mon Sep 17 00:00:00 2001
+From: Julien Grall <jgrall@amazon.com>
+Date: Tue, 25 Jan 2022 14:44:21 +0100
+Subject: [PATCH 07/32] xen/arm: p2m: Always clear the P2M entry when the
+ mapping is removed
+
+Commit 2148a125b73b ("xen/arm: Track page accessed between batch of
+Set/Way operations") allowed an entry to be invalid from the CPU PoV
+(lpae_is_valid()) but valid for Xen (p2m_is_valid()). This is useful
+to track which page is accessed and only perform an action on them
+(e.g. clean & invalidate the cache after a set/way instruction).
+
+Unfortunately, __p2m_set_entry() is only zeroing the P2M entry when
+lpae_is_valid() returns true. This means the entry will not be zeroed
+if the entry was valid from Xen PoV but invalid from the CPU PoV for
+tracking purpose.
+
+As a consequence, this will allow a domain to continue to access the
+page after it was removed.
+
+Resolve the issue by always zeroing the entry if it the LPAE bit is
+set or the entry is about to be removed.
+
+This is CVE-2022-23033 / XSA-393.
+
+Reported-by: Dmytro Firsov <Dmytro_Firsov@epam.com>
+Fixes: 2148a125b73b ("xen/arm: Track page accessed between batch of Set/Way operations")
+Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
+Signed-off-by: Julien Grall <jgrall@amazon.com>
+master commit: a428b913a002eb2b7425b48029c20a52eeee1b5a
+master date: 2022-01-25 13:25:01 +0100
+---
+ xen/arch/arm/p2m.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
+index ce59f2b503..993fe4ded2 100644
+--- a/xen/arch/arm/p2m.c
++++ b/xen/arch/arm/p2m.c
+@@ -1012,7 +1012,7 @@ static int __p2m_set_entry(struct p2m_domain *p2m,
+ * sequence when updating the translation table (D4.7.1 in ARM DDI
+ * 0487A.j).
+ */
+- if ( lpae_is_valid(orig_pte) )
++ if ( lpae_is_valid(orig_pte) || removing_mapping )
+ p2m_remove_pte(entry, p2m->clean_pte);
+
+ if ( removing_mapping )
+--
+2.35.2
+
+
+From e48c7878e54a5f970c00abed2cfd747858f0d592 Mon Sep 17 00:00:00 2001
+From: Julien Grall <jgrall@amazon.com>
+Date: Tue, 25 Jan 2022 14:44:47 +0100
+Subject: [PATCH 08/32] xen/grant-table: Only decrement the refcounter when
+ grant is fully unmapped
+
+The grant unmapping hypercall (GNTTABOP_unmap_grant_ref) is not a
+simple revert of the changes done by the grant mapping hypercall
+(GNTTABOP_map_grant_ref).
+
+Instead, it is possible to partially (or even not) clear some flags.
+This will leave the grant is mapped until a future call where all
+the flags would be cleared.
+
+XSA-380 introduced a refcounting that is meant to only be dropped
+when the grant is fully unmapped. Unfortunately, unmap_common() will
+decrement the refcount for every successful call.
+
+A consequence is a domain would be able to underflow the refcount
+and trigger a BUG().
+
+Looking at the code, it is not clear to me why a domain would
+want to partially clear some flags in the grant-table. But as
+this is part of the ABI, it is better to not change the behavior
+for now.
+
+Fix it by checking if the maptrack handle has been released before
+decrementing the refcounting.
+
+This is CVE-2022-23034 / XSA-394.
+
+Fixes: 9781b51efde2 ("gnttab: replace mapkind()")
+Signed-off-by: Julien Grall <jgrall@amazon.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: 975a8fb45ca186b3476e5656c6ad5dad1122dbfd
+master date: 2022-01-25 13:25:49 +0100
+---
+ xen/common/grant_table.c | 11 +++++++++--
+ 1 file changed, 9 insertions(+), 2 deletions(-)
+
+diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
+index 7b775a8c35..cbb2ce17c0 100644
+--- a/xen/common/grant_table.c
++++ b/xen/common/grant_table.c
+@@ -1438,8 +1438,15 @@ unmap_common(
+ if ( put_handle )
+ put_maptrack_handle(lgt, op->handle);
+
+- /* See the respective comment in map_grant_ref(). */
+- if ( rc == GNTST_okay && ld != rd && gnttab_need_iommu_mapping(ld) )
++ /*
++ * map_grant_ref() will only increment the refcount (and update the
++ * IOMMU) once per mapping. So we only want to decrement it once the
++ * maptrack handle has been put, alongside the further IOMMU update.
++ *
++ * For the second and third check, see the respective comment in
++ * map_grant_ref().
++ */
++ if ( put_handle && ld != rd && gnttab_need_iommu_mapping(ld) )
+ {
+ void **slot;
+ union maptrack_node node;
+--
+2.35.2
+
+
+From ce49a1d6d819f4587436b4ff73334d3676c1aab6 Mon Sep 17 00:00:00 2001
+From: Julien Grall <jgrall@amazon.com>
+Date: Tue, 25 Jan 2022 14:45:07 +0100
+Subject: [PATCH 09/32] passthrough/x86: stop pirq iteration immediately in
+ case of error
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+pt_pirq_iterate() will iterate in batch over all the PIRQs. The outer
+loop will bail out if 'rc' is non-zero but the inner loop will continue.
+
+This means 'rc' will get clobbered and we may miss any errors (such as
+-ERESTART in the case of the callback pci_clean_dpci_irq()).
+
+This is CVE-2022-23035 / XSA-395.
+
+Fixes: c24536b636f2 ("replace d->nr_pirqs sized arrays with radix tree")
+Fixes: f6dd295381f4 ("dpci: replace tasklet with softirq")
+Signed-off-by: Julien Grall <jgrall@amazon.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 9480a1a519cf016623f657dc544cb372a82b5708
+master date: 2022-01-25 13:27:02 +0100
+---
+ xen/drivers/passthrough/io.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c
+index 71eaf2c17e..b6e88ebc86 100644
+--- a/xen/drivers/passthrough/io.c
++++ b/xen/drivers/passthrough/io.c
+@@ -810,7 +810,11 @@ int pt_pirq_iterate(struct domain *d,
+
+ pirq = pirqs[i]->pirq;
+ if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
++ {
+ rc = cb(d, pirq_dpci, arg);
++ if ( rc )
++ break;
++ }
+ }
+ } while ( !rc && ++pirq < d->nr_pirqs && n == ARRAY_SIZE(pirqs) );
+
+--
+2.35.2
+
+
+From 9a8804a92fed77f77afe9fc525c6891bb60f68d3 Mon Sep 17 00:00:00 2001
+From: Bertrand Marquis <bertrand.marquis@arm.com>
+Date: Tue, 15 Feb 2022 10:37:51 +0000
+Subject: [PATCH 10/32] xen/arm: Introduce new Arm processors
+
+Add some new processor identifiers in processor.h and sync Xen
+definitions with status of Linux 5.17 (declared in
+arch/arm64/include/asm/cputype.h).
+
+This is part of XSA-398 / CVE-2022-23960.
+
+Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>
+Acked-by: Julien Grall <julien@xen.org>
+(cherry picked from commit 35d1b85a6b43483f6bd007d48757434e54743e98)
+---
+ xen/include/asm-arm/processor.h | 18 ++++++++++++++++++
+ 1 file changed, 18 insertions(+)
+
+diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
+index 87c8136022..17cc5cf486 100644
+--- a/xen/include/asm-arm/processor.h
++++ b/xen/include/asm-arm/processor.h
+@@ -53,6 +53,7 @@
+ #define ARM_CPU_PART_CORTEX_A17 0xC0E
+ #define ARM_CPU_PART_CORTEX_A15 0xC0F
+ #define ARM_CPU_PART_CORTEX_A53 0xD03
++#define ARM_CPU_PART_CORTEX_A35 0xD04
+ #define ARM_CPU_PART_CORTEX_A55 0xD05
+ #define ARM_CPU_PART_CORTEX_A57 0xD07
+ #define ARM_CPU_PART_CORTEX_A72 0xD08
+@@ -60,11 +61,20 @@
+ #define ARM_CPU_PART_CORTEX_A75 0xD0A
+ #define ARM_CPU_PART_CORTEX_A76 0xD0B
+ #define ARM_CPU_PART_NEOVERSE_N1 0xD0C
++#define ARM_CPU_PART_CORTEX_A77 0xD0D
++#define ARM_CPU_PART_NEOVERSE_V1 0xD40
++#define ARM_CPU_PART_CORTEX_A78 0xD41
++#define ARM_CPU_PART_CORTEX_X1 0xD44
++#define ARM_CPU_PART_CORTEX_A710 0xD47
++#define ARM_CPU_PART_CORTEX_X2 0xD48
++#define ARM_CPU_PART_NEOVERSE_N2 0xD49
++#define ARM_CPU_PART_CORTEX_A78C 0xD4B
+
+ #define MIDR_CORTEX_A12 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A12)
+ #define MIDR_CORTEX_A17 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A17)
+ #define MIDR_CORTEX_A15 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A15)
+ #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)
++#define MIDR_CORTEX_A35 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A35)
+ #define MIDR_CORTEX_A55 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A55)
+ #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57)
+ #define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72)
+@@ -72,6 +82,14 @@
+ #define MIDR_CORTEX_A75 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A75)
+ #define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76)
+ #define MIDR_NEOVERSE_N1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N1)
++#define MIDR_CORTEX_A77 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A77)
++#define MIDR_NEOVERSE_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V1)
++#define MIDR_CORTEX_A78 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78)
++#define MIDR_CORTEX_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1)
++#define MIDR_CORTEX_A710 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A710)
++#define MIDR_CORTEX_X2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X2)
++#define MIDR_NEOVERSE_N2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N2)
++#define MIDR_CORTEX_A78C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78C)
+
+ /* MPIDR Multiprocessor Affinity Register */
+ #define _MPIDR_UP (30)
+--
+2.35.2
+
+
+From 03db21387b8653d663e8da89c964d611ba509130 Mon Sep 17 00:00:00 2001
+From: Bertrand Marquis <bertrand.marquis@arm.com>
+Date: Tue, 15 Feb 2022 10:39:47 +0000
+Subject: [PATCH 11/32] xen/arm: move errata CSV2 check earlier
+
+CSV2 availability check is done after printing to the user that
+workaround 1 will be used. Move the check before to prevent saying to the
+user that workaround 1 is used when it is not because it is not needed.
+This will also allow to reuse install_bp_hardening_vec function for
+other use cases.
+
+Code previously returning "true", now returns "0" to conform to
+enable_smccc_arch_workaround_1 returning an int and surrounding code
+doing a "return 0" if workaround is not needed.
+
+This is part of XSA-398 / CVE-2022-23960.
+
+Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>
+Reviewed-by: Julien Grall <julien@xen.org>
+(cherry picked from commit 599616d70eb886b9ad0ef9d6b51693ce790504ba)
+---
+ xen/arch/arm/cpuerrata.c | 14 +++++++-------
+ 1 file changed, 7 insertions(+), 7 deletions(-)
+
+diff --git a/xen/arch/arm/cpuerrata.c b/xen/arch/arm/cpuerrata.c
+index f94bcf74cc..79620889b4 100644
+--- a/xen/arch/arm/cpuerrata.c
++++ b/xen/arch/arm/cpuerrata.c
+@@ -102,13 +102,6 @@ install_bp_hardening_vec(const struct arm_cpu_capabilities *entry,
+ printk(XENLOG_INFO "CPU%u will %s on exception entry\n",
+ smp_processor_id(), desc);
+
+- /*
+- * No need to install hardened vector when the processor has
+- * ID_AA64PRF0_EL1.CSV2 set.
+- */
+- if ( cpu_data[smp_processor_id()].pfr64.csv2 )
+- return true;
+-
+ spin_lock(&bp_lock);
+
+ /*
+@@ -167,6 +160,13 @@ static int enable_smccc_arch_workaround_1(void *data)
+ if ( !entry->matches(entry) )
+ return 0;
+
++ /*
++ * No need to install hardened vector when the processor has
++ * ID_AA64PRF0_EL1.CSV2 set.
++ */
++ if ( cpu_data[smp_processor_id()].pfr64.csv2 )
++ return 0;
++
+ if ( smccc_ver < SMCCC_VERSION(1, 1) )
+ goto warn;
+
+--
+2.35.2
+
+
+From d99df7d50d366c7a8dc71f5bdc3454f469b00a00 Mon Sep 17 00:00:00 2001
+From: Bertrand Marquis <bertrand.marquis@arm.com>
+Date: Wed, 23 Feb 2022 09:42:18 +0000
+Subject: [PATCH 12/32] xen/arm: Add ECBHB and CLEARBHB ID fields
+
+Introduce ID coprocessor register ID_AA64ISAR2_EL1.
+Add definitions in cpufeature and sysregs of ECBHB field in mmfr1 and
+CLEARBHB in isar2 ID coprocessor registers.
+
+This is part of XSA-398 / CVE-2022-23960.
+
+Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>
+Acked-by: Julien Grall <julien@xen.org>
+(cherry picked from commit 4b68d12d98b8790d8002fcc2c25a9d713374a4d7)
+---
+ xen/arch/arm/cpufeature.c | 1 +
+ xen/include/asm-arm/arm64/sysregs.h | 4 ++++
+ xen/include/asm-arm/cpufeature.h | 20 +++++++++++++++++---
+ 3 files changed, 22 insertions(+), 3 deletions(-)
+
+diff --git a/xen/arch/arm/cpufeature.c b/xen/arch/arm/cpufeature.c
+index 44126dbf07..13dac7ccaf 100644
+--- a/xen/arch/arm/cpufeature.c
++++ b/xen/arch/arm/cpufeature.c
+@@ -117,6 +117,7 @@ void identify_cpu(struct cpuinfo_arm *c)
+
+ c->isa64.bits[0] = READ_SYSREG64(ID_AA64ISAR0_EL1);
+ c->isa64.bits[1] = READ_SYSREG64(ID_AA64ISAR1_EL1);
++ c->isa64.bits[2] = READ_SYSREG64(ID_AA64ISAR2_EL1);
+ #endif
+
+ c->pfr32.bits[0] = READ_SYSREG32(ID_PFR0_EL1);
+diff --git a/xen/include/asm-arm/arm64/sysregs.h b/xen/include/asm-arm/arm64/sysregs.h
+index c60029d38f..cfd2e1d486 100644
+--- a/xen/include/asm-arm/arm64/sysregs.h
++++ b/xen/include/asm-arm/arm64/sysregs.h
+@@ -57,6 +57,10 @@
+ #define ICH_AP1R2_EL2 __AP1Rx_EL2(2)
+ #define ICH_AP1R3_EL2 __AP1Rx_EL2(3)
+
++#ifndef ID_AA64ISAR2_EL1
++#define ID_AA64ISAR2_EL1 S3_0_C0_C6_2
++#endif
++
+ /* Access to system registers */
+
+ #define READ_SYSREG32(name) ((uint32_t)READ_SYSREG64(name))
+diff --git a/xen/include/asm-arm/cpufeature.h b/xen/include/asm-arm/cpufeature.h
+index 29753fee78..8519d2987b 100644
+--- a/xen/include/asm-arm/cpufeature.h
++++ b/xen/include/asm-arm/cpufeature.h
+@@ -183,12 +183,26 @@ struct cpuinfo_arm {
+ unsigned long lo:4;
+ unsigned long pan:4;
+ unsigned long __res1:8;
+- unsigned long __res2:32;
++ unsigned long __res2:28;
++ unsigned long ecbhb:4;
+ };
+ } mm64;
+
+- struct {
+- uint64_t bits[2];
++ union {
++ uint64_t bits[3];
++ struct {
++ /* ISAR0 */
++ unsigned long __res0:64;
++
++ /* ISAR1 */
++ unsigned long __res1:64;
++
++ /* ISAR2 */
++ unsigned long __res3:28;
++ unsigned long clearbhb:4;
++
++ unsigned long __res4:32;
++ };
+ } isa64;
+
+ #endif
+--
+2.35.2
+
+
+From 47125f5fb2073abb9d5d3f65824cd066e7ec62f1 Mon Sep 17 00:00:00 2001
+From: Rahul Singh <rahul.singh@arm.com>
+Date: Mon, 14 Feb 2022 18:47:32 +0000
+Subject: [PATCH 13/32] xen/arm: Add Spectre BHB handling
+
+This commit is adding Spectre BHB handling to Xen on Arm.
+The commit is introducing new alternative code to be executed during
+exception entry:
+- SMCC workaround 3 call
+- loop workaround (with 8, 24 or 32 iterations)
+- use of new clearbhb instruction
+
+Cpuerrata is modified by this patch to apply the required workaround for
+CPU affected by Spectre BHB when CONFIG_ARM64_HARDEN_BRANCH_PREDICTOR is
+enabled.
+
+To do this the system previously used to apply smcc workaround 1 is
+reused and new alternative code to be copied in the exception handler is
+introduced.
+
+To define the type of workaround required by a processor, 4 new cpu
+capabilities are introduced (for each number of loop and for smcc
+workaround 3).
+
+When a processor is affected, enable_spectre_bhb_workaround is called
+and if the processor does not have CSV2 set to 3 or ECBHB feature (which
+would mean that the processor is doing what is required in hardware),
+the proper code is enabled at exception entry.
+
+In the case where workaround 3 is not supported by the firmware, we
+enable workaround 1 when possible as it will also mitigate Spectre BHB
+on systems without CSV2.
+
+This is part of XSA-398 / CVE-2022-23960.
+
+Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>
+Signed-off-by: Rahul Singh <rahul.singh@arm.com>
+Acked-by: Julien Grall <julien@xen.org>
+(cherry picked from commit 62c91eb66a2904eefb1d1d9642e3697a1e3c3a3c)
+---
+ xen/arch/arm/arm64/bpi.S | 32 +++++-
+ xen/arch/arm/cpuerrata.c | 170 +++++++++++++++++++++++++++--
+ xen/include/asm-arm/arm64/macros.h | 5 +
+ xen/include/asm-arm/cpufeature.h | 6 +-
+ xen/include/asm-arm/smccc.h | 6 +
+ 5 files changed, 207 insertions(+), 12 deletions(-)
+
+diff --git a/xen/arch/arm/arm64/bpi.S b/xen/arch/arm/arm64/bpi.S
+index d8743d955c..4e63825220 100644
+--- a/xen/arch/arm/arm64/bpi.S
++++ b/xen/arch/arm/arm64/bpi.S
+@@ -58,16 +58,42 @@ ENTRY(__bp_harden_hyp_vecs_start)
+ .endr
+ ENTRY(__bp_harden_hyp_vecs_end)
+
+-ENTRY(__smccc_workaround_1_smc_start)
++.macro mitigate_spectre_bhb_loop count
++ENTRY(__mitigate_spectre_bhb_loop_start_\count)
++ stp x0, x1, [sp, #-16]!
++ mov x0, \count
++.Lspectre_bhb_loop\@:
++ b . + 4
++ subs x0, x0, #1
++ b.ne .Lspectre_bhb_loop\@
++ sb
++ ldp x0, x1, [sp], #16
++ENTRY(__mitigate_spectre_bhb_loop_end_\count)
++.endm
++
++.macro smccc_workaround num smcc_id
++ENTRY(__smccc_workaround_smc_start_\num)
+ sub sp, sp, #(8 * 4)
+ stp x0, x1, [sp, #(8 * 2)]
+ stp x2, x3, [sp, #(8 * 0)]
+- mov w0, #ARM_SMCCC_ARCH_WORKAROUND_1_FID
++ mov w0, \smcc_id
+ smc #0
+ ldp x2, x3, [sp, #(8 * 0)]
+ ldp x0, x1, [sp, #(8 * 2)]
+ add sp, sp, #(8 * 4)
+-ENTRY(__smccc_workaround_1_smc_end)
++ENTRY(__smccc_workaround_smc_end_\num)
++.endm
++
++ENTRY(__mitigate_spectre_bhb_clear_insn_start)
++ clearbhb
++ isb
++ENTRY(__mitigate_spectre_bhb_clear_insn_end)
++
++mitigate_spectre_bhb_loop 8
++mitigate_spectre_bhb_loop 24
++mitigate_spectre_bhb_loop 32
++smccc_workaround 1, #ARM_SMCCC_ARCH_WORKAROUND_1_FID
++smccc_workaround 3, #ARM_SMCCC_ARCH_WORKAROUND_3_FID
+
+ /*
+ * Local variables:
+diff --git a/xen/arch/arm/cpuerrata.c b/xen/arch/arm/cpuerrata.c
+index 79620889b4..8d9e977b77 100644
+--- a/xen/arch/arm/cpuerrata.c
++++ b/xen/arch/arm/cpuerrata.c
+@@ -144,7 +144,16 @@ install_bp_hardening_vec(const struct arm_cpu_capabilities *entry,
+ return ret;
+ }
+
+-extern char __smccc_workaround_1_smc_start[], __smccc_workaround_1_smc_end[];
++extern char __smccc_workaround_smc_start_1[], __smccc_workaround_smc_end_1[];
++extern char __smccc_workaround_smc_start_3[], __smccc_workaround_smc_end_3[];
++extern char __mitigate_spectre_bhb_clear_insn_start[],
++ __mitigate_spectre_bhb_clear_insn_end[];
++extern char __mitigate_spectre_bhb_loop_start_8[],
++ __mitigate_spectre_bhb_loop_end_8[];
++extern char __mitigate_spectre_bhb_loop_start_24[],
++ __mitigate_spectre_bhb_loop_end_24[];
++extern char __mitigate_spectre_bhb_loop_start_32[],
++ __mitigate_spectre_bhb_loop_end_32[];
+
+ static int enable_smccc_arch_workaround_1(void *data)
+ {
+@@ -176,8 +185,8 @@ static int enable_smccc_arch_workaround_1(void *data)
+ if ( (int)res.a0 < 0 )
+ goto warn;
+
+- return !install_bp_hardening_vec(entry,__smccc_workaround_1_smc_start,
+- __smccc_workaround_1_smc_end,
++ return !install_bp_hardening_vec(entry,__smccc_workaround_smc_start_1,
++ __smccc_workaround_smc_end_1,
+ "call ARM_SMCCC_ARCH_WORKAROUND_1");
+
+ warn:
+@@ -192,6 +201,93 @@ warn:
+ return 0;
+ }
+
++/*
++ * Spectre BHB Mitigation
++ *
++ * CPU is either:
++ * - Having CVS2.3 so it is not affected.
++ * - Having ECBHB and is clearing the branch history buffer when an exception
++ * to a different exception level is happening so no mitigation is needed.
++ * - Mitigating using a loop on exception entry (number of loop depending on
++ * the CPU).
++ * - Mitigating using the firmware.
++ */
++static int enable_spectre_bhb_workaround(void *data)
++{
++ const struct arm_cpu_capabilities *entry = data;
++
++ /*
++ * Enable callbacks are called on every CPU based on the capabilities, so
++ * double-check whether the CPU matches the entry.
++ */
++ if ( !entry->matches(entry) )
++ return 0;
++
++ if ( cpu_data[smp_processor_id()].pfr64.csv2 == 3 )
++ return 0;
++
++ if ( cpu_data[smp_processor_id()].mm64.ecbhb )
++ return 0;
++
++ if ( cpu_data[smp_processor_id()].isa64.clearbhb )
++ return !install_bp_hardening_vec(entry,
++ __mitigate_spectre_bhb_clear_insn_start,
++ __mitigate_spectre_bhb_clear_insn_end,
++ "use clearBHB instruction");
++
++ /* Apply solution depending on hwcaps set on arm_errata */
++ if ( cpus_have_cap(ARM_WORKAROUND_BHB_LOOP_8) )
++ return !install_bp_hardening_vec(entry,
++ __mitigate_spectre_bhb_loop_start_8,
++ __mitigate_spectre_bhb_loop_end_8,
++ "use 8 loops workaround");
++
++ if ( cpus_have_cap(ARM_WORKAROUND_BHB_LOOP_24) )
++ return !install_bp_hardening_vec(entry,
++ __mitigate_spectre_bhb_loop_start_24,
++ __mitigate_spectre_bhb_loop_end_24,
++ "use 24 loops workaround");
++
++ if ( cpus_have_cap(ARM_WORKAROUND_BHB_LOOP_32) )
++ return !install_bp_hardening_vec(entry,
++ __mitigate_spectre_bhb_loop_start_32,
++ __mitigate_spectre_bhb_loop_end_32,
++ "use 32 loops workaround");
++
++ if ( cpus_have_cap(ARM_WORKAROUND_BHB_SMCC_3) )
++ {
++ struct arm_smccc_res res;
++
++ if ( smccc_ver < SMCCC_VERSION(1, 1) )
++ goto warn;
++
++ arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FID,
++ ARM_SMCCC_ARCH_WORKAROUND_3_FID, &res);
++ /* The return value is in the lower 32-bits. */
++ if ( (int)res.a0 < 0 )
++ {
++ /*
++ * On processor affected with CSV2=0, workaround 1 will mitigate
++ * both Spectre v2 and BHB so use it when available
++ */
++ if ( enable_smccc_arch_workaround_1(data) )
++ return 1;
++
++ goto warn;
++ }
++
++ return !install_bp_hardening_vec(entry,__smccc_workaround_smc_start_3,
++ __smccc_workaround_smc_end_3,
++ "call ARM_SMCCC_ARCH_WORKAROUND_3");
++ }
++
++warn:
++ printk_once("**** No support for any spectre BHB workaround. ****\n"
++ "**** Please update your firmware. ****\n");
++
++ return 0;
++}
++
+ #endif /* CONFIG_ARM64_HARDEN_BRANCH_PREDICTOR */
+
+ /* Hardening Branch predictor code for Arm32 */
+@@ -437,19 +533,77 @@ static const struct arm_cpu_capabilities arm_errata[] = {
+ },
+ {
+ .capability = ARM_HARDEN_BRANCH_PREDICTOR,
+- MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
++ MIDR_RANGE(MIDR_CORTEX_A72, 0, 1 << MIDR_VARIANT_SHIFT),
+ .enable = enable_smccc_arch_workaround_1,
+ },
+ {
+- .capability = ARM_HARDEN_BRANCH_PREDICTOR,
++ .capability = ARM_WORKAROUND_BHB_SMCC_3,
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A73),
+- .enable = enable_smccc_arch_workaround_1,
++ .enable = enable_spectre_bhb_workaround,
+ },
+ {
+- .capability = ARM_HARDEN_BRANCH_PREDICTOR,
++ .capability = ARM_WORKAROUND_BHB_SMCC_3,
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A75),
+- .enable = enable_smccc_arch_workaround_1,
++ .enable = enable_spectre_bhb_workaround,
++ },
++ /* spectre BHB */
++ {
++ .capability = ARM_WORKAROUND_BHB_LOOP_8,
++ MIDR_RANGE(MIDR_CORTEX_A72, 1 << MIDR_VARIANT_SHIFT,
++ (MIDR_VARIANT_MASK | MIDR_REVISION_MASK)),
++ .enable = enable_spectre_bhb_workaround,
++ },
++ {
++ .capability = ARM_WORKAROUND_BHB_LOOP_24,
++ MIDR_ALL_VERSIONS(MIDR_CORTEX_A76),
++ .enable = enable_spectre_bhb_workaround,
++ },
++ {
++ .capability = ARM_WORKAROUND_BHB_LOOP_24,
++ MIDR_ALL_VERSIONS(MIDR_CORTEX_A77),
++ .enable = enable_spectre_bhb_workaround,
++ },
++ {
++ .capability = ARM_WORKAROUND_BHB_LOOP_32,
++ MIDR_ALL_VERSIONS(MIDR_CORTEX_A78),
++ .enable = enable_spectre_bhb_workaround,
++ },
++ {
++ .capability = ARM_WORKAROUND_BHB_LOOP_32,
++ MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C),
++ .enable = enable_spectre_bhb_workaround,
++ },
++ {
++ .capability = ARM_WORKAROUND_BHB_LOOP_32,
++ MIDR_ALL_VERSIONS(MIDR_CORTEX_X1),
++ .enable = enable_spectre_bhb_workaround,
++ },
++ {
++ .capability = ARM_WORKAROUND_BHB_LOOP_32,
++ MIDR_ALL_VERSIONS(MIDR_CORTEX_X2),
++ .enable = enable_spectre_bhb_workaround,
++ },
++ {
++ .capability = ARM_WORKAROUND_BHB_LOOP_32,
++ MIDR_ALL_VERSIONS(MIDR_CORTEX_A710),
++ .enable = enable_spectre_bhb_workaround,
+ },
++ {
++ .capability = ARM_WORKAROUND_BHB_LOOP_24,
++ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
++ .enable = enable_spectre_bhb_workaround,
++ },
++ {
++ .capability = ARM_WORKAROUND_BHB_LOOP_32,
++ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
++ .enable = enable_spectre_bhb_workaround,
++ },
++ {
++ .capability = ARM_WORKAROUND_BHB_LOOP_32,
++ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
++ .enable = enable_spectre_bhb_workaround,
++ },
++
+ #endif
+ #ifdef CONFIG_ARM32_HARDEN_BRANCH_PREDICTOR
+ {
+diff --git a/xen/include/asm-arm/arm64/macros.h b/xen/include/asm-arm/arm64/macros.h
+index f981b4f43e..5100aed6e3 100644
+--- a/xen/include/asm-arm/arm64/macros.h
++++ b/xen/include/asm-arm/arm64/macros.h
+@@ -21,6 +21,11 @@
+ ldr \dst, [\dst, \tmp]
+ .endm
+
++ /* clearbhb instruction clearing the branch history */
++ .macro clearbhb
++ hint #22
++ .endm
++
+ /*
+ * Register aliases.
+ */
+diff --git a/xen/include/asm-arm/cpufeature.h b/xen/include/asm-arm/cpufeature.h
+index 8519d2987b..a1fa3bc1cf 100644
+--- a/xen/include/asm-arm/cpufeature.h
++++ b/xen/include/asm-arm/cpufeature.h
+@@ -46,8 +46,12 @@
+ #define ARM_SMCCC_1_1 8
+ #define ARM64_WORKAROUND_AT_SPECULATE 9
+ #define ARM_WORKAROUND_858921 10
++#define ARM_WORKAROUND_BHB_LOOP_8 11
++#define ARM_WORKAROUND_BHB_LOOP_24 12
++#define ARM_WORKAROUND_BHB_LOOP_32 13
++#define ARM_WORKAROUND_BHB_SMCC_3 14
+
+-#define ARM_NCAPS 11
++#define ARM_NCAPS 15
+
+ #ifndef __ASSEMBLY__
+
+diff --git a/xen/include/asm-arm/smccc.h b/xen/include/asm-arm/smccc.h
+index 126399dd70..2abbffc3bd 100644
+--- a/xen/include/asm-arm/smccc.h
++++ b/xen/include/asm-arm/smccc.h
+@@ -334,6 +334,12 @@ void __arm_smccc_1_0_smc(register_t a0, register_t a1, register_t a2,
+ ARM_SMCCC_OWNER_ARCH, \
+ 0x7FFF)
+
++#define ARM_SMCCC_ARCH_WORKAROUND_3_FID \
++ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
++ ARM_SMCCC_CONV_32, \
++ ARM_SMCCC_OWNER_ARCH, \
++ 0x3FFF)
++
+ /* SMCCC error codes */
+ #define ARM_SMCCC_NOT_REQUIRED (-2)
+ #define ARM_SMCCC_ERR_UNKNOWN_FUNCTION (-1)
+--
+2.35.2
+
+
+From fbabb62dd9e57180400f145a8756624c82de888f Mon Sep 17 00:00:00 2001
+From: Bertrand Marquis <bertrand.marquis@arm.com>
+Date: Thu, 17 Feb 2022 14:52:54 +0000
+Subject: [PATCH 14/32] xen/arm: Allow to discover and use
+ SMCCC_ARCH_WORKAROUND_3
+
+Allow guest to discover whether or not SMCCC_ARCH_WORKAROUND_3 is
+supported and create a fastpath in the code to handle guests request to
+do the workaround.
+
+The function SMCCC_ARCH_WORKAROUND_3 will be called by the guest for
+flushing the branch history. So we want the handling to be as fast as
+possible.
+
+As the mitigation is applied on every guest exit, we can check for the
+call before saving all context and return very early.
+
+This is part of XSA-398 / CVE-2022-23960.
+
+Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>
+Reviewed-by: Julien Grall <julien@xen.org>
+(cherry picked from commit c0a56ea0fd92ecb471936b7355ddbecbaea3707c)
+---
+ xen/arch/arm/arm64/entry.S | 21 ++++++++++++++-------
+ xen/arch/arm/vsmc.c | 5 +++++
+ 2 files changed, 19 insertions(+), 7 deletions(-)
+
+diff --git a/xen/arch/arm/arm64/entry.S b/xen/arch/arm/arm64/entry.S
+index 175ea2981e..a8c2145067 100644
+--- a/xen/arch/arm/arm64/entry.S
++++ b/xen/arch/arm/arm64/entry.S
+@@ -338,16 +338,26 @@ guest_sync:
+ cbnz x1, guest_sync_slowpath /* should be 0 for HVC #0 */
+
+ /*
+- * Fastest path possible for ARM_SMCCC_ARCH_WORKAROUND_1.
+- * The workaround has already been applied on the exception
++ * Fastest path possible for ARM_SMCCC_ARCH_WORKAROUND_1 and
++ * ARM_SMCCC_ARCH_WORKAROUND_3.
++ * The workaround needed has already been applied on the exception
+ * entry from the guest, so let's quickly get back to the guest.
+ *
+ * Note that eor is used because the function identifier cannot
+ * be encoded as an immediate for cmp.
+ */
+ eor w0, w0, #ARM_SMCCC_ARCH_WORKAROUND_1_FID
+- cbnz w0, check_wa2
++ cbz w0, fastpath_out_workaround
+
++ /* ARM_SMCCC_ARCH_WORKAROUND_2 handling */
++ eor w0, w0, #(ARM_SMCCC_ARCH_WORKAROUND_1_FID ^ ARM_SMCCC_ARCH_WORKAROUND_2_FID)
++ cbz w0, wa2_ssbd
++
++ /* Fastpath out for ARM_SMCCC_ARCH_WORKAROUND_3 */
++ eor w0, w0, #(ARM_SMCCC_ARCH_WORKAROUND_2_FID ^ ARM_SMCCC_ARCH_WORKAROUND_3_FID)
++ cbnz w0, guest_sync_slowpath
++
++fastpath_out_workaround:
+ /*
+ * Clobber both x0 and x1 to prevent leakage. Note that thanks
+ * the eor, x0 = 0.
+@@ -356,10 +366,7 @@ guest_sync:
+ eret
+ sb
+
+-check_wa2:
+- /* ARM_SMCCC_ARCH_WORKAROUND_2 handling */
+- eor w0, w0, #(ARM_SMCCC_ARCH_WORKAROUND_1_FID ^ ARM_SMCCC_ARCH_WORKAROUND_2_FID)
+- cbnz w0, guest_sync_slowpath
++wa2_ssbd:
+ #ifdef CONFIG_ARM_SSBD
+ alternative_cb arm_enable_wa2_handling
+ b wa2_end
+diff --git a/xen/arch/arm/vsmc.c b/xen/arch/arm/vsmc.c
+index a36db15fff..b633ff2fe8 100644
+--- a/xen/arch/arm/vsmc.c
++++ b/xen/arch/arm/vsmc.c
+@@ -124,6 +124,10 @@ static bool handle_arch(struct cpu_user_regs *regs)
+ break;
+ }
+ break;
++ case ARM_SMCCC_ARCH_WORKAROUND_3_FID:
++ if ( cpus_have_cap(ARM_WORKAROUND_BHB_SMCC_3) )
++ ret = 0;
++ break;
+ }
+
+ set_user_reg(regs, 0, ret);
+@@ -132,6 +136,7 @@ static bool handle_arch(struct cpu_user_regs *regs)
+ }
+
+ case ARM_SMCCC_ARCH_WORKAROUND_1_FID:
++ case ARM_SMCCC_ARCH_WORKAROUND_3_FID:
+ /* No return value */
+ return true;
+
+--
+2.35.2
+
+
+From 7b9814b250a5a28277bd0866d341a5cfc0f4c1ac Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 7 Mar 2022 16:35:52 +0000
+Subject: [PATCH 15/32] x86/spec-ctrl: Cease using thunk=lfence on AMD
+
+AMD have updated their Spectre v2 guidance, and lfence/jmp is no longer
+considered safe. AMD are recommending using retpoline everywhere.
+
+Update the default heuristics to never select THUNK_LFENCE.
+
+This is part of XSA-398 / CVE-2021-26401.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit 8d03080d2a339840d3a59e0932a94f804e45110d)
+---
+ docs/misc/xen-command-line.pandoc | 6 +++---
+ xen/arch/x86/spec_ctrl.c | 10 ++--------
+ 2 files changed, 5 insertions(+), 11 deletions(-)
+
+diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
+index cf9dea62db..eead69ada2 100644
+--- a/docs/misc/xen-command-line.pandoc
++++ b/docs/misc/xen-command-line.pandoc
+@@ -2077,9 +2077,9 @@ to use.
+
+ If Xen was compiled with INDIRECT_THUNK support, `bti-thunk=` can be used to
+ select which of the thunks gets patched into the `__x86_indirect_thunk_%reg`
+-locations. The default thunk is `retpoline` (generally preferred for Intel
+-hardware), with the alternatives being `jmp` (a `jmp *%reg` gadget, minimal
+-overhead), and `lfence` (an `lfence; jmp *%reg` gadget, preferred for AMD).
++locations. The default thunk is `retpoline` (generally preferred), with the
++alternatives being `jmp` (a `jmp *%reg` gadget, minimal overhead), and
++`lfence` (an `lfence; jmp *%reg` gadget).
+
+ On hardware supporting IBRS (Indirect Branch Restricted Speculation), the
+ `ibrs=` option can be used to force or prevent Xen using the feature itself.
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 1cfd02d7d7..7447d4a8e5 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -908,16 +908,10 @@ void __init init_speculation_mitigations(void)
+ if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) )
+ {
+ /*
+- * AMD's recommended mitigation is to set lfence as being dispatch
+- * serialising, and to use IND_THUNK_LFENCE.
+- */
+- if ( cpu_has_lfence_dispatch )
+- thunk = THUNK_LFENCE;
+- /*
+- * On Intel hardware, we'd like to use retpoline in preference to
++ * On all hardware, we'd like to use retpoline in preference to
+ * IBRS, but only if it is safe on this hardware.
+ */
+- else if ( retpoline_safe(caps) )
++ if ( retpoline_safe(caps) )
+ thunk = THUNK_RETPOLINE;
+ else if ( boot_cpu_has(X86_FEATURE_IBRSB) )
+ ibrs = true;
+--
+2.35.2
+
+
+From 8ed46cc1ef14fb8463cc847d82cbd2491054547a Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 5 Apr 2022 15:19:40 +0200
+Subject: [PATCH 16/32] VT-d: split domid map cleanup check into a function
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This logic will want invoking from elsewhere.
+
+No functional change intended.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+master commit: 9fdc10abe9457e4c9879a266f82372cb08e88ffb
+master date: 2021-11-24 11:06:20 +0100
+---
+ xen/drivers/passthrough/vtd/iommu.c | 70 +++++++++++++++++++----------
+ 1 file changed, 47 insertions(+), 23 deletions(-)
+
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index af8b9ca0e4..234a4fbae5 100644
+--- a/xen/drivers/passthrough/vtd/iommu.c
++++ b/xen/drivers/passthrough/vtd/iommu.c
+@@ -147,6 +147,51 @@ static void cleanup_domid_map(struct domain *domain, struct vtd_iommu *iommu)
+ }
+ }
+
++static bool any_pdev_behind_iommu(const struct domain *d,
++ const struct pci_dev *exclude,
++ const struct vtd_iommu *iommu)
++{
++ const struct pci_dev *pdev;
++
++ for_each_pdev ( d, pdev )
++ {
++ const struct acpi_drhd_unit *drhd;
++
++ if ( pdev == exclude )
++ continue;
++
++ drhd = acpi_find_matched_drhd_unit(pdev);
++ if ( drhd && drhd->iommu == iommu )
++ return true;
++ }
++
++ return false;
++}
++
++/*
++ * If no other devices under the same iommu owned by this domain,
++ * clear iommu in iommu_bitmap and clear domain_id in domid_bitmap.
++ */
++static void check_cleanup_domid_map(struct domain *d,
++ const struct pci_dev *exclude,
++ struct vtd_iommu *iommu)
++{
++ bool found = any_pdev_behind_iommu(d, exclude, iommu);
++
++ /*
++ * Hidden devices are associated with DomXEN but usable by the hardware
++ * domain. Hence they need considering here as well.
++ */
++ if ( !found && is_hardware_domain(d) )
++ found = any_pdev_behind_iommu(dom_xen, exclude, iommu);
++
++ if ( !found )
++ {
++ clear_bit(iommu->index, &dom_iommu(d)->arch.iommu_bitmap);
++ cleanup_domid_map(d, iommu);
++ }
++}
++
+ static int iommus_incoherent;
+
+ static void sync_cache(const void *addr, unsigned int size)
+@@ -1679,7 +1724,6 @@ static int domain_context_unmap(struct domain *domain, u8 devfn,
+ struct vtd_iommu *iommu;
+ int ret = 0;
+ u8 seg = pdev->seg, bus = pdev->bus, tmp_bus, tmp_devfn, secbus;
+- int found = 0;
+
+ drhd = acpi_find_matched_drhd_unit(pdev);
+ if ( !drhd )
+@@ -1763,28 +1807,8 @@ static int domain_context_unmap(struct domain *domain, u8 devfn,
+ if ( ret )
+ goto out;
+
+- /*
+- * if no other devices under the same iommu owned by this domain,
+- * clear iommu in iommu_bitmap and clear domain_id in domid_bitmp
+- */
+- for_each_pdev ( domain, pdev )
+- {
+- if ( pdev->seg == seg && pdev->bus == bus && pdev->devfn == devfn )
+- continue;
+-
+- drhd = acpi_find_matched_drhd_unit(pdev);
+- if ( drhd && drhd->iommu == iommu )
+- {
+- found = 1;
+- break;
+- }
+- }
+-
+- if ( found == 0 )
+- {
+- clear_bit(iommu->index, &dom_iommu(domain)->arch.iommu_bitmap);
+- cleanup_domid_map(domain, iommu);
+- }
++ if ( !ret )
++ check_cleanup_domid_map(domain, pdev, iommu);
+
+ out:
+ return ret;
+--
+2.35.2
+
+
+From 2ce2aec8c148a0a291eae2a0631802e0ffb42133 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 5 Apr 2022 15:20:10 +0200
+Subject: [PATCH 17/32] x86/hap: do not switch on log dirty for VRAM tracking
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+XEN_DMOP_track_dirty_vram possibly calls into paging_log_dirty_enable
+when using HAP mode, and it can interact badly with other ongoing
+paging domctls, as XEN_DMOP_track_dirty_vram is not holding the domctl
+lock.
+
+This was detected as a result of the following assert triggering when
+doing repeated migrations of a HAP HVM domain with a stubdom:
+
+Assertion 'd->arch.paging.log_dirty.allocs == 0' failed at paging.c:198
+----[ Xen-4.17-unstable x86_64 debug=y Not tainted ]----
+CPU: 34
+RIP: e008:[<ffff82d040314b3b>] arch/x86/mm/paging.c#paging_free_log_dirty_bitmap+0x606/0x6
+RFLAGS: 0000000000010206 CONTEXT: hypervisor (d0v23)
+[...]
+Xen call trace:
+ [<ffff82d040314b3b>] R arch/x86/mm/paging.c#paging_free_log_dirty_bitmap+0x606/0x63a
+ [<ffff82d040279f96>] S xsm/flask/hooks.c#domain_has_perm+0x5a/0x67
+ [<ffff82d04031577f>] F paging_domctl+0x251/0xd41
+ [<ffff82d04031640c>] F paging_domctl_continuation+0x19d/0x202
+ [<ffff82d0403202fa>] F pv_hypercall+0x150/0x2a7
+ [<ffff82d0403a729d>] F lstar_enter+0x12d/0x140
+
+Such an assert triggered because the stubdom used
+XEN_DMOP_track_dirty_vram while dom0 was in the middle of executing
+XEN_DOMCTL_SHADOW_OP_OFF, and so log dirty became enabled while
+retiring the old structures, thus leading to new entries being
+populated in already clear slots.
+
+Fix this by not enabling log dirty for VRAM tracking, similar to what
+is done when using shadow instead of HAP. Call
+p2m_enable_hardware_log_dirty when enabling VRAM tracking in order to
+get some hardware assistance if available. As a side effect the memory
+pressure on the p2m pool should go down if only VRAM tracking is
+enabled, as the dirty bitmap is no longer allocated.
+
+Note that paging_log_dirty_range (used to get the dirty bitmap for
+VRAM tracking) doesn't use the log dirty bitmap, and instead relies on
+checking whether each gfn on the range has been switched from
+p2m_ram_logdirty to p2m_ram_rw in order to account for dirty pages.
+
+This is CVE-2022-26356 / XSA-397.
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: 4f4db53784d912c4f409a451c36ebfd4754e0a42
+master date: 2022-04-05 14:11:30 +0200
+---
+ xen/arch/x86/mm/hap/hap.c | 11 ++++-------
+ xen/arch/x86/mm/paging.c | 2 +-
+ xen/include/asm-x86/paging.h | 3 ---
+ 3 files changed, 5 insertions(+), 11 deletions(-)
+
+diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
+index 3d93f3451c..9aac006d65 100644
+--- a/xen/arch/x86/mm/hap/hap.c
++++ b/xen/arch/x86/mm/hap/hap.c
+@@ -69,13 +69,6 @@ int hap_track_dirty_vram(struct domain *d,
+ {
+ int size = (nr + BITS_PER_BYTE - 1) / BITS_PER_BYTE;
+
+- if ( !paging_mode_log_dirty(d) )
+- {
+- rc = paging_log_dirty_enable(d, false);
+- if ( rc )
+- goto out;
+- }
+-
+ rc = -ENOMEM;
+ dirty_bitmap = vzalloc(size);
+ if ( !dirty_bitmap )
+@@ -107,6 +100,10 @@ int hap_track_dirty_vram(struct domain *d,
+
+ paging_unlock(d);
+
++ domain_pause(d);
++ p2m_enable_hardware_log_dirty(d);
++ domain_unpause(d);
++
+ if ( oend > ostart )
+ p2m_change_type_range(d, ostart, oend,
+ p2m_ram_logdirty, p2m_ram_rw);
+diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c
+index 469bb76429..8596e52458 100644
+--- a/xen/arch/x86/mm/paging.c
++++ b/xen/arch/x86/mm/paging.c
+@@ -209,7 +209,7 @@ static int paging_free_log_dirty_bitmap(struct domain *d, int rc)
+ return rc;
+ }
+
+-int paging_log_dirty_enable(struct domain *d, bool log_global)
++static int paging_log_dirty_enable(struct domain *d, bool log_global)
+ {
+ int ret;
+
+diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h
+index 7544f73121..a16929eaa7 100644
+--- a/xen/include/asm-x86/paging.h
++++ b/xen/include/asm-x86/paging.h
+@@ -156,9 +156,6 @@ void paging_log_dirty_range(struct domain *d,
+ unsigned long nr,
+ uint8_t *dirty_bitmap);
+
+-/* enable log dirty */
+-int paging_log_dirty_enable(struct domain *d, bool log_global);
+-
+ /* log dirty initialization */
+ void paging_log_dirty_init(struct domain *d, const struct log_dirty_ops *ops);
+
+--
+2.35.2
+
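+To make the note above about paging_log_dirty_range() concrete, here is a
+small self-contained toy model (plain C, not Xen code; VRAM_PAGES,
+guest_write() and harvest_dirty() are invented names) of type-transition
+based dirty accounting: a guest write faults a page from "logdirty" back
+to "rw", and harvesting the bitmap just reads the types and re-arms
+tracking, with no global log-dirty bitmap involved:
+
+    /* Toy model of dirty-VRAM accounting via p2m type transitions. */
+    #include <stdint.h>
+    #include <stdio.h>
+
+    enum p2m_type { p2m_ram_logdirty, p2m_ram_rw };
+
+    #define VRAM_PAGES 8
+    static enum p2m_type p2m[VRAM_PAGES];       /* stands in for the real p2m */
+
+    static void guest_write(unsigned int gfn)   /* models the write fault */
+    {
+        p2m[gfn] = p2m_ram_rw;
+    }
+
+    /* Models what paging_log_dirty_range() is described to do above. */
+    static void harvest_dirty(uint8_t *bitmap)
+    {
+        for ( unsigned int i = 0; i < VRAM_PAGES; i++ )
+            if ( p2m[i] == p2m_ram_rw )
+            {
+                bitmap[i / 8] |= 1u << (i % 8); /* page was written */
+                p2m[i] = p2m_ram_logdirty;      /* re-arm tracking */
+            }
+    }
+
+    int main(void)
+    {
+        uint8_t bitmap[(VRAM_PAGES + 7) / 8] = { 0 };
+
+        for ( unsigned int i = 0; i < VRAM_PAGES; i++ )
+            p2m[i] = p2m_ram_logdirty;          /* tracking enabled */
+
+        guest_write(3);
+        harvest_dirty(bitmap);
+        printf("dirty bitmap: %#x\n", bitmap[0]);   /* prints 0x8 */
+        return 0;
+    }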
+
+From 920e93df4e16c03811665e459c414feced6bc9b6 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 5 Apr 2022 15:20:42 +0200
+Subject: [PATCH 18/32] VT-d: correct ordering of operations in
+ cleanup_domid_map()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The function may be called without any locks held (leaving aside the
+domctl one, which we surely don't want to depend on here), so needs to
+play safe wrt other accesses to domid_map[] and domid_bitmap[]. This is
+to avoid context_set_domain_id()'s writing of domid_map[] being reset to
+zero right away in the case of it racing the freeing of a DID.
+
+For the interaction with context_set_domain_id() and did_to_domain_id()
+see the code comment.
+
+{check_,}cleanup_domid_map() are called with pcidevs_lock held or during
+domain cleanup only (and pcidevs_lock is also held around
+context_set_domain_id()), i.e. racing calls with the same (dom, iommu)
+tuple cannot occur.
+
+domain_iommu_domid(), besides its use by cleanup_domid_map(), has its
+result used only to control flushing, and hence a stale result would
+only lead to a stray extra flush.
+
+This is CVE-2022-26357 / XSA-399.
+
+Fixes: b9c20c78789f ("VT-d: per-iommu domain-id")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: d9eca7bb6c6636eb87bb17b08ba7de270f47ecd0
+master date: 2022-04-05 14:12:27 +0200
+---
+ xen/drivers/passthrough/vtd/iommu.c | 8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index 234a4fbae5..68f9a524b8 100644
+--- a/xen/drivers/passthrough/vtd/iommu.c
++++ b/xen/drivers/passthrough/vtd/iommu.c
+@@ -142,8 +142,14 @@ static void cleanup_domid_map(struct domain *domain, struct vtd_iommu *iommu)
+
+ if ( iommu_domid >= 0 )
+ {
++ /*
++ * Update domid_map[] /before/ domid_bitmap[] to avoid a race with
++ * context_set_domain_id(), setting the slot to DOMID_INVALID for
++ * ->domid_map[] reads to produce a suitable value while the bit is
++ * still set.
++ */
++ iommu->domid_map[iommu_domid] = DOMID_INVALID;
+ clear_bit(iommu_domid, iommu->domid_bitmap);
+- iommu->domid_map[iommu_domid] = 0;
+ }
+ }
+
+--
+2.35.2
+
+
+From 650b888c8a0a03d796632597e6adfd0075f13954 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 5 Apr 2022 15:21:21 +0200
+Subject: [PATCH 19/32] VT-d: fix (de)assign ordering when RMRRs are in use
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+In the event that the RMRR mappings are essential for device operation,
+they should be established before updating the device's context entry,
+while they should be torn down only after the device's context entry was
+successfully updated.
+
+Also adjust a related log message.
+
+This is CVE-2022-26358 / part of XSA-400.
+
+Fixes: 8b99f4400b69 ("VT-d: fix RMRR related error handling")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+master commit: 78a40f8b5dfa1a3aec43528663f39473d4429101
+master date: 2022-04-05 14:15:33 +0200
+---
+ xen/drivers/passthrough/vtd/iommu.c | 56 ++++++++++++++++++-----------
+ 1 file changed, 36 insertions(+), 20 deletions(-)
+
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index 68f9a524b8..50e21bf1d9 100644
+--- a/xen/drivers/passthrough/vtd/iommu.c
++++ b/xen/drivers/passthrough/vtd/iommu.c
+@@ -2392,6 +2392,10 @@ static int reassign_device_ownership(
+ {
+ int ret;
+
++ ret = domain_context_unmap(source, devfn, pdev);
++ if ( ret )
++ return ret;
++
+ /*
+ * Devices assigned to untrusted domains (here assumed to be any domU)
+ * can attempt to send arbitrary LAPIC/MSI messages. We are unprotected
+@@ -2428,10 +2432,6 @@ static int reassign_device_ownership(
+ }
+ }
+
+- ret = domain_context_unmap(source, devfn, pdev);
+- if ( ret )
+- return ret;
+-
+ if ( devfn == pdev->devfn && pdev->domain != dom_io )
+ {
+ list_move(&pdev->domain_list, &dom_io->pdev_list);
+@@ -2508,9 +2508,8 @@ static int intel_iommu_assign_device(
+ }
+ }
+
+- ret = reassign_device_ownership(s, d, devfn, pdev);
+- if ( ret || d == dom_io )
+- return ret;
++ if ( d == dom_io )
++ return reassign_device_ownership(s, d, devfn, pdev);
+
+ /* Setup rmrr identity mapping */
+ for_each_rmrr_device( rmrr, bdf, i )
+@@ -2523,20 +2522,37 @@ static int intel_iommu_assign_device(
+ rmrr->end_address, flag);
+ if ( ret )
+ {
+- int rc;
+-
+- rc = reassign_device_ownership(d, s, devfn, pdev);
+ printk(XENLOG_G_ERR VTDPREFIX
+- " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n",
+- rmrr->base_address, rmrr->end_address,
+- d->domain_id, ret);
+- if ( rc )
+- {
+- printk(XENLOG_ERR VTDPREFIX
+- " failed to reclaim %04x:%02x:%02x.%u from %pd (%d)\n",
+- seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d, rc);
+- domain_crash(d);
+- }
++ "%pd: cannot map reserved region [%"PRIx64",%"PRIx64"]: %d\n",
++ d, rmrr->base_address, rmrr->end_address, ret);
++ break;
++ }
++ }
++ }
++
++ if ( !ret )
++ ret = reassign_device_ownership(s, d, devfn, pdev);
++
++ /* See reassign_device_ownership() for the hwdom aspect. */
++ if ( !ret || is_hardware_domain(d) )
++ return ret;
++
++ for_each_rmrr_device( rmrr, bdf, i )
++ {
++ if ( rmrr->segment == seg &&
++ PCI_BUS(bdf) == bus &&
++ PCI_DEVFN2(bdf) == devfn )
++ {
++ int rc = iommu_identity_mapping(d, p2m_access_x,
++ rmrr->base_address,
++ rmrr->end_address, 0);
++
++ if ( rc && rc != -ENOENT )
++ {
++ printk(XENLOG_ERR VTDPREFIX
++ "%pd: cannot unmap reserved region [%"PRIx64",%"PRIx64"]: %d\n",
++ d, rmrr->base_address, rmrr->end_address, rc);
++ domain_crash(d);
+ break;
+ }
+ }
+--
+2.35.2
+
+
+From 81918cead1a5c2c3fb6648b078501af81f520849 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 5 Apr 2022 15:22:31 +0200
+Subject: [PATCH 20/32] VT-d: fix add/remove ordering when RMRRs are in use
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+In the event that the RMRR mappings are essential for device operation,
+they should be established before updating the device's context entry,
+while they should be torn down only after the device's context entry was
+successfully cleared.
+
+Also switch to %pd in related log messages.
+
+Fixes: fa88cfadf918 ("vt-d: Map RMRR in intel_iommu_add_device() if the device has RMRR")
+Fixes: 8b99f4400b69 ("VT-d: fix RMRR related error handling")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+master commit: 3221f270cf2eba0a22fd4f92319d664eacb92889
+master date: 2022-04-05 14:16:10 +0200
+---
+ xen/drivers/passthrough/vtd/iommu.c | 27 ++++++++++++++-------------
+ 1 file changed, 14 insertions(+), 13 deletions(-)
+
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index 50e21bf1d9..f7d40414ef 100644
+--- a/xen/drivers/passthrough/vtd/iommu.c
++++ b/xen/drivers/passthrough/vtd/iommu.c
+@@ -1993,14 +1993,6 @@ static int intel_iommu_add_device(u8 devfn, struct pci_dev *pdev)
+ if ( !pdev->domain )
+ return -EINVAL;
+
+- ret = domain_context_mapping(pdev->domain, devfn, pdev);
+- if ( ret )
+- {
+- dprintk(XENLOG_ERR VTDPREFIX, "d%d: context mapping failed\n",
+- pdev->domain->domain_id);
+- return ret;
+- }
+-
+ for_each_rmrr_device ( rmrr, bdf, i )
+ {
+ if ( rmrr->segment == pdev->seg &&
+@@ -2017,12 +2009,17 @@ static int intel_iommu_add_device(u8 devfn, struct pci_dev *pdev)
+ rmrr->base_address, rmrr->end_address,
+ 0);
+ if ( ret )
+- dprintk(XENLOG_ERR VTDPREFIX, "d%d: RMRR mapping failed\n",
+- pdev->domain->domain_id);
++ dprintk(XENLOG_ERR VTDPREFIX, "%pd: RMRR mapping failed\n",
++ pdev->domain);
+ }
+ }
+
+- return 0;
++ ret = domain_context_mapping(pdev->domain, devfn, pdev);
++ if ( ret )
++ dprintk(XENLOG_ERR VTDPREFIX, "%pd: context mapping failed\n",
++ pdev->domain);
++
++ return ret;
+ }
+
+ static int intel_iommu_enable_device(struct pci_dev *pdev)
+@@ -2044,11 +2041,15 @@ static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
+ {
+ struct acpi_rmrr_unit *rmrr;
+ u16 bdf;
+- int i;
++ int ret, i;
+
+ if ( !pdev->domain )
+ return -EINVAL;
+
++ ret = domain_context_unmap(pdev->domain, devfn, pdev);
++ if ( ret )
++ return ret;
++
+ for_each_rmrr_device ( rmrr, bdf, i )
+ {
+ if ( rmrr->segment != pdev->seg ||
+@@ -2064,7 +2065,7 @@ static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
+ rmrr->end_address, 0);
+ }
+
+- return domain_context_unmap(pdev->domain, devfn, pdev);
++ return 0;
+ }
+
+ static int __hwdom_init setup_hwdom_device(u8 devfn, struct pci_dev *pdev)
+--
+2.35.2
+
+
+From 33c13654cb6d7d2a5731614f55aace4866c93d97 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 5 Apr 2022 15:23:26 +0200
+Subject: [PATCH 21/32] VT-d: drop ownership checking from
+ domain_context_mapping_one()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Despite putting in quite a bit of effort it was not possible to
+establish why exactly this code exists (beyond possibly sanity
+checking). Instead of a subsequent change further complicating this
+logic, simply get rid of it.
+
+Take the opportunity and move the respective unmap_vtd_domain_page() out
+of the locked region.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+master commit: a680b8134b2d1828bbbf443a97feea66e8a85c75
+master date: 2022-04-05 14:17:21 +0200
+---
+ xen/drivers/passthrough/vtd/iommu.c | 66 +----------------------------
+ 1 file changed, 2 insertions(+), 64 deletions(-)
+
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index f7d40414ef..b729ae173a 100644
+--- a/xen/drivers/passthrough/vtd/iommu.c
++++ b/xen/drivers/passthrough/vtd/iommu.c
+@@ -114,28 +114,6 @@ static int context_set_domain_id(struct context_entry *context,
+ return 0;
+ }
+
+-static int context_get_domain_id(struct context_entry *context,
+- struct vtd_iommu *iommu)
+-{
+- unsigned long dom_index, nr_dom;
+- int domid = -1;
+-
+- if (iommu && context)
+- {
+- nr_dom = cap_ndoms(iommu->cap);
+-
+- dom_index = context_domain_id(*context);
+-
+- if ( dom_index < nr_dom && iommu->domid_map )
+- domid = iommu->domid_map[dom_index];
+- else
+- dprintk(XENLOG_DEBUG VTDPREFIX,
+- "dom_index %lu exceeds nr_dom %lu or iommu has no domid_map\n",
+- dom_index, nr_dom);
+- }
+- return domid;
+-}
+-
+ static void cleanup_domid_map(struct domain *domain, struct vtd_iommu *iommu)
+ {
+ int iommu_domid = domain_iommu_domid(domain, iommu);
+@@ -1392,49 +1370,9 @@ int domain_context_mapping_one(
+
+ if ( context_present(*context) )
+ {
+- int res = 0;
+-
+- /* Try to get domain ownership from device structure. If that's
+- * not available, try to read it from the context itself. */
+- if ( pdev )
+- {
+- if ( pdev->domain != domain )
+- {
+- printk(XENLOG_G_INFO VTDPREFIX
+- "d%d: %04x:%02x:%02x.%u owned by d%d!",
+- domain->domain_id,
+- seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+- pdev->domain ? pdev->domain->domain_id : -1);
+- res = -EINVAL;
+- }
+- }
+- else
+- {
+- int cdomain;
+- cdomain = context_get_domain_id(context, iommu);
+-
+- if ( cdomain < 0 )
+- {
+- printk(XENLOG_G_WARNING VTDPREFIX
+- "d%d: %04x:%02x:%02x.%u mapped, but can't find owner!\n",
+- domain->domain_id,
+- seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+- res = -EINVAL;
+- }
+- else if ( cdomain != domain->domain_id )
+- {
+- printk(XENLOG_G_INFO VTDPREFIX
+- "d%d: %04x:%02x:%02x.%u already mapped to d%d!",
+- domain->domain_id,
+- seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+- cdomain);
+- res = -EINVAL;
+- }
+- }
+-
+- unmap_vtd_domain_page(context_entries);
+ spin_unlock(&iommu->lock);
+- return res;
++ unmap_vtd_domain_page(context_entries);
++ return 0;
+ }
+
+ if ( iommu_hwdom_passthrough && is_hardware_domain(domain) )
+--
+2.35.2
+
+
+From 235aa158e0f71ee2bf20155ce6b0b429acf59d37 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 5 Apr 2022 15:23:57 +0200
+Subject: [PATCH 22/32] VT-d: re-assign devices directly
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Devices with RMRRs, due to it being unspecified how/when the specified
+memory regions may get accessed, may not be left disconnected from their
+respective mappings (as long as it's not certain that the device has
+been fully quiesced). Hence rather than unmapping the old context and
+then mapping the new one, re-assignment needs to be done in a single
+step.
+
+This is CVE-2022-26359 / part of XSA-400.
+
+Reported-by: Roger Pau Monné <roger.pau@citrix.com>
+
+Similarly quarantining scratch-page mode relies on page tables to be
+continuously wired up.
+
+To avoid complicating things more than necessary, treat all devices
+mostly equally, i.e. regardless of their association with any RMRRs. The
+main difference is when it comes to updating context entries, which need
+to be atomic when there are RMRRs. Yet atomicity can only be achieved
+with CMPXCHG16B, availability of which we can't take for granted.
+
+The seemingly complicated choice of non-negative return values for
+domain_context_mapping_one() is to limit code churn: This way callers
+passing NULL for pdev don't need fiddling with.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 8f41e481b4852173909363b88c1ab3da747d3a05
+master date: 2022-04-05 14:17:42 +0200
+---
+ xen/drivers/passthrough/vtd/extern.h | 7 +-
+ xen/drivers/passthrough/vtd/iommu.c | 268 +++++++++++++++++++++------
+ xen/drivers/passthrough/vtd/iommu.h | 8 +-
+ xen/drivers/passthrough/vtd/quirks.c | 14 +-
+ xen/drivers/passthrough/vtd/vtd.h | 10 +-
+ 5 files changed, 233 insertions(+), 74 deletions(-)
+
+diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
+index 1cac22a02f..f51f8aae0d 100644
+--- a/xen/drivers/passthrough/vtd/extern.h
++++ b/xen/drivers/passthrough/vtd/extern.h
+@@ -85,7 +85,8 @@ void free_pgtable_maddr(u64 maddr);
+ void *map_vtd_domain_page(u64 maddr);
+ void unmap_vtd_domain_page(void *va);
+ int domain_context_mapping_one(struct domain *domain, struct vtd_iommu *iommu,
+- u8 bus, u8 devfn, const struct pci_dev *);
++ uint8_t bus, uint8_t devfn,
++ const struct pci_dev *pdev, unsigned int mode);
+ int domain_context_unmap_one(struct domain *domain, struct vtd_iommu *iommu,
+ u8 bus, u8 devfn);
+ int intel_iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt);
+@@ -105,8 +106,8 @@ int is_igd_vt_enabled_quirk(void);
+ void platform_quirks_init(void);
+ void vtd_ops_preamble_quirk(struct vtd_iommu *iommu);
+ void vtd_ops_postamble_quirk(struct vtd_iommu *iommu);
+-int __must_check me_wifi_quirk(struct domain *domain,
+- u8 bus, u8 devfn, int map);
++int __must_check me_wifi_quirk(struct domain *domain, uint8_t bus,
++ uint8_t devfn, unsigned int mode);
+ void pci_vtd_quirk(const struct pci_dev *);
+ void quirk_iommu_caps(struct vtd_iommu *iommu);
+
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index b729ae173a..17deda92d8 100644
+--- a/xen/drivers/passthrough/vtd/iommu.c
++++ b/xen/drivers/passthrough/vtd/iommu.c
+@@ -110,6 +110,7 @@ static int context_set_domain_id(struct context_entry *context,
+ }
+
+ set_bit(i, iommu->domid_bitmap);
++ context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET);
+ context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
+ return 0;
+ }
+@@ -1350,15 +1351,27 @@ static void __hwdom_init intel_iommu_hwdom_init(struct domain *d)
+ }
+ }
+
++/*
++ * This function returns
++ * - a negative errno value upon error,
++ * - zero upon success when previously the entry was non-present, or this isn't
++ * the "main" request for a device (pdev == NULL), or for no-op quarantining
++ * assignments,
++ * - positive (one) upon success when previously the entry was present and this
++ * is the "main" request for a device (pdev != NULL).
++ */
+ int domain_context_mapping_one(
+ struct domain *domain,
+ struct vtd_iommu *iommu,
+- u8 bus, u8 devfn, const struct pci_dev *pdev)
++ uint8_t bus, uint8_t devfn, const struct pci_dev *pdev,
++ unsigned int mode)
+ {
+ struct domain_iommu *hd = dom_iommu(domain);
+- struct context_entry *context, *context_entries;
++ struct context_entry *context, *context_entries, lctxt;
++ __uint128_t old;
+ u64 maddr, pgd_maddr;
+- u16 seg = iommu->drhd->segment;
++ uint16_t seg = iommu->drhd->segment, prev_did = 0;
++ struct domain *prev_dom = NULL;
+ int agaw, rc, ret;
+ bool_t flush_dev_iotlb;
+
+@@ -1367,17 +1380,32 @@ int domain_context_mapping_one(
+ maddr = bus_to_context_maddr(iommu, bus);
+ context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
+ context = &context_entries[devfn];
++ old = (lctxt = *context).full;
+
+- if ( context_present(*context) )
++ if ( context_present(lctxt) )
+ {
+- spin_unlock(&iommu->lock);
+- unmap_vtd_domain_page(context_entries);
+- return 0;
++ domid_t domid;
++
++ prev_did = context_domain_id(lctxt);
++ domid = iommu->domid_map[prev_did];
++ if ( domid < DOMID_FIRST_RESERVED )
++ prev_dom = rcu_lock_domain_by_id(domid);
++ else if ( domid == DOMID_IO )
++ prev_dom = rcu_lock_domain(dom_io);
++ if ( !prev_dom )
++ {
++ spin_unlock(&iommu->lock);
++ unmap_vtd_domain_page(context_entries);
++ dprintk(XENLOG_DEBUG VTDPREFIX,
++ "no domain for did %u (nr_dom %u)\n",
++ prev_did, cap_ndoms(iommu->cap));
++ return -ESRCH;
++ }
+ }
+
+ if ( iommu_hwdom_passthrough && is_hardware_domain(domain) )
+ {
+- context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
++ context_set_translation_type(lctxt, CONTEXT_TT_PASS_THRU);
+ agaw = level_to_agaw(iommu->nr_pt_levels);
+ }
+ else
+@@ -1394,6 +1422,8 @@ int domain_context_mapping_one(
+ spin_unlock(&hd->arch.mapping_lock);
+ spin_unlock(&iommu->lock);
+ unmap_vtd_domain_page(context_entries);
++ if ( prev_dom )
++ rcu_unlock_domain(prev_dom);
+ return -ENOMEM;
+ }
+ }
+@@ -1411,33 +1441,102 @@ int domain_context_mapping_one(
+ goto nomem;
+ }
+
+- context_set_address_root(*context, pgd_maddr);
++ context_set_address_root(lctxt, pgd_maddr);
+ if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) )
+- context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB);
++ context_set_translation_type(lctxt, CONTEXT_TT_DEV_IOTLB);
+ else
+- context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
++ context_set_translation_type(lctxt, CONTEXT_TT_MULTI_LEVEL);
+
+ spin_unlock(&hd->arch.mapping_lock);
+ }
+
+- if ( context_set_domain_id(context, domain, iommu) )
++ rc = context_set_domain_id(&lctxt, domain, iommu);
++ if ( rc )
+ {
++ unlock:
+ spin_unlock(&iommu->lock);
+ unmap_vtd_domain_page(context_entries);
+- return -EFAULT;
++ if ( prev_dom )
++ rcu_unlock_domain(prev_dom);
++ return rc;
++ }
++
++ if ( !prev_dom )
++ {
++ context_set_address_width(lctxt, agaw);
++ context_set_fault_enable(lctxt);
++ context_set_present(lctxt);
++ }
++ else if ( prev_dom == domain )
++ {
++ ASSERT(lctxt.full == context->full);
++ rc = !!pdev;
++ goto unlock;
++ }
++ else
++ {
++ ASSERT(context_address_width(lctxt) == agaw);
++ ASSERT(!context_fault_disable(lctxt));
++ }
++
++ if ( cpu_has_cx16 )
++ {
++ __uint128_t res = cmpxchg16b(context, &old, &lctxt.full);
++
++ /*
++ * Hardware does not update the context entry behind our backs,
++ * so the return value should match "old".
++ */
++ if ( res != old )
++ {
++ if ( pdev )
++ check_cleanup_domid_map(domain, pdev, iommu);
++ printk(XENLOG_ERR
++ "%04x:%02x:%02x.%u: unexpected context entry %016lx_%016lx (expected %016lx_%016lx)\n",
++ pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
++ (uint64_t)(res >> 64), (uint64_t)res,
++ (uint64_t)(old >> 64), (uint64_t)old);
++ rc = -EILSEQ;
++ goto unlock;
++ }
++ }
++ else if ( !prev_dom || !(mode & MAP_WITH_RMRR) )
++ {
++ context_clear_present(*context);
++ iommu_sync_cache(context, sizeof(*context));
++
++ write_atomic(&context->hi, lctxt.hi);
++ /* No barrier should be needed between these two. */
++ write_atomic(&context->lo, lctxt.lo);
++ }
++ else /* Best effort, updating DID last. */
++ {
++ /*
++ * By non-atomically updating the context entry's DID field last,
++ * during a short window in time TLB entries with the old domain ID
++ * but the new page tables may be inserted. This could affect I/O
++ * of other devices using this same (old) domain ID. Such updating
++ * therefore is not a problem if this was the only device associated
++ * with the old domain ID. Diverting I/O of any of a dying domain's
++ * devices to the quarantine page tables is intended anyway.
++ */
++ if ( !(mode & (MAP_OWNER_DYING | MAP_SINGLE_DEVICE)) )
++ printk(XENLOG_WARNING VTDPREFIX
++ " %04x:%02x:%02x.%u: reassignment may cause %pd data corruption\n",
++ seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), prev_dom);
++
++ write_atomic(&context->lo, lctxt.lo);
++ /* No barrier should be needed between these two. */
++ write_atomic(&context->hi, lctxt.hi);
+ }
+
+- context_set_address_width(*context, agaw);
+- context_set_fault_enable(*context);
+- context_set_present(*context);
+ iommu_sync_cache(context, sizeof(struct context_entry));
+ spin_unlock(&iommu->lock);
+
+- /* Context entry was previously non-present (with domid 0). */
+- rc = iommu_flush_context_device(iommu, 0, PCI_BDF2(bus, devfn),
+- DMA_CCMD_MASK_NOBIT, 1);
++ rc = iommu_flush_context_device(iommu, prev_did, PCI_BDF2(bus, devfn),
++ DMA_CCMD_MASK_NOBIT, !prev_dom);
+ flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
+- ret = iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb);
++ ret = iommu_flush_iotlb_dsi(iommu, prev_did, !prev_dom, flush_dev_iotlb);
+
+ /*
+ * The current logic for returns:
+@@ -1458,12 +1557,21 @@ int domain_context_mapping_one(
+ unmap_vtd_domain_page(context_entries);
+
+ if ( !seg && !rc )
+- rc = me_wifi_quirk(domain, bus, devfn, MAP_ME_PHANTOM_FUNC);
++ rc = me_wifi_quirk(domain, bus, devfn, mode);
+
+ if ( rc )
+- domain_context_unmap_one(domain, iommu, bus, devfn);
++ {
++ if ( !prev_dom )
++ domain_context_unmap_one(domain, iommu, bus, devfn);
++ else if ( prev_dom != domain ) /* Avoid infinite recursion. */
++ domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev,
++ mode & MAP_WITH_RMRR);
++ }
+
+- return rc;
++ if ( prev_dom )
++ rcu_unlock_domain(prev_dom);
++
++ return rc ?: pdev && prev_dom;
+ }
+
+ static int domain_context_unmap(struct domain *d, uint8_t devfn,
+@@ -1473,8 +1581,11 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ struct pci_dev *pdev)
+ {
+ struct acpi_drhd_unit *drhd;
++ const struct acpi_rmrr_unit *rmrr;
+ int ret = 0;
+- u8 seg = pdev->seg, bus = pdev->bus, secbus;
++ unsigned int i, mode = 0;
++ uint16_t seg = pdev->seg, bdf;
++ uint8_t bus = pdev->bus, secbus;
+
+ drhd = acpi_find_matched_drhd_unit(pdev);
+ if ( !drhd )
+@@ -1493,8 +1604,29 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+
+ ASSERT(pcidevs_locked());
+
++ for_each_rmrr_device( rmrr, bdf, i )
++ {
++ if ( rmrr->segment != pdev->seg || bdf != pdev->sbdf.bdf )
++ continue;
++
++ mode |= MAP_WITH_RMRR;
++ break;
++ }
++
++ if ( domain != pdev->domain )
++ {
++ if ( pdev->domain->is_dying )
++ mode |= MAP_OWNER_DYING;
++ else if ( drhd &&
++ !any_pdev_behind_iommu(pdev->domain, pdev, drhd->iommu) &&
++ !pdev->phantom_stride )
++ mode |= MAP_SINGLE_DEVICE;
++ }
++
+ switch ( pdev->type )
+ {
++ bool prev_present;
++
+ case DEV_TYPE_PCI_HOST_BRIDGE:
+ if ( iommu_debug )
+ printk(VTDPREFIX "d%d:Hostbridge: skip %04x:%02x:%02x.%u map\n",
+@@ -1515,7 +1647,9 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ domain->domain_id, seg, bus,
+ PCI_SLOT(devfn), PCI_FUNC(devfn));
+ ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
+- pdev);
++ pdev, mode);
++ if ( ret > 0 )
++ ret = 0;
+ if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
+ enable_ats_device(pdev, &drhd->iommu->ats_devices);
+
+@@ -1528,9 +1662,10 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+ ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
+- pdev);
+- if ( ret )
++ pdev, mode);
++ if ( ret < 0 )
+ break;
++ prev_present = ret;
+
+ if ( (ret = find_upstream_bridge(seg, &bus, &devfn, &secbus)) < 1 )
+ {
+@@ -1538,6 +1673,15 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ break;
+ ret = -ENXIO;
+ }
++ /*
++ * Strictly speaking if the device is the only one behind this bridge
++ * and the only one with this (secbus,0,0) tuple, it could be allowed
++ * to be re-assigned regardless of RMRR presence. But let's deal with
++ * that case only if it is actually found in the wild.
++ */
++ else if ( prev_present && (mode & MAP_WITH_RMRR) &&
++ domain != pdev->domain )
++ ret = -EOPNOTSUPP;
+
+ /*
+ * Mapping a bridge should, if anything, pass the struct pci_dev of
+@@ -1546,7 +1690,7 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ */
+ if ( ret >= 0 )
+ ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
+- NULL);
++ NULL, mode);
+
+ /*
+ * Devices behind PCIe-to-PCI/PCIx bridge may generate different
+@@ -1561,10 +1705,15 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE &&
+ (secbus != pdev->bus || pdev->devfn != 0) )
+ ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0,
+- NULL);
++ NULL, mode);
+
+ if ( ret )
+- domain_context_unmap(domain, devfn, pdev);
++ {
++ if ( !prev_present )
++ domain_context_unmap(domain, devfn, pdev);
++ else if ( pdev->domain != domain ) /* Avoid infinite recursion. */
++ domain_context_mapping(pdev->domain, devfn, pdev);
++ }
+
+ break;
+
+@@ -2331,9 +2480,8 @@ static int reassign_device_ownership(
+ {
+ int ret;
+
+- ret = domain_context_unmap(source, devfn, pdev);
+- if ( ret )
+- return ret;
++ if ( !has_arch_pdevs(target) )
++ vmx_pi_hooks_assign(target);
+
+ /*
+ * Devices assigned to untrusted domains (here assumed to be any domU)
+@@ -2343,6 +2491,31 @@ static int reassign_device_ownership(
+ if ( (target != hardware_domain) && !iommu_intremap )
+ untrusted_msi = true;
+
++ ret = domain_context_mapping(target, devfn, pdev);
++ if ( ret )
++ {
++ if ( !has_arch_pdevs(target) )
++ vmx_pi_hooks_deassign(target);
++ return ret;
++ }
++
++ if ( pdev->devfn == devfn )
++ {
++ const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev);
++
++ if ( drhd )
++ check_cleanup_domid_map(source, pdev, drhd->iommu);
++ }
++
++ if ( devfn == pdev->devfn && pdev->domain != target )
++ {
++ list_move(&pdev->domain_list, &target->pdev_list);
++ pdev->domain = target;
++ }
++
++ if ( !has_arch_pdevs(source) )
++ vmx_pi_hooks_deassign(source);
++
+ /*
+ * If the device belongs to the hardware domain, and it has RMRR, don't
+ * remove it from the hardware domain, because BIOS may use RMRR at
+@@ -2371,34 +2544,7 @@ static int reassign_device_ownership(
+ }
+ }
+
+- if ( devfn == pdev->devfn && pdev->domain != dom_io )
+- {
+- list_move(&pdev->domain_list, &dom_io->pdev_list);
+- pdev->domain = dom_io;
+- }
+-
+- if ( !has_arch_pdevs(source) )
+- vmx_pi_hooks_deassign(source);
+-
+- if ( !has_arch_pdevs(target) )
+- vmx_pi_hooks_assign(target);
+-
+- ret = domain_context_mapping(target, devfn, pdev);
+- if ( ret )
+- {
+- if ( !has_arch_pdevs(target) )
+- vmx_pi_hooks_deassign(target);
+-
+- return ret;
+- }
+-
+- if ( devfn == pdev->devfn && pdev->domain != target )
+- {
+- list_move(&pdev->domain_list, &target->pdev_list);
+- pdev->domain = target;
+- }
+-
+- return ret;
++ return 0;
+ }
+
+ static int intel_iommu_assign_device(
+diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h
+index 32b39c606a..503b07ffb7 100644
+--- a/xen/drivers/passthrough/vtd/iommu.h
++++ b/xen/drivers/passthrough/vtd/iommu.h
+@@ -202,8 +202,12 @@ struct root_entry {
+ do {(root).val |= ((value) & PAGE_MASK_4K);} while(0)
+
+ struct context_entry {
+- u64 lo;
+- u64 hi;
++ union {
++ struct {
++ uint64_t lo, hi;
++ };
++ __uint128_t full;
++ };
+ };
+ #define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry))
+ #define context_present(c) ((c).lo & 1)
+diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c
+index 435e449ca3..99e159b4e9 100644
+--- a/xen/drivers/passthrough/vtd/quirks.c
++++ b/xen/drivers/passthrough/vtd/quirks.c
+@@ -343,7 +343,8 @@ void __init platform_quirks_init(void)
+ */
+
+ static int __must_check map_me_phantom_function(struct domain *domain,
+- u32 dev, int map)
++ unsigned int dev,
++ unsigned int mode)
+ {
+ struct acpi_drhd_unit *drhd;
+ struct pci_dev *pdev;
+@@ -354,9 +355,9 @@ static int __must_check map_me_phantom_function(struct domain *domain,
+ drhd = acpi_find_matched_drhd_unit(pdev);
+
+ /* map or unmap ME phantom function */
+- if ( map )
++ if ( !(mode & UNMAP_ME_PHANTOM_FUNC) )
+ rc = domain_context_mapping_one(domain, drhd->iommu, 0,
+- PCI_DEVFN(dev, 7), NULL);
++ PCI_DEVFN(dev, 7), NULL, mode);
+ else
+ rc = domain_context_unmap_one(domain, drhd->iommu, 0,
+ PCI_DEVFN(dev, 7));
+@@ -364,7 +365,8 @@ static int __must_check map_me_phantom_function(struct domain *domain,
+ return rc;
+ }
+
+-int me_wifi_quirk(struct domain *domain, u8 bus, u8 devfn, int map)
++int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn,
++ unsigned int mode)
+ {
+ u32 id;
+ int rc = 0;
+@@ -388,7 +390,7 @@ int me_wifi_quirk(struct domain *domain, u8 bus, u8 devfn, int map)
+ case 0x423b8086:
+ case 0x423c8086:
+ case 0x423d8086:
+- rc = map_me_phantom_function(domain, 3, map);
++ rc = map_me_phantom_function(domain, 3, mode);
+ break;
+ default:
+ break;
+@@ -414,7 +416,7 @@ int me_wifi_quirk(struct domain *domain, u8 bus, u8 devfn, int map)
+ case 0x42388086: /* Puma Peak */
+ case 0x422b8086:
+ case 0x422c8086:
+- rc = map_me_phantom_function(domain, 22, map);
++ rc = map_me_phantom_function(domain, 22, mode);
+ break;
+ default:
+ break;
+diff --git a/xen/drivers/passthrough/vtd/vtd.h b/xen/drivers/passthrough/vtd/vtd.h
+index bb8889f350..e4ab242fee 100644
+--- a/xen/drivers/passthrough/vtd/vtd.h
++++ b/xen/drivers/passthrough/vtd/vtd.h
+@@ -22,8 +22,14 @@
+
+ #include <xen/iommu.h>
+
+-#define MAP_ME_PHANTOM_FUNC 1
+-#define UNMAP_ME_PHANTOM_FUNC 0
++/*
++ * Values for domain_context_mapping_one()'s and me_wifi_quirk()'s "mode"
++ * parameters.
++ */
++#define MAP_WITH_RMRR (1u << 0)
++#define MAP_OWNER_DYING (1u << 1)
++#define MAP_SINGLE_DEVICE (1u << 2)
++#define UNMAP_ME_PHANTOM_FUNC (1u << 3)
+
+ /* Allow for both IOAPIC and IOSAPIC. */
+ #define IO_xAPIC_route_entry IO_APIC_route_entry
+--
+2.35.2
+
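+The atomic-update strategy described above can be illustrated with a
+short, self-contained sketch (plain C with GCC/Clang builtins, not Xen
+code; struct desc, update_atomic() and update_split() are invented names).
+A 16-byte compare-and-exchange replaces the whole entry in one shot; the
+fallback stores the two halves separately, ID-carrying half last, so that
+during the short window between the two stores translations may still be
+tagged with the old ID while already using the new tables, which is
+harmless when no other active device shares that old ID:
+
+    #include <stdbool.h>
+    #include <stdint.h>
+    #include <stdio.h>
+
+    /* Two-word descriptor; "hi" is assumed to carry the ID field. The
+     * __int128 member gives the 16-byte alignment CMPXCHG16B needs. */
+    struct desc {
+        union {
+            struct { uint64_t lo, hi; };
+            unsigned __int128 full;
+        };
+    };
+
+    /* Single atomic 16-byte update; needs CMPXCHG16B (compile with -mcx16
+     * on x86-64, or link libatomic). Returns false if the entry changed
+     * under our feet, which the caller would treat as an error. */
+    static bool update_atomic(struct desc *d, struct desc expected,
+                              struct desc val)
+    {
+        return __atomic_compare_exchange_n(&d->full, &expected.full,
+                                           val.full, false,
+                                           __ATOMIC_SEQ_CST,
+                                           __ATOMIC_SEQ_CST);
+    }
+
+    /* Best-effort fallback when 16-byte CAS is unavailable: store the half
+     * without the ID first, then the ID-carrying half. */
+    static void update_split(struct desc *d, struct desc val)
+    {
+        __atomic_store_n(&d->lo, val.lo, __ATOMIC_RELEASE);
+        __atomic_store_n(&d->hi, val.hi, __ATOMIC_RELEASE);
+    }
+
+    int main(void)
+    {
+        struct desc d, want, repl;
+
+        d.lo = 1; d.hi = 2;
+        want = d;
+        repl.lo = 3; repl.hi = 4;
+        printf("atomic update: %s\n",
+               update_atomic(&d, want, repl) ? "ok" : "raced");
+
+        repl.lo = 5; repl.hi = 6;
+        update_split(&d, repl);
+        printf("entry now: %llx/%llx\n",
+               (unsigned long long)d.lo, (unsigned long long)d.hi);
+        return 0;
+    }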
+
+From 73e25ecaef14d4df521235b6dbe5ceaaa3f02e8a Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 5 Apr 2022 15:24:23 +0200
+Subject: [PATCH 23/32] AMD/IOMMU: re-assign devices directly
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Devices with unity map ranges, due to it being unspecified how/when
+these memory ranges may get accessed, may not be left disconnected from
+their unity mappings (as long as it's not certain that the device has
+been fully quiesced). Hence rather than tearing down the old root page
+table pointer and then establishing the new one, re-assignment needs to
+be done in a single step.
+
+This is CVE-2022-26360 / part of XSA-400.
+
+Reported-by: Roger Pau Monné <roger.pau@citrix.com>
+
+Similarly quarantining scratch-page mode relies on page tables to be
+continuously wired up.
+
+To avoid complicating things more than necessary, treat all devices
+mostly equally, i.e. regardless of their association with any unity map
+ranges. The main difference is when it comes to updating DTEs, which need
+to be atomic when there are unity mappings. Yet atomicity can only be
+achieved with CMPXCHG16B, availability of which we can't take for granted.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 1fa6e9aa36233fe9c29a204fcb2697e985b8345f
+master date: 2022-04-05 14:18:04 +0200
+---
+ xen/drivers/passthrough/amd/iommu_map.c | 67 ++++++-
+ xen/drivers/passthrough/amd/pci_amd_iommu.c | 180 +++++++++++++-----
+ xen/include/asm-x86/hvm/svm/amd-iommu-proto.h | 10 +-
+ 3 files changed, 200 insertions(+), 57 deletions(-)
+
+diff --git a/xen/drivers/passthrough/amd/iommu_map.c b/xen/drivers/passthrough/amd/iommu_map.c
+index ac30cac05b..45559f9678 100644
+--- a/xen/drivers/passthrough/amd/iommu_map.c
++++ b/xen/drivers/passthrough/amd/iommu_map.c
+@@ -103,10 +103,69 @@ static unsigned int set_iommu_pte_present(unsigned long pt_mfn,
+ return flush_flags;
+ }
+
+-void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
+- uint64_t root_ptr, uint16_t domain_id,
+- uint8_t paging_mode, bool valid)
++/*
++ * This function returns
++ * - -errno for errors,
++ * - 0 for a successful update, atomic when necessary
++ * - 1 for a successful but non-atomic update, which may need to be warned
++ * about by the caller.
++ */
++int amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
++ uint64_t root_ptr, uint16_t domain_id,
++ uint8_t paging_mode, unsigned int flags)
+ {
++ bool valid = flags & SET_ROOT_VALID;
++
++ if ( dte->v && dte->tv &&
++ (cpu_has_cx16 || (flags & SET_ROOT_WITH_UNITY_MAP)) )
++ {
++ union {
++ struct amd_iommu_dte dte;
++ uint64_t raw64[4];
++ __uint128_t raw128[2];
++ } ldte = { .dte = *dte };
++ __uint128_t old = ldte.raw128[0];
++ int ret = 0;
++
++ ldte.dte.domain_id = domain_id;
++ ldte.dte.pt_root = paddr_to_pfn(root_ptr);
++ ldte.dte.iw = true;
++ ldte.dte.ir = true;
++ ldte.dte.paging_mode = paging_mode;
++ ldte.dte.v = valid;
++
++ if ( cpu_has_cx16 )
++ {
++ __uint128_t res = cmpxchg16b(dte, &old, &ldte.raw128[0]);
++
++ /*
++ * Hardware does not update the DTE behind our backs, so the
++ * return value should match "old".
++ */
++ if ( res != old )
++ {
++ printk(XENLOG_ERR
++ "Dom%d: unexpected DTE %016lx_%016lx (expected %016lx_%016lx)\n",
++ domain_id,
++ (uint64_t)(res >> 64), (uint64_t)res,
++ (uint64_t)(old >> 64), (uint64_t)old);
++ ret = -EILSEQ;
++ }
++ }
++ else /* Best effort, updating domain_id last. */
++ {
++ uint64_t *ptr = (void *)dte;
++
++ write_atomic(ptr + 0, ldte.raw64[0]);
++ /* No barrier should be needed between these two. */
++ write_atomic(ptr + 1, ldte.raw64[1]);
++
++ ret = 1;
++ }
++
++ return ret;
++ }
++
+ if ( valid || dte->v )
+ {
+ dte->tv = false;
+@@ -121,6 +180,8 @@ void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
+ smp_wmb();
+ dte->tv = true;
+ dte->v = valid;
++
++ return 0;
+ }
+
+ void amd_iommu_set_intremap_table(
+diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c
+index beafb0171d..14483e85ae 100644
+--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
++++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
+@@ -85,40 +85,81 @@ int get_dma_requestor_id(uint16_t seg, uint16_t bdf)
+ return req_id;
+ }
+
+-static void amd_iommu_setup_domain_device(
++static int __must_check allocate_domain_resources(struct domain_iommu *hd)
++{
++ int rc;
++
++ spin_lock(&hd->arch.mapping_lock);
++ rc = amd_iommu_alloc_root(hd);
++ spin_unlock(&hd->arch.mapping_lock);
++
++ return rc;
++}
++
++static bool any_pdev_behind_iommu(const struct domain *d,
++ const struct pci_dev *exclude,
++ const struct amd_iommu *iommu)
++{
++ const struct pci_dev *pdev;
++
++ for_each_pdev ( d, pdev )
++ {
++ if ( pdev == exclude )
++ continue;
++
++ if ( find_iommu_for_device(pdev->seg, pdev->sbdf.bdf) == iommu )
++ return true;
++ }
++
++ return false;
++}
++
++static int __must_check amd_iommu_setup_domain_device(
+ struct domain *domain, struct amd_iommu *iommu,
+ uint8_t devfn, struct pci_dev *pdev)
+ {
+ struct amd_iommu_dte *table, *dte;
+ unsigned long flags;
+- int req_id, valid = 1;
++ unsigned int req_id, sr_flags;
++ int rc;
+ u8 bus = pdev->bus;
+- const struct domain_iommu *hd = dom_iommu(domain);
++ struct domain_iommu *hd = dom_iommu(domain);
++ const struct ivrs_mappings *ivrs_dev;
++
++ BUG_ON(!hd->arch.paging_mode || !iommu->dev_table.buffer);
+
+- BUG_ON( !hd->arch.root_table || !hd->arch.paging_mode ||
+- !iommu->dev_table.buffer );
++ rc = allocate_domain_resources(hd);
++ if ( rc )
++ return rc;
+
+- if ( iommu_hwdom_passthrough && is_hardware_domain(domain) )
+- valid = 0;
++ req_id = get_dma_requestor_id(iommu->seg, pdev->sbdf.bdf);
++ ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id];
++ sr_flags = (iommu_hwdom_passthrough && is_hardware_domain(domain)
++ ? 0 : SET_ROOT_VALID)
++ | (ivrs_dev->unity_map ? SET_ROOT_WITH_UNITY_MAP : 0);
+
+ /* get device-table entry */
+ req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn));
+ table = iommu->dev_table.buffer;
+ dte = &table[req_id];
++ ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id];
+
+ spin_lock_irqsave(&iommu->lock, flags);
+
+ if ( !dte->v || !dte->tv )
+ {
+- const struct ivrs_mappings *ivrs_dev;
+-
+ /* bind DTE to domain page-tables */
+- amd_iommu_set_root_page_table(
+- dte, page_to_maddr(hd->arch.root_table), domain->domain_id,
+- hd->arch.paging_mode, valid);
++ rc = amd_iommu_set_root_page_table(
++ dte, page_to_maddr(hd->arch.root_table),
++ domain->domain_id, hd->arch.paging_mode, sr_flags);
++ if ( rc )
++ {
++ ASSERT(rc < 0);
++ spin_unlock_irqrestore(&iommu->lock, flags);
++ return rc;
++ }
+
+ /* Undo what amd_iommu_disable_domain_device() may have done. */
+- ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id];
+ if ( dte->it_root )
+ {
+ dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED;
+@@ -133,17 +174,74 @@ static void amd_iommu_setup_domain_device(
+ dte->i = ats_enabled;
+
+ amd_iommu_flush_device(iommu, req_id);
++ }
++ else if ( dte->pt_root != mfn_x(page_to_mfn(hd->arch.root_table)) )
++ {
++ /*
++ * Strictly speaking if the device is the only one with this requestor
++ * ID, it could be allowed to be re-assigned regardless of unity map
++ * presence. But let's deal with that case only if it is actually
++ * found in the wild.
++ */
++ if ( req_id != PCI_BDF2(bus, devfn) &&
++ (sr_flags & SET_ROOT_WITH_UNITY_MAP) )
++ rc = -EOPNOTSUPP;
++ else
++ rc = amd_iommu_set_root_page_table(
++ dte, page_to_maddr(hd->arch.root_table),
++ domain->domain_id, hd->arch.paging_mode, sr_flags);
++ if ( rc < 0 )
++ {
++ spin_unlock_irqrestore(&iommu->lock, flags);
++ return rc;
++ }
++ if ( rc &&
++ domain != pdev->domain &&
++ /*
++ * By non-atomically updating the DTE's domain ID field last,
++ * during a short window in time TLB entries with the old domain
++ * ID but the new page tables may have been inserted. This could
++ * affect I/O of other devices using this same (old) domain ID.
++ * Such updating therefore is not a problem if this was the only
++ * device associated with the old domain ID. Diverting I/O of any
++ * of a dying domain's devices to the quarantine page tables is
++ * intended anyway.
++ */
++ !pdev->domain->is_dying &&
++ (any_pdev_behind_iommu(pdev->domain, pdev, iommu) ||
++ pdev->phantom_stride) )
++ printk(" %04x:%02x:%02x.%u: reassignment may cause %pd data corruption\n",
++ pdev->seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
++ pdev->domain);
++
++ /*
++ * Check remaining settings are still in place from an earlier call
++ * here. They're all independent of the domain, so should not have
++ * changed.
++ */
++ if ( dte->it_root )
++ ASSERT(dte->int_ctl == IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED);
++ ASSERT(dte->iv == iommu_intremap);
++ ASSERT(dte->ex == ivrs_dev->dte_allow_exclusion);
++ ASSERT(dte->sys_mgt == MASK_EXTR(ivrs_dev->device_flags,
++ ACPI_IVHD_SYSTEM_MGMT));
+
+- AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, "
+- "root table = %#"PRIx64", "
+- "domain = %d, paging mode = %d\n",
+- req_id, pdev->type,
+- page_to_maddr(hd->arch.root_table),
+- domain->domain_id, hd->arch.paging_mode);
++ if ( pci_ats_device(iommu->seg, bus, pdev->devfn) &&
++ iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) )
++ ASSERT(dte->i == ats_enabled);
++
++ amd_iommu_flush_device(iommu, req_id);
+ }
+
+ spin_unlock_irqrestore(&iommu->lock, flags);
+
++ AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, "
++ "root table = %#"PRIx64", "
++ "domain = %d, paging mode = %d\n",
++ req_id, pdev->type,
++ page_to_maddr(hd->arch.root_table),
++ domain->domain_id, hd->arch.paging_mode);
++
+ ASSERT(pcidevs_locked());
+
+ if ( pci_ats_device(iommu->seg, bus, pdev->devfn) &&
+@@ -154,6 +252,8 @@ static void amd_iommu_setup_domain_device(
+
+ amd_iommu_flush_iotlb(devfn, pdev, INV_IOMMU_ALL_PAGES_ADDRESS, 0);
+ }
++
++ return 0;
+ }
+
+ int __init acpi_ivrs_init(void)
+@@ -223,17 +323,6 @@ int amd_iommu_alloc_root(struct domain_iommu *hd)
+ return 0;
+ }
+
+-static int __must_check allocate_domain_resources(struct domain_iommu *hd)
+-{
+- int rc;
+-
+- spin_lock(&hd->arch.mapping_lock);
+- rc = amd_iommu_alloc_root(hd);
+- spin_unlock(&hd->arch.mapping_lock);
+-
+- return rc;
+-}
+-
+ int __read_mostly amd_iommu_min_paging_mode = 1;
+
+ static int amd_iommu_domain_init(struct domain *d)
+@@ -333,7 +422,6 @@ static int reassign_device(struct domain *source, struct domain *target,
+ {
+ struct amd_iommu *iommu;
+ int bdf, rc;
+- struct domain_iommu *t = dom_iommu(target);
+ const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg);
+
+ bdf = PCI_BDF2(pdev->bus, pdev->devfn);
+@@ -347,7 +435,15 @@ static int reassign_device(struct domain *source, struct domain *target,
+ return -ENODEV;
+ }
+
+- amd_iommu_disable_domain_device(source, iommu, devfn, pdev);
++ rc = amd_iommu_setup_domain_device(target, iommu, devfn, pdev);
++ if ( rc )
++ return rc;
++
++ if ( devfn == pdev->devfn && pdev->domain != target )
++ {
++ list_move(&pdev->domain_list, &target->pdev_list);
++ pdev->domain = target;
++ }
+
+ /*
+ * If the device belongs to the hardware domain, and it has a unity mapping,
+@@ -363,27 +459,10 @@ static int reassign_device(struct domain *source, struct domain *target,
+ return rc;
+ }
+
+- if ( devfn == pdev->devfn && pdev->domain != dom_io )
+- {
+- list_move(&pdev->domain_list, &dom_io->pdev_list);
+- pdev->domain = dom_io;
+- }
+-
+- rc = allocate_domain_resources(t);
+- if ( rc )
+- return rc;
+-
+- amd_iommu_setup_domain_device(target, iommu, devfn, pdev);
+ AMD_IOMMU_DEBUG("Re-assign %04x:%02x:%02x.%u from dom%d to dom%d\n",
+ pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+ source->domain_id, target->domain_id);
+
+- if ( devfn == pdev->devfn && pdev->domain != target )
+- {
+- list_move(&pdev->domain_list, &target->pdev_list);
+- pdev->domain = target;
+- }
+-
+ return 0;
+ }
+
+@@ -547,8 +626,7 @@ static int amd_iommu_add_device(u8 devfn, struct pci_dev *pdev)
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ }
+
+- amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev);
+- return 0;
++ return amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev);
+ }
+
+ static int amd_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
+diff --git a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+index 3983293540..52c889ade0 100644
+--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
++++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+@@ -79,9 +79,13 @@ void amd_iommu_set_intremap_table(struct amd_iommu_dte *dte,
+ const void *ptr,
+ const struct amd_iommu *iommu,
+ bool valid);
+-void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
+- uint64_t root_ptr, uint16_t domain_id,
+- uint8_t paging_mode, bool valid);
++#define SET_ROOT_VALID (1u << 0)
++#define SET_ROOT_WITH_UNITY_MAP (1u << 1)
++int __must_check amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
++ uint64_t root_ptr,
++ uint16_t domain_id,
++ uint8_t paging_mode,
++ unsigned int flags);
+ void iommu_dte_add_device_entry(struct amd_iommu_dte *dte,
+ const struct ivrs_mappings *ivrs_dev);
+ void iommu_dte_set_guest_cr3(struct amd_iommu_dte *dte, uint16_t dom_id,
+--
+2.35.2
+
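+The sketch following patch 22 (the VT-d counterpart) applies here as well,
+with one difference: an AMD DTE is 256 bits wide and, as the hunk above
+shows, only its first 128 bits (carrying the V/TV bits, the page table
+root and the domain ID) are swapped with CMPXCHG16B; the fallback likewise
+stores raw64[0] first and the domain-ID-carrying raw64[1] last.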
+
+From 92acf6b23154d65066ec4702fdca5cf232856d90 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 5 Apr 2022 15:24:54 +0200
+Subject: [PATCH 24/32] VT-d: prepare for per-device quarantine page tables
+ (part I)
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Arrange for domain ID and page table root to be passed around, the latter in
+particular to domain_pgd_maddr() such that taking it from the per-domain
+fields can be overridden.
+
+No functional change intended.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+master commit: eb19326a328d49a6a4dc3930391b340f3bcd8948
+master date: 2022-04-05 14:18:26 +0200
+---
+ xen/drivers/passthrough/vtd/extern.h | 8 ++--
+ xen/drivers/passthrough/vtd/iommu.c | 62 ++++++++++++++++++----------
+ xen/drivers/passthrough/vtd/quirks.c | 13 +++---
+ 3 files changed, 54 insertions(+), 29 deletions(-)
+
+diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
+index f51f8aae0d..897dcff9ff 100644
+--- a/xen/drivers/passthrough/vtd/extern.h
++++ b/xen/drivers/passthrough/vtd/extern.h
+@@ -86,9 +86,10 @@ void *map_vtd_domain_page(u64 maddr);
+ void unmap_vtd_domain_page(void *va);
+ int domain_context_mapping_one(struct domain *domain, struct vtd_iommu *iommu,
+ uint8_t bus, uint8_t devfn,
+- const struct pci_dev *pdev, unsigned int mode);
++ const struct pci_dev *pdev, domid_t domid,
++ paddr_t pgd_maddr, unsigned int mode);
+ int domain_context_unmap_one(struct domain *domain, struct vtd_iommu *iommu,
+- u8 bus, u8 devfn);
++ uint8_t bus, uint8_t devfn, domid_t domid);
+ int intel_iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt);
+
+ unsigned int io_apic_read_remap_rte(unsigned int apic, unsigned int reg);
+@@ -107,7 +108,8 @@ void platform_quirks_init(void);
+ void vtd_ops_preamble_quirk(struct vtd_iommu *iommu);
+ void vtd_ops_postamble_quirk(struct vtd_iommu *iommu);
+ int __must_check me_wifi_quirk(struct domain *domain, uint8_t bus,
+- uint8_t devfn, unsigned int mode);
++ uint8_t devfn, domid_t domid, paddr_t pgd_maddr,
++ unsigned int mode);
+ void pci_vtd_quirk(const struct pci_dev *);
+ void quirk_iommu_caps(struct vtd_iommu *iommu);
+
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index 17deda92d8..ac2c73e32a 100644
+--- a/xen/drivers/passthrough/vtd/iommu.c
++++ b/xen/drivers/passthrough/vtd/iommu.c
+@@ -1364,12 +1364,12 @@ int domain_context_mapping_one(
+ struct domain *domain,
+ struct vtd_iommu *iommu,
+ uint8_t bus, uint8_t devfn, const struct pci_dev *pdev,
+- unsigned int mode)
++ domid_t domid, paddr_t pgd_maddr, unsigned int mode)
+ {
+ struct domain_iommu *hd = dom_iommu(domain);
+ struct context_entry *context, *context_entries, lctxt;
+ __uint128_t old;
+- u64 maddr, pgd_maddr;
++ uint64_t maddr;
+ uint16_t seg = iommu->drhd->segment, prev_did = 0;
+ struct domain *prev_dom = NULL;
+ int agaw, rc, ret;
+@@ -1410,10 +1410,12 @@ int domain_context_mapping_one(
+ }
+ else
+ {
++ paddr_t root = pgd_maddr;
++
+ spin_lock(&hd->arch.mapping_lock);
+
+ /* Ensure we have pagetables allocated down to leaf PTE. */
+- if ( hd->arch.pgd_maddr == 0 )
++ if ( !root )
+ {
+ addr_to_dma_page_maddr(domain, 0, 1);
+ if ( hd->arch.pgd_maddr == 0 )
+@@ -1426,22 +1428,24 @@ int domain_context_mapping_one(
+ rcu_unlock_domain(prev_dom);
+ return -ENOMEM;
+ }
++
++ root = hd->arch.pgd_maddr;
+ }
+
+ /* Skip top levels of page tables for 2- and 3-level DRHDs. */
+- pgd_maddr = hd->arch.pgd_maddr;
+ for ( agaw = level_to_agaw(4);
+ agaw != level_to_agaw(iommu->nr_pt_levels);
+ agaw-- )
+ {
+- struct dma_pte *p = map_vtd_domain_page(pgd_maddr);
+- pgd_maddr = dma_pte_addr(*p);
++ struct dma_pte *p = map_vtd_domain_page(root);
++
++ root = dma_pte_addr(*p);
+ unmap_vtd_domain_page(p);
+- if ( pgd_maddr == 0 )
++ if ( !root )
+ goto nomem;
+ }
+
+- context_set_address_root(lctxt, pgd_maddr);
++ context_set_address_root(lctxt, root);
+ if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) )
+ context_set_translation_type(lctxt, CONTEXT_TT_DEV_IOTLB);
+ else
+@@ -1557,15 +1561,21 @@ int domain_context_mapping_one(
+ unmap_vtd_domain_page(context_entries);
+
+ if ( !seg && !rc )
+- rc = me_wifi_quirk(domain, bus, devfn, mode);
++ rc = me_wifi_quirk(domain, bus, devfn, domid, pgd_maddr, mode);
+
+ if ( rc )
+ {
+ if ( !prev_dom )
+- domain_context_unmap_one(domain, iommu, bus, devfn);
++ domain_context_unmap_one(domain, iommu, bus, devfn,
++ domain->domain_id);
+ else if ( prev_dom != domain ) /* Avoid infinite recursion. */
++ {
++ hd = dom_iommu(prev_dom);
+ domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev,
++ domain->domain_id,
++ hd->arch.pgd_maddr,
+ mode & MAP_WITH_RMRR);
++ }
+ }
+
+ if ( prev_dom )
+@@ -1582,6 +1592,7 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ {
+ struct acpi_drhd_unit *drhd;
+ const struct acpi_rmrr_unit *rmrr;
++ paddr_t pgd_maddr = dom_iommu(domain)->arch.pgd_maddr;
+ int ret = 0;
+ unsigned int i, mode = 0;
+ uint16_t seg = pdev->seg, bdf;
+@@ -1647,7 +1658,8 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ domain->domain_id, seg, bus,
+ PCI_SLOT(devfn), PCI_FUNC(devfn));
+ ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
+- pdev, mode);
++ pdev, domain->domain_id, pgd_maddr,
++ mode);
+ if ( ret > 0 )
+ ret = 0;
+ if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
+@@ -1662,7 +1674,8 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+ ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
+- pdev, mode);
++ pdev, domain->domain_id, pgd_maddr,
++ mode);
+ if ( ret < 0 )
+ break;
+ prev_present = ret;
+@@ -1690,7 +1703,8 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ */
+ if ( ret >= 0 )
+ ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
+- NULL, mode);
++ NULL, domain->domain_id, pgd_maddr,
++ mode);
+
+ /*
+ * Devices behind PCIe-to-PCI/PCIx bridge may generate different
+@@ -1705,7 +1719,8 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE &&
+ (secbus != pdev->bus || pdev->devfn != 0) )
+ ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0,
+- NULL, mode);
++ NULL, domain->domain_id, pgd_maddr,
++ mode);
+
+ if ( ret )
+ {
+@@ -1734,7 +1749,7 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ int domain_context_unmap_one(
+ struct domain *domain,
+ struct vtd_iommu *iommu,
+- u8 bus, u8 devfn)
++ uint8_t bus, uint8_t devfn, domid_t domid)
+ {
+ struct context_entry *context, *context_entries;
+ u64 maddr;
+@@ -1792,7 +1807,7 @@ int domain_context_unmap_one(
+ unmap_vtd_domain_page(context_entries);
+
+ if ( !iommu->drhd->segment && !rc )
+- rc = me_wifi_quirk(domain, bus, devfn, UNMAP_ME_PHANTOM_FUNC);
++ rc = me_wifi_quirk(domain, bus, devfn, domid, 0, UNMAP_ME_PHANTOM_FUNC);
+
+ if ( rc && !is_hardware_domain(domain) && domain != dom_io )
+ {
+@@ -1844,7 +1859,8 @@ static int domain_context_unmap(struct domain *domain, u8 devfn,
+ printk(VTDPREFIX "d%d:PCIe: unmap %04x:%02x:%02x.%u\n",
+ domain->domain_id, seg, bus,
+ PCI_SLOT(devfn), PCI_FUNC(devfn));
+- ret = domain_context_unmap_one(domain, iommu, bus, devfn);
++ ret = domain_context_unmap_one(domain, iommu, bus, devfn,
++ domain->domain_id);
+ if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
+ disable_ats_device(pdev);
+
+@@ -1854,7 +1870,8 @@ static int domain_context_unmap(struct domain *domain, u8 devfn,
+ if ( iommu_debug )
+ printk(VTDPREFIX "d%d:PCI: unmap %04x:%02x:%02x.%u\n",
+ domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+- ret = domain_context_unmap_one(domain, iommu, bus, devfn);
++ ret = domain_context_unmap_one(domain, iommu, bus, devfn,
++ domain->domain_id);
+ if ( ret )
+ break;
+
+@@ -1880,12 +1897,15 @@ static int domain_context_unmap(struct domain *domain, u8 devfn,
+ /* PCIe to PCI/PCIx bridge */
+ if ( pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
+ {
+- ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn);
++ ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
++ domain->domain_id);
+ if ( !ret )
+- ret = domain_context_unmap_one(domain, iommu, secbus, 0);
++ ret = domain_context_unmap_one(domain, iommu, secbus, 0,
++ domain->domain_id);
+ }
+ else /* Legacy PCI bridge */
+- ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn);
++ ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
++ domain->domain_id);
+
+ break;
+
+diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c
+index 99e159b4e9..4d54c21136 100644
+--- a/xen/drivers/passthrough/vtd/quirks.c
++++ b/xen/drivers/passthrough/vtd/quirks.c
+@@ -344,6 +344,8 @@ void __init platform_quirks_init(void)
+
+ static int __must_check map_me_phantom_function(struct domain *domain,
+ unsigned int dev,
++ domid_t domid,
++ paddr_t pgd_maddr,
+ unsigned int mode)
+ {
+ struct acpi_drhd_unit *drhd;
+@@ -357,16 +359,17 @@ static int __must_check map_me_phantom_function(struct domain *domain,
+ /* map or unmap ME phantom function */
+ if ( !(mode & UNMAP_ME_PHANTOM_FUNC) )
+ rc = domain_context_mapping_one(domain, drhd->iommu, 0,
+- PCI_DEVFN(dev, 7), NULL, mode);
++ PCI_DEVFN(dev, 7), NULL,
++ domid, pgd_maddr, mode);
+ else
+ rc = domain_context_unmap_one(domain, drhd->iommu, 0,
+- PCI_DEVFN(dev, 7));
++ PCI_DEVFN(dev, 7), domid);
+
+ return rc;
+ }
+
+ int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn,
+- unsigned int mode)
++ domid_t domid, paddr_t pgd_maddr, unsigned int mode)
+ {
+ u32 id;
+ int rc = 0;
+@@ -390,7 +393,7 @@ int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn,
+ case 0x423b8086:
+ case 0x423c8086:
+ case 0x423d8086:
+- rc = map_me_phantom_function(domain, 3, mode);
++ rc = map_me_phantom_function(domain, 3, domid, pgd_maddr, mode);
+ break;
+ default:
+ break;
+@@ -416,7 +419,7 @@ int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn,
+ case 0x42388086: /* Puma Peak */
+ case 0x422b8086:
+ case 0x422c8086:
+- rc = map_me_phantom_function(domain, 22, mode);
++ rc = map_me_phantom_function(domain, 22, domid, pgd_maddr, mode);
+ break;
+ default:
+ break;
+--
+2.35.2
+
+
+From ab37463eec5724036059d7df027ca13d66368211 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 5 Apr 2022 15:25:26 +0200
+Subject: [PATCH 25/32] VT-d: prepare for per-device quarantine page tables
+ (part II)
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Replace the passing of struct domain * by domid_t in preparation of
+per-device quarantine page tables also requiring per-device pseudo
+domain IDs, which aren't going to be associated with any struct domain
+instances.
+
+No functional change intended (except for slightly adjusted log message
+text).
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 7131163c4806e3c7de24873164d1a003d2a27dee
+master date: 2022-04-05 14:18:48 +0200
+---
+ xen/drivers/passthrough/vtd/iommu.c | 33 ++++++++++++++---------------
+ 1 file changed, 16 insertions(+), 17 deletions(-)
+
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index ac2c73e32a..6388d97d26 100644
+--- a/xen/drivers/passthrough/vtd/iommu.c
++++ b/xen/drivers/passthrough/vtd/iommu.c
+@@ -52,8 +52,8 @@ static struct tasklet vtd_fault_tasklet;
+ static int setup_hwdom_device(u8 devfn, struct pci_dev *);
+ static void setup_hwdom_rmrr(struct domain *d);
+
+-static int domain_iommu_domid(struct domain *d,
+- struct vtd_iommu *iommu)
++static int get_iommu_did(domid_t domid, const struct vtd_iommu *iommu,
++ bool warn)
+ {
+ unsigned long nr_dom, i;
+
+@@ -61,16 +61,16 @@ static int domain_iommu_domid(struct domain *d,
+ i = find_first_bit(iommu->domid_bitmap, nr_dom);
+ while ( i < nr_dom )
+ {
+- if ( iommu->domid_map[i] == d->domain_id )
++ if ( iommu->domid_map[i] == domid )
+ return i;
+
+ i = find_next_bit(iommu->domid_bitmap, nr_dom, i+1);
+ }
+
+- if ( !d->is_dying )
++ if ( warn )
+ dprintk(XENLOG_ERR VTDPREFIX,
+- "Cannot get valid iommu %u domid: %pd\n",
+- iommu->index, d);
++ "No valid iommu %u domid for Dom%d\n",
++ iommu->index, domid);
+
+ return -1;
+ }
+@@ -78,8 +78,7 @@ static int domain_iommu_domid(struct domain *d,
+ #define DID_FIELD_WIDTH 16
+ #define DID_HIGH_OFFSET 8
+ static int context_set_domain_id(struct context_entry *context,
+- struct domain *d,
+- struct vtd_iommu *iommu)
++ domid_t domid, struct vtd_iommu *iommu)
+ {
+ unsigned long nr_dom, i;
+ int found = 0;
+@@ -90,7 +89,7 @@ static int context_set_domain_id(struct context_entry *context,
+ i = find_first_bit(iommu->domid_bitmap, nr_dom);
+ while ( i < nr_dom )
+ {
+- if ( iommu->domid_map[i] == d->domain_id )
++ if ( iommu->domid_map[i] == domid )
+ {
+ found = 1;
+ break;
+@@ -106,7 +105,7 @@ static int context_set_domain_id(struct context_entry *context,
+ dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no free domain ids\n");
+ return -EFAULT;
+ }
+- iommu->domid_map[i] = d->domain_id;
++ iommu->domid_map[i] = domid;
+ }
+
+ set_bit(i, iommu->domid_bitmap);
+@@ -115,9 +114,9 @@ static int context_set_domain_id(struct context_entry *context,
+ return 0;
+ }
+
+-static void cleanup_domid_map(struct domain *domain, struct vtd_iommu *iommu)
++static void cleanup_domid_map(domid_t domid, struct vtd_iommu *iommu)
+ {
+- int iommu_domid = domain_iommu_domid(domain, iommu);
++ int iommu_domid = get_iommu_did(domid, iommu, false);
+
+ if ( iommu_domid >= 0 )
+ {
+@@ -173,7 +172,7 @@ static void check_cleanup_domid_map(struct domain *d,
+ if ( !found )
+ {
+ clear_bit(iommu->index, &dom_iommu(d)->arch.iommu_bitmap);
+- cleanup_domid_map(d, iommu);
++ cleanup_domid_map(d->domain_id, iommu);
+ }
+ }
+
+@@ -630,7 +629,7 @@ static int __must_check iommu_flush_iotlb(struct domain *d, dfn_t dfn,
+ continue;
+
+ flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
+- iommu_domid= domain_iommu_domid(d, iommu);
++ iommu_domid = get_iommu_did(d->domain_id, iommu, !d->is_dying);
+ if ( iommu_domid == -1 )
+ continue;
+
+@@ -1454,7 +1453,7 @@ int domain_context_mapping_one(
+ spin_unlock(&hd->arch.mapping_lock);
+ }
+
+- rc = context_set_domain_id(&lctxt, domain, iommu);
++ rc = context_set_domain_id(&lctxt, domid, iommu);
+ if ( rc )
+ {
+ unlock:
+@@ -1774,7 +1773,7 @@ int domain_context_unmap_one(
+ context_clear_entry(*context);
+ iommu_sync_cache(context, sizeof(struct context_entry));
+
+- iommu_domid= domain_iommu_domid(domain, iommu);
++ iommu_domid = get_iommu_did(domid, iommu, !domain->is_dying);
+ if ( iommu_domid == -1 )
+ {
+ spin_unlock(&iommu->lock);
+@@ -1948,7 +1947,7 @@ static void iommu_domain_teardown(struct domain *d)
+ spin_unlock(&hd->arch.mapping_lock);
+
+ for_each_drhd_unit ( drhd )
+- cleanup_domid_map(d, drhd->iommu);
++ cleanup_domid_map(d->domain_id, drhd->iommu);
+ }
+
+ static int __must_check intel_iommu_map_page(struct domain *d, dfn_t dfn,
+--
+2.35.2
+
+
+From 7cfe3570b1c0b4b19317145fbe4c776f09768fd5 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 5 Apr 2022 15:25:54 +0200
+Subject: [PATCH 26/32] IOMMU/x86: maintain a per-device pseudo domain ID
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+In order to subsequently enable per-device quarantine page tables, we'll
+need domain-ID-like identifiers to be inserted in the respective device
+(AMD) or context (Intel) table entries alongside the per-device page
+table root addresses.
+
+Make use of "real" domain IDs occupying only half of the value range
+coverable by domid_t.
+
+Note that in VT-d's iommu_alloc() I didn't want to introduce new memory
+leaks in case of error, but existing ones don't get plugged - that'll be
+the subject of a later change.
+
+The VT-d changes are slightly asymmetric, but this way we can avoid
+assigning pseudo domain IDs to devices which would never be mapped while
+still avoiding to add a new parameter to domain_context_unmap().
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 97af062b89d52c0ecf7af254b53345c97d438e33
+master date: 2022-04-05 14:19:10 +0200
+---
+ xen/drivers/passthrough/amd/iommu_detect.c | 8 +++
+ xen/drivers/passthrough/amd/pci_amd_iommu.c | 22 ++++++-
+ xen/drivers/passthrough/pci.c | 11 +++-
+ xen/drivers/passthrough/vtd/iommu.c | 69 +++++++++++++++++----
+ xen/drivers/passthrough/vtd/iommu.h | 1 +
+ xen/drivers/passthrough/x86/iommu.c | 47 ++++++++++++++
+ xen/include/asm-x86/amd-iommu.h | 1 +
+ xen/include/asm-x86/iommu.h | 4 ++
+ xen/include/asm-x86/pci.h | 6 ++
+ xen/include/public/xen.h | 3 +
+ 10 files changed, 156 insertions(+), 16 deletions(-)
+
+diff --git a/xen/drivers/passthrough/amd/iommu_detect.c b/xen/drivers/passthrough/amd/iommu_detect.c
+index d782e66eee..0df10f25b0 100644
+--- a/xen/drivers/passthrough/amd/iommu_detect.c
++++ b/xen/drivers/passthrough/amd/iommu_detect.c
+@@ -183,6 +183,11 @@ int __init amd_iommu_detect_one_acpi(
+ if ( rt )
+ goto out;
+
++ iommu->domid_map = iommu_init_domid();
++ rt = -ENOMEM;
++ if ( !iommu->domid_map )
++ goto out;
++
+ rt = pci_ro_device(iommu->seg, bus, PCI_DEVFN(dev, func));
+ if ( rt )
+ printk(XENLOG_ERR
+@@ -194,7 +199,10 @@ int __init amd_iommu_detect_one_acpi(
+
+ out:
+ if ( rt )
++ {
++ xfree(iommu->domid_map);
+ xfree(iommu);
++ }
+
+ return rt;
+ }
+diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c
+index 14483e85ae..b07091e71e 100644
+--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
++++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
+@@ -563,6 +563,8 @@ static int amd_iommu_add_device(u8 devfn, struct pci_dev *pdev)
+ struct amd_iommu *iommu;
+ u16 bdf;
+ struct ivrs_mappings *ivrs_mappings;
++ bool fresh_domid = false;
++ int ret;
+
+ if ( !pdev->domain )
+ return -EINVAL;
+@@ -626,7 +628,22 @@ static int amd_iommu_add_device(u8 devfn, struct pci_dev *pdev)
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ }
+
+- return amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev);
++ if ( iommu_quarantine && pdev->arch.pseudo_domid == DOMID_INVALID )
++ {
++ pdev->arch.pseudo_domid = iommu_alloc_domid(iommu->domid_map);
++ if ( pdev->arch.pseudo_domid == DOMID_INVALID )
++ return -ENOSPC;
++ fresh_domid = true;
++ }
++
++ ret = amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev);
++ if ( ret && fresh_domid )
++ {
++ iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map);
++ pdev->arch.pseudo_domid = DOMID_INVALID;
++ }
++
++ return ret;
+ }
+
+ static int amd_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
+@@ -651,6 +668,9 @@ static int amd_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
+
+ amd_iommu_disable_domain_device(pdev->domain, iommu, devfn, pdev);
+
++ iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map);
++ pdev->arch.pseudo_domid = DOMID_INVALID;
++
+ ivrs_mappings = get_ivrs_mappings(pdev->seg);
+ bdf = PCI_BDF2(pdev->bus, devfn);
+ if ( amd_iommu_perdev_intremap &&
+diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
+index 32510351cf..97e42261eb 100644
+--- a/xen/drivers/passthrough/pci.c
++++ b/xen/drivers/passthrough/pci.c
+@@ -338,6 +338,7 @@ static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
+ *((u8*) &pdev->bus) = bus;
+ *((u8*) &pdev->devfn) = devfn;
+ pdev->domain = NULL;
++ pdev->arch.pseudo_domid = DOMID_INVALID;
+ INIT_LIST_HEAD(&pdev->msi_list);
+
+ pos = pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+@@ -1353,9 +1354,13 @@ static int _dump_pci_devices(struct pci_seg *pseg, void *arg)
+
+ list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list )
+ {
+- printk("%04x:%02x:%02x.%u - %pd - node %-3d - MSIs < ",
+- pseg->nr, pdev->bus,
+- PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn), pdev->domain,
++ printk("%04x:%02x:%02x.%u - ", pseg->nr, pdev->bus,
++ PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
++ if ( pdev->domain == dom_io )
++ printk("DomIO:%x", pdev->arch.pseudo_domid);
++ else
++ printk("%pd", pdev->domain);
++ printk(" - node %-3d - MSIs < ",
+ (pdev->node != NUMA_NO_NODE) ? pdev->node : -1);
+ list_for_each_entry ( msi, &pdev->msi_list, list )
+ printk("%d ", msi->irq);
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index 6388d97d26..fc89f3e4c5 100644
+--- a/xen/drivers/passthrough/vtd/iommu.c
++++ b/xen/drivers/passthrough/vtd/iommu.c
+@@ -22,6 +22,7 @@
+ #include <xen/sched.h>
+ #include <xen/xmalloc.h>
+ #include <xen/domain_page.h>
++#include <xen/err.h>
+ #include <xen/iocap.h>
+ #include <xen/iommu.h>
+ #include <xen/numa.h>
+@@ -1192,7 +1193,7 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd)
+ {
+ struct vtd_iommu *iommu;
+ unsigned long sagaw, nr_dom;
+- int agaw;
++ int agaw, rc;
+
+ if ( nr_iommus >= MAX_IOMMUS )
+ {
+@@ -1285,7 +1286,16 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd)
+ if ( !iommu->domid_map )
+ return -ENOMEM;
+
++ iommu->pseudo_domid_map = iommu_init_domid();
++ rc = -ENOMEM;
++ if ( !iommu->pseudo_domid_map )
++ goto free;
++
+ return 0;
++
++ free:
++ iommu_free(drhd);
++ return rc;
+ }
+
+ void __init iommu_free(struct acpi_drhd_unit *drhd)
+@@ -1308,6 +1318,7 @@ void __init iommu_free(struct acpi_drhd_unit *drhd)
+
+ xfree(iommu->domid_bitmap);
+ xfree(iommu->domid_map);
++ xfree(iommu->pseudo_domid_map);
+
+ if ( iommu->msi.irq >= 0 )
+ destroy_irq(iommu->msi.irq);
+@@ -1583,8 +1594,8 @@ int domain_context_mapping_one(
+ return rc ?: pdev && prev_dom;
+ }
+
+-static int domain_context_unmap(struct domain *d, uint8_t devfn,
+- struct pci_dev *pdev);
++static const struct acpi_drhd_unit *domain_context_unmap(
++ struct domain *d, uint8_t devfn, struct pci_dev *pdev);
+
+ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ struct pci_dev *pdev)
+@@ -1592,6 +1603,7 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ struct acpi_drhd_unit *drhd;
+ const struct acpi_rmrr_unit *rmrr;
+ paddr_t pgd_maddr = dom_iommu(domain)->arch.pgd_maddr;
++ domid_t orig_domid = pdev->arch.pseudo_domid;
+ int ret = 0;
+ unsigned int i, mode = 0;
+ uint16_t seg = pdev->seg, bdf;
+@@ -1652,6 +1664,14 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ break;
+
+ case DEV_TYPE_PCIe_ENDPOINT:
++ if ( iommu_quarantine && orig_domid == DOMID_INVALID )
++ {
++ pdev->arch.pseudo_domid =
++ iommu_alloc_domid(drhd->iommu->pseudo_domid_map);
++ if ( pdev->arch.pseudo_domid == DOMID_INVALID )
++ return -ENOSPC;
++ }
++
+ if ( iommu_debug )
+ printk(VTDPREFIX "d%d:PCIe: map %04x:%02x:%02x.%u\n",
+ domain->domain_id, seg, bus,
+@@ -1667,6 +1687,14 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ break;
+
+ case DEV_TYPE_PCI:
++ if ( iommu_quarantine && orig_domid == DOMID_INVALID )
++ {
++ pdev->arch.pseudo_domid =
++ iommu_alloc_domid(drhd->iommu->pseudo_domid_map);
++ if ( pdev->arch.pseudo_domid == DOMID_INVALID )
++ return -ENOSPC;
++ }
++
+ if ( iommu_debug )
+ printk(VTDPREFIX "d%d:PCI: map %04x:%02x:%02x.%u\n",
+ domain->domain_id, seg, bus,
+@@ -1742,6 +1770,13 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ if ( !ret && devfn == pdev->devfn )
+ pci_vtd_quirk(pdev);
+
++ if ( ret && drhd && orig_domid == DOMID_INVALID )
++ {
++ iommu_free_domid(pdev->arch.pseudo_domid,
++ drhd->iommu->pseudo_domid_map);
++ pdev->arch.pseudo_domid = DOMID_INVALID;
++ }
++
+ return ret;
+ }
+
+@@ -1824,8 +1859,10 @@ int domain_context_unmap_one(
+ return rc;
+ }
+
+-static int domain_context_unmap(struct domain *domain, u8 devfn,
+- struct pci_dev *pdev)
++static const struct acpi_drhd_unit *domain_context_unmap(
++ struct domain *domain,
++ uint8_t devfn,
++ struct pci_dev *pdev)
+ {
+ struct acpi_drhd_unit *drhd;
+ struct vtd_iommu *iommu;
+@@ -1834,7 +1871,7 @@ static int domain_context_unmap(struct domain *domain, u8 devfn,
+
+ drhd = acpi_find_matched_drhd_unit(pdev);
+ if ( !drhd )
+- return -ENODEV;
++ return ERR_PTR(-ENODEV);
+ iommu = drhd->iommu;
+
+ switch ( pdev->type )
+@@ -1845,7 +1882,7 @@ static int domain_context_unmap(struct domain *domain, u8 devfn,
+ domain->domain_id, seg, bus,
+ PCI_SLOT(devfn), PCI_FUNC(devfn));
+ if ( !is_hardware_domain(domain) )
+- return -EPERM;
++ return ERR_PTR(-EPERM);
+ goto out;
+
+ case DEV_TYPE_PCIe_BRIDGE:
+@@ -1923,7 +1960,7 @@ static int domain_context_unmap(struct domain *domain, u8 devfn,
+ check_cleanup_domid_map(domain, pdev, iommu);
+
+ out:
+- return ret;
++ return ret ? ERR_PTR(ret) : drhd;
+ }
+
+ static void iommu_domain_teardown(struct domain *d)
+@@ -2145,16 +2182,17 @@ static int intel_iommu_enable_device(struct pci_dev *pdev)
+
+ static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
+ {
++ const struct acpi_drhd_unit *drhd;
+ struct acpi_rmrr_unit *rmrr;
+ u16 bdf;
+- int ret, i;
++ unsigned int i;
+
+ if ( !pdev->domain )
+ return -EINVAL;
+
+- ret = domain_context_unmap(pdev->domain, devfn, pdev);
+- if ( ret )
+- return ret;
++ drhd = domain_context_unmap(pdev->domain, devfn, pdev);
++ if ( IS_ERR(drhd) )
++ return PTR_ERR(drhd);
+
+ for_each_rmrr_device ( rmrr, bdf, i )
+ {
+@@ -2171,6 +2209,13 @@ static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
+ rmrr->end_address, 0);
+ }
+
++ if ( drhd )
++ {
++ iommu_free_domid(pdev->arch.pseudo_domid,
++ drhd->iommu->pseudo_domid_map);
++ pdev->arch.pseudo_domid = DOMID_INVALID;
++ }
++
+ return 0;
+ }
+
+diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h
+index 503b07ffb7..be44fc017f 100644
+--- a/xen/drivers/passthrough/vtd/iommu.h
++++ b/xen/drivers/passthrough/vtd/iommu.h
+@@ -535,6 +535,7 @@ struct vtd_iommu {
+ } flush;
+
+ struct list_head ats_devices;
++ unsigned long *pseudo_domid_map; /* "pseudo" domain id bitmap */
+ unsigned long *domid_bitmap; /* domain id bitmap */
+ u16 *domid_map; /* domain id mapping array */
+ uint32_t version;
+diff --git a/xen/drivers/passthrough/x86/iommu.c b/xen/drivers/passthrough/x86/iommu.c
+index 818d28f770..f900bff60b 100644
+--- a/xen/drivers/passthrough/x86/iommu.c
++++ b/xen/drivers/passthrough/x86/iommu.c
+@@ -346,6 +346,53 @@ void __hwdom_init arch_iommu_hwdom_init(struct domain *d)
+ return;
+ }
+
++unsigned long *__init iommu_init_domid(void)
++{
++ if ( !iommu_quarantine )
++ return ZERO_BLOCK_PTR;
++
++ BUILD_BUG_ON(DOMID_MASK * 2U >= UINT16_MAX);
++
++ return xzalloc_array(unsigned long,
++ BITS_TO_LONGS(UINT16_MAX - DOMID_MASK));
++}
++
++domid_t iommu_alloc_domid(unsigned long *map)
++{
++ /*
++ * This is used uniformly across all IOMMUs, such that on typical
++ * systems we wouldn't re-use the same ID very quickly (perhaps never).
++ */
++ static unsigned int start;
++ unsigned int idx = find_next_zero_bit(map, UINT16_MAX - DOMID_MASK, start);
++
++ ASSERT(pcidevs_locked());
++
++ if ( idx >= UINT16_MAX - DOMID_MASK )
++ idx = find_first_zero_bit(map, UINT16_MAX - DOMID_MASK);
++ if ( idx >= UINT16_MAX - DOMID_MASK )
++ return DOMID_INVALID;
++
++ __set_bit(idx, map);
++
++ start = idx + 1;
++
++ return idx | (DOMID_MASK + 1);
++}
++
++void iommu_free_domid(domid_t domid, unsigned long *map)
++{
++ ASSERT(pcidevs_locked());
++
++ if ( domid == DOMID_INVALID )
++ return;
++
++ ASSERT(domid > DOMID_MASK);
++
++ if ( !__test_and_clear_bit(domid & DOMID_MASK, map) )
++ BUG();
++}
++
+ /*
+ * Local variables:
+ * mode: C
+diff --git a/xen/include/asm-x86/amd-iommu.h b/xen/include/asm-x86/amd-iommu.h
+index 829e1b1755..452ce97c02 100644
+--- a/xen/include/asm-x86/amd-iommu.h
++++ b/xen/include/asm-x86/amd-iommu.h
+@@ -94,6 +94,7 @@ struct amd_iommu {
+ struct ring_buffer cmd_buffer;
+ struct ring_buffer event_log;
+ struct ring_buffer ppr_log;
++ unsigned long *domid_map;
+
+ int exclusion_enable;
+ int exclusion_allow_all;
+diff --git a/xen/include/asm-x86/iommu.h b/xen/include/asm-x86/iommu.h
+index aaf9455b8e..389417d198 100644
+--- a/xen/include/asm-x86/iommu.h
++++ b/xen/include/asm-x86/iommu.h
+@@ -130,6 +130,10 @@ int pi_update_irte(const struct pi_desc *pi_desc, const struct pirq *pirq,
+ iommu_vcall(ops, sync_cache, addr, size); \
+ })
+
++unsigned long *iommu_init_domid(void);
++domid_t iommu_alloc_domid(unsigned long *map);
++void iommu_free_domid(domid_t domid, unsigned long *map);
++
+ #endif /* !__ARCH_X86_IOMMU_H__ */
+ /*
+ * Local variables:
+diff --git a/xen/include/asm-x86/pci.h b/xen/include/asm-x86/pci.h
+index cc05045e9c..70ed48e309 100644
+--- a/xen/include/asm-x86/pci.h
++++ b/xen/include/asm-x86/pci.h
+@@ -15,6 +15,12 @@
+
+ struct arch_pci_dev {
+ vmask_t used_vectors;
++ /*
++ * These fields are (de)initialized under pcidevs-lock. Other uses of
++ * them don't race (de)initialization and hence don't strictly need any
++ * locking.
++ */
++ domid_t pseudo_domid;
+ };
+
+ int pci_conf_write_intercept(unsigned int seg, unsigned int bdf,
+diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h
+index d2198dffad..75b1619d0d 100644
+--- a/xen/include/public/xen.h
++++ b/xen/include/public/xen.h
+@@ -614,6 +614,9 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
+ /* Idle domain. */
+ #define DOMID_IDLE xen_mk_uint(0x7FFF)
+
++/* Mask for valid domain id values */
++#define DOMID_MASK xen_mk_uint(0x7FFF)
++
+ #ifndef __ASSEMBLY__
+
+ typedef uint16_t domid_t;
+--
+2.35.2
+
+
+From e6d6b5ba030a8d2d81bf902e4bc2a8530b3576ae Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 5 Apr 2022 15:26:41 +0200
+Subject: [PATCH 27/32] IOMMU/x86: drop TLB flushes from quarantine_init()
+ hooks
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The page tables just created aren't hooked up yet anywhere, so there's
+nothing that could be present in any TLB, and hence nothing to flush.
+Dropping this flush is, at least on the VT-d side, a prereq to per-
+device domain ID use when quarantining devices, as dom_io isn't going
+to be assigned a DID anymore: The warning in get_iommu_did() would
+trigger.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+master commit: 54c5cef49239e2f27ec3b3fc8804bf57aa4bf46d
+master date: 2022-04-05 14:19:42 +0200
+---
+ xen/drivers/passthrough/amd/iommu_map.c | 2 --
+ xen/drivers/passthrough/vtd/iommu.c | 5 +----
+ 2 files changed, 1 insertion(+), 6 deletions(-)
+
+diff --git a/xen/drivers/passthrough/amd/iommu_map.c b/xen/drivers/passthrough/amd/iommu_map.c
+index 45559f9678..3c7cd7ed9e 100644
+--- a/xen/drivers/passthrough/amd/iommu_map.c
++++ b/xen/drivers/passthrough/amd/iommu_map.c
+@@ -595,8 +595,6 @@ int __init amd_iommu_quarantine_init(struct domain *d)
+ out:
+ spin_unlock(&hd->arch.mapping_lock);
+
+- amd_iommu_flush_all_pages(d);
+-
+ /* Pages leaked in failure case */
+ return level ? -ENOMEM : 0;
+ }
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index fc89f3e4c5..e5c50429d2 100644
+--- a/xen/drivers/passthrough/vtd/iommu.c
++++ b/xen/drivers/passthrough/vtd/iommu.c
+@@ -2894,7 +2894,6 @@ static int __init intel_iommu_quarantine_init(struct domain *d)
+ struct dma_pte *parent;
+ unsigned int agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
+ unsigned int level = agaw_to_level(agaw);
+- int rc;
+
+ if ( hd->arch.pgd_maddr )
+ {
+@@ -2941,10 +2940,8 @@ static int __init intel_iommu_quarantine_init(struct domain *d)
+ out:
+ spin_unlock(&hd->arch.mapping_lock);
+
+- rc = iommu_flush_iotlb_all(d);
+-
+ /* Pages leaked in failure case */
+- return level ? -ENOMEM : rc;
++ return level ? -ENOMEM : 0;
+ }
+
+ const struct iommu_ops __initconstrel intel_iommu_ops = {
+--
+2.35.2
+
+
+From 454d5351a93d2438778630843cf3e77da0772167 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 5 Apr 2022 15:27:09 +0200
+Subject: [PATCH 28/32] AMD/IOMMU: abstract maximum number of page table levels
+
+We will want to use the constant elsewhere.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+master commit: a038b514c1e970a8dc32229cbd31f6769ee61ad5
+master date: 2022-04-05 14:20:04 +0200
+---
+ xen/drivers/passthrough/amd/iommu_map.c | 2 +-
+ xen/include/asm-x86/hvm/svm/amd-iommu-defs.h | 1 +
+ xen/include/asm-x86/hvm/svm/amd-iommu-proto.h | 2 +-
+ 3 files changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/xen/drivers/passthrough/amd/iommu_map.c b/xen/drivers/passthrough/amd/iommu_map.c
+index 3c7cd7ed9e..db396dd1d4 100644
+--- a/xen/drivers/passthrough/amd/iommu_map.c
++++ b/xen/drivers/passthrough/amd/iommu_map.c
+@@ -260,7 +260,7 @@ static int iommu_pde_from_dfn(struct domain *d, unsigned long dfn,
+ table = hd->arch.root_table;
+ level = hd->arch.paging_mode;
+
+- BUG_ON( table == NULL || level < 1 || level > 6 );
++ BUG_ON( table == NULL || level < 1 || level > IOMMU_MAX_PT_LEVELS );
+
+ /*
+ * A frame number past what the current page tables can represent can't
+diff --git a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
+index a54d6e9fc6..c46247cb24 100644
+--- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
++++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
+@@ -110,6 +110,7 @@ struct amd_iommu_dte {
+ bool tv:1;
+ unsigned int :5;
+ unsigned int had:2;
++#define IOMMU_MAX_PT_LEVELS 6
+ unsigned int paging_mode:3;
+ uint64_t pt_root:40;
+ bool ppr:1;
+diff --git a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+index 52c889ade0..2a3bc47ab5 100644
+--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
++++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+@@ -193,7 +193,7 @@ static inline int amd_iommu_get_paging_mode(unsigned long max_frames)
+ while ( max_frames > PTE_PER_TABLE_SIZE )
+ {
+ max_frames = PTE_PER_TABLE_ALIGN(max_frames) >> PTE_PER_TABLE_SHIFT;
+- if ( ++level > 6 )
++ if ( ++level > IOMMU_MAX_PT_LEVELS )
+ return -ENOMEM;
+ }
+
+--
+2.35.2
+
+
+From 169a2834ef5d723091f187a5d6493ae77825757a Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 5 Apr 2022 15:27:36 +0200
+Subject: [PATCH 29/32] IOMMU/x86: use per-device page tables for quarantining
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Devices with RMRRs / unity mapped regions, due to it being unspecified
+how/when these memory regions may be accessed, may not be left
+disconnected from the mappings of these regions (as long as it's not
+certain that the device has been fully quiesced). Hence even the page
+tables used when quarantining such devices need to have mappings of
+those regions. This implies installing page tables in the first place
+even when not in scratch-page quarantining mode.
+
+This is CVE-2022-26361 / part of XSA-400.
+
+While for the purpose here it would be sufficient to have devices with
+RMRRs / unity mapped regions use per-device page tables, extend this to
+all devices (in scratch-page quarantining mode). This allows the leaf
+pages to be mapped r/w, thus covering also memory writes (rather than
+just reads) issued by non-quiescent devices.
+
+Set up quarantine page tables as late as possible, yet early enough to
+not encounter failure during de-assign. This means setup generally
+happens in assign_device(), while (for now) the one in deassign_device()
+is there mainly to be on the safe side.
+
+In VT-d's DID allocation function don't require the IOMMU lock to be
+held anymore: All involved code paths hold pcidevs_lock, so this way we
+avoid the need to acquire the IOMMU lock around the new call to
+context_set_domain_id().
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 14dd241aad8af447680ac73e8579990e2c09c1e7
+master date: 2022-04-05 14:24:18 +0200
+---
+ xen/arch/x86/mm/p2m.c | 2 +-
+ xen/drivers/passthrough/amd/iommu_map.c | 155 ++++++++---
+ xen/drivers/passthrough/amd/pci_amd_iommu.c | 35 ++-
+ xen/drivers/passthrough/iommu.c | 18 +-
+ xen/drivers/passthrough/pci.c | 20 +-
+ xen/drivers/passthrough/vtd/iommu.c | 247 +++++++++++++-----
+ xen/drivers/passthrough/vtd/iommu.h | 2 +-
+ xen/include/asm-x86/hvm/svm/amd-iommu-proto.h | 3 +-
+ xen/include/asm-x86/pci.h | 13 +
+ xen/include/xen/iommu.h | 3 +-
+ 10 files changed, 363 insertions(+), 135 deletions(-)
+
+diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
+index a6bfda010a..91f7b7760c 100644
+--- a/xen/arch/x86/mm/p2m.c
++++ b/xen/arch/x86/mm/p2m.c
+@@ -1453,7 +1453,7 @@ int set_identity_p2m_entry(struct domain *d, unsigned long gfn_l,
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+ int ret;
+
+- if ( !paging_mode_translate(p2m->domain) )
++ if ( !paging_mode_translate(d) )
+ {
+ if ( !is_iommu_enabled(d) )
+ return 0;
+diff --git a/xen/drivers/passthrough/amd/iommu_map.c b/xen/drivers/passthrough/amd/iommu_map.c
+index db396dd1d4..8b7d5b7c7b 100644
+--- a/xen/drivers/passthrough/amd/iommu_map.c
++++ b/xen/drivers/passthrough/amd/iommu_map.c
+@@ -539,64 +539,137 @@ int amd_iommu_reserve_domain_unity_unmap(struct domain *d,
+ return rc;
+ }
+
+-int __init amd_iommu_quarantine_init(struct domain *d)
++static int fill_qpt(union amd_iommu_pte *this, unsigned int level,
++ struct page_info *pgs[IOMMU_MAX_PT_LEVELS],
++ struct pci_dev *pdev)
+ {
+- struct domain_iommu *hd = dom_iommu(d);
++ unsigned int i;
++ int rc = 0;
++
++ for ( i = 0; !rc && i < PTE_PER_TABLE_SIZE; ++i )
++ {
++ union amd_iommu_pte *pte = &this[i], *next;
++
++ if ( !pte->pr )
++ {
++ if ( !pgs[level] )
++ {
++ /*
++ * The pgtable allocator is fine for the leaf page, as well as
++ * page table pages, and the resulting allocations are always
++ * zeroed.
++ */
++ pgs[level] = alloc_amd_iommu_pgtable();
++ if ( !pgs[level] )
++ {
++ rc = -ENOMEM;
++ break;
++ }
++
++ page_list_add(pgs[level], &pdev->arch.pgtables_list);
++
++ if ( level )
++ {
++ next = __map_domain_page(pgs[level]);
++ rc = fill_qpt(next, level - 1, pgs, pdev);
++ unmap_domain_page(next);
++ }
++ }
++
++ /*
++ * PDEs are essentially a subset of PTEs, so this function
++ * is fine to use even at the leaf.
++ */
++ set_iommu_pde_present(pte, mfn_x(page_to_mfn(pgs[level])), level,
++ true, true);
++ }
++ else if ( level && pte->next_level )
++ {
++ page_list_add(mfn_to_page(_mfn(pte->mfn)),
++ &pdev->arch.pgtables_list);
++ next = map_domain_page(_mfn(pte->mfn));
++ rc = fill_qpt(next, level - 1, pgs, pdev);
++ unmap_domain_page(next);
++ }
++ }
++
++ return rc;
++}
++
++int amd_iommu_quarantine_init(struct pci_dev *pdev)
++{
++ struct domain_iommu *hd = dom_iommu(dom_io);
+ unsigned long end_gfn =
+ 1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT);
+ unsigned int level = amd_iommu_get_paging_mode(end_gfn);
+- union amd_iommu_pte *table;
++ unsigned int req_id = get_dma_requestor_id(pdev->seg, pdev->sbdf.bdf);
++ const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg);
++ int rc;
++
++ ASSERT(pcidevs_locked());
++ ASSERT(!hd->arch.root_table);
+
+- if ( hd->arch.root_table )
++ ASSERT(pdev->arch.pseudo_domid != DOMID_INVALID);
++
++ if ( pdev->arch.amd.root_table )
+ {
+- ASSERT_UNREACHABLE();
++ clear_domain_page(pdev->arch.leaf_mfn);
+ return 0;
+ }
+
+- spin_lock(&hd->arch.mapping_lock);
++ pdev->arch.amd.root_table = alloc_amd_iommu_pgtable();
++ if ( !pdev->arch.amd.root_table )
++ return -ENOMEM;
+
+- hd->arch.root_table = alloc_amd_iommu_pgtable();
+- if ( !hd->arch.root_table )
+- goto out;
++ /* Transiently install the root into DomIO, for iommu_identity_mapping(). */
++ hd->arch.root_table = pdev->arch.amd.root_table;
++
++ rc = amd_iommu_reserve_domain_unity_map(dom_io,
++ ivrs_mappings[req_id].unity_map,
++ 0);
+
+- table = __map_domain_page(hd->arch.root_table);
+- while ( level )
++ iommu_identity_map_teardown(dom_io);
++ hd->arch.root_table = NULL;
++
++ if ( rc )
++ printk("%04x:%02x:%02x.%u: quarantine unity mapping failed\n",
++ pdev->seg, pdev->bus,
++ PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
++ else
+ {
+- struct page_info *pg;
+- unsigned int i;
+-
+- /*
+- * The pgtable allocator is fine for the leaf page, as well as
+- * page table pages, and the resulting allocations are always
+- * zeroed.
+- */
+- pg = alloc_amd_iommu_pgtable();
+- if ( !pg )
+- break;
+-
+- for ( i = 0; i < PTE_PER_TABLE_SIZE; i++ )
+- {
+- union amd_iommu_pte *pde = &table[i];
++ union amd_iommu_pte *root;
++ struct page_info *pgs[IOMMU_MAX_PT_LEVELS] = {};
+
+- /*
+- * PDEs are essentially a subset of PTEs, so this function
+- * is fine to use even at the leaf.
+- */
+- set_iommu_pde_present(pde, mfn_x(page_to_mfn(pg)), level - 1,
+- false, true);
+- }
++ spin_lock(&hd->arch.mapping_lock);
+
+- unmap_domain_page(table);
+- table = __map_domain_page(pg);
+- level--;
++ root = __map_domain_page(pdev->arch.amd.root_table);
++ rc = fill_qpt(root, level - 1, pgs, pdev);
++ unmap_domain_page(root);
++
++ pdev->arch.leaf_mfn = page_to_mfn(pgs[0]);
++
++ spin_unlock(&hd->arch.mapping_lock);
+ }
+- unmap_domain_page(table);
+
+- out:
+- spin_unlock(&hd->arch.mapping_lock);
++ if ( rc )
++ amd_iommu_quarantine_teardown(pdev);
++
++ return rc;
++}
++
++void amd_iommu_quarantine_teardown(struct pci_dev *pdev)
++{
++ struct page_info *pg;
++
++ ASSERT(pcidevs_locked());
++
++ if ( !pdev->arch.amd.root_table )
++ return;
++
++ while ( (pg = page_list_remove_head(&pdev->arch.pgtables_list)) )
++ free_amd_iommu_pgtable(pg);
+
+- /* Pages leaked in failure case */
+- return level ? -ENOMEM : 0;
++ pdev->arch.amd.root_table = NULL;
+ }
+
+ /*
+diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c
+index b07091e71e..e5c02ca710 100644
+--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
++++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
+@@ -125,6 +125,8 @@ static int __must_check amd_iommu_setup_domain_device(
+ u8 bus = pdev->bus;
+ struct domain_iommu *hd = dom_iommu(domain);
+ const struct ivrs_mappings *ivrs_dev;
++ const struct page_info *root_pg;
++ domid_t domid;
+
+ BUG_ON(!hd->arch.paging_mode || !iommu->dev_table.buffer);
+
+@@ -144,14 +146,25 @@ static int __must_check amd_iommu_setup_domain_device(
+ dte = &table[req_id];
+ ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id];
+
++ if ( domain != dom_io )
++ {
++ root_pg = hd->arch.root_table;
++ domid = domain->domain_id;
++ }
++ else
++ {
++ root_pg = pdev->arch.amd.root_table;
++ domid = pdev->arch.pseudo_domid;
++ }
++
+ spin_lock_irqsave(&iommu->lock, flags);
+
+ if ( !dte->v || !dte->tv )
+ {
+ /* bind DTE to domain page-tables */
+ rc = amd_iommu_set_root_page_table(
+- dte, page_to_maddr(hd->arch.root_table),
+- domain->domain_id, hd->arch.paging_mode, sr_flags);
++ dte, page_to_maddr(root_pg), domid,
++ hd->arch.paging_mode, sr_flags);
+ if ( rc )
+ {
+ ASSERT(rc < 0);
+@@ -175,7 +188,7 @@ static int __must_check amd_iommu_setup_domain_device(
+
+ amd_iommu_flush_device(iommu, req_id);
+ }
+- else if ( dte->pt_root != mfn_x(page_to_mfn(hd->arch.root_table)) )
++ else if ( dte->pt_root != mfn_x(page_to_mfn(root_pg)) )
+ {
+ /*
+ * Strictly speaking if the device is the only one with this requestor
+@@ -188,8 +201,8 @@ static int __must_check amd_iommu_setup_domain_device(
+ rc = -EOPNOTSUPP;
+ else
+ rc = amd_iommu_set_root_page_table(
+- dte, page_to_maddr(hd->arch.root_table),
+- domain->domain_id, hd->arch.paging_mode, sr_flags);
++ dte, page_to_maddr(root_pg), domid,
++ hd->arch.paging_mode, sr_flags);
+ if ( rc < 0 )
+ {
+ spin_unlock_irqrestore(&iommu->lock, flags);
+@@ -208,6 +221,7 @@ static int __must_check amd_iommu_setup_domain_device(
+ * intended anyway.
+ */
+ !pdev->domain->is_dying &&
++ pdev->domain != dom_io &&
+ (any_pdev_behind_iommu(pdev->domain, pdev, iommu) ||
+ pdev->phantom_stride) )
+ printk(" %04x:%02x:%02x.%u: reassignment may cause %pd data corruption\n",
+@@ -238,9 +252,8 @@ static int __must_check amd_iommu_setup_domain_device(
+ AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, "
+ "root table = %#"PRIx64", "
+ "domain = %d, paging mode = %d\n",
+- req_id, pdev->type,
+- page_to_maddr(hd->arch.root_table),
+- domain->domain_id, hd->arch.paging_mode);
++ req_id, pdev->type, page_to_maddr(root_pg),
++ domid, hd->arch.paging_mode);
+
+ ASSERT(pcidevs_locked());
+
+@@ -313,7 +326,7 @@ static int iov_enable_xt(void)
+
+ int amd_iommu_alloc_root(struct domain_iommu *hd)
+ {
+- if ( unlikely(!hd->arch.root_table) )
++ if ( unlikely(!hd->arch.root_table) && hd != dom_iommu(dom_io) )
+ {
+ hd->arch.root_table = alloc_amd_iommu_pgtable();
+ if ( !hd->arch.root_table )
+@@ -404,7 +417,7 @@ static void amd_iommu_disable_domain_device(const struct domain *domain,
+
+ AMD_IOMMU_DEBUG("Disable: device id = %#x, "
+ "domain = %d, paging mode = %d\n",
+- req_id, domain->domain_id,
++ req_id, dte->domain_id,
+ dom_iommu(domain)->arch.paging_mode);
+ }
+ spin_unlock_irqrestore(&iommu->lock, flags);
+@@ -668,6 +681,8 @@ static int amd_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
+
+ amd_iommu_disable_domain_device(pdev->domain, iommu, devfn, pdev);
+
++ amd_iommu_quarantine_teardown(pdev);
++
+ iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map);
+ pdev->arch.pseudo_domid = DOMID_INVALID;
+
+diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c
+index 93d4377978..9aef696d90 100644
+--- a/xen/drivers/passthrough/iommu.c
++++ b/xen/drivers/passthrough/iommu.c
+@@ -450,21 +450,21 @@ int iommu_iotlb_flush_all(struct domain *d, unsigned int flush_flags)
+ return rc;
+ }
+
+-static int __init iommu_quarantine_init(void)
++int iommu_quarantine_dev_init(device_t *dev)
+ {
+ const struct domain_iommu *hd = dom_iommu(dom_io);
+- int rc;
+
+- dom_io->options |= XEN_DOMCTL_CDF_iommu;
++ if ( !iommu_quarantine || !hd->platform_ops->quarantine_init )
++ return 0;
+
+- rc = iommu_domain_init(dom_io, 0);
+- if ( rc )
+- return rc;
++ return iommu_call(hd->platform_ops, quarantine_init, dev);
++}
+
+- if ( !hd->platform_ops->quarantine_init )
+- return 0;
++static int __init iommu_quarantine_init(void)
++{
++ dom_io->options |= XEN_DOMCTL_CDF_iommu;
+
+- return hd->platform_ops->quarantine_init(dom_io);
++ return iommu_domain_init(dom_io, 0);
+ }
+
+ int __init iommu_setup(void)
+diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
+index 97e42261eb..1a1a387458 100644
+--- a/xen/drivers/passthrough/pci.c
++++ b/xen/drivers/passthrough/pci.c
+@@ -929,9 +929,16 @@ static int deassign_device(struct domain *d, uint16_t seg, uint8_t bus,
+ return -ENODEV;
+
+ /* De-assignment from dom_io should de-quarantine the device */
+- target = ((pdev->quarantine || iommu_quarantine) &&
+- pdev->domain != dom_io) ?
+- dom_io : hardware_domain;
++ if ( (pdev->quarantine || iommu_quarantine) && pdev->domain != dom_io )
++ {
++ ret = iommu_quarantine_dev_init(pci_to_dev(pdev));
++ if ( ret )
++ return ret;
++
++ target = dom_io;
++ }
++ else
++ target = hardware_domain;
+
+ while ( pdev->phantom_stride )
+ {
+@@ -1547,6 +1554,13 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
+ msixtbl_init(d);
+ }
+
++ if ( pdev->domain != dom_io )
++ {
++ rc = iommu_quarantine_dev_init(pci_to_dev(pdev));
++ if ( rc )
++ goto done;
++ }
++
+ pdev->fault.count = 0;
+
+ if ( (rc = hd->platform_ops->assign_device(d, devfn, pci_to_dev(pdev), flag)) )
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index e5c50429d2..6571b5dde4 100644
+--- a/xen/drivers/passthrough/vtd/iommu.c
++++ b/xen/drivers/passthrough/vtd/iommu.c
+@@ -43,6 +43,12 @@
+ #include "vtd.h"
+ #include "../ats.h"
+
++#define DEVICE_DOMID(d, pdev) ((d) != dom_io ? (d)->domain_id \
++ : (pdev)->arch.pseudo_domid)
++#define DEVICE_PGTABLE(d, pdev) ((d) != dom_io \
++ ? dom_iommu(d)->arch.pgd_maddr \
++ : (pdev)->arch.vtd.pgd_maddr)
++
+ /* Possible unfiltered LAPIC/MSI messages from untrusted sources? */
+ bool __read_mostly untrusted_msi;
+
+@@ -78,13 +84,18 @@ static int get_iommu_did(domid_t domid, const struct vtd_iommu *iommu,
+
+ #define DID_FIELD_WIDTH 16
+ #define DID_HIGH_OFFSET 8
++
++/*
++ * This function may have "context" passed as NULL, to merely obtain a DID
++ * for "domid".
++ */
+ static int context_set_domain_id(struct context_entry *context,
+ domid_t domid, struct vtd_iommu *iommu)
+ {
+ unsigned long nr_dom, i;
+ int found = 0;
+
+- ASSERT(spin_is_locked(&iommu->lock));
++ ASSERT(pcidevs_locked());
+
+ nr_dom = cap_ndoms(iommu->cap);
+ i = find_first_bit(iommu->domid_bitmap, nr_dom);
+@@ -110,8 +121,13 @@ static int context_set_domain_id(struct context_entry *context,
+ }
+
+ set_bit(i, iommu->domid_bitmap);
+- context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET);
+- context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
++
++ if ( context )
++ {
++ context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET);
++ context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
++ }
++
+ return 0;
+ }
+
+@@ -161,8 +177,12 @@ static void check_cleanup_domid_map(struct domain *d,
+ const struct pci_dev *exclude,
+ struct vtd_iommu *iommu)
+ {
+- bool found = any_pdev_behind_iommu(d, exclude, iommu);
++ bool found;
++
++ if ( d == dom_io )
++ return;
+
++ found = any_pdev_behind_iommu(d, exclude, iommu);
+ /*
+ * Hidden devices are associated with DomXEN but usable by the hardware
+ * domain. Hence they need considering here as well.
+@@ -1400,7 +1420,7 @@ int domain_context_mapping_one(
+ domid = iommu->domid_map[prev_did];
+ if ( domid < DOMID_FIRST_RESERVED )
+ prev_dom = rcu_lock_domain_by_id(domid);
+- else if ( domid == DOMID_IO )
++ else if ( pdev ? domid == pdev->arch.pseudo_domid : domid > DOMID_MASK )
+ prev_dom = rcu_lock_domain(dom_io);
+ if ( !prev_dom )
+ {
+@@ -1577,15 +1597,12 @@ int domain_context_mapping_one(
+ {
+ if ( !prev_dom )
+ domain_context_unmap_one(domain, iommu, bus, devfn,
+- domain->domain_id);
++ DEVICE_DOMID(domain, pdev));
+ else if ( prev_dom != domain ) /* Avoid infinite recursion. */
+- {
+- hd = dom_iommu(prev_dom);
+ domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev,
+- domain->domain_id,
+- hd->arch.pgd_maddr,
++ DEVICE_DOMID(prev_dom, pdev),
++ DEVICE_PGTABLE(prev_dom, pdev),
+ mode & MAP_WITH_RMRR);
+- }
+ }
+
+ if ( prev_dom )
+@@ -1602,7 +1619,7 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ {
+ struct acpi_drhd_unit *drhd;
+ const struct acpi_rmrr_unit *rmrr;
+- paddr_t pgd_maddr = dom_iommu(domain)->arch.pgd_maddr;
++ paddr_t pgd_maddr = DEVICE_PGTABLE(domain, pdev);
+ domid_t orig_domid = pdev->arch.pseudo_domid;
+ int ret = 0;
+ unsigned int i, mode = 0;
+@@ -1635,7 +1652,7 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ break;
+ }
+
+- if ( domain != pdev->domain )
++ if ( domain != pdev->domain && pdev->domain != dom_io )
+ {
+ if ( pdev->domain->is_dying )
+ mode |= MAP_OWNER_DYING;
+@@ -1676,8 +1693,8 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ printk(VTDPREFIX "d%d:PCIe: map %04x:%02x:%02x.%u\n",
+ domain->domain_id, seg, bus,
+ PCI_SLOT(devfn), PCI_FUNC(devfn));
+- ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
+- pdev, domain->domain_id, pgd_maddr,
++ ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, pdev,
++ DEVICE_DOMID(domain, pdev), pgd_maddr,
+ mode);
+ if ( ret > 0 )
+ ret = 0;
+@@ -1701,8 +1718,8 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+ ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
+- pdev, domain->domain_id, pgd_maddr,
+- mode);
++ pdev, DEVICE_DOMID(domain, pdev),
++ pgd_maddr, mode);
+ if ( ret < 0 )
+ break;
+ prev_present = ret;
+@@ -1730,8 +1747,8 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ */
+ if ( ret >= 0 )
+ ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
+- NULL, domain->domain_id, pgd_maddr,
+- mode);
++ NULL, DEVICE_DOMID(domain, pdev),
++ pgd_maddr, mode);
+
+ /*
+ * Devices behind PCIe-to-PCI/PCIx bridge may generate different
+@@ -1746,8 +1763,8 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE &&
+ (secbus != pdev->bus || pdev->devfn != 0) )
+ ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0,
+- NULL, domain->domain_id, pgd_maddr,
+- mode);
++ NULL, DEVICE_DOMID(domain, pdev),
++ pgd_maddr, mode);
+
+ if ( ret )
+ {
+@@ -1896,7 +1913,7 @@ static const struct acpi_drhd_unit *domain_context_unmap(
+ domain->domain_id, seg, bus,
+ PCI_SLOT(devfn), PCI_FUNC(devfn));
+ ret = domain_context_unmap_one(domain, iommu, bus, devfn,
+- domain->domain_id);
++ DEVICE_DOMID(domain, pdev));
+ if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
+ disable_ats_device(pdev);
+
+@@ -1907,7 +1924,7 @@ static const struct acpi_drhd_unit *domain_context_unmap(
+ printk(VTDPREFIX "d%d:PCI: unmap %04x:%02x:%02x.%u\n",
+ domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+ ret = domain_context_unmap_one(domain, iommu, bus, devfn,
+- domain->domain_id);
++ DEVICE_DOMID(domain, pdev));
+ if ( ret )
+ break;
+
+@@ -1930,18 +1947,12 @@ static const struct acpi_drhd_unit *domain_context_unmap(
+ break;
+ }
+
++ ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
++ DEVICE_DOMID(domain, pdev));
+ /* PCIe to PCI/PCIx bridge */
+- if ( pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
+- {
+- ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
+- domain->domain_id);
+- if ( !ret )
+- ret = domain_context_unmap_one(domain, iommu, secbus, 0,
+- domain->domain_id);
+- }
+- else /* Legacy PCI bridge */
+- ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
+- domain->domain_id);
++ if ( !ret && pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
++ ret = domain_context_unmap_one(domain, iommu, secbus, 0,
++ DEVICE_DOMID(domain, pdev));
+
+ break;
+
+@@ -1987,6 +1998,25 @@ static void iommu_domain_teardown(struct domain *d)
+ cleanup_domid_map(d->domain_id, drhd->iommu);
+ }
+
++static void quarantine_teardown(struct pci_dev *pdev,
++ const struct acpi_drhd_unit *drhd)
++{
++ struct page_info *pg;
++
++ ASSERT(pcidevs_locked());
++
++ if ( !pdev->arch.vtd.pgd_maddr )
++ return;
++
++ while ( (pg = page_list_remove_head(&pdev->arch.pgtables_list)) )
++ free_domheap_page(pg);
++
++ pdev->arch.vtd.pgd_maddr = 0;
++
++ if ( drhd )
++ cleanup_domid_map(pdev->arch.pseudo_domid, drhd->iommu);
++}
++
+ static int __must_check intel_iommu_map_page(struct domain *d, dfn_t dfn,
+ mfn_t mfn, unsigned int flags,
+ unsigned int *flush_flags)
+@@ -2209,6 +2239,8 @@ static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
+ rmrr->end_address, 0);
+ }
+
++ quarantine_teardown(pdev, drhd);
++
+ if ( drhd )
+ {
+ iommu_free_domid(pdev->arch.pseudo_domid,
+@@ -2888,60 +2920,139 @@ static void vtd_dump_p2m_table(struct domain *d)
+ vtd_dump_p2m_table_level(hd->arch.pgd_maddr, agaw_to_level(hd->arch.agaw), 0, 0);
+ }
+
+-static int __init intel_iommu_quarantine_init(struct domain *d)
++static int fill_qpt(struct dma_pte *this, unsigned int level,
++ paddr_t maddrs[6], struct pci_dev *pdev)
+ {
+- struct domain_iommu *hd = dom_iommu(d);
+- struct dma_pte *parent;
++ struct domain_iommu *hd = dom_iommu(dom_io);
++ unsigned int i;
++ int rc = 0;
++
++ for ( i = 0; !rc && i < PTE_NUM; ++i )
++ {
++ struct dma_pte *pte = &this[i], *next;
++
++ if ( !dma_pte_present(*pte) )
++ {
++ if ( !maddrs[level] )
++ {
++ /*
++ * The pgtable allocator is fine for the leaf page, as well as
++ * page table pages, and the resulting allocations are always
++ * zeroed.
++ */
++ maddrs[level] = alloc_pgtable_maddr(1, hd->node);
++ if ( !maddrs[level] )
++ {
++ rc = -ENOMEM;
++ break;
++ }
++
++ page_list_add(maddr_to_page(maddrs[level]),
++ &pdev->arch.pgtables_list);
++
++ if ( level )
++ {
++ next = map_vtd_domain_page(maddrs[level]);
++ rc = fill_qpt(next, level - 1, maddrs, pdev);
++ unmap_vtd_domain_page(next);
++ }
++ }
++
++ dma_set_pte_addr(*pte, maddrs[level]);
++ dma_set_pte_readable(*pte);
++ dma_set_pte_writable(*pte);
++ }
++ else if ( level && !dma_pte_superpage(*pte) )
++ {
++ page_list_add(maddr_to_page(dma_pte_addr(*pte)),
++ &pdev->arch.pgtables_list);
++ next = map_vtd_domain_page(dma_pte_addr(*pte));
++ rc = fill_qpt(next, level - 1, maddrs, pdev);
++ unmap_vtd_domain_page(next);
++ }
++ }
++
++ return rc;
++}
++
++static int intel_iommu_quarantine_init(struct pci_dev *pdev)
++{
++ struct domain_iommu *hd = dom_iommu(dom_io);
++ paddr_t maddr;
+ unsigned int agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
+ unsigned int level = agaw_to_level(agaw);
++ const struct acpi_drhd_unit *drhd;
++ const struct acpi_rmrr_unit *rmrr;
++ unsigned int i, bdf;
++ bool rmrr_found = false;
++ int rc;
+
+- if ( hd->arch.pgd_maddr )
++ ASSERT(pcidevs_locked());
++ ASSERT(!hd->arch.pgd_maddr);
++
++ if ( pdev->arch.vtd.pgd_maddr )
+ {
+- ASSERT_UNREACHABLE();
++ clear_domain_page(pdev->arch.leaf_mfn);
+ return 0;
+ }
+
+- spin_lock(&hd->arch.mapping_lock);
++ drhd = acpi_find_matched_drhd_unit(pdev);
++ if ( !drhd )
++ return -ENODEV;
+
+- hd->arch.pgd_maddr = alloc_pgtable_maddr(1, hd->node);
+- if ( !hd->arch.pgd_maddr )
+- goto out;
++ maddr = alloc_pgtable_maddr(1, hd->node);
++ if ( !maddr )
++ return -ENOMEM;
+
+- parent = map_vtd_domain_page(hd->arch.pgd_maddr);
+- while ( level )
+- {
+- uint64_t maddr;
+- unsigned int offset;
++ rc = context_set_domain_id(NULL, pdev->arch.pseudo_domid, drhd->iommu);
+
+- /*
+- * The pgtable allocator is fine for the leaf page, as well as
+- * page table pages, and the resulting allocations are always
+- * zeroed.
+- */
+- maddr = alloc_pgtable_maddr(1, hd->node);
+- if ( !maddr )
++ /* Transiently install the root into DomIO, for iommu_identity_mapping(). */
++ hd->arch.pgd_maddr = maddr;
++
++ for_each_rmrr_device ( rmrr, bdf, i )
++ {
++ if ( rc )
+ break;
+
+- for ( offset = 0; offset < PTE_NUM; offset++ )
++ if ( rmrr->segment == pdev->seg && bdf == pdev->sbdf.bdf )
+ {
+- struct dma_pte *pte = &parent[offset];
++ rmrr_found = true;
+
+- dma_set_pte_addr(*pte, maddr);
+- dma_set_pte_readable(*pte);
++ rc = iommu_identity_mapping(dom_io, p2m_access_rw,
++ rmrr->base_address, rmrr->end_address,
++ 0);
++ if ( rc )
++ printk(XENLOG_ERR VTDPREFIX
++ "%04x:%02x:%02x.%u: RMRR quarantine mapping failed\n",
++ pdev->seg, pdev->bus,
++ PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+ }
+- iommu_sync_cache(parent, PAGE_SIZE);
++ }
+
+- unmap_vtd_domain_page(parent);
+- parent = map_vtd_domain_page(maddr);
+- level--;
++ iommu_identity_map_teardown(dom_io);
++ hd->arch.pgd_maddr = 0;
++ pdev->arch.vtd.pgd_maddr = maddr;
++
++ if ( !rc )
++ {
++ struct dma_pte *root;
++ paddr_t maddrs[6] = {};
++
++ spin_lock(&hd->arch.mapping_lock);
++
++ root = map_vtd_domain_page(maddr);
++ rc = fill_qpt(root, level - 1, maddrs, pdev);
++ unmap_vtd_domain_page(root);
++
++ pdev->arch.leaf_mfn = maddr_to_mfn(maddrs[0]);
++
++ spin_unlock(&hd->arch.mapping_lock);
+ }
+- unmap_vtd_domain_page(parent);
+
+- out:
+- spin_unlock(&hd->arch.mapping_lock);
++ if ( rc )
++ quarantine_teardown(pdev, drhd);
+
+- /* Pages leaked in failure case */
+- return level ? -ENOMEM : 0;
++ return rc;
+ }
+
+ const struct iommu_ops __initconstrel intel_iommu_ops = {
+diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h
+index be44fc017f..c67adb9b41 100644
+--- a/xen/drivers/passthrough/vtd/iommu.h
++++ b/xen/drivers/passthrough/vtd/iommu.h
+@@ -509,7 +509,7 @@ struct vtd_iommu {
+ u32 nr_pt_levels;
+ u64 cap;
+ u64 ecap;
+- spinlock_t lock; /* protect context, domain ids */
++ spinlock_t lock; /* protect context */
+ spinlock_t register_lock; /* protect iommu register handling */
+ u64 root_maddr; /* root entry machine address */
+ nodeid_t node;
+diff --git a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+index 2a3bc47ab5..961182ac0f 100644
+--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
++++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+@@ -54,7 +54,8 @@ int amd_iommu_init_late(void);
+ int amd_iommu_update_ivrs_mapping_acpi(void);
+ int iov_adjust_irq_affinities(void);
+
+-int amd_iommu_quarantine_init(struct domain *d);
++int amd_iommu_quarantine_init(struct pci_dev *pdev);
++void amd_iommu_quarantine_teardown(struct pci_dev *pdev);
+
+ /* mapping functions */
+ int __must_check amd_iommu_map_page(struct domain *d, dfn_t dfn,
+diff --git a/xen/include/asm-x86/pci.h b/xen/include/asm-x86/pci.h
+index 70ed48e309..0c79acb1ed 100644
+--- a/xen/include/asm-x86/pci.h
++++ b/xen/include/asm-x86/pci.h
+@@ -1,6 +1,8 @@
+ #ifndef __X86_PCI_H__
+ #define __X86_PCI_H__
+
++#include <xen/mm.h>
++
+ #define CF8_BDF(cf8) ( ((cf8) & 0x00ffff00) >> 8)
+ #define CF8_ADDR_LO(cf8) ( (cf8) & 0x000000fc)
+ #define CF8_ADDR_HI(cf8) ( ((cf8) & 0x0f000000) >> 16)
+@@ -20,7 +22,18 @@ struct arch_pci_dev {
+ * them don't race (de)initialization and hence don't strictly need any
+ * locking.
+ */
++ union {
++ /* Subset of struct arch_iommu's fields, to be used in dom_io. */
++ struct {
++ uint64_t pgd_maddr;
++ } vtd;
++ struct {
++ struct page_info *root_table;
++ } amd;
++ };
+ domid_t pseudo_domid;
++ mfn_t leaf_mfn;
++ struct page_list_head pgtables_list;
+ };
+
+ int pci_conf_write_intercept(unsigned int seg, unsigned int bdf,
+diff --git a/xen/include/xen/iommu.h b/xen/include/xen/iommu.h
+index 041a7cf5e5..c94579fe92 100644
+--- a/xen/include/xen/iommu.h
++++ b/xen/include/xen/iommu.h
+@@ -211,7 +211,7 @@ typedef int iommu_grdm_t(xen_pfn_t start, xen_ulong_t nr, u32 id, void *ctxt);
+ struct iommu_ops {
+ int (*init)(struct domain *d);
+ void (*hwdom_init)(struct domain *d);
+- int (*quarantine_init)(struct domain *d);
++ int (*quarantine_init)(device_t *dev);
+ int (*add_device)(u8 devfn, device_t *dev);
+ int (*enable_device)(device_t *dev);
+ int (*remove_device)(u8 devfn, device_t *dev);
+@@ -331,6 +331,7 @@ int __must_check iommu_suspend(void);
+ void iommu_resume(void);
+ void iommu_crash_shutdown(void);
+ int iommu_get_reserved_device_memory(iommu_grdm_t *, void *);
++int iommu_quarantine_dev_init(device_t *dev);
+
+ void iommu_share_p2m_table(struct domain *d);
+
+--
+2.35.2
+
+
+From a6902a65160aac72a1889a268fd5f3cebb159d8e Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Fri, 8 Apr 2022 15:20:21 +0200
+Subject: [PATCH 30/32] VT-d: don't needlessly look up DID
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+If get_iommu_domid() in domain_context_unmap_one() fails, we had better
+not clear the context entry in the first place, as we're then unable
+to issue the corresponding flush. However, we have no need to look up the
+DID in the first place: What needs flushing is very specifically the DID
+that was in the context entry before our clearing of it.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 445ab9852d69d8957467f0036098ebec75fec092
+master date: 2022-04-07 12:29:03 +0200
+---
+ xen/drivers/passthrough/vtd/iommu.c | 10 ++--------
+ 1 file changed, 2 insertions(+), 8 deletions(-)
+
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index 6571b5dde4..4b0d6a873c 100644
+--- a/xen/drivers/passthrough/vtd/iommu.c
++++ b/xen/drivers/passthrough/vtd/iommu.c
+@@ -1821,18 +1821,12 @@ int domain_context_unmap_one(
+ return 0;
+ }
+
++ iommu_domid = context_domain_id(*context);
++
+ context_clear_present(*context);
+ context_clear_entry(*context);
+ iommu_sync_cache(context, sizeof(struct context_entry));
+
+- iommu_domid = get_iommu_did(domid, iommu, !domain->is_dying);
+- if ( iommu_domid == -1 )
+- {
+- spin_unlock(&iommu->lock);
+- unmap_vtd_domain_page(context_entries);
+- return -EINVAL;
+- }
+-
+ rc = iommu_flush_context_device(iommu, iommu_domid,
+ PCI_BDF2(bus, devfn),
+ DMA_CCMD_MASK_NOBIT, 0);
+--
+2.35.2
+
+
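The reordering above amounts to: capture the domain ID currently held in the context entry, then clear the entry, then flush using the captured value. A small standalone sketch of that read-then-clear-then-flush ordering follows; the entry layout and the flush helper are toy stand-ins, not VT-d code.

#include <stdint.h>
#include <stdio.h>

struct context_entry_stub {
    uint64_t lo, hi;                 /* toy layout: low byte of hi is the DID */
};

static unsigned int entry_domain_id(const struct context_entry_stub *e)
{
    return e->hi & 0xff;
}

static void flush_context(unsigned int did)
{
    printf("flushing context cache for DID %u\n", did);
}

static void unmap_one(struct context_entry_stub *e)
{
    /* Read the DID *before* clearing: once the entry is zeroed there is
     * nothing left to look it up from, which is exactly the situation the
     * patch above avoids. */
    unsigned int did = entry_domain_id(e);

    e->lo = 0;
    e->hi = 0;
    flush_context(did);
}

int main(void)
{
    struct context_entry_stub e = { .lo = 1, .hi = 42 };

    unmap_one(&e);
    return 0;
}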
+From d64d46685c776b39d5c640a0ad2727fa0938273c Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Fri, 8 Apr 2022 15:21:33 +0200
+Subject: [PATCH 31/32] VT-d: avoid NULL deref on domain_context_mapping_one()
+ error paths
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+First there's a printk() which actually wrongly uses pdev in the first
+place: We want to log the coordinates of the (perhaps fake) device
+acted upon, which may not be pdev.
+
+Then it was quite pointless for eb19326a328d ("VT-d: prepare for per-
+device quarantine page tables (part I)") to add a domid_t parameter to
+domain_context_unmap_one(): It's only used to pass back here via
+me_wifi_quirk() -> map_me_phantom_function(). Drop the parameter again.
+
+Finally there's the invocation of domain_context_mapping_one(), which
+needs to be passed the correct domain ID. Avoid taking that path when
+pdev is NULL and the quarantine state is what would need restoring to.
+This means we can't security-support non-PCI-Express devices with RMRRs
+(if such exist in practice) any longer; note that as of the 1st of the
+two commits referenced below assigning them to DomU-s is unsupported
+anyway.
+
+Fixes: 8f41e481b485 ("VT-d: re-assign devices directly")
+Fixes: 14dd241aad8a ("IOMMU/x86: use per-device page tables for quarantining")
+Coverity ID: 1503784
+Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 608394b906e71587f02e6662597bc985bad33a5a
+master date: 2022-04-07 12:30:19 +0200
+---
+ xen/drivers/passthrough/vtd/extern.h | 2 +-
+ xen/drivers/passthrough/vtd/iommu.c | 34 ++++++++++++++++------------
+ xen/drivers/passthrough/vtd/quirks.c | 2 +-
+ 3 files changed, 21 insertions(+), 17 deletions(-)
+
+diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
+index 897dcff9ff..fbe951b2fa 100644
+--- a/xen/drivers/passthrough/vtd/extern.h
++++ b/xen/drivers/passthrough/vtd/extern.h
+@@ -89,7 +89,7 @@ int domain_context_mapping_one(struct domain *domain, struct vtd_iommu *iommu,
+ const struct pci_dev *pdev, domid_t domid,
+ paddr_t pgd_maddr, unsigned int mode);
+ int domain_context_unmap_one(struct domain *domain, struct vtd_iommu *iommu,
+- uint8_t bus, uint8_t devfn, domid_t domid);
++ uint8_t bus, uint8_t devfn);
+ int intel_iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt);
+
+ unsigned int io_apic_read_remap_rte(unsigned int apic, unsigned int reg);
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index 4b0d6a873c..cb3ba3e409 100644
+--- a/xen/drivers/passthrough/vtd/iommu.c
++++ b/xen/drivers/passthrough/vtd/iommu.c
+@@ -1527,7 +1527,7 @@ int domain_context_mapping_one(
+ check_cleanup_domid_map(domain, pdev, iommu);
+ printk(XENLOG_ERR
+ "%04x:%02x:%02x.%u: unexpected context entry %016lx_%016lx (expected %016lx_%016lx)\n",
+- pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
++ seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+ (uint64_t)(res >> 64), (uint64_t)res,
+ (uint64_t)(old >> 64), (uint64_t)old);
+ rc = -EILSEQ;
+@@ -1595,9 +1595,14 @@ int domain_context_mapping_one(
+
+ if ( rc )
+ {
+- if ( !prev_dom )
+- domain_context_unmap_one(domain, iommu, bus, devfn,
+- DEVICE_DOMID(domain, pdev));
++ if ( !prev_dom ||
++ /*
++ * Unmapping here means DEV_TYPE_PCI devices with RMRRs (if such
++ * exist) would cause problems if such a region was actually
++ * accessed.
++ */
++ (prev_dom == dom_io && !pdev) )
++ domain_context_unmap_one(domain, iommu, bus, devfn);
+ else if ( prev_dom != domain ) /* Avoid infinite recursion. */
+ domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev,
+ DEVICE_DOMID(prev_dom, pdev),
+@@ -1734,7 +1739,9 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ * Strictly speaking if the device is the only one behind this bridge
+ * and the only one with this (secbus,0,0) tuple, it could be allowed
+ * to be re-assigned regardless of RMRR presence. But let's deal with
+- * that case only if it is actually found in the wild.
++ * that case only if it is actually found in the wild. Note that
++ * dealing with this just here would still not render the operation
++ * secure.
+ */
+ else if ( prev_present && (mode & MAP_WITH_RMRR) &&
+ domain != pdev->domain )
+@@ -1800,7 +1807,7 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
+ int domain_context_unmap_one(
+ struct domain *domain,
+ struct vtd_iommu *iommu,
+- uint8_t bus, uint8_t devfn, domid_t domid)
++ uint8_t bus, uint8_t devfn)
+ {
+ struct context_entry *context, *context_entries;
+ u64 maddr;
+@@ -1852,7 +1859,8 @@ int domain_context_unmap_one(
+ unmap_vtd_domain_page(context_entries);
+
+ if ( !iommu->drhd->segment && !rc )
+- rc = me_wifi_quirk(domain, bus, devfn, domid, 0, UNMAP_ME_PHANTOM_FUNC);
++ rc = me_wifi_quirk(domain, bus, devfn, DOMID_INVALID, 0,
++ UNMAP_ME_PHANTOM_FUNC);
+
+ if ( rc && !is_hardware_domain(domain) && domain != dom_io )
+ {
+@@ -1906,8 +1914,7 @@ static const struct acpi_drhd_unit *domain_context_unmap(
+ printk(VTDPREFIX "d%d:PCIe: unmap %04x:%02x:%02x.%u\n",
+ domain->domain_id, seg, bus,
+ PCI_SLOT(devfn), PCI_FUNC(devfn));
+- ret = domain_context_unmap_one(domain, iommu, bus, devfn,
+- DEVICE_DOMID(domain, pdev));
++ ret = domain_context_unmap_one(domain, iommu, bus, devfn);
+ if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
+ disable_ats_device(pdev);
+
+@@ -1917,8 +1924,7 @@ static const struct acpi_drhd_unit *domain_context_unmap(
+ if ( iommu_debug )
+ printk(VTDPREFIX "d%d:PCI: unmap %04x:%02x:%02x.%u\n",
+ domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+- ret = domain_context_unmap_one(domain, iommu, bus, devfn,
+- DEVICE_DOMID(domain, pdev));
++ ret = domain_context_unmap_one(domain, iommu, bus, devfn);
+ if ( ret )
+ break;
+
+@@ -1941,12 +1947,10 @@ static const struct acpi_drhd_unit *domain_context_unmap(
+ break;
+ }
+
+- ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
+- DEVICE_DOMID(domain, pdev));
++ ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn);
+ /* PCIe to PCI/PCIx bridge */
+ if ( !ret && pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
+- ret = domain_context_unmap_one(domain, iommu, secbus, 0,
+- DEVICE_DOMID(domain, pdev));
++ ret = domain_context_unmap_one(domain, iommu, secbus, 0);
+
+ break;
+
+diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c
+index 4d54c21136..2b8a2bd9c6 100644
+--- a/xen/drivers/passthrough/vtd/quirks.c
++++ b/xen/drivers/passthrough/vtd/quirks.c
+@@ -363,7 +363,7 @@ static int __must_check map_me_phantom_function(struct domain *domain,
+ domid, pgd_maddr, mode);
+ else
+ rc = domain_context_unmap_one(domain, drhd->iommu, 0,
+- PCI_DEVFN(dev, 7), domid);
++ PCI_DEVFN(dev, 7));
+
+ return rc;
+ }
+--
+2.35.2
+
+
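The error path above only attempts to restore the previous owner's mapping when a pdev is available to take its parameters from; otherwise (quarantine roll-back with pdev == NULL) it simply unmaps. A simplified standalone sketch of that guard is below; the names and the reduced condition are illustrative, not the full logic of the patch.

#include <stdbool.h>
#include <stdio.h>

struct pdev_stub { int id; };

static void unmap_entry(void)
{
    puts("rolled back: entry unmapped");
}

static void remap_previous(const struct pdev_stub *p)
{
    printf("rolled back: restored previous owner via pdev %d\n", p->id);
}

/* Mirrors the shape of the fixed error path: only try to restore the
 * previous mapping when a real pdev exists to supply its parameters;
 * for a quarantine roll-back without a pdev, just unmap. */
static void roll_back(bool prev_was_quarantine, const struct pdev_stub *pdev)
{
    if ( !pdev && prev_was_quarantine )
        unmap_entry();
    else
        remap_previous(pdev);
}

int main(void)
{
    struct pdev_stub dev = { .id = 7 };

    roll_back(true, NULL);   /* no pdev: cannot restore, so unmap */
    roll_back(false, &dev);  /* real device: restore previous owner */
    return 0;
}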
+From fe97133b5deef58bd1422f4d87821131c66b1d0e Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Fri, 8 Apr 2022 15:22:49 +0200
+Subject: [PATCH 32/32] VT-d: avoid infinite recursion on
+ domain_context_mapping_one() error path
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Despite the comment there infinite recursion was still possible, by
+flip-flopping between two domains. This is because prev_dom is derived
+from the DID found in the context entry, which was already updated by
+the time error recovery is invoked. Simply introduce yet another mode
+flag to prevent rolling back an in-progress roll-back of a prior
+mapping attempt.
+
+Also drop the existing recursion prevention for having been dead anyway:
+Earlier in the function we already bail when prev_dom == domain.
+
+Fixes: 8f41e481b485 ("VT-d: re-assign devices directly")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 99d829dba1390b98a3ca07b365713e62182ee7ca
+master date: 2022-04-07 12:31:16 +0200
+---
+ xen/drivers/passthrough/vtd/iommu.c | 7 ++++---
+ xen/drivers/passthrough/vtd/vtd.h | 3 ++-
+ 2 files changed, 6 insertions(+), 4 deletions(-)
+
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index cb3ba3e409..f051a55764 100644
+--- a/xen/drivers/passthrough/vtd/iommu.c
++++ b/xen/drivers/passthrough/vtd/iommu.c
+@@ -1593,7 +1593,7 @@ int domain_context_mapping_one(
+ if ( !seg && !rc )
+ rc = me_wifi_quirk(domain, bus, devfn, domid, pgd_maddr, mode);
+
+- if ( rc )
++ if ( rc && !(mode & MAP_ERROR_RECOVERY) )
+ {
+ if ( !prev_dom ||
+ /*
+@@ -1603,11 +1603,12 @@ int domain_context_mapping_one(
+ */
+ (prev_dom == dom_io && !pdev) )
+ domain_context_unmap_one(domain, iommu, bus, devfn);
+- else if ( prev_dom != domain ) /* Avoid infinite recursion. */
++ else
+ domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev,
+ DEVICE_DOMID(prev_dom, pdev),
+ DEVICE_PGTABLE(prev_dom, pdev),
+- mode & MAP_WITH_RMRR);
++ (mode & MAP_WITH_RMRR)
++ | MAP_ERROR_RECOVERY);
+ }
+
+ if ( prev_dom )
+diff --git a/xen/drivers/passthrough/vtd/vtd.h b/xen/drivers/passthrough/vtd/vtd.h
+index e4ab242fee..cb2df76eed 100644
+--- a/xen/drivers/passthrough/vtd/vtd.h
++++ b/xen/drivers/passthrough/vtd/vtd.h
+@@ -29,7 +29,8 @@
+ #define MAP_WITH_RMRR (1u << 0)
+ #define MAP_OWNER_DYING (1u << 1)
+ #define MAP_SINGLE_DEVICE (1u << 2)
+-#define UNMAP_ME_PHANTOM_FUNC (1u << 3)
++#define MAP_ERROR_RECOVERY (1u << 3)
++#define UNMAP_ME_PHANTOM_FUNC (1u << 4)
+
+ /* Allow for both IOAPIC and IOSAPIC. */
+ #define IO_xAPIC_route_entry IO_APIC_route_entry
+--
+2.35.2
+
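The last patch above stops roll-back from recursing by tagging the nested mapping attempt with an error-recovery flag, so a failed roll-back is never itself rolled back. A standalone sketch of that pattern follows; the flag and function names here are illustrative, modelled on but not identical to the MAP_ERROR_RECOVERY handling in the hunk.

#include <stdio.h>

#define MAP_ERR_RECOVERY  (1u << 0)   /* illustrative counterpart of
                                         MAP_ERROR_RECOVERY above */

/* Try to map 'owner'; on failure, restore 'prev' exactly once.  The flag
 * keeps the nested restore attempt from triggering yet another restore,
 * which is what previously let two owners flip-flop forever. */
static int map_one(int owner, int prev, unsigned int mode)
{
    int rc = (owner == 2) ? -1 : 0;   /* pretend owner 2 always fails */

    if ( rc && !(mode & MAP_ERR_RECOVERY) )
    {
        printf("mapping for %d failed, restoring %d\n", owner, prev);
        map_one(prev, owner, mode | MAP_ERR_RECOVERY);
    }
    return rc;
}

int main(void)
{
    /* Without the flag this call pattern could bounce between 2 and 1
     * indefinitely; with it, the restore runs once and stops. */
    map_one(2, 1, 0);
    return 0;
}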
diff --git a/main/xen/xsa386.patch b/main/xen/xsa386.patch
deleted file mode 100644
index 83f24d30d5..0000000000
--- a/main/xen/xsa386.patch
+++ /dev/null
@@ -1,29 +0,0 @@
-From: Jan Beulich <jbeulich@suse.com>
-Subject: VT-d: fix deassign of device with RMRR
-Date: Fri, 1 Oct 2021 15:05:42 +0200
-
-Ignoring a specific error code here was not meant to short circuit
-deassign to _just_ the unmapping of RMRRs. This bug was previously
-hidden by the bogus (potentially indefinite) looping in
-pci_release_devices(), until f591755823a7 ("IOMMU/PCI: don't let domain
-cleanup continue when device de-assignment failed") fixed that loop.
-
-This is CVE-2021-28702 / XSA-386.
-
-Fixes: 8b99f4400b69 ("VT-d: fix RMRR related error handling")
-Reported-by: Ivan Kardykov <kardykov@tabit.pro>
-Signed-off-by: Jan Beulich <jbeulich@suse.com>
-Tested-by: Ivan Kardykov <kardykov@tabit.pro>
-
---- a/xen/drivers/passthrough/vtd/iommu.c
-+++ b/xen/drivers/passthrough/vtd/iommu.c
-@@ -2409,7 +2409,7 @@ static int reassign_device_ownership(
- ret = iommu_identity_mapping(source, p2m_access_x,
- rmrr->base_address,
- rmrr->end_address, 0);
-- if ( ret != -ENOENT )
-+ if ( ret && ret != -ENOENT )
- return ret;
- }
- }
-
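The deleted xsa386.patch fixed an error-handling short circuit: only the benign -ENOENT may be ignored, while success must let the rest of the deassign continue and real errors must be propagated. A minimal standalone sketch of that pattern follows; the helper names are made up for illustration.

#include <errno.h>
#include <stdio.h>

/* Pretend identity-mapping teardown: -ENOENT means "nothing was mapped",
 * which is harmless; any other failure must stop the caller. */
static int teardown_identity_map(int present)
{
    return present ? 0 : -ENOENT;
}

static int deassign(int mapped)
{
    int ret = teardown_identity_map(mapped);

    /* The fix: bail only on real errors, not on the benign -ENOENT.
     * The buggy form ("ret != -ENOENT") also returned when ret == 0,
     * cutting the remainder of the deassign short. */
    if ( ret && ret != -ENOENT )
        return ret;

    puts("continuing with the remainder of the deassign");
    return 0;
}

int main(void)
{
    deassign(1);   /* mapping existed, teardown succeeded */
    deassign(0);   /* nothing mapped: -ENOENT, still continue */
    return 0;
}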
diff --git a/main/xen/xsa388-4.14-1.patch b/main/xen/xsa388-4.14-1.patch
deleted file mode 100644
index f76f2d56b6..0000000000
--- a/main/xen/xsa388-4.14-1.patch
+++ /dev/null
@@ -1,174 +0,0 @@
-From: Jan Beulich <jbeulich@suse.com>
-Subject: x86/PoD: deal with misaligned GFNs
-
-Users of XENMEM_decrease_reservation and XENMEM_populate_physmap aren't
-required to pass in order-aligned GFN values. (While I consider this
-bogus, I don't think we can fix this there, as that might break existing
-code, e.g Linux'es swiotlb, which - while affecting PV only - until
-recently had been enforcing only page alignment on the original
-allocation.) Only non-PoD code paths (guest_physmap_{add,remove}_page(),
-p2m_set_entry()) look to be dealing with this properly (in part by being
-implemented inefficiently, handling every 4k page separately).
-
-Introduce wrappers taking care of splitting the incoming request into
-aligned chunks, without putting much effort in trying to determine the
-largest possible chunk at every iteration.
-
-Also "handle" p2m_set_entry() failure for non-order-0 requests by
-crashing the domain in one more place. Alongside putting a log message
-there, also add one to the other similar path.
-
-Note regarding locking: This is left in the actual worker functions on
-the assumption that callers aren't guaranteed atomicity wrt acting on
-multiple pages at a time. For mis-aligned GFNs gfn_lock() wouldn't have
-locked the correct GFN range anyway, if it didn't simply resolve to
-p2m_lock(), and for well-behaved callers there continues to be only a
-single iteration, i.e. behavior is unchanged for them. (FTAOD pulling
-out just pod_lock() into p2m_pod_decrease_reservation() would result in
-a lock order violation.)
-
-This is CVE-2021-28704 and CVE-2021-28707 / part of XSA-388.
-
-Fixes: 3c352011c0d3 ("x86/PoD: shorten certain operations on higher order ranges")
-Signed-off-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
-
---- a/xen/arch/x86/mm/p2m-pod.c
-+++ b/xen/arch/x86/mm/p2m-pod.c
-@@ -495,7 +495,7 @@ p2m_pod_zero_check_superpage(struct p2m_
-
-
- /*
-- * This function is needed for two reasons:
-+ * This pair of functions is needed for two reasons:
- * + To properly handle clearing of PoD entries
- * + To "steal back" memory being freed for the PoD cache, rather than
- * releasing it.
-@@ -503,8 +503,8 @@ p2m_pod_zero_check_superpage(struct p2m_
- * Once both of these functions have been completed, we can return and
- * allow decrease_reservation() to handle everything else.
- */
--unsigned long
--p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order)
-+static unsigned long
-+decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order)
- {
- unsigned long ret = 0, i, n;
- struct p2m_domain *p2m = p2m_get_hostp2m(d);
-@@ -551,8 +551,10 @@ p2m_pod_decrease_reservation(struct doma
- * All PoD: Mark the whole region invalid and tell caller
- * we're done.
- */
-- if ( p2m_set_entry(p2m, gfn, INVALID_MFN, order, p2m_invalid,
-- p2m->default_access) )
-+ int rc = p2m_set_entry(p2m, gfn, INVALID_MFN, order, p2m_invalid,
-+ p2m->default_access);
-+
-+ if ( rc )
- {
- /*
- * If this fails, we can't tell how much of the range was changed.
-@@ -560,7 +562,12 @@ p2m_pod_decrease_reservation(struct doma
- * impossible.
- */
- if ( order != 0 )
-+ {
-+ printk(XENLOG_G_ERR
-+ "%pd: marking GFN %#lx (order %u) as non-PoD failed: %d\n",
-+ d, gfn_x(gfn), order, rc);
- domain_crash(d);
-+ }
- goto out_unlock;
- }
- ret = 1UL << order;
-@@ -667,6 +674,22 @@ out_unlock:
- return ret;
- }
-
-+unsigned long
-+p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order)
-+{
-+ unsigned long left = 1UL << order, ret = 0;
-+ unsigned int chunk_order = find_first_set_bit(gfn_x(gfn) | left);
-+
-+ do {
-+ ret += decrease_reservation(d, gfn, chunk_order);
-+
-+ left -= 1UL << chunk_order;
-+ gfn = gfn_add(gfn, 1UL << chunk_order);
-+ } while ( left );
-+
-+ return ret;
-+}
-+
- void p2m_pod_dump_data(struct domain *d)
- {
- struct p2m_domain *p2m = p2m_get_hostp2m(d);
-@@ -1266,19 +1289,15 @@ remap_and_retry:
- return true;
- }
-
--
--int
--guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn_l,
-- unsigned int order)
-+static int
-+mark_populate_on_demand(struct domain *d, unsigned long gfn_l,
-+ unsigned int order)
- {
- struct p2m_domain *p2m = p2m_get_hostp2m(d);
- gfn_t gfn = _gfn(gfn_l);
- unsigned long i, n, pod_count = 0;
- int rc = 0;
-
-- if ( !paging_mode_translate(d) )
-- return -EINVAL;
--
- gfn_lock(p2m, gfn, order);
-
- P2M_DEBUG("mark pod gfn=%#lx\n", gfn_l);
-@@ -1316,12 +1335,44 @@ guest_physmap_mark_populate_on_demand(st
- BUG_ON(p2m->pod.entry_count < 0);
- pod_unlock(p2m);
- }
-+ else if ( order )
-+ {
-+ /*
-+ * If this failed, we can't tell how much of the range was changed.
-+ * Best to crash the domain.
-+ */
-+ printk(XENLOG_G_ERR
-+ "%pd: marking GFN %#lx (order %u) as PoD failed: %d\n",
-+ d, gfn_l, order, rc);
-+ domain_crash(d);
-+ }
-
- out:
- gfn_unlock(p2m, gfn, order);
-
- return rc;
- }
-+
-+int
-+guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
-+ unsigned int order)
-+{
-+ unsigned long left = 1UL << order;
-+ unsigned int chunk_order = find_first_set_bit(gfn | left);
-+ int rc;
-+
-+ if ( !paging_mode_translate(d) )
-+ return -EINVAL;
-+
-+ do {
-+ rc = mark_populate_on_demand(d, gfn, chunk_order);
-+
-+ left -= 1UL << chunk_order;
-+ gfn += 1UL << chunk_order;
-+ } while ( !rc && left );
-+
-+ return rc;
-+}
-
- void p2m_pod_init(struct p2m_domain *p2m)
- {
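The deleted xsa388-4.14-1.patch introduced wrappers that split a possibly misaligned (gfn, 1 << order) request into naturally aligned chunks of one fixed size: the lowest set bit of (starting gfn | number of pages), picked once up front rather than re-grown per iteration. A standalone sketch of that splitting loop is below; lowest_set_bit() is a stand-in for Xen's find_first_set_bit() and relies on the GCC/Clang __builtin_ctzl builtin.

#include <stdio.h>

/* Index of the lowest set bit of a non-zero value. */
static unsigned int lowest_set_bit(unsigned long v)
{
    return (unsigned int)__builtin_ctzl(v);
}

/* Split an order-sized request at gfn into equally sized, naturally
 * aligned chunks, the way the wrappers in the removed patch do. */
static void split_request(unsigned long gfn, unsigned int order)
{
    unsigned long left = 1UL << order;
    unsigned int chunk_order = lowest_set_bit(gfn | left);

    printf("gfn %#lx, order %u:\n", gfn, order);
    do {
        printf("  chunk at %#lx, order %u\n", gfn, chunk_order);
        left -= 1UL << chunk_order;
        gfn += 1UL << chunk_order;
    } while ( left );
}

int main(void)
{
    split_request(0x1200, 9);   /* 2M aligned:   one order-9 chunk   */
    split_request(0x1100, 9);   /* 1M aligned:   two order-8 chunks  */
    split_request(0x1180, 9);   /* 512k aligned: four order-7 chunks */
    return 0;
}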
diff --git a/main/xen/xsa388-4.14-2.patch b/main/xen/xsa388-4.14-2.patch
deleted file mode 100644
index 2f8cc881f0..0000000000
--- a/main/xen/xsa388-4.14-2.patch
+++ /dev/null
@@ -1,36 +0,0 @@
-From: Jan Beulich <jbeulich@suse.com>
-Subject: x86/PoD: handle intermediate page orders in p2m_pod_cache_add()
-
-p2m_pod_decrease_reservation() may pass pages to the function which
-aren't 4k, 2M, or 1G. Handle all intermediate orders as well, to avoid
-hitting the BUG() at the switch() statement's "default" case.
-
-This is CVE-2021-28708 / part of XSA-388.
-
-Fixes: 3c352011c0d3 ("x86/PoD: shorten certain operations on higher order ranges")
-Signed-off-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
-
---- a/xen/arch/x86/mm/p2m-pod.c
-+++ b/xen/arch/x86/mm/p2m-pod.c
-@@ -111,15 +111,13 @@ p2m_pod_cache_add(struct p2m_domain *p2m
- /* Then add to the appropriate populate-on-demand list. */
- switch ( order )
- {
-- case PAGE_ORDER_1G:
-- for ( i = 0; i < (1UL << PAGE_ORDER_1G); i += 1UL << PAGE_ORDER_2M )
-+ case PAGE_ORDER_2M ... PAGE_ORDER_1G:
-+ for ( i = 0; i < (1UL << order); i += 1UL << PAGE_ORDER_2M )
- page_list_add_tail(page + i, &p2m->pod.super);
- break;
-- case PAGE_ORDER_2M:
-- page_list_add_tail(page, &p2m->pod.super);
-- break;
-- case PAGE_ORDER_4K:
-- page_list_add_tail(page, &p2m->pod.single);
-+ case PAGE_ORDER_4K ... PAGE_ORDER_2M - 1:
-+ for ( i = 0; i < (1UL << order); i += 1UL << PAGE_ORDER_4K )
-+ page_list_add_tail(page + i, &p2m->pod.single);
- break;
- default:
- BUG();
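The second removed XSA-388 patch made the PoD cache-add switch accept any intermediate order, filing 2M-and-larger chunks onto the superpage list in 2M steps and everything smaller onto the 4k list. A standalone sketch of that case-range dispatch follows; case ranges are a GCC/Clang extension (as used by the patch itself), and the "list" here is just a counter.

#include <stdio.h>

#define ORDER_4K  0
#define ORDER_2M  9
#define ORDER_1G 18

/* File an order-'order' chunk onto the appropriate granularity. */
static void cache_add(unsigned int order)
{
    unsigned long i, entries = 0;

    switch ( order )
    {
    case ORDER_2M ... ORDER_1G:
        /* page_list_add_tail(page + i, &pod.super) in the real code */
        for ( i = 0; i < (1UL << order); i += 1UL << ORDER_2M )
            ++entries;
        printf("order %2u: %lu entries on the 2M list\n", order, entries);
        break;

    case ORDER_4K ... ORDER_2M - 1:
        /* page_list_add_tail(page + i, &pod.single) in the real code */
        for ( i = 0; i < (1UL << order); i += 1UL << ORDER_4K )
            ++entries;
        printf("order %2u: %lu entries on the 4k list\n", order, entries);
        break;

    default:
        puts("unexpected order");    /* the pre-patch code hit BUG() here */
    }
}

int main(void)
{
    cache_add(0);    /* 4k                                    */
    cache_add(5);    /* 128k: previously hit the BUG()        */
    cache_add(9);    /* 2M                                    */
    cache_add(12);   /* 16M:  previously hit the BUG() as well */
    return 0;
}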
diff --git a/main/xen/xsa389-4.13.patch b/main/xen/xsa389-4.13.patch
deleted file mode 100644
index 10a8a9b9ed..0000000000
--- a/main/xen/xsa389-4.13.patch
+++ /dev/null
@@ -1,180 +0,0 @@
-From: Jan Beulich <jbeulich@suse.com>
-Subject: x86/P2M: deal with partial success of p2m_set_entry()
-
-M2P and PoD stats need to remain in sync with P2M; if an update succeeds
-only partially, respective adjustments need to be made. If updates get
-made before the call, they may also need undoing upon complete failure
-(i.e. including the single-page case).
-
-Log-dirty state would better also be kept in sync.
-
-Note that the change to set_typed_p2m_entry() may not be strictly
-necessary (due to the order restriction enforced near the top of the
-function), but is being kept here to be on the safe side.
-
-This is CVE-2021-28705 and CVE-2021-28709 / XSA-389.
-
-Signed-off-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
-
---- a/xen/arch/x86/mm/p2m.c
-+++ b/xen/arch/x86/mm/p2m.c
-@@ -781,6 +781,7 @@ p2m_remove_page(struct p2m_domain *p2m,
- gfn_t gfn = _gfn(gfn_l);
- p2m_type_t t;
- p2m_access_t a;
-+ int rc;
-
- /* IOMMU for PV guests is handled in get_page_type() and put_page(). */
- if ( !paging_mode_translate(p2m->domain) )
-@@ -812,8 +813,27 @@ p2m_remove_page(struct p2m_domain *p2m,
- set_gpfn_from_mfn(mfn+i, INVALID_M2P_ENTRY);
- }
- }
-- return p2m_set_entry(p2m, gfn, INVALID_MFN, page_order, p2m_invalid,
-- p2m->default_access);
-+ rc = p2m_set_entry(p2m, gfn, INVALID_MFN, page_order, p2m_invalid,
-+ p2m->default_access);
-+ if ( likely(!rc) || !mfn_valid(_mfn(mfn)) )
-+ return rc;
-+
-+ /*
-+ * The operation may have partially succeeded. For the failed part we need
-+ * to undo the M2P update and, out of precaution, mark the pages dirty
-+ * again.
-+ */
-+ for ( i = 0; i < (1UL << page_order); ++i )
-+ {
-+ p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0, NULL, NULL);
-+ if ( !p2m_is_hole(t) && !p2m_is_special(t) && !p2m_is_shared(t) )
-+ {
-+ set_gpfn_from_mfn(mfn + i, gfn_l + i);
-+ paging_mark_pfn_dirty(p2m->domain, _pfn(gfn_l + i));
-+ }
-+ }
-+
-+ return rc;
- }
-
- int
-@@ -1002,13 +1022,8 @@ guest_physmap_add_entry(struct domain *d
-
- /* Now, actually do the two-way mapping */
- rc = p2m_set_entry(p2m, gfn, mfn, page_order, t, p2m->default_access);
-- if ( rc == 0 )
-+ if ( likely(!rc) )
- {
-- pod_lock(p2m);
-- p2m->pod.entry_count -= pod_count;
-- BUG_ON(p2m->pod.entry_count < 0);
-- pod_unlock(p2m);
--
- if ( !p2m_is_grant(t) )
- {
- for ( i = 0; i < (1UL << page_order); i++ )
-@@ -1016,6 +1031,42 @@ guest_physmap_add_entry(struct domain *d
- gfn_x(gfn_add(gfn, i)));
- }
- }
-+ else
-+ {
-+ /*
-+ * The operation may have partially succeeded. For the successful part
-+ * we need to update M2P and dirty state, while for the failed part we
-+ * may need to adjust PoD stats as well as undo the earlier M2P update.
-+ */
-+ for ( i = 0; i < (1UL << page_order); ++i )
-+ {
-+ omfn = p2m->get_entry(p2m, gfn_add(gfn, i), &ot, &a, 0, NULL, NULL);
-+ if ( p2m_is_pod(ot) )
-+ {
-+ BUG_ON(!pod_count);
-+ --pod_count;
-+ }
-+ else if ( mfn_eq(omfn, mfn_add(mfn, i)) && ot == t &&
-+ a == p2m->default_access && !p2m_is_grant(t) )
-+ {
-+ set_gpfn_from_mfn(mfn_x(omfn), gfn_x(gfn) + i);
-+ paging_mark_pfn_dirty(d, _pfn(gfn_x(gfn) + i));
-+ }
-+ else if ( p2m_is_ram(ot) && !p2m_is_paged(ot) )
-+ {
-+ ASSERT(mfn_valid(omfn));
-+ set_gpfn_from_mfn(mfn_x(omfn), gfn_x(gfn) + i);
-+ }
-+ }
-+ }
-+
-+ if ( pod_count )
-+ {
-+ pod_lock(p2m);
-+ p2m->pod.entry_count -= pod_count;
-+ BUG_ON(p2m->pod.entry_count < 0);
-+ pod_unlock(p2m);
-+ }
-
- out:
- p2m_unlock(p2m);
-@@ -1307,6 +1358,49 @@ static int set_typed_p2m_entry(struct do
- return 0;
- }
- }
-+
-+ P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn_l, mfn_x(mfn));
-+ rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access);
-+ if ( unlikely(rc) )
-+ {
-+ gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n",
-+ gfn_l, order, rc, mfn_x(mfn));
-+
-+ /*
-+ * The operation may have partially succeeded. For the successful part
-+ * we need to update PoD stats, M2P, and dirty state.
-+ */
-+ if ( order != PAGE_ORDER_4K )
-+ {
-+ unsigned long i;
-+
-+ for ( i = 0; i < (1UL << order); ++i )
-+ {
-+ p2m_type_t t;
-+ mfn_t cmfn = p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0,
-+ NULL, NULL);
-+
-+ if ( !mfn_eq(cmfn, mfn_add(mfn, i)) || t != gfn_p2mt ||
-+ a != access )
-+ continue;
-+
-+ if ( p2m_is_ram(ot) )
-+ {
-+ ASSERT(mfn_valid(mfn_add(omfn, i)));
-+ set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY);
-+ }
-+#ifdef CONFIG_HVM
-+ else if ( p2m_is_pod(ot) )
-+ {
-+ pod_lock(p2m);
-+ BUG_ON(!p2m->pod.entry_count);
-+ --p2m->pod.entry_count;
-+ pod_unlock(p2m);
-+ }
-+#endif
-+ }
-+ }
-+ }
- else if ( p2m_is_ram(ot) )
- {
- unsigned long i;
-@@ -1317,12 +1411,6 @@ static int set_typed_p2m_entry(struct do
- set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY);
- }
- }
--
-- P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn_l, mfn_x(mfn));
-- rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access);
-- if ( rc )
-- gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n",
-- gfn_l, order, rc, mfn_x(mfn));
- #ifdef CONFIG_HVM
- else if ( p2m_is_pod(ot) )
- {