author     Natanael Copa <ncopa@alpinelinux.org>    2018-09-06 08:03:40 +0200
committer  Natanael Copa <ncopa@alpinelinux.org>    2018-09-06 08:03:40 +0200
commit     74dce6e0451466b8eb5078660886cc226f9704f4 (patch)
tree       5c2dc6bd447da5baf73f9934102b5a5d2a916a6b
parent     d72f525745c3193dfb608c0ce2fd7054bdc45e1b (diff)
main/xen: backport various security fixes
fixes #9295
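The new git.patch below appears to bundle the upstream stable-branch fixes into a single cumulative patch (the APKBUILD hunk bumps XEN_EXTRAVERSION to ".2-pre"). A minimal sketch of how such a patch is typically regenerated; the repository URL is real, but the exact tag and branch used here are assumptions, not recorded in this commit:

    git clone https://xenbits.xen.org/git-http/xen.git
    cd xen
    # diff the released tag against the maintenance branch it tracks
    git diff RELEASE-4.10.1..staging-4.10 > git.patch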
-rw-r--r--  main/xen/APKBUILD          |   31
-rw-r--r--  main/xen/git.patch         | 8547
-rw-r--r--  main/xen/xsa260-1.patch    |   72
-rw-r--r--  main/xen/xsa260-2.patch    |  110
-rw-r--r--  main/xen/xsa260-3.patch    |  138
-rw-r--r--  main/xen/xsa260-4.patch    |   72
-rw-r--r--  main/xen/xsa261.patch      |  279
-rw-r--r--  main/xen/xsa262-4.10.patch |   76
8 files changed, 8564 insertions, 761 deletions
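For reference, a typical way to refresh the sha512sums block and rebuild the package after adding git.patch and dropping the xsa26x patches, assuming the standard Alpine abuild workflow (these commands are not part of the commit):

    cd aports/main/xen
    abuild checksum   # regenerate the sha512sums= section of the APKBUILD
    abuild -r         # build the package, applying git.patch on top of xen 4.10.1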
diff --git a/main/xen/APKBUILD b/main/xen/APKBUILD
index 3958ff5032c..4c320d40cdc 100644
--- a/main/xen/APKBUILD
+++ b/main/xen/APKBUILD
@@ -3,7 +3,7 @@
# Maintainer: William Pitcock <nenolod@dereferenced.org>
pkgname=xen
pkgver=4.10.1
-pkgrel=2
+pkgrel=3
pkgdesc="Xen hypervisor"
url="http://www.xen.org/"
arch="x86_64 armhf aarch64"
@@ -119,6 +119,19 @@ options="!strip"
# - CVE-2018-8897 XSA-260
# - CVE-2018-10982 XSA-261
# - CVE-2018-10981 XSA-262
+# 4.10.1-r3:
+# - CVE-2018-14678 XSA-274
+# - CVE-2018-3646 XSA-273
+# - CVE-2018-15470 XSA-272
+# - CVE-2018-14007 XSA-271
+# - CVE-2018-15471 XSA-270
+# - CVE-2018-15468 XSA-269
+# - CVE-2018-15469 XSA-268
+# - CVE-2018-3665 XSA-267
+# - CVE-2018-12892 XSA-266
+# - CVE-2018-12893 XSA-265
+# - CVE-2018-12891 XSA-264
+# - CVE-2018-3639 XSA-263
case "$CARCH" in
x86*)
@@ -165,6 +178,7 @@ source="https://downloads.xenproject.org/release/$pkgname/$pkgver/$pkgname-$pkgv
http://xenbits.xen.org/xen-extfiles/tpm_emulator-$_TPMEMU_VERSION.tar.gz
http://xenbits.xen.org/xen-extfiles/zlib-$_ZLIB_VERSION.tar.gz
http://xenbits.xen.org/xen-extfiles/ipxe-git-$_IPXE_GIT_TAG.tar.gz
+ git.patch
qemu-xen_paths.patch
@@ -183,13 +197,6 @@ source="https://downloads.xenproject.org/release/$pkgname/$pkgver/$pkgname-$pkgv
hotplug-Linux-iscsi-block-handle-lun-1.patch
- xsa260-1.patch
- xsa260-2.patch
- xsa260-3.patch
- xsa260-4.patch
- xsa261.patch
- xsa262-4.10.patch
-
xenstored.initd
xenstored.confd
xenconsoled.initd
@@ -433,6 +440,8 @@ c2bc9ffc8583aeae71cee9ddcc4418969768d4e3764d47307da54f93981c0109fb07d84b061b3a36
4928b5b82f57645be9408362706ff2c4d9baa635b21b0d41b1c82930e8c60a759b1ea4fa74d7e6c7cae1b7692d006aa5cb72df0c3b88bf049779aa2b566f9d35 tpm_emulator-0.7.4.tar.gz
021b958fcd0d346c4ba761bcf0cc40f3522de6186cf5a0a6ea34a70504ce9622b1c2626fce40675bc8282cf5f5ade18473656abc38050f72f5d6480507a2106e zlib-1.2.3.tar.gz
bbcce5e55040e7e29adebd4a5253a046016a6e2e7ff34cf801a42d147e1ec1af57e0297318249bfa9c5bbeac969fe4b37c18cbf845a80b2136d65387a4fc31da ipxe-git-356f6c1b64d7a97746d1816cef8ca22bdd8d0b5d.tar.gz
+0fd2622469f3ff136b33a66576319920e050aac3fefa41c06306661eb6f6792fc21a4c15c8928febd10b1a14b4c712a2918532cdb23ccbddba9f1ba55d7d4478 git.patch
+7fdb705d26f100c409c354d3d249afde2ee9273e1f0028d4f320bc67325dc4ffa411ac9c59d75b31c79e2f95c17ec3ef0b4ac98de4fefb073c5f2529d3c69be2 xsa271-xapi.patch
1936ab39a1867957fa640eb81c4070214ca4856a2743ba7e49c0cd017917071a9680d015f002c57fa7b9600dbadd29dcea5887f50e6c133305df2669a7a933f3 qemu-xen_paths.patch
f095ea373f36381491ad36f0662fb4f53665031973721256b23166e596318581da7cbb0146d0beb2446729adfdb321e01468e377793f6563a67d68b8b0f7ffe3 hotplug-vif-vtrill.patch
77b08e9655e091b0352e4630d520b54c6ca6d659d1d38fbb4b3bfc9ff3e66db433a2e194ead32bb10ff962c382d800a670e82b7a62835b238e294b22808290ea musl-hvmloader-fix-stdint.patch
@@ -443,12 +452,6 @@ e76816c6ad0e91dc5f81947f266da3429b20e6d976c3e8c41202c6179532eec878a3f0913921ef3a
69dfa60628ca838678862383528654ecbdf4269cbb5c9cfb6b84d976202a8dea85d711aa65a52fa1b477fb0b30604ca70cf1337192d6fb9388a08bbe7fe56077 xenstore_client_transaction_fix.patch
2094ea964fa610b2bf72fd2c7ede7e954899a75c0f5b08030cf1d74460fb759ade84866176e32f8fe29c921dfdc6dafd2b31e23ab9b0a3874d3dceeabdd1913b xenqemu-xattr-size-max.patch
8c9cfc6afca325df1d8026e21ed03fa8cd2c7e1a21a56cc1968301c5ab634bfe849951899e75d328951d7a41273d1e49a2448edbadec0029ed410c43c0549812 hotplug-Linux-iscsi-block-handle-lun-1.patch
-08a35c2c14d51f4e004274367a948332b97f22d12b0b8f7647f5f026f3d57cfe294dd4c3f3e4d34439c9010f2efc30689e62ec805ca134cfd75fe85f0c53c94b xsa260-1.patch
-6d152a54d38a8c06a8c1293ab637c484ad6baf53b7be54a8a916143011f5042a089972c5c08e489d510356507296da8c7aa8e89b17517b1c167a95084b5389db xsa260-2.patch
-d7208e68d60581ad6a6a5f56528e7b820f0f6db56593a4b01a5c59f245e3e06596a6512f5cca6d3c88b662c787c46b98f7f0759822e375e10b2e2402c89262f6 xsa260-3.patch
-2b26451201f0b754b19f7cd7f8ffdc3b2ea083fd3f54de6cd0c29bc0dba89d5dac4b33ed58b3b80a48887ffa11d9c82ded0c60a4df5895022ff97d1b11b2357c xsa260-4.patch
-f6c55fb28915d54b05585c4ba177fd57f8a70b87930af24307c3142e97e39239f684b52c70d9051d1ac6a21a9e8eaabba482c451d7b4e3f48054a02048d5603e xsa261.patch
-aa6089f017c0e00e0e464b6f8d82dd5c8d588ccff027b175f43dd9d4efd4014ac899fceedef2005854b892ea156c7951c71183c03479cdf70c6d0298f5f76522 xsa262-4.10.patch
52c43beb2596d645934d0f909f2d21f7587b6898ed5e5e7046799a8ed6d58f7a09c5809e1634fa26152f3fd4f3e7cfa07da7076f01b4a20cc8f5df8b9cb77e50 xenstored.initd
093f7fbd43faf0a16a226486a0776bade5dc1681d281c5946a3191c32d74f9699c6bf5d0ab8de9d1195a2461165d1660788e92a3156c9b3c7054d7b2d52d7ff0 xenstored.confd
3c86ed48fbee0af4051c65c4a3893f131fa66e47bf083caf20c9b6aa4b63fdead8832f84a58d0e27964bc49ec8397251b34e5be5c212c139f556916dc8da9523 xenconsoled.initd
diff --git a/main/xen/git.patch b/main/xen/git.patch
new file mode 100644
index 00000000000..b4224d8d674
--- /dev/null
+++ b/main/xen/git.patch
@@ -0,0 +1,8547 @@
+diff --git a/docs/man/xl.conf.pod.5 b/docs/man/xl.conf.pod.5
+index da91b8626c..37262a7ef8 100644
+--- a/docs/man/xl.conf.pod.5
++++ b/docs/man/xl.conf.pod.5
+@@ -185,6 +185,28 @@ massively huge guests).
+
+ =back
+
++=item B<vm.cpumask>="CPULIST"
++
++=item B<vm.hvm.cpumask>="CPULIST"
++
++=item B<vm.pv.cpumask>="CPULIST"
++
++Global masks that are applied when creating guests and pinning vcpus
++to indicate which cpus they are allowed to run on. Specifically,
++C<vm.cpumask> applies to all guest types, C<vm.hvm.cpumask> applies to
++both HVM and PVH guests and C<vm.pv.cpumask> applies to PV guests.
++
++The hard affinity of guest's vcpus are logical-AND'ed with respective
++masks. If the resulting affinity mask is empty, operation will fail.
++
++Use --ignore-global-affinity-masks to skip applying global masks.
++
++The default value for these masks are all 1's, i.e. all cpus are allowed.
++
++Due to bug(s), these options may not interact well with other options
++concerning CPU affinity. One example is CPU pools. Users should always double
++check that the required affinity has taken effect.
++
+ =back
+
+ =head1 SEE ALSO
+diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
+index 6c673eedc8..470da80174 100644
+--- a/docs/misc/xen-command-line.markdown
++++ b/docs/misc/xen-command-line.markdown
+@@ -248,6 +248,9 @@ the NMI watchdog is also enabled.
+ ### bti (x86)
+ > `= List of [ thunk=retpoline|lfence|jmp, ibrs=<bool>, ibpb=<bool>, rsb_{vmexit,native}=<bool> ]`
+
++**WARNING: This command line option is deprecated, and superseded by
++_spec-ctrl=_ - using both options in combination is undefined.**
++
+ Branch Target Injection controls. By default, Xen will pick the most
+ appropriate BTI mitigations based on compiled in support, loaded microcode,
+ and hardware details.
+@@ -493,9 +496,10 @@ accounting for hardware capabilities as enumerated via CPUID.
+
+ Currently accepted:
+
+-The Speculation Control hardware features `ibrsb`, `stibp`, `ibpb` are used by
+-default if avaiable. They can be ignored, e.g. `no-ibrsb`, at which point Xen
+-won't use them itself, and won't offer them to guests.
++The Speculation Control hardware features `ibrsb`, `stibp`, `ibpb`,
++`l1d-flush` and `ssbd` are used by default if available and applicable. They can
++be ignored, e.g. `no-ibrsb`, at which point Xen won't use them itself, and
++won't offer them to guests.
+
+ ### cpuid\_mask\_cpu (AMD only)
+ > `= fam_0f_rev_c | fam_0f_rev_d | fam_0f_rev_e | fam_0f_rev_f | fam_0f_rev_g | fam_10_rev_b | fam_10_rev_c | fam_11_rev_b`
+@@ -916,6 +920,21 @@ Controls EPT related features.
+
+ Specify which console gdbstub should use. See **console**.
+
++### gnttab
++> `= List of [ max-ver:<integer>, transitive=<bool> ]`
++
++> Default: `gnttab=max-ver:2,transitive`
++
++Control various aspects of the grant table behaviour available to guests.
++
++* `max-ver` Select the maximum grant table version to offer to guests. Valid
++version are 1 and 2.
++* `transitive` Permit or disallow the use of transitive grants. Note that the
++use of grant table v2 without transitive grants is an ABI breakage from the
++guests point of view.
++
++The usage of gnttab v2 is not security supported on ARM platforms.
++
+ ### gnttab\_max\_frames
+ > `= <integer>`
+
+@@ -1348,6 +1367,15 @@ Because responsibility for APIC setup is shared between Xen and the
+ domain 0 kernel this option is automatically propagated to the domain
+ 0 command line.
+
++### invpcid (x86)
++> `= <boolean>`
++
++> Default: `true`
++
++By default, Xen will use the INVPCID instruction for TLB management if
++it is available. This option can be used to cause Xen to fall back to
++older mechanisms, which are generally slower.
++
+ ### noirqbalance
+ > `= <boolean>`
+
+@@ -1426,6 +1454,20 @@ Flag to enable Memory Protection Keys.
+ The protection-key feature provides an additional mechanism by which IA-32e
+ paging controls access to usermode addresses.
+
++### pcid (x86)
++> `= <boolean> | xpti=<bool>`
++
++> Default: `xpti`
++
++> Can be modified at runtime (change takes effect only for domains created
++ afterwards)
++
++If available, control usage of the PCID feature of the processor for
++64-bit pv-domains. PCID can be used either for no domain at all (`false`),
++for all of them (`true`), only for those subject to XPTI (`xpti`) or for
++those not subject to XPTI (`no-xpti`). The feature is used only in case
++INVPCID is supported and not disabled via `invpcid=false`.
++
+ ### psr (Intel)
+ > `= List of ( cmt:<boolean> | rmid_max:<integer> | cat:<boolean> | cos_max:<integer> | cdp:<boolean> )`
+
+@@ -1486,6 +1528,30 @@ do; there may be other custom operating systems which do. If you're
+ certain you don't plan on having PV guests which use this feature,
+ turning it off can reduce the attack surface.
+
++### pv-l1tf (x86)
++> `= List of [ <bool>, dom0=<bool>, domu=<bool> ]`
++
++> Default: `false` on believed-unaffected hardware, or in pv-shim mode.
++> `domu` on believed-affected hardware.
++
++Mitigations for L1TF / XSA-273 / CVE-2018-3620 for PV guests.
++
++For backwards compatibility, we may not alter an architecturally-legitimate
++pagetable entry a PV guest chooses to write. We can however force such a
++guest into shadow mode so that Xen controls the PTEs which are reachable by
++the CPU pagewalk.
++
++Shadowing is performed at the point where a PV guest first tries to write an
++L1TF-vulnerable PTE. Therefore, a PV guest kernel which has been updated with
++its own L1TF mitigations will not trigger shadow mode if it is well behaved.
++
++If CONFIG\_SHADOW\_PAGING is not compiled in, this mitigation instead crashes
++the guest when an L1TF-vulnerable PTE is written, which still allows updated,
++well-behaved PV guests to run, despite Shadow being compiled out.
++
++In the pv-shim case, Shadow is expected to be compiled out, and a malicious
++guest kernel can only leak data from the shim Xen, rather than the host Xen.
++
+ ### pv-shim (x86)
+ > `= <boolean>`
+
+@@ -1690,6 +1756,13 @@ Use `smap=hvm` to allow SMAP use by HVM guests only.
+ Flag to enable Supervisor Mode Execution Protection
+ Use `smep=hvm` to allow SMEP use by HVM guests only.
+
++### smt (x86)
++> `= <boolean>`
++
++Default: `true`
++
++Control bring up of multiple hyper-threads per CPU core.
++
+ ### snb\_igd\_quirk
+ > `= <boolean> | cap | <integer>`
+
+@@ -1698,6 +1771,75 @@ enforces the maximum theoretically necessary timeout of 670ms. Any number
+ is being interpreted as a custom timeout in milliseconds. Zero or boolean
+ false disable the quirk workaround, which is also the default.
+
++### spec-ctrl (x86)
++> `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb}=<bool>,
++> bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu,
++> l1d-flush}=<bool> ]`
++
++Controls for speculative execution sidechannel mitigations. By default, Xen
++will pick the most appropriate mitigations based on compiled in support,
++loaded microcode, and hardware details, and will virtualise appropriate
++mitigations for guests to use.
++
++**WARNING: Any use of this option may interfere with heuristics. Use with
++extreme care.**
++
++An overall boolean value, `spec-ctrl=no`, can be specified to turn off all
++mitigations, including pieces of infrastructure used to virtualise certain
++mitigation features for guests. This also includes settings which `xpti`,
++`smt`, `pv-l1tf` control, unless the respective option(s) have been
++specified earlier on the command line.
++
++Alternatively, a slightly more restricted `spec-ctrl=no-xen` can be used to
++turn off all of Xen's mitigations, while leaving the virtualisation support
++in place for guests to use.
++
++Use of a positive boolean value for either of these options is invalid.
++
++The booleans `pv=`, `hvm=`, `msr-sc=` and `rsb=` offer fine grained control
++over the alternative blocks used by Xen. These impact Xen's ability to
++protect itself, and Xen's ability to virtualise support for guests to use.
++
++* `pv=` and `hvm=` offer control over all suboptions for PV and HVM guests
++ respectively.
++* `msr-sc=` offers control over Xen's support for manipulating MSR\_SPEC\_CTRL
++ on entry and exit. These blocks are necessary to virtualise support for
++ guests and if disabled, guests will be unable to use IBRS/STIBP/SSBD/etc.
++* `rsb=` offers control over whether to overwrite the Return Stack Buffer /
++ Return Address Stack on entry to Xen.
++
++If Xen was compiled with INDIRECT\_THUNK support, `bti-thunk=` can be used to
++select which of the thunks gets patched into the `__x86_indirect_thunk_%reg`
++locations. The default thunk is `retpoline` (generally preferred for Intel
++hardware), with the alternatives being `jmp` (a `jmp *%reg` gadget, minimal
++overhead), and `lfence` (an `lfence; jmp *%reg` gadget, preferred for AMD).
++
++On hardware supporting IBRS (Indirect Branch Restricted Speculation), the
++`ibrs=` option can be used to force or prevent Xen using the feature itself.
++If Xen is not using IBRS itself, functionality is still set up so IBRS can be
++virtualised for guests.
++
++On hardware supporting IBPB (Indirect Branch Prediction Barrier), the `ibpb=`
++option can be used to force (the default) or prevent Xen from issuing branch
++prediction barriers on vcpu context switches.
++
++On hardware supporting SSBD (Speculative Store Bypass Disable), the `ssbd=`
++option can be used to force or prevent Xen using the feature itself. On AMD
++hardware, this is a global option applied at boot, and not virtualised for
++guest use. On Intel hardware, the feature is virtualised for guests,
++independently of Xen's choice of setting.
++
++On all hardware, the `eager-fpu=` option can be used to force or prevent Xen
++from using fully eager FPU context switches. This is currently implemented as
++a global control. By default, Xen will choose to use fully eager context
++switches on hardware believed to speculate past #NM exceptions.
++
++On hardware supporting L1D_FLUSH, the `l1d-flush=` option can be used to force
++or prevent Xen from issuing an L1 data cache flush on each VMEntry.
++Irrespective of Xen's setting, the feature is virtualised for HVM guests to
++use. By default, Xen will enable this mitigation on hardware believed to be
++vulnerable to L1TF.
++
+ ### sync\_console
+ > `= <boolean>`
+
+@@ -1923,14 +2065,24 @@ clustered mode. The default, given no hint from the **FADT**, is cluster
+ mode.
+
+ ### xpti
+-> `= <boolean>`
++> `= List of [ default | <boolean> | dom0=<bool> | domu=<bool> ]`
+
+-> Default: `false` on AMD hardware
++> Default: `false` on hardware not to be vulnerable to Meltdown (e.g. AMD)
+ > Default: `true` everywhere else
+
+ Override default selection of whether to isolate 64-bit PV guest page
+ tables.
+
++`true` activates page table isolation even on hardware not vulnerable by
++Meltdown for all domains.
++
++`false` deactivates page table isolation on all systems for all domains.
++
++`default` sets the default behaviour.
++
++With `dom0` and `domu` it is possible to control page table isolation
++for dom0 or guest domains only.
++
+ ### xsave
+ > `= <boolean>`
+
+diff --git a/tools/Makefile b/tools/Makefile
+index ab7a01ee1b..67977ad850 100644
+--- a/tools/Makefile
++++ b/tools/Makefile
+@@ -232,7 +232,7 @@ subdir-all-qemu-xen-dir: qemu-xen-dir-find
+ else \
+ enable_trace_backend='' ; \
+ fi ; \
+- PKG_CONFIG_PATH=$(XEN_ROOT)/tools/pkg-config \
++ PKG_CONFIG_PATH=$(XEN_ROOT)/tools/pkg-config$${PKG_CONFIG_PATH:+:$${PKG_CONFIG_PATH}} \
+ $$source/configure --enable-xen --target-list=i386-softmmu \
+ $(QEMU_XEN_ENABLE_DEBUG) \
+ $$enable_trace_backend \
+diff --git a/tools/examples/xl.conf b/tools/examples/xl.conf
+index 374b6bbc2e..0446deb304 100644
+--- a/tools/examples/xl.conf
++++ b/tools/examples/xl.conf
+@@ -37,3 +37,8 @@
+ # (which can take a long time to find out if launching huge guests).
+ # see xl.conf(5) for details.
+ #claim_mode=1
++
++# Specify global vcpu hard affinity masks. See xl.conf(5) for details.
++#vm.cpumask="0-7"
++#vm.pv.cpumask="0-3"
++#vm.hvm.cpumask="3-7"
+diff --git a/tools/libacpi/Makefile b/tools/libacpi/Makefile
+index a47a658a25..c17f3924cc 100644
+--- a/tools/libacpi/Makefile
++++ b/tools/libacpi/Makefile
+@@ -43,7 +43,7 @@ all: $(C_SRC) $(H_SRC)
+
+ $(H_SRC): $(ACPI_BUILD_DIR)/%.h: %.asl iasl
+ iasl -vs -p $(ACPI_BUILD_DIR)/$*.$(TMP_SUFFIX) -tc $<
+- sed -e 's/AmlCode/$*/g' $(ACPI_BUILD_DIR)/$*.hex >$@
++ sed -e 's/AmlCode/$*/g' -e 's/_aml_code//g' $(ACPI_BUILD_DIR)/$*.hex >$@
+ rm -f $(addprefix $(ACPI_BUILD_DIR)/, $*.aml $*.hex)
+
+ $(MK_DSDT): mk_dsdt.c
+@@ -76,7 +76,7 @@ $(ACPI_BUILD_DIR)/dsdt_anycpu_arm.asl: $(MK_DSDT)
+
+ $(C_SRC): $(ACPI_BUILD_DIR)/%.c: iasl $(ACPI_BUILD_DIR)/%.asl
+ iasl -vs -p $(ACPI_BUILD_DIR)/$*.$(TMP_SUFFIX) -tc $(ACPI_BUILD_DIR)/$*.asl
+- sed -e 's/AmlCode/$*/g' $(ACPI_BUILD_DIR)/$*.hex > $@.$(TMP_SUFFIX)
++ sed -e 's/AmlCode/$*/g' -e 's/_aml_code//g' $(ACPI_BUILD_DIR)/$*.hex > $@.$(TMP_SUFFIX)
+ echo "int $*_len=sizeof($*);" >> $@.$(TMP_SUFFIX)
+ mv -f $@.$(TMP_SUFFIX) $@
+ rm -f $(addprefix $(ACPI_BUILD_DIR)/, $*.aml $*.hex)
+diff --git a/tools/libxc/xc_cpuid_x86.c b/tools/libxc/xc_cpuid_x86.c
+index 9fa2f7c360..21537f06f1 100644
+--- a/tools/libxc/xc_cpuid_x86.c
++++ b/tools/libxc/xc_cpuid_x86.c
+@@ -575,6 +575,12 @@ static void xc_cpuid_pv_policy(xc_interface *xch,
+ break;
+ }
+
++ case 0x80000008:
++ regs[0] &= 0x0000ffffu;
++ regs[1] = info->featureset[featureword_of(X86_FEATURE_CLZERO)];
++ regs[2] = regs[3] = 0;
++ break;
++
+ case 0x00000005: /* MONITOR/MWAIT */
+ case 0x0000000b: /* Extended Topology Enumeration */
+ case 0x8000000a: /* SVM revision and features */
+diff --git a/tools/libxl/libxl_cpuid.c b/tools/libxl/libxl_cpuid.c
+index 3a21f4e7da..52e16c20ed 100644
+--- a/tools/libxl/libxl_cpuid.c
++++ b/tools/libxl/libxl_cpuid.c
+@@ -204,7 +204,9 @@ int libxl_cpuid_parse_config(libxl_cpuid_policy_list *cpuid, const char* str)
+ {"avx512-4fmaps",0x00000007, 0, CPUID_REG_EDX, 3, 1},
+ {"ibrsb", 0x00000007, 0, CPUID_REG_EDX, 26, 1},
+ {"stibp", 0x00000007, 0, CPUID_REG_EDX, 27, 1},
++ {"l1d-flush", 0x00000007, 0, CPUID_REG_EDX, 28, 1},
+ {"arch-caps", 0x00000007, 0, CPUID_REG_EDX, 29, 1},
++ {"ssbd", 0x00000007, 0, CPUID_REG_EDX, 31, 1},
+
+ {"lahfsahf", 0x80000001, NA, CPUID_REG_ECX, 0, 1},
+ {"cmplegacy", 0x80000001, NA, CPUID_REG_ECX, 1, 1},
+diff --git a/tools/libxl/libxl_dm.c b/tools/libxl/libxl_dm.c
+index b51178b9fd..07399bb8e0 100644
+--- a/tools/libxl/libxl_dm.c
++++ b/tools/libxl/libxl_dm.c
+@@ -798,6 +798,8 @@ static char *qemu_disk_scsi_drive_string(libxl__gc *gc, const char *target_path,
+ int colo_mode)
+ {
+ char *drive = NULL;
++ char *common = GCSPRINTF("cache=writeback,readonly=%s",
++ disk->readwrite ? "off" : "on");
+ const char *exportname = disk->colo_export;
+ const char *active_disk = disk->active_disk;
+ const char *hidden_disk = disk->hidden_disk;
+@@ -805,8 +807,8 @@ static char *qemu_disk_scsi_drive_string(libxl__gc *gc, const char *target_path,
+ switch (colo_mode) {
+ case LIBXL__COLO_NONE:
+ drive = libxl__sprintf
+- (gc, "file=%s,if=scsi,bus=0,unit=%d,format=%s,cache=writeback",
+- target_path, unit, format);
++ (gc, "%s,file=%s,if=scsi,bus=0,unit=%d,format=%s",
++ common, target_path, unit, format);
+ break;
+ case LIBXL__COLO_PRIMARY:
+ /*
+@@ -819,13 +821,13 @@ static char *qemu_disk_scsi_drive_string(libxl__gc *gc, const char *target_path,
+ * vote-threshold=1
+ */
+ drive = GCSPRINTF(
+- "if=scsi,bus=0,unit=%d,cache=writeback,driver=quorum,"
++ "%s,if=scsi,bus=0,unit=%d,,driver=quorum,"
+ "id=%s,"
+ "children.0.file.filename=%s,"
+ "children.0.driver=%s,"
+ "read-pattern=fifo,"
+ "vote-threshold=1",
+- unit, exportname, target_path, format);
++ common, unit, exportname, target_path, format);
+ break;
+ case LIBXL__COLO_SECONDARY:
+ /*
+@@ -839,7 +841,7 @@ static char *qemu_disk_scsi_drive_string(libxl__gc *gc, const char *target_path,
+ * file.backing.backing=exportname,
+ */
+ drive = GCSPRINTF(
+- "if=scsi,id=top-colo,bus=0,unit=%d,cache=writeback,"
++ "%s,if=scsi,id=top-colo,bus=0,unit=%d,"
+ "driver=replication,"
+ "mode=secondary,"
+ "top-id=top-colo,"
+@@ -848,7 +850,7 @@ static char *qemu_disk_scsi_drive_string(libxl__gc *gc, const char *target_path,
+ "file.backing.driver=qcow2,"
+ "file.backing.file.filename=%s,"
+ "file.backing.backing=%s",
+- unit, active_disk, hidden_disk, exportname);
++ common, unit, active_disk, hidden_disk, exportname);
+ break;
+ default:
+ abort();
+@@ -866,6 +868,8 @@ static char *qemu_disk_ide_drive_string(libxl__gc *gc, const char *target_path,
+ const char *exportname = disk->colo_export;
+ const char *active_disk = disk->active_disk;
+ const char *hidden_disk = disk->hidden_disk;
++
++ assert(disk->readwrite); /* should have been checked earlier */
+
+ switch (colo_mode) {
+ case LIBXL__COLO_NONE:
+@@ -1575,8 +1579,9 @@ static int libxl__build_device_model_args_new(libxl__gc *gc,
+ if (strncmp(disks[i].vdev, "sd", 2) == 0) {
+ if (colo_mode == LIBXL__COLO_SECONDARY) {
+ drive = libxl__sprintf
+- (gc, "if=none,driver=%s,file=%s,id=%s",
+- format, target_path, disks[i].colo_export);
++ (gc, "if=none,driver=%s,file=%s,id=%s,readonly=%s",
++ format, target_path, disks[i].colo_export,
++ disks[i].readwrite ? "off" : "on");
+
+ flexarray_append(dm_args, "-drive");
+ flexarray_append(dm_args, drive);
+@@ -2586,7 +2591,7 @@ int libxl__need_xenpv_qemu(libxl__gc *gc, libxl_domain_config *d_config)
+ goto out;
+ }
+
+- if (d_config->num_vfbs > 0) {
++ if (d_config->num_vfbs > 0 || d_config->num_p9s > 0) {
+ ret = 1;
+ goto out;
+ }
+diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c
+index b1a46c667d..85298d277d 100644
+--- a/tools/misc/xen-cpuid.c
++++ b/tools/misc/xen-cpuid.c
+@@ -165,9 +165,8 @@ static const char *str_7d0[32] =
+ [4 ... 25] = "REZ",
+
+ [26] = "ibrsb", [27] = "stibp",
+- [28] = "REZ", [29] = "arch_caps",
+-
+- [30 ... 31] = "REZ",
++ [28] = "l1d_flush", [29] = "arch_caps",
++ [30] = "REZ", [31] = "ssbd",
+ };
+
+ static struct {
+diff --git a/tools/ocaml/xenstored/store.ml b/tools/ocaml/xenstored/store.ml
+index 13cf3b5bf4..5a8c377603 100644
+--- a/tools/ocaml/xenstored/store.ml
++++ b/tools/ocaml/xenstored/store.ml
+@@ -262,7 +262,8 @@ let path_write store perm path value =
+ Node.check_perm store.root perm Perms.WRITE;
+ Node.set_value store.root value, false
+ ) else
+- Path.apply_modify store.root path do_write, !node_created
++ let root = Path.apply_modify store.root path do_write in
++ root, !node_created
+
+ let path_rm store perm path =
+ let do_rm node name =
+diff --git a/tools/xl/xl.c b/tools/xl/xl.c
+index 179908b4f6..7d2142f16f 100644
+--- a/tools/xl/xl.c
++++ b/tools/xl/xl.c
+@@ -28,6 +28,9 @@
+ #include <libxl_utils.h>
+ #include <libxlutil.h>
+ #include "xl.h"
++#include "xl_parse.h"
++
++#include "xl_utils.h"
+
+ xentoollog_logger_stdiostream *logger;
+ int dryrun_only;
+@@ -42,6 +45,9 @@ char *default_gatewaydev = NULL;
+ char *default_vifbackend = NULL;
+ char *default_remus_netbufscript = NULL;
+ char *default_colo_proxy_script = NULL;
++libxl_bitmap global_vm_affinity_mask;
++libxl_bitmap global_hvm_affinity_mask;
++libxl_bitmap global_pv_affinity_mask;
+ enum output_format default_output_format = OUTPUT_FORMAT_JSON;
+ int claim_mode = 1;
+ bool progress_use_cr = 0;
+@@ -203,6 +209,26 @@ static void parse_global_config(const char *configfile,
+ if (!xlu_cfg_get_long (config, "max_maptrack_frames", &l, 0))
+ max_maptrack_frames = l;
+
++ libxl_bitmap_init(&global_vm_affinity_mask);
++ libxl_cpu_bitmap_alloc(ctx, &global_vm_affinity_mask, 0);
++ libxl_bitmap_init(&global_hvm_affinity_mask);
++ libxl_cpu_bitmap_alloc(ctx, &global_hvm_affinity_mask, 0);
++ libxl_bitmap_init(&global_pv_affinity_mask);
++ libxl_cpu_bitmap_alloc(ctx, &global_pv_affinity_mask, 0);
++
++ if (!xlu_cfg_get_string (config, "vm.cpumask", &buf, 0))
++ parse_cpurange(buf, &global_vm_affinity_mask);
++ else
++ libxl_bitmap_set_any(&global_vm_affinity_mask);
++ if (!xlu_cfg_get_string (config, "vm.hvm.cpumask", &buf, 0))
++ parse_cpurange(buf, &global_hvm_affinity_mask);
++ else
++ libxl_bitmap_set_any(&global_hvm_affinity_mask);
++ if (!xlu_cfg_get_string (config, "vm.pv.cpumask", &buf, 0))
++ parse_cpurange(buf, &global_pv_affinity_mask);
++ else
++ libxl_bitmap_set_any(&global_pv_affinity_mask);
++
+ xlu_cfg_destroy(config);
+ }
+
+diff --git a/tools/xl/xl.h b/tools/xl/xl.h
+index 6b60d1db50..7b9f58fc6c 100644
+--- a/tools/xl/xl.h
++++ b/tools/xl/xl.h
+@@ -41,6 +41,7 @@ struct domain_create {
+ int vncautopass;
+ int console_autoconnect;
+ int checkpointed_stream;
++ int ignore_global_affinity_masks;
+ const char *config_file;
+ char *extra_config; /* extra config string */
+ const char *restore_file;
+@@ -277,6 +278,9 @@ extern char *default_colo_proxy_script;
+ extern char *blkdev_start;
+ extern int max_grant_frames;
+ extern int max_maptrack_frames;
++extern libxl_bitmap global_vm_affinity_mask;
++extern libxl_bitmap global_hvm_affinity_mask;
++extern libxl_bitmap global_pv_affinity_mask;
+
+ enum output_format {
+ OUTPUT_FORMAT_JSON,
+@@ -292,6 +296,9 @@ typedef enum {
+ } domain_restart_type;
+
+ extern void printf_info_sexp(int domid, libxl_domain_config *d_config, FILE *fh);
++extern void apply_global_affinity_masks(libxl_domain_type type,
++ libxl_bitmap *vcpu_affinity_array,
++ unsigned int size);
+
+ #define XL_GLOBAL_CONFIG XEN_CONFIG_DIR "/xl.conf"
+ #define XL_LOCK_FILE XEN_LOCK_DIR "/xl"
+diff --git a/tools/xl/xl_cmdtable.c b/tools/xl/xl_cmdtable.c
+index 5546cf66e7..1a6c28dfdc 100644
+--- a/tools/xl/xl_cmdtable.c
++++ b/tools/xl/xl_cmdtable.c
+@@ -34,7 +34,8 @@ struct cmd_spec cmd_table[] = {
+ "-e Do not wait in the background for the death of the domain.\n"
+ "-V, --vncviewer Connect to the VNC display after the domain is created.\n"
+ "-A, --vncviewer-autopass\n"
+- " Pass VNC password to viewer via stdin."
++ " Pass VNC password to viewer via stdin.\n"
++ "--ignore-global-affinity-masks Ignore global masks in xl.conf."
+ },
+ { "config-update",
+ &main_config_update, 1, 1,
+@@ -224,7 +225,8 @@ struct cmd_spec cmd_table[] = {
+ &main_vcpupin, 1, 1,
+ "Set which CPUs a VCPU can use",
+ "[option] <Domain> <VCPU|all> <Hard affinity|-|all> <Soft affinity|-|all>",
+- "-f, --force undo an override pinning done by the kernel",
++ "-f, --force undo an override pinning done by the kernel\n"
++ "--ignore-global-affinity-masks Ignore global masks in xl.conf",
+ },
+ { "vcpu-set",
+ &main_vcpuset, 0, 1,
+diff --git a/tools/xl/xl_vcpu.c b/tools/xl/xl_vcpu.c
+index 8e735b38c1..3384eeed06 100644
+--- a/tools/xl/xl_vcpu.c
++++ b/tools/xl/xl_vcpu.c
+@@ -68,6 +68,61 @@ static void print_domain_vcpuinfo(uint32_t domid, uint32_t nr_cpus)
+ libxl_vcpuinfo_list_free(vcpuinfo, nb_vcpu);
+ }
+
++void apply_global_affinity_masks(libxl_domain_type type,
++ libxl_bitmap *vcpu_affinity_array,
++ unsigned int size)
++{
++ libxl_bitmap *mask = &global_vm_affinity_mask;
++ libxl_bitmap *type_mask;
++ unsigned int i;
++
++ switch (type) {
++ case LIBXL_DOMAIN_TYPE_HVM:
++ case LIBXL_DOMAIN_TYPE_PVH:
++ type_mask = &global_hvm_affinity_mask;
++ break;
++ case LIBXL_DOMAIN_TYPE_PV:
++ type_mask = &global_pv_affinity_mask;
++ break;
++ default:
++ fprintf(stderr, "Unknown guest type\n");
++ exit(EXIT_FAILURE);
++ }
++
++ for (i = 0; i < size; i++) {
++ int rc;
++ libxl_bitmap *t = &vcpu_affinity_array[i];
++ libxl_bitmap b1, b2;
++
++ libxl_bitmap_init(&b1);
++ libxl_bitmap_init(&b2);
++
++ rc = libxl_bitmap_and(ctx, &b1, t, mask);
++ if (rc) {
++ fprintf(stderr, "libxl_bitmap_and errored\n");
++ exit(EXIT_FAILURE);
++ }
++ rc = libxl_bitmap_and(ctx, &b2, &b1, type_mask);
++ if (rc) {
++ fprintf(stderr, "libxl_bitmap_and errored\n");
++ exit(EXIT_FAILURE);
++ }
++
++ if (libxl_bitmap_is_empty(&b2)) {
++ fprintf(stderr, "vcpu hard affinity map is empty\n");
++ exit(EXIT_FAILURE);
++ }
++
++ /* Replace target bitmap with the result */
++ libxl_bitmap_dispose(t);
++ libxl_bitmap_init(t);
++ libxl_bitmap_copy_alloc(ctx, t, &b2);
++
++ libxl_bitmap_dispose(&b1);
++ libxl_bitmap_dispose(&b2);
++ }
++}
++
+ static void vcpulist(int argc, char **argv)
+ {
+ libxl_dominfo *dominfo;
+@@ -118,6 +173,7 @@ int main_vcpupin(int argc, char **argv)
+ {
+ static struct option opts[] = {
+ {"force", 0, 0, 'f'},
++ {"ignore-global-affinity-masks", 0, 0, 'i'},
+ COMMON_LONG_OPTS
+ };
+ libxl_vcpuinfo *vcpuinfo;
+@@ -132,15 +188,18 @@ int main_vcpupin(int argc, char **argv)
+ const char *vcpu, *hard_str, *soft_str;
+ char *endptr;
+ int opt, nb_cpu, nb_vcpu, rc = EXIT_FAILURE;
+- bool force = false;
++ bool force = false, ignore_masks = false;
+
+ libxl_bitmap_init(&cpumap_hard);
+ libxl_bitmap_init(&cpumap_soft);
+
+- SWITCH_FOREACH_OPT(opt, "f", opts, "vcpu-pin", 3) {
++ SWITCH_FOREACH_OPT(opt, "fi", opts, "vcpu-pin", 3) {
+ case 'f':
+ force = true;
+ break;
++ case 'i':
++ ignore_masks = true;
++ break;
+ default:
+ break;
+ }
+@@ -222,6 +281,23 @@ int main_vcpupin(int argc, char **argv)
+ goto out;
+ }
+
++ /* Only hard affinity matters here */
++ if (!ignore_masks) {
++ libxl_domain_config d_config;
++
++ libxl_domain_config_init(&d_config);
++ rc = libxl_retrieve_domain_configuration(ctx, domid, &d_config);
++ if (rc) {
++ fprintf(stderr, "Could not retrieve domain configuration\n");
++ libxl_domain_config_dispose(&d_config);
++ goto out;
++ }
++
++ apply_global_affinity_masks(d_config.b_info.type, hard, 1);
++
++ libxl_domain_config_dispose(&d_config);
++ }
++
+ if (force) {
+ if (libxl_set_vcpuaffinity_force(ctx, domid, vcpuid, hard, soft)) {
+ fprintf(stderr, "Could not set affinity for vcpu `%ld'.\n",
+diff --git a/tools/xl/xl_vmcontrol.c b/tools/xl/xl_vmcontrol.c
+index 89c2b25ded..a1d633795c 100644
+--- a/tools/xl/xl_vmcontrol.c
++++ b/tools/xl/xl_vmcontrol.c
+@@ -804,6 +804,36 @@ int create_domain(struct domain_create *dom_info)
+ parse_config_data(config_source, config_data, config_len, &d_config);
+ }
+
++ if (!dom_info->ignore_global_affinity_masks) {
++ libxl_domain_build_info *b_info = &d_config.b_info;
++
++ /* It is possible that no hard affinity is specified in config file.
++ * Generate hard affinity maps now if we care about those.
++ */
++ if (b_info->num_vcpu_hard_affinity == 0 &&
++ (!libxl_bitmap_is_full(&global_vm_affinity_mask) ||
++ (b_info->type == LIBXL_DOMAIN_TYPE_PV &&
++ !libxl_bitmap_is_full(&global_pv_affinity_mask)) ||
++ (b_info->type != LIBXL_DOMAIN_TYPE_PV &&
++ !libxl_bitmap_is_full(&global_hvm_affinity_mask))
++ )) {
++ b_info->num_vcpu_hard_affinity = b_info->max_vcpus;
++ b_info->vcpu_hard_affinity =
++ xmalloc(b_info->max_vcpus * sizeof(libxl_bitmap));
++
++ for (i = 0; i < b_info->num_vcpu_hard_affinity; i++) {
++ libxl_bitmap *m = &b_info->vcpu_hard_affinity[i];
++ libxl_bitmap_init(m);
++ libxl_cpu_bitmap_alloc(ctx, m, 0);
++ libxl_bitmap_set_any(m);
++ }
++ }
++
++ apply_global_affinity_masks(b_info->type,
++ b_info->vcpu_hard_affinity,
++ b_info->num_vcpu_hard_affinity);
++ }
++
+ if (migrate_fd >= 0) {
+ if (d_config.c_info.name) {
+ /* when we receive a domain we get its name from the config
+@@ -1124,7 +1154,7 @@ int main_create(int argc, char **argv)
+ const char *filename = NULL;
+ struct domain_create dom_info;
+ int paused = 0, debug = 0, daemonize = 1, console_autoconnect = 0,
+- quiet = 0, monitor = 1, vnc = 0, vncautopass = 0;
++ quiet = 0, monitor = 1, vnc = 0, vncautopass = 0, ignore_masks = 0;
+ int opt, rc;
+ static struct option opts[] = {
+ {"dryrun", 0, 0, 'n'},
+@@ -1132,6 +1162,7 @@ int main_create(int argc, char **argv)
+ {"defconfig", 1, 0, 'f'},
+ {"vncviewer", 0, 0, 'V'},
+ {"vncviewer-autopass", 0, 0, 'A'},
++ {"ignore-global-affinity-masks", 0, 0, 'i'},
+ COMMON_LONG_OPTS
+ };
+
+@@ -1142,7 +1173,7 @@ int main_create(int argc, char **argv)
+ argc--; argv++;
+ }
+
+- SWITCH_FOREACH_OPT(opt, "Fnqf:pcdeVA", opts, "create", 0) {
++ SWITCH_FOREACH_OPT(opt, "Fnqf:pcdeVAi", opts, "create", 0) {
+ case 'f':
+ filename = optarg;
+ break;
+@@ -1174,6 +1205,9 @@ int main_create(int argc, char **argv)
+ case 'A':
+ vnc = vncautopass = 1;
+ break;
++ case 'i':
++ ignore_masks = 1;
++ break;
+ }
+
+ memset(&dom_info, 0, sizeof(dom_info));
+@@ -1203,6 +1237,7 @@ int main_create(int argc, char **argv)
+ dom_info.vnc = vnc;
+ dom_info.vncautopass = vncautopass;
+ dom_info.console_autoconnect = console_autoconnect;
++ dom_info.ignore_global_affinity_masks = ignore_masks;
+
+ rc = create_domain(&dom_info);
+ if (rc < 0) {
+diff --git a/xen/Makefile b/xen/Makefile
+index ecec297b9b..580af86931 100644
+--- a/xen/Makefile
++++ b/xen/Makefile
+@@ -2,7 +2,7 @@
+ # All other places this is stored (eg. compile.h) should be autogenerated.
+ export XEN_VERSION = 4
+ export XEN_SUBVERSION = 10
+-export XEN_EXTRAVERSION ?= .1$(XEN_VENDORVERSION)
++export XEN_EXTRAVERSION ?= .2-pre$(XEN_VENDORVERSION)
+ export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION)
+ -include xen-version
+
+diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig
+index f621e799ed..33535ca9aa 100644
+--- a/xen/arch/x86/Kconfig
++++ b/xen/arch/x86/Kconfig
+@@ -71,6 +71,7 @@ config SHADOW_PAGING
+ * Running HVM guests on hardware lacking hardware paging support
+ (First-generation Intel VT-x or AMD SVM).
+ * Live migration of PV guests.
++ * L1TF sidechannel mitigation for PV guests.
+
+ Under a small number of specific workloads, shadow paging may be
+ deliberately used as a performance optimisation.
+diff --git a/xen/arch/x86/Rules.mk b/xen/arch/x86/Rules.mk
+index 70e9d8f5cf..03b1b581f3 100644
+--- a/xen/arch/x86/Rules.mk
++++ b/xen/arch/x86/Rules.mk
+@@ -23,6 +23,7 @@ $(call as-insn-check,CFLAGS,CC,"rdseed %eax",-DHAVE_GAS_RDSEED)
+ $(call as-insn-check,CFLAGS,CC,".equ \"x\"$$(comma)1", \
+ -U__OBJECT_LABEL__ -DHAVE_GAS_QUOTED_SYM \
+ '-D__OBJECT_LABEL__=$(subst $(BASEDIR)/,,$(CURDIR))/$$@')
++$(call as-insn-check,CFLAGS,CC,"invpcid (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID)
+
+ CFLAGS += -mno-red-zone -mno-sse -fpic
+ CFLAGS += -fno-asynchronous-unwind-tables
+diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c
+index 1e4e5680a7..f3480aa800 100644
+--- a/xen/arch/x86/acpi/power.c
++++ b/xen/arch/x86/acpi/power.c
+@@ -28,6 +28,7 @@
+ #include <asm/tboot.h>
+ #include <asm/apic.h>
+ #include <asm/io_apic.h>
++#include <asm/spec_ctrl.h>
+ #include <acpi/cpufreq/cpufreq.h>
+
+ uint32_t system_reset_counter = 1;
+@@ -163,6 +164,7 @@ static int enter_state(u32 state)
+ {
+ unsigned long flags;
+ int error;
++ struct cpu_info *ci;
+ unsigned long cr4;
+
+ if ( (state <= ACPI_STATE_S0) || (state > ACPI_S_STATES_MAX) )
+@@ -203,12 +205,18 @@ static int enter_state(u32 state)
+ printk(XENLOG_ERR "Some devices failed to power down.");
+ system_state = SYS_STATE_resume;
+ device_power_up(error);
++ console_end_sync();
+ error = -EIO;
+ goto done;
+ }
+ else
+ error = 0;
+
++ ci = get_cpu_info();
++ spec_ctrl_enter_idle(ci);
++ /* Avoid NMI/#MC using MSR_SPEC_CTRL until we've reloaded microcode. */
++ ci->spec_ctrl_flags &= ~SCF_ist_wrmsr;
++
+ ACPI_FLUSH_CPU_CACHE();
+
+ switch ( state )
+@@ -243,17 +251,23 @@ static int enter_state(u32 state)
+ if ( (state == ACPI_STATE_S3) && error )
+ tboot_s3_error(error);
+
++ console_end_sync();
++
++ microcode_resume_cpu(0);
++
++ /* Re-enabled default NMI/#MC use of MSR_SPEC_CTRL. */
++ ci->spec_ctrl_flags |= (default_spec_ctrl_flags & SCF_ist_wrmsr);
++ spec_ctrl_exit_idle(ci);
++
+ done:
+ spin_debug_enable();
+ local_irq_restore(flags);
+- console_end_sync();
+ acpi_sleep_post(state);
+ if ( hvm_cpu_up() )
+ BUG();
++ cpufreq_add_cpu(0);
+
+ enable_cpu:
+- cpufreq_add_cpu(0);
+- microcode_resume_cpu(0);
+ rcu_barrier();
+ mtrr_aps_sync_begin();
+ enable_nonboot_cpus();
+diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
+index fc9677f020..76078b55b2 100644
+--- a/xen/arch/x86/cpu/amd.c
++++ b/xen/arch/x86/cpu/amd.c
+@@ -9,6 +9,7 @@
+ #include <asm/amd.h>
+ #include <asm/hvm/support.h>
+ #include <asm/setup.h> /* amd_init_cpu */
++#include <asm/spec_ctrl.h>
+ #include <asm/acpi.h>
+ #include <asm/apic.h>
+
+@@ -504,17 +505,23 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
+- c->compute_unit_id = ebx & 0xFF;
+ c->x86_num_siblings = ((ebx >> 8) & 0x3) + 1;
++
++ if (c->x86 < 0x17)
++ c->compute_unit_id = ebx & 0xFF;
++ else {
++ c->cpu_core_id = ebx & 0xFF;
++ c->x86_max_cores /= c->x86_num_siblings;
++ }
+ }
+
+ if (opt_cpu_info)
+ printk("CPU %d(%d) -> Processor %d, %s %d\n",
+ cpu, c->x86_max_cores, c->phys_proc_id,
+- cpu_has(c, X86_FEATURE_TOPOEXT) ? "Compute Unit" :
+- "Core",
+- cpu_has(c, X86_FEATURE_TOPOEXT) ? c->compute_unit_id :
+- c->cpu_core_id);
++ c->compute_unit_id != INVALID_CUID ? "Compute Unit"
++ : "Core",
++ c->compute_unit_id != INVALID_CUID ? c->compute_unit_id
++ : c->cpu_core_id);
+ }
+
+ static void early_init_amd(struct cpuinfo_x86 *c)
+@@ -594,6 +601,25 @@ static void init_amd(struct cpuinfo_x86 *c)
+ c->x86_capability);
+ }
+
++ /*
++ * If the user has explicitly chosen to disable Memory Disambiguation
++ * to mitigiate Speculative Store Bypass, poke the appropriate MSR.
++ */
++ if (opt_ssbd) {
++ int bit = -1;
++
++ switch (c->x86) {
++ case 0x15: bit = 54; break;
++ case 0x16: bit = 33; break;
++ case 0x17: bit = 10; break;
++ }
++
++ if (bit >= 0 && !rdmsr_safe(MSR_AMD64_LS_CFG, value)) {
++ value |= 1ull << bit;
++ wrmsr_safe(MSR_AMD64_LS_CFG, value);
++ }
++ }
++
+ /* MFENCE stops RDTSC speculation */
+ if (!cpu_has_lfence_dispatch)
+ __set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability);
+diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
+index fdb2bf1779..eb266c5ba6 100644
+--- a/xen/arch/x86/cpu/common.c
++++ b/xen/arch/x86/cpu/common.c
+@@ -14,6 +14,7 @@
+ #include <public/sysctl.h> /* for XEN_INVALID_{SOCKET,CORE}_ID */
+
+ #include "cpu.h"
++#include "mcheck/x86_mca.h"
+
+ bool_t opt_arat = 1;
+ boolean_param("arat", opt_arat);
+@@ -345,6 +346,9 @@ static void __init early_cpu_detect(void)
+ hap_paddr_bits = PADDR_BITS;
+ }
+
++ if (c->x86_vendor != X86_VENDOR_AMD)
++ park_offline_cpus = opt_mce;
++
+ initialize_cpu_data(0);
+ }
+
+@@ -747,6 +751,7 @@ void load_system_tables(void)
+ [IST_MCE - 1] = stack_top + IST_MCE * PAGE_SIZE,
+ [IST_DF - 1] = stack_top + IST_DF * PAGE_SIZE,
+ [IST_NMI - 1] = stack_top + IST_NMI * PAGE_SIZE,
++ [IST_DB - 1] = stack_top + IST_DB * PAGE_SIZE,
+
+ [IST_MAX ... ARRAY_SIZE(tss->ist) - 1] =
+ 0x8600111111111111ul,
+@@ -774,6 +779,7 @@ void load_system_tables(void)
+ set_ist(&idt_tables[cpu][TRAP_double_fault], IST_DF);
+ set_ist(&idt_tables[cpu][TRAP_nmi], IST_NMI);
+ set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE);
++ set_ist(&idt_tables[cpu][TRAP_debug], IST_DB);
+
+ /*
+ * Bottom-of-stack must be 16-byte aligned!
+diff --git a/xen/arch/x86/cpu/mcheck/mce.c b/xen/arch/x86/cpu/mcheck/mce.c
+index df0106ec3c..1ee1870de9 100644
+--- a/xen/arch/x86/cpu/mcheck/mce.c
++++ b/xen/arch/x86/cpu/mcheck/mce.c
+@@ -695,12 +695,15 @@ static void cpu_bank_free(unsigned int cpu)
+
+ mcabanks_free(poll);
+ mcabanks_free(clr);
++
++ per_cpu(poll_bankmask, cpu) = NULL;
++ per_cpu(mce_clear_banks, cpu) = NULL;
+ }
+
+ static int cpu_bank_alloc(unsigned int cpu)
+ {
+- struct mca_banks *poll = mcabanks_alloc();
+- struct mca_banks *clr = mcabanks_alloc();
++ struct mca_banks *poll = per_cpu(poll_bankmask, cpu) ?: mcabanks_alloc();
++ struct mca_banks *clr = per_cpu(mce_clear_banks, cpu) ?: mcabanks_alloc();
+
+ if ( !poll || !clr )
+ {
+@@ -728,7 +731,13 @@ static int cpu_callback(
+
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
+- cpu_bank_free(cpu);
++ if ( !park_offline_cpus )
++ cpu_bank_free(cpu);
++ break;
++
++ case CPU_REMOVE:
++ if ( park_offline_cpus )
++ cpu_bank_free(cpu);
+ break;
+ }
+
+diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c b/xen/arch/x86/cpu/mcheck/mce_intel.c
+index e5dd956a24..4474a34e34 100644
+--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
++++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
+@@ -636,8 +636,6 @@ static void clear_cmci(void)
+
+ static void cpu_mcheck_disable(void)
+ {
+- clear_in_cr4(X86_CR4_MCE);
+-
+ if ( cmci_support && opt_mce )
+ clear_cmci();
+ }
+diff --git a/xen/arch/x86/cpu/mtrr/generic.c b/xen/arch/x86/cpu/mtrr/generic.c
+index e9c0e5e059..7ba0c3f0fe 100644
+--- a/xen/arch/x86/cpu/mtrr/generic.c
++++ b/xen/arch/x86/cpu/mtrr/generic.c
+@@ -5,6 +5,7 @@
+ #include <xen/mm.h>
+ #include <xen/stdbool.h>
+ #include <asm/flushtlb.h>
++#include <asm/invpcid.h>
+ #include <asm/io.h>
+ #include <asm/mtrr.h>
+ #include <asm/msr.h>
+@@ -400,8 +401,10 @@ static DEFINE_SPINLOCK(set_atomicity_lock);
+ * has been called.
+ */
+
+-static void prepare_set(void)
++static bool prepare_set(void)
+ {
++ unsigned long cr4;
++
+ /* Note that this is not ideal, since the cache is only flushed/disabled
+ for this CPU while the MTRRs are changed, but changing this requires
+ more invasive changes to the way the kernel boots */
+@@ -412,18 +415,24 @@ static void prepare_set(void)
+ write_cr0(read_cr0() | X86_CR0_CD);
+ wbinvd();
+
+- /* TLB flushing here relies on Xen always using CR4.PGE. */
+- BUILD_BUG_ON(!(XEN_MINIMAL_CR4 & X86_CR4_PGE));
+- write_cr4(read_cr4() & ~X86_CR4_PGE);
++ cr4 = read_cr4();
++ if (cr4 & X86_CR4_PGE)
++ write_cr4(cr4 & ~X86_CR4_PGE);
++ else if (use_invpcid)
++ invpcid_flush_all();
++ else
++ write_cr3(read_cr3());
+
+ /* Save MTRR state */
+ rdmsrl(MSR_MTRRdefType, deftype);
+
+ /* Disable MTRRs, and set the default type to uncached */
+ mtrr_wrmsr(MSR_MTRRdefType, deftype & ~0xcff);
++
++ return cr4 & X86_CR4_PGE;
+ }
+
+-static void post_set(void)
++static void post_set(bool pge)
+ {
+ /* Intel (P6) standard MTRRs */
+ mtrr_wrmsr(MSR_MTRRdefType, deftype);
+@@ -432,7 +441,12 @@ static void post_set(void)
+ write_cr0(read_cr0() & ~X86_CR0_CD);
+
+ /* Reenable CR4.PGE (also flushes the TLB) */
+- write_cr4(read_cr4() | X86_CR4_PGE);
++ if (pge)
++ write_cr4(read_cr4() | X86_CR4_PGE);
++ else if (use_invpcid)
++ invpcid_flush_all();
++ else
++ write_cr3(read_cr3());
+
+ spin_unlock(&set_atomicity_lock);
+ }
+@@ -441,14 +455,15 @@ static void generic_set_all(void)
+ {
+ unsigned long mask, count;
+ unsigned long flags;
++ bool pge;
+
+ local_irq_save(flags);
+- prepare_set();
++ pge = prepare_set();
+
+ /* Actually set the state */
+ mask = set_mtrr_state();
+
+- post_set();
++ post_set(pge);
+ local_irq_restore(flags);
+
+ /* Use the atomic bitops to update the global mask */
+@@ -457,7 +472,6 @@ static void generic_set_all(void)
+ set_bit(count, &smp_changes_mask);
+ mask >>= 1;
+ }
+-
+ }
+
+ static void generic_set_mtrr(unsigned int reg, unsigned long base,
+@@ -474,11 +488,12 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
+ {
+ unsigned long flags;
+ struct mtrr_var_range *vr;
++ bool pge;
+
+ vr = &mtrr_state.var_ranges[reg];
+
+ local_irq_save(flags);
+- prepare_set();
++ pge = prepare_set();
+
+ if (size == 0) {
+ /* The invalid bit is kept in the mask, so we simply clear the
+@@ -499,7 +514,7 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
+ mtrr_wrmsr(MSR_IA32_MTRR_PHYSMASK(reg), vr->mask);
+ }
+
+- post_set();
++ post_set(pge);
+ local_irq_restore(flags);
+ }
+
+diff --git a/xen/arch/x86/cpu/vpmu_intel.c b/xen/arch/x86/cpu/vpmu_intel.c
+index 207e2e712c..6e27f6ec8e 100644
+--- a/xen/arch/x86/cpu/vpmu_intel.c
++++ b/xen/arch/x86/cpu/vpmu_intel.c
+@@ -454,13 +454,11 @@ static int core2_vpmu_alloc_resource(struct vcpu *v)
+
+ if ( is_hvm_vcpu(v) )
+ {
+- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+- if ( vmx_add_host_load_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
++ if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0) )
+ goto out_err;
+
+- if ( vmx_add_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
++ if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0) )
+ goto out_err;
+- vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+ }
+
+ core2_vpmu_cxt = xzalloc_bytes(sizeof(*core2_vpmu_cxt) +
+@@ -535,27 +533,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
+ uint64_t *enabled_cntrs;
+
+ if ( !core2_vpmu_msr_common_check(msr, &type, &index) )
+- {
+- /* Special handling for BTS */
+- if ( msr == MSR_IA32_DEBUGCTLMSR )
+- {
+- supported |= IA32_DEBUGCTLMSR_TR | IA32_DEBUGCTLMSR_BTS |
+- IA32_DEBUGCTLMSR_BTINT;
+-
+- if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
+- supported |= IA32_DEBUGCTLMSR_BTS_OFF_OS |
+- IA32_DEBUGCTLMSR_BTS_OFF_USR;
+- if ( !(msr_content & ~supported) &&
+- vpmu_is_set(vpmu, VPMU_CPU_HAS_BTS) )
+- return 0;
+- if ( (msr_content & supported) &&
+- !vpmu_is_set(vpmu, VPMU_CPU_HAS_BTS) )
+- printk(XENLOG_G_WARNING
+- "%pv: Debug Store unsupported on this CPU\n",
+- current);
+- }
+ return -EINVAL;
+- }
+
+ ASSERT(!supported);
+
+@@ -613,7 +591,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
+ return -EINVAL;
+
+ if ( is_hvm_vcpu(v) )
+- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL,
++ vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL,
+ &core2_vpmu_cxt->global_ctrl);
+ else
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, core2_vpmu_cxt->global_ctrl);
+@@ -682,7 +660,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
+ return -EINVAL;
+
+ if ( is_hvm_vcpu(v) )
+- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL,
++ vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL,
+ &core2_vpmu_cxt->global_ctrl);
+ else
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, core2_vpmu_cxt->global_ctrl);
+@@ -701,7 +679,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
+ else
+ {
+ if ( is_hvm_vcpu(v) )
+- vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
++ vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
+ else
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
+ }
+@@ -735,7 +713,7 @@ static int core2_vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content)
+ break;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if ( is_hvm_vcpu(v) )
+- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
++ vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
+ else
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, *msr_content);
+ break;
+diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
+index b3c9ac6c48..24b9495faa 100644
+--- a/xen/arch/x86/cpuid.c
++++ b/xen/arch/x86/cpuid.c
+@@ -43,6 +43,16 @@ static int __init parse_xen_cpuid(const char *s)
+ if ( !val )
+ setup_clear_cpu_cap(X86_FEATURE_STIBP);
+ }
++ else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 )
++ {
++ if ( !val )
++ setup_clear_cpu_cap(X86_FEATURE_L1D_FLUSH);
++ }
++ else if ( (val = parse_boolean("ssbd", s, ss)) >= 0 )
++ {
++ if ( !val )
++ setup_clear_cpu_cap(X86_FEATURE_SSBD);
++ }
+ else
+ rc = -EINVAL;
+
+@@ -368,6 +378,28 @@ static void __init calculate_host_policy(void)
+ }
+ }
+
++static void __init guest_common_feature_adjustments(uint32_t *fs)
++{
++ /* Unconditionally claim to be able to set the hypervisor bit. */
++ __set_bit(X86_FEATURE_HYPERVISOR, fs);
++
++ /*
++ * If IBRS is offered to the guest, unconditionally offer STIBP. It is a
++ * nop on non-HT hardware, and has this behaviour to make heterogeneous
++ * setups easier to manage.
++ */
++ if ( test_bit(X86_FEATURE_IBRSB, fs) )
++ __set_bit(X86_FEATURE_STIBP, fs);
++
++ /*
++ * On hardware which supports IBRS/IBPB, we can offer IBPB independently
++ * of IBRS by using the AMD feature bit. An administrator may wish for
++ * performance reasons to offer IBPB without IBRS.
++ */
++ if ( host_cpuid_policy.feat.ibrsb )
++ __set_bit(X86_FEATURE_IBPB, fs);
++}
++
+ static void __init calculate_pv_max_policy(void)
+ {
+ struct cpuid_policy *p = &pv_max_cpuid_policy;
+@@ -380,18 +412,14 @@ static void __init calculate_pv_max_policy(void)
+ for ( i = 0; i < ARRAY_SIZE(pv_featureset); ++i )
+ pv_featureset[i] &= pv_featuremask[i];
+
+- /* Unconditionally claim to be able to set the hypervisor bit. */
+- __set_bit(X86_FEATURE_HYPERVISOR, pv_featureset);
+-
+- /* On hardware with IBRS/IBPB support, there are further adjustments. */
+- if ( test_bit(X86_FEATURE_IBRSB, pv_featureset) )
+- {
+- /* Offer STIBP unconditionally. It is a nop on non-HT hardware. */
+- __set_bit(X86_FEATURE_STIBP, pv_featureset);
++ /*
++ * If Xen isn't virtualising MSR_SPEC_CTRL for PV guests because of
++ * administrator choice, hide the feature.
++ */
++ if ( !boot_cpu_has(X86_FEATURE_SC_MSR_PV) )
++ __clear_bit(X86_FEATURE_IBRSB, pv_featureset);
+
+- /* AMD's IBPB is a subset of IBRS/IBPB. */
+- __set_bit(X86_FEATURE_IBPB, pv_featureset);
+- }
++ guest_common_feature_adjustments(pv_featureset);
+
+ sanitise_featureset(pv_featureset);
+ cpuid_featureset_to_policy(pv_featureset, p);
+@@ -419,9 +447,6 @@ static void __init calculate_hvm_max_policy(void)
+ for ( i = 0; i < ARRAY_SIZE(hvm_featureset); ++i )
+ hvm_featureset[i] &= hvm_featuremask[i];
+
+- /* Unconditionally claim to be able to set the hypervisor bit. */
+- __set_bit(X86_FEATURE_HYPERVISOR, hvm_featureset);
+-
+ /*
+ * Xen can provide an APIC emulation to HVM guests even if the host's APIC
+ * isn't enabled.
+@@ -437,6 +462,13 @@ static void __init calculate_hvm_max_policy(void)
+ raw_cpuid_policy.basic.sep )
+ __set_bit(X86_FEATURE_SEP, hvm_featureset);
+
++ /*
++ * If Xen isn't virtualising MSR_SPEC_CTRL for HVM guests because of
++ * administrator choice, hide the feature.
++ */
++ if ( !boot_cpu_has(X86_FEATURE_SC_MSR_HVM) )
++ __clear_bit(X86_FEATURE_IBRSB, hvm_featureset);
++
+ /*
+ * With VT-x, some features are only supported by Xen if dedicated
+ * hardware support is also available.
+@@ -450,15 +482,7 @@ static void __init calculate_hvm_max_policy(void)
+ __clear_bit(X86_FEATURE_XSAVES, hvm_featureset);
+ }
+
+- /* On hardware with IBRS/IBPB support, there are further adjustments. */
+- if ( test_bit(X86_FEATURE_IBRSB, hvm_featureset) )
+- {
+- /* Offer STIBP unconditionally. It is a nop on non-HT hardware. */
+- __set_bit(X86_FEATURE_STIBP, hvm_featureset);
+-
+- /* AMD's IBPB is a subset of IBRS/IBPB. */
+- __set_bit(X86_FEATURE_IBPB, hvm_featureset);
+- }
++ guest_common_feature_adjustments(hvm_featureset);
+
+ sanitise_featureset(hvm_featureset);
+ cpuid_featureset_to_policy(hvm_featureset, p);
+@@ -601,14 +625,6 @@ void recalculate_cpuid_policy(struct domain *d)
+ recalculate_xstate(p);
+ recalculate_misc(p);
+
+- /*
+- * Override STIBP to match IBRS. Guests can safely use STIBP
+- * functionality on non-HT hardware, but can't necesserily protect
+- * themselves from SP2/Spectre/Branch Target Injection if STIBP is hidden
+- * on HT-capable hardware.
+- */
+- p->feat.stibp = p->feat.ibrsb;
+-
+ for ( i = 0; i < ARRAY_SIZE(p->cache.raw); ++i )
+ {
+ if ( p->cache.subleaf[i].type >= 1 &&
+diff --git a/xen/arch/x86/debug.c b/xen/arch/x86/debug.c
+index 9159f32db4..a500df01ac 100644
+--- a/xen/arch/x86/debug.c
++++ b/xen/arch/x86/debug.c
+@@ -98,7 +98,7 @@ dbg_pv_va2mfn(dbgva_t vaddr, struct domain *dp, uint64_t pgd3val)
+ l2_pgentry_t l2e, *l2t;
+ l1_pgentry_t l1e, *l1t;
+ unsigned long cr3 = (pgd3val ? pgd3val : dp->vcpu[0]->arch.cr3);
+- mfn_t mfn = maddr_to_mfn(cr3);
++ mfn_t mfn = maddr_to_mfn(cr3_pa(cr3));
+
+ DBGP2("vaddr:%lx domid:%d cr3:%lx pgd3:%lx\n", vaddr, dp->domain_id,
+ cr3, pgd3val);
+diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
+index 5a4d5c3bfc..2020e0b682 100644
+--- a/xen/arch/x86/domain.c
++++ b/xen/arch/x86/domain.c
+@@ -107,10 +107,11 @@ static void play_dead(void)
+ local_irq_disable();
+
+ /*
+- * NOTE: After cpu_exit_clear, per-cpu variables are no longer accessible,
+- * as they may be freed at any time. In this case, heap corruption or
+- * #PF can occur (when heap debugging is enabled). For example, even
+- * printk() can involve tasklet scheduling, which touches per-cpu vars.
++ * NOTE: After cpu_exit_clear, per-cpu variables may no longer accessible,
++ * as they may be freed at any time if offline CPUs don't get parked. In
++ * this case, heap corruption or #PF can occur (when heap debugging is
++ * enabled). For example, even printk() can involve tasklet scheduling,
++ * which touches per-cpu vars.
+ *
+ * Consider very carefully when adding code to *dead_idle. Most hypervisor
+ * subsystems are unsafe to call.
+@@ -1517,17 +1518,12 @@ void paravirt_ctxt_switch_from(struct vcpu *v)
+ void paravirt_ctxt_switch_to(struct vcpu *v)
+ {
+ root_pgentry_t *root_pgt = this_cpu(root_pgt);
+- unsigned long cr4;
+
+ if ( root_pgt )
+ root_pgt[root_table_offset(PERDOMAIN_VIRT_START)] =
+ l4e_from_page(v->domain->arch.perdomain_l3_pg,
+ __PAGE_HYPERVISOR_RW);
+
+- cr4 = pv_guest_cr4_to_real_cr4(v);
+- if ( unlikely(cr4 != read_cr4()) )
+- write_cr4(cr4);
+-
+ if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
+ activate_debugregs(v);
+
+@@ -1640,7 +1636,7 @@ static void __context_switch(void)
+ if ( cpu_has_xsaves && is_hvm_vcpu(n) )
+ set_msr_xss(n->arch.hvm_vcpu.msr_xss);
+ }
+- vcpu_restore_fpu_eager(n);
++ vcpu_restore_fpu_nonlazy(n, false);
+ nd->arch.ctxt_switch->to(n);
+ }
+
+@@ -1693,6 +1689,7 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
+
+ ASSERT(local_irq_is_enabled());
+
++ get_cpu_info()->use_pv_cr3 = false;
+ get_cpu_info()->xen_cr3 = 0;
+
+ cpumask_copy(&dirty_mask, next->vcpu_dirty_cpumask);
+diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
+index 3432a854dd..9a52276866 100644
+--- a/xen/arch/x86/domain_page.c
++++ b/xen/arch/x86/domain_page.c
+@@ -51,7 +51,7 @@ static inline struct vcpu *mapcache_current_vcpu(void)
+ if ( (v = idle_vcpu[smp_processor_id()]) == current )
+ sync_local_execstate();
+ /* We must now be running on the idle page table. */
+- ASSERT(read_cr3() == __pa(idle_pg_table));
++ ASSERT(cr3_pa(read_cr3()) == __pa(idle_pg_table));
+ }
+
+ return v;
+diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
+index 74e9749d89..eefd94eb92 100644
+--- a/xen/arch/x86/domctl.c
++++ b/xen/arch/x86/domctl.c
+@@ -226,7 +226,8 @@ static int update_domain_cpuid_info(struct domain *d,
+ */
+ call_policy_changed = (is_hvm_domain(d) &&
+ ((old_7d0 ^ p->feat.raw[0].d) &
+- cpufeat_mask(X86_FEATURE_IBRSB)));
++ (cpufeat_mask(X86_FEATURE_IBRSB) |
++ cpufeat_mask(X86_FEATURE_L1D_FLUSH))));
+ break;
+
+ case 0xa:
+@@ -1227,7 +1228,7 @@ long arch_do_domctl(
+ if ( _xcr0_accum )
+ {
+ if ( evc->size >= PV_XSAVE_HDR_SIZE + XSTATE_AREA_MIN_SIZE )
+- ret = validate_xstate(_xcr0, _xcr0_accum,
++ ret = validate_xstate(d, _xcr0, _xcr0_accum,
+ &_xsave_area->xsave_hdr);
+ }
+ else if ( !_xcr0 )
+@@ -1251,8 +1252,7 @@ long arch_do_domctl(
+ vcpu_pause(v);
+ v->arch.xcr0 = _xcr0;
+ v->arch.xcr0_accum = _xcr0_accum;
+- if ( _xcr0_accum & XSTATE_NONLAZY )
+- v->arch.nonlazy_xstate_used = 1;
++ v->arch.nonlazy_xstate_used = _xcr0_accum & XSTATE_NONLAZY;
+ compress_xsave_states(v, _xsave_area,
+ evc->size - PV_XSAVE_HDR_SIZE);
+ vcpu_unpause(v);
+diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
+index f6d7ad1650..797c5d52cc 100644
+--- a/xen/arch/x86/flushtlb.c
++++ b/xen/arch/x86/flushtlb.c
+@@ -8,9 +8,12 @@
+ */
+
+ #include <xen/sched.h>
++#include <xen/smp.h>
+ #include <xen/softirq.h>
+ #include <asm/flushtlb.h>
++#include <asm/invpcid.h>
+ #include <asm/page.h>
++#include <asm/pv/domain.h>
+
+ /* Debug builds: Wrap frequently to stress-test the wrap logic. */
+ #ifdef NDEBUG
+@@ -49,6 +52,8 @@ static u32 pre_flush(void)
+ raise_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ);
+
+ skip_clocktick:
++ hvm_flush_guest_tlbs();
++
+ return t2;
+ }
+
+@@ -69,21 +74,65 @@ static void post_flush(u32 t)
+ this_cpu(tlbflush_time) = t;
+ }
+
+-void write_cr3(unsigned long cr3)
++static void do_tlb_flush(void)
++{
++ u32 t = pre_flush();
++
++ if ( use_invpcid )
++ invpcid_flush_all();
++ else
++ {
++ unsigned long cr4 = read_cr4();
++
++ write_cr4(cr4 ^ X86_CR4_PGE);
++ write_cr4(cr4);
++ }
++
++ post_flush(t);
++}
++
++void switch_cr3_cr4(unsigned long cr3, unsigned long cr4)
+ {
+- unsigned long flags, cr4 = read_cr4();
++ unsigned long flags, old_cr4;
+ u32 t;
++ unsigned long old_pcid = cr3_pcid(read_cr3());
+
+ /* This non-reentrant function is sometimes called in interrupt context. */
+ local_irq_save(flags);
+
+ t = pre_flush();
+
+- hvm_flush_guest_tlbs();
+-
+- write_cr4(cr4 & ~X86_CR4_PGE);
+- asm volatile ( "mov %0, %%cr3" : : "r" (cr3) : "memory" );
+- write_cr4(cr4);
++ old_cr4 = read_cr4();
++ if ( old_cr4 & X86_CR4_PGE )
++ {
++ /*
++ * X86_CR4_PGE set means PCID is inactive.
++ * We have to purge the TLB via flipping cr4.pge.
++ */
++ old_cr4 = cr4 & ~X86_CR4_PGE;
++ write_cr4(old_cr4);
++ }
++ else if ( use_invpcid )
++ /*
++ * Flushing the TLB via INVPCID is necessary only in case PCIDs are
++ * in use, which is true only with INVPCID being available.
++     * Without PCID usage the following write_cr3() will purge all TLB
++     * entries (we are in the cr4.pge off path).
++ * Using invpcid_flush_all_nonglobals() seems to be faster than
++ * invpcid_flush_all(), so use that.
++ */
++ invpcid_flush_all_nonglobals();
++
++ write_cr3(cr3);
++
++ if ( old_cr4 != cr4 )
++ write_cr4(cr4);
++ else if ( old_pcid != cr3_pcid(cr3) )
++ /*
++ * Make sure no TLB entries related to the old PCID created between
++ * flushing the TLB and writing the new %cr3 value remain in the TLB.
++ */
++ invpcid_flush_single_context(old_pcid);
+
+ post_flush(t);
+
+@@ -113,22 +162,32 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
+ * are various errata surrounding INVLPG usage on superpages, and
+ * a full flush is in any case not *that* expensive.
+ */
+- asm volatile ( "invlpg %0"
+- : : "m" (*(const char *)(va)) : "memory" );
++ if ( read_cr4() & X86_CR4_PCIDE )
++ {
++ unsigned long addr = (unsigned long)va;
++
++ /*
++ * Flush the addresses for all potential address spaces.
++ * We can't check the current domain for being subject to
++ * XPTI as current might be the idle vcpu while we still have
++ * some XPTI domain TLB entries.
++ * Using invpcid is okay here, as with PCID enabled we always
++ * have global pages disabled.
++ */
++ invpcid_flush_one(PCID_PV_PRIV, addr);
++ invpcid_flush_one(PCID_PV_USER, addr);
++ if ( !cpu_has_no_xpti )
++ {
++ invpcid_flush_one(PCID_PV_PRIV | PCID_PV_XPTI, addr);
++ invpcid_flush_one(PCID_PV_USER | PCID_PV_XPTI, addr);
++ }
++ }
++ else
++ asm volatile ( "invlpg %0"
++ : : "m" (*(const char *)(va)) : "memory" );
+ }
+ else
+- {
+- u32 t = pre_flush();
+- unsigned long cr4 = read_cr4();
+-
+- hvm_flush_guest_tlbs();
+-
+- write_cr4(cr4 & ~X86_CR4_PGE);
+- barrier();
+- write_cr4(cr4);
+-
+- post_flush(t);
+- }
++ do_tlb_flush();
+ }
+
+ if ( flags & FLUSH_CACHE )
+@@ -161,5 +220,8 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
+
+ local_irq_restore(irqfl);
+
++ if ( flags & FLUSH_ROOT_PGTBL )
++ get_cpu_info()->root_pgt_changed = true;
++
+ return flags;
+ }
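
The hunks above replace the old write_cr3() flush with do_tlb_flush()/switch_cr3_cr4(), which choose between toggling CR4.PGE, a full INVPCID flush, and a targeted single-context flush depending on whether PCIDs are active. The stand-alone C model below is an illustrative sketch, not part of the patch: it mirrors only that decision logic, stubs the privileged hardware accesses with printf(), and uses made-up helper and test names (model_switch_cr3_cr4, the values in main()).

#include <stdbool.h>
#include <stdio.h>

#define X86_CR4_PGE        (1UL << 7)
#define X86_CR3_PCID_MASK  0xfffUL   /* low 12 bits of %cr3 hold the PCID */

static bool use_invpcid = true;      /* assumption: INVPCID is available */

static unsigned long cr3_pcid(unsigned long cr3)
{
    return cr3 & X86_CR3_PCID_MASK;
}

static void model_switch_cr3_cr4(unsigned long old_cr3, unsigned long old_cr4,
                                 unsigned long cr3, unsigned long cr4)
{
    unsigned long old_pcid = cr3_pcid(old_cr3);

    if ( old_cr4 & X86_CR4_PGE )
    {
        /* CR4.PGE set means PCID is inactive: purge the TLB by clearing PGE. */
        old_cr4 = cr4 & ~X86_CR4_PGE;
        printf("  write_cr4(%#lx)   /* drop PGE */\n", old_cr4);
    }
    else if ( use_invpcid )
        /* PCIDs in use: flush non-global entries before loading the new %cr3. */
        printf("  invpcid_flush_all_nonglobals()\n");

    printf("  write_cr3(%#lx)\n", cr3);

    if ( old_cr4 != cr4 )
        printf("  write_cr4(%#lx)\n", cr4);
    else if ( old_pcid != cr3_pcid(cr3) )
        /* Same CR4, new PCID: drop stale entries tagged with the old PCID. */
        printf("  invpcid_flush_single_context(%lu)\n", old_pcid);
}

int main(void)
{
    printf("global pages enabled (PCID unused):\n");
    model_switch_cr3_cr4(0x11000, X86_CR4_PGE, 0x22000, X86_CR4_PGE);

    printf("PCID in use, switching PCID 1 -> 2 with unchanged CR4:\n");
    model_switch_cr3_cr4(0x11000 | 1, 0, 0x22000 | 2, 0);

    return 0;
}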
+diff --git a/xen/arch/x86/genapic/x2apic.c b/xen/arch/x86/genapic/x2apic.c
+index 5fffb317d9..4fb9a2225d 100644
+--- a/xen/arch/x86/genapic/x2apic.c
++++ b/xen/arch/x86/genapic/x2apic.c
+@@ -201,18 +201,21 @@ static int update_clusterinfo(
+ if ( !cluster_cpus_spare )
+ cluster_cpus_spare = xzalloc(cpumask_t);
+ if ( !cluster_cpus_spare ||
+- !alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) )
++ !cond_alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) )
+ err = -ENOMEM;
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
++ case CPU_REMOVE:
++ if ( park_offline_cpus == (action != CPU_REMOVE) )
++ break;
+ if ( per_cpu(cluster_cpus, cpu) )
+ {
+ cpumask_clear_cpu(cpu, per_cpu(cluster_cpus, cpu));
+ if ( cpumask_empty(per_cpu(cluster_cpus, cpu)) )
+- xfree(per_cpu(cluster_cpus, cpu));
++ XFREE(per_cpu(cluster_cpus, cpu));
+ }
+- free_cpumask_var(per_cpu(scratch_mask, cpu));
++ FREE_CPUMASK_VAR(per_cpu(scratch_mask, cpu));
+ break;
+ }
+
+diff --git a/xen/arch/x86/hpet.c b/xen/arch/x86/hpet.c
+index 8229c635e4..f18cbbd55a 100644
+--- a/xen/arch/x86/hpet.c
++++ b/xen/arch/x86/hpet.c
+@@ -509,6 +509,8 @@ static void hpet_attach_channel(unsigned int cpu,
+ static void hpet_detach_channel(unsigned int cpu,
+ struct hpet_event_channel *ch)
+ {
++ unsigned int next;
++
+ spin_lock_irq(&ch->lock);
+
+ ASSERT(ch == per_cpu(cpu_bc_channel, cpu));
+@@ -517,7 +519,7 @@ static void hpet_detach_channel(unsigned int cpu,
+
+ if ( cpu != ch->cpu )
+ spin_unlock_irq(&ch->lock);
+- else if ( cpumask_empty(ch->cpumask) )
++ else if ( (next = cpumask_first(ch->cpumask)) >= nr_cpu_ids )
+ {
+ ch->cpu = -1;
+ clear_bit(HPET_EVT_USED_BIT, &ch->flags);
+@@ -525,7 +527,7 @@ static void hpet_detach_channel(unsigned int cpu,
+ }
+ else
+ {
+- ch->cpu = cpumask_first(ch->cpumask);
++ ch->cpu = next;
+ set_channel_irq_affinity(ch);
+ local_irq_enable();
+ }
+diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
+index b282089e03..03db6b3d31 100644
+--- a/xen/arch/x86/hvm/emulate.c
++++ b/xen/arch/x86/hvm/emulate.c
+@@ -1900,6 +1900,7 @@ static int hvmemul_get_fpu(
+ * masking of all exceptions by FNSTENV.)
+ */
+ save_fpu_enable();
++ curr->fpu_initialised = true;
+ curr->fpu_dirtied = true;
+ if ( (fpu_ctxt->fcw & 0x3f) != 0x3f )
+ {
+@@ -1991,13 +1992,20 @@ static void hvmemul_put_fpu(
+ if ( backout == X86EMUL_FPU_fpu )
+ {
+ /*
+- * To back out changes to the register file simply adjust state such
+- * that upon next FPU insn use by the guest we'll reload the state
+- * saved (or freshly loaded) by hvmemul_get_fpu().
++ * To back out changes to the register file
++ * - in fully eager mode, restore original state immediately,
++ * - in lazy mode, simply adjust state such that upon next FPU insn
++ * use by the guest we'll reload the state saved (or freshly loaded)
++ * by hvmemul_get_fpu().
+ */
+- curr->fpu_dirtied = false;
+- stts();
+- hvm_funcs.fpu_leave(curr);
++ if ( curr->arch.fully_eager_fpu )
++ vcpu_restore_fpu_nonlazy(curr, false);
++ else
++ {
++ curr->fpu_dirtied = false;
++ stts();
++ hvm_funcs.fpu_leave(curr);
++ }
+ }
+ }
+
+@@ -2113,22 +2121,20 @@ static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt,
+
+ vio->mmio_retry = 0;
+
+- switch ( rc = x86_emulate(&hvmemul_ctxt->ctxt, ops) )
++ rc = x86_emulate(&hvmemul_ctxt->ctxt, ops);
++ if ( rc == X86EMUL_OKAY && vio->mmio_retry )
++ rc = X86EMUL_RETRY;
++
++ if ( !hvm_vcpu_io_need_completion(vio) )
+ {
+- case X86EMUL_OKAY:
+- if ( vio->mmio_retry )
+- rc = X86EMUL_RETRY;
+- /* fall through */
+- default:
+ vio->mmio_cache_count = 0;
+ vio->mmio_insn_bytes = 0;
+- break;
+-
+- case X86EMUL_RETRY:
++ }
++ else
++ {
+ BUILD_BUG_ON(sizeof(vio->mmio_insn) < sizeof(hvmemul_ctxt->insn_buf));
+ vio->mmio_insn_bytes = hvmemul_ctxt->insn_buf_bytes;
+ memcpy(vio->mmio_insn, hvmemul_ctxt->insn_buf, vio->mmio_insn_bytes);
+- break;
+ }
+
+ if ( hvmemul_ctxt->ctxt.retire.singlestep )
+diff --git a/xen/arch/x86/hvm/hpet.c b/xen/arch/x86/hvm/hpet.c
+index f7aed7f69e..28377091ca 100644
+--- a/xen/arch/x86/hvm/hpet.c
++++ b/xen/arch/x86/hvm/hpet.c
+@@ -264,13 +264,20 @@ static void hpet_set_timer(HPETState *h, unsigned int tn,
+ diff = (timer_is_32bit(h, tn) && (-diff > HPET_TINY_TIME_SPAN))
+ ? (uint32_t)diff : 0;
+
++ destroy_periodic_time(&h->pt[tn]);
+ if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) )
++ {
+ /* if LegacyReplacementRoute bit is set, HPET specification requires
+ timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC,
+ timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */
+ irq = (tn == 0) ? 0 : 8;
++ h->pt[tn].source = PTSRC_isa;
++ }
+ else
++ {
+ irq = timer_int_route(h, tn);
++ h->pt[tn].source = PTSRC_ioapic;
++ }
+
+ /*
+ * diff is the time from now when the timer should fire, for a periodic
+diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
+index 18adec5ad8..8e237eb1ac 100644
+--- a/xen/arch/x86/hvm/hvm.c
++++ b/xen/arch/x86/hvm/hvm.c
+@@ -895,6 +895,9 @@ const char *hvm_efer_valid(const struct vcpu *v, uint64_t value,
+ else
+ p = &host_cpuid_policy;
+
++ if ( value & ~EFER_KNOWN_MASK )
++ return "Unknown bits set";
++
+ if ( (value & EFER_SCE) && !p->extd.syscall )
+ return "SCE without feature";
+
+@@ -1258,7 +1261,7 @@ static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h)
+ ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur];
+ h->cur += desc->length;
+
+- err = validate_xstate(ctxt->xcr0, ctxt->xcr0_accum,
++ err = validate_xstate(d, ctxt->xcr0, ctxt->xcr0_accum,
+ (const void *)&ctxt->save_area.xsave_hdr);
+ if ( err )
+ {
+@@ -1313,8 +1316,7 @@ static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h)
+
+ v->arch.xcr0 = ctxt->xcr0;
+ v->arch.xcr0_accum = ctxt->xcr0_accum;
+- if ( ctxt->xcr0_accum & XSTATE_NONLAZY )
+- v->arch.nonlazy_xstate_used = 1;
++ v->arch.nonlazy_xstate_used = ctxt->xcr0_accum & XSTATE_NONLAZY;
+ compress_xsave_states(v, &ctxt->save_area,
+ size - offsetof(struct hvm_hw_cpu_xsave, save_area));
+
+diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c
+index d5afe20cc8..25b2445429 100644
+--- a/xen/arch/x86/hvm/ioreq.c
++++ b/xen/arch/x86/hvm/ioreq.c
+@@ -87,14 +87,17 @@ static void hvm_io_assist(struct hvm_ioreq_vcpu *sv, uint64_t data)
+
+ static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
+ {
++ unsigned int prev_state = STATE_IOREQ_NONE;
++
+ while ( sv->pending )
+ {
+ unsigned int state = p->state;
+
+- rmb();
+- switch ( state )
++ smp_rmb();
++
++ recheck:
++ if ( unlikely(state == STATE_IOREQ_NONE) )
+ {
+- case STATE_IOREQ_NONE:
+ /*
+ * The only reason we should see this case is when an
+ * emulator is dying and it races with an I/O being
+@@ -102,14 +105,30 @@ static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
+ */
+ hvm_io_assist(sv, ~0ul);
+ break;
++ }
++
++ if ( unlikely(state < prev_state) )
++ {
++ gdprintk(XENLOG_ERR, "Weird HVM ioreq state transition %u -> %u\n",
++ prev_state, state);
++ sv->pending = false;
++ domain_crash(sv->vcpu->domain);
++ return false; /* bail */
++ }
++
++ switch ( prev_state = state )
++ {
+ case STATE_IORESP_READY: /* IORESP_READY -> NONE */
+ p->state = STATE_IOREQ_NONE;
+ hvm_io_assist(sv, p->data);
+ break;
+ case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
+ case STATE_IOREQ_INPROCESS:
+- wait_on_xen_event_channel(sv->ioreq_evtchn, p->state != state);
+- break;
++ wait_on_xen_event_channel(sv->ioreq_evtchn,
++ ({ state = p->state;
++ smp_rmb();
++ state != prev_state; }));
++ goto recheck;
+ default:
+ gdprintk(XENLOG_ERR, "Weird HVM iorequest state %u\n", state);
+ sv->pending = false;
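
The reworked hvm_wait_for_io() above tracks prev_state and treats any backwards movement of the ioreq state, which is under the control of a possibly malicious device model, as fatal for the domain. Below is a minimal user-space sketch of that forward-only check; it assumes the usual STATE_IOREQ_* ordering (NONE < READY < INPROCESS < IORESP_READY) and the harness around it is illustrative only.

#include <stdbool.h>
#include <stdio.h>

enum {
    STATE_IOREQ_NONE = 0,
    STATE_IOREQ_READY,
    STATE_IOREQ_INPROCESS,
    STATE_IORESP_READY,
};

/* Mirrors the new check: the emulator may never move the state backwards. */
static bool ioreq_transition_ok(unsigned int prev_state, unsigned int state)
{
    if ( state < prev_state )
    {
        printf("Weird HVM ioreq state transition %u -> %u: crash the domain\n",
               prev_state, state);
        return false;
    }
    return true;
}

int main(void)
{
    /* Normal progression observed while waiting for the device model. */
    ioreq_transition_ok(STATE_IOREQ_READY, STATE_IOREQ_INPROCESS);
    ioreq_transition_ok(STATE_IOREQ_INPROCESS, STATE_IORESP_READY);

    /* A buggy or malicious emulator rewinding the state is rejected. */
    ioreq_transition_ok(STATE_IORESP_READY, STATE_IOREQ_READY);

    return 0;
}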
+diff --git a/xen/arch/x86/hvm/irq.c b/xen/arch/x86/hvm/irq.c
+index f528e2d081..c85d004402 100644
+--- a/xen/arch/x86/hvm/irq.c
++++ b/xen/arch/x86/hvm/irq.c
+@@ -41,6 +41,26 @@ static void assert_gsi(struct domain *d, unsigned ioapic_gsi)
+ vioapic_irq_positive_edge(d, ioapic_gsi);
+ }
+
++int hvm_ioapic_assert(struct domain *d, unsigned int gsi, bool level)
++{
++ struct hvm_irq *hvm_irq = hvm_domain_irq(d);
++ int vector;
++
++ if ( gsi >= hvm_irq->nr_gsis )
++ {
++ ASSERT_UNREACHABLE();
++ return -1;
++ }
++
++ spin_lock(&d->arch.hvm_domain.irq_lock);
++ if ( !level || hvm_irq->gsi_assert_count[gsi]++ == 0 )
++ assert_gsi(d, gsi);
++ vector = vioapic_get_vector(d, gsi);
++ spin_unlock(&d->arch.hvm_domain.irq_lock);
++
++ return vector;
++}
++
+ static void assert_irq(struct domain *d, unsigned ioapic_gsi, unsigned pic_irq)
+ {
+ assert_gsi(d, ioapic_gsi);
+diff --git a/xen/arch/x86/hvm/svm/entry.S b/xen/arch/x86/hvm/svm/entry.S
+index bf092fe071..5e7c080c7c 100644
+--- a/xen/arch/x86/hvm/svm/entry.S
++++ b/xen/arch/x86/hvm/svm/entry.S
+@@ -83,7 +83,7 @@ UNLIKELY_END(svm_trace)
+ mov VCPUMSR_spec_ctrl_raw(%rax), %eax
+
+ /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
+- SPEC_CTRL_EXIT_TO_GUEST /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */
++ SPEC_CTRL_EXIT_TO_HVM /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */
+
+ pop %r15
+ pop %r14
+@@ -108,7 +108,7 @@ UNLIKELY_END(svm_trace)
+
+ GET_CURRENT(bx)
+
+- SPEC_CTRL_ENTRY_FROM_VMEXIT /* Req: b=curr %rsp=regs/cpuinfo, Clob: acd */
++ SPEC_CTRL_ENTRY_FROM_HVM /* Req: b=curr %rsp=regs/cpuinfo, Clob: acd */
+ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
+
+ mov VCPU_svm_vmcb(%rbx),%rcx
+diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
+index dedec5752d..aad3655855 100644
+--- a/xen/arch/x86/hvm/svm/svm.c
++++ b/xen/arch/x86/hvm/svm/svm.c
+@@ -546,7 +546,10 @@ void svm_update_guest_cr(struct vcpu *v, unsigned int cr)
+ if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
+ {
+ if ( v != current )
+- hw_cr0_mask |= X86_CR0_TS;
++ {
++ if ( !v->arch.fully_eager_fpu )
++ hw_cr0_mask |= X86_CR0_TS;
++ }
+ else if ( vmcb_get_cr0(vmcb) & X86_CR0_TS )
+ svm_fpu_enter(v);
+ }
+@@ -1033,7 +1036,8 @@ static void svm_ctxt_switch_from(struct vcpu *v)
+ if ( unlikely((read_efer() & EFER_SVME) == 0) )
+ return;
+
+- svm_fpu_leave(v);
++ if ( !v->arch.fully_eager_fpu )
++ svm_fpu_leave(v);
+
+ svm_save_dr(v);
+ svm_lwp_save(v);
+@@ -1046,6 +1050,7 @@ static void svm_ctxt_switch_from(struct vcpu *v)
+ set_ist(&idt_tables[cpu][TRAP_double_fault], IST_DF);
+ set_ist(&idt_tables[cpu][TRAP_nmi], IST_NMI);
+ set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE);
++ set_ist(&idt_tables[cpu][TRAP_debug], IST_DB);
+ }
+
+ static void svm_ctxt_switch_to(struct vcpu *v)
+@@ -1067,6 +1072,7 @@ static void svm_ctxt_switch_to(struct vcpu *v)
+ set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE);
+ set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE);
+ set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
++ set_ist(&idt_tables[cpu][TRAP_debug], IST_NONE);
+
+ svm_restore_dr(v);
+
+@@ -1361,24 +1367,18 @@ static void svm_inject_event(const struct x86_event *event)
+ * Xen must emulate enough of the event injection to be sure that a
+ * further fault shouldn't occur during delivery. This covers the fact
+ * that hardware doesn't perform DPL checking on injection.
+- *
+- * Also, it accounts for proper positioning of %rip for an event with trap
+- * semantics (where %rip should point after the instruction) which suffers
+- * a fault during injection (at which point %rip should point at the
+- * instruction).
+ */
+ if ( event->type == X86_EVENTTYPE_PRI_SW_EXCEPTION ||
+- (!cpu_has_svm_nrips && (event->type == X86_EVENTTYPE_SW_INTERRUPT ||
+- event->type == X86_EVENTTYPE_SW_EXCEPTION)) )
++ (!cpu_has_svm_nrips && (event->type >= X86_EVENTTYPE_SW_INTERRUPT)) )
+ svm_emul_swint_injection(&_event);
+
+- switch ( _event.vector )
++ switch ( _event.vector | -(_event.type == X86_EVENTTYPE_SW_INTERRUPT) )
+ {
+ case TRAP_debug:
+ if ( regs->eflags & X86_EFLAGS_TF )
+ {
+ __restore_debug_registers(vmcb, curr);
+- vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | 0x4000);
++ vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | DR_STEP);
+ }
+ /* fall through */
+ case TRAP_int3:
+@@ -1388,6 +1388,13 @@ static void svm_inject_event(const struct x86_event *event)
+ domain_pause_for_debugger();
+ return;
+ }
++ break;
++
++ case TRAP_page_fault:
++ ASSERT(_event.type == X86_EVENTTYPE_HW_EXCEPTION);
++ curr->arch.hvm_vcpu.guest_cr[2] = _event.cr2;
++ vmcb_set_cr2(vmcb, _event.cr2);
++ break;
+ }
+
+ if ( unlikely(eventinj.fields.v) &&
+@@ -1410,13 +1417,9 @@ static void svm_inject_event(const struct x86_event *event)
+ * icebp, software events with trap semantics need emulating, so %rip in
+ * the trap frame points after the instruction.
+ *
+- * The x86 emulator (if requested by the x86_swint_emulate_* choice) will
+- * have performed checks such as presence/dpl/etc and believes that the
+- * event injection will succeed without faulting.
+- *
+- * The x86 emulator will always provide fault semantics for software
+- * events, with _trap.insn_len set appropriately. If the injection
+- * requires emulation, move %rip forwards at this point.
++ * svm_emul_swint_injection() has already confirmed that events with trap
++ * semantics won't fault on injection. Position %rip/NextRIP suitably,
++ * and restrict the event type to what hardware will tolerate.
+ */
+ switch ( _event.type )
+ {
+@@ -1473,16 +1476,12 @@ static void svm_inject_event(const struct x86_event *event)
+ eventinj.fields.errorcode == (uint16_t)eventinj.fields.errorcode);
+ vmcb->eventinj = eventinj;
+
+- if ( _event.vector == TRAP_page_fault )
+- {
+- curr->arch.hvm_vcpu.guest_cr[2] = _event.cr2;
+- vmcb_set_cr2(vmcb, _event.cr2);
+- HVMTRACE_LONG_2D(PF_INJECT, _event.error_code, TRC_PAR_LONG(_event.cr2));
+- }
++ if ( _event.vector == TRAP_page_fault &&
++ _event.type == X86_EVENTTYPE_HW_EXCEPTION )
++ HVMTRACE_LONG_2D(PF_INJECT, _event.error_code,
++ TRC_PAR_LONG(_event.cr2));
+ else
+- {
+ HVMTRACE_2D(INJ_EXC, _event.vector, _event.error_code);
+- }
+ }
+
+ static int svm_event_pending(struct vcpu *v)
+@@ -1836,6 +1835,25 @@ static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
+ struct vcpu *v = current;
+ struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+
++ switch ( msr )
++ {
++ /*
++ * Sync not needed while the cross-vendor logic is in unilateral effect.
++ case MSR_IA32_SYSENTER_CS:
++ case MSR_IA32_SYSENTER_ESP:
++ case MSR_IA32_SYSENTER_EIP:
++ */
++ case MSR_STAR:
++ case MSR_LSTAR:
++ case MSR_CSTAR:
++ case MSR_SYSCALL_MASK:
++ case MSR_FS_BASE:
++ case MSR_GS_BASE:
++ case MSR_SHADOW_GS_BASE:
++ svm_sync_vmcb(v);
++ break;
++ }
++
+ switch ( msr )
+ {
+ case MSR_IA32_SYSENTER_CS:
+@@ -1848,6 +1866,34 @@ static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
+ *msr_content = v->arch.hvm_svm.guest_sysenter_eip;
+ break;
+
++ case MSR_STAR:
++ *msr_content = vmcb->star;
++ break;
++
++ case MSR_LSTAR:
++ *msr_content = vmcb->lstar;
++ break;
++
++ case MSR_CSTAR:
++ *msr_content = vmcb->cstar;
++ break;
++
++ case MSR_SYSCALL_MASK:
++ *msr_content = vmcb->sfmask;
++ break;
++
++ case MSR_FS_BASE:
++ *msr_content = vmcb->fs.base;
++ break;
++
++ case MSR_GS_BASE:
++ *msr_content = vmcb->gs.base;
++ break;
++
++ case MSR_SHADOW_GS_BASE:
++ *msr_content = vmcb->kerngsbase;
++ break;
++
+ case MSR_IA32_MCx_MISC(4): /* Threshold register */
+ case MSR_F10_MC4_MISC1 ... MSR_F10_MC4_MISC3:
+ /*
+@@ -1976,32 +2022,81 @@ static int svm_msr_write_intercept(unsigned int msr, uint64_t msr_content)
+ int ret, result = X86EMUL_OKAY;
+ struct vcpu *v = current;
+ struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+- int sync = 0;
++ bool sync = false;
+
+ switch ( msr )
+ {
+ case MSR_IA32_SYSENTER_CS:
+ case MSR_IA32_SYSENTER_ESP:
+ case MSR_IA32_SYSENTER_EIP:
+- sync = 1;
+- break;
+- default:
++ case MSR_STAR:
++ case MSR_LSTAR:
++ case MSR_CSTAR:
++ case MSR_SYSCALL_MASK:
++ case MSR_FS_BASE:
++ case MSR_GS_BASE:
++ case MSR_SHADOW_GS_BASE:
++ sync = true;
+ break;
+ }
+
+ if ( sync )
+- svm_sync_vmcb(v);
++ svm_sync_vmcb(v);
+
+ switch ( msr )
+ {
++ case MSR_IA32_SYSENTER_ESP:
++ case MSR_IA32_SYSENTER_EIP:
++ case MSR_LSTAR:
++ case MSR_CSTAR:
++ case MSR_FS_BASE:
++ case MSR_GS_BASE:
++ case MSR_SHADOW_GS_BASE:
++ if ( !is_canonical_address(msr_content) )
++ goto gpf;
++
++ switch ( msr )
++ {
++ case MSR_IA32_SYSENTER_ESP:
++ vmcb->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp = msr_content;
++ break;
++
++ case MSR_IA32_SYSENTER_EIP:
++ vmcb->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip = msr_content;
++ break;
++
++ case MSR_LSTAR:
++ vmcb->lstar = msr_content;
++ break;
++
++ case MSR_CSTAR:
++ vmcb->cstar = msr_content;
++ break;
++
++ case MSR_FS_BASE:
++ vmcb->fs.base = msr_content;
++ break;
++
++ case MSR_GS_BASE:
++ vmcb->gs.base = msr_content;
++ break;
++
++ case MSR_SHADOW_GS_BASE:
++ vmcb->kerngsbase = msr_content;
++ break;
++ }
++ break;
++
+ case MSR_IA32_SYSENTER_CS:
+ vmcb->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs = msr_content;
+ break;
+- case MSR_IA32_SYSENTER_ESP:
+- vmcb->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp = msr_content;
++
++ case MSR_STAR:
++ vmcb->star = msr_content;
+ break;
+- case MSR_IA32_SYSENTER_EIP:
+- vmcb->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip = msr_content;
++
++ case MSR_SYSCALL_MASK:
++ vmcb->sfmask = msr_content;
+ break;
+
+ case MSR_IA32_DEBUGCTLMSR:
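
The widened svm_msr_write_intercept() above now synchronises the VMCB for the syscall/segment-base MSRs and rejects non-canonical values with #GP before writing them. The following self-contained sketch shows a 48-bit canonicality check of the kind is_canonical_address() performs; it assumes 4-level paging, and the test harness is illustrative only.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Canonical for 4-level paging <=> bits 63:48 are copies of bit 47.
 * Like the hypervisor code, this relies on arithmetic right shift of
 * signed values, which mainstream x86 compilers provide.
 */
static bool is_canonical_address_48(uint64_t addr)
{
    return ((int64_t)addr >> 47) == ((int64_t)addr >> 63);
}

int main(void)
{
    const uint64_t vals[] = {
        0x00007fffffffffffULL,   /* highest canonical "user" address */
        0xffff800000000000ULL,   /* lowest canonical "kernel" address */
        0x0000800000000000ULL,   /* non-canonical: the new paths raise #GP */
    };

    for ( unsigned int i = 0; i < sizeof(vals) / sizeof(vals[0]); i++ )
        printf("%#018llx -> %s\n", (unsigned long long)vals[i],
               is_canonical_address_48(vals[i]) ? "canonical" : "#GP");

    return 0;
}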
+diff --git a/xen/arch/x86/hvm/svm/svmdebug.c b/xen/arch/x86/hvm/svm/svmdebug.c
+index 89ef2db932..b5b946aa94 100644
+--- a/xen/arch/x86/hvm/svm/svmdebug.c
++++ b/xen/arch/x86/hvm/svm/svmdebug.c
+@@ -131,9 +131,8 @@ bool svm_vmcb_isvalid(const char *from, const struct vmcb_struct *vmcb,
+ PRINTF("DR7: bits [63:32] are not zero (%#"PRIx64")\n",
+ vmcb_get_dr7(vmcb));
+
+- if ( efer & ~(EFER_SCE | EFER_LME | EFER_LMA | EFER_NX | EFER_SVME |
+- EFER_LMSLE | EFER_FFXSE) )
+- PRINTF("EFER: undefined bits are not zero (%#"PRIx64")\n", efer);
++ if ( efer & ~EFER_KNOWN_MASK )
++ PRINTF("EFER: unknown bits are not zero (%#"PRIx64")\n", efer);
+
+ if ( hvm_efer_valid(v, efer, -1) )
+ PRINTF("EFER: %s (%"PRIx64")\n", hvm_efer_valid(v, efer, -1), efer);
+diff --git a/xen/arch/x86/hvm/svm/vmcb.c b/xen/arch/x86/hvm/svm/vmcb.c
+index 997e7597e0..612ced2f0d 100644
+--- a/xen/arch/x86/hvm/svm/vmcb.c
++++ b/xen/arch/x86/hvm/svm/vmcb.c
+@@ -178,8 +178,8 @@ static int construct_vmcb(struct vcpu *v)
+ paging_update_paging_modes(v);
+
+ vmcb->_exception_intercepts =
+- HVM_TRAP_MASK
+- | (1U << TRAP_no_device);
++ HVM_TRAP_MASK |
++ (v->arch.fully_eager_fpu ? 0 : (1U << TRAP_no_device));
+
+ if ( paging_mode_hap(v->domain) )
+ {
+diff --git a/xen/arch/x86/hvm/viridian.c b/xen/arch/x86/hvm/viridian.c
+index f0fa59d7d5..b02a70d086 100644
+--- a/xen/arch/x86/hvm/viridian.c
++++ b/xen/arch/x86/hvm/viridian.c
+@@ -245,7 +245,7 @@ void cpuid_viridian_leaves(const struct vcpu *v, uint32_t leaf,
+ };
+ union {
+ HV_PARTITION_PRIVILEGE_MASK mask;
+- uint32_t lo, hi;
++ struct { uint32_t lo, hi; };
+ } u;
+
+ if ( !(viridian_feature_mask(d) & HVMPV_no_freq) )
+@@ -966,12 +966,10 @@ int viridian_hypercall(struct cpu_user_regs *regs)
+ gprintk(XENLOG_WARNING, "unimplemented hypercall %04x\n",
+ input.call_code);
+ /* Fallthrough. */
+- case HvGetPartitionId:
+ case HvExtCallQueryCapabilities:
+ /*
+- * These hypercalls seem to be erroneously issued by Windows
+- * despite neither AccessPartitionId nor EnableExtendedHypercalls
+- * being set in CPUID leaf 2.
++ * This hypercall seems to be erroneously issued by Windows
++ * despite EnableExtendedHypercalls not being set in CPUID leaf 2.
+          * Given that returning a status of 'invalid code' has not so far
+          * caused any problems, it's not worth logging.
+ */
+diff --git a/xen/arch/x86/hvm/vmx/entry.S b/xen/arch/x86/hvm/vmx/entry.S
+index e750544b4b..c6504e3207 100644
+--- a/xen/arch/x86/hvm/vmx/entry.S
++++ b/xen/arch/x86/hvm/vmx/entry.S
+@@ -38,9 +38,18 @@ ENTRY(vmx_asm_vmexit_handler)
+ movb $1,VCPU_vmx_launched(%rbx)
+ mov %rax,VCPU_hvm_guest_cr2(%rbx)
+
+- SPEC_CTRL_ENTRY_FROM_VMEXIT /* Req: b=curr %rsp=regs/cpuinfo, Clob: acd */
++ SPEC_CTRL_ENTRY_FROM_HVM /* Req: b=curr %rsp=regs/cpuinfo, Clob: acd */
+ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
+
++ /* Hardware clears MSR_DEBUGCTL on VMExit. Reinstate it if debugging Xen. */
++ .macro restore_lbr
++ mov $IA32_DEBUGCTLMSR_LBR, %eax
++ mov $MSR_IA32_DEBUGCTLMSR, %ecx
++ xor %edx, %edx
++ wrmsr
++ .endm
++ ALTERNATIVE __stringify(ASM_NOP14), restore_lbr, X86_FEATURE_XEN_LBR
++
+ mov %rsp,%rdi
+ call vmx_vmexit_handler
+
+@@ -76,7 +85,7 @@ UNLIKELY_END(realmode)
+ mov VCPUMSR_spec_ctrl_raw(%rax), %eax
+
+ /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
+- SPEC_CTRL_EXIT_TO_GUEST /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */
++ SPEC_CTRL_EXIT_TO_HVM /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */
+
+ mov VCPU_hvm_guest_cr2(%rbx),%rax
+
+diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
+index a642118eab..2b223a16fe 100644
+--- a/xen/arch/x86/hvm/vmx/vmcs.c
++++ b/xen/arch/x86/hvm/vmx/vmcs.c
+@@ -38,6 +38,7 @@
+ #include <asm/flushtlb.h>
+ #include <asm/monitor.h>
+ #include <asm/shadow.h>
++#include <asm/spec_ctrl.h>
+ #include <asm/tboot.h>
+ #include <asm/apic.h>
+
+@@ -999,6 +1000,7 @@ static int construct_vmcs(struct vcpu *v)
+ unsigned long sysenter_eip;
+ u32 vmexit_ctl = vmx_vmexit_control;
+ u32 vmentry_ctl = vmx_vmentry_control;
++ int rc = 0;
+
+ vmx_vmcs_enter(v);
+
+@@ -1086,8 +1088,8 @@ static int construct_vmcs(struct vcpu *v)
+
+ if ( msr_bitmap == NULL )
+ {
+- vmx_vmcs_exit(v);
+- return -ENOMEM;
++ rc = -ENOMEM;
++ goto out;
+ }
+
+ memset(msr_bitmap, ~0, PAGE_SIZE);
+@@ -1146,7 +1148,9 @@ static int construct_vmcs(struct vcpu *v)
+ __vmwrite(HOST_GS_BASE, 0);
+
+ /* Host control registers. */
+- v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS;
++ v->arch.hvm_vmx.host_cr0 = read_cr0() & ~X86_CR0_TS;
++ if ( !v->arch.fully_eager_fpu )
++ v->arch.hvm_vmx.host_cr0 |= X86_CR0_TS;
+ __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
+ __vmwrite(HOST_CR4, mmu_cr4_features);
+
+@@ -1226,7 +1230,7 @@ static int construct_vmcs(struct vcpu *v)
+
+ v->arch.hvm_vmx.exception_bitmap = HVM_TRAP_MASK
+ | (paging_mode_hap(d) ? 0 : (1U << TRAP_page_fault))
+- | (1U << TRAP_no_device);
++ | (v->arch.fully_eager_fpu ? 0 : (1U << TRAP_no_device));
+ vmx_update_exception_bitmap(v);
+
+ v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_PE | X86_CR0_ET;
+@@ -1269,141 +1273,197 @@ static int construct_vmcs(struct vcpu *v)
+ if ( cpu_has_vmx_tsc_scaling )
+ __vmwrite(TSC_MULTIPLIER, d->arch.hvm_domain.tsc_scaling_ratio);
+
+- vmx_vmcs_exit(v);
+-
+ /* will update HOST & GUEST_CR3 as reqd */
+ paging_update_paging_modes(v);
+
+ vmx_vlapic_msr_changed(v);
+
+- return 0;
++ if ( opt_l1d_flush && paging_mode_hap(d) )
++ rc = vmx_add_msr(v, MSR_FLUSH_CMD, FLUSH_CMD_L1D,
++ VMX_MSR_GUEST_LOADONLY);
++
++ out:
++ vmx_vmcs_exit(v);
++
++ return rc;
+ }
+
+-static int vmx_msr_entry_key_cmp(const void *key, const void *elt)
++/*
++ * Search an MSR list looking for an MSR entry, or the slot in which it should
++ * live (to keep the data sorted) if an entry is not found.
++ *
++ * The return pointer is guaranteed to be bounded by start and end. However,
++ * it may point at end, and may be invalid for the caller to dereference.
++ */
++static struct vmx_msr_entry *locate_msr_entry(
++ struct vmx_msr_entry *start, struct vmx_msr_entry *end, uint32_t msr)
+ {
+- const u32 *msr = key;
+- const struct vmx_msr_entry *entry = elt;
++ while ( start < end )
++ {
++ struct vmx_msr_entry *mid = start + (end - start) / 2;
+
+- if ( *msr > entry->index )
+- return 1;
+- if ( *msr < entry->index )
+- return -1;
++ if ( msr < mid->index )
++ end = mid;
++ else if ( msr > mid->index )
++ start = mid + 1;
++ else
++ return mid;
++ }
+
+- return 0;
++ return start;
+ }
+
+-struct vmx_msr_entry *vmx_find_msr(u32 msr, int type)
++struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
++ enum vmx_msr_list_type type)
+ {
+- struct vcpu *curr = current;
+- unsigned int msr_count;
+- struct vmx_msr_entry *msr_area;
++ const struct arch_vmx_struct *vmx = &v->arch.hvm_vmx;
++ struct vmx_msr_entry *start = NULL, *ent, *end;
++ unsigned int substart = 0, subend = vmx->msr_save_count;
++ unsigned int total = vmx->msr_load_count;
+
+- if ( type == VMX_GUEST_MSR )
+- {
+- msr_count = curr->arch.hvm_vmx.msr_count;
+- msr_area = curr->arch.hvm_vmx.msr_area;
+- }
+- else
++ ASSERT(v == current || !vcpu_runnable(v));
++
++ switch ( type )
+ {
+- ASSERT(type == VMX_HOST_MSR);
+- msr_count = curr->arch.hvm_vmx.host_msr_count;
+- msr_area = curr->arch.hvm_vmx.host_msr_area;
++ case VMX_MSR_HOST:
++ start = vmx->host_msr_area;
++ subend = vmx->host_msr_count;
++ total = subend;
++ break;
++
++ case VMX_MSR_GUEST:
++ start = vmx->msr_area;
++ break;
++
++ case VMX_MSR_GUEST_LOADONLY:
++ start = vmx->msr_area;
++ substart = subend;
++ subend = total;
++ break;
++
++ default:
++ ASSERT_UNREACHABLE();
+ }
+
+- if ( msr_area == NULL )
++ if ( !start )
+ return NULL;
+
+- return bsearch(&msr, msr_area, msr_count, sizeof(struct vmx_msr_entry),
+- vmx_msr_entry_key_cmp);
++ end = start + total;
++ ent = locate_msr_entry(start + substart, start + subend, msr);
++
++ return ((ent < end) && (ent->index == msr)) ? ent : NULL;
+ }
+
+-int vmx_read_guest_msr(u32 msr, u64 *val)
++int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val,
++ enum vmx_msr_list_type type)
+ {
+- struct vmx_msr_entry *ent;
++ struct arch_vmx_struct *vmx = &v->arch.hvm_vmx;
++ struct vmx_msr_entry **ptr, *start = NULL, *ent, *end;
++ unsigned int substart, subend, total;
++ int rc;
++
++ ASSERT(v == current || !vcpu_runnable(v));
+
+- if ( (ent = vmx_find_msr(msr, VMX_GUEST_MSR)) != NULL )
++ switch ( type )
+ {
+- *val = ent->data;
+- return 0;
+- }
++ case VMX_MSR_HOST:
++ ptr = &vmx->host_msr_area;
++ substart = 0;
++ subend = vmx->host_msr_count;
++ total = subend;
++ break;
+
+- return -ESRCH;
+-}
++ case VMX_MSR_GUEST:
++ ptr = &vmx->msr_area;
++ substart = 0;
++ subend = vmx->msr_save_count;
++ total = vmx->msr_load_count;
++ break;
+
+-int vmx_write_guest_msr(u32 msr, u64 val)
+-{
+- struct vmx_msr_entry *ent;
++ case VMX_MSR_GUEST_LOADONLY:
++ ptr = &vmx->msr_area;
++ substart = vmx->msr_save_count;
++ subend = vmx->msr_load_count;
++ total = subend;
++ break;
+
+- if ( (ent = vmx_find_msr(msr, VMX_GUEST_MSR)) != NULL )
+- {
+- ent->data = val;
+- return 0;
++ default:
++ ASSERT_UNREACHABLE();
++ return -EINVAL;
+ }
+
+- return -ESRCH;
+-}
+-
+-int vmx_add_msr(u32 msr, int type)
+-{
+- struct vcpu *curr = current;
+- unsigned int idx, *msr_count;
+- struct vmx_msr_entry **msr_area, *msr_area_elem;
++ vmx_vmcs_enter(v);
+
+- if ( type == VMX_GUEST_MSR )
++ /* Allocate memory on first use. */
++ if ( unlikely(!*ptr) )
+ {
+- msr_count = &curr->arch.hvm_vmx.msr_count;
+- msr_area = &curr->arch.hvm_vmx.msr_area;
+- }
+- else
+- {
+- ASSERT(type == VMX_HOST_MSR);
+- msr_count = &curr->arch.hvm_vmx.host_msr_count;
+- msr_area = &curr->arch.hvm_vmx.host_msr_area;
+- }
++ paddr_t addr;
+
+- if ( *msr_area == NULL )
+- {
+- if ( (*msr_area = alloc_xenheap_page()) == NULL )
+- return -ENOMEM;
++ if ( (*ptr = alloc_xenheap_page()) == NULL )
++ {
++ rc = -ENOMEM;
++ goto out;
++ }
++
++ addr = virt_to_maddr(*ptr);
+
+- if ( type == VMX_GUEST_MSR )
++ switch ( type )
+ {
+- __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(*msr_area));
+- __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
++ case VMX_MSR_HOST:
++ __vmwrite(VM_EXIT_MSR_LOAD_ADDR, addr);
++ break;
++
++ case VMX_MSR_GUEST:
++ case VMX_MSR_GUEST_LOADONLY:
++ __vmwrite(VM_EXIT_MSR_STORE_ADDR, addr);
++ __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, addr);
++ break;
+ }
+- else
+- __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
+ }
+
+- for ( idx = 0; idx < *msr_count && (*msr_area)[idx].index <= msr; idx++ )
+- if ( (*msr_area)[idx].index == msr )
+- return 0;
++ start = *ptr;
++ end = start + total;
++ ent = locate_msr_entry(start + substart, start + subend, msr);
+
+- if ( *msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
+- return -ENOSPC;
++ if ( (ent < end) && (ent->index == msr) )
++ goto found;
+
+- memmove(*msr_area + idx + 1, *msr_area + idx,
+- sizeof(*msr_area_elem) * (*msr_count - idx));
++ /* If there isn't an existing entry for msr, insert room for one. */
++ if ( total == (PAGE_SIZE / sizeof(*ent)) )
++ {
++ rc = -ENOSPC;
++ goto out;
++ }
+
+- msr_area_elem = *msr_area + idx;
+- msr_area_elem->index = msr;
+- msr_area_elem->mbz = 0;
++ memmove(ent + 1, ent, sizeof(*ent) * (end - ent));
+
+- ++*msr_count;
++ ent->index = msr;
++ ent->mbz = 0;
+
+- if ( type == VMX_GUEST_MSR )
++ switch ( type )
+ {
+- msr_area_elem->data = 0;
+- __vmwrite(VM_EXIT_MSR_STORE_COUNT, *msr_count);
+- __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, *msr_count);
+- }
+- else
+- {
+- rdmsrl(msr, msr_area_elem->data);
+- __vmwrite(VM_EXIT_MSR_LOAD_COUNT, *msr_count);
++ case VMX_MSR_HOST:
++ __vmwrite(VM_EXIT_MSR_LOAD_COUNT, ++vmx->host_msr_count);
++ break;
++
++ case VMX_MSR_GUEST:
++ __vmwrite(VM_EXIT_MSR_STORE_COUNT, ++vmx->msr_save_count);
++
++ /* Fallthrough */
++ case VMX_MSR_GUEST_LOADONLY:
++ __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, ++vmx->msr_load_count);
++ break;
+ }
+
+- return 0;
++ /* Set the msr's value. */
++ found:
++ ent->data = val;
++ rc = 0;
++
++ out:
++ vmx_vmcs_exit(v);
++
++ return rc;
+ }
+
+ void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector)
+@@ -1784,10 +1844,7 @@ void vmcs_dump_vcpu(struct vcpu *v)
+ vmentry_ctl = vmr32(VM_ENTRY_CONTROLS),
+ vmexit_ctl = vmr32(VM_EXIT_CONTROLS);
+ cr4 = vmr(GUEST_CR4);
+-
+- /* EFER.LMA is read as zero, and is loaded from vmentry_ctl on entry. */
+- BUILD_BUG_ON(VM_ENTRY_IA32E_MODE << 1 != EFER_LMA);
+- efer = vmr(GUEST_EFER) | ((vmentry_ctl & VM_ENTRY_IA32E_MODE) << 1);
++ efer = vmr(GUEST_EFER);
+
+ printk("*** Guest State ***\n");
+ printk("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+@@ -1797,7 +1854,7 @@ void vmcs_dump_vcpu(struct vcpu *v)
+ printk("CR3 = 0x%016lx\n", vmr(GUEST_CR3));
+ if ( (v->arch.hvm_vmx.secondary_exec_control &
+ SECONDARY_EXEC_ENABLE_EPT) &&
+- (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA) )
++ (cr4 & X86_CR4_PAE) && !(vmentry_ctl & VM_ENTRY_IA32E_MODE) )
+ {
+ printk("PDPTE0 = 0x%016lx PDPTE1 = 0x%016lx\n",
+ vmr(GUEST_PDPTE(0)), vmr(GUEST_PDPTE(1)));
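
The rewritten MSR-list handling above keeps each load/save list sorted by MSR index and uses locate_msr_entry() to find either the matching entry or the slot where a new entry must be inserted. The sketch below lifts that binary search into a stand-alone program; struct vmx_msr_entry is reduced to the fields the search needs and the sample list in main() is made up.

#include <stdint.h>
#include <stdio.h>

/* Reduced to the fields the search needs; Xen's entry also has an mbz field. */
struct vmx_msr_entry {
    uint32_t index;
    uint64_t data;
};

/*
 * Returns the matching entry, or the slot where 'msr' should be inserted to
 * keep the list sorted (possibly 'end', which must not be dereferenced).
 */
static struct vmx_msr_entry *locate_msr_entry(struct vmx_msr_entry *start,
                                              struct vmx_msr_entry *end,
                                              uint32_t msr)
{
    while ( start < end )
    {
        struct vmx_msr_entry *mid = start + (end - start) / 2;

        if ( msr < mid->index )
            end = mid;
        else if ( msr > mid->index )
            start = mid + 1;
        else
            return mid;
    }

    return start;
}

int main(void)
{
    struct vmx_msr_entry list[] = {
        { 0x174, 0 }, { 0x175, 0 }, { 0x176, 0 }, { 0xc0000081, 0 },
    };
    struct vmx_msr_entry *end = list + sizeof(list) / sizeof(list[0]);
    uint32_t msr = 0x1d9;   /* MSR_IA32_DEBUGCTLMSR, not in the sample list */
    struct vmx_msr_entry *ent = locate_msr_entry(list, end, msr);

    if ( ent < end && ent->index == msr )
        printf("MSR %#x found\n", msr);
    else
        printf("MSR %#x would be inserted at slot %td\n", msr, ent - list);

    return 0;
}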
+diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
+index fc505c8cf7..508882e8d2 100644
+--- a/xen/arch/x86/hvm/vmx/vmx.c
++++ b/xen/arch/x86/hvm/vmx/vmx.c
+@@ -682,6 +682,12 @@ static void vmx_cpuid_policy_changed(struct vcpu *v)
+ vmx_clear_msr_intercept(v, MSR_PRED_CMD, VMX_MSR_RW);
+ else
+ vmx_set_msr_intercept(v, MSR_PRED_CMD, VMX_MSR_RW);
++
++ /* MSR_FLUSH_CMD is safe to pass through if the guest knows about it. */
++ if ( cp->feat.l1d_flush )
++ vmx_clear_msr_intercept(v, MSR_FLUSH_CMD, VMX_MSR_RW);
++ else
++ vmx_set_msr_intercept(v, MSR_FLUSH_CMD, VMX_MSR_RW);
+ }
+
+ int vmx_guest_x86_mode(struct vcpu *v)
+@@ -1035,7 +1041,8 @@ static void vmx_ctxt_switch_from(struct vcpu *v)
+ vmx_vmcs_reload(v);
+ }
+
+- vmx_fpu_leave(v);
++ if ( !v->arch.fully_eager_fpu )
++ vmx_fpu_leave(v);
+ vmx_save_guest_msrs(v);
+ vmx_restore_host_msrs();
+ vmx_save_dr(v);
+@@ -1597,7 +1604,10 @@ static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
+ if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
+ {
+ if ( v != current )
+- hw_cr0_mask |= X86_CR0_TS;
++ {
++ if ( !v->arch.fully_eager_fpu )
++ hw_cr0_mask |= X86_CR0_TS;
++ }
+ else if ( v->arch.hvm_vcpu.hw_cr[0] & X86_CR0_TS )
+ vmx_fpu_enter(v);
+ }
+@@ -2835,8 +2845,10 @@ enum
+
+ #define LBR_FROM_SIGNEXT_2MSB ((1ULL << 59) | (1ULL << 60))
+
+-#define FIXUP_LBR_TSX (1u << 0)
+-#define FIXUP_BDW_ERRATUM_BDF14 (1u << 1)
++#define LBR_MSRS_INSERTED (1u << 0)
++#define LBR_FIXUP_TSX (1u << 1)
++#define LBR_FIXUP_BDF14 (1u << 2)
++#define LBR_FIXUP_MASK (LBR_FIXUP_TSX | LBR_FIXUP_BDF14)
+
+ static bool __read_mostly lbr_tsx_fixup_needed;
+ static bool __read_mostly bdw_erratum_bdf14_fixup_needed;
+@@ -2899,7 +2911,7 @@ static int is_last_branch_msr(u32 ecx)
+
+ static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
+ {
+- const struct vcpu *curr = current;
++ struct vcpu *curr = current;
+
+ HVM_DBG_LOG(DBG_LEVEL_MSR, "ecx=%#x", msr);
+
+@@ -2958,7 +2970,7 @@ static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
+ goto done;
+ }
+
+- if ( vmx_read_guest_msr(msr, msr_content) == 0 )
++ if ( vmx_read_guest_msr(curr, msr, msr_content) == 0 )
+ break;
+
+ if ( is_last_branch_msr(msr) )
+@@ -3093,11 +3105,14 @@ void vmx_vlapic_msr_changed(struct vcpu *v)
+ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
+ {
+ struct vcpu *v = current;
++ const struct cpuid_policy *cp = v->domain->arch.cpuid;
+
+ HVM_DBG_LOG(DBG_LEVEL_MSR, "ecx=%#x, msr_value=%#"PRIx64, msr, msr_content);
+
+ switch ( msr )
+ {
++ uint64_t rsvd;
++
+ case MSR_IA32_SYSENTER_CS:
+ __vmwrite(GUEST_SYSENTER_CS, msr_content);
+ break;
+@@ -3111,45 +3126,85 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
+ goto gp_fault;
+ __vmwrite(GUEST_SYSENTER_EIP, msr_content);
+ break;
+- case MSR_IA32_DEBUGCTLMSR: {
+- int i, rc = 0;
+- uint64_t supported = IA32_DEBUGCTLMSR_LBR | IA32_DEBUGCTLMSR_BTF;
++ case MSR_IA32_DEBUGCTLMSR:
++ rsvd = ~(IA32_DEBUGCTLMSR_LBR | IA32_DEBUGCTLMSR_BTF);
+
+- if ( boot_cpu_has(X86_FEATURE_RTM) )
+- supported |= IA32_DEBUGCTLMSR_RTM;
+- if ( msr_content & ~supported )
++ /* TODO: Wire vPMU settings properly through the CPUID policy */
++ if ( vpmu_is_set(vcpu_vpmu(v), VPMU_CPU_HAS_BTS) )
+ {
+- /* Perhaps some other bits are supported in vpmu. */
+- if ( vpmu_do_wrmsr(msr, msr_content, supported) )
+- break;
++ rsvd &= ~(IA32_DEBUGCTLMSR_TR | IA32_DEBUGCTLMSR_BTS |
++ IA32_DEBUGCTLMSR_BTINT);
++
++ if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
++ rsvd &= ~(IA32_DEBUGCTLMSR_BTS_OFF_OS |
++ IA32_DEBUGCTLMSR_BTS_OFF_USR);
+ }
+- if ( msr_content & IA32_DEBUGCTLMSR_LBR )
++
++ if ( cp->feat.rtm )
++ rsvd &= ~IA32_DEBUGCTLMSR_RTM;
++
++ if ( msr_content & rsvd )
++ goto gp_fault;
++
++ /*
++ * When a guest first enables LBR, arrange to save and restore the LBR
++ * MSRs and allow the guest direct access.
++ *
++     * MSR_DEBUGCTL and LBR have existed almost as long as MSRs have
++ * existed, and there is no architectural way to hide the feature, or
++ * fail the attempt to enable LBR.
++ *
++ * Unknown host LBR MSRs or hitting -ENOSPC with the guest load/save
++ * list are definitely hypervisor bugs, whereas -ENOMEM for allocating
++ * the load/save list is simply unlucky (and shouldn't occur with
++ * sensible management by the toolstack).
++ *
++ * Either way, there is nothing we can do right now to recover, and
++ * the guest won't execute correctly either. Simply crash the domain
++ * to make the failure obvious.
++ */
++ if ( !(v->arch.hvm_vmx.lbr_flags & LBR_MSRS_INSERTED) &&
++ (msr_content & IA32_DEBUGCTLMSR_LBR) )
+ {
+ const struct lbr_info *lbr = last_branch_msr_get();
+- if ( lbr == NULL )
+- break;
+
+- for ( ; (rc == 0) && lbr->count; lbr++ )
+- for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
+- if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 )
++ if ( unlikely(!lbr) )
++ {
++ gprintk(XENLOG_ERR, "Unknown Host LBR MSRs\n");
++ domain_crash(v->domain);
++ return X86EMUL_OKAY;
++ }
++
++ for ( ; lbr->count; lbr++ )
++ {
++ unsigned int i;
++
++ for ( i = 0; i < lbr->count; i++ )
++ {
++ int rc = vmx_add_guest_msr(v, lbr->base + i, 0);
++
++ if ( unlikely(rc) )
+ {
+- vmx_clear_msr_intercept(v, lbr->base + i, VMX_MSR_RW);
+- if ( lbr_tsx_fixup_needed )
+- v->arch.hvm_vmx.lbr_fixup_enabled |= FIXUP_LBR_TSX;
+- if ( bdw_erratum_bdf14_fixup_needed )
+- v->arch.hvm_vmx.lbr_fixup_enabled |=
+- FIXUP_BDW_ERRATUM_BDF14;
++ gprintk(XENLOG_ERR,
++ "Guest load/save list error %d\n", rc);
++ domain_crash(v->domain);
++ return X86EMUL_OKAY;
+ }
+- }
+
+- if ( (rc < 0) ||
+- (msr_content && (vmx_add_host_load_msr(msr) < 0)) )
+- hvm_inject_hw_exception(TRAP_machine_check, X86_EVENT_NO_EC);
+- else
+- __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
++ vmx_clear_msr_intercept(v, lbr->base + i, VMX_MSR_RW);
++ }
++ }
++
++ v->arch.hvm_vmx.lbr_flags |= LBR_MSRS_INSERTED;
++ if ( lbr_tsx_fixup_needed )
++ v->arch.hvm_vmx.lbr_flags |= LBR_FIXUP_TSX;
++ if ( bdw_erratum_bdf14_fixup_needed )
++ v->arch.hvm_vmx.lbr_flags |= LBR_FIXUP_BDF14;
++ }
+
++ __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
+ break;
+- }
++
+ case MSR_IA32_FEATURE_CONTROL:
+ case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ /* None of these MSRs are writeable. */
+@@ -3175,7 +3230,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
+ switch ( long_mode_do_msr_write(msr, msr_content) )
+ {
+ case HNDL_unhandled:
+- if ( (vmx_write_guest_msr(msr, msr_content) != 0) &&
++ if ( (vmx_write_guest_msr(v, msr, msr_content) != 0) &&
+ !is_last_branch_msr(msr) )
+ switch ( wrmsr_hypervisor_regs(msr, msr_content) )
+ {
+@@ -3717,6 +3772,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
+ */
+ __vmread(EXIT_QUALIFICATION, &exit_qualification);
+ HVMTRACE_1D(TRAP_DEBUG, exit_qualification);
++ __restore_debug_registers(v);
+ write_debugreg(6, exit_qualification | DR_STATUS_RESERVED_ONE);
+ if ( !v->domain->debugger_attached )
+ {
+@@ -4186,11 +4242,11 @@ out:
+ static void lbr_tsx_fixup(void)
+ {
+ struct vcpu *curr = current;
+- unsigned int msr_count = curr->arch.hvm_vmx.msr_count;
++ unsigned int msr_count = curr->arch.hvm_vmx.msr_save_count;
+ struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
+ struct vmx_msr_entry *msr;
+
+- if ( (msr = vmx_find_msr(lbr_from_start, VMX_GUEST_MSR)) != NULL )
++ if ( (msr = vmx_find_msr(curr, lbr_from_start, VMX_MSR_GUEST)) != NULL )
+ {
+ /*
+ * Sign extend into bits 61:62 while preserving bit 63
+@@ -4200,15 +4256,15 @@ static void lbr_tsx_fixup(void)
+ msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2);
+ }
+
+- if ( (msr = vmx_find_msr(lbr_lastint_from, VMX_GUEST_MSR)) != NULL )
++ if ( (msr = vmx_find_msr(curr, lbr_lastint_from, VMX_MSR_GUEST)) != NULL )
+ msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2);
+ }
+
+-static void sign_extend_msr(u32 msr, int type)
++static void sign_extend_msr(struct vcpu *v, u32 msr, int type)
+ {
+ struct vmx_msr_entry *entry;
+
+- if ( (entry = vmx_find_msr(msr, type)) != NULL )
++ if ( (entry = vmx_find_msr(v, msr, type)) != NULL )
+ {
+ if ( entry->data & VADDR_TOP_BIT )
+ entry->data |= CANONICAL_MASK;
+@@ -4219,6 +4275,8 @@ static void sign_extend_msr(u32 msr, int type)
+
+ static void bdw_erratum_bdf14_fixup(void)
+ {
++ struct vcpu *curr = current;
++
+ /*
+ * Occasionally, on certain Broadwell CPUs MSR_IA32_LASTINTTOIP has
+ * been observed to have the top three bits corrupted as though the
+@@ -4228,17 +4286,17 @@ static void bdw_erratum_bdf14_fixup(void)
+ * erratum BDF14. Fix up MSR_IA32_LASTINT{FROM,TO}IP by
+ * sign-extending into bits 48:63.
+ */
+- sign_extend_msr(MSR_IA32_LASTINTFROMIP, VMX_GUEST_MSR);
+- sign_extend_msr(MSR_IA32_LASTINTTOIP, VMX_GUEST_MSR);
++ sign_extend_msr(curr, MSR_IA32_LASTINTFROMIP, VMX_MSR_GUEST);
++ sign_extend_msr(curr, MSR_IA32_LASTINTTOIP, VMX_MSR_GUEST);
+ }
+
+ static void lbr_fixup(void)
+ {
+ struct vcpu *curr = current;
+
+- if ( curr->arch.hvm_vmx.lbr_fixup_enabled & FIXUP_LBR_TSX )
++ if ( curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_TSX )
+ lbr_tsx_fixup();
+- if ( curr->arch.hvm_vmx.lbr_fixup_enabled & FIXUP_BDW_ERRATUM_BDF14 )
++ if ( curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_BDF14 )
+ bdw_erratum_bdf14_fixup();
+ }
+
+@@ -4306,7 +4364,7 @@ bool vmx_vmenter_helper(const struct cpu_user_regs *regs)
+ }
+
+ out:
+- if ( unlikely(curr->arch.hvm_vmx.lbr_fixup_enabled) )
++ if ( unlikely(curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_MASK) )
+ lbr_fixup();
+
+ HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
+diff --git a/xen/arch/x86/hvm/vpt.c b/xen/arch/x86/hvm/vpt.c
+index 181f4cb631..04e3c2e15b 100644
+--- a/xen/arch/x86/hvm/vpt.c
++++ b/xen/arch/x86/hvm/vpt.c
+@@ -107,31 +107,49 @@ static int pt_irq_vector(struct periodic_time *pt, enum hvm_intsrc src)
+ static int pt_irq_masked(struct periodic_time *pt)
+ {
+ struct vcpu *v = pt->vcpu;
+- unsigned int gsi, isa_irq;
+- int mask;
+- uint8_t pic_imr;
++ unsigned int gsi = pt->irq;
+
+- if ( pt->source == PTSRC_lapic )
++ switch ( pt->source )
++ {
++ case PTSRC_lapic:
+ {
+ struct vlapic *vlapic = vcpu_vlapic(v);
++
+ return (!vlapic_enabled(vlapic) ||
+ (vlapic_get_reg(vlapic, APIC_LVTT) & APIC_LVT_MASKED));
+ }
+
+- isa_irq = pt->irq;
+- gsi = hvm_isa_irq_to_gsi(isa_irq);
+- pic_imr = v->domain->arch.hvm_domain.vpic[isa_irq >> 3].imr;
+- mask = vioapic_get_mask(v->domain, gsi);
+- if ( mask < 0 )
++ case PTSRC_isa:
+ {
+- dprintk(XENLOG_WARNING, "d%u: invalid GSI (%u) for platform timer\n",
+- v->domain->domain_id, gsi);
+- domain_crash(v->domain);
+- return -1;
++ uint8_t pic_imr = v->domain->arch.hvm_domain.vpic[pt->irq >> 3].imr;
++
++ /* Check if the interrupt is unmasked in the PIC. */
++ if ( !(pic_imr & (1 << (pt->irq & 7))) && vlapic_accept_pic_intr(v) )
++ return 0;
++
++ gsi = hvm_isa_irq_to_gsi(pt->irq);
++ }
++
++ /* Fallthrough to check if the interrupt is masked on the IO APIC. */
++ case PTSRC_ioapic:
++ {
++ int mask = vioapic_get_mask(v->domain, gsi);
++
++ if ( mask < 0 )
++ {
++ dprintk(XENLOG_WARNING,
++ "d%d: invalid GSI (%u) for platform timer\n",
++ v->domain->domain_id, gsi);
++ domain_crash(v->domain);
++ return -1;
++ }
++
++ return mask;
++ }
+ }
+
+- return (((pic_imr & (1 << (isa_irq & 7))) || !vlapic_accept_pic_intr(v)) &&
+- mask);
++ ASSERT_UNREACHABLE();
++ return 1;
+ }
+
+ static void pt_lock(struct periodic_time *pt)
+@@ -252,7 +270,7 @@ int pt_update_irq(struct vcpu *v)
+ struct list_head *head = &v->arch.hvm_vcpu.tm_list;
+ struct periodic_time *pt, *temp, *earliest_pt;
+ uint64_t max_lag;
+- int irq, is_lapic, pt_vector;
++ int irq, pt_vector = -1;
+
+ spin_lock(&v->arch.hvm_vcpu.tm_lock);
+
+@@ -288,29 +306,26 @@ int pt_update_irq(struct vcpu *v)
+
+ earliest_pt->irq_issued = 1;
+ irq = earliest_pt->irq;
+- is_lapic = (earliest_pt->source == PTSRC_lapic);
+
+ spin_unlock(&v->arch.hvm_vcpu.tm_lock);
+
+- /*
+- * If periodic timer interrut is handled by lapic, its vector in
+- * IRR is returned and used to set eoi_exit_bitmap for virtual
+- * interrupt delivery case. Otherwise return -1 to do nothing.
+- */
+- if ( is_lapic )
++ switch ( earliest_pt->source )
+ {
++ case PTSRC_lapic:
++ /*
++ * If periodic timer interrupt is handled by lapic, its vector in
++ * IRR is returned and used to set eoi_exit_bitmap for virtual
++ * interrupt delivery case. Otherwise return -1 to do nothing.
++ */
+ vlapic_set_irq(vcpu_vlapic(v), irq, 0);
+ pt_vector = irq;
+- }
+- else
+- {
++ break;
++
++ case PTSRC_isa:
+ hvm_isa_irq_deassert(v->domain, irq);
+ if ( platform_legacy_irq(irq) && vlapic_accept_pic_intr(v) &&
+ v->domain->arch.hvm_domain.vpic[irq >> 3].int_output )
+- {
+ hvm_isa_irq_assert(v->domain, irq, NULL);
+- pt_vector = -1;
+- }
+ else
+ {
+ pt_vector = hvm_isa_irq_assert(v->domain, irq, vioapic_get_vector);
+@@ -321,6 +336,17 @@ int pt_update_irq(struct vcpu *v)
+ if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) )
+ pt_vector = -1;
+ }
++ break;
++
++ case PTSRC_ioapic:
++ /*
++ * NB: At the moment IO-APIC routed interrupts generated by vpt devices
++ * (HPET) are edge-triggered.
++ */
++ pt_vector = hvm_ioapic_assert(v->domain, irq, false);
++ if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) )
++ pt_vector = -1;
++ break;
+ }
+
+ return pt_vector;
+@@ -418,7 +444,14 @@ void create_periodic_time(
+ struct vcpu *v, struct periodic_time *pt, uint64_t delta,
+ uint64_t period, uint8_t irq, time_cb *cb, void *data)
+ {
+- ASSERT(pt->source != 0);
++ if ( !pt->source ||
++ (pt->irq >= NR_ISAIRQS && pt->source == PTSRC_isa) ||
++ (pt->irq >= hvm_domain_irq(v->domain)->nr_gsis &&
++ pt->source == PTSRC_ioapic) )
++ {
++ ASSERT_UNREACHABLE();
++ return;
++ }
+
+ destroy_periodic_time(pt);
+
+@@ -498,7 +531,7 @@ static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v)
+ {
+ int on_list;
+
+- ASSERT(pt->source == PTSRC_isa);
++ ASSERT(pt->source == PTSRC_isa || pt->source == PTSRC_ioapic);
+
+ if ( pt->vcpu == NULL )
+ return;
+diff --git a/xen/arch/x86/i387.c b/xen/arch/x86/i387.c
+index 88452522ad..a1d128dd94 100644
+--- a/xen/arch/x86/i387.c
++++ b/xen/arch/x86/i387.c
+@@ -15,6 +15,7 @@
+ #include <asm/i387.h>
+ #include <asm/xstate.h>
+ #include <asm/asm_defns.h>
++#include <asm/spec_ctrl.h>
+
+ /*******************************/
+ /* FPU Restore Functions */
+@@ -205,13 +206,13 @@ static inline void fpu_fxsave(struct vcpu *v)
+ /* VCPU FPU Functions */
+ /*******************************/
+ /* Restore FPU state whenever VCPU is scheduled in. */
+-void vcpu_restore_fpu_eager(struct vcpu *v)
++void vcpu_restore_fpu_nonlazy(struct vcpu *v, bool need_stts)
+ {
+- ASSERT(!is_idle_vcpu(v));
+-
+ /* Restore nonlazy extended state (i.e. parts not tracked by CR0.TS). */
+- if ( !v->arch.nonlazy_xstate_used )
+- return;
++ if ( !v->arch.fully_eager_fpu && !v->arch.nonlazy_xstate_used )
++ goto maybe_stts;
++
++ ASSERT(!is_idle_vcpu(v));
+
+ /* Avoid recursion */
+ clts();
+@@ -221,17 +222,28 @@ void vcpu_restore_fpu_eager(struct vcpu *v)
+ * above) we also need to restore full state, to prevent subsequently
+ * saving state belonging to another vCPU.
+ */
+- if ( xstate_all(v) )
++ if ( v->arch.fully_eager_fpu || (v->arch.xsave_area && xstate_all(v)) )
+ {
+- fpu_xrstor(v, XSTATE_ALL);
++ if ( cpu_has_xsave )
++ fpu_xrstor(v, XSTATE_ALL);
++ else
++ fpu_fxrstor(v);
++
+ v->fpu_initialised = 1;
+ v->fpu_dirtied = 1;
++
++ /* Xen doesn't need TS set, but the guest might. */
++ need_stts = is_pv_vcpu(v) && (v->arch.pv_vcpu.ctrlreg[0] & X86_CR0_TS);
+ }
+ else
+ {
+ fpu_xrstor(v, XSTATE_NONLAZY);
+- stts();
++ need_stts = true;
+ }
++
++ maybe_stts:
++ if ( need_stts )
++ stts();
+ }
+
+ /*
+@@ -247,6 +259,8 @@ void vcpu_restore_fpu_lazy(struct vcpu *v)
+ if ( v->fpu_dirtied )
+ return;
+
++ ASSERT(!v->arch.fully_eager_fpu);
++
+ if ( cpu_has_xsave )
+ fpu_xrstor(v, XSTATE_LAZY);
+ else
+@@ -297,6 +311,8 @@ int vcpu_init_fpu(struct vcpu *v)
+ {
+ int rc;
+
++ v->arch.fully_eager_fpu = opt_eager_fpu;
++
+ if ( (rc = xstate_alloc_save_area(v)) != 0 )
+ return rc;
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index bb924e9225..f3dfe35785 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -125,6 +125,7 @@
+ #include <asm/guest.h>
+
+ #include <asm/hvm/grant_table.h>
++#include <asm/pv/domain.h>
+ #include <asm/pv/grant_table.h>
+
+ #include "pv/mm.h"
+@@ -503,12 +504,60 @@ void free_shared_domheap_page(struct page_info *page)
+
+ void make_cr3(struct vcpu *v, mfn_t mfn)
+ {
++ struct domain *d = v->domain;
++
+ v->arch.cr3 = mfn_x(mfn) << PAGE_SHIFT;
++ if ( is_pv_domain(d) && d->arch.pv_domain.pcid )
++ v->arch.cr3 |= get_pcid_bits(v, false);
++}
++
++unsigned long pv_guest_cr4_to_real_cr4(const struct vcpu *v)
++{
++ const struct domain *d = v->domain;
++ unsigned long cr4;
++
++ cr4 = v->arch.pv_vcpu.ctrlreg[4] & ~X86_CR4_DE;
++ cr4 |= mmu_cr4_features & (X86_CR4_PSE | X86_CR4_SMEP | X86_CR4_SMAP |
++ X86_CR4_OSXSAVE | X86_CR4_FSGSBASE);
++
++ if ( d->arch.pv_domain.pcid )
++ cr4 |= X86_CR4_PCIDE;
++ else if ( !d->arch.pv_domain.xpti )
++ cr4 |= X86_CR4_PGE;
++
++ cr4 |= d->arch.vtsc ? X86_CR4_TSD : 0;
++
++ return cr4;
+ }
+
+ void write_ptbase(struct vcpu *v)
+ {
+- write_cr3(v->arch.cr3);
++ struct cpu_info *cpu_info = get_cpu_info();
++ unsigned long new_cr4;
++
++ new_cr4 = (is_pv_vcpu(v) && !is_idle_vcpu(v))
++ ? pv_guest_cr4_to_real_cr4(v)
++ : ((read_cr4() & ~(X86_CR4_PCIDE | X86_CR4_TSD)) | X86_CR4_PGE);
++
++ if ( is_pv_vcpu(v) && v->domain->arch.pv_domain.xpti )
++ {
++ cpu_info->root_pgt_changed = true;
++ cpu_info->pv_cr3 = __pa(this_cpu(root_pgt));
++ if ( new_cr4 & X86_CR4_PCIDE )
++ cpu_info->pv_cr3 |= get_pcid_bits(v, true);
++ switch_cr3_cr4(v->arch.cr3, new_cr4);
++ }
++ else
++ {
++ /* Make sure to clear use_pv_cr3 and xen_cr3 before pv_cr3. */
++ cpu_info->use_pv_cr3 = false;
++ cpu_info->xen_cr3 = 0;
++ /* switch_cr3_cr4() serializes. */
++ switch_cr3_cr4(v->arch.cr3, new_cr4);
++ cpu_info->pv_cr3 = 0;
++ }
++
++ ASSERT(is_pv_vcpu(v) || read_cr4() == mmu_cr4_features);
+ }
+
+ /*
+@@ -605,6 +654,9 @@ static int alloc_segdesc_page(struct page_info *page)
+ return i == 512 ? 0 : -EINVAL;
+ }
+
++static int __get_page_type(struct page_info *page, unsigned long type,
++ int preemptible);
++
+ static int get_page_and_type_from_mfn(
+ mfn_t mfn, unsigned long type, struct domain *d,
+ int partial, int preemptible)
+@@ -616,9 +668,7 @@ static int get_page_and_type_from_mfn(
+ unlikely(!get_page_from_mfn(mfn, d)) )
+ return -EINVAL;
+
+- rc = (preemptible ?
+- get_page_type_preemptible(page, type) :
+- (get_page_type(page, type) ? 0 : -EINVAL));
++ rc = __get_page_type(page, type, preemptible);
+
+ if ( unlikely(rc) && partial >= 0 &&
+ (!preemptible || page != current->arch.old_guest_table) )
+@@ -1108,7 +1158,7 @@ get_page_from_l2e(
+ int rc;
+
+ if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+- return 1;
++ return pv_l1tf_check_l2e(d, l2e) ? -ERESTART : 1;
+
+ if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
+ {
+@@ -1143,7 +1193,7 @@ get_page_from_l3e(
+ int rc;
+
+ if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+- return 1;
++ return pv_l1tf_check_l3e(d, l3e) ? -ERESTART : 1;
+
+ if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
+ {
+@@ -1176,7 +1226,7 @@ get_page_from_l4e(
+ int rc;
+
+ if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
+- return 1;
++ return pv_l1tf_check_l4e(d, l4e) ? -ERESTART : 1;
+
+ if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
+ {
+@@ -1371,6 +1421,13 @@ static int alloc_l1_table(struct page_info *page)
+
+ for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+ {
++ if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) )
++ {
++ ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -ERESTART : 0;
++ if ( ret )
++ goto out;
++ }
++
+ switch ( ret = get_page_from_l1e(pl1e[i], d, d) )
+ {
+ default:
+@@ -1391,6 +1448,7 @@ static int alloc_l1_table(struct page_info *page)
+
+ fail:
+ gdprintk(XENLOG_WARNING, "Failure in alloc_l1_table: slot %#x\n", i);
++ out:
+ while ( i-- > 0 )
+ put_page_from_l1e(pl1e[i], d);
+
+@@ -1438,8 +1496,7 @@ static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
+ return 1;
+ }
+
+-static int alloc_l2_table(struct page_info *page, unsigned long type,
+- int preemptible)
++static int alloc_l2_table(struct page_info *page, unsigned long type)
+ {
+ struct domain *d = page_get_owner(page);
+ unsigned long pfn = mfn_x(page_to_mfn(page));
+@@ -1451,8 +1508,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type,
+
+ for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
+ {
+- if ( preemptible && i > page->nr_validated_ptes
+- && hypercall_preempt_check() )
++ if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
+ {
+ page->nr_validated_ptes = i;
+ rc = -ERESTART;
+@@ -1463,6 +1519,12 @@ static int alloc_l2_table(struct page_info *page, unsigned long type,
+ (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
+ continue;
+
++ if ( unlikely(rc == -ERESTART) )
++ {
++ page->nr_validated_ptes = i;
++ break;
++ }
++
+ if ( rc < 0 )
+ {
+ gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i);
+@@ -1745,7 +1807,7 @@ static void free_l1_table(struct page_info *page)
+ }
+
+
+-static int free_l2_table(struct page_info *page, int preemptible)
++static int free_l2_table(struct page_info *page)
+ {
+ struct domain *d = page_get_owner(page);
+ unsigned long pfn = mfn_x(page_to_mfn(page));
+@@ -1759,7 +1821,7 @@ static int free_l2_table(struct page_info *page, int preemptible)
+ do {
+ if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
+ put_page_from_l2e(pl2e[i], pfn) == 0 &&
+- preemptible && i && hypercall_preempt_check() )
++ i && hypercall_preempt_check() )
+ {
+ page->nr_validated_ptes = i;
+ err = -ERESTART;
+@@ -1983,6 +2045,8 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
+ rc = -EBUSY;
+ }
+ }
++ else if ( pv_l1tf_check_l1e(pt_dom, nl1e) )
++ return -ERESTART;
+ else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
+ preserve_ad)) )
+ {
+@@ -2046,6 +2110,8 @@ static int mod_l2_entry(l2_pgentry_t *pl2e,
+ rc = -EBUSY;
+ }
+ }
++ else if ( pv_l1tf_check_l2e(d, nl2e) )
++ return -ERESTART;
+ else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
+ preserve_ad)) )
+ {
+@@ -2107,6 +2173,8 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
+ rc = -EFAULT;
+ }
+ }
++ else if ( pv_l1tf_check_l3e(d, nl3e) )
++ return -ERESTART;
+ else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
+ preserve_ad)) )
+ {
+@@ -2172,6 +2240,8 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
+ rc = -EFAULT;
+ }
+ }
++ else if ( pv_l1tf_check_l4e(d, nl4e) )
++ return -ERESTART;
+ else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
+ preserve_ad)) )
+ {
+@@ -2301,7 +2371,8 @@ static int alloc_page_type(struct page_info *page, unsigned long type,
+ rc = alloc_l1_table(page);
+ break;
+ case PGT_l2_page_table:
+- rc = alloc_l2_table(page, type, preemptible);
++ ASSERT(preemptible);
++ rc = alloc_l2_table(page, type);
+ break;
+ case PGT_l3_page_table:
+ ASSERT(preemptible);
+@@ -2393,7 +2464,8 @@ int free_page_type(struct page_info *page, unsigned long type,
+ rc = 0;
+ break;
+ case PGT_l2_page_table:
+- rc = free_l2_table(page, preemptible);
++ ASSERT(preemptible);
++ rc = free_l2_table(page);
+ break;
+ case PGT_l3_page_table:
+ ASSERT(preemptible);
+@@ -2477,7 +2549,7 @@ static int _put_page_type(struct page_info *page, bool preemptible,
+ nx = x & ~(PGT_validated|PGT_partial);
+ if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
+ x, nx)) != x) )
+- continue;
++ goto maybe_preempt;
+ /* We cleared the 'valid bit' so we do the clean up. */
+ rc = _put_final_page_type(page, x, preemptible, ptpg);
+ ptpg = NULL;
+@@ -2509,12 +2581,13 @@ static int _put_page_type(struct page_info *page, bool preemptible,
+ */
+ cpu_relax();
+ y = page->u.inuse.type_info;
+- continue;
++ goto maybe_preempt;
+ }
+
+ if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
+ break;
+
++ maybe_preempt:
+ if ( preemptible && hypercall_preempt_check() )
+ return -EINTR;
+ }
+@@ -2627,12 +2700,11 @@ static int __get_page_type(struct page_info *page, unsigned long type,
+ if ( !(x & PGT_partial) )
+ {
+ /* Someone else is updating validation of this page. Wait... */
+- while ( (y = page->u.inuse.type_info) == x )
+- {
++ do {
+ if ( preemptible && hypercall_preempt_check() )
+ return -EINTR;
+ cpu_relax();
+- }
++ } while ( (y = page->u.inuse.type_info) == x );
+ continue;
+ }
+ /* Type ref count was left at 1 when PGT_partial got set. */
+@@ -3469,12 +3541,9 @@ long do_mmuext_op(
+ }
+
+ if ( rc == -ERESTART )
+- {
+- ASSERT(i < count);
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmuext_op, "hihi",
+ uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+- }
+ else if ( curr->arch.old_guest_table )
+ {
+ XEN_GUEST_HANDLE_PARAM(void) null;
+@@ -3674,18 +3743,27 @@ long do_mmu_update(
+ case PGT_l4_page_table:
+ rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
+ cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
+- /*
+- * No need to sync if all uses of the page can be accounted
+- * to the page lock we hold, its pinned status, and uses on
+- * this (v)CPU.
+- */
+- if ( !rc && !cpu_has_no_xpti &&
+- ((page->u.inuse.type_info & PGT_count_mask) >
+- (1 + !!(page->u.inuse.type_info & PGT_pinned) +
+- (pagetable_get_pfn(curr->arch.guest_table) == mfn) +
+- (pagetable_get_pfn(curr->arch.guest_table_user) ==
+- mfn))) )
+- sync_guest = true;
++ if ( !rc && !cpu_has_no_xpti )
++ {
++ bool local_in_use = false;
++
++ if ( pagetable_get_pfn(curr->arch.guest_table) == mfn )
++ {
++ local_in_use = true;
++ get_cpu_info()->root_pgt_changed = true;
++ }
++
++ /*
++ * No need to sync if all uses of the page can be
++ * accounted to the page lock we hold, its pinned
++ * status, and uses on this (v)CPU.
++ */
++ if ( (page->u.inuse.type_info & PGT_count_mask) >
++ (1 + !!(page->u.inuse.type_info & PGT_pinned) +
++ (pagetable_get_pfn(curr->arch.guest_table_user) ==
++ mfn) + local_in_use) )
++ sync_guest = true;
++ }
+ break;
+ case PGT_writable_page:
+ perfc_incr(writable_mmu_updates);
+@@ -3761,12 +3839,9 @@ long do_mmu_update(
+ }
+
+ if ( rc == -ERESTART )
+- {
+- ASSERT(i < count);
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmu_update, "hihi",
+ ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+- }
+ else if ( curr->arch.old_guest_table )
+ {
+ XEN_GUEST_HANDLE_PARAM(void) null;
+@@ -3799,7 +3874,7 @@ long do_mmu_update(
+
+ cpumask_andnot(mask, pt_owner->domain_dirty_cpumask, cpumask_of(cpu));
+ if ( !cpumask_empty(mask) )
+- flush_mask(mask, FLUSH_TLB_GLOBAL);
++ flush_mask(mask, FLUSH_TLB_GLOBAL | FLUSH_ROOT_PGTBL);
+ }
+
+ perfc_add(num_page_updates, i);
+@@ -4020,7 +4095,13 @@ static int __do_update_va_mapping(
+ long do_update_va_mapping(unsigned long va, u64 val64,
+ unsigned long flags)
+ {
+- return __do_update_va_mapping(va, val64, flags, current->domain);
++ int rc = __do_update_va_mapping(va, val64, flags, current->domain);
++
++ if ( rc == -ERESTART )
++ rc = hypercall_create_continuation(
++ __HYPERVISOR_update_va_mapping, "lll", va, val64, flags);
++
++ return rc;
+ }
+
+ long do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
+@@ -4037,6 +4118,46 @@ long do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
+
+ put_pg_owner(pg_owner);
+
++ if ( rc == -ERESTART )
++ rc = hypercall_create_continuation(
++ __HYPERVISOR_update_va_mapping_otherdomain,
++ "llli", va, val64, flags, domid);
++
++ return rc;
++}
++
++int compat_update_va_mapping(unsigned int va, uint32_t lo, uint32_t hi,
++ unsigned int flags)
++{
++ int rc = __do_update_va_mapping(va, ((uint64_t)hi << 32) | lo,
++ flags, current->domain);
++
++ if ( rc == -ERESTART )
++ rc = hypercall_create_continuation(
++ __HYPERVISOR_update_va_mapping, "iiii", va, lo, hi, flags);
++
++ return rc;
++}
++
++int compat_update_va_mapping_otherdomain(unsigned int va,
++ uint32_t lo, uint32_t hi,
++ unsigned int flags, domid_t domid)
++{
++ struct domain *pg_owner;
++ int rc;
++
++ if ( (pg_owner = get_pg_owner(domid)) == NULL )
++ return -ESRCH;
++
++ rc = __do_update_va_mapping(va, ((uint64_t)hi << 32) | lo, flags, pg_owner);
++
++ put_pg_owner(pg_owner);
++
++ if ( rc == -ERESTART )
++ rc = hypercall_create_continuation(
++ __HYPERVISOR_update_va_mapping_otherdomain,
++ "iiiii", va, lo, hi, flags, domid);
++
+ return rc;
+ }
+
+diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c
+index 755a8f83ca..3954e74d43 100644
+--- a/xen/arch/x86/mm/paging.c
++++ b/xen/arch/x86/mm/paging.c
+@@ -879,6 +879,8 @@ void paging_dump_domain_info(struct domain *d)
+ printk(" paging assistance: ");
+ if ( paging_mode_shadow(d) )
+ printk("shadow ");
++ if ( paging_mode_sh_forced(d) )
++ printk("forced ");
+ if ( paging_mode_hap(d) )
+ printk("hap ");
+ if ( paging_mode_refcounts(d) )
+diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
+index 3a33e0b148..199dbbc929 100644
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -3057,6 +3057,15 @@ static void sh_new_mode(struct domain *d, u32 new_mode)
+ ASSERT(paging_locked_by_me(d));
+ ASSERT(d != current->domain);
+
++ /*
++ * If PG_SH_forced has previously been activated because of writing an
++ * L1TF-vulnerable PTE, it must remain active for the remaining lifetime
++ * of the domain, even if the logdirty mode needs to be controlled for
++ * migration purposes.
++ */
++ if ( paging_mode_sh_forced(d) )
++ new_mode |= PG_SH_forced | PG_SH_enable;
++
+ d->arch.paging.mode = new_mode;
+ for_each_vcpu(d, v)
+ sh_update_paging_modes(v);
+@@ -3935,6 +3944,33 @@ void shadow_audit_tables(struct vcpu *v)
+
+ #endif /* Shadow audit */
+
++#ifdef CONFIG_PV
++
++void pv_l1tf_tasklet(unsigned long data)
++{
++ struct domain *d = (void *)data;
++
++ domain_pause(d);
++ paging_lock(d);
++
++ if ( !paging_mode_sh_forced(d) && !d->is_dying )
++ {
++ int ret = shadow_one_bit_enable(d, PG_SH_forced);
++
++ if ( ret )
++ {
++ printk(XENLOG_G_ERR "d%d Failed to enable PG_SH_forced: %d\n",
++ d->domain_id, ret);
++ domain_crash(d);
++ }
++ }
++
++ paging_unlock(d);
++ domain_unpause(d);
++}
++
++#endif /* CONFIG_PV */
++
+ /*
+ * Local variables:
+ * mode: C
+diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
+index e93e3b36b1..8c03ba0158 100644
+--- a/xen/arch/x86/mm/shadow/multi.c
++++ b/xen/arch/x86/mm/shadow/multi.c
+@@ -952,6 +952,8 @@ static int shadow_set_l4e(struct domain *d,
+
+ /* Write the new entry */
+ shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
++ flush_root_pgtbl_domain(d);
++
+ flags |= SHADOW_SET_CHANGED;
+
+ if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
+@@ -966,6 +968,7 @@ static int shadow_set_l4e(struct domain *d,
+ }
+ sh_put_ref(d, osl3mfn, paddr);
+ }
++
+ return flags;
+ }
+
+diff --git a/xen/arch/x86/mpparse.c b/xen/arch/x86/mpparse.c
+index 49140e46f0..f3f6d48668 100644
+--- a/xen/arch/x86/mpparse.c
++++ b/xen/arch/x86/mpparse.c
+@@ -68,19 +68,26 @@ physid_mask_t phys_cpu_present_map;
+
+ void __init set_nr_cpu_ids(unsigned int max_cpus)
+ {
++ unsigned int tot_cpus = num_processors + disabled_cpus;
++
+ if (!max_cpus)
+- max_cpus = num_processors + disabled_cpus;
++ max_cpus = tot_cpus;
+ if (max_cpus > NR_CPUS)
+ max_cpus = NR_CPUS;
+ else if (!max_cpus)
+ max_cpus = 1;
+ printk(XENLOG_INFO "SMP: Allowing %u CPUs (%d hotplug CPUs)\n",
+ max_cpus, max_t(int, max_cpus - num_processors, 0));
+- nr_cpu_ids = max_cpus;
++
++ if (!park_offline_cpus)
++ tot_cpus = max_cpus;
++ nr_cpu_ids = min(tot_cpus, NR_CPUS + 0u);
++ if (park_offline_cpus && nr_cpu_ids < num_processors)
++ printk(XENLOG_WARNING "SMP: Cannot bring up %u further CPUs\n",
++ num_processors - nr_cpu_ids);
+
+ #ifndef nr_cpumask_bits
+- nr_cpumask_bits = (max_cpus + (BITS_PER_LONG - 1)) &
+- ~(BITS_PER_LONG - 1);
++ nr_cpumask_bits = ROUNDUP(nr_cpu_ids, BITS_PER_LONG);
+ printk(XENLOG_DEBUG "NR_CPUS:%u nr_cpumask_bits:%u\n",
+ NR_CPUS, nr_cpumask_bits);
+ #endif
+diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c
+index 48d061d7da..8e39d5f271 100644
+--- a/xen/arch/x86/msr.c
++++ b/xen/arch/x86/msr.c
+@@ -131,6 +131,7 @@ int guest_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
+ case MSR_AMD_PATCHLOADER:
+ case MSR_IA32_UCODE_WRITE:
+ case MSR_PRED_CMD:
++ case MSR_FLUSH_CMD:
+ /* Write-only */
+ goto gp_fault;
+
+@@ -178,6 +179,8 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
+
+ switch ( msr )
+ {
++ uint64_t rsvd;
++
+ case MSR_INTEL_PLATFORM_INFO:
+ case MSR_ARCH_CAPABILITIES:
+ /* Read-only */
+@@ -213,8 +216,10 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
+ * Note: SPEC_CTRL_STIBP is specified as safe to use (i.e. ignored)
+ * when STIBP isn't enumerated in hardware.
+ */
++ rsvd = ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP |
++ (cp->feat.ssbd ? SPEC_CTRL_SSBD : 0));
+
+- if ( val & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP) )
++ if ( val & rsvd )
+ goto gp_fault; /* Rsvd bit set? */
+
+ vp->spec_ctrl.raw = val;
+@@ -231,14 +236,25 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
+ wrmsrl(MSR_PRED_CMD, val);
+ break;
+
++ case MSR_FLUSH_CMD:
++ if ( !cp->feat.l1d_flush )
++ goto gp_fault; /* MSR available? */
++
++ if ( val & ~FLUSH_CMD_L1D )
++ goto gp_fault; /* Rsvd bit set? */
++
++ if ( v == curr )
++ wrmsrl(MSR_FLUSH_CMD, val);
++ break;
++
+ case MSR_INTEL_MISC_FEATURES_ENABLES:
+ {
+- uint64_t rsvd = ~0ull;
+ bool old_cpuid_faulting = vp->misc_features_enables.cpuid_faulting;
+
+ if ( !vp->misc_features_enables.available )
+ goto gp_fault;
+
++ rsvd = ~0ull;
+ if ( dp->plaform_info.cpuid_faulting )
+ rsvd &= ~MSR_MISC_FEATURES_CPUID_FAULTING;
+
+diff --git a/xen/arch/x86/oprofile/nmi_int.c b/xen/arch/x86/oprofile/nmi_int.c
+index d8f5230906..3dfb8fef93 100644
+--- a/xen/arch/x86/oprofile/nmi_int.c
++++ b/xen/arch/x86/oprofile/nmi_int.c
+@@ -182,7 +182,7 @@ int nmi_reserve_counters(void)
+ if (!allocate_msrs())
+ return -ENOMEM;
+
+- /* We walk a thin line between law and rape here.
++ /*
+ * We need to be careful to install our NMI handler
+ * without actually triggering any NMIs as this will
+ * break the core code horrifically.
+diff --git a/xen/arch/x86/percpu.c b/xen/arch/x86/percpu.c
+index c9997b7937..8be4ebddf4 100644
+--- a/xen/arch/x86/percpu.c
++++ b/xen/arch/x86/percpu.c
+@@ -28,7 +28,7 @@ static int init_percpu_area(unsigned int cpu)
+ char *p;
+
+ if ( __per_cpu_offset[cpu] != INVALID_PERCPU_AREA )
+- return -EBUSY;
++ return 0;
+
+ if ( (p = alloc_xenheap_pages(PERCPU_ORDER, 0)) == NULL )
+ return -ENOMEM;
+@@ -76,9 +76,12 @@ static int cpu_percpu_callback(
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
+- free_percpu_area(cpu);
++ if ( !park_offline_cpus )
++ free_percpu_area(cpu);
+ break;
+- default:
++ case CPU_REMOVE:
++ if ( park_offline_cpus )
++ free_percpu_area(cpu);
+ break;
+ }
+
+diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c
+index 5d8909fa13..1a8142f89b 100644
+--- a/xen/arch/x86/pv/dom0_build.c
++++ b/xen/arch/x86/pv/dom0_build.c
+@@ -388,6 +388,8 @@ int __init dom0_construct_pv(struct domain *d,
+ if ( compat32 )
+ {
+ d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
++ d->arch.pv_domain.xpti = false;
++ d->arch.pv_domain.pcid = false;
+ v->vcpu_info = (void *)&d->shared_info->compat.vcpu_info[0];
+ if ( setup_compat_arg_xlat(v) != 0 )
+ BUG();
+@@ -721,7 +723,7 @@ int __init dom0_construct_pv(struct domain *d,
+ update_cr3(v);
+
+ /* We run on dom0's page tables for the final part of the build process. */
+- write_ptbase(v);
++ switch_cr3_cr4(cr3_pa(v->arch.cr3), read_cr4());
+ mapcache_override_current(v);
+
+ /* Copy the OS image and free temporary buffer. */
+@@ -742,7 +744,7 @@ int __init dom0_construct_pv(struct domain *d,
+ (parms.virt_hypercall >= v_end) )
+ {
+ mapcache_override_current(NULL);
+- write_ptbase(current);
++ switch_cr3_cr4(current->arch.cr3, read_cr4());
+ printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
+ rc = -1;
+ goto out;
+@@ -875,7 +877,7 @@ int __init dom0_construct_pv(struct domain *d,
+
+ /* Return to idle domain's page tables. */
+ mapcache_override_current(NULL);
+- write_ptbase(current);
++ switch_cr3_cr4(current->arch.cr3, read_cr4());
+
+ update_domain_wallclock_time(d);
+
+diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
+index 74e9e667d2..bdcbd38f15 100644
+--- a/xen/arch/x86/pv/domain.c
++++ b/xen/arch/x86/pv/domain.c
+@@ -9,7 +9,54 @@
+ #include <xen/lib.h>
+ #include <xen/sched.h>
+
++#include <asm/cpufeature.h>
++#include <asm/invpcid.h>
++#include <asm/spec_ctrl.h>
+ #include <asm/pv/domain.h>
++#include <asm/shadow.h>
++
++static __read_mostly enum {
++ PCID_OFF,
++ PCID_ALL,
++ PCID_XPTI,
++ PCID_NOXPTI
++} opt_pcid = PCID_XPTI;
++
++static __init int parse_pcid(const char *s)
++{
++ int rc = 0;
++
++ switch ( parse_bool(s, NULL) )
++ {
++ case 0:
++ opt_pcid = PCID_OFF;
++ break;
++
++ case 1:
++ opt_pcid = PCID_ALL;
++ break;
++
++ default:
++ switch ( parse_boolean("xpti", s, NULL) )
++ {
++ case 0:
++ opt_pcid = PCID_NOXPTI;
++ break;
++
++ case 1:
++ opt_pcid = PCID_XPTI;
++ break;
++
++ default:
++ rc = -EINVAL;
++ break;
++ }
++ break;
++ }
++
++ return rc;
++}
++custom_runtime_param("pcid", parse_pcid);
+
+ /* Override macros from asm/page.h to make them work with mfn_t */
+ #undef mfn_to_page
+@@ -81,6 +128,9 @@ int switch_compat(struct domain *d)
+
+ d->arch.x87_fip_width = 4;
+
++ d->arch.pv_domain.xpti = false;
++ d->arch.pv_domain.pcid = false;
++
+ return 0;
+
+ undo_and_fail:
+@@ -166,6 +216,8 @@ int pv_vcpu_initialise(struct vcpu *v)
+
+ void pv_domain_destroy(struct domain *d)
+ {
++ pv_l1tf_domain_destroy(d);
++
+ destroy_perdomain_mapping(d, GDT_LDT_VIRT_START,
+ GDT_LDT_MBYTES << (20 - PAGE_SHIFT));
+
+@@ -187,6 +239,8 @@ int pv_domain_initialise(struct domain *d, unsigned int domcr_flags,
+ };
+ int rc = -ENOMEM;
+
++ pv_l1tf_domain_init(d);
++
+ d->arch.pv_domain.gdt_ldt_l1tab =
+ alloc_xenheap_pages(0, MEMF_node(domain_to_node(d)));
+ if ( !d->arch.pv_domain.gdt_ldt_l1tab )
+@@ -212,6 +266,32 @@ int pv_domain_initialise(struct domain *d, unsigned int domcr_flags,
+ /* 64-bit PV guest by default. */
+ d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
+
++ d->arch.pv_domain.xpti = opt_xpti & (is_hardware_domain(d)
++ ? OPT_XPTI_DOM0 : OPT_XPTI_DOMU);
++
++ if ( !is_pv_32bit_domain(d) && use_invpcid && cpu_has_pcid )
++ switch ( opt_pcid )
++ {
++ case PCID_OFF:
++ break;
++
++ case PCID_ALL:
++ d->arch.pv_domain.pcid = true;
++ break;
++
++ case PCID_XPTI:
++ d->arch.pv_domain.pcid = d->arch.pv_domain.xpti;
++ break;
++
++ case PCID_NOXPTI:
++ d->arch.pv_domain.pcid = !d->arch.pv_domain.xpti;
++ break;
++
++ default:
++ ASSERT_UNREACHABLE();
++ break;
++ }
++
+ return 0;
+
+ fail:
+@@ -239,13 +319,25 @@ void toggle_guest_mode(struct vcpu *v)
+
+ void toggle_guest_pt(struct vcpu *v)
+ {
++ const struct domain *d = v->domain;
++
+ if ( is_pv_32bit_vcpu(v) )
+ return;
+
+ v->arch.flags ^= TF_kernel_mode;
+ update_cr3(v);
++ if ( d->arch.pv_domain.xpti )
++ {
++ struct cpu_info *cpu_info = get_cpu_info();
++
++ cpu_info->root_pgt_changed = true;
++ cpu_info->pv_cr3 = __pa(this_cpu(root_pgt)) |
++ (d->arch.pv_domain.pcid
++ ? get_pcid_bits(v, true) : 0);
++ }
++
+ /* Don't flush user global mappings from the TLB. Don't tick TLB clock. */
+- asm volatile ( "mov %0, %%cr3" : : "r" (v->arch.cr3) : "memory" );
++ write_cr3(v->arch.cr3);
+
+ if ( !(v->arch.flags & TF_kernel_mode) )
+ return;
+diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
+index 642ca312bf..c281936af0 100644
+--- a/xen/arch/x86/pv/emul-priv-op.c
++++ b/xen/arch/x86/pv/emul-priv-op.c
+@@ -813,26 +813,6 @@ static int write_cr(unsigned int reg, unsigned long val,
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+-static int read_dr(unsigned int reg, unsigned long *val,
+- struct x86_emulate_ctxt *ctxt)
+-{
+- unsigned long res = do_get_debugreg(reg);
+-
+- if ( IS_ERR_VALUE(res) )
+- return X86EMUL_UNHANDLEABLE;
+-
+- *val = res;
+-
+- return X86EMUL_OKAY;
+-}
+-
+-static int write_dr(unsigned int reg, unsigned long val,
+- struct x86_emulate_ctxt *ctxt)
+-{
+- return do_set_debugreg(reg, val) == 0
+- ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE;
+-}
+-
+ static inline uint64_t guest_misc_enable(uint64_t val)
+ {
+ val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
+@@ -906,9 +886,16 @@ static int read_msr(unsigned int reg, uint64_t *val,
+ return X86EMUL_OKAY;
+
+ case MSR_EFER:
+- *val = read_efer();
++ /* Hide unknown bits, and unconditionally hide SVME from guests. */
++ *val = read_efer() & EFER_KNOWN_MASK & ~EFER_SVME;
++ /*
++ * Hide the 64-bit features from 32-bit guests. SCE has
++ * vendor-dependent behaviour.
++ */
+ if ( is_pv_32bit_domain(currd) )
+- *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE);
++ *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE |
++ (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
++ ? EFER_SCE : 0));
+ return X86EMUL_OKAY;
+
+ case MSR_K7_FID_VID_CTL:
+@@ -1326,8 +1313,8 @@ static const struct x86_emulate_ops priv_op_ops = {
+ .read_segment = read_segment,
+ .read_cr = read_cr,
+ .write_cr = write_cr,
+- .read_dr = read_dr,
+- .write_dr = write_dr,
++ .read_dr = x86emul_read_dr,
++ .write_dr = x86emul_write_dr,
+ .read_msr = read_msr,
+ .write_msr = write_msr,
+ .cpuid = pv_emul_cpuid,
+diff --git a/xen/arch/x86/pv/misc-hypercalls.c b/xen/arch/x86/pv/misc-hypercalls.c
+index 5862130697..1619be7874 100644
+--- a/xen/arch/x86/pv/misc-hypercalls.c
++++ b/xen/arch/x86/pv/misc-hypercalls.c
+@@ -30,22 +30,10 @@ long do_set_debugreg(int reg, unsigned long value)
+
+ unsigned long do_get_debugreg(int reg)
+ {
+- struct vcpu *curr = current;
++ unsigned long val;
++ int res = x86emul_read_dr(reg, &val, NULL);
+
+- switch ( reg )
+- {
+- case 0 ... 3:
+- case 6:
+- return curr->arch.debugreg[reg];
+- case 7:
+- return (curr->arch.debugreg[7] |
+- curr->arch.debugreg[5]);
+- case 4 ... 5:
+- return ((curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ?
+- curr->arch.debugreg[reg + 2] : 0);
+- }
+-
+- return -EINVAL;
++ return res == X86EMUL_OKAY ? val : -ENODEV;
+ }
+
+ long do_fpu_taskswitch(int set)
+diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c
+index 6b2976d3df..622bb7dff0 100644
+--- a/xen/arch/x86/pv/ro-page-fault.c
++++ b/xen/arch/x86/pv/ro-page-fault.c
+@@ -29,6 +29,7 @@
+ #include <asm/mm.h>
+ #include <asm/pci.h>
+ #include <asm/pv/mm.h>
++#include <asm/shadow.h>
+
+ #include "emulate.h"
+ #include "mm.h"
+@@ -127,6 +128,10 @@ static int ptwr_emulated_update(unsigned long addr, paddr_t old, paddr_t val,
+
+ /* Check the new PTE. */
+ nl1e = l1e_from_intpte(val);
++
++ if ( !(l1e_get_flags(nl1e) & _PAGE_PRESENT) && pv_l1tf_check_l1e(d, nl1e) )
++ return X86EMUL_RETRY;
++
+ switch ( ret = get_page_from_l1e(nl1e, d, d) )
+ {
+ default:
+diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
+index 482fe11669..1e9eecae04 100644
+--- a/xen/arch/x86/setup.c
++++ b/xen/arch/x86/setup.c
+@@ -62,6 +62,14 @@ boolean_param("nosmp", opt_nosmp);
+ static unsigned int __initdata max_cpus;
+ integer_param("maxcpus", max_cpus);
+
++int8_t __read_mostly opt_smt = -1;
++boolean_param("smt", opt_smt);
++
++/* opt_invpcid: If false, don't use INVPCID instruction even if available. */
++static bool __initdata opt_invpcid = true;
++boolean_param("invpcid", opt_invpcid);
++bool __read_mostly use_invpcid;
++
+ unsigned long __read_mostly cr4_pv32_mask;
+
+ /* **** Linux config option: propagated to domain0. */
+@@ -169,9 +177,6 @@ static int __init parse_smap_param(const char *s)
+ }
+ custom_param("smap", parse_smap_param);
+
+-static int8_t __initdata opt_xpti = -1;
+-boolean_param("xpti", opt_xpti);
+-
+ bool __read_mostly acpi_disabled;
+ bool __initdata acpi_force;
+ static char __initdata acpi_param[10] = "";
+@@ -663,7 +668,7 @@ void __init noreturn __start_xen(unsigned long mbi_p)
+ {
+ char *memmap_type = NULL;
+ char *cmdline, *kextra, *loader;
+- unsigned int initrdidx, domcr_flags = DOMCRF_s3_integrity;
++ unsigned int initrdidx, num_parked = 0, domcr_flags = DOMCRF_s3_integrity;
+ multiboot_info_t *mbi;
+ module_t *mod;
+ unsigned long nr_pages, raw_max_page, modules_headroom, *module_map;
+@@ -905,6 +910,18 @@ void __init noreturn __start_xen(unsigned long mbi_p)
+ /* Sanitise the raw E820 map to produce a final clean version. */
+ max_page = raw_max_page = init_e820(memmap_type, &e820_raw);
+
++ if ( !efi_enabled(EFI_BOOT) )
++ {
++ /*
++ * Supplement the heuristics in l1tf_calculations() by assuming that
++ * anything referenced in the E820 may be cacheable.
++ */
++ l1tf_safe_maddr =
++ max(l1tf_safe_maddr,
++ ROUNDUP(e820_raw.map[e820_raw.nr_map - 1].addr +
++ e820_raw.map[e820_raw.nr_map - 1].size, PAGE_SIZE));
++ }
++
+ /* Create a temporary copy of the E820 map. */
+ memcpy(&boot_e820, &e820, sizeof(e820));
+
+@@ -1485,7 +1502,8 @@ void __init noreturn __start_xen(unsigned long mbi_p)
+ else
+ {
+ set_nr_cpu_ids(max_cpus);
+- max_cpus = nr_cpu_ids;
++ if ( !max_cpus )
++ max_cpus = nr_cpu_ids;
+ }
+
+ if ( xen_guest )
+@@ -1539,25 +1557,12 @@ void __init noreturn __start_xen(unsigned long mbi_p)
+
+ cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
+
+- if ( opt_xpti < 0 )
+- {
+- uint64_t caps = 0;
+-
+- if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD )
+- caps = ARCH_CAPABILITIES_RDCL_NO;
+- else if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
+- rdmsrl(MSR_ARCH_CAPABILITIES, caps);
+-
+- opt_xpti = !(caps & ARCH_CAPABILITIES_RDCL_NO);
+- }
+- if ( opt_xpti )
+- setup_clear_cpu_cap(X86_FEATURE_NO_XPTI);
+- else
+- setup_force_cpu_cap(X86_FEATURE_NO_XPTI);
+-
+ if ( cpu_has_fsgsbase )
+ set_in_cr4(X86_CR4_FSGSBASE);
+
++ if ( opt_invpcid && cpu_has_invpcid )
++ use_invpcid = true;
++
+ init_speculation_mitigations();
+
+ init_idle_domain();
+@@ -1621,16 +1626,30 @@ void __init noreturn __start_xen(unsigned long mbi_p)
+ /* Set up node_to_cpumask based on cpu_to_node[]. */
+ numa_add_cpu(i);
+
+- if ( (num_online_cpus() < max_cpus) && !cpu_online(i) )
++ if ( (park_offline_cpus || num_online_cpus() < max_cpus) &&
++ !cpu_online(i) )
+ {
+ int ret = cpu_up(i);
+ if ( ret != 0 )
+ printk("Failed to bring up CPU %u (error %d)\n", i, ret);
++ else if ( num_online_cpus() > max_cpus ||
++ (!opt_smt &&
++ cpu_data[i].compute_unit_id == INVALID_CUID &&
++ cpumask_weight(per_cpu(cpu_sibling_mask, i)) > 1) )
++ {
++ ret = cpu_down(i);
++ if ( !ret )
++ ++num_parked;
++ else
++ printk("Could not re-offline CPU%u (%d)\n", i, ret);
++ }
+ }
+ }
+ }
+
+ printk("Brought up %ld CPUs\n", (long)num_online_cpus());
++ if ( num_parked )
++ printk(XENLOG_INFO "Parked %u CPUs\n", num_parked);
+ smp_cpus_done();
+
+ do_initcalls();
+@@ -1746,6 +1765,13 @@ void __init noreturn __start_xen(unsigned long mbi_p)
+
+ setup_io_bitmap(dom0);
+
++ if ( bsp_delay_spec_ctrl )
++ {
++ get_cpu_info()->spec_ctrl_flags &= ~SCF_use_shadow;
++ barrier();
++ wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
++ }
++
+ /* Jump to the 1:1 virtual mappings of cpu0_stack. */
+ asm volatile ("mov %[stk], %%rsp; jmp %c[fn]" ::
+ [stk] "g" (__va(__pa(get_stack_bottom()))),
+diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c
+index fd6d254812..517ba2f70d 100644
+--- a/xen/arch/x86/smp.c
++++ b/xen/arch/x86/smp.c
+@@ -208,7 +208,7 @@ void invalidate_interrupt(struct cpu_user_regs *regs)
+ ack_APIC_irq();
+ perfc_incr(ipis);
+ if ( __sync_local_execstate() )
+- flags &= ~(FLUSH_TLB | FLUSH_TLB_GLOBAL);
++ flags &= ~(FLUSH_TLB | FLUSH_TLB_GLOBAL | FLUSH_ROOT_PGTBL);
+ flush_area_local(flush_va, flags);
+ cpumask_clear_cpu(smp_processor_id(), &flush_cpumask);
+ }
+diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
+index e1d023428c..b0496eb66e 100644
+--- a/xen/arch/x86/smpboot.c
++++ b/xen/arch/x86/smpboot.c
+@@ -69,6 +69,8 @@ static cpumask_t scratch_cpu0mask;
+ cpumask_t cpu_online_map __read_mostly;
+ EXPORT_SYMBOL(cpu_online_map);
+
++bool __read_mostly park_offline_cpus;
++
+ unsigned int __read_mostly nr_sockets;
+ cpumask_t **__read_mostly socket_cpumask;
+ static cpumask_t *secondary_socket_cpumask;
+@@ -228,33 +230,41 @@ static void link_thread_siblings(int cpu1, int cpu2)
+ cpumask_set_cpu(cpu2, per_cpu(cpu_core_mask, cpu1));
+ }
+
+-static void set_cpu_sibling_map(int cpu)
++static void set_cpu_sibling_map(unsigned int cpu)
+ {
+- int i;
++ unsigned int i;
+ struct cpuinfo_x86 *c = cpu_data;
+
+ cpumask_set_cpu(cpu, &cpu_sibling_setup_map);
+
+ cpumask_set_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]);
++ cpumask_set_cpu(cpu, per_cpu(cpu_core_mask, cpu));
++ cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu));
+
+ if ( c[cpu].x86_num_siblings > 1 )
+ {
+ for_each_cpu ( i, &cpu_sibling_setup_map )
+ {
+- if ( cpu_has(c, X86_FEATURE_TOPOEXT) ) {
+- if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) &&
+- (c[cpu].compute_unit_id == c[i].compute_unit_id) )
++ if ( cpu == i || c[cpu].phys_proc_id != c[i].phys_proc_id )
++ continue;
++ if ( c[cpu].compute_unit_id != INVALID_CUID &&
++ c[i].compute_unit_id != INVALID_CUID )
++ {
++ if ( c[cpu].compute_unit_id == c[i].compute_unit_id )
++ link_thread_siblings(cpu, i);
++ }
++ else if ( c[cpu].cpu_core_id != XEN_INVALID_CORE_ID &&
++ c[i].cpu_core_id != XEN_INVALID_CORE_ID )
++ {
++ if ( c[cpu].cpu_core_id == c[i].cpu_core_id )
+ link_thread_siblings(cpu, i);
+- } else if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) &&
+- (c[cpu].cpu_core_id == c[i].cpu_core_id) ) {
+- link_thread_siblings(cpu, i);
+ }
++ else
++ printk(XENLOG_WARNING
++ "CPU%u: unclear relationship with CPU%u\n",
++ cpu, i);
+ }
+ }
+- else
+- {
+- cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu));
+- }
+
+ if ( c[cpu].x86_max_cores == 1 )
+ {
+@@ -330,8 +340,9 @@ void start_secondary(void *unused)
+ */
+ spin_debug_disable();
+
++ get_cpu_info()->use_pv_cr3 = false;
+ get_cpu_info()->xen_cr3 = 0;
+- get_cpu_info()->pv_cr3 = this_cpu(root_pgt) ? __pa(this_cpu(root_pgt)) : 0;
++ get_cpu_info()->pv_cr3 = 0;
+
+ load_system_tables();
+
+@@ -351,6 +362,14 @@ void start_secondary(void *unused)
+ else
+ microcode_resume_cpu(cpu);
+
++ /*
++ * If MSR_SPEC_CTRL is available, apply Xen's default setting and discard
++ * any firmware settings. Note: MSR_SPEC_CTRL may only become available
++ * after loading microcode.
++ */
++ if ( boot_cpu_has(X86_FEATURE_IBRSB) )
++ wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
++
+ if ( xen_guest )
+ hypervisor_ap_setup();
+
+@@ -870,11 +889,18 @@ static void cleanup_cpu_root_pgt(unsigned int cpu)
+ l2_pgentry_t *l2t = l3e_to_l2e(l3t[l3_table_offset(stub_linear)]);
+ l1_pgentry_t *l1t = l2e_to_l1e(l2t[l2_table_offset(stub_linear)]);
+
+- l1t[l2_table_offset(stub_linear)] = l1e_empty();
++ l1t[l1_table_offset(stub_linear)] = l1e_empty();
+ }
+ }
+
+-static void cpu_smpboot_free(unsigned int cpu)
++/*
++ * The 'remove' boolean controls whether a CPU is just getting offlined (and
++ * parked), or outright removed / offlined without parking. Parked CPUs need
++ * things like their stack, GDT, IDT, TSS, and per-CPU data still available.
++ * A few other items, in particular CPU masks, are also retained, as it's
++ * difficult to prove that they're entirely unreferenced from parked CPUs.
++ */
++static void cpu_smpboot_free(unsigned int cpu, bool remove)
+ {
+ unsigned int order, socket = cpu_to_socket(cpu);
+ struct cpuinfo_x86 *c = cpu_data;
+@@ -885,15 +911,19 @@ static void cpu_smpboot_free(unsigned int cpu)
+ socket_cpumask[socket] = NULL;
+ }
+
+- c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
+- c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
+- c[cpu].compute_unit_id = INVALID_CUID;
+ cpumask_clear_cpu(cpu, &cpu_sibling_setup_map);
+
+- free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
+- free_cpumask_var(per_cpu(cpu_core_mask, cpu));
+- if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
+- free_cpumask_var(per_cpu(scratch_cpumask, cpu));
++ if ( remove )
++ {
++ c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
++ c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
++ c[cpu].compute_unit_id = INVALID_CUID;
++
++ FREE_CPUMASK_VAR(per_cpu(cpu_sibling_mask, cpu));
++ FREE_CPUMASK_VAR(per_cpu(cpu_core_mask, cpu));
++ if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
++ FREE_CPUMASK_VAR(per_cpu(scratch_cpumask, cpu));
++ }
+
+ cleanup_cpu_root_pgt(cpu);
+
+@@ -915,19 +945,21 @@ static void cpu_smpboot_free(unsigned int cpu)
+ }
+
+ order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
+- free_xenheap_pages(per_cpu(gdt_table, cpu), order);
++ if ( remove )
++ FREE_XENHEAP_PAGES(per_cpu(gdt_table, cpu), order);
+
+ free_xenheap_pages(per_cpu(compat_gdt_table, cpu), order);
+
+- order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
+- free_xenheap_pages(idt_tables[cpu], order);
+- idt_tables[cpu] = NULL;
+-
+- if ( stack_base[cpu] != NULL )
++ if ( remove )
+ {
+- memguard_unguard_stack(stack_base[cpu]);
+- free_xenheap_pages(stack_base[cpu], STACK_ORDER);
+- stack_base[cpu] = NULL;
++ order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
++ FREE_XENHEAP_PAGES(idt_tables[cpu], order);
++
++ if ( stack_base[cpu] )
++ {
++ memguard_unguard_stack(stack_base[cpu]);
++ FREE_XENHEAP_PAGES(stack_base[cpu], STACK_ORDER);
++ }
+ }
+ }
+
+@@ -941,15 +973,17 @@ static int cpu_smpboot_alloc(unsigned int cpu)
+ if ( node != NUMA_NO_NODE )
+ memflags = MEMF_node(node);
+
+- stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags);
++ if ( stack_base[cpu] == NULL )
++ stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags);
+ if ( stack_base[cpu] == NULL )
+ goto oom;
+ memguard_guard_stack(stack_base[cpu]);
+
+ order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
+- per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order, memflags);
++ gdt = per_cpu(gdt_table, cpu) ?: alloc_xenheap_pages(order, memflags);
+ if ( gdt == NULL )
+ goto oom;
++ per_cpu(gdt_table, cpu) = gdt;
+ memcpy(gdt, boot_cpu_gdt_table, NR_RESERVED_GDT_PAGES * PAGE_SIZE);
+ BUILD_BUG_ON(NR_CPUS > 0x10000);
+ gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
+@@ -961,13 +995,15 @@ static int cpu_smpboot_alloc(unsigned int cpu)
+ gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
+
+ order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
+- idt_tables[cpu] = alloc_xenheap_pages(order, memflags);
++ if ( idt_tables[cpu] == NULL )
++ idt_tables[cpu] = alloc_xenheap_pages(order, memflags);
+ if ( idt_tables[cpu] == NULL )
+ goto oom;
+ memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t));
+ set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE);
+ set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE);
+ set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
++ set_ist(&idt_tables[cpu][TRAP_debug], IST_NONE);
+
+ for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
+ i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )
+@@ -989,13 +1025,13 @@ static int cpu_smpboot_alloc(unsigned int cpu)
+ (secondary_socket_cpumask = xzalloc(cpumask_t)) == NULL )
+ goto oom;
+
+- if ( zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
+- zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
+- alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu)) )
++ if ( cond_zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
++ cond_zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
++ cond_alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu)) )
+ return 0;
+
+ oom:
+- cpu_smpboot_free(cpu);
++ cpu_smpboot_free(cpu, true);
+ return -ENOMEM;
+ }
+
+@@ -1012,9 +1048,10 @@ static int cpu_smpboot_callback(
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
+- cpu_smpboot_free(cpu);
++ cpu_smpboot_free(cpu, !park_offline_cpus);
+ break;
+- default:
++ case CPU_REMOVE:
++ cpu_smpboot_free(cpu, true);
+ break;
+ }
+
+@@ -1047,7 +1084,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
+ panic("Error %d setting up PV root page table\n", rc);
+ if ( per_cpu(root_pgt, 0) )
+ {
+- get_cpu_info()->pv_cr3 = __pa(per_cpu(root_pgt, 0));
++ get_cpu_info()->pv_cr3 = 0;
+
+ /*
+ * All entry points which may need to switch page tables have to start
+@@ -1126,6 +1163,7 @@ void __init smp_prepare_boot_cpu(void)
+ per_cpu(scratch_cpumask, cpu) = &scratch_cpu0mask;
+ #endif
+
++ get_cpu_info()->use_pv_cr3 = false;
+ get_cpu_info()->xen_cr3 = 0;
+ get_cpu_info()->pv_cr3 = 0;
+ }
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 3c7447bfe6..14e01faff9 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -19,13 +19,23 @@
+ #include <xen/errno.h>
+ #include <xen/init.h>
+ #include <xen/lib.h>
++#include <xen/warning.h>
+
+ #include <asm/microcode.h>
+ #include <asm/msr.h>
+ #include <asm/processor.h>
++#include <asm/pv/shim.h>
++#include <asm/setup.h>
+ #include <asm/spec_ctrl.h>
+ #include <asm/spec_ctrl_asm.h>
+
++/* Cmdline controls for Xen's alternative blocks. */
++static bool __initdata opt_msr_sc_pv = true;
++static bool __initdata opt_msr_sc_hvm = true;
++static bool __initdata opt_rsb_pv = true;
++static bool __initdata opt_rsb_hvm = true;
++
++/* Cmdline controls for Xen's speculative settings. */
+ static enum ind_thunk {
+ THUNK_DEFAULT, /* Decide which thunk to use at boot time. */
+ THUNK_NONE, /* Missing compiler support for thunks. */
+@@ -35,10 +45,18 @@ static enum ind_thunk {
+ THUNK_JMP,
+ } opt_thunk __initdata = THUNK_DEFAULT;
+ static int8_t __initdata opt_ibrs = -1;
+-static bool __initdata opt_rsb_native = true;
+-static bool __initdata opt_rsb_vmexit = true;
+ bool __read_mostly opt_ibpb = true;
+-uint8_t __read_mostly default_bti_ist_info;
++bool __read_mostly opt_ssbd = false;
++int8_t __read_mostly opt_eager_fpu = -1;
++int8_t __read_mostly opt_l1d_flush = -1;
++
++bool __initdata bsp_delay_spec_ctrl;
++uint8_t __read_mostly default_xen_spec_ctrl;
++uint8_t __read_mostly default_spec_ctrl_flags;
++
++paddr_t __read_mostly l1tf_addr_mask, __read_mostly l1tf_safe_maddr;
++static bool __initdata cpu_has_bug_l1tf;
++static unsigned int __initdata l1d_maxphysaddr;
+
+ static int __init parse_bti(const char *s)
+ {
+@@ -68,9 +86,9 @@ static int __init parse_bti(const char *s)
+ else if ( (val = parse_boolean("ibpb", s, ss)) >= 0 )
+ opt_ibpb = val;
+ else if ( (val = parse_boolean("rsb_native", s, ss)) >= 0 )
+- opt_rsb_native = val;
++ opt_rsb_pv = val;
+ else if ( (val = parse_boolean("rsb_vmexit", s, ss)) >= 0 )
+- opt_rsb_vmexit = val;
++ opt_rsb_hvm = val;
+ else
+ rc = -EINVAL;
+
+@@ -81,50 +99,244 @@ static int __init parse_bti(const char *s)
+ }
+ custom_param("bti", parse_bti);
+
+-static void __init print_details(enum ind_thunk thunk)
++static int __init parse_spec_ctrl(const char *s)
++{
++ const char *ss;
++ int val, rc = 0;
++
++ do {
++ ss = strchr(s, ',');
++ if ( !ss )
++ ss = strchr(s, '\0');
++
++ /* Global and Xen-wide disable. */
++ val = parse_bool(s, ss);
++ if ( !val )
++ {
++ opt_msr_sc_pv = false;
++ opt_msr_sc_hvm = false;
++
++ opt_eager_fpu = 0;
++
++ if ( opt_xpti < 0 )
++ opt_xpti = 0;
++
++ if ( opt_smt < 0 )
++ opt_smt = 1;
++
++ if ( opt_pv_l1tf < 0 )
++ opt_pv_l1tf = 0;
++
++ disable_common:
++ opt_rsb_pv = false;
++ opt_rsb_hvm = false;
++
++ opt_thunk = THUNK_JMP;
++ opt_ibrs = 0;
++ opt_ibpb = false;
++ opt_ssbd = false;
++ opt_l1d_flush = 0;
++ }
++ else if ( val > 0 )
++ rc = -EINVAL;
++ else if ( (val = parse_boolean("xen", s, ss)) >= 0 )
++ {
++ if ( !val )
++ goto disable_common;
++
++ rc = -EINVAL;
++ }
++
++ /* Xen's alternative blocks. */
++ else if ( (val = parse_boolean("pv", s, ss)) >= 0 )
++ {
++ opt_msr_sc_pv = val;
++ opt_rsb_pv = val;
++ }
++ else if ( (val = parse_boolean("hvm", s, ss)) >= 0 )
++ {
++ opt_msr_sc_hvm = val;
++ opt_rsb_hvm = val;
++ }
++ else if ( (val = parse_boolean("msr-sc", s, ss)) >= 0 )
++ {
++ opt_msr_sc_pv = val;
++ opt_msr_sc_hvm = val;
++ }
++ else if ( (val = parse_boolean("rsb", s, ss)) >= 0 )
++ {
++ opt_rsb_pv = val;
++ opt_rsb_hvm = val;
++ }
++
++ /* Xen's speculative sidechannel mitigation settings. */
++ else if ( !strncmp(s, "bti-thunk=", 10) )
++ {
++ s += 10;
++
++ if ( !strncmp(s, "retpoline", ss - s) )
++ opt_thunk = THUNK_RETPOLINE;
++ else if ( !strncmp(s, "lfence", ss - s) )
++ opt_thunk = THUNK_LFENCE;
++ else if ( !strncmp(s, "jmp", ss - s) )
++ opt_thunk = THUNK_JMP;
++ else
++ rc = -EINVAL;
++ }
++ else if ( (val = parse_boolean("ibrs", s, ss)) >= 0 )
++ opt_ibrs = val;
++ else if ( (val = parse_boolean("ibpb", s, ss)) >= 0 )
++ opt_ibpb = val;
++ else if ( (val = parse_boolean("ssbd", s, ss)) >= 0 )
++ opt_ssbd = val;
++ else if ( (val = parse_boolean("eager-fpu", s, ss)) >= 0 )
++ opt_eager_fpu = val;
++ else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 )
++ opt_l1d_flush = val;
++ else
++ rc = -EINVAL;
++
++ s = ss + 1;
++ } while ( *ss );
++
++ return rc;
++}
++custom_param("spec-ctrl", parse_spec_ctrl);
++
++int8_t __read_mostly opt_pv_l1tf = -1;
++
++static __init int parse_pv_l1tf(const char *s)
++{
++ const char *ss;
++ int val, rc = 0;
++
++ /* Inhibit the defaults as an explicit choice has been given. */
++ if ( opt_pv_l1tf == -1 )
++ opt_pv_l1tf = 0;
++
++ /* Interpret 'pv-l1tf' alone in its positive boolean form. */
++ if ( *s == '\0' )
++ opt_pv_l1tf = OPT_PV_L1TF_DOM0 | OPT_PV_L1TF_DOMU;
++
++ do {
++ ss = strchr(s, ',');
++ if ( !ss )
++ ss = strchr(s, '\0');
++
++ switch ( parse_bool(s, ss) )
++ {
++ case 0:
++ opt_pv_l1tf = 0;
++ break;
++
++ case 1:
++ opt_pv_l1tf = OPT_PV_L1TF_DOM0 | OPT_PV_L1TF_DOMU;
++ break;
++
++ default:
++ if ( (val = parse_boolean("dom0", s, ss)) >= 0 )
++ opt_pv_l1tf = ((opt_pv_l1tf & ~OPT_PV_L1TF_DOM0) |
++ (val ? OPT_PV_L1TF_DOM0 : 0));
++ else if ( (val = parse_boolean("domu", s, ss)) >= 0 )
++ opt_pv_l1tf = ((opt_pv_l1tf & ~OPT_PV_L1TF_DOMU) |
++ (val ? OPT_PV_L1TF_DOMU : 0));
++ else
++ rc = -EINVAL;
++ break;
++ }
++
++ s = ss + 1;
++ } while ( *ss );
++
++ return rc;
++}
++custom_param("pv-l1tf", parse_pv_l1tf);
++
++static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ {
+ unsigned int _7d0 = 0, e8b = 0, tmp;
+- uint64_t caps = 0;
+
+ /* Collect diagnostics about available mitigations. */
+ if ( boot_cpu_data.cpuid_level >= 7 )
+ cpuid_count(7, 0, &tmp, &tmp, &tmp, &_7d0);
+ if ( boot_cpu_data.extended_cpuid_level >= 0x80000008 )
+ cpuid(0x80000008, &tmp, &e8b, &tmp, &tmp);
+- if ( _7d0 & cpufeat_mask(X86_FEATURE_ARCH_CAPS) )
+- rdmsrl(MSR_ARCH_CAPABILITIES, caps);
+
+- printk(XENLOG_DEBUG "Speculative mitigation facilities:\n");
++ printk("Speculative mitigation facilities:\n");
+
+ /* Hardware features which pertain to speculative mitigations. */
+- printk(XENLOG_DEBUG " Hardware features:%s%s%s%s%s\n",
++ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s\n",
+ (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "",
+ (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "",
++ (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "",
++ (_7d0 & cpufeat_mask(X86_FEATURE_SSBD)) ? " SSBD" : "",
+ (e8b & cpufeat_mask(X86_FEATURE_IBPB)) ? " IBPB" : "",
+ (caps & ARCH_CAPABILITIES_IBRS_ALL) ? " IBRS_ALL" : "",
+- (caps & ARCH_CAPABILITIES_RDCL_NO) ? " RDCL_NO" : "");
+-
+- /* Compiled-in support which pertains to BTI mitigations. */
+- if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) )
+- printk(XENLOG_DEBUG " Compiled-in support: INDIRECT_THUNK\n");
+-
+- printk("BTI mitigations: Thunk %s, Others:%s%s%s%s\n",
++ (caps & ARCH_CAPABILITIES_RDCL_NO) ? " RDCL_NO" : "",
++ (caps & ARCH_CAPS_RSBA) ? " RSBA" : "",
++ (caps & ARCH_CAPS_SKIP_L1DFL) ? " SKIP_L1DFL": "",
++ (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "");
++
++ /* Compiled-in support which pertains to mitigations. */
++ if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) )
++ printk(" Compiled-in support:"
++#ifdef CONFIG_INDIRECT_THUNK
++ " INDIRECT_THUNK"
++#endif
++#ifdef CONFIG_SHADOW_PAGING
++ " SHADOW_PAGING"
++#endif
++ "\n");
++
++ /* Settings for Xen's protection, irrespective of guests. */
++ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s\n",
+ thunk == THUNK_NONE ? "N/A" :
+ thunk == THUNK_RETPOLINE ? "RETPOLINE" :
+ thunk == THUNK_LFENCE ? "LFENCE" :
+ thunk == THUNK_JMP ? "JMP" : "?",
+- boot_cpu_has(X86_FEATURE_XEN_IBRS_SET) ? " IBRS+" :
+- boot_cpu_has(X86_FEATURE_XEN_IBRS_CLEAR) ? " IBRS-" : "",
+- opt_ibpb ? " IBPB" : "",
+- boot_cpu_has(X86_FEATURE_RSB_NATIVE) ? " RSB_NATIVE" : "",
+- boot_cpu_has(X86_FEATURE_RSB_VMEXIT) ? " RSB_VMEXIT" : "");
+-
+- printk("XPTI: %s\n",
+- boot_cpu_has(X86_FEATURE_NO_XPTI) ? "disabled" : "enabled");
++ !boot_cpu_has(X86_FEATURE_IBRSB) ? "No" :
++ (default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? "IBRS+" : "IBRS-",
++ !boot_cpu_has(X86_FEATURE_SSBD) ? "" :
++ (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-",
++ opt_ibpb ? " IBPB" : "",
++ opt_l1d_flush ? " L1D_FLUSH" : "");
++
++ /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */
++ if ( cpu_has_bug_l1tf || opt_pv_l1tf )
++ printk(" L1TF: believed%s vulnerable, maxphysaddr L1D %u, CPUID %u"
++ ", Safe address %"PRIx64"\n",
++ cpu_has_bug_l1tf ? "" : " not",
++ l1d_maxphysaddr, paddr_bits, l1tf_safe_maddr);
++
++ /*
++ * Alternatives blocks for protecting against and/or virtualising
++ * mitigation support for guests.
++ */
++ printk(" Support for VMs: PV:%s%s%s%s, HVM:%s%s%s%s\n",
++ (boot_cpu_has(X86_FEATURE_SC_MSR_PV) ||
++ boot_cpu_has(X86_FEATURE_SC_RSB_PV) ||
++ opt_eager_fpu) ? "" : " None",
++ boot_cpu_has(X86_FEATURE_SC_MSR_PV) ? " MSR_SPEC_CTRL" : "",
++ boot_cpu_has(X86_FEATURE_SC_RSB_PV) ? " RSB" : "",
++ opt_eager_fpu ? " EAGER_FPU" : "",
++ (boot_cpu_has(X86_FEATURE_SC_MSR_HVM) ||
++ boot_cpu_has(X86_FEATURE_SC_RSB_HVM) ||
++ opt_eager_fpu) ? "" : " None",
++ boot_cpu_has(X86_FEATURE_SC_MSR_HVM) ? " MSR_SPEC_CTRL" : "",
++ boot_cpu_has(X86_FEATURE_SC_RSB_HVM) ? " RSB" : "",
++ opt_eager_fpu ? " EAGER_FPU" : "");
++
++ printk(" XPTI (64-bit PV only): Dom0 %s, DomU %s\n",
++ opt_xpti & OPT_XPTI_DOM0 ? "enabled" : "disabled",
++ opt_xpti & OPT_XPTI_DOMU ? "enabled" : "disabled");
++
++ printk(" PV L1TF shadowing: Dom0 %s, DomU %s\n",
++ opt_pv_l1tf & OPT_PV_L1TF_DOM0 ? "enabled" : "disabled",
++ opt_pv_l1tf & OPT_PV_L1TF_DOMU ? "enabled" : "disabled");
+ }
+
+ /* Calculate whether Retpoline is known-safe on this CPU. */
+-static bool __init retpoline_safe(void)
++static bool __init retpoline_safe(uint64_t caps)
+ {
+ unsigned int ucode_rev = this_cpu(ucode_cpu_info).cpu_sig.rev;
+
+@@ -135,6 +347,13 @@ static bool __init retpoline_safe(void)
+ boot_cpu_data.x86 != 6 )
+ return false;
+
++ /*
++ * RSBA may be set by a hypervisor to indicate that we may move to a
++ * processor which isn't retpoline-safe.
++ */
++ if ( caps & ARCH_CAPS_RSBA )
++ return false;
++
+ switch ( boot_cpu_data.x86_model )
+ {
+ case 0x17: /* Penryn */
+@@ -161,26 +380,337 @@ static bool __init retpoline_safe(void)
+ * versions.
+ */
+ case 0x3d: /* Broadwell */
+- return ucode_rev >= 0x28;
++ return ucode_rev >= 0x2a;
+ case 0x47: /* Broadwell H */
+- return ucode_rev >= 0x1b;
++ return ucode_rev >= 0x1d;
+ case 0x4f: /* Broadwell EP/EX */
+- return ucode_rev >= 0xb000025;
++ return ucode_rev >= 0xb000021;
+ case 0x56: /* Broadwell D */
+- return false; /* TBD. */
++ switch ( boot_cpu_data.x86_mask )
++ {
++ case 2: return ucode_rev >= 0x15;
++ case 3: return ucode_rev >= 0x7000012;
++ case 4: return ucode_rev >= 0xf000011;
++ case 5: return ucode_rev >= 0xe000009;
++ default:
++ printk("Unrecognised CPU stepping %#x - assuming not reptpoline safe\n",
++ boot_cpu_data.x86_mask);
++ return false;
++ }
++ break;
+
+ /*
+- * Skylake and later processors are not retpoline-safe.
++ * Skylake, Kabylake and Cannonlake processors are not retpoline-safe.
+ */
++ case 0x4e:
++ case 0x55:
++ case 0x5e:
++ case 0x66:
++ case 0x67:
++ case 0x8e:
++ case 0x9e:
++ return false;
++
+ default:
++ printk("Unrecognised CPU model %#x - assuming not reptpoline safe\n",
++ boot_cpu_data.x86_model);
+ return false;
+ }
+ }
+
++/* Calculate whether this CPU speculates past #NM */
++static bool __init should_use_eager_fpu(void)
++{
++ /*
++ * Assume all unrecognised processors are ok. This is only known to
++ * affect Intel Family 6 processors.
++ */
++ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
++ boot_cpu_data.x86 != 6 )
++ return false;
++
++ switch ( boot_cpu_data.x86_model )
++ {
++ /*
++ * Core processors since at least Nehalem are vulnerable.
++ */
++ case 0x1e: /* Nehalem */
++ case 0x1f: /* Auburndale / Havendale */
++ case 0x1a: /* Nehalem EP */
++ case 0x2e: /* Nehalem EX */
++ case 0x25: /* Westmere */
++ case 0x2c: /* Westmere EP */
++ case 0x2f: /* Westmere EX */
++ case 0x2a: /* SandyBridge */
++ case 0x2d: /* SandyBridge EP/EX */
++ case 0x3a: /* IvyBridge */
++ case 0x3e: /* IvyBridge EP/EX */
++ case 0x3c: /* Haswell */
++ case 0x3f: /* Haswell EX/EP */
++ case 0x45: /* Haswell D */
++ case 0x46: /* Haswell H */
++ case 0x3d: /* Broadwell */
++ case 0x47: /* Broadwell H */
++ case 0x4f: /* Broadwell EP/EX */
++ case 0x56: /* Broadwell D */
++ case 0x4e: /* Skylake M */
++ case 0x55: /* Skylake X */
++ case 0x5e: /* Skylake D */
++ case 0x66: /* Cannonlake */
++ case 0x67: /* Cannonlake? */
++ case 0x8e: /* Kabylake M */
++ case 0x9e: /* Kabylake D */
++ return true;
++
++ /*
++ * Atom processors are not vulnerable.
++ */
++ case 0x1c: /* Pineview */
++ case 0x26: /* Lincroft */
++ case 0x27: /* Penwell */
++ case 0x35: /* Cloverview */
++ case 0x36: /* Cedarview */
++ case 0x37: /* Baytrail / Valleyview (Silvermont) */
++ case 0x4d: /* Avaton / Rangely (Silvermont) */
++ case 0x4c: /* Cherrytrail / Brasswell */
++ case 0x4a: /* Merrifield */
++ case 0x5a: /* Moorefield */
++ case 0x5c: /* Goldmont */
++ case 0x5f: /* Denverton */
++ case 0x7a: /* Gemini Lake */
++ return false;
++
++ /*
++ * Knights processors are not vulnerable.
++ */
++ case 0x57: /* Knights Landing */
++ case 0x85: /* Knights Mill */
++ return false;
++
++ default:
++ printk("Unrecognised CPU model %#x - assuming vulnerable to LazyFPU\n",
++ boot_cpu_data.x86_model);
++ return true;
++ }
++}
++
++/* Calculate whether this CPU is vulnerable to L1TF. */
++static __init void l1tf_calculations(uint64_t caps)
++{
++ bool hit_default = false;
++
++ l1d_maxphysaddr = paddr_bits;
++
++ /* L1TF is only known to affect Intel Family 6 processors at this time. */
++ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
++ boot_cpu_data.x86 == 6 )
++ {
++ switch ( boot_cpu_data.x86_model )
++ {
++ /*
++ * Core processors since at least Penryn are vulnerable.
++ */
++ case 0x17: /* Penryn */
++ case 0x1d: /* Dunnington */
++ cpu_has_bug_l1tf = true;
++ break;
++
++ case 0x1f: /* Auburndale / Havendale */
++ case 0x1e: /* Nehalem */
++ case 0x1a: /* Nehalem EP */
++ case 0x2e: /* Nehalem EX */
++ case 0x25: /* Westmere */
++ case 0x2c: /* Westmere EP */
++ case 0x2f: /* Westmere EX */
++ cpu_has_bug_l1tf = true;
++ l1d_maxphysaddr = 44;
++ break;
++
++ case 0x2a: /* SandyBridge */
++ case 0x2d: /* SandyBridge EP/EX */
++ case 0x3a: /* IvyBridge */
++ case 0x3e: /* IvyBridge EP/EX */
++ case 0x3c: /* Haswell */
++ case 0x3f: /* Haswell EX/EP */
++ case 0x45: /* Haswell D */
++ case 0x46: /* Haswell H */
++ case 0x3d: /* Broadwell */
++ case 0x47: /* Broadwell H */
++ case 0x4f: /* Broadwell EP/EX */
++ case 0x56: /* Broadwell D */
++ case 0x4e: /* Skylake M */
++ case 0x55: /* Skylake X */
++ case 0x5e: /* Skylake D */
++ case 0x66: /* Cannonlake */
++ case 0x67: /* Cannonlake? */
++ case 0x8e: /* Kabylake M */
++ case 0x9e: /* Kabylake D */
++ cpu_has_bug_l1tf = true;
++ l1d_maxphysaddr = 46;
++ break;
++
++ /*
++ * Atom processors are not vulnerable.
++ */
++ case 0x1c: /* Pineview */
++ case 0x26: /* Lincroft */
++ case 0x27: /* Penwell */
++ case 0x35: /* Cloverview */
++ case 0x36: /* Cedarview */
++ case 0x37: /* Baytrail / Valleyview (Silvermont) */
++ case 0x4d: /* Avaton / Rangely (Silvermont) */
++ case 0x4c: /* Cherrytrail / Brasswell */
++ case 0x4a: /* Merrifield */
++ case 0x5a: /* Moorefield */
++ case 0x5c: /* Goldmont */
++ case 0x5f: /* Denverton */
++ case 0x7a: /* Gemini Lake */
++ break;
++
++ /*
++ * Knights processors are not vulnerable.
++ */
++ case 0x57: /* Knights Landing */
++ case 0x85: /* Knights Mill */
++ break;
++
++ default:
++ /* Defer printk() until we've accounted for RDCL_NO. */
++ hit_default = true;
++ cpu_has_bug_l1tf = true;
++ break;
++ }
++ }
++
++ /* Any processor advertising RDCL_NO should be not vulnerable to L1TF. */
++ if ( caps & ARCH_CAPABILITIES_RDCL_NO )
++ cpu_has_bug_l1tf = false;
++
++ if ( cpu_has_bug_l1tf && hit_default )
++ printk("Unrecognised CPU model %#x - assuming vulnerable to L1TF\n",
++ boot_cpu_data.x86_model);
++
++ /*
++ * L1TF safe address heuristics. These apply to the real hardware we are
++ * running on, and are best-effort-only if Xen is virtualised.
++ *
++ * The address mask which the L1D cache uses, which might be wider than
++ * the CPUID-reported maxphysaddr.
++ */
++ l1tf_addr_mask = ((1ul << l1d_maxphysaddr) - 1) & PAGE_MASK;
++
++ /*
++ * To be safe, l1tf_safe_maddr must be above the highest cacheable entity
++ * in system physical address space. However, to preserve space for
++ * paged-out metadata, it should be as low as possible above the highest
++ * cacheable address, so as to require fewer high-order bits being set.
++ *
++ * These heuristics are based on some guesswork to improve the likelihood
++ * of safety in the common case, including Linux's L1TF mitigation of
++ * inverting all address bits in a non-present PTE.
++ *
++ * - If L1D is wider than CPUID (Nehalem and later mobile/desktop/low end
++ * server), setting any address bit beyond CPUID maxphysaddr guarantees
++ * to make the PTE safe. This case doesn't require all the high-order
++ * bits being set, and doesn't require any other source of information
++ * for safety.
++ *
++ * - If L1D is the same as CPUID (Pre-Nehalem, or high end server), we
++ * must sacrifice high order bits from the real address space for
++ * safety. Therefore, make a blind guess that there is nothing
++ * cacheable in the top quarter of physical address space.
++ *
++ * It is exceedingly unlikely for machines to be populated with this
++ * much RAM (likely 512G on pre-Nehalem, 16T on Nehalem/Westmere, 64T on
++ * Sandybridge and later) due to the sheer volume of DIMMs this would
++ * actually take.
++ *
++ * However, it is possible to find machines this large, so the "top
++ * quarter" guess is supplemented to push the limit higher if references
++ * to cacheable mappings (E820/SRAT/EFI/etc) are found above the top
++ * quarter boundary.
++ *
++ * Finally, this top quarter guess gives us a good chance of being safe
++ * when running virtualised (and the CPUID maxphysaddr hasn't been
++ * levelled for heterogeneous migration safety), where the safety
++ * consideration is still in terms of host details, but all E820/etc
++ * information is in terms of guest physical layout.
++ */
++ l1tf_safe_maddr = max(l1tf_safe_maddr, ((l1d_maxphysaddr > paddr_bits)
++ ? (1ul << paddr_bits)
++ : (3ul << (paddr_bits - 2))));
++}
++
++int8_t __read_mostly opt_xpti = -1;
++
++static __init void xpti_init_default(uint64_t caps)
++{
++ if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD )
++ caps = ARCH_CAPABILITIES_RDCL_NO;
++
++ if ( caps & ARCH_CAPABILITIES_RDCL_NO )
++ opt_xpti = 0;
++ else
++ opt_xpti = OPT_XPTI_DOM0 | OPT_XPTI_DOMU;
++}
++
++static __init int parse_xpti(const char *s)
++{
++ const char *ss;
++ int val, rc = 0;
++
++ /* Inhibit the defaults as an explicit choice has been given. */
++ if ( opt_xpti == -1 )
++ opt_xpti = 0;
++
++ /* Interpret 'xpti' alone in its positive boolean form. */
++ if ( *s == '\0' )
++ opt_xpti = OPT_XPTI_DOM0 | OPT_XPTI_DOMU;
++
++ do {
++ ss = strchr(s, ',');
++ if ( !ss )
++ ss = strchr(s, '\0');
++
++ switch ( parse_bool(s, ss) )
++ {
++ case 0:
++ opt_xpti = 0;
++ break;
++
++ case 1:
++ opt_xpti = OPT_XPTI_DOM0 | OPT_XPTI_DOMU;
++ break;
++
++ default:
++ if ( !strcmp(s, "default") )
++ opt_xpti = -1;
++ else if ( (val = parse_boolean("dom0", s, ss)) >= 0 )
++ opt_xpti = (opt_xpti & ~OPT_XPTI_DOM0) |
++ (val ? OPT_XPTI_DOM0 : 0);
++ else if ( (val = parse_boolean("domu", s, ss)) >= 0 )
++ opt_xpti = (opt_xpti & ~OPT_XPTI_DOMU) |
++ (val ? OPT_XPTI_DOMU : 0);
++ else
++ rc = -EINVAL;
++ break;
++ }
++
++ s = ss + 1;
++ } while ( *ss );
++
++ return rc;
++}
++custom_param("xpti", parse_xpti);
++
+ void __init init_speculation_mitigations(void)
+ {
+ enum ind_thunk thunk = THUNK_DEFAULT;
+- bool ibrs = false;
++ bool use_spec_ctrl = false, ibrs = false;
++ uint64_t caps = 0;
++
++ if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
++ rdmsrl(MSR_ARCH_CAPABILITIES, caps);
+
+ /*
+ * Has the user specified any custom BTI mitigations? If so, follow their
+@@ -209,7 +739,7 @@ void __init init_speculation_mitigations(void)
+ * On Intel hardware, we'd like to use retpoline in preference to
+ * IBRS, but only if it is safe on this hardware.
+ */
+- else if ( retpoline_safe() )
++ else if ( retpoline_safe(caps) )
+ thunk = THUNK_RETPOLINE;
+ else if ( boot_cpu_has(X86_FEATURE_IBRSB) )
+ ibrs = true;
+@@ -246,21 +776,35 @@ void __init init_speculation_mitigations(void)
+ else if ( thunk == THUNK_JMP )
+ setup_force_cpu_cap(X86_FEATURE_IND_THUNK_JMP);
+
++ /*
++ * If we are on hardware supporting MSR_SPEC_CTRL, see about setting up
++ * the alternatives blocks so we can virtualise support for guests.
++ */
+ if ( boot_cpu_has(X86_FEATURE_IBRSB) )
+ {
+- /*
+- * Even if we've chosen to not have IBRS set in Xen context, we still
+- * need the IBRS entry/exit logic to virtualise IBRS support for
+- * guests.
+- */
+- if ( ibrs )
+- setup_force_cpu_cap(X86_FEATURE_XEN_IBRS_SET);
+- else
+- setup_force_cpu_cap(X86_FEATURE_XEN_IBRS_CLEAR);
++ if ( opt_msr_sc_pv )
++ {
++ use_spec_ctrl = true;
++ setup_force_cpu_cap(X86_FEATURE_SC_MSR_PV);
++ }
++
++ if ( opt_msr_sc_hvm )
++ {
++ use_spec_ctrl = true;
++ setup_force_cpu_cap(X86_FEATURE_SC_MSR_HVM);
++ }
++
++ if ( use_spec_ctrl )
++ default_spec_ctrl_flags |= SCF_ist_wrmsr;
+
+- default_bti_ist_info |= BTI_IST_WRMSR | ibrs;
++ if ( ibrs )
++ default_xen_spec_ctrl |= SPEC_CTRL_IBRS;
+ }
+
++ /* If we have SSBD available, see whether we should use it. */
++ if ( boot_cpu_has(X86_FEATURE_SSBD) && opt_ssbd )
++ default_xen_spec_ctrl |= SPEC_CTRL_SSBD;
++
+ /*
+ * PV guests can poison the RSB to any virtual address from which
+ * they can execute a call instruction. This is necessarily outside
+@@ -274,33 +818,123 @@ void __init init_speculation_mitigations(void)
+ * If a processors speculates to 32bit PV guest kernel mappings, it is
+ * speculating in 64bit supervisor mode, and can leak data.
+ */
+- if ( opt_rsb_native )
++ if ( opt_rsb_pv )
+ {
+- setup_force_cpu_cap(X86_FEATURE_RSB_NATIVE);
+- default_bti_ist_info |= BTI_IST_RSB;
++ setup_force_cpu_cap(X86_FEATURE_SC_RSB_PV);
++ default_spec_ctrl_flags |= SCF_ist_rsb;
+ }
+
+ /*
+ * HVM guests can always poison the RSB to point at Xen supervisor
+ * mappings.
+ */
+- if ( opt_rsb_vmexit )
+- setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
++ if ( opt_rsb_hvm )
++ setup_force_cpu_cap(X86_FEATURE_SC_RSB_HVM);
+
+ /* Check we have hardware IBPB support before using it... */
+ if ( !boot_cpu_has(X86_FEATURE_IBRSB) && !boot_cpu_has(X86_FEATURE_IBPB) )
+ opt_ibpb = false;
+
+- /* (Re)init BSP state now that default_bti_ist_info has been calculated. */
++ /* Check whether Eager FPU should be enabled by default. */
++ if ( opt_eager_fpu == -1 )
++ opt_eager_fpu = should_use_eager_fpu();
++
++ /* (Re)init BSP state now that default_spec_ctrl_flags has been calculated. */
+ init_shadow_spec_ctrl_state();
+
+- print_details(thunk);
++ /* If Xen is using any MSR_SPEC_CTRL settings, adjust the idle path. */
++ if ( default_xen_spec_ctrl )
++ setup_force_cpu_cap(X86_FEATURE_SC_MSR_IDLE);
++
++ if ( opt_xpti == -1 )
++ xpti_init_default(caps);
++
++ if ( opt_xpti == 0 )
++ setup_force_cpu_cap(X86_FEATURE_NO_XPTI);
++ else
++ setup_clear_cpu_cap(X86_FEATURE_NO_XPTI);
++
++ l1tf_calculations(caps);
++
++ /*
++ * By default, enable PV domU L1TF mitigations on all L1TF-vulnerable
++ * hardware, except when running in shim mode.
++ *
++ * In shim mode, SHADOW is expected to be compiled out, and a malicious
++ * guest kernel can only attack the shim Xen, not the host Xen.
++ */
++ if ( opt_pv_l1tf == -1 )
++ {
++ if ( pv_shim || !cpu_has_bug_l1tf )
++ opt_pv_l1tf = 0;
++ else
++ opt_pv_l1tf = OPT_PV_L1TF_DOMU;
++ }
++
++ /*
++ * By default, enable L1D_FLUSH on L1TF-vulnerable hardware, unless
++ * instructed to skip the flush on vmentry by our outer hypervisor.
++ */
++ if ( !boot_cpu_has(X86_FEATURE_L1D_FLUSH) )
++ opt_l1d_flush = 0;
++ else if ( opt_l1d_flush == -1 )
++ opt_l1d_flush = cpu_has_bug_l1tf && !(caps & ARCH_CAPS_SKIP_L1DFL);
++
++ /*
++ * We do not disable HT by default on affected hardware.
++ *
++ * Firstly, if the user intends to use exclusively PV, or HVM shadow
++ * guests, HT isn't a concern and should remain fully enabled. Secondly,
++ * safety for HVM HAP guests can be arranged by the toolstack with core
++ * parking, pinning or cpupool configurations, including mixed setups.
++ *
++ * However, if we are on affected hardware, with HT enabled, and the user
++ * hasn't explicitly chosen whether to use HT or not, nag them to do so.
++ */
++ if ( opt_smt == -1 && cpu_has_bug_l1tf && !pv_shim &&
++ boot_cpu_data.x86_num_siblings > 1 )
++ warning_add(
++ "Booted on L1TF-vulnerable hardware with SMT/Hyperthreading\n"
++ "enabled. Please assess your configuration and choose an\n"
++ "explicit 'smt=<bool>' setting. See XSA-273.\n");
++
++ print_details(thunk, caps);
++
++ /*
++ * If MSR_SPEC_CTRL is available, apply Xen's default setting and discard
++ * any firmware settings. For performance reasons, when safe to do so, we
++ * delay applying non-zero settings until after dom0 has been constructed.
++ *
++ * "when safe to do so" is based on whether we are virtualised. A native
++ * boot won't have any other code running in a position to mount an
++ * attack.
++ */
++ if ( boot_cpu_has(X86_FEATURE_IBRSB) )
++ {
++ bsp_delay_spec_ctrl = !cpu_has_hypervisor && default_xen_spec_ctrl;
++
++ /*
++ * If delaying MSR_SPEC_CTRL setup, use the same mechanism as
++ * spec_ctrl_enter_idle(), by using a shadow value of zero.
++ */
++ if ( bsp_delay_spec_ctrl )
++ {
++ struct cpu_info *info = get_cpu_info();
++
++ info->shadow_spec_ctrl = 0;
++ barrier();
++ info->spec_ctrl_flags |= SCF_use_shadow;
++ barrier();
++ }
++
++ wrmsrl(MSR_SPEC_CTRL, bsp_delay_spec_ctrl ? 0 : default_xen_spec_ctrl);
++ }
+ }
+
+ static void __init __maybe_unused build_assertions(void)
+ {
+ /* The optimised assembly relies on this alias. */
+- BUILD_BUG_ON(BTI_IST_IBRS != SPEC_CTRL_IBRS);
++ BUILD_BUG_ON(SCF_use_shadow != 1);
+ }
+
+ /*
+diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c
+index 166eb44fe2..2d70b45909 100644
+--- a/xen/arch/x86/srat.c
++++ b/xen/arch/x86/srat.c
+@@ -20,6 +20,7 @@
+ #include <xen/pfn.h>
+ #include <asm/e820.h>
+ #include <asm/page.h>
++#include <asm/spec_ctrl.h>
+
+ static struct acpi_table_slit *__read_mostly acpi_slit;
+
+@@ -284,6 +285,11 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
+ if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
+ return;
+
++ start = ma->base_address;
++ end = start + ma->length;
++ /* Supplement the heuristics in l1tf_calculations(). */
++ l1tf_safe_maddr = max(l1tf_safe_maddr, ROUNDUP(end, PAGE_SIZE));
++
+ if (num_node_memblks >= NR_NODE_MEMBLKS)
+ {
+ dprintk(XENLOG_WARNING,
+@@ -292,8 +298,6 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
+ return;
+ }
+
+- start = ma->base_address;
+- end = start + ma->length;
+ pxm = ma->proximity_domain;
+ if (srat_rev < 2)
+ pxm &= 0xff;
+diff --git a/xen/arch/x86/sysctl.c b/xen/arch/x86/sysctl.c
+index 6ba823ca69..e726eee974 100644
+--- a/xen/arch/x86/sysctl.c
++++ b/xen/arch/x86/sysctl.c
+@@ -23,6 +23,7 @@
+ #include <asm/hvm/hvm.h>
+ #include <asm/hvm/support.h>
+ #include <asm/processor.h>
++#include <asm/setup.h>
+ #include <asm/smp.h>
+ #include <asm/numa.h>
+ #include <xen/nodemask.h>
+@@ -48,14 +49,27 @@ static void l3_cache_get(void *arg)
+
+ long cpu_up_helper(void *data)
+ {
+- int cpu = (unsigned long)data;
++ unsigned int cpu = (unsigned long)data;
+ int ret = cpu_up(cpu);
++
+ if ( ret == -EBUSY )
+ {
+ /* On EBUSY, flush RCU work and have one more go. */
+ rcu_barrier();
+ ret = cpu_up(cpu);
+ }
++
++ if ( !ret && !opt_smt &&
++ cpu_data[cpu].compute_unit_id == INVALID_CUID &&
++ cpumask_weight(per_cpu(cpu_sibling_mask, cpu)) > 1 )
++ {
++ ret = cpu_down_helper(data);
++ if ( ret )
++ printk("Could not re-offline CPU%u (%d)\n", cpu, ret);
++ else
++ ret = -EPERM;
++ }
++
+ return ret;
+ }
+
+diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
+index 906124331b..e8f85f716e 100644
+--- a/xen/arch/x86/traps.c
++++ b/xen/arch/x86/traps.c
+@@ -96,8 +96,6 @@ string_param("nmi", opt_nmi);
+ DEFINE_PER_CPU(u64, efer);
+ static DEFINE_PER_CPU(unsigned long, last_extable_addr);
+
+-DEFINE_PER_CPU_READ_MOSTLY(u32, ler_msr);
+-
+ DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, gdt_table);
+ DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, compat_gdt_table);
+
+@@ -117,6 +115,9 @@ integer_param("debug_stack_lines", debug_stack_lines);
+ static bool opt_ler;
+ boolean_param("ler", opt_ler);
+
++/* LastExceptionFromIP on this hardware. Zero if LER is not in use. */
++unsigned int __read_mostly ler_msr;
++
+ #define stack_words_per_line 4
+ #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
+
+@@ -325,13 +326,13 @@ static void show_guest_stack(struct vcpu *v, const struct cpu_user_regs *regs)
+ /*
+ * Notes for get_stack_trace_bottom() and get_stack_dump_bottom()
+ *
+- * Stack pages 0, 1 and 2:
++ * Stack pages 0 - 3:
+ * These are all 1-page IST stacks. Each of these stacks has an exception
+ * frame and saved register state at the top. The interesting bound for a
+ * trace is the word adjacent to this, while the bound for a dump is the
+ * very top, including the exception frame.
+ *
+- * Stack pages 3, 4 and 5:
++ * Stack pages 4 and 5:
+ * None of these are particularly interesting. With MEMORY_GUARD, page 5 is
+ * explicitly not present, so attempting to dump or trace it is
+ * counterproductive. Without MEMORY_GUARD, it is possible for a call chain
+@@ -352,12 +353,12 @@ unsigned long get_stack_trace_bottom(unsigned long sp)
+ {
+ switch ( get_stack_page(sp) )
+ {
+- case 0 ... 2:
++ case 0 ... 3:
+ return ROUNDUP(sp, PAGE_SIZE) -
+ offsetof(struct cpu_user_regs, es) - sizeof(unsigned long);
+
+ #ifndef MEMORY_GUARD
+- case 3 ... 5:
++ case 4 ... 5:
+ #endif
+ case 6 ... 7:
+ return ROUNDUP(sp, STACK_SIZE) -
+@@ -372,11 +373,11 @@ unsigned long get_stack_dump_bottom(unsigned long sp)
+ {
+ switch ( get_stack_page(sp) )
+ {
+- case 0 ... 2:
++ case 0 ... 3:
+ return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long);
+
+ #ifndef MEMORY_GUARD
+- case 3 ... 5:
++ case 4 ... 5:
+ #endif
+ case 6 ... 7:
+ return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long);
+@@ -1722,7 +1723,21 @@ void do_device_not_available(struct cpu_user_regs *regs)
+ {
+ struct vcpu *curr = current;
+
+- BUG_ON(!guest_mode(regs));
++ if ( !guest_mode(regs) )
++ {
++ unsigned long fixup = search_exception_table(regs);
++
++ gprintk(XENLOG_ERR, "#NM: %p [%ps] -> %p\n",
++ _p(regs->rip), _p(regs->rip), _p(fixup));
++ /*
++ * We shouldn't be able to reach here, but for release builds have
++ * the recovery logic in place nevertheless.
++ */
++ ASSERT_UNREACHABLE();
++ BUG_ON(!fixup);
++ regs->rip = fixup;
++ return;
++ }
+
+ vcpu_restore_fpu_lazy(curr);
+
+@@ -1748,26 +1763,51 @@ void write_efer(u64 val)
+ wrmsrl(MSR_EFER, val);
+ }
+
+-static void ler_enable(void)
+-{
+- u64 debugctl;
+-
+- if ( !this_cpu(ler_msr) )
+- return;
+-
+- rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+- wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | IA32_DEBUGCTLMSR_LBR);
+-}
+-
+ void do_debug(struct cpu_user_regs *regs)
+ {
++ unsigned long dr6;
+ struct vcpu *v = current;
+
++ /* Stash dr6 as early as possible. */
++ dr6 = read_debugreg(6);
++
+ if ( debugger_trap_entry(TRAP_debug, regs) )
+ return;
+
++ /*
++ * At the time of writing (March 2018), on the subject of %dr6:
++ *
++ * The Intel manual says:
++ * Certain debug exceptions may clear bits 0-3. The remaining contents
++ * of the DR6 register are never cleared by the processor. To avoid
++ * confusion in identifying debug exceptions, debug handlers should
++ * clear the register (except bit 16, which they should set) before
++ * returning to the interrupted task.
++ *
++ * The AMD manual says:
++ * Bits 15:13 of the DR6 register are not cleared by the processor and
++ * must be cleared by software after the contents have been read.
++ *
++ * Some bits are reserved set, some are reserved clear, and some bits
++ * which were previously reserved set are reused and cleared by hardware.
++ * For future compatibility, reset to the default value, which will allow
++ * us to spot any bit being changed by hardware to its non-default value.
++ */
++ write_debugreg(6, X86_DR6_DEFAULT);
++
++ /* #DB automatically disabled LBR. Reinstate it if debugging Xen. */
++ if ( cpu_has_xen_lbr )
++ wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR);
++
+ if ( !guest_mode(regs) )
+ {
++ /*
++ * !!! WARNING !!!
++ *
++ * %dr6 is mostly guest controlled at this point. Any decisions based
++ * on its value must be crosschecked with non-guest controlled state.
++ */
++
+ if ( regs->eflags & X86_EFLAGS_TF )
+ {
+ /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
+@@ -1776,7 +1816,7 @@ void do_debug(struct cpu_user_regs *regs)
+ {
+ if ( regs->rip == (unsigned long)sysenter_eflags_saved )
+ regs->eflags &= ~X86_EFLAGS_TF;
+- goto out;
++ return;
+ }
+ if ( !debugger_trap_fatal(TRAP_debug, regs) )
+ {
+@@ -1784,29 +1824,63 @@ void do_debug(struct cpu_user_regs *regs)
+ regs->eflags &= ~X86_EFLAGS_TF;
+ }
+ }
+- else
++
++ /*
++ * Check for fault conditions. General Detect, and instruction
++ * breakpoints are faults rather than traps, at which point attempting
++ * to ignore and continue will result in a livelock.
++ *
++ * However, on entering the #DB handler, hardware clears %dr7.gd for
++ * us (as confirmed by the earlier %dr6 accesses succeeding), meaning
++ * that a real General Detect exception is restartable.
++ *
++ * PV guests are not permitted to point %dr{0..3} at Xen linear
++ * addresses, and Instruction Breakpoints (being faults) don't get
++ * delayed by a MovSS shadow, so we should never encounter one in
++ * hypervisor context.
++ *
++ * If however we do, safety measures need to be enacted. Use a big
++ * hammer and clear all debug settings.
++ */
++ if ( dr6 & (DR_TRAP3 | DR_TRAP2 | DR_TRAP1 | DR_TRAP0) )
+ {
+- /*
+- * We ignore watchpoints when they trigger within Xen. This may
+- * happen when a buffer is passed to us which previously had a
+- * watchpoint set on it. No need to bump EIP; the only faulting
+- * trap is an instruction breakpoint, which can't happen to us.
+- */
+- WARN_ON(!search_exception_table(regs));
++ unsigned int bp, dr7 = read_debugreg(7);
++
++ for ( bp = 0; bp < 4; ++bp )
++ {
++ if ( (dr6 & (1u << bp)) && /* Breakpoint triggered? */
++ (dr7 & (3u << (bp * DR_ENABLE_SIZE))) && /* Enabled? */
++ ((dr7 & (3u << ((bp * DR_CONTROL_SIZE) + /* Insn? */
++ DR_CONTROL_SHIFT))) == DR_RW_EXECUTE) )
++ {
++ ASSERT_UNREACHABLE();
++
++ printk(XENLOG_ERR
++ "Hit instruction breakpoint in Xen context\n");
++ write_debugreg(7, 0);
++ break;
++ }
++ }
+ }
+- goto out;
++
++ /*
++ * Whatever caused this #DB should be restartable by this point. Note
++ * it and continue. Guests can trigger this in certain corner cases,
++ * so ensure the message is ratelimited.
++ */
++ gprintk(XENLOG_WARNING,
++ "Hit #DB in Xen context: %04x:%p [%ps], stk %04x:%p, dr6 %lx\n",
++ regs->cs, _p(regs->rip), _p(regs->rip),
++ regs->ss, _p(regs->rsp), dr6);
++
++ return;
+ }
+
+ /* Save debug status register where guest OS can peek at it */
+- v->arch.debugreg[6] = read_debugreg(6);
++ v->arch.debugreg[6] |= (dr6 & ~X86_DR6_DEFAULT);
++ v->arch.debugreg[6] &= (dr6 | ~X86_DR6_DEFAULT);
+
+- ler_enable();
+ pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
+- return;
+-
+- out:
+- ler_enable();
+- return;
+ }
+
+ static void __init noinline __set_intr_gate(unsigned int n,
+@@ -1850,38 +1924,46 @@ void load_TR(void)
+ : "=m" (old_gdt) : "rm" (TSS_ENTRY << 3), "m" (tss_gdt) : "memory" );
+ }
+
+-void percpu_traps_init(void)
++static unsigned int calc_ler_msr(void)
+ {
+- subarch_percpu_traps_init();
+-
+- if ( !opt_ler )
+- return;
+-
+ switch ( boot_cpu_data.x86_vendor )
+ {
+ case X86_VENDOR_INTEL:
+ switch ( boot_cpu_data.x86 )
+ {
+ case 6:
+- this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
+- break;
++ return MSR_IA32_LASTINTFROMIP;
++
+ case 15:
+- this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
+- break;
++ return MSR_P4_LER_FROM_LIP;
+ }
+ break;
++
+ case X86_VENDOR_AMD:
+ switch ( boot_cpu_data.x86 )
+ {
+ case 6:
+ case 0xf ... 0x17:
+- this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
+- break;
++ return MSR_IA32_LASTINTFROMIP;
+ }
+ break;
+ }
+
+- ler_enable();
++ return 0;
++}
++
++void percpu_traps_init(void)
++{
++ subarch_percpu_traps_init();
++
++ if ( !opt_ler )
++ return;
++
++ if ( !ler_msr && (ler_msr = calc_ler_msr()) )
++ setup_force_cpu_cap(X86_FEATURE_XEN_LBR);
++
++ if ( cpu_has_xen_lbr )
++ wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR);
+ }
+
+ void __init init_idt_traps(void)
+@@ -1917,6 +1999,7 @@ void __init init_idt_traps(void)
+ set_ist(&idt_table[TRAP_double_fault], IST_DF);
+ set_ist(&idt_table[TRAP_nmi], IST_NMI);
+ set_ist(&idt_table[TRAP_machine_check], IST_MCE);
++ set_ist(&idt_table[TRAP_debug], IST_DB);
+
+ /* CPU0 uses the master IDT. */
+ idt_tables[0] = idt_table;
+@@ -1984,6 +2067,12 @@ void activate_debugregs(const struct vcpu *curr)
+ }
+ }
+
++/*
++ * Used by hypercalls and the emulator.
++ * -ENODEV => #UD
++ * -EINVAL => #GP Invalid bit
++ * -EPERM => #GP Valid bit, but not permitted to use
++ */
+ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
+ {
+ int i;
+@@ -2015,7 +2104,17 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
+ if ( v == curr )
+ write_debugreg(3, value);
+ break;
++
++ case 4:
++ if ( v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE )
++ return -ENODEV;
++
++ /* Fallthrough */
+ case 6:
++ /* The upper 32 bits are strictly reserved. */
++ if ( value != (uint32_t)value )
++ return -EINVAL;
++
+ /*
+ * DR6: Bits 4-11,16-31 reserved (set to 1).
+ * Bit 12 reserved (set to 0).
+@@ -2025,7 +2124,17 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
+ if ( v == curr )
+ write_debugreg(6, value);
+ break;
++
++ case 5:
++ if ( v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE )
++ return -ENODEV;
++
++ /* Fallthrough */
+ case 7:
++ /* The upper 32 bits are strictly reserved. */
++ if ( value != (uint32_t)value )
++ return -EINVAL;
++
+ /*
+ * DR7: Bit 10 reserved (set to 1).
+ * Bits 11-12,14-15 reserved (set to 0).
+@@ -2038,6 +2147,7 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
+ */
+ if ( value & DR_GENERAL_DETECT )
+ return -EPERM;
++
+ /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
+ if ( value & DR7_ACTIVE_MASK )
+ {
+@@ -2066,11 +2176,15 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
+ !(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
+ activate_debugregs(v);
+ }
++ else
++ /* Zero the emulated controls if %dr7 isn't active. */
++ v->arch.debugreg[5] = 0;
++
+ if ( v == curr )
+ write_debugreg(7, value);
+ break;
+ default:
+- return -EINVAL;
++ return -ENODEV;
+ }
+
+ v->arch.debugreg[reg] = value;
+diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c
+index 13478d4fc1..10c243a039 100644
+--- a/xen/arch/x86/x86_64/asm-offsets.c
++++ b/xen/arch/x86/x86_64/asm-offsets.c
+@@ -142,8 +142,10 @@ void __dummy__(void)
+ OFFSET(CPUINFO_xen_cr3, struct cpu_info, xen_cr3);
+ OFFSET(CPUINFO_pv_cr3, struct cpu_info, pv_cr3);
+ OFFSET(CPUINFO_shadow_spec_ctrl, struct cpu_info, shadow_spec_ctrl);
+- OFFSET(CPUINFO_use_shadow_spec_ctrl, struct cpu_info, use_shadow_spec_ctrl);
+- OFFSET(CPUINFO_bti_ist_info, struct cpu_info, bti_ist_info);
++ OFFSET(CPUINFO_xen_spec_ctrl, struct cpu_info, xen_spec_ctrl);
++ OFFSET(CPUINFO_spec_ctrl_flags, struct cpu_info, spec_ctrl_flags);
++ OFFSET(CPUINFO_root_pgt_changed, struct cpu_info, root_pgt_changed);
++ OFFSET(CPUINFO_use_pv_cr3, struct cpu_info, use_pv_cr3);
+ DEFINE(CPUINFO_sizeof, sizeof(struct cpu_info));
+ BLANK();
+
+diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S
+index 75497bc292..1c4cd795d2 100644
+--- a/xen/arch/x86/x86_64/compat/entry.S
++++ b/xen/arch/x86/x86_64/compat/entry.S
+@@ -39,6 +39,12 @@ ENTRY(compat_test_all_events)
+ leaq irq_stat+IRQSTAT_softirq_pending(%rip),%rcx
+ cmpl $0,(%rcx,%rax,1)
+ jne compat_process_softirqs
++
++ /* Inject exception if pending. */
++ lea VCPU_trap_bounce(%rbx), %rdx
++ testb $TBF_EXCEPTION, TRAPBOUNCE_flags(%rdx)
++ jnz .Lcompat_process_trapbounce
++
+ testb $1,VCPU_mce_pending(%rbx)
+ jnz compat_process_mce
+ .Lcompat_test_guest_nmi:
+@@ -68,15 +74,24 @@ compat_process_softirqs:
+ call do_softirq
+ jmp compat_test_all_events
+
++ ALIGN
++/* %rbx: struct vcpu, %rdx: struct trap_bounce */
++.Lcompat_process_trapbounce:
++ sti
++.Lcompat_bounce_exception:
++ call compat_create_bounce_frame
++ movb $0, TRAPBOUNCE_flags(%rdx)
++ jmp compat_test_all_events
++
+ ALIGN
+ /* %rbx: struct vcpu */
+ compat_process_mce:
+ testb $1 << VCPU_TRAP_MCE,VCPU_async_exception_mask(%rbx)
+ jnz .Lcompat_test_guest_nmi
+ sti
+- movb $0,VCPU_mce_pending(%rbx)
+- call set_guest_machinecheck_trapbounce
+- testl %eax,%eax
++ movb $0, VCPU_mce_pending(%rbx)
++ call set_guest_machinecheck_trapbounce
++ test %al, %al
+ jz compat_test_all_events
+ movzbl VCPU_async_exception_mask(%rbx),%edx # save mask for the
+ movb %dl,VCPU_mce_old_mask(%rbx) # iret hypercall
+@@ -88,11 +103,11 @@ compat_process_mce:
+ /* %rbx: struct vcpu */
+ compat_process_nmi:
+ testb $1 << VCPU_TRAP_NMI,VCPU_async_exception_mask(%rbx)
+- jnz compat_test_guest_events
++ jnz compat_test_guest_events
+ sti
+- movb $0,VCPU_nmi_pending(%rbx)
++ movb $0, VCPU_nmi_pending(%rbx)
+ call set_guest_nmi_trapbounce
+- testl %eax,%eax
++ test %al, %al
+ jz compat_test_all_events
+ movzbl VCPU_async_exception_mask(%rbx),%edx # save mask for the
+ movb %dl,VCPU_nmi_old_mask(%rbx) # iret hypercall
+@@ -151,7 +166,7 @@ ENTRY(compat_restore_all_guest)
+ mov VCPUMSR_spec_ctrl_raw(%rax), %eax
+
+ /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
+- SPEC_CTRL_EXIT_TO_GUEST /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */
++ SPEC_CTRL_EXIT_TO_PV /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */
+
+ RESTORE_ALL adj=8 compat=1
+ .Lft0: iretq
+@@ -189,15 +204,6 @@ ENTRY(cr4_pv32_restore)
+ xor %eax, %eax
+ ret
+
+-/* %rdx: trap_bounce, %rbx: struct vcpu */
+-ENTRY(compat_post_handle_exception)
+- testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx)
+- jz compat_test_all_events
+-.Lcompat_bounce_exception:
+- call compat_create_bounce_frame
+- movb $0,TRAPBOUNCE_flags(%rdx)
+- jmp compat_test_all_events
+-
+ .section .text.entry, "ax", @progbits
+
+ /* See lstar_enter for entry register state. */
+@@ -218,10 +224,9 @@ ENTRY(cstar_enter)
+
+ GET_STACK_END(bx)
+ mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx
+- neg %rcx
++ test %rcx, %rcx
+ jz .Lcstar_cr3_okay
+- mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
+- neg %rcx
++ movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%rbx)
+ mov %rcx, %cr3
+ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
+ .Lcstar_cr3_okay:
+diff --git a/xen/arch/x86/x86_64/compat/mm.c b/xen/arch/x86/x86_64/compat/mm.c
+index c2aa6f2fdb..02bc75b91e 100644
+--- a/xen/arch/x86/x86_64/compat/mm.c
++++ b/xen/arch/x86/x86_64/compat/mm.c
+@@ -163,19 +163,6 @@ int compat_arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
+ return rc;
+ }
+
+-int compat_update_va_mapping(unsigned int va, u32 lo, u32 hi,
+- unsigned int flags)
+-{
+- return do_update_va_mapping(va, lo | ((u64)hi << 32), flags);
+-}
+-
+-int compat_update_va_mapping_otherdomain(unsigned long va, u32 lo, u32 hi,
+- unsigned long flags,
+- domid_t domid)
+-{
+- return do_update_va_mapping_otherdomain(va, lo | ((u64)hi << 32), flags, domid);
+-}
+-
+ DEFINE_XEN_GUEST_HANDLE(mmuext_op_compat_t);
+
+ int compat_mmuext_op(XEN_GUEST_HANDLE_PARAM(void) arg,
+diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
+index bdd33e727f..c163c31a60 100644
+--- a/xen/arch/x86/x86_64/entry.S
++++ b/xen/arch/x86/x86_64/entry.S
+@@ -42,6 +42,12 @@ test_all_events:
+ leaq irq_stat+IRQSTAT_softirq_pending(%rip), %rcx
+ cmpl $0, (%rcx, %rax, 1)
+ jne process_softirqs
++
++ /* Inject exception if pending. */
++ lea VCPU_trap_bounce(%rbx), %rdx
++ testb $TBF_EXCEPTION, TRAPBOUNCE_flags(%rdx)
++ jnz .Lprocess_trapbounce
++
+ cmpb $0, VCPU_mce_pending(%rbx)
+ jne process_mce
+ .Ltest_guest_nmi:
+@@ -69,6 +75,15 @@ process_softirqs:
+ call do_softirq
+ jmp test_all_events
+
++ ALIGN
+/* %rbx: struct vcpu, %rdx: struct trap_bounce */
++.Lprocess_trapbounce:
++ sti
++.Lbounce_exception:
++ call create_bounce_frame
++ movb $0, TRAPBOUNCE_flags(%rdx)
++ jmp test_all_events
++
+ ALIGN
+ /* %rbx: struct vcpu */
+ process_mce:
+@@ -77,7 +92,7 @@ process_mce:
+ sti
+ movb $0, VCPU_mce_pending(%rbx)
+ call set_guest_machinecheck_trapbounce
+- test %eax, %eax
++ test %al, %al
+ jz test_all_events
+ movzbl VCPU_async_exception_mask(%rbx), %edx # save mask for the
+ movb %dl, VCPU_mce_old_mask(%rbx) # iret hypercall
+@@ -93,7 +108,7 @@ process_nmi:
+ sti
+ movb $0, VCPU_nmi_pending(%rbx)
+ call set_guest_nmi_trapbounce
+- test %eax, %eax
++ test %al, %al
+ jz test_all_events
+ movzbl VCPU_async_exception_mask(%rbx), %edx # save mask for the
+ movb %dl, VCPU_nmi_old_mask(%rbx) # iret hypercall
+@@ -149,11 +164,15 @@ restore_all_guest:
+ mov VCPU_cr3(%rbx), %r9
+ GET_STACK_END(dx)
+ mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rdi
++ test %rdi, %rdi
++ jz .Lrag_keep_cr3
++ mov %rdi, %rax
++ cmpb $0, STACK_CPUINFO_FIELD(root_pgt_changed)(%rdx)
++ je .Lrag_copy_done
++ movb $0, STACK_CPUINFO_FIELD(root_pgt_changed)(%rdx)
+ movabs $PADDR_MASK & PAGE_MASK, %rsi
+ movabs $DIRECTMAP_VIRT_START, %rcx
+- mov %rdi, %rax
+ and %rsi, %rdi
+- jz .Lrag_keep_cr3
+ and %r9, %rsi
+ add %rcx, %rdi
+ add %rcx, %rsi
+@@ -168,20 +187,17 @@ restore_all_guest:
+ sub $(ROOT_PAGETABLE_FIRST_XEN_SLOT - \
+ ROOT_PAGETABLE_LAST_XEN_SLOT - 1) * 8, %rdi
+ rep movsq
+- mov STACK_CPUINFO_FIELD(cr4)(%rdx), %rdi
++.Lrag_copy_done:
+ mov %r9, STACK_CPUINFO_FIELD(xen_cr3)(%rdx)
+- mov %rdi, %rsi
+- and $~X86_CR4_PGE, %rdi
+- mov %rdi, %cr4
++ movb $1, STACK_CPUINFO_FIELD(use_pv_cr3)(%rdx)
+ mov %rax, %cr3
+- mov %rsi, %cr4
+ .Lrag_keep_cr3:
+
+ /* Restore stashed SPEC_CTRL value. */
+ mov %r15d, %eax
+
+ /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
+- SPEC_CTRL_EXIT_TO_GUEST /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */
++ SPEC_CTRL_EXIT_TO_PV /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */
+
+ RESTORE_ALL
+ testw $TRAP_syscall,4(%rsp)
+@@ -222,20 +238,10 @@ restore_all_xen:
+ * case we return to late PV exit code (from an NMI or #MC).
+ */
+ GET_STACK_END(bx)
+- mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rdx
++ cmpb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%rbx)
++UNLIKELY_START(ne, exit_cr3)
+ mov STACK_CPUINFO_FIELD(pv_cr3)(%rbx), %rax
+- test %rdx, %rdx
+- /*
+- * Ideally the condition would be "nsz", but such doesn't exist,
+- * so "g" will have to do.
+- */
+-UNLIKELY_START(g, exit_cr3)
+- mov %cr4, %rdi
+- mov %rdi, %rsi
+- and $~X86_CR4_PGE, %rdi
+- mov %rdi, %cr4
+ mov %rax, %cr3
+- mov %rsi, %cr4
+ UNLIKELY_END(exit_cr3)
+
+ /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
+@@ -276,10 +282,9 @@ ENTRY(lstar_enter)
+
+ GET_STACK_END(bx)
+ mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx
+- neg %rcx
++ test %rcx, %rcx
+ jz .Llstar_cr3_okay
+- mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
+- neg %rcx
++ movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%rbx)
+ mov %rcx, %cr3
+ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
+ .Llstar_cr3_okay:
+@@ -313,10 +318,9 @@ GLOBAL(sysenter_eflags_saved)
+ /* PUSHF above has saved EFLAGS.IF clear (the caller had it set). */
+ orl $X86_EFLAGS_IF, UREGS_eflags(%rsp)
+ mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx
+- neg %rcx
++ test %rcx, %rcx
+ jz .Lsyse_cr3_okay
+- mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
+- neg %rcx
++ movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%rbx)
+ mov %rcx, %cr3
+ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
+ .Lsyse_cr3_okay:
+@@ -363,10 +367,9 @@ ENTRY(int80_direct_trap)
+
+ GET_STACK_END(bx)
+ mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx
+- neg %rcx
++ test %rcx, %rcx
+ jz .Lint80_cr3_okay
+- mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
+- neg %rcx
++ movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%rbx)
+ mov %rcx, %cr3
+ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
+ .Lint80_cr3_okay:
+@@ -553,24 +556,24 @@ ENTRY(common_interrupt)
+ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
+
+ mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx
++ mov STACK_CPUINFO_FIELD(use_pv_cr3)(%r14), %bl
+ mov %rcx, %r15
+- neg %rcx
++ test %rcx, %rcx
+ jz .Lintr_cr3_okay
+- jns .Lintr_cr3_load
+- mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+- neg %rcx
+-.Lintr_cr3_load:
++ movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14)
+ mov %rcx, %cr3
+ xor %ecx, %ecx
+ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+ testb $3, UREGS_cs(%rsp)
+ cmovnz %rcx, %r15
++ cmovnz %rcx, %rbx
+ .Lintr_cr3_okay:
+
+ CR4_PV32_RESTORE
+ movq %rsp,%rdi
+ callq do_IRQ
+ mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
++ mov %bl, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14)
+ jmp ret_from_intr
+
+ ENTRY(page_fault)
+@@ -585,18 +588,17 @@ GLOBAL(handle_exception)
+ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
+
+ mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx
++ mov STACK_CPUINFO_FIELD(use_pv_cr3)(%r14), %r13b
+ mov %rcx, %r15
+- neg %rcx
++ test %rcx, %rcx
+ jz .Lxcpt_cr3_okay
+- jns .Lxcpt_cr3_load
+- mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+- neg %rcx
+-.Lxcpt_cr3_load:
++ movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14)
+ mov %rcx, %cr3
+ xor %ecx, %ecx
+ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+ testb $3, UREGS_cs(%rsp)
+ cmovnz %rcx, %r15
++ cmovnz %rcx, %r13
+ .Lxcpt_cr3_okay:
+
+ handle_exception_saved:
+@@ -665,17 +667,12 @@ handle_exception_saved:
+ mov (%rdx, %rax, 8), %rdx
+ INDIRECT_CALL %rdx
+ mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
++ mov %r13b, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14)
+ testb $3,UREGS_cs(%rsp)
+ jz restore_all_xen
+- leaq VCPU_trap_bounce(%rbx),%rdx
+ movq VCPU_domain(%rbx),%rax
+ testb $1,DOMAIN_is_32bit_pv(%rax)
+- jnz compat_post_handle_exception
+- testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx)
+- jz test_all_events
+-.Lbounce_exception:
+- call create_bounce_frame
+- movb $0,TRAPBOUNCE_flags(%rdx)
++ jnz compat_test_all_events
+ jmp test_all_events
+
+ /* No special register assumptions. */
+@@ -698,6 +695,7 @@ exception_with_ints_disabled:
+ 1: movq UREGS_error_code(%rsp),%rax # ec/ev
+ movq %rax,UREGS_kernel_sizeof(%rsp)
+ mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
++ mov %r13b, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14)
+ jmp restore_all_xen # return to fixup code
+
+ /* No special register assumptions. */
+@@ -730,7 +728,7 @@ ENTRY(device_not_available)
+ ENTRY(debug)
+ pushq $0
+ movl $TRAP_debug,4(%rsp)
+- jmp handle_exception
++ jmp handle_ist_exception
+
+ ENTRY(int3)
+ pushq $0
+@@ -785,10 +783,9 @@ ENTRY(double_fault)
+ mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rbx
+ test %rbx, %rbx
+ jz .Ldblf_cr3_okay
+- jns .Ldblf_cr3_load
+- neg %rbx
+-.Ldblf_cr3_load:
++ movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14)
+ mov %rbx, %cr3
++ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+ .Ldblf_cr3_okay:
+
+ movq %rsp,%rdi
+@@ -816,13 +813,11 @@ handle_ist_exception:
+ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
+
+ mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx
++ mov STACK_CPUINFO_FIELD(use_pv_cr3)(%r14), %bl
+ mov %rcx, %r15
+- neg %rcx
++ test %rcx, %rcx
+ jz .List_cr3_okay
+- jns .List_cr3_load
+- mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+- neg %rcx
+-.List_cr3_load:
++ movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14)
+ mov %rcx, %cr3
+ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+ .List_cr3_okay:
+@@ -835,6 +830,7 @@ handle_ist_exception:
+ * and copy the context to stack bottom.
+ */
+ xor %r15, %r15
++ xor %ebx, %ebx
+ GET_CPUINFO_FIELD(guest_cpu_user_regs,di)
+ movq %rsp,%rsi
+ movl $UREGS_kernel_sizeof/8,%ecx
+@@ -846,6 +842,7 @@ handle_ist_exception:
+ mov (%rdx, %rax, 8), %rdx
+ INDIRECT_CALL %rdx
+ mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
++ mov %bl, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14)
+ cmpb $TRAP_nmi,UREGS_entry_vector(%rsp)
+ jne ret_from_intr
+
+diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c
+index 3652f5ff21..7e8c5de70d 100644
+--- a/xen/arch/x86/x86_64/traps.c
++++ b/xen/arch/x86/x86_64/traps.c
+@@ -144,11 +144,12 @@ void show_registers(const struct cpu_user_regs *regs)
+ printk("CPU: %d\n", smp_processor_id());
+ _show_registers(&fault_regs, fault_crs, context, v);
+
+- if ( this_cpu(ler_msr) && !guest_mode(regs) )
++ if ( ler_msr && !guest_mode(regs) )
+ {
+ u64 from, to;
+- rdmsrl(this_cpu(ler_msr), from);
+- rdmsrl(this_cpu(ler_msr) + 1, to);
++
++ rdmsrl(ler_msr, from);
++ rdmsrl(ler_msr + 1, to);
+ printk("ler: %016lx -> %016lx\n", from, to);
+ }
+ }
+diff --git a/xen/arch/x86/x86_emulate.c b/xen/arch/x86/x86_emulate.c
+index c7ba221d11..9125c67c9e 100644
+--- a/xen/arch/x86/x86_emulate.c
++++ b/xen/arch/x86/x86_emulate.c
+@@ -14,6 +14,7 @@
+ #include <asm/processor.h> /* current_cpu_info */
+ #include <asm/xstate.h>
+ #include <asm/amd.h> /* cpu_has_amd_erratum() */
++#include <asm/debugreg.h>
+
+ /* Avoid namespace pollution. */
+ #undef cmpxchg
+@@ -41,3 +42,75 @@
+ })
+
+ #include "x86_emulate/x86_emulate.c"
++
++/* Called with NULL ctxt in hypercall context. */
++int x86emul_read_dr(unsigned int reg, unsigned long *val,
++ struct x86_emulate_ctxt *ctxt)
++{
++ struct vcpu *curr = current;
++
++ /* HVM support requires a bit more plumbing before it will work. */
++ ASSERT(is_pv_vcpu(curr));
++
++ switch ( reg )
++ {
++ case 0 ... 3:
++ case 6:
++ *val = curr->arch.debugreg[reg];
++ break;
++
++ case 7:
++ *val = (curr->arch.debugreg[7] |
++ curr->arch.debugreg[5]);
++ break;
++
++ case 4 ... 5:
++ if ( !(curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) )
++ {
++ *val = curr->arch.debugreg[reg + 2];
++ break;
++ }
++
++ /* Fallthrough */
++ default:
++ if ( ctxt )
++ x86_emul_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC, ctxt);
++
++ return X86EMUL_EXCEPTION;
++ }
++
++ return X86EMUL_OKAY;
++}
++
++int x86emul_write_dr(unsigned int reg, unsigned long val,
++ struct x86_emulate_ctxt *ctxt)
++{
++ struct vcpu *curr = current;
++
++ /* HVM support requires a bit more plumbing before it will work. */
++ ASSERT(is_pv_vcpu(curr));
++
++ switch ( set_debugreg(curr, reg, val) )
++ {
++ case 0:
++ return X86EMUL_OKAY;
++
++ case -ENODEV:
++ x86_emul_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC, ctxt);
++ return X86EMUL_EXCEPTION;
++
++ default:
++ x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
++ return X86EMUL_EXCEPTION;
++ }
++}
++
++/*
++ * Local variables:
++ * mode: C
++ * c-file-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
+diff --git a/xen/arch/x86/x86_emulate/x86_emulate.h b/xen/arch/x86/x86_emulate/x86_emulate.h
+index 0c8c80ad5a..9c2bb8157c 100644
+--- a/xen/arch/x86/x86_emulate/x86_emulate.h
++++ b/xen/arch/x86/x86_emulate/x86_emulate.h
+@@ -662,6 +662,11 @@ static inline void x86_emulate_free_state(struct x86_emulate_state *state) {}
+ void x86_emulate_free_state(struct x86_emulate_state *state);
+ #endif
+
++int x86emul_read_dr(unsigned int reg, unsigned long *val,
++ struct x86_emulate_ctxt *ctxt);
++int x86emul_write_dr(unsigned int reg, unsigned long val,
++ struct x86_emulate_ctxt *ctxt);
++
+ #endif
+
+ static inline void x86_emul_hw_exception(
+diff --git a/xen/arch/x86/xstate.c b/xen/arch/x86/xstate.c
+index 845208c189..ebd464e83a 100644
+--- a/xen/arch/x86/xstate.c
++++ b/xen/arch/x86/xstate.c
+@@ -670,12 +670,17 @@ static bool valid_xcr0(u64 xcr0)
+ return !(xcr0 & XSTATE_BNDREGS) == !(xcr0 & XSTATE_BNDCSR);
+ }
+
+-int validate_xstate(u64 xcr0, u64 xcr0_accum, const struct xsave_hdr *hdr)
++int validate_xstate(const struct domain *d, uint64_t xcr0, uint64_t xcr0_accum,
++ const struct xsave_hdr *hdr)
+ {
++ const struct cpuid_policy *cp = d->arch.cpuid;
++ uint64_t xcr0_max =
++ ((uint64_t)cp->xstate.xcr0_high << 32) | cp->xstate.xcr0_low;
+ unsigned int i;
+
+ if ( (hdr->xstate_bv & ~xcr0_accum) ||
+ (xcr0 & ~xcr0_accum) ||
++ (xcr0_accum & ~xcr0_max) ||
+ !valid_xcr0(xcr0) ||
+ !valid_xcr0(xcr0_accum) )
+ return -EINVAL;
+@@ -694,20 +699,40 @@ int validate_xstate(u64 xcr0, u64 xcr0_accum, const struct xsave_hdr *hdr)
+ int handle_xsetbv(u32 index, u64 new_bv)
+ {
+ struct vcpu *curr = current;
++ const struct cpuid_policy *cp = curr->domain->arch.cpuid;
++ uint64_t xcr0_max =
++ ((uint64_t)cp->xstate.xcr0_high << 32) | cp->xstate.xcr0_low;
+ u64 mask;
+
+ if ( index != XCR_XFEATURE_ENABLED_MASK )
+ return -EOPNOTSUPP;
+
+- if ( (new_bv & ~xfeature_mask) || !valid_xcr0(new_bv) )
++ /*
++ * The CPUID logic shouldn't be able to hand out an XCR0 exceeding Xen's
++ * maximum features, but keep the check for robustness.
++ */
++ if ( unlikely(xcr0_max & ~xfeature_mask) )
++ {
++ gprintk(XENLOG_ERR,
++ "xcr0_max %016" PRIx64 " exceeds hardware max %016" PRIx64 "\n",
++ xcr0_max, xfeature_mask);
++ domain_crash(curr->domain);
++
++ return -EINVAL;
++ }
++
++ if ( (new_bv & ~xcr0_max) || !valid_xcr0(new_bv) )
+ return -EINVAL;
+
+- /* XCR0.PKRU is disabled on PV mode. */
+- if ( is_pv_vcpu(curr) && (new_bv & XSTATE_PKRU) )
+- return -EOPNOTSUPP;
++ /* By this point, new_bv really should be accepted by hardware. */
++ if ( unlikely(!set_xcr0(new_bv)) )
++ {
++ gprintk(XENLOG_ERR, "new_bv %016" PRIx64 " rejected by hardware\n",
++ new_bv);
++ domain_crash(curr->domain);
+
+- if ( !set_xcr0(new_bv) )
+ return -EFAULT;
++ }
+
+ mask = new_bv & ~curr->arch.xcr0_accum;
+ curr->arch.xcr0 = new_bv;
+diff --git a/xen/common/cpu.c b/xen/common/cpu.c
+index 6350f150bd..653a56b840 100644
+--- a/xen/common/cpu.c
++++ b/xen/common/cpu.c
+@@ -67,12 +67,17 @@ void __init register_cpu_notifier(struct notifier_block *nb)
+ spin_unlock(&cpu_add_remove_lock);
+ }
+
+-static int take_cpu_down(void *unused)
++static void _take_cpu_down(void *unused)
+ {
+ void *hcpu = (void *)(long)smp_processor_id();
+ int notifier_rc = notifier_call_chain(&cpu_chain, CPU_DYING, hcpu, NULL);
+ BUG_ON(notifier_rc != NOTIFY_DONE);
+ __cpu_disable();
++}
++
++static int take_cpu_down(void *arg)
++{
++ _take_cpu_down(arg);
+ return 0;
+ }
+
+@@ -98,7 +103,9 @@ int cpu_down(unsigned int cpu)
+ goto fail;
+ }
+
+- if ( (err = stop_machine_run(take_cpu_down, NULL, cpu)) < 0 )
++ if ( unlikely(system_state < SYS_STATE_active) )
++ on_selected_cpus(cpumask_of(cpu), _take_cpu_down, NULL, true);
++ else if ( (err = stop_machine_run(take_cpu_down, NULL, cpu)) < 0 )
+ goto fail;
+
+ __cpu_die(cpu);
+diff --git a/xen/common/cpupool.c b/xen/common/cpupool.c
+index 999839444e..1e8edcbd57 100644
+--- a/xen/common/cpupool.c
++++ b/xen/common/cpupool.c
+@@ -490,7 +490,7 @@ static int cpupool_cpu_add(unsigned int cpu)
+ cpumask_clear_cpu(cpu, &cpupool_locked_cpus);
+ cpumask_set_cpu(cpu, &cpupool_free_cpus);
+
+- if ( system_state == SYS_STATE_resume )
++ if ( system_state == SYS_STATE_suspend || system_state == SYS_STATE_resume )
+ {
+ struct cpupool **c;
+
+@@ -522,6 +522,7 @@ static int cpupool_cpu_add(unsigned int cpu)
+ * (or unplugging would have failed) and that is the default behavior
+ * anyway.
+ */
++ per_cpu(cpupool, cpu) = NULL;
+ ret = cpupool_assign_cpu_locked(cpupool0, cpu);
+ }
+ out:
+diff --git a/xen/common/efi/boot.c b/xen/common/efi/boot.c
+index 01d33004e0..f1d724bd06 100644
+--- a/xen/common/efi/boot.c
++++ b/xen/common/efi/boot.c
+@@ -1304,6 +1304,8 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable)
+
+ #ifndef CONFIG_ARM /* TODO - runtime service support */
+
++#include <asm/spec_ctrl.h>
++
+ static bool __initdata efi_map_uc;
+
+ static int __init parse_efi_param(const char *s)
+@@ -1419,6 +1421,16 @@ void __init efi_init_memory(void)
+ desc->PhysicalStart, desc->PhysicalStart + len - 1,
+ desc->Type, desc->Attribute);
+
++ if ( (desc->Attribute & (EFI_MEMORY_WB | EFI_MEMORY_WT)) ||
++ (efi_bs_revision >= EFI_REVISION(2, 5) &&
++ (desc->Attribute & EFI_MEMORY_WP)) )
++ {
++ /* Supplement the heuristics in l1tf_calculations(). */
++ l1tf_safe_maddr =
++ max(l1tf_safe_maddr,
++ ROUNDUP(desc->PhysicalStart + len, PAGE_SIZE));
++ }
++
+ if ( !efi_enabled(EFI_RS) ||
+ (!(desc->Attribute & EFI_MEMORY_RUNTIME) &&
+ (!map_bs ||
+diff --git a/xen/common/efi/runtime.c b/xen/common/efi/runtime.c
+index c38f00a64b..9aa070e77c 100644
+--- a/xen/common/efi/runtime.c
++++ b/xen/common/efi/runtime.c
+@@ -111,21 +111,23 @@ struct efi_rs_state efi_rs_enter(void)
+ asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
+ }
+
+- write_cr3(virt_to_maddr(efi_l4_pgtable));
++ switch_cr3_cr4(virt_to_maddr(efi_l4_pgtable), read_cr4());
+
+ return state;
+ }
+
+ void efi_rs_leave(struct efi_rs_state *state)
+ {
++ struct vcpu *curr = current;
++
+ if ( !state->cr3 )
+ return;
+- write_cr3(state->cr3);
+- if ( is_pv_vcpu(current) && !is_idle_vcpu(current) )
++ switch_cr3_cr4(state->cr3, read_cr4());
++ if ( is_pv_vcpu(curr) && !is_idle_vcpu(curr) )
+ {
+ struct desc_ptr gdt_desc = {
+ .limit = LAST_RESERVED_GDT_BYTE,
+- .base = GDT_VIRT_START(current)
++ .base = GDT_VIRT_START(curr)
+ };
+
+ asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
+@@ -133,7 +135,7 @@ void efi_rs_leave(struct efi_rs_state *state)
+ irq_exit();
+ efi_rs_on_cpu = NR_CPUS;
+ spin_unlock(&efi_rs_lock);
+- stts();
++ vcpu_restore_fpu_nonlazy(curr, true);
+ }
+
+ bool efi_rs_using_pgtables(void)
+diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
+index 7650e6d449..64f58fc815 100644
+--- a/xen/common/grant_table.c
++++ b/xen/common/grant_table.c
+@@ -97,6 +97,45 @@ static unsigned int __read_mostly max_maptrack_frames =
+ DEFAULT_MAX_MAPTRACK_FRAMES;
+ integer_runtime_param("gnttab_max_maptrack_frames", max_maptrack_frames);
+
++#ifndef GNTTAB_MAX_VERSION
++#define GNTTAB_MAX_VERSION 2
++#endif
++
++static unsigned int __read_mostly opt_gnttab_max_version = GNTTAB_MAX_VERSION;
++static bool __read_mostly opt_transitive_grants = true;
++
++static int __init parse_gnttab(const char *s)
++{
++ const char *ss, *e;
++ int val, rc = 0;
++
++ do {
++ ss = strchr(s, ',');
++ if ( !ss )
++ ss = strchr(s, '\0');
++
++ if ( !strncmp(s, "max-ver:", 8) ||
++ !strncmp(s, "max_ver:", 8) ) /* Alias for original XSA-226 patch */
++ {
++ long ver = simple_strtol(s + 8, &e, 10);
++
++ if ( e == ss && ver >= 1 && ver <= 2 )
++ opt_gnttab_max_version = ver;
++ else
++ rc = -EINVAL;
++ }
++ else if ( (val = parse_boolean("transitive", s, ss)) >= 0 )
++ opt_transitive_grants = val;
++ else
++ rc = -EINVAL;
++
++ s = ss + 1;
++ } while ( *ss );
++
++ return rc;
++}
++custom_param("gnttab", parse_gnttab);
++
+ /*
+ * Note that the three values below are effectively part of the ABI, even if
+ * we don't need to make them a formal part of it: A guest suspended for
+@@ -2725,7 +2764,8 @@ static int gnttab_copy_claim_buf(const struct gnttab_copy *op,
+ current->domain->domain_id,
+ buf->read_only,
+ &buf->frame, &buf->page,
+- &buf->ptr.offset, &buf->len, true);
++ &buf->ptr.offset, &buf->len,
++ opt_transitive_grants);
+ if ( rc != GNTST_okay )
+ goto out;
+ buf->ptr.u.ref = ptr->u.ref;
+@@ -2927,6 +2967,10 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARAM(gnttab_set_version_t) uop)
+ if ( op.version != 1 && op.version != 2 )
+ goto out;
+
++ res = -ENOSYS;
++ if ( op.version == 2 && opt_gnttab_max_version == 1 )
++ goto out; /* Behave as before set_version was introduced. */
++
+ res = 0;
+ if ( gt->gt_version == op.version )
+ goto out;
+diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
+index 49b2a91751..6d6f2a0628 100644
+--- a/xen/common/page_alloc.c
++++ b/xen/common/page_alloc.c
+@@ -1424,7 +1424,7 @@ static void free_heap_pages(
+
+ page_list_del(predecessor, &heap(node, zone, order));
+
+- /* Keep predecessor's first_dirty if it is already set. */
++ /* Update predecessor's first_dirty if necessary. */
+ if ( predecessor->u.free.first_dirty == INVALID_DIRTY_IDX &&
+ pg->u.free.first_dirty != INVALID_DIRTY_IDX )
+ predecessor->u.free.first_dirty = (1U << order) +
+@@ -1445,6 +1445,12 @@ static void free_heap_pages(
+
+ check_and_stop_scrub(successor);
+
++ /* Update pg's first_dirty if necessary. */
++ if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX &&
++ successor->u.free.first_dirty != INVALID_DIRTY_IDX )
++ pg->u.free.first_dirty = (1U << order) +
++ successor->u.free.first_dirty;
++
+ page_list_del(successor, &heap(node, zone, order));
+ }
+
+diff --git a/xen/common/schedule.c b/xen/common/schedule.c
+index b7884263f2..f21c3e5a64 100644
+--- a/xen/common/schedule.c
++++ b/xen/common/schedule.c
+@@ -436,14 +436,9 @@ void sched_destroy_domain(struct domain *d)
+ cpupool_rm_domain(d);
+ }
+
+-void vcpu_sleep_nosync(struct vcpu *v)
++void vcpu_sleep_nosync_locked(struct vcpu *v)
+ {
+- unsigned long flags;
+- spinlock_t *lock;
+-
+- TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
+-
+- lock = vcpu_schedule_lock_irqsave(v, &flags);
++ ASSERT(spin_is_locked(per_cpu(schedule_data,v->processor).schedule_lock));
+
+ if ( likely(!vcpu_runnable(v)) )
+ {
+@@ -452,6 +447,18 @@ void vcpu_sleep_nosync(struct vcpu *v)
+
+ SCHED_OP(vcpu_scheduler(v), sleep, v);
+ }
++}
++
++void vcpu_sleep_nosync(struct vcpu *v)
++{
++ unsigned long flags;
++ spinlock_t *lock;
++
++ TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
++
++ lock = vcpu_schedule_lock_irqsave(v, &flags);
++
++ vcpu_sleep_nosync_locked(v);
+
+ vcpu_schedule_unlock_irqrestore(lock, flags, v);
+ }
+@@ -567,13 +574,54 @@ static void vcpu_move_nosched(struct vcpu *v, unsigned int new_cpu)
+ sched_move_irqs(v);
+ }
+
+-static void vcpu_migrate(struct vcpu *v)
++/*
++ * Initiating migration
++ *
++ * In order to migrate, we need the vcpu in question to have stopped
++ * running and had SCHED_OP(sleep) called (to take it off any
++ * runqueues, for instance); and if it is currently running, it needs
++ * to be scheduled out. Finally, we need to hold the scheduling locks
++ * for both the processor we're migrating from, and the processor
++ * we're migrating to.
++ *
++ * In order to avoid deadlock while satisfying the final requirement,
++ * we must release any scheduling lock we hold, then try to grab both
++ * locks we want, then double-check to make sure that what we started
++ * to do hasn't been changed in the mean time.
++ *
++ * These steps are encapsulated in the following two functions; they
++ * should be called like this:
++ *
++ * lock = vcpu_schedule_lock_irq(v);
++ * vcpu_migrate_start(v);
++ * vcpu_schedule_unlock_irq(lock, v)
++ * vcpu_migrate_finish(v);
++ *
++ * vcpu_migrate_finish() will do the work now if it can, or simply
++ * return if it can't (because v is still running); in that case
++ * vcpu_migrate_finish() will be called by context_saved().
++ */
++void vcpu_migrate_start(struct vcpu *v)
++{
++ set_bit(_VPF_migrating, &v->pause_flags);
++ vcpu_sleep_nosync_locked(v);
++}
++
++static void vcpu_migrate_finish(struct vcpu *v)
+ {
+ unsigned long flags;
+ unsigned int old_cpu, new_cpu;
+ spinlock_t *old_lock, *new_lock;
+ bool_t pick_called = 0;
+
++ /*
++ * If the vcpu is currently running, this will be handled by
++ * context_saved(); and in any case, if the bit is cleared, then
++ * someone else has already done the work so we don't need to.
++ */
++ if ( v->is_running || !test_bit(_VPF_migrating, &v->pause_flags) )
++ return;
++
+ old_cpu = new_cpu = v->processor;
+ for ( ; ; )
+ {
+@@ -653,14 +701,11 @@ void vcpu_force_reschedule(struct vcpu *v)
+ spinlock_t *lock = vcpu_schedule_lock_irq(v);
+
+ if ( v->is_running )
+- set_bit(_VPF_migrating, &v->pause_flags);
++ vcpu_migrate_start(v);
++
+ vcpu_schedule_unlock_irq(lock, v);
+
+- if ( v->pause_flags & VPF_migrating )
+- {
+- vcpu_sleep_nosync(v);
+- vcpu_migrate(v);
+- }
++ vcpu_migrate_finish(v);
+ }
+
+ void restore_vcpu_affinity(struct domain *d)
+@@ -812,10 +857,10 @@ int cpu_disable_scheduler(unsigned int cpu)
+ * * the scheduler will always find a suitable solution, or
+ * things would have failed before getting in here.
+ */
+- set_bit(_VPF_migrating, &v->pause_flags);
++ vcpu_migrate_start(v);
+ vcpu_schedule_unlock_irqrestore(lock, flags, v);
+- vcpu_sleep_nosync(v);
+- vcpu_migrate(v);
++
++ vcpu_migrate_finish(v);
+
+ /*
+ * The only caveat, in this case, is that if a vcpu active in
+@@ -849,18 +894,14 @@ static int vcpu_set_affinity(
+ * Always ask the scheduler to re-evaluate placement
+ * when changing the affinity.
+ */
+- set_bit(_VPF_migrating, &v->pause_flags);
++ vcpu_migrate_start(v);
+ }
+
+ vcpu_schedule_unlock_irq(lock, v);
+
+ domain_update_node_affinity(v->domain);
+
+- if ( v->pause_flags & VPF_migrating )
+- {
+- vcpu_sleep_nosync(v);
+- vcpu_migrate(v);
+- }
++ vcpu_migrate_finish(v);
+
+ return ret;
+ }
+@@ -1088,7 +1129,6 @@ int vcpu_pin_override(struct vcpu *v, int cpu)
+ {
+ cpumask_copy(v->cpu_hard_affinity, v->cpu_hard_affinity_saved);
+ v->affinity_broken = 0;
+- set_bit(_VPF_migrating, &v->pause_flags);
+ ret = 0;
+ }
+ }
+@@ -1101,20 +1141,18 @@ int vcpu_pin_override(struct vcpu *v, int cpu)
+ cpumask_copy(v->cpu_hard_affinity_saved, v->cpu_hard_affinity);
+ v->affinity_broken = 1;
+ cpumask_copy(v->cpu_hard_affinity, cpumask_of(cpu));
+- set_bit(_VPF_migrating, &v->pause_flags);
+ ret = 0;
+ }
+ }
+
++ if ( ret == 0 )
++ vcpu_migrate_start(v);
++
+ vcpu_schedule_unlock_irq(lock, v);
+
+ domain_update_node_affinity(v->domain);
+
+- if ( v->pause_flags & VPF_migrating )
+- {
+- vcpu_sleep_nosync(v);
+- vcpu_migrate(v);
+- }
++ vcpu_migrate_finish(v);
+
+ return ret;
+ }
+@@ -1501,8 +1539,7 @@ void context_saved(struct vcpu *prev)
+
+ SCHED_OP(vcpu_scheduler(prev), context_saved, prev);
+
+- if ( unlikely(prev->pause_flags & VPF_migrating) )
+- vcpu_migrate(prev);
++ vcpu_migrate_finish(prev);
+ }
+
+ /* The scheduler timer: force a run through the scheduler */
+diff --git a/xen/common/tasklet.c b/xen/common/tasklet.c
+index 0f0a6f8365..d4fea3151c 100644
+--- a/xen/common/tasklet.c
++++ b/xen/common/tasklet.c
+@@ -156,6 +156,10 @@ void tasklet_kill(struct tasklet *t)
+
+ spin_lock_irqsave(&tasklet_lock, flags);
+
++ /* Cope with uninitialised tasklets. */
++ if ( list_head_is_null(&t->list) )
++ goto unlock;
++
+ if ( !list_empty(&t->list) )
+ {
+ BUG_ON(t->is_dead || t->is_running || (t->scheduled_on < 0));
+@@ -172,6 +176,7 @@ void tasklet_kill(struct tasklet *t)
+ spin_lock_irqsave(&tasklet_lock, flags);
+ }
+
++ unlock:
+ spin_unlock_irqrestore(&tasklet_lock, flags);
+ }
+
+diff --git a/xen/include/asm-arm/arm32/system.h b/xen/include/asm-arm/arm32/system.h
+index c617b40438..ab57abfbc5 100644
+--- a/xen/include/asm-arm/arm32/system.h
++++ b/xen/include/asm-arm/arm32/system.h
+@@ -48,6 +48,24 @@ static inline int local_fiq_is_enabled(void)
+ return !(flags & PSR_FIQ_MASK);
+ }
+
++#define CSDB ".inst 0xe320f014"
++
++static inline unsigned long array_index_mask_nospec(unsigned long idx,
++ unsigned long sz)
++{
++ unsigned long mask;
++
++ asm volatile( "cmp %1, %2\n"
++ "sbc %0, %1, %1\n"
++ CSDB
++ : "=r" (mask)
++ : "r" (idx), "Ir" (sz)
++ : "cc" );
++
++ return mask;
++}
++#define array_index_mask_nospec array_index_mask_nospec
++
+ #endif
+ /*
+ * Local variables:
+diff --git a/xen/include/asm-arm/arm64/system.h b/xen/include/asm-arm/arm64/system.h
+index 2e2ee212a1..2e36573ac6 100644
+--- a/xen/include/asm-arm/arm64/system.h
++++ b/xen/include/asm-arm/arm64/system.h
+@@ -58,6 +58,28 @@ static inline int local_fiq_is_enabled(void)
+ return !(flags & PSR_FIQ_MASK);
+ }
+
++#define csdb() asm volatile ( "hint #20" : : : "memory" )
++
++/*
++ * Generate a mask for array_index__nospec() that is ~0UL when 0 <= idx < sz
++ * and 0 otherwise.
++ */
++static inline unsigned long array_index_mask_nospec(unsigned long idx,
++ unsigned long sz)
++{
++ unsigned long mask;
++
++ asm volatile ( "cmp %1, %2\n"
++ "sbc %0, xzr, xzr\n"
++ : "=r" (mask)
++ : "r" (idx), "Ir" (sz)
++ : "cc" );
++ csdb();
++
++ return mask;
++}
++#define array_index_mask_nospec array_index_mask_nospec
++
+ #endif
+ /*
+ * Local variables:
+diff --git a/xen/include/asm-arm/grant_table.h b/xen/include/asm-arm/grant_table.h
+index 5b8994cbd5..619721f121 100644
+--- a/xen/include/asm-arm/grant_table.h
++++ b/xen/include/asm-arm/grant_table.h
+@@ -7,6 +7,7 @@
+ #include <xen/sched.h>
+
+ #define INITIAL_NR_GRANT_FRAMES 1U
++#define GNTTAB_MAX_VERSION 1
+
+ struct grant_table_arch {
+ gfn_t *shared_gfn;
+diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h
+index 62465b20c7..ff6f969e74 100644
+--- a/xen/include/asm-x86/cpufeature.h
++++ b/xen/include/asm-x86/cpufeature.h
+@@ -90,6 +90,7 @@
+ #define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2)
+ #define cpu_has_smep boot_cpu_has(X86_FEATURE_SMEP)
+ #define cpu_has_bmi2 boot_cpu_has(X86_FEATURE_BMI2)
++#define cpu_has_invpcid boot_cpu_has(X86_FEATURE_INVPCID)
+ #define cpu_has_rtm boot_cpu_has(X86_FEATURE_RTM)
+ #define cpu_has_fpu_sel (!boot_cpu_has(X86_FEATURE_NO_FPU_SEL))
+ #define cpu_has_mpx boot_cpu_has(X86_FEATURE_MPX)
+@@ -106,6 +107,7 @@
+ #define cpu_has_aperfmperf boot_cpu_has(X86_FEATURE_APERFMPERF)
+ #define cpu_has_lfence_dispatch boot_cpu_has(X86_FEATURE_LFENCE_DISPATCH)
+ #define cpu_has_no_xpti boot_cpu_has(X86_FEATURE_NO_XPTI)
++#define cpu_has_xen_lbr boot_cpu_has(X86_FEATURE_XEN_LBR)
+
+ enum _cache_type {
+ CACHE_TYPE_NULL = 0,
+diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h
+index c9b1a48807..8e5cc53dde 100644
+--- a/xen/include/asm-x86/cpufeatures.h
++++ b/xen/include/asm-x86/cpufeatures.h
+@@ -26,8 +26,10 @@ XEN_CPUFEATURE(LFENCE_DISPATCH, (FSCAPINTS+0)*32+12) /* lfence set as Dispatch S
+ XEN_CPUFEATURE(IND_THUNK_LFENCE,(FSCAPINTS+0)*32+13) /* Use IND_THUNK_LFENCE */
+ XEN_CPUFEATURE(IND_THUNK_JMP, (FSCAPINTS+0)*32+14) /* Use IND_THUNK_JMP */
+ XEN_CPUFEATURE(XEN_IBPB, (FSCAPINTS+0)*32+15) /* IBRSB || IBPB */
+-XEN_CPUFEATURE(XEN_IBRS_SET, (FSCAPINTS+0)*32+16) /* IBRSB && IRBS set in Xen */
+-XEN_CPUFEATURE(XEN_IBRS_CLEAR, (FSCAPINTS+0)*32+17) /* IBRSB && IBRS clear in Xen */
+-XEN_CPUFEATURE(RSB_NATIVE, (FSCAPINTS+0)*32+18) /* RSB overwrite needed for native */
+-XEN_CPUFEATURE(RSB_VMEXIT, (FSCAPINTS+0)*32+19) /* RSB overwrite needed for vmexit */
++XEN_CPUFEATURE(SC_MSR_PV, (FSCAPINTS+0)*32+16) /* MSR_SPEC_CTRL used by Xen for PV */
++XEN_CPUFEATURE(SC_MSR_HVM, (FSCAPINTS+0)*32+17) /* MSR_SPEC_CTRL used by Xen for HVM */
++XEN_CPUFEATURE(SC_RSB_PV, (FSCAPINTS+0)*32+18) /* RSB overwrite needed for PV */
++XEN_CPUFEATURE(SC_RSB_HVM, (FSCAPINTS+0)*32+19) /* RSB overwrite needed for HVM */
+ XEN_CPUFEATURE(NO_XPTI, (FSCAPINTS+0)*32+20) /* XPTI mitigation not in use */
++XEN_CPUFEATURE(SC_MSR_IDLE, (FSCAPINTS+0)*32+21) /* (SC_MSR_PV || SC_MSR_HVM) && default_xen_spec_ctrl */
++XEN_CPUFEATURE(XEN_LBR, (FSCAPINTS+0)*32+22) /* Xen uses MSR_DEBUGCTL.LBR */
+diff --git a/xen/include/asm-x86/current.h b/xen/include/asm-x86/current.h
+index 4678a0fcf5..9a137a1311 100644
+--- a/xen/include/asm-x86/current.h
++++ b/xen/include/asm-x86/current.h
+@@ -44,20 +44,33 @@ struct cpu_info {
+ /*
+ * Of the two following fields the latter is being set to the CR3 value
+ * to be used on the given pCPU for loading whenever 64-bit PV guest
+- * context is being entered. The value never changes once set.
++ * context is being entered. A value of zero indicates no setting of CR3
++ * is to be performed.
+ * The former is the value to restore when re-entering Xen, if any. IOW
+- * its value being zero means there's nothing to restore. However, its
+- * value can also be negative, indicating to the exit-to-Xen code that
+- * restoring is not necessary, but allowing any nested entry code paths
+- * to still know the value to put back into CR3.
++ * its value being zero means there's nothing to restore.
+ */
+ unsigned long xen_cr3;
+ unsigned long pv_cr3;
+
+ /* See asm-x86/spec_ctrl_asm.h for usage. */
+ unsigned int shadow_spec_ctrl;
+- bool use_shadow_spec_ctrl;
+- uint8_t bti_ist_info;
++ uint8_t xen_spec_ctrl;
++ uint8_t spec_ctrl_flags;
++
++ /*
++ * The following field controls copying of the L4 page table of 64-bit
++ * PV guests to the per-cpu root page table on entering the guest context.
++ * If set the L4 page table is being copied to the root page table and
++ * the field will be reset.
++ */
++ bool root_pgt_changed;
++
++ /*
++ * use_pv_cr3 is set in case the value of pv_cr3 is to be written into
++ * CR3 when returning from an interrupt. The main use is when returning
++ * from an NMI or MCE to hypervisor code where pv_cr3 was active.
++ */
++ bool use_pv_cr3;
+
+ unsigned long __pad;
+ /* get_stack_bottom() must be 16-byte aligned */
+diff --git a/xen/include/asm-x86/debugreg.h b/xen/include/asm-x86/debugreg.h
+index c57914efc6..b3b10eaf40 100644
+--- a/xen/include/asm-x86/debugreg.h
++++ b/xen/include/asm-x86/debugreg.h
+@@ -24,6 +24,8 @@
+ #define DR_STATUS_RESERVED_ZERO (~0xffffeffful) /* Reserved, read as zero */
+ #define DR_STATUS_RESERVED_ONE 0xffff0ff0ul /* Reserved, read as one */
+
++#define X86_DR6_DEFAULT 0xffff0ff0ul /* Default %dr6 value. */
++
+ /* Now define a bunch of things for manipulating the control register.
+ The top two bytes of the control register consist of 4 fields of 4
+ bits - each field corresponds to one of the four debug registers,
+diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
+index 4679d5477d..0fbd36bf48 100644
+--- a/xen/include/asm-x86/domain.h
++++ b/xen/include/asm-x86/domain.h
+@@ -121,6 +121,11 @@ struct shadow_domain {
+
+ /* Has this domain ever used HVMOP_pagetable_dying? */
+ bool_t pagetable_dying_op;
++
++#ifdef CONFIG_PV
++ /* PV L1 Terminal Fault mitigation. */
++ struct tasklet pv_l1tf_tasklet;
++#endif /* CONFIG_PV */
+ #endif
+ };
+
+@@ -253,6 +258,13 @@ struct pv_domain
+
+ atomic_t nr_l4_pages;
+
++ /* XPTI active? */
++ bool xpti;
++ /* Use PCID feature? */
++ bool pcid;
++ /* Mitigate L1TF with shadow/crashing? */
++ bool check_l1tf;
++
+ /* map_domain_page() mapping cache. */
+ struct mapcache_domain mapcache;
+
+@@ -564,6 +576,9 @@ struct arch_vcpu
+ * and thus should be saved/restored. */
+ bool_t nonlazy_xstate_used;
+
++ /* Restore all FPU state (lazy and non-lazy state) on context switch? */
++ bool fully_eager_fpu;
++
+ /*
+ * The SMAP check policy when updating runstate_guest(v) and the
+ * secondary system time.
+@@ -612,18 +627,12 @@ void vcpu_show_registers(const struct vcpu *);
+ unsigned long pv_guest_cr4_fixup(const struct vcpu *, unsigned long guest_cr4);
+
+ /* Convert between guest-visible and real CR4 values. */
+-#define pv_guest_cr4_to_real_cr4(v) \
+- (((v)->arch.pv_vcpu.ctrlreg[4] \
+- | (mmu_cr4_features \
+- & (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_SMEP | \
+- X86_CR4_SMAP | X86_CR4_OSXSAVE | \
+- X86_CR4_FSGSBASE)) \
+- | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0)) \
+- & ~X86_CR4_DE)
++unsigned long pv_guest_cr4_to_real_cr4(const struct vcpu *v);
++
+ #define real_cr4_to_pv_guest_cr4(c) \
+ ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD | \
+ X86_CR4_OSXSAVE | X86_CR4_SMEP | \
+- X86_CR4_FSGSBASE | X86_CR4_SMAP))
++ X86_CR4_FSGSBASE | X86_CR4_SMAP | X86_CR4_PCIDE))
+
+ #define domain_max_vcpus(d) (is_hvm_domain(d) ? HVM_MAX_VCPUS : MAX_VIRT_CPUS)
+
+diff --git a/xen/include/asm-x86/flushtlb.h b/xen/include/asm-x86/flushtlb.h
+index 413db692e1..4a930448da 100644
+--- a/xen/include/asm-x86/flushtlb.h
++++ b/xen/include/asm-x86/flushtlb.h
+@@ -84,7 +84,7 @@ static inline unsigned long read_cr3(void)
+ }
+
+ /* Write pagetable base and implicitly tick the tlbflush clock. */
+-void write_cr3(unsigned long cr3);
++void switch_cr3_cr4(unsigned long cr3, unsigned long cr4);
+
+ /* flush_* flag fields: */
+ /*
+@@ -101,6 +101,8 @@ void write_cr3(unsigned long cr3);
+ #define FLUSH_CACHE 0x400
+ /* VA for the flush has a valid mapping */
+ #define FLUSH_VA_VALID 0x800
++ /* Flush the per-cpu root page table */
++#define FLUSH_ROOT_PGTBL 0x2000
+
+ /* Flush local TLBs/caches. */
+ unsigned int flush_area_local(const void *va, unsigned int flags);
+@@ -132,6 +134,12 @@ void flush_area_mask(const cpumask_t *, const void *va, unsigned int flags);
+ #define flush_tlb_one_all(v) \
+ flush_tlb_one_mask(&cpu_online_map, v)
+
++#define flush_root_pgtbl_domain(d) \
++{ \
++ if ( is_pv_domain(d) && (d)->arch.pv_domain.xpti ) \
++ flush_mask((d)->domain_dirty_cpumask, FLUSH_ROOT_PGTBL); \
++}
++
+ static inline void flush_page_to_ram(unsigned long mfn, bool sync_icache) {}
+ static inline int invalidate_dcache_va_range(const void *p,
+ unsigned long size)
+diff --git a/xen/include/asm-x86/hvm/irq.h b/xen/include/asm-x86/hvm/irq.h
+index f756cb5a0d..1a52ec6045 100644
+--- a/xen/include/asm-x86/hvm/irq.h
++++ b/xen/include/asm-x86/hvm/irq.h
+@@ -207,6 +207,9 @@ int hvm_set_pci_link_route(struct domain *d, u8 link, u8 isa_irq);
+
+ int hvm_inject_msi(struct domain *d, uint64_t addr, uint32_t data);
+
++/* Assert an IO APIC pin. */
++int hvm_ioapic_assert(struct domain *d, unsigned int gsi, bool level);
++
+ void hvm_maybe_deassert_evtchn_irq(void);
+ void hvm_assert_evtchn_irq(struct vcpu *v);
+ void hvm_set_callback_via(struct domain *d, uint64_t via);
+diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
+index 8fb9e3ceee..e216c4ac35 100644
+--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
++++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
+@@ -130,10 +130,18 @@ struct arch_vmx_struct {
+ uint64_t sfmask;
+
+ struct vmx_msr_bitmap *msr_bitmap;
+- unsigned int msr_count;
++
++ /*
++ * Most accesses to the MSR host/guest load/save lists are in current
++ * context. However, the data can be modified by toolstack/migration
++ * actions. Remote access is only permitted for paused vcpus, and is
++ * protected under the domctl lock.
++ */
+ struct vmx_msr_entry *msr_area;
+- unsigned int host_msr_count;
+ struct vmx_msr_entry *host_msr_area;
++ unsigned int msr_load_count;
++ unsigned int msr_save_count;
++ unsigned int host_msr_count;
+
+ unsigned long eoi_exitmap_changed;
+ DECLARE_BITMAP(eoi_exit_bitmap, NR_VECTORS);
+@@ -149,7 +157,7 @@ struct arch_vmx_struct {
+ /* Are we emulating rather than VMENTERing? */
+ uint8_t vmx_emulate;
+
+- uint8_t lbr_fixup_enabled;
++ uint8_t lbr_flags;
+
+ /* Bitmask of segments that we can't safely use in virtual 8086 mode */
+ uint16_t vm86_segment_mask;
+@@ -511,9 +519,6 @@ enum vmcs_field {
+
+ #define VMCS_VPID_WIDTH 16
+
+-#define VMX_GUEST_MSR 0
+-#define VMX_HOST_MSR 1
+-
+ /* VM Instruction error numbers */
+ enum vmx_insn_errno
+ {
+@@ -531,6 +536,67 @@ enum vmx_insn_errno
+ VMX_INSN_FAIL_INVALID = ~0,
+ };
+
++/* MSR load/save list infrastructure. */
++enum vmx_msr_list_type {
++ VMX_MSR_HOST, /* MSRs loaded on VMExit. */
++ VMX_MSR_GUEST, /* MSRs saved on VMExit, loaded on VMEntry. */
++ VMX_MSR_GUEST_LOADONLY, /* MSRs loaded on VMEntry only. */
++};
++
++/**
++ * Add an MSR to an MSR list (inserting space for the entry if necessary), and
++ * set the MSRs value.
++ *
++ * It is undefined behaviour to try to insert the same MSR into both the
++ * GUEST and GUEST_LOADONLY lists.
++ *
++ * May fail if unable to allocate memory for the list, or the total number of
++ * entries exceeds the memory allocated.
++ */
++int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val,
++ enum vmx_msr_list_type type);
++
++static inline int vmx_add_guest_msr(struct vcpu *v, uint32_t msr, uint64_t val)
++{
++ return vmx_add_msr(v, msr, val, VMX_MSR_GUEST);
++}
++static inline int vmx_add_host_load_msr(struct vcpu *v, uint32_t msr,
++ uint64_t val)
++{
++ return vmx_add_msr(v, msr, val, VMX_MSR_HOST);
++}
++
++struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
++ enum vmx_msr_list_type type);
++
++static inline int vmx_read_guest_msr(const struct vcpu *v, uint32_t msr,
++ uint64_t *val)
++{
++ const struct vmx_msr_entry *ent = vmx_find_msr(v, msr, VMX_MSR_GUEST);
++
++ if ( !ent )
++ return -ESRCH;
++
++ *val = ent->data;
++
++ return 0;
++}
++
++static inline int vmx_write_guest_msr(struct vcpu *v, uint32_t msr,
++ uint64_t val)
++{
++ struct vmx_msr_entry *ent = vmx_find_msr(v, msr, VMX_MSR_GUEST);
++
++ if ( !ent )
++ return -ESRCH;
++
++ ent->data = val;
++
++ return 0;
++}
++
++
++/* MSR intercept bitmap infrastructure. */
+ enum vmx_msr_intercept_type {
+ VMX_MSR_R = 1,
+ VMX_MSR_W = 2,
+@@ -541,10 +607,6 @@ void vmx_clear_msr_intercept(struct vcpu *v, unsigned int msr,
+ enum vmx_msr_intercept_type type);
+ void vmx_set_msr_intercept(struct vcpu *v, unsigned int msr,
+ enum vmx_msr_intercept_type type);
+-int vmx_read_guest_msr(u32 msr, u64 *val);
+-int vmx_write_guest_msr(u32 msr, u64 val);
+-struct vmx_msr_entry *vmx_find_msr(u32 msr, int type);
+-int vmx_add_msr(u32 msr, int type);
+ void vmx_vmcs_switch(paddr_t from, paddr_t to);
+ void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector);
+ void vmx_clear_eoi_exit_bitmap(struct vcpu *v, u8 vector);
+@@ -559,15 +621,6 @@ void virtual_vmcs_vmwrite(const struct vcpu *, u32 encoding, u64 val);
+ enum vmx_insn_errno virtual_vmcs_vmwrite_safe(const struct vcpu *v,
+ u32 vmcs_encoding, u64 val);
+
+-static inline int vmx_add_guest_msr(u32 msr)
+-{
+- return vmx_add_msr(msr, VMX_GUEST_MSR);
+-}
+-static inline int vmx_add_host_load_msr(u32 msr)
+-{
+- return vmx_add_msr(msr, VMX_HOST_MSR);
+-}
+-
+ DECLARE_PER_CPU(bool_t, vmxon);
+
+ bool_t vmx_vcpu_pml_enabled(const struct vcpu *v);
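The hunk above replaces the old current-vcpu-only vmx_{add,find,read,write}_*_msr() helpers with per-vcpu, type-tagged variants. As a rough illustration of the new calling convention (not part of the patch; the vcpu pointer and MSR number are assumed to come from the caller), a user of the guest load/save list might look like this:

    /* Sketch only: read a guest MSR from the load/save list, inserting it
     * with a default value of 0 the first time it is encountered. */
    static int example_get_list_msr(struct vcpu *v, uint32_t msr, uint64_t *val)
    {
        int rc = vmx_read_guest_msr(v, msr, val);

        if ( rc == -ESRCH )          /* Not on the list yet? */
        {
            rc = vmx_add_guest_msr(v, msr, 0);
            if ( rc == 0 )
                *val = 0;
        }

        return rc;
    }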
+diff --git a/xen/include/asm-x86/hvm/vpt.h b/xen/include/asm-x86/hvm/vpt.h
+index 21166edd06..0eb5ff632e 100644
+--- a/xen/include/asm-x86/hvm/vpt.h
++++ b/xen/include/asm-x86/hvm/vpt.h
+@@ -44,6 +44,7 @@ struct periodic_time {
+ bool_t warned_timeout_too_short;
+ #define PTSRC_isa 1 /* ISA time source */
+ #define PTSRC_lapic 2 /* LAPIC time source */
++#define PTSRC_ioapic 3 /* IOAPIC time source */
+ u8 source; /* PTSRC_ */
+ u8 irq;
+ struct vcpu *vcpu; /* vcpu timer interrupt delivers to */
+diff --git a/xen/include/asm-x86/hypercall.h b/xen/include/asm-x86/hypercall.h
+index b9f3ecf9a3..0e1092845b 100644
+--- a/xen/include/asm-x86/hypercall.h
++++ b/xen/include/asm-x86/hypercall.h
+@@ -165,7 +165,7 @@ extern int compat_update_va_mapping(
+ unsigned int va, u32 lo, u32 hi, unsigned int flags);
+
+ extern int compat_update_va_mapping_otherdomain(
+- unsigned long va, u32 lo, u32 hi, unsigned long flags, domid_t domid);
++ unsigned int va, u32 lo, u32 hi, unsigned int flags, domid_t domid);
+
+ DEFINE_XEN_GUEST_HANDLE(trap_info_compat_t);
+ extern int compat_set_trap_table(XEN_GUEST_HANDLE(trap_info_compat_t) traps);
+diff --git a/xen/include/asm-x86/i387.h b/xen/include/asm-x86/i387.h
+index 7cfa215d30..243de672eb 100644
+--- a/xen/include/asm-x86/i387.h
++++ b/xen/include/asm-x86/i387.h
+@@ -28,7 +28,7 @@ struct ix87_env {
+ uint16_t fds, _res6;
+ };
+
+-void vcpu_restore_fpu_eager(struct vcpu *v);
++void vcpu_restore_fpu_nonlazy(struct vcpu *v, bool need_stts);
+ void vcpu_restore_fpu_lazy(struct vcpu *v);
+ void vcpu_save_fpu(struct vcpu *v);
+ void save_fpu_enable(void);
+diff --git a/xen/include/asm-x86/invpcid.h b/xen/include/asm-x86/invpcid.h
+new file mode 100644
+index 0000000000..edd8b68706
+--- /dev/null
++++ b/xen/include/asm-x86/invpcid.h
+@@ -0,0 +1,72 @@
++#ifndef _ASM_X86_INVPCID_H_
++#define _ASM_X86_INVPCID_H_
++
++#include <xen/types.h>
++
++extern bool use_invpcid;
++
++#define INVPCID_TYPE_INDIV_ADDR 0
++#define INVPCID_TYPE_SINGLE_CTXT 1
++#define INVPCID_TYPE_ALL_INCL_GLOBAL 2
++#define INVPCID_TYPE_ALL_NON_GLOBAL 3
++
++#define INVPCID_OPCODE ".byte 0x66, 0x0f, 0x38, 0x82\n"
++#define MODRM_ECX_01 ".byte 0x01\n"
++
++static inline void invpcid(unsigned int pcid, unsigned long addr,
++ unsigned int type)
++{
++ struct {
++ uint64_t pcid:12;
++ uint64_t reserved:52;
++ uint64_t addr;
++ } desc = { .pcid = pcid, .addr = addr };
++
++ asm volatile (
++#ifdef HAVE_AS_INVPCID
++ "invpcid %[desc], %q[type]"
++ : /* No output */
++ : [desc] "m" (desc), [type] "r" (type)
++#else
++ INVPCID_OPCODE MODRM_ECX_01
++ : /* No output */
++ : "a" (type), "c" (&desc)
++#endif
++ : "memory" );
++}
++
++/* Flush all mappings for a given PCID and addr, not including globals */
++static inline void invpcid_flush_one(unsigned int pcid, unsigned long addr)
++{
++ invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
++}
++
++/* Flush all mappings for a given PCID, not including globals */
++static inline void invpcid_flush_single_context(unsigned int pcid)
++{
++ invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
++}
++
++/* Flush all mappings, including globals, for all PCIDs */
++static inline void invpcid_flush_all(void)
++{
++ invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
++}
++
++/* Flush all mappings for all PCIDs, excluding globals */
++static inline void invpcid_flush_all_nonglobals(void)
++{
++ invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
++}
++
++#endif /* _ASM_X86_INVPCID_H_ */
++
++/*
++ * Local variables:
++ * mode: C
++ * c-file-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
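The new header above wraps the INVPCID instruction (with a hand-coded opcode fallback for assemblers lacking HAVE_AS_INVPCID) and exposes four flush granularities. A hedged usage sketch, assuming the caller already has a PCID and linear address in hand; the CR3-reload fallback used when INVPCID is unavailable is outside this header and omitted:

    /* Sketch only: invalidate one linear address in one PCID when the
     * INVPCID instruction is available (see use_invpcid above). */
    static void example_flush_one_va(unsigned int pcid, unsigned long va)
    {
        if ( use_invpcid )
            invpcid_flush_one(pcid, va);
    }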
+diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
+index a8ceecf3e2..dfeba2821d 100644
+--- a/xen/include/asm-x86/msr-index.h
++++ b/xen/include/asm-x86/msr-index.h
+@@ -31,10 +31,14 @@
+ #define EFER_LMSLE (1<<_EFER_LMSLE)
+ #define EFER_FFXSE (1<<_EFER_FFXSE)
+
++#define EFER_KNOWN_MASK (EFER_SCE | EFER_LME | EFER_LMA | EFER_NX | \
++ EFER_SVME | EFER_LMSLE | EFER_FFXSE)
++
+ /* Speculation Controls. */
+ #define MSR_SPEC_CTRL 0x00000048
+ #define SPEC_CTRL_IBRS (_AC(1, ULL) << 0)
+ #define SPEC_CTRL_STIBP (_AC(1, ULL) << 1)
++#define SPEC_CTRL_SSBD (_AC(1, ULL) << 2)
+
+ #define MSR_PRED_CMD 0x00000049
+ #define PRED_CMD_IBPB (_AC(1, ULL) << 0)
+@@ -42,6 +46,12 @@
+ #define MSR_ARCH_CAPABILITIES 0x0000010a
+ #define ARCH_CAPABILITIES_RDCL_NO (_AC(1, ULL) << 0)
+ #define ARCH_CAPABILITIES_IBRS_ALL (_AC(1, ULL) << 1)
++#define ARCH_CAPS_RSBA (_AC(1, ULL) << 2)
++#define ARCH_CAPS_SKIP_L1DFL (_AC(1, ULL) << 3)
++#define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4)
++
++#define MSR_FLUSH_CMD 0x0000010b
++#define FLUSH_CMD_L1D (_AC(1, ULL) << 0)
+
+ /* Intel MSRs. Some also available on other CPUs */
+ #define MSR_IA32_PERFCTR0 0x000000c1
+diff --git a/xen/include/asm-x86/msr.h b/xen/include/asm-x86/msr.h
+index 2c9277b6d5..22d5b7181e 100644
+--- a/xen/include/asm-x86/msr.h
++++ b/xen/include/asm-x86/msr.h
+@@ -198,7 +198,7 @@ DECLARE_PER_CPU(u64, efer);
+ u64 read_efer(void);
+ void write_efer(u64 val);
+
+-DECLARE_PER_CPU(u32, ler_msr);
++extern unsigned int ler_msr;
+
+ DECLARE_PER_CPU(uint32_t, tsc_aux);
+
+diff --git a/xen/include/asm-x86/nops.h b/xen/include/asm-x86/nops.h
+index 37f9819e82..0016075616 100644
+--- a/xen/include/asm-x86/nops.h
++++ b/xen/include/asm-x86/nops.h
+@@ -61,11 +61,12 @@
+ #define ASM_NOP7 _ASM_MK_NOP(K8_NOP7)
+ #define ASM_NOP8 _ASM_MK_NOP(K8_NOP8)
+
++#define ASM_NOP14 ASM_NOP8; ASM_NOP6
+ #define ASM_NOP17 ASM_NOP8; ASM_NOP7; ASM_NOP2
+-#define ASM_NOP21 ASM_NOP8; ASM_NOP8; ASM_NOP5
+ #define ASM_NOP24 ASM_NOP8; ASM_NOP8; ASM_NOP8
+-#define ASM_NOP29 ASM_NOP8; ASM_NOP8; ASM_NOP8; ASM_NOP5
+-#define ASM_NOP32 ASM_NOP8; ASM_NOP8; ASM_NOP8; ASM_NOP8
++#define ASM_NOP25 ASM_NOP8; ASM_NOP8; ASM_NOP7; ASM_NOP2
++#define ASM_NOP33 ASM_NOP8; ASM_NOP8; ASM_NOP8; ASM_NOP7; ASM_NOP2
++#define ASM_NOP36 ASM_NOP8; ASM_NOP8; ASM_NOP8; ASM_NOP8; ASM_NOP4
+ #define ASM_NOP40 ASM_NOP8; ASM_NOP8; ASM_NOP8; ASM_NOP8; ASM_NOP8
+
+ #define ASM_NOP_MAX 8
+diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h
+index 5607ab4b1f..fc326fe616 100644
+--- a/xen/include/asm-x86/paging.h
++++ b/xen/include/asm-x86/paging.h
+@@ -37,11 +37,14 @@
+
+ #define PG_SH_shift 20
+ #define PG_HAP_shift 21
++#define PG_SHF_shift 22
+ /* We're in one of the shadow modes */
+ #ifdef CONFIG_SHADOW_PAGING
+ #define PG_SH_enable (1U << PG_SH_shift)
++#define PG_SH_forced (1U << PG_SHF_shift)
+ #else
+ #define PG_SH_enable 0
++#define PG_SH_forced 0
+ #endif
+ #define PG_HAP_enable (1U << PG_HAP_shift)
+
+@@ -62,6 +65,7 @@
+
+ #define paging_mode_enabled(_d) (!!(_d)->arch.paging.mode)
+ #define paging_mode_shadow(_d) (!!((_d)->arch.paging.mode & PG_SH_enable))
++#define paging_mode_sh_forced(_d) (!!((_d)->arch.paging.mode & PG_SH_forced))
+ #define paging_mode_hap(_d) (!!((_d)->arch.paging.mode & PG_HAP_enable))
+
+ #define paging_mode_refcounts(_d) (!!((_d)->arch.paging.mode & PG_refcounts))
+diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
+index 80f8411355..90a2701d26 100644
+--- a/xen/include/asm-x86/processor.h
++++ b/xen/include/asm-x86/processor.h
+@@ -293,6 +293,21 @@ static inline unsigned long read_cr2(void)
+ return cr2;
+ }
+
++static inline void write_cr3(unsigned long val)
++{
++ asm volatile ( "mov %0, %%cr3" : : "r" (val) : "memory" );
++}
++
++static inline unsigned long cr3_pa(unsigned long cr3)
++{
++ return cr3 & X86_CR3_ADDR_MASK;
++}
++
++static inline unsigned long cr3_pcid(unsigned long cr3)
++{
++ return cr3 & X86_CR3_PCID_MASK;
++}
++
+ static inline unsigned long read_cr4(void)
+ {
+ return get_cpu_info()->cr4;
+@@ -300,6 +315,9 @@ static inline unsigned long read_cr4(void)
+
+ static inline void write_cr4(unsigned long val)
+ {
++ /* No global pages in case of PCIDs enabled! */
++ ASSERT(!(val & X86_CR4_PGE) || !(val & X86_CR4_PCIDE));
++
+ get_cpu_info()->cr4 = val;
+ asm volatile ( "mov %0,%%cr4" : : "r" (val) );
+ }
+@@ -329,12 +347,6 @@ static always_inline void set_in_cr4 (unsigned long mask)
+ write_cr4(read_cr4() | mask);
+ }
+
+-static always_inline void clear_in_cr4 (unsigned long mask)
+-{
+- mmu_cr4_features &= ~mask;
+- write_cr4(read_cr4() & ~mask);
+-}
+-
+ static inline unsigned int read_pkru(void)
+ {
+ unsigned int pkru;
+@@ -445,7 +457,8 @@ struct __packed __cacheline_aligned tss_struct {
+ #define IST_DF 1UL
+ #define IST_NMI 2UL
+ #define IST_MCE 3UL
+-#define IST_MAX 3UL
++#define IST_DB 4UL
++#define IST_MAX 4UL
+
+ /* Set the interrupt stack table used by a particular interrupt
+ * descriptor table entry. */
+diff --git a/xen/include/asm-x86/pv/domain.h b/xen/include/asm-x86/pv/domain.h
+index acdf140fbd..6778e1bb75 100644
+--- a/xen/include/asm-x86/pv/domain.h
++++ b/xen/include/asm-x86/pv/domain.h
+@@ -21,6 +21,37 @@
+ #ifndef __X86_PV_DOMAIN_H__
+ #define __X86_PV_DOMAIN_H__
+
++/*
++ * PCID values for the address spaces of 64-bit pv domains:
++ *
++ * We are using 4 PCID values for a 64 bit pv domain subject to XPTI:
++ * - hypervisor active and guest in kernel mode PCID 0
++ * - hypervisor active and guest in user mode PCID 1
++ * - guest active and in kernel mode PCID 2
++ * - guest active and in user mode PCID 3
++ *
++ * Without XPTI only 2 values are used:
++ * - guest in kernel mode PCID 0
++ * - guest in user mode PCID 1
++ */
++
++#define PCID_PV_PRIV 0x0000 /* Used for other domains, too. */
++#define PCID_PV_USER 0x0001
++#define PCID_PV_XPTI 0x0002 /* To be ORed to above values. */
++
++/*
++ * Return additional PCID specific cr3 bits.
++ *
++ * Note that X86_CR3_NOFLUSH will not be readable in cr3. Anyone consuming
++ * v->arch.cr3 should mask away X86_CR3_NOFLUSH and X86_CR3_PCID_MASK in case
++ * the value is used to address the root page table.
++ */
++static inline unsigned long get_pcid_bits(const struct vcpu *v, bool is_xpti)
++{
++ return X86_CR3_NOFLUSH | (is_xpti ? PCID_PV_XPTI : 0) |
++ ((v->arch.flags & TF_kernel_mode) ? PCID_PV_PRIV : PCID_PV_USER);
++}
++
+ #ifdef CONFIG_PV
+
+ void pv_vcpu_destroy(struct vcpu *v);
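Taken together with the cr3_pa()/cr3_pcid() helpers and X86_CR3_* definitions added elsewhere in this patch, get_pcid_bits() lets a PCID-tagged %cr3 value be composed and later decomposed. A minimal sketch (not part of the patch), assuming 'v' is a PV vcpu and 'root_maddr' is the page-aligned machine address of its root page table:

    /* Sketch only: tag the root page table address with NOFLUSH + PCID. */
    unsigned long cr3 = root_maddr |
                        get_pcid_bits(v, v->domain->arch.pv_domain.xpti);

    /* Consumers must mask before using the value as an address again. */
    ASSERT(cr3_pa(cr3) == root_maddr);   /* address bits only            */
    ASSERT(cr3_pcid(cr3) < 4);           /* one of the four PCIDs above  */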
+diff --git a/xen/include/asm-x86/setup.h b/xen/include/asm-x86/setup.h
+index b68ec9de4d..ecfd0c2e7b 100644
+--- a/xen/include/asm-x86/setup.h
++++ b/xen/include/asm-x86/setup.h
+@@ -66,6 +66,8 @@ extern uint8_t kbd_shift_flags;
+ extern unsigned long highmem_start;
+ #endif
+
++extern int8_t opt_smt;
++
+ #ifdef CONFIG_SHADOW_PAGING
+ extern bool opt_dom0_shadow;
+ #else
+diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h
+index 94a34fd16a..f40f411871 100644
+--- a/xen/include/asm-x86/shadow.h
++++ b/xen/include/asm-x86/shadow.h
+@@ -29,6 +29,7 @@
+ #include <asm/flushtlb.h>
+ #include <asm/paging.h>
+ #include <asm/p2m.h>
++#include <asm/spec_ctrl.h>
+
+ /*****************************************************************************
+ * Macros to tell which shadow paging mode a domain is in*/
+@@ -115,6 +116,131 @@ static inline int shadow_domctl(struct domain *d,
+
+ #endif /* CONFIG_SHADOW_PAGING */
+
++/*
++ * Mitigations for L1TF / CVE-2018-3620 for PV guests.
++ *
++ * We cannot alter an architecturally-legitimate PTE which a PV guest has
++ * chosen to write, as traditional paged-out metadata is L1TF-vulnerable.
++ * What we can do is force a PV guest which writes a vulnerable PTE into
++ * shadow mode, so Xen controls the pagetables which are reachable by the CPU
++ * pagewalk.
++ *
++ * The core of the L1TF vulnerability is that the address bits of the PTE
++ * (accounting for PSE and factoring in the level-relevant part of the linear
++ * access) are sent for an L1D lookup (to retrieve the next-level PTE, or
++ * eventual memory address) before the Present or reserved bits (which would
++ * cause a terminal fault) are accounted for. If an L1D hit occurs, the
++ * resulting data is available for potentially dependent instructions.
++ *
++ * For Present PTEs, the PV type-count safety logic ensures that the address
++ * bits always point at a guest-accessible frame, which is safe WRT L1TF from
++ * Xen's point of view. In practice, a PV guest should be unable to set any
++ * reserved bits, so should be unable to create any present L1TF-vulnerable
++ * PTEs at all.
++ *
++ * Therefore, these safety checks apply to Not-Present PTEs only, where
++ * traditionally, Xen would have let the guest write any value it chose.
++ *
++ * The all-zero PTE potentially leaks mfn 0. All software on the system is
++ * expected to cooperate and not put any secrets there. In a Xen system,
++ * neither Xen nor dom0 are expected to touch mfn 0, as it typically contains
++ * the real mode IVT and Bios Data Area. Therefore, mfn 0 is considered safe.
++ *
++ * Any PTE whose address is higher than the maximum cacheable address is safe,
++ * as it won't get an L1D hit.
++ *
++ * Speculative superpages also need accounting for, as PSE is considered
++ * irrespective of Present. We disallow PSE being set, as it allows an
++ * attacker to leak 2M or 1G of data starting from mfn 0. Also, because of
++ * recursive/linear pagetables, we must consider PSE even at L4, as hardware
++ * will interpret an L4e as an L3e during a recursive walk.
++ */
++
++static inline bool is_l1tf_safe_maddr(intpte_t pte)
++{
++ paddr_t maddr = pte & l1tf_addr_mask;
++
++ return maddr == 0 || maddr >= l1tf_safe_maddr;
++}
++
++static inline bool pv_l1tf_check_pte(struct domain *d, unsigned int level,
++ intpte_t pte)
++{
++ ASSERT(is_pv_domain(d));
++ ASSERT(!(pte & _PAGE_PRESENT));
++
++ if ( d->arch.pv_domain.check_l1tf && !paging_mode_sh_forced(d) &&
++ (((level > 1) && (pte & _PAGE_PSE)) || !is_l1tf_safe_maddr(pte)) )
++ {
++#ifdef CONFIG_SHADOW_PAGING
++ struct tasklet *t = &d->arch.paging.shadow.pv_l1tf_tasklet;
++
++ printk(XENLOG_G_WARNING
++ "d%d L1TF-vulnerable L%ue %016"PRIx64" - Shadowing\n",
++ d->domain_id, level, pte);
++ /*
++ * Safety consideration for accessing tasklet.scheduled_on without the
++ * tasklet lock. This is a singleshot tasklet with the side effect of
++ * setting PG_SH_forced (checked just above). Multiple vcpus can race
++ * to schedule the tasklet, but if we observe it scheduled anywhere,
++ * that is good enough.
++ */
++ smp_rmb();
++ if ( !tasklet_is_scheduled(t) )
++ tasklet_schedule(t);
++#else
++ printk(XENLOG_G_ERR
++ "d%d L1TF-vulnerable L%ue %016"PRIx64" - Crashing\n",
++ d->domain_id, level, pte);
++ domain_crash(d);
++#endif
++ return true;
++ }
++
++ return false;
++}
++
++static inline bool pv_l1tf_check_l1e(struct domain *d, l1_pgentry_t l1e)
++{
++ return pv_l1tf_check_pte(d, 1, l1e.l1);
++}
++
++static inline bool pv_l1tf_check_l2e(struct domain *d, l2_pgentry_t l2e)
++{
++ return pv_l1tf_check_pte(d, 2, l2e.l2);
++}
++
++static inline bool pv_l1tf_check_l3e(struct domain *d, l3_pgentry_t l3e)
++{
++ return pv_l1tf_check_pte(d, 3, l3e.l3);
++}
++
++static inline bool pv_l1tf_check_l4e(struct domain *d, l4_pgentry_t l4e)
++{
++ return pv_l1tf_check_pte(d, 4, l4e.l4);
++}
++
++void pv_l1tf_tasklet(unsigned long data);
++
++static inline void pv_l1tf_domain_init(struct domain *d)
++{
++ d->arch.pv_domain.check_l1tf =
++ opt_pv_l1tf & (is_hardware_domain(d)
++ ? OPT_PV_L1TF_DOM0 : OPT_PV_L1TF_DOMU);
++
++#if defined(CONFIG_SHADOW_PAGING) && defined(CONFIG_PV)
++ tasklet_init(&d->arch.paging.shadow.pv_l1tf_tasklet,
++ pv_l1tf_tasklet, (unsigned long)d);
++#endif
++}
++
++static inline void pv_l1tf_domain_destroy(struct domain *d)
++{
++#if defined(CONFIG_SHADOW_PAGING) && defined(CONFIG_PV)
++ tasklet_kill(&d->arch.paging.shadow.pv_l1tf_tasklet);
++#endif
++}
++
+ /* Remove all shadows of the guest mfn. */
+ static inline void shadow_remove_all_shadows(struct domain *d, mfn_t gmfn)
+ {
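For reference (not part of the patch), the pv_l1tf_check_*() helpers above are meant to gate PV page-table writes of not-present entries. A caller in the PV MMU code might use them roughly as follows, with 'd' and 'nl1e' assumed to come from the surrounding update path:

    /* Sketch only: refuse an L1TF-vulnerable not-present L1e.  Returning
     * -ERESTART assumes the caller retries once the domain has been
     * switched into shadow mode (or crashed) by the tasklet above. */
    if ( !(l1e_get_flags(nl1e) & _PAGE_PRESENT) &&
         pv_l1tf_check_l1e(d, nl1e) )
        return -ERESTART;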
+diff --git a/xen/include/asm-x86/smp.h b/xen/include/asm-x86/smp.h
+index 4e5f673fec..09c55458df 100644
+--- a/xen/include/asm-x86/smp.h
++++ b/xen/include/asm-x86/smp.h
+@@ -26,6 +26,8 @@ DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_mask);
+ DECLARE_PER_CPU(cpumask_var_t, cpu_core_mask);
+ DECLARE_PER_CPU(cpumask_var_t, scratch_cpumask);
+
++extern bool park_offline_cpus;
++
+ void smp_send_nmi_allbutself(void);
+
+ void send_IPI_mask(const cpumask_t *, int vector);
+diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
+index 5ab4ff3f68..8f8aad40bb 100644
+--- a/xen/include/asm-x86/spec_ctrl.h
++++ b/xen/include/asm-x86/spec_ctrl.h
+@@ -27,14 +27,36 @@
+ void init_speculation_mitigations(void);
+
+ extern bool opt_ibpb;
+-extern uint8_t default_bti_ist_info;
++extern bool opt_ssbd;
++extern int8_t opt_eager_fpu;
++extern int8_t opt_l1d_flush;
++
++extern bool bsp_delay_spec_ctrl;
++extern uint8_t default_xen_spec_ctrl;
++extern uint8_t default_spec_ctrl_flags;
++
++extern int8_t opt_xpti;
++#define OPT_XPTI_DOM0 0x01
++#define OPT_XPTI_DOMU 0x02
++
++extern int8_t opt_pv_l1tf;
++#define OPT_PV_L1TF_DOM0 0x01
++#define OPT_PV_L1TF_DOMU 0x02
++
++/*
++ * The L1D address mask, which might be wider than reported in CPUID, and the
++ * system physical address above which there are believed to be no cacheable
++ * memory regions, thus unable to leak data via the L1TF vulnerability.
++ */
++extern paddr_t l1tf_addr_mask, l1tf_safe_maddr;
+
+ static inline void init_shadow_spec_ctrl_state(void)
+ {
+ struct cpu_info *info = get_cpu_info();
+
+- info->shadow_spec_ctrl = info->use_shadow_spec_ctrl = 0;
+- info->bti_ist_info = default_bti_ist_info;
++ info->shadow_spec_ctrl = 0;
++ info->xen_spec_ctrl = default_xen_spec_ctrl;
++ info->spec_ctrl_flags = default_spec_ctrl_flags;
+ }
+
+ /* WARNING! `ret`, `call *`, `jmp *` not safe after this call. */
+@@ -48,24 +70,24 @@ static always_inline void spec_ctrl_enter_idle(struct cpu_info *info)
+ */
+ info->shadow_spec_ctrl = val;
+ barrier();
+- info->use_shadow_spec_ctrl = true;
++ info->spec_ctrl_flags |= SCF_use_shadow;
+ barrier();
+- asm volatile ( ALTERNATIVE(ASM_NOP3, "wrmsr", X86_FEATURE_XEN_IBRS_SET)
++ asm volatile ( ALTERNATIVE(ASM_NOP3, "wrmsr", X86_FEATURE_SC_MSR_IDLE)
+ :: "a" (val), "c" (MSR_SPEC_CTRL), "d" (0) : "memory" );
+ }
+
+ /* WARNING! `ret`, `call *`, `jmp *` not safe before this call. */
+ static always_inline void spec_ctrl_exit_idle(struct cpu_info *info)
+ {
+- uint32_t val = SPEC_CTRL_IBRS;
++ uint32_t val = info->xen_spec_ctrl;
+
+ /*
+ * Disable shadowing before updating the MSR. There are no SMP issues
+ * here; only local processor ordering concerns.
+ */
+- info->use_shadow_spec_ctrl = false;
++ info->spec_ctrl_flags &= ~SCF_use_shadow;
+ barrier();
+- asm volatile ( ALTERNATIVE(ASM_NOP3, "wrmsr", X86_FEATURE_XEN_IBRS_SET)
++ asm volatile ( ALTERNATIVE(ASM_NOP3, "wrmsr", X86_FEATURE_SC_MSR_IDLE)
+ :: "a" (val), "c" (MSR_SPEC_CTRL), "d" (0) : "memory" );
+ }
+
+diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h
+index 1f2b6f3552..c659f3f146 100644
+--- a/xen/include/asm-x86/spec_ctrl_asm.h
++++ b/xen/include/asm-x86/spec_ctrl_asm.h
+@@ -20,10 +20,10 @@
+ #ifndef __X86_SPEC_CTRL_ASM_H__
+ #define __X86_SPEC_CTRL_ASM_H__
+
+-/* Encoding of the bottom bits in cpuinfo.bti_ist_info */
+-#define BTI_IST_IBRS (1 << 0)
+-#define BTI_IST_WRMSR (1 << 1)
+-#define BTI_IST_RSB (1 << 2)
++/* Encoding of cpuinfo.spec_ctrl_flags */
++#define SCF_use_shadow (1 << 0)
++#define SCF_ist_wrmsr (1 << 1)
++#define SCF_ist_rsb (1 << 2)
+
+ #ifdef __ASSEMBLY__
+ #include <asm/msr-index.h>
+@@ -50,20 +50,20 @@
+ * after VMEXIT. The VMEXIT-specific code reads MSR_SPEC_CTRL and updates
+ * current before loading Xen's MSR_SPEC_CTRL setting.
+ *
+- * Factor 2 is harder. We maintain a shadow_spec_ctrl value, and
+- * use_shadow_spec_ctrl boolean per cpu. The synchronous use is:
++ * Factor 2 is harder. We maintain a shadow_spec_ctrl value, and a use_shadow
++ * boolean in the per cpu spec_ctrl_flags. The synchronous use is:
+ *
+ * 1) Store guest value in shadow_spec_ctrl
+- * 2) Set use_shadow_spec_ctrl boolean
++ * 2) Set the use_shadow boolean
+ * 3) Load guest value into MSR_SPEC_CTRL
+ * 4) Exit to guest
+ * 5) Entry from guest
+- * 6) Clear use_shadow_spec_ctrl boolean
++ * 6) Clear the use_shadow boolean
+ * 7) Load Xen's value into MSR_SPEC_CTRL
+ *
+ * The asynchronous use for interrupts/exceptions is:
+ * - Set/clear IBRS on entry to Xen
+- * - On exit to Xen, check use_shadow_spec_ctrl
++ * - On exit to Xen, check use_shadow
+ * - If set, load shadow_spec_ctrl
+ *
+ * Therefore, an interrupt/exception which hits the synchronous path between
+@@ -72,11 +72,14 @@
+ *
+ * The following ASM fragments implement this algorithm. See their local
+ * comments for further details.
+- * - SPEC_CTRL_ENTRY_FROM_VMEXIT
++ * - SPEC_CTRL_ENTRY_FROM_HVM
+ * - SPEC_CTRL_ENTRY_FROM_PV
+ * - SPEC_CTRL_ENTRY_FROM_INTR
++ * - SPEC_CTRL_ENTRY_FROM_INTR_IST
++ * - SPEC_CTRL_EXIT_TO_XEN_IST
+ * - SPEC_CTRL_EXIT_TO_XEN
+- * - SPEC_CTRL_EXIT_TO_GUEST
++ * - SPEC_CTRL_EXIT_TO_PV
++ * - SPEC_CTRL_EXIT_TO_HVM
+ */
+
+ .macro DO_OVERWRITE_RSB tmp=rax
+@@ -117,7 +120,7 @@
+ mov %\tmp, %rsp /* Restore old %rsp */
+ .endm
+
+-.macro DO_SPEC_CTRL_ENTRY_FROM_VMEXIT ibrs_val:req
++.macro DO_SPEC_CTRL_ENTRY_FROM_HVM
+ /*
+ * Requires %rbx=current, %rsp=regs/cpuinfo
+ * Clobbers %rax, %rcx, %rdx
+@@ -135,14 +138,14 @@
+ xor %edx, %edx
+
+ /* Clear SPEC_CTRL shadowing *before* loading Xen's value. */
+- movb %dl, CPUINFO_use_shadow_spec_ctrl(%rsp)
++ andb $~SCF_use_shadow, CPUINFO_spec_ctrl_flags(%rsp)
+
+ /* Load Xen's intended value. */
+- mov $\ibrs_val, %eax
++ movzbl CPUINFO_xen_spec_ctrl(%rsp), %eax
+ wrmsr
+ .endm
+
+-.macro DO_SPEC_CTRL_ENTRY maybexen:req ibrs_val:req
++.macro DO_SPEC_CTRL_ENTRY maybexen:req
+ /*
+ * Requires %rsp=regs (also cpuinfo if !maybexen)
+ * Requires %r14=stack_end (if maybexen)
+@@ -161,16 +164,18 @@
+ * block so calculate the position directly.
+ */
+ .if \maybexen
++ xor %eax, %eax
+ /* Branchless `if ( !xen ) clear_shadowing` */
+ testb $3, UREGS_cs(%rsp)
+- setz %al
+- and %al, STACK_CPUINFO_FIELD(use_shadow_spec_ctrl)(%r14)
++ setnz %al
++ not %eax
++ and %al, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14)
++ movzbl STACK_CPUINFO_FIELD(xen_spec_ctrl)(%r14), %eax
+ .else
+- movb %dl, CPUINFO_use_shadow_spec_ctrl(%rsp)
++ andb $~SCF_use_shadow, CPUINFO_spec_ctrl_flags(%rsp)
++ movzbl CPUINFO_xen_spec_ctrl(%rsp), %eax
+ .endif
+
+- /* Load Xen's intended value. */
+- mov $\ibrs_val, %eax
+ wrmsr
+ .endm
+
+@@ -185,8 +190,8 @@
+ */
+ xor %edx, %edx
+
+- cmpb %dl, STACK_CPUINFO_FIELD(use_shadow_spec_ctrl)(%rbx)
+- je .L\@_skip
++ testb $SCF_use_shadow, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx)
++ jz .L\@_skip
+
+ mov STACK_CPUINFO_FIELD(shadow_spec_ctrl)(%rbx), %eax
+ mov $MSR_SPEC_CTRL, %ecx
+@@ -207,7 +212,7 @@
+ mov %eax, CPUINFO_shadow_spec_ctrl(%rsp)
+
+ /* Set SPEC_CTRL shadowing *before* loading the guest value. */
+- movb $1, CPUINFO_use_shadow_spec_ctrl(%rsp)
++ orb $SCF_use_shadow, CPUINFO_spec_ctrl_flags(%rsp)
+
+ mov $MSR_SPEC_CTRL, %ecx
+ xor %edx, %edx
+@@ -215,52 +220,47 @@
+ .endm
+
+ /* Use after a VMEXIT from an HVM guest. */
+-#define SPEC_CTRL_ENTRY_FROM_VMEXIT \
++#define SPEC_CTRL_ENTRY_FROM_HVM \
+ ALTERNATIVE __stringify(ASM_NOP40), \
+- DO_OVERWRITE_RSB, X86_FEATURE_RSB_VMEXIT; \
+- ALTERNATIVE_2 __stringify(ASM_NOP32), \
+- __stringify(DO_SPEC_CTRL_ENTRY_FROM_VMEXIT \
+- ibrs_val=SPEC_CTRL_IBRS), \
+- X86_FEATURE_XEN_IBRS_SET, \
+- __stringify(DO_SPEC_CTRL_ENTRY_FROM_VMEXIT \
+- ibrs_val=0), \
+- X86_FEATURE_XEN_IBRS_CLEAR
++ DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_HVM; \
++ ALTERNATIVE __stringify(ASM_NOP36), \
++ DO_SPEC_CTRL_ENTRY_FROM_HVM, X86_FEATURE_SC_MSR_HVM
+
+ /* Use after an entry from PV context (syscall/sysenter/int80/int82/etc). */
+ #define SPEC_CTRL_ENTRY_FROM_PV \
+ ALTERNATIVE __stringify(ASM_NOP40), \
+- DO_OVERWRITE_RSB, X86_FEATURE_RSB_NATIVE; \
+- ALTERNATIVE_2 __stringify(ASM_NOP21), \
+- __stringify(DO_SPEC_CTRL_ENTRY maybexen=0 \
+- ibrs_val=SPEC_CTRL_IBRS), \
+- X86_FEATURE_XEN_IBRS_SET, \
+- __stringify(DO_SPEC_CTRL_ENTRY maybexen=0 ibrs_val=0), \
+- X86_FEATURE_XEN_IBRS_CLEAR
++ DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV; \
++ ALTERNATIVE __stringify(ASM_NOP25), \
++ __stringify(DO_SPEC_CTRL_ENTRY maybexen=0), X86_FEATURE_SC_MSR_PV
+
+ /* Use in interrupt/exception context. May interrupt Xen or PV context. */
+ #define SPEC_CTRL_ENTRY_FROM_INTR \
+ ALTERNATIVE __stringify(ASM_NOP40), \
+- DO_OVERWRITE_RSB, X86_FEATURE_RSB_NATIVE; \
+- ALTERNATIVE_2 __stringify(ASM_NOP29), \
+- __stringify(DO_SPEC_CTRL_ENTRY maybexen=1 \
+- ibrs_val=SPEC_CTRL_IBRS), \
+- X86_FEATURE_XEN_IBRS_SET, \
+- __stringify(DO_SPEC_CTRL_ENTRY maybexen=1 ibrs_val=0), \
+- X86_FEATURE_XEN_IBRS_CLEAR
++ DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV; \
++ ALTERNATIVE __stringify(ASM_NOP33), \
++ __stringify(DO_SPEC_CTRL_ENTRY maybexen=1), X86_FEATURE_SC_MSR_PV
+
+ /* Use when exiting to Xen context. */
+ #define SPEC_CTRL_EXIT_TO_XEN \
+- ALTERNATIVE_2 __stringify(ASM_NOP17), \
+- DO_SPEC_CTRL_EXIT_TO_XEN, X86_FEATURE_XEN_IBRS_SET, \
+- DO_SPEC_CTRL_EXIT_TO_XEN, X86_FEATURE_XEN_IBRS_CLEAR
++ ALTERNATIVE __stringify(ASM_NOP17), \
++ DO_SPEC_CTRL_EXIT_TO_XEN, X86_FEATURE_SC_MSR_PV
+
+-/* Use when exiting to guest context. */
+-#define SPEC_CTRL_EXIT_TO_GUEST \
+- ALTERNATIVE_2 __stringify(ASM_NOP24), \
+- DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_XEN_IBRS_SET, \
+- DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_XEN_IBRS_CLEAR
++/* Use when exiting to PV guest context. */
++#define SPEC_CTRL_EXIT_TO_PV \
++ ALTERNATIVE __stringify(ASM_NOP24), \
++ DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV
+
+-/* TODO: Drop these when the alternatives infrastructure is NMI/#MC safe. */
++/* Use when exiting to HVM guest context. */
++#define SPEC_CTRL_EXIT_TO_HVM \
++ ALTERNATIVE __stringify(ASM_NOP24), \
++ DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_HVM
++
++/*
++ * Use in IST interrupt/exception context. May interrupt Xen or PV context.
++ * Fine grain control of SCF_ist_wrmsr is needed for safety in the S3 resume
++ * path to avoid using MSR_SPEC_CTRL before the microcode introducing it has
++ * been reloaded.
++ */
+ .macro SPEC_CTRL_ENTRY_FROM_INTR_IST
+ /*
+ * Requires %rsp=regs, %r14=stack_end
+@@ -269,29 +269,27 @@
+ * This is logical merge of DO_OVERWRITE_RSB and DO_SPEC_CTRL_ENTRY
+ * maybexen=1, but with conditionals rather than alternatives.
+ */
+- movzbl STACK_CPUINFO_FIELD(bti_ist_info)(%r14), %eax
++ movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %eax
+
+- testb $BTI_IST_RSB, %al
++ test $SCF_ist_rsb, %al
+ jz .L\@_skip_rsb
+
+ DO_OVERWRITE_RSB tmp=rdx /* Clobbers %rcx/%rdx */
+
+ .L\@_skip_rsb:
+
+- testb $BTI_IST_WRMSR, %al
++ test $SCF_ist_wrmsr, %al
+ jz .L\@_skip_wrmsr
+
+ xor %edx, %edx
+ testb $3, UREGS_cs(%rsp)
+- setz %dl
+- and %dl, STACK_CPUINFO_FIELD(use_shadow_spec_ctrl)(%r14)
++ setnz %dl
++ not %edx
++ and %dl, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14)
+
+- /*
+- * Load Xen's intended value. SPEC_CTRL_IBRS vs 0 is encoded in the
+- * bottom bit of bti_ist_info, via a deliberate alias with BTI_IST_IBRS.
+- */
++ /* Load Xen's intended value. */
+ mov $MSR_SPEC_CTRL, %ecx
+- and $BTI_IST_IBRS, %eax
++ movzbl STACK_CPUINFO_FIELD(xen_spec_ctrl)(%r14), %eax
+ xor %edx, %edx
+ wrmsr
+
+@@ -309,12 +307,13 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise):
+ UNLIKELY_END(\@_serialise)
+ .endm
+
++/* Use when exiting to Xen in IST context. */
+ .macro SPEC_CTRL_EXIT_TO_XEN_IST
+ /*
+ * Requires %rbx=stack_end
+ * Clobbers %rax, %rcx, %rdx
+ */
+- testb $BTI_IST_WRMSR, STACK_CPUINFO_FIELD(bti_ist_info)(%rbx)
++ testb $SCF_ist_wrmsr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx)
+ jz .L\@_skip
+
+ DO_SPEC_CTRL_EXIT_TO_XEN
+diff --git a/xen/include/asm-x86/system.h b/xen/include/asm-x86/system.h
+index eb498f5e71..605768be12 100644
+--- a/xen/include/asm-x86/system.h
++++ b/xen/include/asm-x86/system.h
+@@ -185,6 +185,30 @@ static always_inline unsigned long __xadd(
+ #define set_mb(var, value) do { xchg(&var, value); } while (0)
+ #define set_wmb(var, value) do { var = value; wmb(); } while (0)
+
++/**
++ * array_index_mask_nospec() - generate a mask that is ~0UL when the
++ * bounds check succeeds and 0 otherwise
++ * @index: array element index
++ * @size: number of elements in array
++ *
++ * Returns:
++ * 0 - (index < size)
++ */
++static inline unsigned long array_index_mask_nospec(unsigned long index,
++ unsigned long size)
++{
++ unsigned long mask;
++
++ asm volatile ( "cmp %[size], %[index]; sbb %[mask], %[mask];"
++ : [mask] "=r" (mask)
++ : [size] "g" (size), [index] "r" (index) );
++
++ return mask;
++}
++
++/* Override default implementation in nospec.h. */
++#define array_index_mask_nospec array_index_mask_nospec
++
+ #define local_irq_disable() asm volatile ( "cli" : : : "memory" )
+ #define local_irq_enable() asm volatile ( "sti" : : : "memory" )
+
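The inline assembly above relies on x86 borrow semantics: cmp sets CF exactly when index < size, and sbb of a register with itself then yields all ones or all zeroes. A small standalone check (not Xen code) demonstrating that property on x86-64:

    #include <assert.h>

    static unsigned long mask_of(unsigned long index, unsigned long size)
    {
        unsigned long mask;

        /* Same cmp/sbb sequence as the Xen helper above. */
        asm volatile ( "cmp %[size], %[index]; sbb %[mask], %[mask];"
                       : [mask] "=r" (mask)
                       : [size] "g" (size), [index] "r" (index) );

        return mask;
    }

    int main(void)
    {
        assert(mask_of(3, 8) == ~0UL);  /* in bounds  -> all ones */
        assert(mask_of(8, 8) == 0);     /* off-by-one -> zero     */
        assert(mask_of(~0UL, 8) == 0);  /* wild index -> zero     */
        return 0;
    }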
+diff --git a/xen/include/asm-x86/x86-defns.h b/xen/include/asm-x86/x86-defns.h
+index 70453e8dfb..10b366a07d 100644
+--- a/xen/include/asm-x86/x86-defns.h
++++ b/xen/include/asm-x86/x86-defns.h
+@@ -42,6 +42,13 @@
+ #define X86_CR0_CD 0x40000000 /* Cache Disable (RW) */
+ #define X86_CR0_PG 0x80000000 /* Paging (RW) */
+
++/*
++ * Intel CPU flags in CR3
++ */
++#define X86_CR3_NOFLUSH (_AC(1, ULL) << 63)
++#define X86_CR3_ADDR_MASK (PAGE_MASK & PADDR_MASK)
++#define X86_CR3_PCID_MASK _AC(0x0fff, ULL) /* Mask for PCID */
++
+ /*
+ * Intel CPU features in CR4
+ */
+diff --git a/xen/include/asm-x86/xstate.h b/xen/include/asm-x86/xstate.h
+index d36f422b59..9ba2a04c74 100644
+--- a/xen/include/asm-x86/xstate.h
++++ b/xen/include/asm-x86/xstate.h
+@@ -116,8 +116,9 @@ void xsave(struct vcpu *v, uint64_t mask);
+ void xrstor(struct vcpu *v, uint64_t mask);
+ void xstate_set_init(uint64_t mask);
+ bool xsave_enabled(const struct vcpu *v);
+-int __must_check validate_xstate(u64 xcr0, u64 xcr0_accum,
+- const struct xsave_hdr *);
++int __must_check validate_xstate(const struct domain *d,
++ uint64_t xcr0, uint64_t xcr0_accum,
++ const struct xsave_hdr *hdr);
+ int __must_check handle_xsetbv(u32 index, u64 new_bv);
+ void expand_xsave_states(struct vcpu *v, void *dest, unsigned int size);
+ void compress_xsave_states(struct vcpu *v, const void *src, unsigned int size);
+diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h
+index 8da5783f7a..6c82816fd3 100644
+--- a/xen/include/public/arch-x86/cpufeatureset.h
++++ b/xen/include/public/arch-x86/cpufeatureset.h
+@@ -243,8 +243,10 @@ XEN_CPUFEATURE(IBPB, 8*32+12) /*A IBPB support only (no IBRS, used by
+ XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A AVX512 Neural Network Instructions */
+ XEN_CPUFEATURE(AVX512_4FMAPS, 9*32+ 3) /*A AVX512 Multiply Accumulation Single Precision */
+ XEN_CPUFEATURE(IBRSB, 9*32+26) /*A IBRS and IBPB support (used by Intel) */
+-XEN_CPUFEATURE(STIBP, 9*32+27) /*A! STIBP */
++XEN_CPUFEATURE(STIBP, 9*32+27) /*A STIBP */
++XEN_CPUFEATURE(L1D_FLUSH, 9*32+28) /*S MSR_FLUSH_CMD and L1D flush. */
+ XEN_CPUFEATURE(ARCH_CAPS, 9*32+29) /* IA32_ARCH_CAPABILITIES MSR */
++XEN_CPUFEATURE(SSBD, 9*32+31) /*A MSR_SPEC_CTRL.SSBD available */
+
+ #endif /* XEN_CPUFEATURE */
+
+diff --git a/xen/include/xen/compiler.h b/xen/include/xen/compiler.h
+index 533a8ea0f3..a7e05681c9 100644
+--- a/xen/include/xen/compiler.h
++++ b/xen/include/xen/compiler.h
+@@ -81,6 +81,9 @@
+ #pragma GCC visibility push(hidden)
+ #endif
+
++/* Make the optimizer believe the variable can be manipulated arbitrarily. */
++#define OPTIMIZER_HIDE_VAR(var) __asm__ ( "" : "+g" (var) )
++
+ /* This macro obfuscates arithmetic on a variable address so that gcc
+ shouldn't recognize the original var, and make assumptions about it */
+ /*
+diff --git a/xen/include/xen/cpu.h b/xen/include/xen/cpu.h
+index ffefc09f8e..2fe3ec05d8 100644
+--- a/xen/include/xen/cpu.h
++++ b/xen/include/xen/cpu.h
+@@ -47,6 +47,8 @@ void register_cpu_notifier(struct notifier_block *nb);
+ #define CPU_DYING (0x0007 | NOTIFY_REVERSE)
+ /* CPU_DEAD: CPU is dead. */
+ #define CPU_DEAD (0x0008 | NOTIFY_REVERSE)
++/* CPU_REMOVE: CPU was removed. */
++#define CPU_REMOVE (0x0009 | NOTIFY_REVERSE)
+
+ /* Perform CPU hotplug. May return -EAGAIN. */
+ int cpu_down(unsigned int cpu);
+diff --git a/xen/include/xen/cpumask.h b/xen/include/xen/cpumask.h
+index 3f340d619a..ee4399865a 100644
+--- a/xen/include/xen/cpumask.h
++++ b/xen/include/xen/cpumask.h
+@@ -349,16 +349,35 @@ static inline bool_t alloc_cpumask_var(cpumask_var_t *mask)
+ return *mask != NULL;
+ }
+
++static inline bool cond_alloc_cpumask_var(cpumask_var_t *mask)
++{
++ if (*mask == NULL)
++ *mask = _xmalloc(nr_cpumask_bits / 8, sizeof(long));
++ return *mask != NULL;
++}
++
+ static inline bool_t zalloc_cpumask_var(cpumask_var_t *mask)
+ {
+ *(void **)mask = _xzalloc(nr_cpumask_bits / 8, sizeof(long));
+ return *mask != NULL;
+ }
+
++static inline bool cond_zalloc_cpumask_var(cpumask_var_t *mask)
++{
++ if (*mask == NULL)
++ *mask = _xzalloc(nr_cpumask_bits / 8, sizeof(long));
++ else
++ cpumask_clear(*mask);
++ return *mask != NULL;
++}
++
+ static inline void free_cpumask_var(cpumask_var_t mask)
+ {
+ xfree(mask);
+ }
++
++/* Free an allocated mask, and zero the pointer to it. */
++#define FREE_CPUMASK_VAR(m) XFREE(m)
+ #else
+ typedef cpumask_t cpumask_var_t[1];
+
+@@ -366,16 +385,20 @@ static inline bool_t alloc_cpumask_var(cpumask_var_t *mask)
+ {
+ return 1;
+ }
++#define cond_alloc_cpumask_var alloc_cpumask_var
+
+ static inline bool_t zalloc_cpumask_var(cpumask_var_t *mask)
+ {
+ cpumask_clear(*mask);
+ return 1;
+ }
++#define cond_zalloc_cpumask_var zalloc_cpumask_var
+
+ static inline void free_cpumask_var(cpumask_var_t mask)
+ {
+ }
++
++#define FREE_CPUMASK_VAR(m) free_cpumask_var(m)
+ #endif
+
+ #if NR_CPUS > 1
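The cond_*_cpumask_var() variants above allocate only on first use and otherwise reuse (and, for the zalloc form, clear) an existing mask, which matters once offlined CPUs can be parked and brought back without their per-CPU state being freed. A hedged sketch of the intended pattern, with 'mask' assumed to be a caller-owned cpumask_var_t:

    /* Sketch only: safe both on first onlining and on re-onlining. */
    if ( !cond_zalloc_cpumask_var(&mask) )
        return -ENOMEM;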
+diff --git a/xen/include/xen/list.h b/xen/include/xen/list.h
+index fa07d720ee..1387abb211 100644
+--- a/xen/include/xen/list.h
++++ b/xen/include/xen/list.h
+@@ -51,6 +51,11 @@ static inline void INIT_LIST_HEAD(struct list_head *list)
+ list->prev = list;
+ }
+
++static inline bool list_head_is_null(const struct list_head *list)
++{
++ return !list->next && !list->prev;
++}
++
+ /*
+ * Insert a new entry between two known consecutive entries.
+ *
+diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
+index e813c07b22..fdcb90841a 100644
+--- a/xen/include/xen/mm.h
++++ b/xen/include/xen/mm.h
+@@ -162,6 +162,14 @@ void free_xenheap_pages(void *v, unsigned int order);
+ bool scrub_free_pages(void);
+ #define alloc_xenheap_page() (alloc_xenheap_pages(0,0))
+ #define free_xenheap_page(v) (free_xenheap_pages(v,0))
++
++/* Free an allocation, and zero the pointer to it. */
++#define FREE_XENHEAP_PAGES(p, o) do { \
++ free_xenheap_pages(p, o); \
++ (p) = NULL; \
++} while ( false )
++#define FREE_XENHEAP_PAGE(p) FREE_XENHEAP_PAGES(p, 0)
++
+ /* Map machine page range in Xen virtual address space. */
+ int map_pages_to_xen(
+ unsigned long virt,
+diff --git a/xen/include/xen/nospec.h b/xen/include/xen/nospec.h
+new file mode 100644
+index 0000000000..48793996e8
+--- /dev/null
++++ b/xen/include/xen/nospec.h
+@@ -0,0 +1,70 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/* Copyright(c) 2018 Linus Torvalds. All rights reserved. */
++/* Copyright(c) 2018 Alexei Starovoitov. All rights reserved. */
++/* Copyright(c) 2018 Intel Corporation. All rights reserved. */
++/* Copyright(c) 2018 Citrix Systems R&D Ltd. All rights reserved. */
++
++#ifndef XEN_NOSPEC_H
++#define XEN_NOSPEC_H
++
++#include <asm/system.h>
++
++/**
++ * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise
++ * @index: array element index
++ * @size: number of elements in array
++ *
++ * When @index is out of bounds (@index >= @size), the sign bit will be
++ * set. Extend the sign bit to all bits and invert, giving a result of
++ * zero for an out of bounds index, or ~0 if within bounds [0, @size).
++ */
++#ifndef array_index_mask_nospec
++static inline unsigned long array_index_mask_nospec(unsigned long index,
++ unsigned long size)
++{
++ /*
++ * Always calculate and emit the mask even if the compiler
++ * thinks the mask is not needed. The compiler does not take
++ * into account the value of @index under speculation.
++ */
++ OPTIMIZER_HIDE_VAR(index);
++ return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
++}
++#endif
++
++/*
++ * array_index_nospec - sanitize an array index after a bounds check
++ *
++ * For a code sequence like:
++ *
++ * if (index < size) {
++ * index = array_index_nospec(index, size);
++ * val = array[index];
++ * }
++ *
++ * ...if the CPU speculates past the bounds check then
++ * array_index_nospec() will clamp the index within the range of [0,
++ * size).
++ */
++#define array_index_nospec(index, size) \
++({ \
++ typeof(index) _i = (index); \
++ typeof(size) _s = (size); \
++ unsigned long _mask = array_index_mask_nospec(_i, _s); \
++ \
++ BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \
++ BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \
++ \
++ (typeof(_i)) (_i & _mask); \
++})
++
++#endif /* XEN_NOSPEC_H */
++
++/*
++ * Local variables:
++ * mode: C
++ * c-file-style: "BSD"
++ * c-basic-offset: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
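A short illustration of the pattern described in the comment above, using the macro exactly as defined; the table and bound are hypothetical and not taken from Xen:

    /* Sketch only: clamp an attacker-influenced index after the bounds
     * check, so a speculative out-of-bounds access reads element 0 at
     * worst instead of arbitrary memory. */
    static uint64_t example_lookup(const uint64_t *table,
                                   unsigned int nr_entries, unsigned int idx)
    {
        if ( idx >= nr_entries )
            return 0;

        idx = array_index_nospec(idx, nr_entries);

        return table[idx];
    }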
+diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
+index 2541ecb04f..eaa83dc97e 100644
+--- a/xen/include/xen/sched.h
++++ b/xen/include/xen/sched.h
+@@ -796,7 +796,7 @@ static inline struct domain *next_domain_in_cpupool(
+ #define _VPF_parked 8
+ #define VPF_parked (1UL<<_VPF_parked)
+
+-static inline int vcpu_runnable(struct vcpu *v)
++static inline bool vcpu_runnable(const struct vcpu *v)
+ {
+ return !(v->pause_flags |
+ atomic_read(&v->pause_count) |
+diff --git a/xen/include/xen/tasklet.h b/xen/include/xen/tasklet.h
+index 23d69c738e..bc9ddace6d 100644
+--- a/xen/include/xen/tasklet.h
++++ b/xen/include/xen/tasklet.h
+@@ -50,6 +50,11 @@ static inline bool tasklet_work_to_do(unsigned int cpu)
+ TASKLET_scheduled);
+ }
+
++static inline bool tasklet_is_scheduled(const struct tasklet *t)
++{
++ return t->scheduled_on != -1;
++}
++
+ void tasklet_schedule_on_cpu(struct tasklet *t, unsigned int cpu);
+ void tasklet_schedule(struct tasklet *t);
+ void do_tasklet(void);
+diff --git a/xen/include/xen/xmalloc.h b/xen/include/xen/xmalloc.h
+index cc2673d8ae..9aa5edf593 100644
+--- a/xen/include/xen/xmalloc.h
++++ b/xen/include/xen/xmalloc.h
+@@ -26,6 +26,12 @@
+ /* Free any of the above. */
+ extern void xfree(void *);
+
++/* Free an allocation, and zero the pointer to it. */
++#define XFREE(p) do { \
++ xfree(p); \
++ (p) = NULL; \
++} while ( false )
++
+ /* Underlying functions */
+ extern void *_xmalloc(unsigned long size, unsigned long align);
+ extern void *_xzalloc(unsigned long size, unsigned long align);
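XFREE() (like the FREE_XENHEAP_PAGE* and FREE_CPUMASK_VAR wrappers added elsewhere in this patch) differs from plain xfree() only in also clearing the pointer, so a later stale use becomes a NULL dereference rather than a use-after-free. A minimal sketch:

    void *p = _xmalloc(64, 8);   /* any allocation */

    /* ... use p ... */

    XFREE(p);                    /* frees the memory ...            */
    ASSERT(p == NULL);           /* ... and clears the pointer too. */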
+diff --git a/xen/tools/gen-cpuid.py b/xen/tools/gen-cpuid.py
+index 613b909c3d..65526ff120 100755
+--- a/xen/tools/gen-cpuid.py
++++ b/xen/tools/gen-cpuid.py
+@@ -257,10 +257,19 @@ def crunch_numbers(state):
+ AVX512BW, AVX512VL, AVX512VBMI, AVX512_4VNNIW,
+ AVX512_4FMAPS, AVX512_VPOPCNTDQ],
+
+- # Single Thread Indirect Branch Predictors enumerates a new bit in the
+- # MSR enumerated by Indirect Branch Restricted Speculation/Indirect
+- # Branch Prediction Barrier enumeration.
+- IBRSB: [STIBP],
++ # The features:
++ # * Single Thread Indirect Branch Predictors
++ # * Speculative Store Bypass Disable
++ #
++ # enumerate new bits in MSR_SPEC_CTRL, which is enumerated by Indirect
++ # Branch Restricted Speculation/Indirect Branch Prediction Barrier.
++ #
++ # In practice, these features also enumerate the presence of
++ # MSR_SPEC_CTRL. However, no real hardware will exist with SSBD but
++ # not IBRSB, and we pass this MSR directly to guests. Treating them
++ # as dependent features simplifies Xen's logic, and prevents the guest
++ # from seeing implausible configurations.
++ IBRSB: [STIBP, SSBD],
+ }
+
+ deep_features = tuple(sorted(deps.keys()))
diff --git a/main/xen/xsa260-1.patch b/main/xen/xsa260-1.patch
deleted file mode 100644
index 21da59cddd6..00000000000
--- a/main/xen/xsa260-1.patch
+++ /dev/null
@@ -1,72 +0,0 @@
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Subject: x86/traps: Fix %dr6 handling in #DB handler
-
-Most bits in %dr6 accumulate, rather than being set directly based on the
-current source of #DB. Have the handler follow the manual's guidance, which
-avoids leaking hypervisor debugging activities into guest context.
-
-This is part of XSA-260 / CVE-2018-8897.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-
---- a/xen/arch/x86/traps.c 2018-04-13 15:29:36.006747135 +0200
-+++ b/xen/arch/x86/traps.c 2018-04-13 15:44:57.015516185 +0200
-@@ -1761,11 +1761,36 @@ static void ler_enable(void)
-
- void do_debug(struct cpu_user_regs *regs)
- {
-+ unsigned long dr6;
- struct vcpu *v = current;
-
-+ /* Stash dr6 as early as possible. */
-+ dr6 = read_debugreg(6);
-+
- if ( debugger_trap_entry(TRAP_debug, regs) )
- return;
-
-+ /*
-+ * At the time of writing (March 2018), on the subject of %dr6:
-+ *
-+ * The Intel manual says:
-+ * Certain debug exceptions may clear bits 0-3. The remaining contents
-+ * of the DR6 register are never cleared by the processor. To avoid
-+ * confusion in identifying debug exceptions, debug handlers should
-+ * clear the register (except bit 16, which they should set) before
-+ * returning to the interrupted task.
-+ *
-+ * The AMD manual says:
-+ * Bits 15:13 of the DR6 register are not cleared by the processor and
-+ * must be cleared by software after the contents have been read.
-+ *
-+ * Some bits are reserved set, some are reserved clear, and some bits
-+ * which were previously reserved set are reused and cleared by hardware.
-+ * For future compatibility, reset to the default value, which will allow
-+ * us to spot any bit being changed by hardware to its non-default value.
-+ */
-+ write_debugreg(6, X86_DR6_DEFAULT);
-+
- if ( !guest_mode(regs) )
- {
- if ( regs->eflags & X86_EFLAGS_TF )
-@@ -1798,7 +1823,8 @@ void do_debug(struct cpu_user_regs *regs
- }
-
- /* Save debug status register where guest OS can peek at it */
-- v->arch.debugreg[6] = read_debugreg(6);
-+ v->arch.debugreg[6] |= (dr6 & ~X86_DR6_DEFAULT);
-+ v->arch.debugreg[6] &= (dr6 | ~X86_DR6_DEFAULT);
-
- ler_enable();
- pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
---- a/xen/include/asm-x86/debugreg.h 2015-02-11 09:36:29.000000000 +0100
-+++ b/xen/include/asm-x86/debugreg.h 2018-04-13 15:44:57.015516185 +0200
-@@ -24,6 +24,8 @@
- #define DR_STATUS_RESERVED_ZERO (~0xffffeffful) /* Reserved, read as zero */
- #define DR_STATUS_RESERVED_ONE 0xffff0ff0ul /* Reserved, read as one */
-
-+#define X86_DR6_DEFAULT 0xffff0ff0ul /* Default %dr6 value. */
-+
- /* Now define a bunch of things for manipulating the control register.
- The top two bytes of the control register consist of 4 fields of 4
- bits - each field corresponds to one of the four debug registers,
diff --git a/main/xen/xsa260-2.patch b/main/xen/xsa260-2.patch
deleted file mode 100644
index be71b2438f5..00000000000
--- a/main/xen/xsa260-2.patch
+++ /dev/null
@@ -1,110 +0,0 @@
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Subject: x86/pv: Move exception injection into {,compat_}test_all_events()
-
-This allows paths to jump straight to {,compat_}test_all_events() and have
-injection of pending exceptions happen automatically, rather than requiring
-all calling paths to handle exceptions themselves.
-
-The normal exception path is simplified as a result, and
-compat_post_handle_exception() is removed entirely.
-
-This is part of XSA-260 / CVE-2018-8897.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-
---- a/xen/arch/x86/x86_64/compat/entry.S
-+++ b/xen/arch/x86/x86_64/compat/entry.S
-@@ -39,6 +39,12 @@ ENTRY(compat_test_all_events)
- leaq irq_stat+IRQSTAT_softirq_pending(%rip),%rcx
- cmpl $0,(%rcx,%rax,1)
- jne compat_process_softirqs
-+
-+ /* Inject exception if pending. */
-+ lea VCPU_trap_bounce(%rbx), %rdx
-+ testb $TBF_EXCEPTION, TRAPBOUNCE_flags(%rdx)
-+ jnz .Lcompat_process_trapbounce
-+
- testb $1,VCPU_mce_pending(%rbx)
- jnz compat_process_mce
- .Lcompat_test_guest_nmi:
-@@ -68,6 +74,15 @@ compat_process_softirqs:
- call do_softirq
- jmp compat_test_all_events
-
-+ ALIGN
-+/* %rbx: struct vcpu, %rdx: struct trap_bounce */
-+.Lcompat_process_trapbounce:
-+ sti
-+.Lcompat_bounce_exception:
-+ call compat_create_bounce_frame
-+ movb $0, TRAPBOUNCE_flags(%rdx)
-+ jmp compat_test_all_events
-+
- ALIGN
- /* %rbx: struct vcpu */
- compat_process_mce:
-@@ -189,15 +204,6 @@ ENTRY(cr4_pv32_restore)
- xor %eax, %eax
- ret
-
--/* %rdx: trap_bounce, %rbx: struct vcpu */
--ENTRY(compat_post_handle_exception)
-- testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx)
-- jz compat_test_all_events
--.Lcompat_bounce_exception:
-- call compat_create_bounce_frame
-- movb $0,TRAPBOUNCE_flags(%rdx)
-- jmp compat_test_all_events
--
- .section .text.entry, "ax", @progbits
-
- /* See lstar_enter for entry register state. */
---- a/xen/arch/x86/x86_64/entry.S
-+++ b/xen/arch/x86/x86_64/entry.S
-@@ -42,6 +42,12 @@ test_all_events:
- leaq irq_stat+IRQSTAT_softirq_pending(%rip), %rcx
- cmpl $0, (%rcx, %rax, 1)
- jne process_softirqs
-+
-+ /* Inject exception if pending. */
-+ lea VCPU_trap_bounce(%rbx), %rdx
-+ testb $TBF_EXCEPTION, TRAPBOUNCE_flags(%rdx)
-+ jnz .Lprocess_trapbounce
-+
- cmpb $0, VCPU_mce_pending(%rbx)
- jne process_mce
- .Ltest_guest_nmi:
-@@ -70,6 +76,15 @@ process_softirqs:
- jmp test_all_events
-
- ALIGN
-+/* %rbx: struct vcpu, %rdx struct trap_bounce */
-+.Lprocess_trapbounce:
-+ sti
-+.Lbounce_exception:
-+ call create_bounce_frame
-+ movb $0, TRAPBOUNCE_flags(%rdx)
-+ jmp test_all_events
-+
-+ ALIGN
- /* %rbx: struct vcpu */
- process_mce:
- testb $1 << VCPU_TRAP_MCE, VCPU_async_exception_mask(%rbx)
-@@ -667,15 +682,9 @@ handle_exception_saved:
- mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
- testb $3,UREGS_cs(%rsp)
- jz restore_all_xen
-- leaq VCPU_trap_bounce(%rbx),%rdx
- movq VCPU_domain(%rbx),%rax
- testb $1,DOMAIN_is_32bit_pv(%rax)
-- jnz compat_post_handle_exception
-- testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx)
-- jz test_all_events
--.Lbounce_exception:
-- call create_bounce_frame
-- movb $0,TRAPBOUNCE_flags(%rdx)
-+ jnz compat_test_all_events
- jmp test_all_events
-
- /* No special register assumptions. */
diff --git a/main/xen/xsa260-3.patch b/main/xen/xsa260-3.patch
deleted file mode 100644
index f0a0a5687dc..00000000000
--- a/main/xen/xsa260-3.patch
+++ /dev/null
@@ -1,138 +0,0 @@
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Subject: x86/traps: Use an Interrupt Stack Table for #DB
-
-PV guests can use architectural corner cases to cause #DB to be raised after
-transitioning into supervisor mode.
-
-Use an interrupt stack table for #DB to prevent the exception being taken with
-a guest controlled stack pointer.
-
-This is part of XSA-260 / CVE-2018-8897.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-
---- a/xen/arch/x86/cpu/common.c
-+++ b/xen/arch/x86/cpu/common.c
-@@ -679,6 +679,7 @@ void load_system_tables(void)
- [IST_MCE - 1] = stack_top + IST_MCE * PAGE_SIZE,
- [IST_DF - 1] = stack_top + IST_DF * PAGE_SIZE,
- [IST_NMI - 1] = stack_top + IST_NMI * PAGE_SIZE,
-+ [IST_DB - 1] = stack_top + IST_DB * PAGE_SIZE,
-
- [IST_MAX ... ARRAY_SIZE(tss->ist) - 1] =
- 0x8600111111111111ul,
-@@ -706,6 +707,7 @@ void load_system_tables(void)
- set_ist(&idt_tables[cpu][TRAP_double_fault], IST_DF);
- set_ist(&idt_tables[cpu][TRAP_nmi], IST_NMI);
- set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE);
-+ set_ist(&idt_tables[cpu][TRAP_debug], IST_DB);
-
- /*
- * Bottom-of-stack must be 16-byte aligned!
---- a/xen/arch/x86/hvm/svm/svm.c
-+++ b/xen/arch/x86/hvm/svm/svm.c
-@@ -1046,6 +1046,7 @@ static void svm_ctxt_switch_from(struct
- set_ist(&idt_tables[cpu][TRAP_double_fault], IST_DF);
- set_ist(&idt_tables[cpu][TRAP_nmi], IST_NMI);
- set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE);
-+ set_ist(&idt_tables[cpu][TRAP_debug], IST_DB);
- }
-
- static void svm_ctxt_switch_to(struct vcpu *v)
-@@ -1067,6 +1068,7 @@ static void svm_ctxt_switch_to(struct vc
- set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE);
- set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE);
- set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
-+ set_ist(&idt_tables[cpu][TRAP_debug], IST_NONE);
-
- svm_restore_dr(v);
-
---- a/xen/arch/x86/smpboot.c
-+++ b/xen/arch/x86/smpboot.c
-@@ -964,6 +964,7 @@ static int cpu_smpboot_alloc(unsigned in
- set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE);
- set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE);
- set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
-+ set_ist(&idt_tables[cpu][TRAP_debug], IST_NONE);
-
- for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
- i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )
---- a/xen/arch/x86/traps.c
-+++ b/xen/arch/x86/traps.c
-@@ -325,13 +325,13 @@ static void show_guest_stack(struct vcpu
- /*
- * Notes for get_stack_trace_bottom() and get_stack_dump_bottom()
- *
-- * Stack pages 0, 1 and 2:
-+ * Stack pages 0 - 3:
- * These are all 1-page IST stacks. Each of these stacks have an exception
- * frame and saved register state at the top. The interesting bound for a
- * trace is the word adjacent to this, while the bound for a dump is the
- * very top, including the exception frame.
- *
-- * Stack pages 3, 4 and 5:
-+ * Stack pages 4 and 5:
- * None of these are particularly interesting. With MEMORY_GUARD, page 5 is
- * explicitly not present, so attempting to dump or trace it is
- * counterproductive. Without MEMORY_GUARD, it is possible for a call chain
-@@ -352,12 +352,12 @@ unsigned long get_stack_trace_bottom(uns
- {
- switch ( get_stack_page(sp) )
- {
-- case 0 ... 2:
-+ case 0 ... 3:
- return ROUNDUP(sp, PAGE_SIZE) -
- offsetof(struct cpu_user_regs, es) - sizeof(unsigned long);
-
- #ifndef MEMORY_GUARD
-- case 3 ... 5:
-+ case 4 ... 5:
- #endif
- case 6 ... 7:
- return ROUNDUP(sp, STACK_SIZE) -
-@@ -372,11 +372,11 @@ unsigned long get_stack_dump_bottom(unsi
- {
- switch ( get_stack_page(sp) )
- {
-- case 0 ... 2:
-+ case 0 ... 3:
- return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long);
-
- #ifndef MEMORY_GUARD
-- case 3 ... 5:
-+ case 4 ... 5:
- #endif
- case 6 ... 7:
- return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long);
-@@ -1943,6 +1943,7 @@ void __init init_idt_traps(void)
- set_ist(&idt_table[TRAP_double_fault], IST_DF);
- set_ist(&idt_table[TRAP_nmi], IST_NMI);
- set_ist(&idt_table[TRAP_machine_check], IST_MCE);
-+ set_ist(&idt_table[TRAP_debug], IST_DB);
-
- /* CPU0 uses the master IDT. */
- idt_tables[0] = idt_table;
---- a/xen/arch/x86/x86_64/entry.S
-+++ b/xen/arch/x86/x86_64/entry.S
-@@ -739,7 +739,7 @@ ENTRY(device_not_available)
- ENTRY(debug)
- pushq $0
- movl $TRAP_debug,4(%rsp)
-- jmp handle_exception
-+ jmp handle_ist_exception
-
- ENTRY(int3)
- pushq $0
---- a/xen/include/asm-x86/processor.h
-+++ b/xen/include/asm-x86/processor.h
-@@ -443,7 +443,8 @@ struct __packed __cacheline_aligned tss_
- #define IST_DF 1UL
- #define IST_NMI 2UL
- #define IST_MCE 3UL
--#define IST_MAX 3UL
-+#define IST_DB 4UL
-+#define IST_MAX 4UL
-
- /* Set the interrupt stack table used by a particular interrupt
- * descriptor table entry. */
diff --git a/main/xen/xsa260-4.patch b/main/xen/xsa260-4.patch
deleted file mode 100644
index c2fa02d6e12..00000000000
--- a/main/xen/xsa260-4.patch
+++ /dev/null
@@ -1,72 +0,0 @@
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Subject: x86/traps: Fix handling of #DB exceptions in hypervisor context
-
-The WARN_ON() can be triggered by guest activities, and emits a full stack
-trace without rate limiting. Swap it out for a ratelimited printk with just
-enough information to work out what is going on.
-
-Not all #DB exceptions are traps, so blindly continuing is not a safe action
-to take. We don't let PV guests select these settings in the real %dr7 to
-begin with, but for added safety against unexpected situations, detect the
-fault cases and crash in an obvious manner.
-
-This is part of XSA-260 / CVE-2018-8897.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-
---- a/xen/arch/x86/traps.c
-+++ b/xen/arch/x86/traps.c
-@@ -1809,16 +1809,44 @@ void do_debug(struct cpu_user_regs *regs
- regs->eflags &= ~X86_EFLAGS_TF;
- }
- }
-- else
-+
-+ /*
-+ * Check for fault conditions. General Detect, and instruction
-+ * breakpoints are faults rather than traps, at which point attempting
-+ * to ignore and continue will result in a livelock.
-+ */
-+ if ( dr6 & DR_GENERAL_DETECT )
-+ {
-+ printk(XENLOG_ERR "Hit General Detect in Xen context\n");
-+ fatal_trap(regs, 0);
-+ }
-+
-+ if ( dr6 & (DR_TRAP3 | DR_TRAP2 | DR_TRAP1 | DR_TRAP0) )
- {
-- /*
-- * We ignore watchpoints when they trigger within Xen. This may
-- * happen when a buffer is passed to us which previously had a
-- * watchpoint set on it. No need to bump EIP; the only faulting
-- * trap is an instruction breakpoint, which can't happen to us.
-- */
-- WARN_ON(!search_exception_table(regs));
-+ unsigned int bp, dr7 = read_debugreg(7) >> DR_CONTROL_SHIFT;
-+
-+ for ( bp = 0; bp < 4; ++bp )
-+ {
-+ if ( (dr6 & (1u << bp)) && /* Breakpoint triggered? */
-+ ((dr7 & (3u << (bp * DR_CONTROL_SIZE))) == 0) /* Insn? */ )
-+ {
-+ printk(XENLOG_ERR
-+ "Hit instruction breakpoint in Xen context\n");
-+ fatal_trap(regs, 0);
-+ }
-+ }
- }
-+
-+ /*
-+ * Whatever caused this #DB should be a trap. Note it and continue.
-+ * Guests can trigger this in certain corner cases, so ensure the
-+ * message is ratelimited.
-+ */
-+ gprintk(XENLOG_WARNING,
-+ "Hit #DB in Xen context: %04x:%p [%ps], stk %04x:%p, dr6 %lx\n",
-+ regs->cs, _p(regs->rip), _p(regs->rip),
-+ regs->ss, _p(regs->rsp), dr6);
-+
- goto out;
- }
-
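
The replacement logic above distinguishes #DB fault conditions (General Detect and instruction breakpoints) from trap conditions before deciding whether to continue. A compact, stand-alone model of that classification is sketched below; the constants reflect the architectural DR6/DR7 layout, but the helper itself is illustrative rather than Xen's code:

/* Model of the fault-vs-trap classification added to do_debug(). */
#include <stdbool.h>

#define DR6_BP(n)           (1ul << (n))   /* B0-B3: breakpoint n triggered */
#define DR6_GENERAL_DETECT  (1ul << 13)    /* BD: debug-register access hit */
#define DR7_CONTROL_SHIFT   16             /* R/W + LEN fields start here   */
#define DR7_CONTROL_SIZE    4              /* 4 control bits per breakpoint */

/* Returns true if this #DB is a fault condition (instruction breakpoint or
 * General Detect), i.e. one that would livelock if simply ignored. */
static bool db_is_fault(unsigned long dr6, unsigned long dr7)
{
    unsigned int bp;

    if ( dr6 & DR6_GENERAL_DETECT )
        return true;

    for ( bp = 0; bp < 4; ++bp )
    {
        unsigned long rw =
            (dr7 >> (DR7_CONTROL_SHIFT + bp * DR7_CONTROL_SIZE)) & 3;

        /* R/W bits of 00 select an instruction-execution breakpoint,
         * which is reported as a fault rather than a trap. */
        if ( (dr6 & DR6_BP(bp)) && rw == 0 )
            return true;
    }

    return false;
}

With this split, trap-style #DB hits in Xen context can be logged (ratelimited) and continued, while fault-style hits are escalated to fatal_trap() instead of livelocking.
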
diff --git a/main/xen/xsa261.patch b/main/xen/xsa261.patch
deleted file mode 100644
index a51744b8d09..00000000000
--- a/main/xen/xsa261.patch
+++ /dev/null
@@ -1,279 +0,0 @@
-From: Xen Project Security Team <security@xenproject.org>
-Subject: x86/vpt: add support for IO-APIC routed interrupts
-
-And modify the HPET code to make use of it. Currently HPET interrupts
-are always treated as ISA and thus injected through the vPIC. This is
-wrong because HPET interrupts when not in legacy mode should be
-injected from the IO-APIC.
-
-To make things worse, the supported interrupt routing values are set
-to [20..23], which clearly falls outside of the ISA range, thus
-leading to an ASSERT in debug builds or memory corruption in non-debug
-builds because the interrupt injection code will write out of the
-bounds of the arch.hvm_domain.vpic array.
-
-Since the HPET interrupt source can change between ISA and IO-APIC
-always destroy the timer before changing the mode, or else Xen risks
-changing it while the timer is active.
-
-Note that vpt interrupt injection is racy in the sense that the
-vIO-APIC RTE entry can be written by the guest in between the call to
-pt_irq_masked and hvm_ioapic_assert, or the call to pt_update_irq and
-pt_intr_post. Those are not deemed to be security issues, but rather
-quirks of the current implementation. In the worst case the guest
-might lose interrupts or get multiple interrupt vectors injected for
-the same timer source.
-
-This is part of XSA-261.
-
-Address actual and potential compiler warnings. Fix formatting.
-
-Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
-Signed-off-by: Jan Beulich <jbeulich@suse.com>
----
-Changes since v2:
- - Move fallthrough comment to be just above the case label.
- - Fix now stale comment in pt_update_irq.
- - Use NR_ISAIRQS instead of 16.
- - Expand commit message to mention the quirkiness of vpt interrupt
- injection.
-
-Changes since v1:
- - Simplify usage of gsi in pt_irq_masked.
- - Introduce hvm_ioapic_assert.
- - Fix pt->source == PTSRC_isa in create_periodic_time.
-
---- a/xen/arch/x86/hvm/hpet.c
-+++ b/xen/arch/x86/hvm/hpet.c
-@@ -264,13 +264,20 @@ static void hpet_set_timer(HPETState *h,
- diff = (timer_is_32bit(h, tn) && (-diff > HPET_TINY_TIME_SPAN))
- ? (uint32_t)diff : 0;
-
-+ destroy_periodic_time(&h->pt[tn]);
- if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) )
-+ {
- /* if LegacyReplacementRoute bit is set, HPET specification requires
- timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC,
- timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */
- irq = (tn == 0) ? 0 : 8;
-+ h->pt[tn].source = PTSRC_isa;
-+ }
- else
-+ {
- irq = timer_int_route(h, tn);
-+ h->pt[tn].source = PTSRC_ioapic;
-+ }
-
- /*
- * diff is the time from now when the timer should fire, for a periodic
---- a/xen/arch/x86/hvm/irq.c
-+++ b/xen/arch/x86/hvm/irq.c
-@@ -41,6 +41,26 @@ static void assert_gsi(struct domain *d,
- vioapic_irq_positive_edge(d, ioapic_gsi);
- }
-
-+int hvm_ioapic_assert(struct domain *d, unsigned int gsi, bool level)
-+{
-+ struct hvm_irq *hvm_irq = hvm_domain_irq(d);
-+ int vector;
-+
-+ if ( gsi >= hvm_irq->nr_gsis )
-+ {
-+ ASSERT_UNREACHABLE();
-+ return -1;
-+ }
-+
-+ spin_lock(&d->arch.hvm_domain.irq_lock);
-+ if ( !level || hvm_irq->gsi_assert_count[gsi]++ == 0 )
-+ assert_gsi(d, gsi);
-+ vector = vioapic_get_vector(d, gsi);
-+ spin_unlock(&d->arch.hvm_domain.irq_lock);
-+
-+ return vector;
-+}
-+
- static void assert_irq(struct domain *d, unsigned ioapic_gsi, unsigned pic_irq)
- {
- assert_gsi(d, ioapic_gsi);
---- a/xen/arch/x86/hvm/vpt.c
-+++ b/xen/arch/x86/hvm/vpt.c
-@@ -107,31 +107,49 @@ static int pt_irq_vector(struct periodic
- static int pt_irq_masked(struct periodic_time *pt)
- {
- struct vcpu *v = pt->vcpu;
-- unsigned int gsi, isa_irq;
-- int mask;
-- uint8_t pic_imr;
-+ unsigned int gsi = pt->irq;
-
-- if ( pt->source == PTSRC_lapic )
-+ switch ( pt->source )
-+ {
-+ case PTSRC_lapic:
- {
- struct vlapic *vlapic = vcpu_vlapic(v);
-+
- return (!vlapic_enabled(vlapic) ||
- (vlapic_get_reg(vlapic, APIC_LVTT) & APIC_LVT_MASKED));
- }
-
-- isa_irq = pt->irq;
-- gsi = hvm_isa_irq_to_gsi(isa_irq);
-- pic_imr = v->domain->arch.hvm_domain.vpic[isa_irq >> 3].imr;
-- mask = vioapic_get_mask(v->domain, gsi);
-- if ( mask < 0 )
-- {
-- dprintk(XENLOG_WARNING, "d%u: invalid GSI (%u) for platform timer\n",
-- v->domain->domain_id, gsi);
-- domain_crash(v->domain);
-- return -1;
-+ case PTSRC_isa:
-+ {
-+ uint8_t pic_imr = v->domain->arch.hvm_domain.vpic[pt->irq >> 3].imr;
-+
-+ /* Check if the interrupt is unmasked in the PIC. */
-+ if ( !(pic_imr & (1 << (pt->irq & 7))) && vlapic_accept_pic_intr(v) )
-+ return 0;
-+
-+ gsi = hvm_isa_irq_to_gsi(pt->irq);
-+ }
-+
-+ /* Fallthrough to check if the interrupt is masked on the IO APIC. */
-+ case PTSRC_ioapic:
-+ {
-+ int mask = vioapic_get_mask(v->domain, gsi);
-+
-+ if ( mask < 0 )
-+ {
-+ dprintk(XENLOG_WARNING,
-+ "d%d: invalid GSI (%u) for platform timer\n",
-+ v->domain->domain_id, gsi);
-+ domain_crash(v->domain);
-+ return -1;
-+ }
-+
-+ return mask;
-+ }
- }
-
-- return (((pic_imr & (1 << (isa_irq & 7))) || !vlapic_accept_pic_intr(v)) &&
-- mask);
-+ ASSERT_UNREACHABLE();
-+ return 1;
- }
-
- static void pt_lock(struct periodic_time *pt)
-@@ -252,7 +270,7 @@ int pt_update_irq(struct vcpu *v)
- struct list_head *head = &v->arch.hvm_vcpu.tm_list;
- struct periodic_time *pt, *temp, *earliest_pt;
- uint64_t max_lag;
-- int irq, is_lapic, pt_vector;
-+ int irq, pt_vector = -1;
-
- spin_lock(&v->arch.hvm_vcpu.tm_lock);
-
-@@ -288,29 +306,26 @@ int pt_update_irq(struct vcpu *v)
-
- earliest_pt->irq_issued = 1;
- irq = earliest_pt->irq;
-- is_lapic = (earliest_pt->source == PTSRC_lapic);
-
- spin_unlock(&v->arch.hvm_vcpu.tm_lock);
-
-- /*
-- * If periodic timer interrut is handled by lapic, its vector in
-- * IRR is returned and used to set eoi_exit_bitmap for virtual
-- * interrupt delivery case. Otherwise return -1 to do nothing.
-- */
-- if ( is_lapic )
-+ switch ( earliest_pt->source )
- {
-+ case PTSRC_lapic:
-+ /*
-+ * If periodic timer interrupt is handled by lapic, its vector in
-+ * IRR is returned and used to set eoi_exit_bitmap for virtual
-+ * interrupt delivery case. Otherwise return -1 to do nothing.
-+ */
- vlapic_set_irq(vcpu_vlapic(v), irq, 0);
- pt_vector = irq;
-- }
-- else
-- {
-+ break;
-+
-+ case PTSRC_isa:
- hvm_isa_irq_deassert(v->domain, irq);
- if ( platform_legacy_irq(irq) && vlapic_accept_pic_intr(v) &&
- v->domain->arch.hvm_domain.vpic[irq >> 3].int_output )
-- {
- hvm_isa_irq_assert(v->domain, irq, NULL);
-- pt_vector = -1;
-- }
- else
- {
- pt_vector = hvm_isa_irq_assert(v->domain, irq, vioapic_get_vector);
-@@ -321,6 +336,17 @@ int pt_update_irq(struct vcpu *v)
- if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) )
- pt_vector = -1;
- }
-+ break;
-+
-+ case PTSRC_ioapic:
-+ /*
-+ * NB: At the moment IO-APIC routed interrupts generated by vpt devices
-+ * (HPET) are edge-triggered.
-+ */
-+ pt_vector = hvm_ioapic_assert(v->domain, irq, false);
-+ if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) )
-+ pt_vector = -1;
-+ break;
- }
-
- return pt_vector;
-@@ -418,7 +444,14 @@ void create_periodic_time(
- struct vcpu *v, struct periodic_time *pt, uint64_t delta,
- uint64_t period, uint8_t irq, time_cb *cb, void *data)
- {
-- ASSERT(pt->source != 0);
-+ if ( !pt->source ||
-+ (pt->irq >= NR_ISAIRQS && pt->source == PTSRC_isa) ||
-+ (pt->irq >= hvm_domain_irq(v->domain)->nr_gsis &&
-+ pt->source == PTSRC_ioapic) )
-+ {
-+ ASSERT_UNREACHABLE();
-+ return;
-+ }
-
- destroy_periodic_time(pt);
-
-@@ -498,7 +531,7 @@ static void pt_adjust_vcpu(struct period
- {
- int on_list;
-
-- ASSERT(pt->source == PTSRC_isa);
-+ ASSERT(pt->source == PTSRC_isa || pt->source == PTSRC_ioapic);
-
- if ( pt->vcpu == NULL )
- return;
---- a/xen/include/asm-x86/hvm/irq.h
-+++ b/xen/include/asm-x86/hvm/irq.h
-@@ -207,6 +207,9 @@ int hvm_set_pci_link_route(struct domain
-
- int hvm_inject_msi(struct domain *d, uint64_t addr, uint32_t data);
-
-+/* Assert an IO APIC pin. */
-+int hvm_ioapic_assert(struct domain *d, unsigned int gsi, bool level);
-+
- void hvm_maybe_deassert_evtchn_irq(void);
- void hvm_assert_evtchn_irq(struct vcpu *v);
- void hvm_set_callback_via(struct domain *d, uint64_t via);
---- a/xen/include/asm-x86/hvm/vpt.h
-+++ b/xen/include/asm-x86/hvm/vpt.h
-@@ -44,6 +44,7 @@ struct periodic_time {
- bool_t warned_timeout_too_short;
- #define PTSRC_isa 1 /* ISA time source */
- #define PTSRC_lapic 2 /* LAPIC time source */
-+#define PTSRC_ioapic 3 /* IOAPIC time source */
- u8 source; /* PTSRC_ */
- u8 irq;
- struct vcpu *vcpu; /* vcpu timer interrupt delivers to */
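
The decisive part of the patch above is the validation added to create_periodic_time(): an ISA-routed timer interrupt may only use IRQs 0-15 (anything larger would index past the end of the vpic array), while an IO-APIC-routed one is bounded by the domain's GSI count. A stand-alone sketch of that check follows, using stand-in type and constant names rather than Xen's declarations:

/* Illustrative model of the source/irq validation in create_periodic_time();
 * the enum, limits and helper name are stand-ins for this sketch. */
#include <stdbool.h>

enum pt_source { PTSRC_NONE = 0, PTSRC_ISA, PTSRC_LAPIC, PTSRC_IOAPIC };

#define NR_ISAIRQS 16   /* only IRQs 0-15 may be injected through the vPIC */

static bool pt_irq_valid(enum pt_source source, unsigned int irq,
                         unsigned int nr_gsis)
{
    switch ( source )
    {
    case PTSRC_ISA:
        /* vpic[irq >> 3] is only safe for ISA IRQs; HPET routes of 20-23
         * must take the IO-APIC path instead. */
        return irq < NR_ISAIRQS;
    case PTSRC_IOAPIC:
        return irq < nr_gsis;
    case PTSRC_LAPIC:
        return true;     /* irq is an LAPIC vector, checked elsewhere */
    default:
        return false;    /* unset or unknown source */
    }
}

In the HPET case this is what lets the legacy-replacement routes (IRQ0/IRQ8 via the vPIC) and the non-legacy routes (GSIs 20-23 via the vIO-APIC) take different, correctly bounded injection paths.
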
diff --git a/main/xen/xsa262-4.10.patch b/main/xen/xsa262-4.10.patch
deleted file mode 100644
index ba9a8ffa22f..00000000000
--- a/main/xen/xsa262-4.10.patch
+++ /dev/null
@@ -1,76 +0,0 @@
-From: Jan Beulich <jbeulich@suse.com>
-Subject: x86/HVM: guard against emulator driving ioreq state in weird ways
-
-In the case where hvm_wait_for_io() calls wait_on_xen_event_channel(),
-p->state ends up being read twice in succession: once to determine that
-state != p->state, and then again at the top of the loop. This gives a
-compromised emulator a chance to change the state back between the two
-reads, potentially keeping Xen in a loop indefinitely.
-
-Instead:
-* Read p->state once in each of the wait_on_xen_event_channel() tests,
-* re-use that value the next time around,
-* and insist that the states continue to transition "forward" (with the
- exception of the transition to STATE_IOREQ_NONE).
-
-This is XSA-262.
-
-Signed-off-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: George Dunlap <george.dunlap@citrix.com>
-
---- a/xen/arch/x86/hvm/ioreq.c
-+++ b/xen/arch/x86/hvm/ioreq.c
-@@ -87,14 +87,17 @@ static void hvm_io_assist(struct hvm_ior
-
- static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
- {
-+ unsigned int prev_state = STATE_IOREQ_NONE;
-+
- while ( sv->pending )
- {
- unsigned int state = p->state;
-
-- rmb();
-- switch ( state )
-+ smp_rmb();
-+
-+ recheck:
-+ if ( unlikely(state == STATE_IOREQ_NONE) )
- {
-- case STATE_IOREQ_NONE:
- /*
- * The only reason we should see this case is when an
- * emulator is dying and it races with an I/O being
-@@ -102,14 +105,30 @@ static bool hvm_wait_for_io(struct hvm_i
- */
- hvm_io_assist(sv, ~0ul);
- break;
-+ }
-+
-+ if ( unlikely(state < prev_state) )
-+ {
-+ gdprintk(XENLOG_ERR, "Weird HVM ioreq state transition %u -> %u\n",
-+ prev_state, state);
-+ sv->pending = false;
-+ domain_crash(sv->vcpu->domain);
-+ return false; /* bail */
-+ }
-+
-+ switch ( prev_state = state )
-+ {
- case STATE_IORESP_READY: /* IORESP_READY -> NONE */
- p->state = STATE_IOREQ_NONE;
- hvm_io_assist(sv, p->data);
- break;
- case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
- case STATE_IOREQ_INPROCESS:
-- wait_on_xen_event_channel(sv->ioreq_evtchn, p->state != state);
-- break;
-+ wait_on_xen_event_channel(sv->ioreq_evtchn,
-+ ({ state = p->state;
-+ smp_rmb();
-+ state != prev_state; }));
-+ goto recheck;
- default:
- gdprintk(XENLOG_ERR, "Weird HVM iorequest state %u\n", state);
- sv->pending = false;