diff options
author | Natanael Copa <ncopa@alpinelinux.org> | 2018-09-06 08:03:40 +0200 |
---|---|---|
committer | Natanael Copa <ncopa@alpinelinux.org> | 2018-09-06 08:03:40 +0200 |
commit | 74dce6e0451466b8eb5078660886cc226f9704f4 (patch) | |
tree | 5c2dc6bd447da5baf73f9934102b5a5d2a916a6b | |
parent | d72f525745c3193dfb608c0ce2fd7054bdc45e1b (diff) |
main/xen: backport various security fixes
fixes #9295
-rw-r--r-- | main/xen/APKBUILD | 31 | ||||
-rw-r--r-- | main/xen/git.patch | 8547 | ||||
-rw-r--r-- | main/xen/xsa260-1.patch | 72 | ||||
-rw-r--r-- | main/xen/xsa260-2.patch | 110 | ||||
-rw-r--r-- | main/xen/xsa260-3.patch | 138 | ||||
-rw-r--r-- | main/xen/xsa260-4.patch | 72 | ||||
-rw-r--r-- | main/xen/xsa261.patch | 279 | ||||
-rw-r--r-- | main/xen/xsa262-4.10.patch | 76 |
8 files changed, 8564 insertions, 761 deletions
diff --git a/main/xen/APKBUILD b/main/xen/APKBUILD index 3958ff5032c..4c320d40cdc 100644 --- a/main/xen/APKBUILD +++ b/main/xen/APKBUILD @@ -3,7 +3,7 @@ # Maintainer: William Pitcock <nenolod@dereferenced.org> pkgname=xen pkgver=4.10.1 -pkgrel=2 +pkgrel=3 pkgdesc="Xen hypervisor" url="http://www.xen.org/" arch="x86_64 armhf aarch64" @@ -119,6 +119,19 @@ options="!strip" # - CVE-2018-8897 XSA-260 # - CVE-2018-10982 XSA-261 # - CVE-2018-10981 XSA-262 +# 4.10.1-r3: +# - CVE-2018-14678 XSA-274 +# - CVE-2018-3646 XSA-273 +# - CVE-2018-15470 XSA-272 +# - CVE-2018-14007 XSA-271 +# - CVE-2018-15471 XSA-270 +# - CVE-2018-15468 XSA-269 +# - CVE-2018-15469 XSA-268 +# - CVE-2018-3665 XSA-267 +# - CVE-2018-12892 XSA-266 +# - CVE-2018-12893 XSA-265 +# - CVE-2018-12891 XSA-264 +# - CVE-2018-3639 XSA-263 case "$CARCH" in x86*) @@ -165,6 +178,7 @@ source="https://downloads.xenproject.org/release/$pkgname/$pkgver/$pkgname-$pkgv http://xenbits.xen.org/xen-extfiles/tpm_emulator-$_TPMEMU_VERSION.tar.gz http://xenbits.xen.org/xen-extfiles/zlib-$_ZLIB_VERSION.tar.gz http://xenbits.xen.org/xen-extfiles/ipxe-git-$_IPXE_GIT_TAG.tar.gz + git.patch qemu-xen_paths.patch @@ -183,13 +197,6 @@ source="https://downloads.xenproject.org/release/$pkgname/$pkgver/$pkgname-$pkgv hotplug-Linux-iscsi-block-handle-lun-1.patch - xsa260-1.patch - xsa260-2.patch - xsa260-3.patch - xsa260-4.patch - xsa261.patch - xsa262-4.10.patch - xenstored.initd xenstored.confd xenconsoled.initd @@ -433,6 +440,8 @@ c2bc9ffc8583aeae71cee9ddcc4418969768d4e3764d47307da54f93981c0109fb07d84b061b3a36 4928b5b82f57645be9408362706ff2c4d9baa635b21b0d41b1c82930e8c60a759b1ea4fa74d7e6c7cae1b7692d006aa5cb72df0c3b88bf049779aa2b566f9d35 tpm_emulator-0.7.4.tar.gz 021b958fcd0d346c4ba761bcf0cc40f3522de6186cf5a0a6ea34a70504ce9622b1c2626fce40675bc8282cf5f5ade18473656abc38050f72f5d6480507a2106e zlib-1.2.3.tar.gz bbcce5e55040e7e29adebd4a5253a046016a6e2e7ff34cf801a42d147e1ec1af57e0297318249bfa9c5bbeac969fe4b37c18cbf845a80b2136d65387a4fc31da 
ipxe-git-356f6c1b64d7a97746d1816cef8ca22bdd8d0b5d.tar.gz +0fd2622469f3ff136b33a66576319920e050aac3fefa41c06306661eb6f6792fc21a4c15c8928febd10b1a14b4c712a2918532cdb23ccbddba9f1ba55d7d4478 git.patch +7fdb705d26f100c409c354d3d249afde2ee9273e1f0028d4f320bc67325dc4ffa411ac9c59d75b31c79e2f95c17ec3ef0b4ac98de4fefb073c5f2529d3c69be2 xsa271-xapi.patch 1936ab39a1867957fa640eb81c4070214ca4856a2743ba7e49c0cd017917071a9680d015f002c57fa7b9600dbadd29dcea5887f50e6c133305df2669a7a933f3 qemu-xen_paths.patch f095ea373f36381491ad36f0662fb4f53665031973721256b23166e596318581da7cbb0146d0beb2446729adfdb321e01468e377793f6563a67d68b8b0f7ffe3 hotplug-vif-vtrill.patch 77b08e9655e091b0352e4630d520b54c6ca6d659d1d38fbb4b3bfc9ff3e66db433a2e194ead32bb10ff962c382d800a670e82b7a62835b238e294b22808290ea musl-hvmloader-fix-stdint.patch @@ -443,12 +452,6 @@ e76816c6ad0e91dc5f81947f266da3429b20e6d976c3e8c41202c6179532eec878a3f0913921ef3a 69dfa60628ca838678862383528654ecbdf4269cbb5c9cfb6b84d976202a8dea85d711aa65a52fa1b477fb0b30604ca70cf1337192d6fb9388a08bbe7fe56077 xenstore_client_transaction_fix.patch 2094ea964fa610b2bf72fd2c7ede7e954899a75c0f5b08030cf1d74460fb759ade84866176e32f8fe29c921dfdc6dafd2b31e23ab9b0a3874d3dceeabdd1913b xenqemu-xattr-size-max.patch 8c9cfc6afca325df1d8026e21ed03fa8cd2c7e1a21a56cc1968301c5ab634bfe849951899e75d328951d7a41273d1e49a2448edbadec0029ed410c43c0549812 hotplug-Linux-iscsi-block-handle-lun-1.patch -08a35c2c14d51f4e004274367a948332b97f22d12b0b8f7647f5f026f3d57cfe294dd4c3f3e4d34439c9010f2efc30689e62ec805ca134cfd75fe85f0c53c94b xsa260-1.patch -6d152a54d38a8c06a8c1293ab637c484ad6baf53b7be54a8a916143011f5042a089972c5c08e489d510356507296da8c7aa8e89b17517b1c167a95084b5389db xsa260-2.patch -d7208e68d60581ad6a6a5f56528e7b820f0f6db56593a4b01a5c59f245e3e06596a6512f5cca6d3c88b662c787c46b98f7f0759822e375e10b2e2402c89262f6 xsa260-3.patch -2b26451201f0b754b19f7cd7f8ffdc3b2ea083fd3f54de6cd0c29bc0dba89d5dac4b33ed58b3b80a48887ffa11d9c82ded0c60a4df5895022ff97d1b11b2357c xsa260-4.patch 
-f6c55fb28915d54b05585c4ba177fd57f8a70b87930af24307c3142e97e39239f684b52c70d9051d1ac6a21a9e8eaabba482c451d7b4e3f48054a02048d5603e xsa261.patch -aa6089f017c0e00e0e464b6f8d82dd5c8d588ccff027b175f43dd9d4efd4014ac899fceedef2005854b892ea156c7951c71183c03479cdf70c6d0298f5f76522 xsa262-4.10.patch 52c43beb2596d645934d0f909f2d21f7587b6898ed5e5e7046799a8ed6d58f7a09c5809e1634fa26152f3fd4f3e7cfa07da7076f01b4a20cc8f5df8b9cb77e50 xenstored.initd 093f7fbd43faf0a16a226486a0776bade5dc1681d281c5946a3191c32d74f9699c6bf5d0ab8de9d1195a2461165d1660788e92a3156c9b3c7054d7b2d52d7ff0 xenstored.confd 3c86ed48fbee0af4051c65c4a3893f131fa66e47bf083caf20c9b6aa4b63fdead8832f84a58d0e27964bc49ec8397251b34e5be5c212c139f556916dc8da9523 xenconsoled.initd diff --git a/main/xen/git.patch b/main/xen/git.patch new file mode 100644 index 00000000000..b4224d8d674 --- /dev/null +++ b/main/xen/git.patch @@ -0,0 +1,8547 @@ +diff --git a/docs/man/xl.conf.pod.5 b/docs/man/xl.conf.pod.5 +index da91b8626c..37262a7ef8 100644 +--- a/docs/man/xl.conf.pod.5 ++++ b/docs/man/xl.conf.pod.5 +@@ -185,6 +185,28 @@ massively huge guests). + + =back + ++=item B<vm.cpumask>="CPULIST" ++ ++=item B<vm.hvm.cpumask>="CPULIST" ++ ++=item B<vm.pv.cpumask>="CPULIST" ++ ++Global masks that are applied when creating guests and pinning vcpus ++to indicate which cpus they are allowed to run on. Specifically, ++C<vm.cpumask> applies to all guest types, C<vm.hvm.cpumask> applies to ++both HVM and PVH guests and C<vm.pv.cpumask> applies to PV guests. ++ ++The hard affinity of guest's vcpus are logical-AND'ed with respective ++masks. If the resulting affinity mask is empty, operation will fail. ++ ++Use --ignore-global-affinity-masks to skip applying global masks. ++ ++The default value for these masks are all 1's, i.e. all cpus are allowed. ++ ++Due to bug(s), these options may not interact well with other options ++concerning CPU affinity. One example is CPU pools. 
Users should always double ++check that the required affinity has taken effect. ++ + =back + + =head1 SEE ALSO +diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown +index 6c673eedc8..470da80174 100644 +--- a/docs/misc/xen-command-line.markdown ++++ b/docs/misc/xen-command-line.markdown +@@ -248,6 +248,9 @@ the NMI watchdog is also enabled. + ### bti (x86) + > `= List of [ thunk=retpoline|lfence|jmp, ibrs=<bool>, ibpb=<bool>, rsb_{vmexit,native}=<bool> ]` + ++**WARNING: This command line option is deprecated, and superseded by ++_spec-ctrl=_ - using both options in combination is undefined.** ++ + Branch Target Injection controls. By default, Xen will pick the most + appropriate BTI mitigations based on compiled in support, loaded microcode, + and hardware details. +@@ -493,9 +496,10 @@ accounting for hardware capabilities as enumerated via CPUID. + + Currently accepted: + +-The Speculation Control hardware features `ibrsb`, `stibp`, `ibpb` are used by +-default if avaiable. They can be ignored, e.g. `no-ibrsb`, at which point Xen +-won't use them itself, and won't offer them to guests. ++The Speculation Control hardware features `ibrsb`, `stibp`, `ibpb`, ++`l1d-flush` and `ssbd` are used by default if available and applicable. They can ++be ignored, e.g. `no-ibrsb`, at which point Xen won't use them itself, and ++won't offer them to guests. + + ### cpuid\_mask\_cpu (AMD only) + > `= fam_0f_rev_c | fam_0f_rev_d | fam_0f_rev_e | fam_0f_rev_f | fam_0f_rev_g | fam_10_rev_b | fam_10_rev_c | fam_11_rev_b` +@@ -916,6 +920,21 @@ Controls EPT related features. + + Specify which console gdbstub should use. See **console**. + ++### gnttab ++> `= List of [ max-ver:<integer>, transitive=<bool> ]` ++ ++> Default: `gnttab=max-ver:2,transitive` ++ ++Control various aspects of the grant table behaviour available to guests. ++ ++* `max-ver` Select the maximum grant table version to offer to guests. Valid ++version are 1 and 2. 
++* `transitive` Permit or disallow the use of transitive grants. Note that the ++use of grant table v2 without transitive grants is an ABI breakage from the ++guests point of view. ++ ++The usage of gnttab v2 is not security supported on ARM platforms. ++ + ### gnttab\_max\_frames + > `= <integer>` + +@@ -1348,6 +1367,15 @@ Because responsibility for APIC setup is shared between Xen and the + domain 0 kernel this option is automatically propagated to the domain + 0 command line. + ++### invpcid (x86) ++> `= <boolean>` ++ ++> Default: `true` ++ ++By default, Xen will use the INVPCID instruction for TLB management if ++it is available. This option can be used to cause Xen to fall back to ++older mechanisms, which are generally slower. ++ + ### noirqbalance + > `= <boolean>` + +@@ -1426,6 +1454,20 @@ Flag to enable Memory Protection Keys. + The protection-key feature provides an additional mechanism by which IA-32e + paging controls access to usermode addresses. + ++### pcid (x86) ++> `= <boolean> | xpti=<bool>` ++ ++> Default: `xpti` ++ ++> Can be modified at runtime (change takes effect only for domains created ++ afterwards) ++ ++If available, control usage of the PCID feature of the processor for ++64-bit pv-domains. PCID can be used either for no domain at all (`false`), ++for all of them (`true`), only for those subject to XPTI (`xpti`) or for ++those not subject to XPTI (`no-xpti`). The feature is used only in case ++INVPCID is supported and not disabled via `invpcid=false`. ++ + ### psr (Intel) + > `= List of ( cmt:<boolean> | rmid_max:<integer> | cat:<boolean> | cos_max:<integer> | cdp:<boolean> )` + +@@ -1486,6 +1528,30 @@ do; there may be other custom operating systems which do. If you're + certain you don't plan on having PV guests which use this feature, + turning it off can reduce the attack surface. + ++### pv-l1tf (x86) ++> `= List of [ <bool>, dom0=<bool>, domu=<bool> ]` ++ ++> Default: `false` on believed-unaffected hardware, or in pv-shim mode. 
++> `domu` on believed-affected hardware. ++ ++Mitigations for L1TF / XSA-273 / CVE-2018-3620 for PV guests. ++ ++For backwards compatibility, we may not alter an architecturally-legitimate ++pagetable entry a PV guest chooses to write. We can however force such a ++guest into shadow mode so that Xen controls the PTEs which are reachable by ++the CPU pagewalk. ++ ++Shadowing is performed at the point where a PV guest first tries to write an ++L1TF-vulnerable PTE. Therefore, a PV guest kernel which has been updated with ++its own L1TF mitigations will not trigger shadow mode if it is well behaved. ++ ++If CONFIG\_SHADOW\_PAGING is not compiled in, this mitigation instead crashes ++the guest when an L1TF-vulnerable PTE is written, which still allows updated, ++well-behaved PV guests to run, despite Shadow being compiled out. ++ ++In the pv-shim case, Shadow is expected to be compiled out, and a malicious ++guest kernel can only leak data from the shim Xen, rather than the host Xen. ++ + ### pv-shim (x86) + > `= <boolean>` + +@@ -1690,6 +1756,13 @@ Use `smap=hvm` to allow SMAP use by HVM guests only. + Flag to enable Supervisor Mode Execution Protection + Use `smep=hvm` to allow SMEP use by HVM guests only. + ++### smt (x86) ++> `= <boolean>` ++ ++Default: `true` ++ ++Control bring up of multiple hyper-threads per CPU core. ++ + ### snb\_igd\_quirk + > `= <boolean> | cap | <integer>` + +@@ -1698,6 +1771,75 @@ enforces the maximum theoretically necessary timeout of 670ms. Any number + is being interpreted as a custom timeout in milliseconds. Zero or boolean + false disable the quirk workaround, which is also the default. + ++### spec-ctrl (x86) ++> `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb}=<bool>, ++> bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu, ++> l1d-flush}=<bool> ]` ++ ++Controls for speculative execution sidechannel mitigations. 
By default, Xen ++will pick the most appropriate mitigations based on compiled in support, ++loaded microcode, and hardware details, and will virtualise appropriate ++mitigations for guests to use. ++ ++**WARNING: Any use of this option may interfere with heuristics. Use with ++extreme care.** ++ ++An overall boolean value, `spec-ctrl=no`, can be specified to turn off all ++mitigations, including pieces of infrastructure used to virtualise certain ++mitigation features for guests. This also includes settings which `xpti`, ++`smt`, `pv-l1tf` control, unless the respective option(s) have been ++specified earlier on the command line. ++ ++Alternatively, a slightly more restricted `spec-ctrl=no-xen` can be used to ++turn off all of Xen's mitigations, while leaving the virtualisation support ++in place for guests to use. ++ ++Use of a positive boolean value for either of these options is invalid. ++ ++The booleans `pv=`, `hvm=`, `msr-sc=` and `rsb=` offer fine grained control ++over the alternative blocks used by Xen. These impact Xen's ability to ++protect itself, and Xen's ability to virtualise support for guests to use. ++ ++* `pv=` and `hvm=` offer control over all suboptions for PV and HVM guests ++ respectively. ++* `msr-sc=` offers control over Xen's support for manipulating MSR\_SPEC\_CTRL ++ on entry and exit. These blocks are necessary to virtualise support for ++ guests and if disabled, guests will be unable to use IBRS/STIBP/SSBD/etc. ++* `rsb=` offers control over whether to overwrite the Return Stack Buffer / ++ Return Address Stack on entry to Xen. ++ ++If Xen was compiled with INDIRECT\_THUNK support, `bti-thunk=` can be used to ++select which of the thunks gets patched into the `__x86_indirect_thunk_%reg` ++locations. The default thunk is `retpoline` (generally preferred for Intel ++hardware), with the alternatives being `jmp` (a `jmp *%reg` gadget, minimal ++overhead), and `lfence` (an `lfence; jmp *%reg` gadget, preferred for AMD). 
++ ++On hardware supporting IBRS (Indirect Branch Restricted Speculation), the ++`ibrs=` option can be used to force or prevent Xen using the feature itself. ++If Xen is not using IBRS itself, functionality is still set up so IBRS can be ++virtualised for guests. ++ ++On hardware supporting IBPB (Indirect Branch Prediction Barrier), the `ibpb=` ++option can be used to force (the default) or prevent Xen from issuing branch ++prediction barriers on vcpu context switches. ++ ++On hardware supporting SSBD (Speculative Store Bypass Disable), the `ssbd=` ++option can be used to force or prevent Xen using the feature itself. On AMD ++hardware, this is a global option applied at boot, and not virtualised for ++guest use. On Intel hardware, the feature is virtualised for guests, ++independently of Xen's choice of setting. ++ ++On all hardware, the `eager-fpu=` option can be used to force or prevent Xen ++from using fully eager FPU context switches. This is currently implemented as ++a global control. By default, Xen will choose to use fully eager context ++switches on hardware believed to speculate past #NM exceptions. ++ ++On hardware supporting L1D_FLUSH, the `l1d-flush=` option can be used to force ++or prevent Xen from issuing an L1 data cache flush on each VMEntry. ++Irrespective of Xen's setting, the feature is virtualised for HVM guests to ++use. By default, Xen will enable this mitigation on hardware believed to be ++vulnerable to L1TF. ++ + ### sync\_console + > `= <boolean>` + +@@ -1923,14 +2065,24 @@ clustered mode. The default, given no hint from the **FADT**, is cluster + mode. + + ### xpti +-> `= <boolean>` ++> `= List of [ default | <boolean> | dom0=<bool> | domu=<bool> ]` + +-> Default: `false` on AMD hardware ++> Default: `false` on hardware not to be vulnerable to Meltdown (e.g. AMD) + > Default: `true` everywhere else + + Override default selection of whether to isolate 64-bit PV guest page + tables. 
+ ++`true` activates page table isolation even on hardware not vulnerable by ++Meltdown for all domains. ++ ++`false` deactivates page table isolation on all systems for all domains. ++ ++`default` sets the default behaviour. ++ ++With `dom0` and `domu` it is possible to control page table isolation ++for dom0 or guest domains only. ++ + ### xsave + > `= <boolean>` + +diff --git a/tools/Makefile b/tools/Makefile +index ab7a01ee1b..67977ad850 100644 +--- a/tools/Makefile ++++ b/tools/Makefile +@@ -232,7 +232,7 @@ subdir-all-qemu-xen-dir: qemu-xen-dir-find + else \ + enable_trace_backend='' ; \ + fi ; \ +- PKG_CONFIG_PATH=$(XEN_ROOT)/tools/pkg-config \ ++ PKG_CONFIG_PATH=$(XEN_ROOT)/tools/pkg-config$${PKG_CONFIG_PATH:+:$${PKG_CONFIG_PATH}} \ + $$source/configure --enable-xen --target-list=i386-softmmu \ + $(QEMU_XEN_ENABLE_DEBUG) \ + $$enable_trace_backend \ +diff --git a/tools/examples/xl.conf b/tools/examples/xl.conf +index 374b6bbc2e..0446deb304 100644 +--- a/tools/examples/xl.conf ++++ b/tools/examples/xl.conf +@@ -37,3 +37,8 @@ + # (which can take a long time to find out if launching huge guests). + # see xl.conf(5) for details. + #claim_mode=1 ++ ++# Specify global vcpu hard affinity masks. See xl.conf(5) for details. 
++#vm.cpumask="0-7" ++#vm.pv.cpumask="0-3" ++#vm.hvm.cpumask="3-7" +diff --git a/tools/libacpi/Makefile b/tools/libacpi/Makefile +index a47a658a25..c17f3924cc 100644 +--- a/tools/libacpi/Makefile ++++ b/tools/libacpi/Makefile +@@ -43,7 +43,7 @@ all: $(C_SRC) $(H_SRC) + + $(H_SRC): $(ACPI_BUILD_DIR)/%.h: %.asl iasl + iasl -vs -p $(ACPI_BUILD_DIR)/$*.$(TMP_SUFFIX) -tc $< +- sed -e 's/AmlCode/$*/g' $(ACPI_BUILD_DIR)/$*.hex >$@ ++ sed -e 's/AmlCode/$*/g' -e 's/_aml_code//g' $(ACPI_BUILD_DIR)/$*.hex >$@ + rm -f $(addprefix $(ACPI_BUILD_DIR)/, $*.aml $*.hex) + + $(MK_DSDT): mk_dsdt.c +@@ -76,7 +76,7 @@ $(ACPI_BUILD_DIR)/dsdt_anycpu_arm.asl: $(MK_DSDT) + + $(C_SRC): $(ACPI_BUILD_DIR)/%.c: iasl $(ACPI_BUILD_DIR)/%.asl + iasl -vs -p $(ACPI_BUILD_DIR)/$*.$(TMP_SUFFIX) -tc $(ACPI_BUILD_DIR)/$*.asl +- sed -e 's/AmlCode/$*/g' $(ACPI_BUILD_DIR)/$*.hex > $@.$(TMP_SUFFIX) ++ sed -e 's/AmlCode/$*/g' -e 's/_aml_code//g' $(ACPI_BUILD_DIR)/$*.hex > $@.$(TMP_SUFFIX) + echo "int $*_len=sizeof($*);" >> $@.$(TMP_SUFFIX) + mv -f $@.$(TMP_SUFFIX) $@ + rm -f $(addprefix $(ACPI_BUILD_DIR)/, $*.aml $*.hex) +diff --git a/tools/libxc/xc_cpuid_x86.c b/tools/libxc/xc_cpuid_x86.c +index 9fa2f7c360..21537f06f1 100644 +--- a/tools/libxc/xc_cpuid_x86.c ++++ b/tools/libxc/xc_cpuid_x86.c +@@ -575,6 +575,12 @@ static void xc_cpuid_pv_policy(xc_interface *xch, + break; + } + ++ case 0x80000008: ++ regs[0] &= 0x0000ffffu; ++ regs[1] = info->featureset[featureword_of(X86_FEATURE_CLZERO)]; ++ regs[2] = regs[3] = 0; ++ break; ++ + case 0x00000005: /* MONITOR/MWAIT */ + case 0x0000000b: /* Extended Topology Enumeration */ + case 0x8000000a: /* SVM revision and features */ +diff --git a/tools/libxl/libxl_cpuid.c b/tools/libxl/libxl_cpuid.c +index 3a21f4e7da..52e16c20ed 100644 +--- a/tools/libxl/libxl_cpuid.c ++++ b/tools/libxl/libxl_cpuid.c +@@ -204,7 +204,9 @@ int libxl_cpuid_parse_config(libxl_cpuid_policy_list *cpuid, const char* str) + {"avx512-4fmaps",0x00000007, 0, CPUID_REG_EDX, 3, 1}, + {"ibrsb", 
0x00000007, 0, CPUID_REG_EDX, 26, 1}, + {"stibp", 0x00000007, 0, CPUID_REG_EDX, 27, 1}, ++ {"l1d-flush", 0x00000007, 0, CPUID_REG_EDX, 28, 1}, + {"arch-caps", 0x00000007, 0, CPUID_REG_EDX, 29, 1}, ++ {"ssbd", 0x00000007, 0, CPUID_REG_EDX, 31, 1}, + + {"lahfsahf", 0x80000001, NA, CPUID_REG_ECX, 0, 1}, + {"cmplegacy", 0x80000001, NA, CPUID_REG_ECX, 1, 1}, +diff --git a/tools/libxl/libxl_dm.c b/tools/libxl/libxl_dm.c +index b51178b9fd..07399bb8e0 100644 +--- a/tools/libxl/libxl_dm.c ++++ b/tools/libxl/libxl_dm.c +@@ -798,6 +798,8 @@ static char *qemu_disk_scsi_drive_string(libxl__gc *gc, const char *target_path, + int colo_mode) + { + char *drive = NULL; ++ char *common = GCSPRINTF("cache=writeback,readonly=%s", ++ disk->readwrite ? "off" : "on"); + const char *exportname = disk->colo_export; + const char *active_disk = disk->active_disk; + const char *hidden_disk = disk->hidden_disk; +@@ -805,8 +807,8 @@ static char *qemu_disk_scsi_drive_string(libxl__gc *gc, const char *target_path, + switch (colo_mode) { + case LIBXL__COLO_NONE: + drive = libxl__sprintf +- (gc, "file=%s,if=scsi,bus=0,unit=%d,format=%s,cache=writeback", +- target_path, unit, format); ++ (gc, "%s,file=%s,if=scsi,bus=0,unit=%d,format=%s", ++ common, target_path, unit, format); + break; + case LIBXL__COLO_PRIMARY: + /* +@@ -819,13 +821,13 @@ static char *qemu_disk_scsi_drive_string(libxl__gc *gc, const char *target_path, + * vote-threshold=1 + */ + drive = GCSPRINTF( +- "if=scsi,bus=0,unit=%d,cache=writeback,driver=quorum," ++ "%s,if=scsi,bus=0,unit=%d,,driver=quorum," + "id=%s," + "children.0.file.filename=%s," + "children.0.driver=%s," + "read-pattern=fifo," + "vote-threshold=1", +- unit, exportname, target_path, format); ++ common, unit, exportname, target_path, format); + break; + case LIBXL__COLO_SECONDARY: + /* +@@ -839,7 +841,7 @@ static char *qemu_disk_scsi_drive_string(libxl__gc *gc, const char *target_path, + * file.backing.backing=exportname, + */ + drive = GCSPRINTF( +- 
"if=scsi,id=top-colo,bus=0,unit=%d,cache=writeback," ++ "%s,if=scsi,id=top-colo,bus=0,unit=%d," + "driver=replication," + "mode=secondary," + "top-id=top-colo," +@@ -848,7 +850,7 @@ static char *qemu_disk_scsi_drive_string(libxl__gc *gc, const char *target_path, + "file.backing.driver=qcow2," + "file.backing.file.filename=%s," + "file.backing.backing=%s", +- unit, active_disk, hidden_disk, exportname); ++ common, unit, active_disk, hidden_disk, exportname); + break; + default: + abort(); +@@ -866,6 +868,8 @@ static char *qemu_disk_ide_drive_string(libxl__gc *gc, const char *target_path, + const char *exportname = disk->colo_export; + const char *active_disk = disk->active_disk; + const char *hidden_disk = disk->hidden_disk; ++ ++ assert(disk->readwrite); /* should have been checked earlier */ + + switch (colo_mode) { + case LIBXL__COLO_NONE: +@@ -1575,8 +1579,9 @@ static int libxl__build_device_model_args_new(libxl__gc *gc, + if (strncmp(disks[i].vdev, "sd", 2) == 0) { + if (colo_mode == LIBXL__COLO_SECONDARY) { + drive = libxl__sprintf +- (gc, "if=none,driver=%s,file=%s,id=%s", +- format, target_path, disks[i].colo_export); ++ (gc, "if=none,driver=%s,file=%s,id=%s,readonly=%s", ++ format, target_path, disks[i].colo_export, ++ disks[i].readwrite ? "off" : "on"); + + flexarray_append(dm_args, "-drive"); + flexarray_append(dm_args, drive); +@@ -2586,7 +2591,7 @@ int libxl__need_xenpv_qemu(libxl__gc *gc, libxl_domain_config *d_config) + goto out; + } + +- if (d_config->num_vfbs > 0) { ++ if (d_config->num_vfbs > 0 || d_config->num_p9s > 0) { + ret = 1; + goto out; + } +diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c +index b1a46c667d..85298d277d 100644 +--- a/tools/misc/xen-cpuid.c ++++ b/tools/misc/xen-cpuid.c +@@ -165,9 +165,8 @@ static const char *str_7d0[32] = + [4 ... 25] = "REZ", + + [26] = "ibrsb", [27] = "stibp", +- [28] = "REZ", [29] = "arch_caps", +- +- [30 ... 
31] = "REZ", ++ [28] = "l1d_flush", [29] = "arch_caps", ++ [30] = "REZ", [31] = "ssbd", + }; + + static struct { +diff --git a/tools/ocaml/xenstored/store.ml b/tools/ocaml/xenstored/store.ml +index 13cf3b5bf4..5a8c377603 100644 +--- a/tools/ocaml/xenstored/store.ml ++++ b/tools/ocaml/xenstored/store.ml +@@ -262,7 +262,8 @@ let path_write store perm path value = + Node.check_perm store.root perm Perms.WRITE; + Node.set_value store.root value, false + ) else +- Path.apply_modify store.root path do_write, !node_created ++ let root = Path.apply_modify store.root path do_write in ++ root, !node_created + + let path_rm store perm path = + let do_rm node name = +diff --git a/tools/xl/xl.c b/tools/xl/xl.c +index 179908b4f6..7d2142f16f 100644 +--- a/tools/xl/xl.c ++++ b/tools/xl/xl.c +@@ -28,6 +28,9 @@ + #include <libxl_utils.h> + #include <libxlutil.h> + #include "xl.h" ++#include "xl_parse.h" ++ ++#include "xl_utils.h" + + xentoollog_logger_stdiostream *logger; + int dryrun_only; +@@ -42,6 +45,9 @@ char *default_gatewaydev = NULL; + char *default_vifbackend = NULL; + char *default_remus_netbufscript = NULL; + char *default_colo_proxy_script = NULL; ++libxl_bitmap global_vm_affinity_mask; ++libxl_bitmap global_hvm_affinity_mask; ++libxl_bitmap global_pv_affinity_mask; + enum output_format default_output_format = OUTPUT_FORMAT_JSON; + int claim_mode = 1; + bool progress_use_cr = 0; +@@ -203,6 +209,26 @@ static void parse_global_config(const char *configfile, + if (!xlu_cfg_get_long (config, "max_maptrack_frames", &l, 0)) + max_maptrack_frames = l; + ++ libxl_bitmap_init(&global_vm_affinity_mask); ++ libxl_cpu_bitmap_alloc(ctx, &global_vm_affinity_mask, 0); ++ libxl_bitmap_init(&global_hvm_affinity_mask); ++ libxl_cpu_bitmap_alloc(ctx, &global_hvm_affinity_mask, 0); ++ libxl_bitmap_init(&global_pv_affinity_mask); ++ libxl_cpu_bitmap_alloc(ctx, &global_pv_affinity_mask, 0); ++ ++ if (!xlu_cfg_get_string (config, "vm.cpumask", &buf, 0)) ++ parse_cpurange(buf, 
&global_vm_affinity_mask); ++ else ++ libxl_bitmap_set_any(&global_vm_affinity_mask); ++ if (!xlu_cfg_get_string (config, "vm.hvm.cpumask", &buf, 0)) ++ parse_cpurange(buf, &global_hvm_affinity_mask); ++ else ++ libxl_bitmap_set_any(&global_hvm_affinity_mask); ++ if (!xlu_cfg_get_string (config, "vm.pv.cpumask", &buf, 0)) ++ parse_cpurange(buf, &global_pv_affinity_mask); ++ else ++ libxl_bitmap_set_any(&global_pv_affinity_mask); ++ + xlu_cfg_destroy(config); + } + +diff --git a/tools/xl/xl.h b/tools/xl/xl.h +index 6b60d1db50..7b9f58fc6c 100644 +--- a/tools/xl/xl.h ++++ b/tools/xl/xl.h +@@ -41,6 +41,7 @@ struct domain_create { + int vncautopass; + int console_autoconnect; + int checkpointed_stream; ++ int ignore_global_affinity_masks; + const char *config_file; + char *extra_config; /* extra config string */ + const char *restore_file; +@@ -277,6 +278,9 @@ extern char *default_colo_proxy_script; + extern char *blkdev_start; + extern int max_grant_frames; + extern int max_maptrack_frames; ++extern libxl_bitmap global_vm_affinity_mask; ++extern libxl_bitmap global_hvm_affinity_mask; ++extern libxl_bitmap global_pv_affinity_mask; + + enum output_format { + OUTPUT_FORMAT_JSON, +@@ -292,6 +296,9 @@ typedef enum { + } domain_restart_type; + + extern void printf_info_sexp(int domid, libxl_domain_config *d_config, FILE *fh); ++extern void apply_global_affinity_masks(libxl_domain_type type, ++ libxl_bitmap *vcpu_affinity_array, ++ unsigned int size); + + #define XL_GLOBAL_CONFIG XEN_CONFIG_DIR "/xl.conf" + #define XL_LOCK_FILE XEN_LOCK_DIR "/xl" +diff --git a/tools/xl/xl_cmdtable.c b/tools/xl/xl_cmdtable.c +index 5546cf66e7..1a6c28dfdc 100644 +--- a/tools/xl/xl_cmdtable.c ++++ b/tools/xl/xl_cmdtable.c +@@ -34,7 +34,8 @@ struct cmd_spec cmd_table[] = { + "-e Do not wait in the background for the death of the domain.\n" + "-V, --vncviewer Connect to the VNC display after the domain is created.\n" + "-A, --vncviewer-autopass\n" +- " Pass VNC password to viewer via stdin." 
++ " Pass VNC password to viewer via stdin.\n" ++ "--ignore-global-affinity-masks Ignore global masks in xl.conf." + }, + { "config-update", + &main_config_update, 1, 1, +@@ -224,7 +225,8 @@ struct cmd_spec cmd_table[] = { + &main_vcpupin, 1, 1, + "Set which CPUs a VCPU can use", + "[option] <Domain> <VCPU|all> <Hard affinity|-|all> <Soft affinity|-|all>", +- "-f, --force undo an override pinning done by the kernel", ++ "-f, --force undo an override pinning done by the kernel\n" ++ "--ignore-global-affinity-masks Ignore global masks in xl.conf", + }, + { "vcpu-set", + &main_vcpuset, 0, 1, +diff --git a/tools/xl/xl_vcpu.c b/tools/xl/xl_vcpu.c +index 8e735b38c1..3384eeed06 100644 +--- a/tools/xl/xl_vcpu.c ++++ b/tools/xl/xl_vcpu.c +@@ -68,6 +68,61 @@ static void print_domain_vcpuinfo(uint32_t domid, uint32_t nr_cpus) + libxl_vcpuinfo_list_free(vcpuinfo, nb_vcpu); + } + ++void apply_global_affinity_masks(libxl_domain_type type, ++ libxl_bitmap *vcpu_affinity_array, ++ unsigned int size) ++{ ++ libxl_bitmap *mask = &global_vm_affinity_mask; ++ libxl_bitmap *type_mask; ++ unsigned int i; ++ ++ switch (type) { ++ case LIBXL_DOMAIN_TYPE_HVM: ++ case LIBXL_DOMAIN_TYPE_PVH: ++ type_mask = &global_hvm_affinity_mask; ++ break; ++ case LIBXL_DOMAIN_TYPE_PV: ++ type_mask = &global_pv_affinity_mask; ++ break; ++ default: ++ fprintf(stderr, "Unknown guest type\n"); ++ exit(EXIT_FAILURE); ++ } ++ ++ for (i = 0; i < size; i++) { ++ int rc; ++ libxl_bitmap *t = &vcpu_affinity_array[i]; ++ libxl_bitmap b1, b2; ++ ++ libxl_bitmap_init(&b1); ++ libxl_bitmap_init(&b2); ++ ++ rc = libxl_bitmap_and(ctx, &b1, t, mask); ++ if (rc) { ++ fprintf(stderr, "libxl_bitmap_and errored\n"); ++ exit(EXIT_FAILURE); ++ } ++ rc = libxl_bitmap_and(ctx, &b2, &b1, type_mask); ++ if (rc) { ++ fprintf(stderr, "libxl_bitmap_and errored\n"); ++ exit(EXIT_FAILURE); ++ } ++ ++ if (libxl_bitmap_is_empty(&b2)) { ++ fprintf(stderr, "vcpu hard affinity map is empty\n"); ++ exit(EXIT_FAILURE); ++ } ++ ++ /* Replace 
target bitmap with the result */ ++ libxl_bitmap_dispose(t); ++ libxl_bitmap_init(t); ++ libxl_bitmap_copy_alloc(ctx, t, &b2); ++ ++ libxl_bitmap_dispose(&b1); ++ libxl_bitmap_dispose(&b2); ++ } ++} ++ + static void vcpulist(int argc, char **argv) + { + libxl_dominfo *dominfo; +@@ -118,6 +173,7 @@ int main_vcpupin(int argc, char **argv) + { + static struct option opts[] = { + {"force", 0, 0, 'f'}, ++ {"ignore-global-affinity-masks", 0, 0, 'i'}, + COMMON_LONG_OPTS + }; + libxl_vcpuinfo *vcpuinfo; +@@ -132,15 +188,18 @@ int main_vcpupin(int argc, char **argv) + const char *vcpu, *hard_str, *soft_str; + char *endptr; + int opt, nb_cpu, nb_vcpu, rc = EXIT_FAILURE; +- bool force = false; ++ bool force = false, ignore_masks = false; + + libxl_bitmap_init(&cpumap_hard); + libxl_bitmap_init(&cpumap_soft); + +- SWITCH_FOREACH_OPT(opt, "f", opts, "vcpu-pin", 3) { ++ SWITCH_FOREACH_OPT(opt, "fi", opts, "vcpu-pin", 3) { + case 'f': + force = true; + break; ++ case 'i': ++ ignore_masks = true; ++ break; + default: + break; + } +@@ -222,6 +281,23 @@ int main_vcpupin(int argc, char **argv) + goto out; + } + ++ /* Only hard affinity matters here */ ++ if (!ignore_masks) { ++ libxl_domain_config d_config; ++ ++ libxl_domain_config_init(&d_config); ++ rc = libxl_retrieve_domain_configuration(ctx, domid, &d_config); ++ if (rc) { ++ fprintf(stderr, "Could not retrieve domain configuration\n"); ++ libxl_domain_config_dispose(&d_config); ++ goto out; ++ } ++ ++ apply_global_affinity_masks(d_config.b_info.type, hard, 1); ++ ++ libxl_domain_config_dispose(&d_config); ++ } ++ + if (force) { + if (libxl_set_vcpuaffinity_force(ctx, domid, vcpuid, hard, soft)) { + fprintf(stderr, "Could not set affinity for vcpu `%ld'.\n", +diff --git a/tools/xl/xl_vmcontrol.c b/tools/xl/xl_vmcontrol.c +index 89c2b25ded..a1d633795c 100644 +--- a/tools/xl/xl_vmcontrol.c ++++ b/tools/xl/xl_vmcontrol.c +@@ -804,6 +804,36 @@ int create_domain(struct domain_create *dom_info) + parse_config_data(config_source, 
config_data, config_len, &d_config); + } + ++ if (!dom_info->ignore_global_affinity_masks) { ++ libxl_domain_build_info *b_info = &d_config.b_info; ++ ++ /* It is possible that no hard affinity is specified in config file. ++ * Generate hard affinity maps now if we care about those. ++ */ ++ if (b_info->num_vcpu_hard_affinity == 0 && ++ (!libxl_bitmap_is_full(&global_vm_affinity_mask) || ++ (b_info->type == LIBXL_DOMAIN_TYPE_PV && ++ !libxl_bitmap_is_full(&global_pv_affinity_mask)) || ++ (b_info->type != LIBXL_DOMAIN_TYPE_PV && ++ !libxl_bitmap_is_full(&global_hvm_affinity_mask)) ++ )) { ++ b_info->num_vcpu_hard_affinity = b_info->max_vcpus; ++ b_info->vcpu_hard_affinity = ++ xmalloc(b_info->max_vcpus * sizeof(libxl_bitmap)); ++ ++ for (i = 0; i < b_info->num_vcpu_hard_affinity; i++) { ++ libxl_bitmap *m = &b_info->vcpu_hard_affinity[i]; ++ libxl_bitmap_init(m); ++ libxl_cpu_bitmap_alloc(ctx, m, 0); ++ libxl_bitmap_set_any(m); ++ } ++ } ++ ++ apply_global_affinity_masks(b_info->type, ++ b_info->vcpu_hard_affinity, ++ b_info->num_vcpu_hard_affinity); ++ } ++ + if (migrate_fd >= 0) { + if (d_config.c_info.name) { + /* when we receive a domain we get its name from the config +@@ -1124,7 +1154,7 @@ int main_create(int argc, char **argv) + const char *filename = NULL; + struct domain_create dom_info; + int paused = 0, debug = 0, daemonize = 1, console_autoconnect = 0, +- quiet = 0, monitor = 1, vnc = 0, vncautopass = 0; ++ quiet = 0, monitor = 1, vnc = 0, vncautopass = 0, ignore_masks = 0; + int opt, rc; + static struct option opts[] = { + {"dryrun", 0, 0, 'n'}, +@@ -1132,6 +1162,7 @@ int main_create(int argc, char **argv) + {"defconfig", 1, 0, 'f'}, + {"vncviewer", 0, 0, 'V'}, + {"vncviewer-autopass", 0, 0, 'A'}, ++ {"ignore-global-affinity-masks", 0, 0, 'i'}, + COMMON_LONG_OPTS + }; + +@@ -1142,7 +1173,7 @@ int main_create(int argc, char **argv) + argc--; argv++; + } + +- SWITCH_FOREACH_OPT(opt, "Fnqf:pcdeVA", opts, "create", 0) { ++ SWITCH_FOREACH_OPT(opt, 
"Fnqf:pcdeVAi", opts, "create", 0) { + case 'f': + filename = optarg; + break; +@@ -1174,6 +1205,9 @@ int main_create(int argc, char **argv) + case 'A': + vnc = vncautopass = 1; + break; ++ case 'i': ++ ignore_masks = 1; ++ break; + } + + memset(&dom_info, 0, sizeof(dom_info)); +@@ -1203,6 +1237,7 @@ int main_create(int argc, char **argv) + dom_info.vnc = vnc; + dom_info.vncautopass = vncautopass; + dom_info.console_autoconnect = console_autoconnect; ++ dom_info.ignore_global_affinity_masks = ignore_masks; + + rc = create_domain(&dom_info); + if (rc < 0) { +diff --git a/xen/Makefile b/xen/Makefile +index ecec297b9b..580af86931 100644 +--- a/xen/Makefile ++++ b/xen/Makefile +@@ -2,7 +2,7 @@ + # All other places this is stored (eg. compile.h) should be autogenerated. + export XEN_VERSION = 4 + export XEN_SUBVERSION = 10 +-export XEN_EXTRAVERSION ?= .1$(XEN_VENDORVERSION) ++export XEN_EXTRAVERSION ?= .2-pre$(XEN_VENDORVERSION) + export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION) + -include xen-version + +diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig +index f621e799ed..33535ca9aa 100644 +--- a/xen/arch/x86/Kconfig ++++ b/xen/arch/x86/Kconfig +@@ -71,6 +71,7 @@ config SHADOW_PAGING + * Running HVM guests on hardware lacking hardware paging support + (First-generation Intel VT-x or AMD SVM). + * Live migration of PV guests. ++ * L1TF sidechannel mitigation for PV guests. + + Under a small number of specific workloads, shadow paging may be + deliberately used as a performance optimisation. 
+diff --git a/xen/arch/x86/Rules.mk b/xen/arch/x86/Rules.mk +index 70e9d8f5cf..03b1b581f3 100644 +--- a/xen/arch/x86/Rules.mk ++++ b/xen/arch/x86/Rules.mk +@@ -23,6 +23,7 @@ $(call as-insn-check,CFLAGS,CC,"rdseed %eax",-DHAVE_GAS_RDSEED) + $(call as-insn-check,CFLAGS,CC,".equ \"x\"$$(comma)1", \ + -U__OBJECT_LABEL__ -DHAVE_GAS_QUOTED_SYM \ + '-D__OBJECT_LABEL__=$(subst $(BASEDIR)/,,$(CURDIR))/$$@') ++$(call as-insn-check,CFLAGS,CC,"invpcid (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID) + + CFLAGS += -mno-red-zone -mno-sse -fpic + CFLAGS += -fno-asynchronous-unwind-tables +diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c +index 1e4e5680a7..f3480aa800 100644 +--- a/xen/arch/x86/acpi/power.c ++++ b/xen/arch/x86/acpi/power.c +@@ -28,6 +28,7 @@ + #include <asm/tboot.h> + #include <asm/apic.h> + #include <asm/io_apic.h> ++#include <asm/spec_ctrl.h> + #include <acpi/cpufreq/cpufreq.h> + + uint32_t system_reset_counter = 1; +@@ -163,6 +164,7 @@ static int enter_state(u32 state) + { + unsigned long flags; + int error; ++ struct cpu_info *ci; + unsigned long cr4; + + if ( (state <= ACPI_STATE_S0) || (state > ACPI_S_STATES_MAX) ) +@@ -203,12 +205,18 @@ static int enter_state(u32 state) + printk(XENLOG_ERR "Some devices failed to power down."); + system_state = SYS_STATE_resume; + device_power_up(error); ++ console_end_sync(); + error = -EIO; + goto done; + } + else + error = 0; + ++ ci = get_cpu_info(); ++ spec_ctrl_enter_idle(ci); ++ /* Avoid NMI/#MC using MSR_SPEC_CTRL until we've reloaded microcode. */ ++ ci->spec_ctrl_flags &= ~SCF_ist_wrmsr; ++ + ACPI_FLUSH_CPU_CACHE(); + + switch ( state ) +@@ -243,17 +251,23 @@ static int enter_state(u32 state) + if ( (state == ACPI_STATE_S3) && error ) + tboot_s3_error(error); + ++ console_end_sync(); ++ ++ microcode_resume_cpu(0); ++ ++ /* Re-enabled default NMI/#MC use of MSR_SPEC_CTRL. 
*/ ++ ci->spec_ctrl_flags |= (default_spec_ctrl_flags & SCF_ist_wrmsr); ++ spec_ctrl_exit_idle(ci); ++ + done: + spin_debug_enable(); + local_irq_restore(flags); +- console_end_sync(); + acpi_sleep_post(state); + if ( hvm_cpu_up() ) + BUG(); ++ cpufreq_add_cpu(0); + + enable_cpu: +- cpufreq_add_cpu(0); +- microcode_resume_cpu(0); + rcu_barrier(); + mtrr_aps_sync_begin(); + enable_nonboot_cpus(); +diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c +index fc9677f020..76078b55b2 100644 +--- a/xen/arch/x86/cpu/amd.c ++++ b/xen/arch/x86/cpu/amd.c +@@ -9,6 +9,7 @@ + #include <asm/amd.h> + #include <asm/hvm/support.h> + #include <asm/setup.h> /* amd_init_cpu */ ++#include <asm/spec_ctrl.h> + #include <asm/acpi.h> + #include <asm/apic.h> + +@@ -504,17 +505,23 @@ static void amd_get_topology(struct cpuinfo_x86 *c) + u32 eax, ebx, ecx, edx; + + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); +- c->compute_unit_id = ebx & 0xFF; + c->x86_num_siblings = ((ebx >> 8) & 0x3) + 1; ++ ++ if (c->x86 < 0x17) ++ c->compute_unit_id = ebx & 0xFF; ++ else { ++ c->cpu_core_id = ebx & 0xFF; ++ c->x86_max_cores /= c->x86_num_siblings; ++ } + } + + if (opt_cpu_info) + printk("CPU %d(%d) -> Processor %d, %s %d\n", + cpu, c->x86_max_cores, c->phys_proc_id, +- cpu_has(c, X86_FEATURE_TOPOEXT) ? "Compute Unit" : +- "Core", +- cpu_has(c, X86_FEATURE_TOPOEXT) ? c->compute_unit_id : +- c->cpu_core_id); ++ c->compute_unit_id != INVALID_CUID ? "Compute Unit" ++ : "Core", ++ c->compute_unit_id != INVALID_CUID ? c->compute_unit_id ++ : c->cpu_core_id); + } + + static void early_init_amd(struct cpuinfo_x86 *c) +@@ -594,6 +601,25 @@ static void init_amd(struct cpuinfo_x86 *c) + c->x86_capability); + } + ++ /* ++ * If the user has explicitly chosen to disable Memory Disambiguation ++ * to mitigiate Speculative Store Bypass, poke the appropriate MSR. 
++ */ ++ if (opt_ssbd) { ++ int bit = -1; ++ ++ switch (c->x86) { ++ case 0x15: bit = 54; break; ++ case 0x16: bit = 33; break; ++ case 0x17: bit = 10; break; ++ } ++ ++ if (bit >= 0 && !rdmsr_safe(MSR_AMD64_LS_CFG, value)) { ++ value |= 1ull << bit; ++ wrmsr_safe(MSR_AMD64_LS_CFG, value); ++ } ++ } ++ + /* MFENCE stops RDTSC speculation */ + if (!cpu_has_lfence_dispatch) + __set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability); +diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c +index fdb2bf1779..eb266c5ba6 100644 +--- a/xen/arch/x86/cpu/common.c ++++ b/xen/arch/x86/cpu/common.c +@@ -14,6 +14,7 @@ + #include <public/sysctl.h> /* for XEN_INVALID_{SOCKET,CORE}_ID */ + + #include "cpu.h" ++#include "mcheck/x86_mca.h" + + bool_t opt_arat = 1; + boolean_param("arat", opt_arat); +@@ -345,6 +346,9 @@ static void __init early_cpu_detect(void) + hap_paddr_bits = PADDR_BITS; + } + ++ if (c->x86_vendor != X86_VENDOR_AMD) ++ park_offline_cpus = opt_mce; ++ + initialize_cpu_data(0); + } + +@@ -747,6 +751,7 @@ void load_system_tables(void) + [IST_MCE - 1] = stack_top + IST_MCE * PAGE_SIZE, + [IST_DF - 1] = stack_top + IST_DF * PAGE_SIZE, + [IST_NMI - 1] = stack_top + IST_NMI * PAGE_SIZE, ++ [IST_DB - 1] = stack_top + IST_DB * PAGE_SIZE, + + [IST_MAX ... ARRAY_SIZE(tss->ist) - 1] = + 0x8600111111111111ul, +@@ -774,6 +779,7 @@ void load_system_tables(void) + set_ist(&idt_tables[cpu][TRAP_double_fault], IST_DF); + set_ist(&idt_tables[cpu][TRAP_nmi], IST_NMI); + set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE); ++ set_ist(&idt_tables[cpu][TRAP_debug], IST_DB); + + /* + * Bottom-of-stack must be 16-byte aligned! 
+diff --git a/xen/arch/x86/cpu/mcheck/mce.c b/xen/arch/x86/cpu/mcheck/mce.c +index df0106ec3c..1ee1870de9 100644 +--- a/xen/arch/x86/cpu/mcheck/mce.c ++++ b/xen/arch/x86/cpu/mcheck/mce.c +@@ -695,12 +695,15 @@ static void cpu_bank_free(unsigned int cpu) + + mcabanks_free(poll); + mcabanks_free(clr); ++ ++ per_cpu(poll_bankmask, cpu) = NULL; ++ per_cpu(mce_clear_banks, cpu) = NULL; + } + + static int cpu_bank_alloc(unsigned int cpu) + { +- struct mca_banks *poll = mcabanks_alloc(); +- struct mca_banks *clr = mcabanks_alloc(); ++ struct mca_banks *poll = per_cpu(poll_bankmask, cpu) ?: mcabanks_alloc(); ++ struct mca_banks *clr = per_cpu(mce_clear_banks, cpu) ?: mcabanks_alloc(); + + if ( !poll || !clr ) + { +@@ -728,7 +731,13 @@ static int cpu_callback( + + case CPU_UP_CANCELED: + case CPU_DEAD: +- cpu_bank_free(cpu); ++ if ( !park_offline_cpus ) ++ cpu_bank_free(cpu); ++ break; ++ ++ case CPU_REMOVE: ++ if ( park_offline_cpus ) ++ cpu_bank_free(cpu); + break; + } + +diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c b/xen/arch/x86/cpu/mcheck/mce_intel.c +index e5dd956a24..4474a34e34 100644 +--- a/xen/arch/x86/cpu/mcheck/mce_intel.c ++++ b/xen/arch/x86/cpu/mcheck/mce_intel.c +@@ -636,8 +636,6 @@ static void clear_cmci(void) + + static void cpu_mcheck_disable(void) + { +- clear_in_cr4(X86_CR4_MCE); +- + if ( cmci_support && opt_mce ) + clear_cmci(); + } +diff --git a/xen/arch/x86/cpu/mtrr/generic.c b/xen/arch/x86/cpu/mtrr/generic.c +index e9c0e5e059..7ba0c3f0fe 100644 +--- a/xen/arch/x86/cpu/mtrr/generic.c ++++ b/xen/arch/x86/cpu/mtrr/generic.c +@@ -5,6 +5,7 @@ + #include <xen/mm.h> + #include <xen/stdbool.h> + #include <asm/flushtlb.h> ++#include <asm/invpcid.h> + #include <asm/io.h> + #include <asm/mtrr.h> + #include <asm/msr.h> +@@ -400,8 +401,10 @@ static DEFINE_SPINLOCK(set_atomicity_lock); + * has been called. 
+ */ + +-static void prepare_set(void) ++static bool prepare_set(void) + { ++ unsigned long cr4; ++ + /* Note that this is not ideal, since the cache is only flushed/disabled + for this CPU while the MTRRs are changed, but changing this requires + more invasive changes to the way the kernel boots */ +@@ -412,18 +415,24 @@ static void prepare_set(void) + write_cr0(read_cr0() | X86_CR0_CD); + wbinvd(); + +- /* TLB flushing here relies on Xen always using CR4.PGE. */ +- BUILD_BUG_ON(!(XEN_MINIMAL_CR4 & X86_CR4_PGE)); +- write_cr4(read_cr4() & ~X86_CR4_PGE); ++ cr4 = read_cr4(); ++ if (cr4 & X86_CR4_PGE) ++ write_cr4(cr4 & ~X86_CR4_PGE); ++ else if (use_invpcid) ++ invpcid_flush_all(); ++ else ++ write_cr3(read_cr3()); + + /* Save MTRR state */ + rdmsrl(MSR_MTRRdefType, deftype); + + /* Disable MTRRs, and set the default type to uncached */ + mtrr_wrmsr(MSR_MTRRdefType, deftype & ~0xcff); ++ ++ return cr4 & X86_CR4_PGE; + } + +-static void post_set(void) ++static void post_set(bool pge) + { + /* Intel (P6) standard MTRRs */ + mtrr_wrmsr(MSR_MTRRdefType, deftype); +@@ -432,7 +441,12 @@ static void post_set(void) + write_cr0(read_cr0() & ~X86_CR0_CD); + + /* Reenable CR4.PGE (also flushes the TLB) */ +- write_cr4(read_cr4() | X86_CR4_PGE); ++ if (pge) ++ write_cr4(read_cr4() | X86_CR4_PGE); ++ else if (use_invpcid) ++ invpcid_flush_all(); ++ else ++ write_cr3(read_cr3()); + + spin_unlock(&set_atomicity_lock); + } +@@ -441,14 +455,15 @@ static void generic_set_all(void) + { + unsigned long mask, count; + unsigned long flags; ++ bool pge; + + local_irq_save(flags); +- prepare_set(); ++ pge = prepare_set(); + + /* Actually set the state */ + mask = set_mtrr_state(); + +- post_set(); ++ post_set(pge); + local_irq_restore(flags); + + /* Use the atomic bitops to update the global mask */ +@@ -457,7 +472,6 @@ static void generic_set_all(void) + set_bit(count, &smp_changes_mask); + mask >>= 1; + } +- + } + + static void generic_set_mtrr(unsigned int reg, unsigned long base, +@@ 
-474,11 +488,12 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, + { + unsigned long flags; + struct mtrr_var_range *vr; ++ bool pge; + + vr = &mtrr_state.var_ranges[reg]; + + local_irq_save(flags); +- prepare_set(); ++ pge = prepare_set(); + + if (size == 0) { + /* The invalid bit is kept in the mask, so we simply clear the +@@ -499,7 +514,7 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, + mtrr_wrmsr(MSR_IA32_MTRR_PHYSMASK(reg), vr->mask); + } + +- post_set(); ++ post_set(pge); + local_irq_restore(flags); + } + +diff --git a/xen/arch/x86/cpu/vpmu_intel.c b/xen/arch/x86/cpu/vpmu_intel.c +index 207e2e712c..6e27f6ec8e 100644 +--- a/xen/arch/x86/cpu/vpmu_intel.c ++++ b/xen/arch/x86/cpu/vpmu_intel.c +@@ -454,13 +454,11 @@ static int core2_vpmu_alloc_resource(struct vcpu *v) + + if ( is_hvm_vcpu(v) ) + { +- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); +- if ( vmx_add_host_load_msr(MSR_CORE_PERF_GLOBAL_CTRL) ) ++ if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0) ) + goto out_err; + +- if ( vmx_add_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL) ) ++ if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0) ) + goto out_err; +- vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, 0); + } + + core2_vpmu_cxt = xzalloc_bytes(sizeof(*core2_vpmu_cxt) + +@@ -535,27 +533,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content, + uint64_t *enabled_cntrs; + + if ( !core2_vpmu_msr_common_check(msr, &type, &index) ) +- { +- /* Special handling for BTS */ +- if ( msr == MSR_IA32_DEBUGCTLMSR ) +- { +- supported |= IA32_DEBUGCTLMSR_TR | IA32_DEBUGCTLMSR_BTS | +- IA32_DEBUGCTLMSR_BTINT; +- +- if ( cpu_has(¤t_cpu_data, X86_FEATURE_DSCPL) ) +- supported |= IA32_DEBUGCTLMSR_BTS_OFF_OS | +- IA32_DEBUGCTLMSR_BTS_OFF_USR; +- if ( !(msr_content & ~supported) && +- vpmu_is_set(vpmu, VPMU_CPU_HAS_BTS) ) +- return 0; +- if ( (msr_content & supported) && +- !vpmu_is_set(vpmu, VPMU_CPU_HAS_BTS) ) +- printk(XENLOG_G_WARNING +- "%pv: Debug Store 
unsupported on this CPU\n", +- current); +- } + return -EINVAL; +- } + + ASSERT(!supported); + +@@ -613,7 +591,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content, + return -EINVAL; + + if ( is_hvm_vcpu(v) ) +- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, ++ vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, + &core2_vpmu_cxt->global_ctrl); + else + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, core2_vpmu_cxt->global_ctrl); +@@ -682,7 +660,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content, + return -EINVAL; + + if ( is_hvm_vcpu(v) ) +- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, ++ vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, + &core2_vpmu_cxt->global_ctrl); + else + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, core2_vpmu_cxt->global_ctrl); +@@ -701,7 +679,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content, + else + { + if ( is_hvm_vcpu(v) ) +- vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content); ++ vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, msr_content); + else + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, msr_content); + } +@@ -735,7 +713,7 @@ static int core2_vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content) + break; + case MSR_CORE_PERF_GLOBAL_CTRL: + if ( is_hvm_vcpu(v) ) +- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content); ++ vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, msr_content); + else + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, *msr_content); + break; +diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c +index b3c9ac6c48..24b9495faa 100644 +--- a/xen/arch/x86/cpuid.c ++++ b/xen/arch/x86/cpuid.c +@@ -43,6 +43,16 @@ static int __init parse_xen_cpuid(const char *s) + if ( !val ) + setup_clear_cpu_cap(X86_FEATURE_STIBP); + } ++ else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 ) ++ { ++ if ( !val ) ++ setup_clear_cpu_cap(X86_FEATURE_L1D_FLUSH); ++ } ++ else if ( (val = parse_boolean("ssbd", s, ss)) >= 0 ) ++ { ++ if ( !val ) ++ setup_clear_cpu_cap(X86_FEATURE_SSBD); ++ } + 
else + rc = -EINVAL; + +@@ -368,6 +378,28 @@ static void __init calculate_host_policy(void) + } + } + ++static void __init guest_common_feature_adjustments(uint32_t *fs) ++{ ++ /* Unconditionally claim to be able to set the hypervisor bit. */ ++ __set_bit(X86_FEATURE_HYPERVISOR, fs); ++ ++ /* ++ * If IBRS is offered to the guest, unconditionally offer STIBP. It is a ++ * nop on non-HT hardware, and has this behaviour to make heterogeneous ++ * setups easier to manage. ++ */ ++ if ( test_bit(X86_FEATURE_IBRSB, fs) ) ++ __set_bit(X86_FEATURE_STIBP, fs); ++ ++ /* ++ * On hardware which supports IBRS/IBPB, we can offer IBPB independently ++ * of IBRS by using the AMD feature bit. An administrator may wish for ++ * performance reasons to offer IBPB without IBRS. ++ */ ++ if ( host_cpuid_policy.feat.ibrsb ) ++ __set_bit(X86_FEATURE_IBPB, fs); ++} ++ + static void __init calculate_pv_max_policy(void) + { + struct cpuid_policy *p = &pv_max_cpuid_policy; +@@ -380,18 +412,14 @@ static void __init calculate_pv_max_policy(void) + for ( i = 0; i < ARRAY_SIZE(pv_featureset); ++i ) + pv_featureset[i] &= pv_featuremask[i]; + +- /* Unconditionally claim to be able to set the hypervisor bit. */ +- __set_bit(X86_FEATURE_HYPERVISOR, pv_featureset); +- +- /* On hardware with IBRS/IBPB support, there are further adjustments. */ +- if ( test_bit(X86_FEATURE_IBRSB, pv_featureset) ) +- { +- /* Offer STIBP unconditionally. It is a nop on non-HT hardware. */ +- __set_bit(X86_FEATURE_STIBP, pv_featureset); ++ /* ++ * If Xen isn't virtualising MSR_SPEC_CTRL for PV guests because of ++ * administrator choice, hide the feature. ++ */ ++ if ( !boot_cpu_has(X86_FEATURE_SC_MSR_PV) ) ++ __clear_bit(X86_FEATURE_IBRSB, pv_featureset); + +- /* AMD's IBPB is a subset of IBRS/IBPB. 
*/ +- __set_bit(X86_FEATURE_IBPB, pv_featureset); +- } ++ guest_common_feature_adjustments(pv_featureset); + + sanitise_featureset(pv_featureset); + cpuid_featureset_to_policy(pv_featureset, p); +@@ -419,9 +447,6 @@ static void __init calculate_hvm_max_policy(void) + for ( i = 0; i < ARRAY_SIZE(hvm_featureset); ++i ) + hvm_featureset[i] &= hvm_featuremask[i]; + +- /* Unconditionally claim to be able to set the hypervisor bit. */ +- __set_bit(X86_FEATURE_HYPERVISOR, hvm_featureset); +- + /* + * Xen can provide an APIC emulation to HVM guests even if the host's APIC + * isn't enabled. +@@ -437,6 +462,13 @@ static void __init calculate_hvm_max_policy(void) + raw_cpuid_policy.basic.sep ) + __set_bit(X86_FEATURE_SEP, hvm_featureset); + ++ /* ++ * If Xen isn't virtualising MSR_SPEC_CTRL for HVM guests because of ++ * administrator choice, hide the feature. ++ */ ++ if ( !boot_cpu_has(X86_FEATURE_SC_MSR_HVM) ) ++ __clear_bit(X86_FEATURE_IBRSB, hvm_featureset); ++ + /* + * With VT-x, some features are only supported by Xen if dedicated + * hardware support is also available. +@@ -450,15 +482,7 @@ static void __init calculate_hvm_max_policy(void) + __clear_bit(X86_FEATURE_XSAVES, hvm_featureset); + } + +- /* On hardware with IBRS/IBPB support, there are further adjustments. */ +- if ( test_bit(X86_FEATURE_IBRSB, hvm_featureset) ) +- { +- /* Offer STIBP unconditionally. It is a nop on non-HT hardware. */ +- __set_bit(X86_FEATURE_STIBP, hvm_featureset); +- +- /* AMD's IBPB is a subset of IBRS/IBPB. */ +- __set_bit(X86_FEATURE_IBPB, hvm_featureset); +- } ++ guest_common_feature_adjustments(hvm_featureset); + + sanitise_featureset(hvm_featureset); + cpuid_featureset_to_policy(hvm_featureset, p); +@@ -601,14 +625,6 @@ void recalculate_cpuid_policy(struct domain *d) + recalculate_xstate(p); + recalculate_misc(p); + +- /* +- * Override STIBP to match IBRS. 
Guests can safely use STIBP +- * functionality on non-HT hardware, but can't necesserily protect +- * themselves from SP2/Spectre/Branch Target Injection if STIBP is hidden +- * on HT-capable hardware. +- */ +- p->feat.stibp = p->feat.ibrsb; +- + for ( i = 0; i < ARRAY_SIZE(p->cache.raw); ++i ) + { + if ( p->cache.subleaf[i].type >= 1 && +diff --git a/xen/arch/x86/debug.c b/xen/arch/x86/debug.c +index 9159f32db4..a500df01ac 100644 +--- a/xen/arch/x86/debug.c ++++ b/xen/arch/x86/debug.c +@@ -98,7 +98,7 @@ dbg_pv_va2mfn(dbgva_t vaddr, struct domain *dp, uint64_t pgd3val) + l2_pgentry_t l2e, *l2t; + l1_pgentry_t l1e, *l1t; + unsigned long cr3 = (pgd3val ? pgd3val : dp->vcpu[0]->arch.cr3); +- mfn_t mfn = maddr_to_mfn(cr3); ++ mfn_t mfn = maddr_to_mfn(cr3_pa(cr3)); + + DBGP2("vaddr:%lx domid:%d cr3:%lx pgd3:%lx\n", vaddr, dp->domain_id, + cr3, pgd3val); +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index 5a4d5c3bfc..2020e0b682 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -107,10 +107,11 @@ static void play_dead(void) + local_irq_disable(); + + /* +- * NOTE: After cpu_exit_clear, per-cpu variables are no longer accessible, +- * as they may be freed at any time. In this case, heap corruption or +- * #PF can occur (when heap debugging is enabled). For example, even +- * printk() can involve tasklet scheduling, which touches per-cpu vars. ++ * NOTE: After cpu_exit_clear, per-cpu variables may no longer accessible, ++ * as they may be freed at any time if offline CPUs don't get parked. In ++ * this case, heap corruption or #PF can occur (when heap debugging is ++ * enabled). For example, even printk() can involve tasklet scheduling, ++ * which touches per-cpu vars. + * + * Consider very carefully when adding code to *dead_idle. Most hypervisor + * subsystems are unsafe to call. 
+@@ -1517,17 +1518,12 @@ void paravirt_ctxt_switch_from(struct vcpu *v) + void paravirt_ctxt_switch_to(struct vcpu *v) + { + root_pgentry_t *root_pgt = this_cpu(root_pgt); +- unsigned long cr4; + + if ( root_pgt ) + root_pgt[root_table_offset(PERDOMAIN_VIRT_START)] = + l4e_from_page(v->domain->arch.perdomain_l3_pg, + __PAGE_HYPERVISOR_RW); + +- cr4 = pv_guest_cr4_to_real_cr4(v); +- if ( unlikely(cr4 != read_cr4()) ) +- write_cr4(cr4); +- + if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) ) + activate_debugregs(v); + +@@ -1640,7 +1636,7 @@ static void __context_switch(void) + if ( cpu_has_xsaves && is_hvm_vcpu(n) ) + set_msr_xss(n->arch.hvm_vcpu.msr_xss); + } +- vcpu_restore_fpu_eager(n); ++ vcpu_restore_fpu_nonlazy(n, false); + nd->arch.ctxt_switch->to(n); + } + +@@ -1693,6 +1689,7 @@ void context_switch(struct vcpu *prev, struct vcpu *next) + + ASSERT(local_irq_is_enabled()); + ++ get_cpu_info()->use_pv_cr3 = false; + get_cpu_info()->xen_cr3 = 0; + + cpumask_copy(&dirty_mask, next->vcpu_dirty_cpumask); +diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c +index 3432a854dd..9a52276866 100644 +--- a/xen/arch/x86/domain_page.c ++++ b/xen/arch/x86/domain_page.c +@@ -51,7 +51,7 @@ static inline struct vcpu *mapcache_current_vcpu(void) + if ( (v = idle_vcpu[smp_processor_id()]) == current ) + sync_local_execstate(); + /* We must now be running on the idle page table. 
*/ +- ASSERT(read_cr3() == __pa(idle_pg_table)); ++ ASSERT(cr3_pa(read_cr3()) == __pa(idle_pg_table)); + } + + return v; +diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c +index 74e9749d89..eefd94eb92 100644 +--- a/xen/arch/x86/domctl.c ++++ b/xen/arch/x86/domctl.c +@@ -226,7 +226,8 @@ static int update_domain_cpuid_info(struct domain *d, + */ + call_policy_changed = (is_hvm_domain(d) && + ((old_7d0 ^ p->feat.raw[0].d) & +- cpufeat_mask(X86_FEATURE_IBRSB))); ++ (cpufeat_mask(X86_FEATURE_IBRSB) | ++ cpufeat_mask(X86_FEATURE_L1D_FLUSH)))); + break; + + case 0xa: +@@ -1227,7 +1228,7 @@ long arch_do_domctl( + if ( _xcr0_accum ) + { + if ( evc->size >= PV_XSAVE_HDR_SIZE + XSTATE_AREA_MIN_SIZE ) +- ret = validate_xstate(_xcr0, _xcr0_accum, ++ ret = validate_xstate(d, _xcr0, _xcr0_accum, + &_xsave_area->xsave_hdr); + } + else if ( !_xcr0 ) +@@ -1251,8 +1252,7 @@ long arch_do_domctl( + vcpu_pause(v); + v->arch.xcr0 = _xcr0; + v->arch.xcr0_accum = _xcr0_accum; +- if ( _xcr0_accum & XSTATE_NONLAZY ) +- v->arch.nonlazy_xstate_used = 1; ++ v->arch.nonlazy_xstate_used = _xcr0_accum & XSTATE_NONLAZY; + compress_xsave_states(v, _xsave_area, + evc->size - PV_XSAVE_HDR_SIZE); + vcpu_unpause(v); +diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c +index f6d7ad1650..797c5d52cc 100644 +--- a/xen/arch/x86/flushtlb.c ++++ b/xen/arch/x86/flushtlb.c +@@ -8,9 +8,12 @@ + */ + + #include <xen/sched.h> ++#include <xen/smp.h> + #include <xen/softirq.h> + #include <asm/flushtlb.h> ++#include <asm/invpcid.h> + #include <asm/page.h> ++#include <asm/pv/domain.h> + + /* Debug builds: Wrap frequently to stress-test the wrap logic. 
*/ + #ifdef NDEBUG +@@ -49,6 +52,8 @@ static u32 pre_flush(void) + raise_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ); + + skip_clocktick: ++ hvm_flush_guest_tlbs(); ++ + return t2; + } + +@@ -69,21 +74,65 @@ static void post_flush(u32 t) + this_cpu(tlbflush_time) = t; + } + +-void write_cr3(unsigned long cr3) ++static void do_tlb_flush(void) ++{ ++ u32 t = pre_flush(); ++ ++ if ( use_invpcid ) ++ invpcid_flush_all(); ++ else ++ { ++ unsigned long cr4 = read_cr4(); ++ ++ write_cr4(cr4 ^ X86_CR4_PGE); ++ write_cr4(cr4); ++ } ++ ++ post_flush(t); ++} ++ ++void switch_cr3_cr4(unsigned long cr3, unsigned long cr4) + { +- unsigned long flags, cr4 = read_cr4(); ++ unsigned long flags, old_cr4; + u32 t; ++ unsigned long old_pcid = cr3_pcid(read_cr3()); + + /* This non-reentrant function is sometimes called in interrupt context. */ + local_irq_save(flags); + + t = pre_flush(); + +- hvm_flush_guest_tlbs(); +- +- write_cr4(cr4 & ~X86_CR4_PGE); +- asm volatile ( "mov %0, %%cr3" : : "r" (cr3) : "memory" ); +- write_cr4(cr4); ++ old_cr4 = read_cr4(); ++ if ( old_cr4 & X86_CR4_PGE ) ++ { ++ /* ++ * X86_CR4_PGE set means PCID is inactive. ++ * We have to purge the TLB via flipping cr4.pge. ++ */ ++ old_cr4 = cr4 & ~X86_CR4_PGE; ++ write_cr4(old_cr4); ++ } ++ else if ( use_invpcid ) ++ /* ++ * Flushing the TLB via INVPCID is necessary only in case PCIDs are ++ * in use, which is true only with INVPCID being available. ++ * Without PCID usage the following write_cr3() will purge the TLB ++ * (we are in the cr4.pge off path) of all entries. ++ * Using invpcid_flush_all_nonglobals() seems to be faster than ++ * invpcid_flush_all(), so use that. ++ */ ++ invpcid_flush_all_nonglobals(); ++ ++ write_cr3(cr3); ++ ++ if ( old_cr4 != cr4 ) ++ write_cr4(cr4); ++ else if ( old_pcid != cr3_pcid(cr3) ) ++ /* ++ * Make sure no TLB entries related to the old PCID created between ++ * flushing the TLB and writing the new %cr3 value remain in the TLB. 
++ */ ++ invpcid_flush_single_context(old_pcid); + + post_flush(t); + +@@ -113,22 +162,32 @@ unsigned int flush_area_local(const void *va, unsigned int flags) + * are various errata surrounding INVLPG usage on superpages, and + * a full flush is in any case not *that* expensive. + */ +- asm volatile ( "invlpg %0" +- : : "m" (*(const char *)(va)) : "memory" ); ++ if ( read_cr4() & X86_CR4_PCIDE ) ++ { ++ unsigned long addr = (unsigned long)va; ++ ++ /* ++ * Flush the addresses for all potential address spaces. ++ * We can't check the current domain for being subject to ++ * XPTI as current might be the idle vcpu while we still have ++ * some XPTI domain TLB entries. ++ * Using invpcid is okay here, as with PCID enabled we always ++ * have global pages disabled. ++ */ ++ invpcid_flush_one(PCID_PV_PRIV, addr); ++ invpcid_flush_one(PCID_PV_USER, addr); ++ if ( !cpu_has_no_xpti ) ++ { ++ invpcid_flush_one(PCID_PV_PRIV | PCID_PV_XPTI, addr); ++ invpcid_flush_one(PCID_PV_USER | PCID_PV_XPTI, addr); ++ } ++ } ++ else ++ asm volatile ( "invlpg %0" ++ : : "m" (*(const char *)(va)) : "memory" ); + } + else +- { +- u32 t = pre_flush(); +- unsigned long cr4 = read_cr4(); +- +- hvm_flush_guest_tlbs(); +- +- write_cr4(cr4 & ~X86_CR4_PGE); +- barrier(); +- write_cr4(cr4); +- +- post_flush(t); +- } ++ do_tlb_flush(); + } + + if ( flags & FLUSH_CACHE ) +@@ -161,5 +220,8 @@ unsigned int flush_area_local(const void *va, unsigned int flags) + + local_irq_restore(irqfl); + ++ if ( flags & FLUSH_ROOT_PGTBL ) ++ get_cpu_info()->root_pgt_changed = true; ++ + return flags; + } +diff --git a/xen/arch/x86/genapic/x2apic.c b/xen/arch/x86/genapic/x2apic.c +index 5fffb317d9..4fb9a2225d 100644 +--- a/xen/arch/x86/genapic/x2apic.c ++++ b/xen/arch/x86/genapic/x2apic.c +@@ -201,18 +201,21 @@ static int update_clusterinfo( + if ( !cluster_cpus_spare ) + cluster_cpus_spare = xzalloc(cpumask_t); + if ( !cluster_cpus_spare || +- !alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) ) ++ 
!cond_alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) ) + err = -ENOMEM; + break; + case CPU_UP_CANCELED: + case CPU_DEAD: ++ case CPU_REMOVE: ++ if ( park_offline_cpus == (action != CPU_REMOVE) ) ++ break; + if ( per_cpu(cluster_cpus, cpu) ) + { + cpumask_clear_cpu(cpu, per_cpu(cluster_cpus, cpu)); + if ( cpumask_empty(per_cpu(cluster_cpus, cpu)) ) +- xfree(per_cpu(cluster_cpus, cpu)); ++ XFREE(per_cpu(cluster_cpus, cpu)); + } +- free_cpumask_var(per_cpu(scratch_mask, cpu)); ++ FREE_CPUMASK_VAR(per_cpu(scratch_mask, cpu)); + break; + } + +diff --git a/xen/arch/x86/hpet.c b/xen/arch/x86/hpet.c +index 8229c635e4..f18cbbd55a 100644 +--- a/xen/arch/x86/hpet.c ++++ b/xen/arch/x86/hpet.c +@@ -509,6 +509,8 @@ static void hpet_attach_channel(unsigned int cpu, + static void hpet_detach_channel(unsigned int cpu, + struct hpet_event_channel *ch) + { ++ unsigned int next; ++ + spin_lock_irq(&ch->lock); + + ASSERT(ch == per_cpu(cpu_bc_channel, cpu)); +@@ -517,7 +519,7 @@ static void hpet_detach_channel(unsigned int cpu, + + if ( cpu != ch->cpu ) + spin_unlock_irq(&ch->lock); +- else if ( cpumask_empty(ch->cpumask) ) ++ else if ( (next = cpumask_first(ch->cpumask)) >= nr_cpu_ids ) + { + ch->cpu = -1; + clear_bit(HPET_EVT_USED_BIT, &ch->flags); +@@ -525,7 +527,7 @@ static void hpet_detach_channel(unsigned int cpu, + } + else + { +- ch->cpu = cpumask_first(ch->cpumask); ++ ch->cpu = next; + set_channel_irq_affinity(ch); + local_irq_enable(); + } +diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c +index b282089e03..03db6b3d31 100644 +--- a/xen/arch/x86/hvm/emulate.c ++++ b/xen/arch/x86/hvm/emulate.c +@@ -1900,6 +1900,7 @@ static int hvmemul_get_fpu( + * masking of all exceptions by FNSTENV.) 
+ */ + save_fpu_enable(); ++ curr->fpu_initialised = true; + curr->fpu_dirtied = true; + if ( (fpu_ctxt->fcw & 0x3f) != 0x3f ) + { +@@ -1991,13 +1992,20 @@ static void hvmemul_put_fpu( + if ( backout == X86EMUL_FPU_fpu ) + { + /* +- * To back out changes to the register file simply adjust state such +- * that upon next FPU insn use by the guest we'll reload the state +- * saved (or freshly loaded) by hvmemul_get_fpu(). ++ * To back out changes to the register file ++ * - in fully eager mode, restore original state immediately, ++ * - in lazy mode, simply adjust state such that upon next FPU insn ++ * use by the guest we'll reload the state saved (or freshly loaded) ++ * by hvmemul_get_fpu(). + */ +- curr->fpu_dirtied = false; +- stts(); +- hvm_funcs.fpu_leave(curr); ++ if ( curr->arch.fully_eager_fpu ) ++ vcpu_restore_fpu_nonlazy(curr, false); ++ else ++ { ++ curr->fpu_dirtied = false; ++ stts(); ++ hvm_funcs.fpu_leave(curr); ++ } + } + } + +@@ -2113,22 +2121,20 @@ static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt, + + vio->mmio_retry = 0; + +- switch ( rc = x86_emulate(&hvmemul_ctxt->ctxt, ops) ) ++ rc = x86_emulate(&hvmemul_ctxt->ctxt, ops); ++ if ( rc == X86EMUL_OKAY && vio->mmio_retry ) ++ rc = X86EMUL_RETRY; ++ ++ if ( !hvm_vcpu_io_need_completion(vio) ) + { +- case X86EMUL_OKAY: +- if ( vio->mmio_retry ) +- rc = X86EMUL_RETRY; +- /* fall through */ +- default: + vio->mmio_cache_count = 0; + vio->mmio_insn_bytes = 0; +- break; +- +- case X86EMUL_RETRY: ++ } ++ else ++ { + BUILD_BUG_ON(sizeof(vio->mmio_insn) < sizeof(hvmemul_ctxt->insn_buf)); + vio->mmio_insn_bytes = hvmemul_ctxt->insn_buf_bytes; + memcpy(vio->mmio_insn, hvmemul_ctxt->insn_buf, vio->mmio_insn_bytes); +- break; + } + + if ( hvmemul_ctxt->ctxt.retire.singlestep ) +diff --git a/xen/arch/x86/hvm/hpet.c b/xen/arch/x86/hvm/hpet.c +index f7aed7f69e..28377091ca 100644 +--- a/xen/arch/x86/hvm/hpet.c ++++ b/xen/arch/x86/hvm/hpet.c +@@ -264,13 +264,20 @@ static void 
hpet_set_timer(HPETState *h, unsigned int tn, + diff = (timer_is_32bit(h, tn) && (-diff > HPET_TINY_TIME_SPAN)) + ? (uint32_t)diff : 0; + ++ destroy_periodic_time(&h->pt[tn]); + if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) ) ++ { + /* if LegacyReplacementRoute bit is set, HPET specification requires + timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC, + timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */ + irq = (tn == 0) ? 0 : 8; ++ h->pt[tn].source = PTSRC_isa; ++ } + else ++ { + irq = timer_int_route(h, tn); ++ h->pt[tn].source = PTSRC_ioapic; ++ } + + /* + * diff is the time from now when the timer should fire, for a periodic +diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c +index 18adec5ad8..8e237eb1ac 100644 +--- a/xen/arch/x86/hvm/hvm.c ++++ b/xen/arch/x86/hvm/hvm.c +@@ -895,6 +895,9 @@ const char *hvm_efer_valid(const struct vcpu *v, uint64_t value, + else + p = &host_cpuid_policy; + ++ if ( value & ~EFER_KNOWN_MASK ) ++ return "Unknown bits set"; ++ + if ( (value & EFER_SCE) && !p->extd.syscall ) + return "SCE without feature"; + +@@ -1258,7 +1261,7 @@ static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h) + ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur]; + h->cur += desc->length; + +- err = validate_xstate(ctxt->xcr0, ctxt->xcr0_accum, ++ err = validate_xstate(d, ctxt->xcr0, ctxt->xcr0_accum, + (const void *)&ctxt->save_area.xsave_hdr); + if ( err ) + { +@@ -1313,8 +1316,7 @@ static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h) + + v->arch.xcr0 = ctxt->xcr0; + v->arch.xcr0_accum = ctxt->xcr0_accum; +- if ( ctxt->xcr0_accum & XSTATE_NONLAZY ) +- v->arch.nonlazy_xstate_used = 1; ++ v->arch.nonlazy_xstate_used = ctxt->xcr0_accum & XSTATE_NONLAZY; + compress_xsave_states(v, &ctxt->save_area, + size - offsetof(struct hvm_hw_cpu_xsave, save_area)); + +diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c +index d5afe20cc8..25b2445429 100644 +--- 
a/xen/arch/x86/hvm/ioreq.c ++++ b/xen/arch/x86/hvm/ioreq.c +@@ -87,14 +87,17 @@ static void hvm_io_assist(struct hvm_ioreq_vcpu *sv, uint64_t data) + + static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p) + { ++ unsigned int prev_state = STATE_IOREQ_NONE; ++ + while ( sv->pending ) + { + unsigned int state = p->state; + +- rmb(); +- switch ( state ) ++ smp_rmb(); ++ ++ recheck: ++ if ( unlikely(state == STATE_IOREQ_NONE) ) + { +- case STATE_IOREQ_NONE: + /* + * The only reason we should see this case is when an + * emulator is dying and it races with an I/O being +@@ -102,14 +105,30 @@ static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p) + */ + hvm_io_assist(sv, ~0ul); + break; ++ } ++ ++ if ( unlikely(state < prev_state) ) ++ { ++ gdprintk(XENLOG_ERR, "Weird HVM ioreq state transition %u -> %u\n", ++ prev_state, state); ++ sv->pending = false; ++ domain_crash(sv->vcpu->domain); ++ return false; /* bail */ ++ } ++ ++ switch ( prev_state = state ) ++ { + case STATE_IORESP_READY: /* IORESP_READY -> NONE */ + p->state = STATE_IOREQ_NONE; + hvm_io_assist(sv, p->data); + break; + case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */ + case STATE_IOREQ_INPROCESS: +- wait_on_xen_event_channel(sv->ioreq_evtchn, p->state != state); +- break; ++ wait_on_xen_event_channel(sv->ioreq_evtchn, ++ ({ state = p->state; ++ smp_rmb(); ++ state != prev_state; })); ++ goto recheck; + default: + gdprintk(XENLOG_ERR, "Weird HVM iorequest state %u\n", state); + sv->pending = false; +diff --git a/xen/arch/x86/hvm/irq.c b/xen/arch/x86/hvm/irq.c +index f528e2d081..c85d004402 100644 +--- a/xen/arch/x86/hvm/irq.c ++++ b/xen/arch/x86/hvm/irq.c +@@ -41,6 +41,26 @@ static void assert_gsi(struct domain *d, unsigned ioapic_gsi) + vioapic_irq_positive_edge(d, ioapic_gsi); + } + ++int hvm_ioapic_assert(struct domain *d, unsigned int gsi, bool level) ++{ ++ struct hvm_irq *hvm_irq = hvm_domain_irq(d); ++ int vector; ++ ++ if ( gsi >= hvm_irq->nr_gsis ) ++ { 
++ ASSERT_UNREACHABLE(); ++ return -1; ++ } ++ ++ spin_lock(&d->arch.hvm_domain.irq_lock); ++ if ( !level || hvm_irq->gsi_assert_count[gsi]++ == 0 ) ++ assert_gsi(d, gsi); ++ vector = vioapic_get_vector(d, gsi); ++ spin_unlock(&d->arch.hvm_domain.irq_lock); ++ ++ return vector; ++} ++ + static void assert_irq(struct domain *d, unsigned ioapic_gsi, unsigned pic_irq) + { + assert_gsi(d, ioapic_gsi); +diff --git a/xen/arch/x86/hvm/svm/entry.S b/xen/arch/x86/hvm/svm/entry.S +index bf092fe071..5e7c080c7c 100644 +--- a/xen/arch/x86/hvm/svm/entry.S ++++ b/xen/arch/x86/hvm/svm/entry.S +@@ -83,7 +83,7 @@ UNLIKELY_END(svm_trace) + mov VCPUMSR_spec_ctrl_raw(%rax), %eax + + /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ +- SPEC_CTRL_EXIT_TO_GUEST /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ ++ SPEC_CTRL_EXIT_TO_HVM /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ + + pop %r15 + pop %r14 +@@ -108,7 +108,7 @@ UNLIKELY_END(svm_trace) + + GET_CURRENT(bx) + +- SPEC_CTRL_ENTRY_FROM_VMEXIT /* Req: b=curr %rsp=regs/cpuinfo, Clob: acd */ ++ SPEC_CTRL_ENTRY_FROM_HVM /* Req: b=curr %rsp=regs/cpuinfo, Clob: acd */ + /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. 
*/ + + mov VCPU_svm_vmcb(%rbx),%rcx +diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c +index dedec5752d..aad3655855 100644 +--- a/xen/arch/x86/hvm/svm/svm.c ++++ b/xen/arch/x86/hvm/svm/svm.c +@@ -546,7 +546,10 @@ void svm_update_guest_cr(struct vcpu *v, unsigned int cr) + if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) ) + { + if ( v != current ) +- hw_cr0_mask |= X86_CR0_TS; ++ { ++ if ( !v->arch.fully_eager_fpu ) ++ hw_cr0_mask |= X86_CR0_TS; ++ } + else if ( vmcb_get_cr0(vmcb) & X86_CR0_TS ) + svm_fpu_enter(v); + } +@@ -1033,7 +1036,8 @@ static void svm_ctxt_switch_from(struct vcpu *v) + if ( unlikely((read_efer() & EFER_SVME) == 0) ) + return; + +- svm_fpu_leave(v); ++ if ( !v->arch.fully_eager_fpu ) ++ svm_fpu_leave(v); + + svm_save_dr(v); + svm_lwp_save(v); +@@ -1046,6 +1050,7 @@ static void svm_ctxt_switch_from(struct vcpu *v) + set_ist(&idt_tables[cpu][TRAP_double_fault], IST_DF); + set_ist(&idt_tables[cpu][TRAP_nmi], IST_NMI); + set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE); ++ set_ist(&idt_tables[cpu][TRAP_debug], IST_DB); + } + + static void svm_ctxt_switch_to(struct vcpu *v) +@@ -1067,6 +1072,7 @@ static void svm_ctxt_switch_to(struct vcpu *v) + set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE); + set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE); + set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE); ++ set_ist(&idt_tables[cpu][TRAP_debug], IST_NONE); + + svm_restore_dr(v); + +@@ -1361,24 +1367,18 @@ static void svm_inject_event(const struct x86_event *event) + * Xen must emulate enough of the event injection to be sure that a + * further fault shouldn't occur during delivery. This covers the fact + * that hardware doesn't perform DPL checking on injection. +- * +- * Also, it accounts for proper positioning of %rip for an event with trap +- * semantics (where %rip should point after the instruction) which suffers +- * a fault during injection (at which point %rip should point at the +- * instruction). 
+ */ + if ( event->type == X86_EVENTTYPE_PRI_SW_EXCEPTION || +- (!cpu_has_svm_nrips && (event->type == X86_EVENTTYPE_SW_INTERRUPT || +- event->type == X86_EVENTTYPE_SW_EXCEPTION)) ) ++ (!cpu_has_svm_nrips && (event->type >= X86_EVENTTYPE_SW_INTERRUPT)) ) + svm_emul_swint_injection(&_event); + +- switch ( _event.vector ) ++ switch ( _event.vector | -(_event.type == X86_EVENTTYPE_SW_INTERRUPT) ) + { + case TRAP_debug: + if ( regs->eflags & X86_EFLAGS_TF ) + { + __restore_debug_registers(vmcb, curr); +- vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | 0x4000); ++ vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | DR_STEP); + } + /* fall through */ + case TRAP_int3: +@@ -1388,6 +1388,13 @@ static void svm_inject_event(const struct x86_event *event) + domain_pause_for_debugger(); + return; + } ++ break; ++ ++ case TRAP_page_fault: ++ ASSERT(_event.type == X86_EVENTTYPE_HW_EXCEPTION); ++ curr->arch.hvm_vcpu.guest_cr[2] = _event.cr2; ++ vmcb_set_cr2(vmcb, _event.cr2); ++ break; + } + + if ( unlikely(eventinj.fields.v) && +@@ -1410,13 +1417,9 @@ static void svm_inject_event(const struct x86_event *event) + * icebp, software events with trap semantics need emulating, so %rip in + * the trap frame points after the instruction. + * +- * The x86 emulator (if requested by the x86_swint_emulate_* choice) will +- * have performed checks such as presence/dpl/etc and believes that the +- * event injection will succeed without faulting. +- * +- * The x86 emulator will always provide fault semantics for software +- * events, with _trap.insn_len set appropriately. If the injection +- * requires emulation, move %rip forwards at this point. ++ * svm_emul_swint_injection() has already confirmed that events with trap ++ * semantics won't fault on injection. Position %rip/NextRIP suitably, ++ * and restrict the event type to what hardware will tolerate. 
+ */ + switch ( _event.type ) + { +@@ -1473,16 +1476,12 @@ static void svm_inject_event(const struct x86_event *event) + eventinj.fields.errorcode == (uint16_t)eventinj.fields.errorcode); + vmcb->eventinj = eventinj; + +- if ( _event.vector == TRAP_page_fault ) +- { +- curr->arch.hvm_vcpu.guest_cr[2] = _event.cr2; +- vmcb_set_cr2(vmcb, _event.cr2); +- HVMTRACE_LONG_2D(PF_INJECT, _event.error_code, TRC_PAR_LONG(_event.cr2)); +- } ++ if ( _event.vector == TRAP_page_fault && ++ _event.type == X86_EVENTTYPE_HW_EXCEPTION ) ++ HVMTRACE_LONG_2D(PF_INJECT, _event.error_code, ++ TRC_PAR_LONG(_event.cr2)); + else +- { + HVMTRACE_2D(INJ_EXC, _event.vector, _event.error_code); +- } + } + + static int svm_event_pending(struct vcpu *v) +@@ -1836,6 +1835,25 @@ static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content) + struct vcpu *v = current; + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + ++ switch ( msr ) ++ { ++ /* ++ * Sync not needed while the cross-vendor logic is in unilateral effect. 
++ case MSR_IA32_SYSENTER_CS: ++ case MSR_IA32_SYSENTER_ESP: ++ case MSR_IA32_SYSENTER_EIP: ++ */ ++ case MSR_STAR: ++ case MSR_LSTAR: ++ case MSR_CSTAR: ++ case MSR_SYSCALL_MASK: ++ case MSR_FS_BASE: ++ case MSR_GS_BASE: ++ case MSR_SHADOW_GS_BASE: ++ svm_sync_vmcb(v); ++ break; ++ } ++ + switch ( msr ) + { + case MSR_IA32_SYSENTER_CS: +@@ -1848,6 +1866,34 @@ static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content) + *msr_content = v->arch.hvm_svm.guest_sysenter_eip; + break; + ++ case MSR_STAR: ++ *msr_content = vmcb->star; ++ break; ++ ++ case MSR_LSTAR: ++ *msr_content = vmcb->lstar; ++ break; ++ ++ case MSR_CSTAR: ++ *msr_content = vmcb->cstar; ++ break; ++ ++ case MSR_SYSCALL_MASK: ++ *msr_content = vmcb->sfmask; ++ break; ++ ++ case MSR_FS_BASE: ++ *msr_content = vmcb->fs.base; ++ break; ++ ++ case MSR_GS_BASE: ++ *msr_content = vmcb->gs.base; ++ break; ++ ++ case MSR_SHADOW_GS_BASE: ++ *msr_content = vmcb->kerngsbase; ++ break; ++ + case MSR_IA32_MCx_MISC(4): /* Threshold register */ + case MSR_F10_MC4_MISC1 ... 
MSR_F10_MC4_MISC3: + /* +@@ -1976,32 +2022,81 @@ static int svm_msr_write_intercept(unsigned int msr, uint64_t msr_content) + int ret, result = X86EMUL_OKAY; + struct vcpu *v = current; + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; +- int sync = 0; ++ bool sync = false; + + switch ( msr ) + { + case MSR_IA32_SYSENTER_CS: + case MSR_IA32_SYSENTER_ESP: + case MSR_IA32_SYSENTER_EIP: +- sync = 1; +- break; +- default: ++ case MSR_STAR: ++ case MSR_LSTAR: ++ case MSR_CSTAR: ++ case MSR_SYSCALL_MASK: ++ case MSR_FS_BASE: ++ case MSR_GS_BASE: ++ case MSR_SHADOW_GS_BASE: ++ sync = true; + break; + } + + if ( sync ) +- svm_sync_vmcb(v); ++ svm_sync_vmcb(v); + + switch ( msr ) + { ++ case MSR_IA32_SYSENTER_ESP: ++ case MSR_IA32_SYSENTER_EIP: ++ case MSR_LSTAR: ++ case MSR_CSTAR: ++ case MSR_FS_BASE: ++ case MSR_GS_BASE: ++ case MSR_SHADOW_GS_BASE: ++ if ( !is_canonical_address(msr_content) ) ++ goto gpf; ++ ++ switch ( msr ) ++ { ++ case MSR_IA32_SYSENTER_ESP: ++ vmcb->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp = msr_content; ++ break; ++ ++ case MSR_IA32_SYSENTER_EIP: ++ vmcb->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip = msr_content; ++ break; ++ ++ case MSR_LSTAR: ++ vmcb->lstar = msr_content; ++ break; ++ ++ case MSR_CSTAR: ++ vmcb->cstar = msr_content; ++ break; ++ ++ case MSR_FS_BASE: ++ vmcb->fs.base = msr_content; ++ break; ++ ++ case MSR_GS_BASE: ++ vmcb->gs.base = msr_content; ++ break; ++ ++ case MSR_SHADOW_GS_BASE: ++ vmcb->kerngsbase = msr_content; ++ break; ++ } ++ break; ++ + case MSR_IA32_SYSENTER_CS: + vmcb->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs = msr_content; + break; +- case MSR_IA32_SYSENTER_ESP: +- vmcb->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp = msr_content; ++ ++ case MSR_STAR: ++ vmcb->star = msr_content; + break; +- case MSR_IA32_SYSENTER_EIP: +- vmcb->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip = msr_content; ++ ++ case MSR_SYSCALL_MASK: ++ vmcb->sfmask = msr_content; + break; + + case MSR_IA32_DEBUGCTLMSR: 
+diff --git a/xen/arch/x86/hvm/svm/svmdebug.c b/xen/arch/x86/hvm/svm/svmdebug.c +index 89ef2db932..b5b946aa94 100644 +--- a/xen/arch/x86/hvm/svm/svmdebug.c ++++ b/xen/arch/x86/hvm/svm/svmdebug.c +@@ -131,9 +131,8 @@ bool svm_vmcb_isvalid(const char *from, const struct vmcb_struct *vmcb, + PRINTF("DR7: bits [63:32] are not zero (%#"PRIx64")\n", + vmcb_get_dr7(vmcb)); + +- if ( efer & ~(EFER_SCE | EFER_LME | EFER_LMA | EFER_NX | EFER_SVME | +- EFER_LMSLE | EFER_FFXSE) ) +- PRINTF("EFER: undefined bits are not zero (%#"PRIx64")\n", efer); ++ if ( efer & ~EFER_KNOWN_MASK ) ++ PRINTF("EFER: unknown bits are not zero (%#"PRIx64")\n", efer); + + if ( hvm_efer_valid(v, efer, -1) ) + PRINTF("EFER: %s (%"PRIx64")\n", hvm_efer_valid(v, efer, -1), efer); +diff --git a/xen/arch/x86/hvm/svm/vmcb.c b/xen/arch/x86/hvm/svm/vmcb.c +index 997e7597e0..612ced2f0d 100644 +--- a/xen/arch/x86/hvm/svm/vmcb.c ++++ b/xen/arch/x86/hvm/svm/vmcb.c +@@ -178,8 +178,8 @@ static int construct_vmcb(struct vcpu *v) + paging_update_paging_modes(v); + + vmcb->_exception_intercepts = +- HVM_TRAP_MASK +- | (1U << TRAP_no_device); ++ HVM_TRAP_MASK | ++ (v->arch.fully_eager_fpu ? 0 : (1U << TRAP_no_device)); + + if ( paging_mode_hap(v->domain) ) + { +diff --git a/xen/arch/x86/hvm/viridian.c b/xen/arch/x86/hvm/viridian.c +index f0fa59d7d5..b02a70d086 100644 +--- a/xen/arch/x86/hvm/viridian.c ++++ b/xen/arch/x86/hvm/viridian.c +@@ -245,7 +245,7 @@ void cpuid_viridian_leaves(const struct vcpu *v, uint32_t leaf, + }; + union { + HV_PARTITION_PRIVILEGE_MASK mask; +- uint32_t lo, hi; ++ struct { uint32_t lo, hi; }; + } u; + + if ( !(viridian_feature_mask(d) & HVMPV_no_freq) ) +@@ -966,12 +966,10 @@ int viridian_hypercall(struct cpu_user_regs *regs) + gprintk(XENLOG_WARNING, "unimplemented hypercall %04x\n", + input.call_code); + /* Fallthrough. 
*/ +- case HvGetPartitionId: + case HvExtCallQueryCapabilities: + /* +- * These hypercalls seem to be erroneously issued by Windows +- * despite neither AccessPartitionId nor EnableExtendedHypercalls +- * being set in CPUID leaf 2. ++ * This hypercall seems to be erroneously issued by Windows ++ * despite EnableExtendedHypercalls not being set in CPUID leaf 2. + * Given that return a status of 'invalid code' has not so far + * caused any problems it's not worth logging. + */ +diff --git a/xen/arch/x86/hvm/vmx/entry.S b/xen/arch/x86/hvm/vmx/entry.S +index e750544b4b..c6504e3207 100644 +--- a/xen/arch/x86/hvm/vmx/entry.S ++++ b/xen/arch/x86/hvm/vmx/entry.S +@@ -38,9 +38,18 @@ ENTRY(vmx_asm_vmexit_handler) + movb $1,VCPU_vmx_launched(%rbx) + mov %rax,VCPU_hvm_guest_cr2(%rbx) + +- SPEC_CTRL_ENTRY_FROM_VMEXIT /* Req: b=curr %rsp=regs/cpuinfo, Clob: acd */ ++ SPEC_CTRL_ENTRY_FROM_HVM /* Req: b=curr %rsp=regs/cpuinfo, Clob: acd */ + /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ + ++ /* Hardware clears MSR_DEBUGCTL on VMExit. Reinstate it if debugging Xen. */ ++ .macro restore_lbr ++ mov $IA32_DEBUGCTLMSR_LBR, %eax ++ mov $MSR_IA32_DEBUGCTLMSR, %ecx ++ xor %edx, %edx ++ wrmsr ++ .endm ++ ALTERNATIVE __stringify(ASM_NOP14), restore_lbr, X86_FEATURE_XEN_LBR ++ + mov %rsp,%rdi + call vmx_vmexit_handler + +@@ -76,7 +85,7 @@ UNLIKELY_END(realmode) + mov VCPUMSR_spec_ctrl_raw(%rax), %eax + + /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. 
*/ +- SPEC_CTRL_EXIT_TO_GUEST /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ ++ SPEC_CTRL_EXIT_TO_HVM /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ + + mov VCPU_hvm_guest_cr2(%rbx),%rax + +diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c +index a642118eab..2b223a16fe 100644 +--- a/xen/arch/x86/hvm/vmx/vmcs.c ++++ b/xen/arch/x86/hvm/vmx/vmcs.c +@@ -38,6 +38,7 @@ + #include <asm/flushtlb.h> + #include <asm/monitor.h> + #include <asm/shadow.h> ++#include <asm/spec_ctrl.h> + #include <asm/tboot.h> + #include <asm/apic.h> + +@@ -999,6 +1000,7 @@ static int construct_vmcs(struct vcpu *v) + unsigned long sysenter_eip; + u32 vmexit_ctl = vmx_vmexit_control; + u32 vmentry_ctl = vmx_vmentry_control; ++ int rc = 0; + + vmx_vmcs_enter(v); + +@@ -1086,8 +1088,8 @@ static int construct_vmcs(struct vcpu *v) + + if ( msr_bitmap == NULL ) + { +- vmx_vmcs_exit(v); +- return -ENOMEM; ++ rc = -ENOMEM; ++ goto out; + } + + memset(msr_bitmap, ~0, PAGE_SIZE); +@@ -1146,7 +1148,9 @@ static int construct_vmcs(struct vcpu *v) + __vmwrite(HOST_GS_BASE, 0); + + /* Host control registers. */ +- v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS; ++ v->arch.hvm_vmx.host_cr0 = read_cr0() & ~X86_CR0_TS; ++ if ( !v->arch.fully_eager_fpu ) ++ v->arch.hvm_vmx.host_cr0 |= X86_CR0_TS; + __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0); + __vmwrite(HOST_CR4, mmu_cr4_features); + +@@ -1226,7 +1230,7 @@ static int construct_vmcs(struct vcpu *v) + + v->arch.hvm_vmx.exception_bitmap = HVM_TRAP_MASK + | (paging_mode_hap(d) ? 0 : (1U << TRAP_page_fault)) +- | (1U << TRAP_no_device); ++ | (v->arch.fully_eager_fpu ? 
0 : (1U << TRAP_no_device)); + vmx_update_exception_bitmap(v); + + v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_PE | X86_CR0_ET; +@@ -1269,141 +1273,197 @@ static int construct_vmcs(struct vcpu *v) + if ( cpu_has_vmx_tsc_scaling ) + __vmwrite(TSC_MULTIPLIER, d->arch.hvm_domain.tsc_scaling_ratio); + +- vmx_vmcs_exit(v); +- + /* will update HOST & GUEST_CR3 as reqd */ + paging_update_paging_modes(v); + + vmx_vlapic_msr_changed(v); + +- return 0; ++ if ( opt_l1d_flush && paging_mode_hap(d) ) ++ rc = vmx_add_msr(v, MSR_FLUSH_CMD, FLUSH_CMD_L1D, ++ VMX_MSR_GUEST_LOADONLY); ++ ++ out: ++ vmx_vmcs_exit(v); ++ ++ return rc; + } + +-static int vmx_msr_entry_key_cmp(const void *key, const void *elt) ++/* ++ * Search an MSR list looking for an MSR entry, or the slot in which it should ++ * live (to keep the data sorted) if an entry is not found. ++ * ++ * The return pointer is guaranteed to be bounded by start and end. However, ++ * it may point at end, and may be invalid for the caller to dereference. ++ */ ++static struct vmx_msr_entry *locate_msr_entry( ++ struct vmx_msr_entry *start, struct vmx_msr_entry *end, uint32_t msr) + { +- const u32 *msr = key; +- const struct vmx_msr_entry *entry = elt; ++ while ( start < end ) ++ { ++ struct vmx_msr_entry *mid = start + (end - start) / 2; + +- if ( *msr > entry->index ) +- return 1; +- if ( *msr < entry->index ) +- return -1; ++ if ( msr < mid->index ) ++ end = mid; ++ else if ( msr > mid->index ) ++ start = mid + 1; ++ else ++ return mid; ++ } + +- return 0; ++ return start; + } + +-struct vmx_msr_entry *vmx_find_msr(u32 msr, int type) ++struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr, ++ enum vmx_msr_list_type type) + { +- struct vcpu *curr = current; +- unsigned int msr_count; +- struct vmx_msr_entry *msr_area; ++ const struct arch_vmx_struct *vmx = &v->arch.hvm_vmx; ++ struct vmx_msr_entry *start = NULL, *ent, *end; ++ unsigned int substart = 0, subend = vmx->msr_save_count; ++ unsigned int total = 
vmx->msr_load_count; + +- if ( type == VMX_GUEST_MSR ) +- { +- msr_count = curr->arch.hvm_vmx.msr_count; +- msr_area = curr->arch.hvm_vmx.msr_area; +- } +- else ++ ASSERT(v == current || !vcpu_runnable(v)); ++ ++ switch ( type ) + { +- ASSERT(type == VMX_HOST_MSR); +- msr_count = curr->arch.hvm_vmx.host_msr_count; +- msr_area = curr->arch.hvm_vmx.host_msr_area; ++ case VMX_MSR_HOST: ++ start = vmx->host_msr_area; ++ subend = vmx->host_msr_count; ++ total = subend; ++ break; ++ ++ case VMX_MSR_GUEST: ++ start = vmx->msr_area; ++ break; ++ ++ case VMX_MSR_GUEST_LOADONLY: ++ start = vmx->msr_area; ++ substart = subend; ++ subend = total; ++ break; ++ ++ default: ++ ASSERT_UNREACHABLE(); + } + +- if ( msr_area == NULL ) ++ if ( !start ) + return NULL; + +- return bsearch(&msr, msr_area, msr_count, sizeof(struct vmx_msr_entry), +- vmx_msr_entry_key_cmp); ++ end = start + total; ++ ent = locate_msr_entry(start + substart, start + subend, msr); ++ ++ return ((ent < end) && (ent->index == msr)) ? 
ent : NULL; + } + +-int vmx_read_guest_msr(u32 msr, u64 *val) ++int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val, ++ enum vmx_msr_list_type type) + { +- struct vmx_msr_entry *ent; ++ struct arch_vmx_struct *vmx = &v->arch.hvm_vmx; ++ struct vmx_msr_entry **ptr, *start = NULL, *ent, *end; ++ unsigned int substart, subend, total; ++ int rc; ++ ++ ASSERT(v == current || !vcpu_runnable(v)); + +- if ( (ent = vmx_find_msr(msr, VMX_GUEST_MSR)) != NULL ) ++ switch ( type ) + { +- *val = ent->data; +- return 0; +- } ++ case VMX_MSR_HOST: ++ ptr = &vmx->host_msr_area; ++ substart = 0; ++ subend = vmx->host_msr_count; ++ total = subend; ++ break; + +- return -ESRCH; +-} ++ case VMX_MSR_GUEST: ++ ptr = &vmx->msr_area; ++ substart = 0; ++ subend = vmx->msr_save_count; ++ total = vmx->msr_load_count; ++ break; + +-int vmx_write_guest_msr(u32 msr, u64 val) +-{ +- struct vmx_msr_entry *ent; ++ case VMX_MSR_GUEST_LOADONLY: ++ ptr = &vmx->msr_area; ++ substart = vmx->msr_save_count; ++ subend = vmx->msr_load_count; ++ total = subend; ++ break; + +- if ( (ent = vmx_find_msr(msr, VMX_GUEST_MSR)) != NULL ) +- { +- ent->data = val; +- return 0; ++ default: ++ ASSERT_UNREACHABLE(); ++ return -EINVAL; + } + +- return -ESRCH; +-} +- +-int vmx_add_msr(u32 msr, int type) +-{ +- struct vcpu *curr = current; +- unsigned int idx, *msr_count; +- struct vmx_msr_entry **msr_area, *msr_area_elem; ++ vmx_vmcs_enter(v); + +- if ( type == VMX_GUEST_MSR ) ++ /* Allocate memory on first use. 
*/ ++ if ( unlikely(!*ptr) ) + { +- msr_count = &curr->arch.hvm_vmx.msr_count; +- msr_area = &curr->arch.hvm_vmx.msr_area; +- } +- else +- { +- ASSERT(type == VMX_HOST_MSR); +- msr_count = &curr->arch.hvm_vmx.host_msr_count; +- msr_area = &curr->arch.hvm_vmx.host_msr_area; +- } ++ paddr_t addr; + +- if ( *msr_area == NULL ) +- { +- if ( (*msr_area = alloc_xenheap_page()) == NULL ) +- return -ENOMEM; ++ if ( (*ptr = alloc_xenheap_page()) == NULL ) ++ { ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ addr = virt_to_maddr(*ptr); + +- if ( type == VMX_GUEST_MSR ) ++ switch ( type ) + { +- __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(*msr_area)); +- __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(*msr_area)); ++ case VMX_MSR_HOST: ++ __vmwrite(VM_EXIT_MSR_LOAD_ADDR, addr); ++ break; ++ ++ case VMX_MSR_GUEST: ++ case VMX_MSR_GUEST_LOADONLY: ++ __vmwrite(VM_EXIT_MSR_STORE_ADDR, addr); ++ __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, addr); ++ break; + } +- else +- __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(*msr_area)); + } + +- for ( idx = 0; idx < *msr_count && (*msr_area)[idx].index <= msr; idx++ ) +- if ( (*msr_area)[idx].index == msr ) +- return 0; ++ start = *ptr; ++ end = start + total; ++ ent = locate_msr_entry(start + substart, start + subend, msr); + +- if ( *msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) ) +- return -ENOSPC; ++ if ( (ent < end) && (ent->index == msr) ) ++ goto found; + +- memmove(*msr_area + idx + 1, *msr_area + idx, +- sizeof(*msr_area_elem) * (*msr_count - idx)); ++ /* If there isn't an existing entry for msr, insert room for one. 
*/ ++ if ( total == (PAGE_SIZE / sizeof(*ent)) ) ++ { ++ rc = -ENOSPC; ++ goto out; ++ } + +- msr_area_elem = *msr_area + idx; +- msr_area_elem->index = msr; +- msr_area_elem->mbz = 0; ++ memmove(ent + 1, ent, sizeof(*ent) * (end - ent)); + +- ++*msr_count; ++ ent->index = msr; ++ ent->mbz = 0; + +- if ( type == VMX_GUEST_MSR ) ++ switch ( type ) + { +- msr_area_elem->data = 0; +- __vmwrite(VM_EXIT_MSR_STORE_COUNT, *msr_count); +- __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, *msr_count); +- } +- else +- { +- rdmsrl(msr, msr_area_elem->data); +- __vmwrite(VM_EXIT_MSR_LOAD_COUNT, *msr_count); ++ case VMX_MSR_HOST: ++ __vmwrite(VM_EXIT_MSR_LOAD_COUNT, ++vmx->host_msr_count); ++ break; ++ ++ case VMX_MSR_GUEST: ++ __vmwrite(VM_EXIT_MSR_STORE_COUNT, ++vmx->msr_save_count); ++ ++ /* Fallthrough */ ++ case VMX_MSR_GUEST_LOADONLY: ++ __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, ++vmx->msr_load_count); ++ break; + } + +- return 0; ++ /* Set the msr's value. */ ++ found: ++ ent->data = val; ++ rc = 0; ++ ++ out: ++ vmx_vmcs_exit(v); ++ ++ return rc; + } + + void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector) +@@ -1784,10 +1844,7 @@ void vmcs_dump_vcpu(struct vcpu *v) + vmentry_ctl = vmr32(VM_ENTRY_CONTROLS), + vmexit_ctl = vmr32(VM_EXIT_CONTROLS); + cr4 = vmr(GUEST_CR4); +- +- /* EFER.LMA is read as zero, and is loaded from vmentry_ctl on entry. 
*/ +- BUILD_BUG_ON(VM_ENTRY_IA32E_MODE << 1 != EFER_LMA); +- efer = vmr(GUEST_EFER) | ((vmentry_ctl & VM_ENTRY_IA32E_MODE) << 1); ++ efer = vmr(GUEST_EFER); + + printk("*** Guest State ***\n"); + printk("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", +@@ -1797,7 +1854,7 @@ void vmcs_dump_vcpu(struct vcpu *v) + printk("CR3 = 0x%016lx\n", vmr(GUEST_CR3)); + if ( (v->arch.hvm_vmx.secondary_exec_control & + SECONDARY_EXEC_ENABLE_EPT) && +- (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA) ) ++ (cr4 & X86_CR4_PAE) && !(vmentry_ctl & VM_ENTRY_IA32E_MODE) ) + { + printk("PDPTE0 = 0x%016lx PDPTE1 = 0x%016lx\n", + vmr(GUEST_PDPTE(0)), vmr(GUEST_PDPTE(1))); +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index fc505c8cf7..508882e8d2 100644 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c +@@ -682,6 +682,12 @@ static void vmx_cpuid_policy_changed(struct vcpu *v) + vmx_clear_msr_intercept(v, MSR_PRED_CMD, VMX_MSR_RW); + else + vmx_set_msr_intercept(v, MSR_PRED_CMD, VMX_MSR_RW); ++ ++ /* MSR_FLUSH_CMD is safe to pass through if the guest knows about it. 
*/ ++ if ( cp->feat.l1d_flush ) ++ vmx_clear_msr_intercept(v, MSR_FLUSH_CMD, VMX_MSR_RW); ++ else ++ vmx_set_msr_intercept(v, MSR_FLUSH_CMD, VMX_MSR_RW); + } + + int vmx_guest_x86_mode(struct vcpu *v) +@@ -1035,7 +1041,8 @@ static void vmx_ctxt_switch_from(struct vcpu *v) + vmx_vmcs_reload(v); + } + +- vmx_fpu_leave(v); ++ if ( !v->arch.fully_eager_fpu ) ++ vmx_fpu_leave(v); + vmx_save_guest_msrs(v); + vmx_restore_host_msrs(); + vmx_save_dr(v); +@@ -1597,7 +1604,10 @@ static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr) + if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) ) + { + if ( v != current ) +- hw_cr0_mask |= X86_CR0_TS; ++ { ++ if ( !v->arch.fully_eager_fpu ) ++ hw_cr0_mask |= X86_CR0_TS; ++ } + else if ( v->arch.hvm_vcpu.hw_cr[0] & X86_CR0_TS ) + vmx_fpu_enter(v); + } +@@ -2835,8 +2845,10 @@ enum + + #define LBR_FROM_SIGNEXT_2MSB ((1ULL << 59) | (1ULL << 60)) + +-#define FIXUP_LBR_TSX (1u << 0) +-#define FIXUP_BDW_ERRATUM_BDF14 (1u << 1) ++#define LBR_MSRS_INSERTED (1u << 0) ++#define LBR_FIXUP_TSX (1u << 1) ++#define LBR_FIXUP_BDF14 (1u << 2) ++#define LBR_FIXUP_MASK (LBR_FIXUP_TSX | LBR_FIXUP_BDF14) + + static bool __read_mostly lbr_tsx_fixup_needed; + static bool __read_mostly bdw_erratum_bdf14_fixup_needed; +@@ -2899,7 +2911,7 @@ static int is_last_branch_msr(u32 ecx) + + static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content) + { +- const struct vcpu *curr = current; ++ struct vcpu *curr = current; + + HVM_DBG_LOG(DBG_LEVEL_MSR, "ecx=%#x", msr); + +@@ -2958,7 +2970,7 @@ static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content) + goto done; + } + +- if ( vmx_read_guest_msr(msr, msr_content) == 0 ) ++ if ( vmx_read_guest_msr(curr, msr, msr_content) == 0 ) + break; + + if ( is_last_branch_msr(msr) ) +@@ -3093,11 +3105,14 @@ void vmx_vlapic_msr_changed(struct vcpu *v) + static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content) + { + struct vcpu *v = current; ++ const struct cpuid_policy *cp 
= v->domain->arch.cpuid; + + HVM_DBG_LOG(DBG_LEVEL_MSR, "ecx=%#x, msr_value=%#"PRIx64, msr, msr_content); + + switch ( msr ) + { ++ uint64_t rsvd; ++ + case MSR_IA32_SYSENTER_CS: + __vmwrite(GUEST_SYSENTER_CS, msr_content); + break; +@@ -3111,45 +3126,85 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content) + goto gp_fault; + __vmwrite(GUEST_SYSENTER_EIP, msr_content); + break; +- case MSR_IA32_DEBUGCTLMSR: { +- int i, rc = 0; +- uint64_t supported = IA32_DEBUGCTLMSR_LBR | IA32_DEBUGCTLMSR_BTF; ++ case MSR_IA32_DEBUGCTLMSR: ++ rsvd = ~(IA32_DEBUGCTLMSR_LBR | IA32_DEBUGCTLMSR_BTF); + +- if ( boot_cpu_has(X86_FEATURE_RTM) ) +- supported |= IA32_DEBUGCTLMSR_RTM; +- if ( msr_content & ~supported ) ++ /* TODO: Wire vPMU settings properly through the CPUID policy */ ++ if ( vpmu_is_set(vcpu_vpmu(v), VPMU_CPU_HAS_BTS) ) + { +- /* Perhaps some other bits are supported in vpmu. */ +- if ( vpmu_do_wrmsr(msr, msr_content, supported) ) +- break; ++ rsvd &= ~(IA32_DEBUGCTLMSR_TR | IA32_DEBUGCTLMSR_BTS | ++ IA32_DEBUGCTLMSR_BTINT); ++ ++ if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) ) ++ rsvd &= ~(IA32_DEBUGCTLMSR_BTS_OFF_OS | ++ IA32_DEBUGCTLMSR_BTS_OFF_USR); + } +- if ( msr_content & IA32_DEBUGCTLMSR_LBR ) ++ ++ if ( cp->feat.rtm ) ++ rsvd &= ~IA32_DEBUGCTLMSR_RTM; ++ ++ if ( msr_content & rsvd ) ++ goto gp_fault; ++ ++ /* ++ * When a guest first enables LBR, arrange to save and restore the LBR ++ * MSRs and allow the guest direct access. ++ * ++ * MSR_DEBUGCTL and LBR has existed almost as long as MSRs have ++ * existed, and there is no architectural way to hide the feature, or ++ * fail the attempt to enable LBR. ++ * ++ * Unknown host LBR MSRs or hitting -ENOSPC with the guest load/save ++ * list are definitely hypervisor bugs, whereas -ENOMEM for allocating ++ * the load/save list is simply unlucky (and shouldn't occur with ++ * sensible management by the toolstack). 
++ * ++ * Either way, there is nothing we can do right now to recover, and ++ * the guest won't execute correctly either. Simply crash the domain ++ * to make the failure obvious. ++ */ ++ if ( !(v->arch.hvm_vmx.lbr_flags & LBR_MSRS_INSERTED) && ++ (msr_content & IA32_DEBUGCTLMSR_LBR) ) + { + const struct lbr_info *lbr = last_branch_msr_get(); +- if ( lbr == NULL ) +- break; + +- for ( ; (rc == 0) && lbr->count; lbr++ ) +- for ( i = 0; (rc == 0) && (i < lbr->count); i++ ) +- if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 ) ++ if ( unlikely(!lbr) ) ++ { ++ gprintk(XENLOG_ERR, "Unknown Host LBR MSRs\n"); ++ domain_crash(v->domain); ++ return X86EMUL_OKAY; ++ } ++ ++ for ( ; lbr->count; lbr++ ) ++ { ++ unsigned int i; ++ ++ for ( i = 0; i < lbr->count; i++ ) ++ { ++ int rc = vmx_add_guest_msr(v, lbr->base + i, 0); ++ ++ if ( unlikely(rc) ) + { +- vmx_clear_msr_intercept(v, lbr->base + i, VMX_MSR_RW); +- if ( lbr_tsx_fixup_needed ) +- v->arch.hvm_vmx.lbr_fixup_enabled |= FIXUP_LBR_TSX; +- if ( bdw_erratum_bdf14_fixup_needed ) +- v->arch.hvm_vmx.lbr_fixup_enabled |= +- FIXUP_BDW_ERRATUM_BDF14; ++ gprintk(XENLOG_ERR, ++ "Guest load/save list error %d\n", rc); ++ domain_crash(v->domain); ++ return X86EMUL_OKAY; + } +- } + +- if ( (rc < 0) || +- (msr_content && (vmx_add_host_load_msr(msr) < 0)) ) +- hvm_inject_hw_exception(TRAP_machine_check, X86_EVENT_NO_EC); +- else +- __vmwrite(GUEST_IA32_DEBUGCTL, msr_content); ++ vmx_clear_msr_intercept(v, lbr->base + i, VMX_MSR_RW); ++ } ++ } ++ ++ v->arch.hvm_vmx.lbr_flags |= LBR_MSRS_INSERTED; ++ if ( lbr_tsx_fixup_needed ) ++ v->arch.hvm_vmx.lbr_flags |= LBR_FIXUP_TSX; ++ if ( bdw_erratum_bdf14_fixup_needed ) ++ v->arch.hvm_vmx.lbr_flags |= LBR_FIXUP_BDF14; ++ } + ++ __vmwrite(GUEST_IA32_DEBUGCTL, msr_content); + break; +- } ++ + case MSR_IA32_FEATURE_CONTROL: + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + /* None of these MSRs are writeable. 
*/ +@@ -3175,7 +3230,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content) + switch ( long_mode_do_msr_write(msr, msr_content) ) + { + case HNDL_unhandled: +- if ( (vmx_write_guest_msr(msr, msr_content) != 0) && ++ if ( (vmx_write_guest_msr(v, msr, msr_content) != 0) && + !is_last_branch_msr(msr) ) + switch ( wrmsr_hypervisor_regs(msr, msr_content) ) + { +@@ -3717,6 +3772,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) + */ + __vmread(EXIT_QUALIFICATION, &exit_qualification); + HVMTRACE_1D(TRAP_DEBUG, exit_qualification); ++ __restore_debug_registers(v); + write_debugreg(6, exit_qualification | DR_STATUS_RESERVED_ONE); + if ( !v->domain->debugger_attached ) + { +@@ -4186,11 +4242,11 @@ out: + static void lbr_tsx_fixup(void) + { + struct vcpu *curr = current; +- unsigned int msr_count = curr->arch.hvm_vmx.msr_count; ++ unsigned int msr_count = curr->arch.hvm_vmx.msr_save_count; + struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area; + struct vmx_msr_entry *msr; + +- if ( (msr = vmx_find_msr(lbr_from_start, VMX_GUEST_MSR)) != NULL ) ++ if ( (msr = vmx_find_msr(curr, lbr_from_start, VMX_MSR_GUEST)) != NULL ) + { + /* + * Sign extend into bits 61:62 while preserving bit 63 +@@ -4200,15 +4256,15 @@ static void lbr_tsx_fixup(void) + msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2); + } + +- if ( (msr = vmx_find_msr(lbr_lastint_from, VMX_GUEST_MSR)) != NULL ) ++ if ( (msr = vmx_find_msr(curr, lbr_lastint_from, VMX_MSR_GUEST)) != NULL ) + msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2); + } + +-static void sign_extend_msr(u32 msr, int type) ++static void sign_extend_msr(struct vcpu *v, u32 msr, int type) + { + struct vmx_msr_entry *entry; + +- if ( (entry = vmx_find_msr(msr, type)) != NULL ) ++ if ( (entry = vmx_find_msr(v, msr, type)) != NULL ) + { + if ( entry->data & VADDR_TOP_BIT ) + entry->data |= CANONICAL_MASK; +@@ -4219,6 +4275,8 @@ static void sign_extend_msr(u32 msr, int type) + + static void 
bdw_erratum_bdf14_fixup(void) + { ++ struct vcpu *curr = current; ++ + /* + * Occasionally, on certain Broadwell CPUs MSR_IA32_LASTINTTOIP has + * been observed to have the top three bits corrupted as though the +@@ -4228,17 +4286,17 @@ static void bdw_erratum_bdf14_fixup(void) + * erratum BDF14. Fix up MSR_IA32_LASTINT{FROM,TO}IP by + * sign-extending into bits 48:63. + */ +- sign_extend_msr(MSR_IA32_LASTINTFROMIP, VMX_GUEST_MSR); +- sign_extend_msr(MSR_IA32_LASTINTTOIP, VMX_GUEST_MSR); ++ sign_extend_msr(curr, MSR_IA32_LASTINTFROMIP, VMX_MSR_GUEST); ++ sign_extend_msr(curr, MSR_IA32_LASTINTTOIP, VMX_MSR_GUEST); + } + + static void lbr_fixup(void) + { + struct vcpu *curr = current; + +- if ( curr->arch.hvm_vmx.lbr_fixup_enabled & FIXUP_LBR_TSX ) ++ if ( curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_TSX ) + lbr_tsx_fixup(); +- if ( curr->arch.hvm_vmx.lbr_fixup_enabled & FIXUP_BDW_ERRATUM_BDF14 ) ++ if ( curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_BDF14 ) + bdw_erratum_bdf14_fixup(); + } + +@@ -4306,7 +4364,7 @@ bool vmx_vmenter_helper(const struct cpu_user_regs *regs) + } + + out: +- if ( unlikely(curr->arch.hvm_vmx.lbr_fixup_enabled) ) ++ if ( unlikely(curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_MASK) ) + lbr_fixup(); + + HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0); +diff --git a/xen/arch/x86/hvm/vpt.c b/xen/arch/x86/hvm/vpt.c +index 181f4cb631..04e3c2e15b 100644 +--- a/xen/arch/x86/hvm/vpt.c ++++ b/xen/arch/x86/hvm/vpt.c +@@ -107,31 +107,49 @@ static int pt_irq_vector(struct periodic_time *pt, enum hvm_intsrc src) + static int pt_irq_masked(struct periodic_time *pt) + { + struct vcpu *v = pt->vcpu; +- unsigned int gsi, isa_irq; +- int mask; +- uint8_t pic_imr; ++ unsigned int gsi = pt->irq; + +- if ( pt->source == PTSRC_lapic ) ++ switch ( pt->source ) ++ { ++ case PTSRC_lapic: + { + struct vlapic *vlapic = vcpu_vlapic(v); ++ + return (!vlapic_enabled(vlapic) || + (vlapic_get_reg(vlapic, APIC_LVTT) & APIC_LVT_MASKED)); + } + +- isa_irq = pt->irq; +- gsi = 
hvm_isa_irq_to_gsi(isa_irq); +- pic_imr = v->domain->arch.hvm_domain.vpic[isa_irq >> 3].imr; +- mask = vioapic_get_mask(v->domain, gsi); +- if ( mask < 0 ) ++ case PTSRC_isa: + { +- dprintk(XENLOG_WARNING, "d%u: invalid GSI (%u) for platform timer\n", +- v->domain->domain_id, gsi); +- domain_crash(v->domain); +- return -1; ++ uint8_t pic_imr = v->domain->arch.hvm_domain.vpic[pt->irq >> 3].imr; ++ ++ /* Check if the interrupt is unmasked in the PIC. */ ++ if ( !(pic_imr & (1 << (pt->irq & 7))) && vlapic_accept_pic_intr(v) ) ++ return 0; ++ ++ gsi = hvm_isa_irq_to_gsi(pt->irq); ++ } ++ ++ /* Fallthrough to check if the interrupt is masked on the IO APIC. */ ++ case PTSRC_ioapic: ++ { ++ int mask = vioapic_get_mask(v->domain, gsi); ++ ++ if ( mask < 0 ) ++ { ++ dprintk(XENLOG_WARNING, ++ "d%d: invalid GSI (%u) for platform timer\n", ++ v->domain->domain_id, gsi); ++ domain_crash(v->domain); ++ return -1; ++ } ++ ++ return mask; ++ } + } + +- return (((pic_imr & (1 << (isa_irq & 7))) || !vlapic_accept_pic_intr(v)) && +- mask); ++ ASSERT_UNREACHABLE(); ++ return 1; + } + + static void pt_lock(struct periodic_time *pt) +@@ -252,7 +270,7 @@ int pt_update_irq(struct vcpu *v) + struct list_head *head = &v->arch.hvm_vcpu.tm_list; + struct periodic_time *pt, *temp, *earliest_pt; + uint64_t max_lag; +- int irq, is_lapic, pt_vector; ++ int irq, pt_vector = -1; + + spin_lock(&v->arch.hvm_vcpu.tm_lock); + +@@ -288,29 +306,26 @@ int pt_update_irq(struct vcpu *v) + + earliest_pt->irq_issued = 1; + irq = earliest_pt->irq; +- is_lapic = (earliest_pt->source == PTSRC_lapic); + + spin_unlock(&v->arch.hvm_vcpu.tm_lock); + +- /* +- * If periodic timer interrut is handled by lapic, its vector in +- * IRR is returned and used to set eoi_exit_bitmap for virtual +- * interrupt delivery case. Otherwise return -1 to do nothing. 
+- */ +- if ( is_lapic ) ++ switch ( earliest_pt->source ) + { ++ case PTSRC_lapic: ++ /* ++ * If periodic timer interrupt is handled by lapic, its vector in ++ * IRR is returned and used to set eoi_exit_bitmap for virtual ++ * interrupt delivery case. Otherwise return -1 to do nothing. ++ */ + vlapic_set_irq(vcpu_vlapic(v), irq, 0); + pt_vector = irq; +- } +- else +- { ++ break; ++ ++ case PTSRC_isa: + hvm_isa_irq_deassert(v->domain, irq); + if ( platform_legacy_irq(irq) && vlapic_accept_pic_intr(v) && + v->domain->arch.hvm_domain.vpic[irq >> 3].int_output ) +- { + hvm_isa_irq_assert(v->domain, irq, NULL); +- pt_vector = -1; +- } + else + { + pt_vector = hvm_isa_irq_assert(v->domain, irq, vioapic_get_vector); +@@ -321,6 +336,17 @@ int pt_update_irq(struct vcpu *v) + if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) ) + pt_vector = -1; + } ++ break; ++ ++ case PTSRC_ioapic: ++ /* ++ * NB: At the moment IO-APIC routed interrupts generated by vpt devices ++ * (HPET) are edge-triggered. 
++ */ ++ pt_vector = hvm_ioapic_assert(v->domain, irq, false); ++ if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) ) ++ pt_vector = -1; ++ break; + } + + return pt_vector; +@@ -418,7 +444,14 @@ void create_periodic_time( + struct vcpu *v, struct periodic_time *pt, uint64_t delta, + uint64_t period, uint8_t irq, time_cb *cb, void *data) + { +- ASSERT(pt->source != 0); ++ if ( !pt->source || ++ (pt->irq >= NR_ISAIRQS && pt->source == PTSRC_isa) || ++ (pt->irq >= hvm_domain_irq(v->domain)->nr_gsis && ++ pt->source == PTSRC_ioapic) ) ++ { ++ ASSERT_UNREACHABLE(); ++ return; ++ } + + destroy_periodic_time(pt); + +@@ -498,7 +531,7 @@ static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v) + { + int on_list; + +- ASSERT(pt->source == PTSRC_isa); ++ ASSERT(pt->source == PTSRC_isa || pt->source == PTSRC_ioapic); + + if ( pt->vcpu == NULL ) + return; +diff --git a/xen/arch/x86/i387.c b/xen/arch/x86/i387.c +index 88452522ad..a1d128dd94 100644 +--- a/xen/arch/x86/i387.c ++++ b/xen/arch/x86/i387.c +@@ -15,6 +15,7 @@ + #include <asm/i387.h> + #include <asm/xstate.h> + #include <asm/asm_defns.h> ++#include <asm/spec_ctrl.h> + + /*******************************/ + /* FPU Restore Functions */ +@@ -205,13 +206,13 @@ static inline void fpu_fxsave(struct vcpu *v) + /* VCPU FPU Functions */ + /*******************************/ + /* Restore FPU state whenever VCPU is schduled in. */ +-void vcpu_restore_fpu_eager(struct vcpu *v) ++void vcpu_restore_fpu_nonlazy(struct vcpu *v, bool need_stts) + { +- ASSERT(!is_idle_vcpu(v)); +- + /* Restore nonlazy extended state (i.e. parts not tracked by CR0.TS). 
*/ +- if ( !v->arch.nonlazy_xstate_used ) +- return; ++ if ( !v->arch.fully_eager_fpu && !v->arch.nonlazy_xstate_used ) ++ goto maybe_stts; ++ ++ ASSERT(!is_idle_vcpu(v)); + + /* Avoid recursion */ + clts(); +@@ -221,17 +222,28 @@ void vcpu_restore_fpu_eager(struct vcpu *v) + * above) we also need to restore full state, to prevent subsequently + * saving state belonging to another vCPU. + */ +- if ( xstate_all(v) ) ++ if ( v->arch.fully_eager_fpu || (v->arch.xsave_area && xstate_all(v)) ) + { +- fpu_xrstor(v, XSTATE_ALL); ++ if ( cpu_has_xsave ) ++ fpu_xrstor(v, XSTATE_ALL); ++ else ++ fpu_fxrstor(v); ++ + v->fpu_initialised = 1; + v->fpu_dirtied = 1; ++ ++ /* Xen doesn't need TS set, but the guest might. */ ++ need_stts = is_pv_vcpu(v) && (v->arch.pv_vcpu.ctrlreg[0] & X86_CR0_TS); + } + else + { + fpu_xrstor(v, XSTATE_NONLAZY); +- stts(); ++ need_stts = true; + } ++ ++ maybe_stts: ++ if ( need_stts ) ++ stts(); + } + + /* +@@ -247,6 +259,8 @@ void vcpu_restore_fpu_lazy(struct vcpu *v) + if ( v->fpu_dirtied ) + return; + ++ ASSERT(!v->arch.fully_eager_fpu); ++ + if ( cpu_has_xsave ) + fpu_xrstor(v, XSTATE_LAZY); + else +@@ -297,6 +311,8 @@ int vcpu_init_fpu(struct vcpu *v) + { + int rc; + ++ v->arch.fully_eager_fpu = opt_eager_fpu; ++ + if ( (rc = xstate_alloc_save_area(v)) != 0 ) + return rc; + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index bb924e9225..f3dfe35785 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -125,6 +125,7 @@ + #include <asm/guest.h> + + #include <asm/hvm/grant_table.h> ++#include <asm/pv/domain.h> + #include <asm/pv/grant_table.h> + + #include "pv/mm.h" +@@ -503,12 +504,60 @@ void free_shared_domheap_page(struct page_info *page) + + void make_cr3(struct vcpu *v, mfn_t mfn) + { ++ struct domain *d = v->domain; ++ + v->arch.cr3 = mfn_x(mfn) << PAGE_SHIFT; ++ if ( is_pv_domain(d) && d->arch.pv_domain.pcid ) ++ v->arch.cr3 |= get_pcid_bits(v, false); ++} ++ ++unsigned long pv_guest_cr4_to_real_cr4(const struct vcpu *v) ++{ 
++ const struct domain *d = v->domain; ++ unsigned long cr4; ++ ++ cr4 = v->arch.pv_vcpu.ctrlreg[4] & ~X86_CR4_DE; ++ cr4 |= mmu_cr4_features & (X86_CR4_PSE | X86_CR4_SMEP | X86_CR4_SMAP | ++ X86_CR4_OSXSAVE | X86_CR4_FSGSBASE); ++ ++ if ( d->arch.pv_domain.pcid ) ++ cr4 |= X86_CR4_PCIDE; ++ else if ( !d->arch.pv_domain.xpti ) ++ cr4 |= X86_CR4_PGE; ++ ++ cr4 |= d->arch.vtsc ? X86_CR4_TSD : 0; ++ ++ return cr4; + } + + void write_ptbase(struct vcpu *v) + { +- write_cr3(v->arch.cr3); ++ struct cpu_info *cpu_info = get_cpu_info(); ++ unsigned long new_cr4; ++ ++ new_cr4 = (is_pv_vcpu(v) && !is_idle_vcpu(v)) ++ ? pv_guest_cr4_to_real_cr4(v) ++ : ((read_cr4() & ~(X86_CR4_PCIDE | X86_CR4_TSD)) | X86_CR4_PGE); ++ ++ if ( is_pv_vcpu(v) && v->domain->arch.pv_domain.xpti ) ++ { ++ cpu_info->root_pgt_changed = true; ++ cpu_info->pv_cr3 = __pa(this_cpu(root_pgt)); ++ if ( new_cr4 & X86_CR4_PCIDE ) ++ cpu_info->pv_cr3 |= get_pcid_bits(v, true); ++ switch_cr3_cr4(v->arch.cr3, new_cr4); ++ } ++ else ++ { ++ /* Make sure to clear use_pv_cr3 and xen_cr3 before pv_cr3. */ ++ cpu_info->use_pv_cr3 = false; ++ cpu_info->xen_cr3 = 0; ++ /* switch_cr3_cr4() serializes. */ ++ switch_cr3_cr4(v->arch.cr3, new_cr4); ++ cpu_info->pv_cr3 = 0; ++ } ++ ++ ASSERT(is_pv_vcpu(v) || read_cr4() == mmu_cr4_features); + } + + /* +@@ -605,6 +654,9 @@ static int alloc_segdesc_page(struct page_info *page) + return i == 512 ? 0 : -EINVAL; + } + ++static int __get_page_type(struct page_info *page, unsigned long type, ++ int preemptible); ++ + static int get_page_and_type_from_mfn( + mfn_t mfn, unsigned long type, struct domain *d, + int partial, int preemptible) +@@ -616,9 +668,7 @@ static int get_page_and_type_from_mfn( + unlikely(!get_page_from_mfn(mfn, d)) ) + return -EINVAL; + +- rc = (preemptible ? +- get_page_type_preemptible(page, type) : +- (get_page_type(page, type) ? 
0 : -EINVAL)); ++ rc = __get_page_type(page, type, preemptible); + + if ( unlikely(rc) && partial >= 0 && + (!preemptible || page != current->arch.old_guest_table) ) +@@ -1108,7 +1158,7 @@ get_page_from_l2e( + int rc; + + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) +- return 1; ++ return pv_l1tf_check_l2e(d, l2e) ? -ERESTART : 1; + + if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) ) + { +@@ -1143,7 +1193,7 @@ get_page_from_l3e( + int rc; + + if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) +- return 1; ++ return pv_l1tf_check_l3e(d, l3e) ? -ERESTART : 1; + + if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) ) + { +@@ -1176,7 +1226,7 @@ get_page_from_l4e( + int rc; + + if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) +- return 1; ++ return pv_l1tf_check_l4e(d, l4e) ? -ERESTART : 1; + + if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) ) + { +@@ -1371,6 +1421,13 @@ static int alloc_l1_table(struct page_info *page) + + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) + { ++ if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) ) ++ { ++ ret = pv_l1tf_check_l1e(d, pl1e[i]) ? 
-ERESTART : 0; ++ if ( ret ) ++ goto out; ++ } ++ + switch ( ret = get_page_from_l1e(pl1e[i], d, d) ) + { + default: +@@ -1391,6 +1448,7 @@ static int alloc_l1_table(struct page_info *page) + + fail: + gdprintk(XENLOG_WARNING, "Failure in alloc_l1_table: slot %#x\n", i); ++ out: + while ( i-- > 0 ) + put_page_from_l1e(pl1e[i], d); + +@@ -1438,8 +1496,7 @@ static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e) + return 1; + } + +-static int alloc_l2_table(struct page_info *page, unsigned long type, +- int preemptible) ++static int alloc_l2_table(struct page_info *page, unsigned long type) + { + struct domain *d = page_get_owner(page); + unsigned long pfn = mfn_x(page_to_mfn(page)); +@@ -1451,8 +1508,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type, + + for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ ) + { +- if ( preemptible && i > page->nr_validated_ptes +- && hypercall_preempt_check() ) ++ if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) + { + page->nr_validated_ptes = i; + rc = -ERESTART; +@@ -1463,6 +1519,12 @@ static int alloc_l2_table(struct page_info *page, unsigned long type, + (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 ) + continue; + ++ if ( unlikely(rc == -ERESTART) ) ++ { ++ page->nr_validated_ptes = i; ++ break; ++ } ++ + if ( rc < 0 ) + { + gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i); +@@ -1745,7 +1807,7 @@ static void free_l1_table(struct page_info *page) + } + + +-static int free_l2_table(struct page_info *page, int preemptible) ++static int free_l2_table(struct page_info *page) + { + struct domain *d = page_get_owner(page); + unsigned long pfn = mfn_x(page_to_mfn(page)); +@@ -1759,7 +1821,7 @@ static int free_l2_table(struct page_info *page, int preemptible) + do { + if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) && + put_page_from_l2e(pl2e[i], pfn) == 0 && +- preemptible && i && hypercall_preempt_check() ) ++ i && hypercall_preempt_check() ) 
+ { + page->nr_validated_ptes = i; + err = -ERESTART; +@@ -1983,6 +2045,8 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e, + rc = -EBUSY; + } + } ++ else if ( pv_l1tf_check_l1e(pt_dom, nl1e) ) ++ return -ERESTART; + else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu, + preserve_ad)) ) + { +@@ -2046,6 +2110,8 @@ static int mod_l2_entry(l2_pgentry_t *pl2e, + rc = -EBUSY; + } + } ++ else if ( pv_l1tf_check_l2e(d, nl2e) ) ++ return -ERESTART; + else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, + preserve_ad)) ) + { +@@ -2107,6 +2173,8 @@ static int mod_l3_entry(l3_pgentry_t *pl3e, + rc = -EFAULT; + } + } ++ else if ( pv_l1tf_check_l3e(d, nl3e) ) ++ return -ERESTART; + else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, + preserve_ad)) ) + { +@@ -2172,6 +2240,8 @@ static int mod_l4_entry(l4_pgentry_t *pl4e, + rc = -EFAULT; + } + } ++ else if ( pv_l1tf_check_l4e(d, nl4e) ) ++ return -ERESTART; + else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, + preserve_ad)) ) + { +@@ -2301,7 +2371,8 @@ static int alloc_page_type(struct page_info *page, unsigned long type, + rc = alloc_l1_table(page); + break; + case PGT_l2_page_table: +- rc = alloc_l2_table(page, type, preemptible); ++ ASSERT(preemptible); ++ rc = alloc_l2_table(page, type); + break; + case PGT_l3_page_table: + ASSERT(preemptible); +@@ -2393,7 +2464,8 @@ int free_page_type(struct page_info *page, unsigned long type, + rc = 0; + break; + case PGT_l2_page_table: +- rc = free_l2_table(page, preemptible); ++ ASSERT(preemptible); ++ rc = free_l2_table(page); + break; + case PGT_l3_page_table: + ASSERT(preemptible); +@@ -2477,7 +2549,7 @@ static int _put_page_type(struct page_info *page, bool preemptible, + nx = x & ~(PGT_validated|PGT_partial); + if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, + x, nx)) != x) ) +- continue; ++ goto maybe_preempt; + /* We cleared the 'valid bit' so we do the clean up. 
*/ + rc = _put_final_page_type(page, x, preemptible, ptpg); + ptpg = NULL; +@@ -2509,12 +2581,13 @@ static int _put_page_type(struct page_info *page, bool preemptible, + */ + cpu_relax(); + y = page->u.inuse.type_info; +- continue; ++ goto maybe_preempt; + } + + if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) ) + break; + ++ maybe_preempt: + if ( preemptible && hypercall_preempt_check() ) + return -EINTR; + } +@@ -2627,12 +2700,11 @@ static int __get_page_type(struct page_info *page, unsigned long type, + if ( !(x & PGT_partial) ) + { + /* Someone else is updating validation of this page. Wait... */ +- while ( (y = page->u.inuse.type_info) == x ) +- { ++ do { + if ( preemptible && hypercall_preempt_check() ) + return -EINTR; + cpu_relax(); +- } ++ } while ( (y = page->u.inuse.type_info) == x ); + continue; + } + /* Type ref count was left at 1 when PGT_partial got set. */ +@@ -3469,12 +3541,9 @@ long do_mmuext_op( + } + + if ( rc == -ERESTART ) +- { +- ASSERT(i < count); + rc = hypercall_create_continuation( + __HYPERVISOR_mmuext_op, "hihi", + uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); +- } + else if ( curr->arch.old_guest_table ) + { + XEN_GUEST_HANDLE_PARAM(void) null; +@@ -3674,18 +3743,27 @@ long do_mmu_update( + case PGT_l4_page_table: + rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn, + cmd == MMU_PT_UPDATE_PRESERVE_AD, v); +- /* +- * No need to sync if all uses of the page can be accounted +- * to the page lock we hold, its pinned status, and uses on +- * this (v)CPU. 
+- */ +- if ( !rc && !cpu_has_no_xpti && +- ((page->u.inuse.type_info & PGT_count_mask) > +- (1 + !!(page->u.inuse.type_info & PGT_pinned) + +- (pagetable_get_pfn(curr->arch.guest_table) == mfn) + +- (pagetable_get_pfn(curr->arch.guest_table_user) == +- mfn))) ) +- sync_guest = true; ++ if ( !rc && !cpu_has_no_xpti ) ++ { ++ bool local_in_use = false; ++ ++ if ( pagetable_get_pfn(curr->arch.guest_table) == mfn ) ++ { ++ local_in_use = true; ++ get_cpu_info()->root_pgt_changed = true; ++ } ++ ++ /* ++ * No need to sync if all uses of the page can be ++ * accounted to the page lock we hold, its pinned ++ * status, and uses on this (v)CPU. ++ */ ++ if ( (page->u.inuse.type_info & PGT_count_mask) > ++ (1 + !!(page->u.inuse.type_info & PGT_pinned) + ++ (pagetable_get_pfn(curr->arch.guest_table_user) == ++ mfn) + local_in_use) ) ++ sync_guest = true; ++ } + break; + case PGT_writable_page: + perfc_incr(writable_mmu_updates); +@@ -3761,12 +3839,9 @@ long do_mmu_update( + } + + if ( rc == -ERESTART ) +- { +- ASSERT(i < count); + rc = hypercall_create_continuation( + __HYPERVISOR_mmu_update, "hihi", + ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); +- } + else if ( curr->arch.old_guest_table ) + { + XEN_GUEST_HANDLE_PARAM(void) null; +@@ -3799,7 +3874,7 @@ long do_mmu_update( + + cpumask_andnot(mask, pt_owner->domain_dirty_cpumask, cpumask_of(cpu)); + if ( !cpumask_empty(mask) ) +- flush_mask(mask, FLUSH_TLB_GLOBAL); ++ flush_mask(mask, FLUSH_TLB_GLOBAL | FLUSH_ROOT_PGTBL); + } + + perfc_add(num_page_updates, i); +@@ -4020,7 +4095,13 @@ static int __do_update_va_mapping( + long do_update_va_mapping(unsigned long va, u64 val64, + unsigned long flags) + { +- return __do_update_va_mapping(va, val64, flags, current->domain); ++ int rc = __do_update_va_mapping(va, val64, flags, current->domain); ++ ++ if ( rc == -ERESTART ) ++ rc = hypercall_create_continuation( ++ __HYPERVISOR_update_va_mapping, "lll", va, val64, flags); ++ ++ return rc; + } + + long 
do_update_va_mapping_otherdomain(unsigned long va, u64 val64, +@@ -4037,6 +4118,46 @@ long do_update_va_mapping_otherdomain(unsigned long va, u64 val64, + + put_pg_owner(pg_owner); + ++ if ( rc == -ERESTART ) ++ rc = hypercall_create_continuation( ++ __HYPERVISOR_update_va_mapping_otherdomain, ++ "llli", va, val64, flags, domid); ++ ++ return rc; ++} ++ ++int compat_update_va_mapping(unsigned int va, uint32_t lo, uint32_t hi, ++ unsigned int flags) ++{ ++ int rc = __do_update_va_mapping(va, ((uint64_t)hi << 32) | lo, ++ flags, current->domain); ++ ++ if ( rc == -ERESTART ) ++ rc = hypercall_create_continuation( ++ __HYPERVISOR_update_va_mapping, "iiii", va, lo, hi, flags); ++ ++ return rc; ++} ++ ++int compat_update_va_mapping_otherdomain(unsigned int va, ++ uint32_t lo, uint32_t hi, ++ unsigned int flags, domid_t domid) ++{ ++ struct domain *pg_owner; ++ int rc; ++ ++ if ( (pg_owner = get_pg_owner(domid)) == NULL ) ++ return -ESRCH; ++ ++ rc = __do_update_va_mapping(va, ((uint64_t)hi << 32) | lo, flags, pg_owner); ++ ++ put_pg_owner(pg_owner); ++ ++ if ( rc == -ERESTART ) ++ rc = hypercall_create_continuation( ++ __HYPERVISOR_update_va_mapping_otherdomain, ++ "iiiii", va, lo, hi, flags, domid); ++ + return rc; + } + +diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c +index 755a8f83ca..3954e74d43 100644 +--- a/xen/arch/x86/mm/paging.c ++++ b/xen/arch/x86/mm/paging.c +@@ -879,6 +879,8 @@ void paging_dump_domain_info(struct domain *d) + printk(" paging assistance: "); + if ( paging_mode_shadow(d) ) + printk("shadow "); ++ if ( paging_mode_sh_forced(d) ) ++ printk("forced "); + if ( paging_mode_hap(d) ) + printk("hap "); + if ( paging_mode_refcounts(d) ) +diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c +index 3a33e0b148..199dbbc929 100644 +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -3057,6 +3057,15 @@ static void sh_new_mode(struct domain *d, u32 new_mode) + 
ASSERT(paging_locked_by_me(d)); + ASSERT(d != current->domain); + ++ /* ++ * If PG_SH_forced has previously been activated because of writing an ++ * L1TF-vulnerable PTE, it must remain active for the remaining lifetime ++ * of the domain, even if the logdirty mode needs to be controlled for ++ * migration purposes. ++ */ ++ if ( paging_mode_sh_forced(d) ) ++ new_mode |= PG_SH_forced | PG_SH_enable; ++ + d->arch.paging.mode = new_mode; + for_each_vcpu(d, v) + sh_update_paging_modes(v); +@@ -3935,6 +3944,33 @@ void shadow_audit_tables(struct vcpu *v) + + #endif /* Shadow audit */ + ++#ifdef CONFIG_PV ++ ++void pv_l1tf_tasklet(unsigned long data) ++{ ++ struct domain *d = (void *)data; ++ ++ domain_pause(d); ++ paging_lock(d); ++ ++ if ( !paging_mode_sh_forced(d) && !d->is_dying ) ++ { ++ int ret = shadow_one_bit_enable(d, PG_SH_forced); ++ ++ if ( ret ) ++ { ++ printk(XENLOG_G_ERR "d%d Failed to enable PG_SH_forced: %d\n", ++ d->domain_id, ret); ++ domain_crash(d); ++ } ++ } ++ ++ paging_unlock(d); ++ domain_unpause(d); ++} ++ ++#endif /* CONFIG_PV */ ++ + /* + * Local variables: + * mode: C +diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c +index e93e3b36b1..8c03ba0158 100644 +--- a/xen/arch/x86/mm/shadow/multi.c ++++ b/xen/arch/x86/mm/shadow/multi.c +@@ -952,6 +952,8 @@ static int shadow_set_l4e(struct domain *d, + + /* Write the new entry */ + shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn); ++ flush_root_pgtbl_domain(d); ++ + flags |= SHADOW_SET_CHANGED; + + if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT ) +@@ -966,6 +968,7 @@ static int shadow_set_l4e(struct domain *d, + } + sh_put_ref(d, osl3mfn, paddr); + } ++ + return flags; + } + +diff --git a/xen/arch/x86/mpparse.c b/xen/arch/x86/mpparse.c +index 49140e46f0..f3f6d48668 100644 +--- a/xen/arch/x86/mpparse.c ++++ b/xen/arch/x86/mpparse.c +@@ -68,19 +68,26 @@ physid_mask_t phys_cpu_present_map; + + void __init set_nr_cpu_ids(unsigned int max_cpus) + { ++ unsigned int tot_cpus 
= num_processors + disabled_cpus; ++ + if (!max_cpus) +- max_cpus = num_processors + disabled_cpus; ++ max_cpus = tot_cpus; + if (max_cpus > NR_CPUS) + max_cpus = NR_CPUS; + else if (!max_cpus) + max_cpus = 1; + printk(XENLOG_INFO "SMP: Allowing %u CPUs (%d hotplug CPUs)\n", + max_cpus, max_t(int, max_cpus - num_processors, 0)); +- nr_cpu_ids = max_cpus; ++ ++ if (!park_offline_cpus) ++ tot_cpus = max_cpus; ++ nr_cpu_ids = min(tot_cpus, NR_CPUS + 0u); ++ if (park_offline_cpus && nr_cpu_ids < num_processors) ++ printk(XENLOG_WARNING "SMP: Cannot bring up %u further CPUs\n", ++ num_processors - nr_cpu_ids); + + #ifndef nr_cpumask_bits +- nr_cpumask_bits = (max_cpus + (BITS_PER_LONG - 1)) & +- ~(BITS_PER_LONG - 1); ++ nr_cpumask_bits = ROUNDUP(nr_cpu_ids, BITS_PER_LONG); + printk(XENLOG_DEBUG "NR_CPUS:%u nr_cpumask_bits:%u\n", + NR_CPUS, nr_cpumask_bits); + #endif +diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c +index 48d061d7da..8e39d5f271 100644 +--- a/xen/arch/x86/msr.c ++++ b/xen/arch/x86/msr.c +@@ -131,6 +131,7 @@ int guest_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val) + case MSR_AMD_PATCHLOADER: + case MSR_IA32_UCODE_WRITE: + case MSR_PRED_CMD: ++ case MSR_FLUSH_CMD: + /* Write-only */ + goto gp_fault; + +@@ -178,6 +179,8 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val) + + switch ( msr ) + { ++ uint64_t rsvd; ++ + case MSR_INTEL_PLATFORM_INFO: + case MSR_ARCH_CAPABILITIES: + /* Read-only */ +@@ -213,8 +216,10 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val) + * Note: SPEC_CTRL_STIBP is specified as safe to use (i.e. ignored) + * when STIBP isn't enumerated in hardware. + */ ++ rsvd = ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | ++ (cp->feat.ssbd ? SPEC_CTRL_SSBD : 0)); + +- if ( val & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP) ) ++ if ( val & rsvd ) + goto gp_fault; /* Rsvd bit set? 
*/ + + vp->spec_ctrl.raw = val; +@@ -231,14 +236,25 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val) + wrmsrl(MSR_PRED_CMD, val); + break; + ++ case MSR_FLUSH_CMD: ++ if ( !cp->feat.l1d_flush ) ++ goto gp_fault; /* MSR available? */ ++ ++ if ( val & ~FLUSH_CMD_L1D ) ++ goto gp_fault; /* Rsvd bit set? */ ++ ++ if ( v == curr ) ++ wrmsrl(MSR_FLUSH_CMD, val); ++ break; ++ + case MSR_INTEL_MISC_FEATURES_ENABLES: + { +- uint64_t rsvd = ~0ull; + bool old_cpuid_faulting = vp->misc_features_enables.cpuid_faulting; + + if ( !vp->misc_features_enables.available ) + goto gp_fault; + ++ rsvd = ~0ull; + if ( dp->plaform_info.cpuid_faulting ) + rsvd &= ~MSR_MISC_FEATURES_CPUID_FAULTING; + +diff --git a/xen/arch/x86/oprofile/nmi_int.c b/xen/arch/x86/oprofile/nmi_int.c +index d8f5230906..3dfb8fef93 100644 +--- a/xen/arch/x86/oprofile/nmi_int.c ++++ b/xen/arch/x86/oprofile/nmi_int.c +@@ -182,7 +182,7 @@ int nmi_reserve_counters(void) + if (!allocate_msrs()) + return -ENOMEM; + +- /* We walk a thin line between law and rape here. ++ /* + * We need to be careful to install our NMI handler + * without actually triggering any NMIs as this will + * break the core code horrifically. 
+diff --git a/xen/arch/x86/percpu.c b/xen/arch/x86/percpu.c +index c9997b7937..8be4ebddf4 100644 +--- a/xen/arch/x86/percpu.c ++++ b/xen/arch/x86/percpu.c +@@ -28,7 +28,7 @@ static int init_percpu_area(unsigned int cpu) + char *p; + + if ( __per_cpu_offset[cpu] != INVALID_PERCPU_AREA ) +- return -EBUSY; ++ return 0; + + if ( (p = alloc_xenheap_pages(PERCPU_ORDER, 0)) == NULL ) + return -ENOMEM; +@@ -76,9 +76,12 @@ static int cpu_percpu_callback( + break; + case CPU_UP_CANCELED: + case CPU_DEAD: +- free_percpu_area(cpu); ++ if ( !park_offline_cpus ) ++ free_percpu_area(cpu); + break; +- default: ++ case CPU_REMOVE: ++ if ( park_offline_cpus ) ++ free_percpu_area(cpu); + break; + } + +diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c +index 5d8909fa13..1a8142f89b 100644 +--- a/xen/arch/x86/pv/dom0_build.c ++++ b/xen/arch/x86/pv/dom0_build.c +@@ -388,6 +388,8 @@ int __init dom0_construct_pv(struct domain *d, + if ( compat32 ) + { + d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1; ++ d->arch.pv_domain.xpti = false; ++ d->arch.pv_domain.pcid = false; + v->vcpu_info = (void *)&d->shared_info->compat.vcpu_info[0]; + if ( setup_compat_arg_xlat(v) != 0 ) + BUG(); +@@ -721,7 +723,7 @@ int __init dom0_construct_pv(struct domain *d, + update_cr3(v); + + /* We run on dom0's page tables for the final part of the build process. */ +- write_ptbase(v); ++ switch_cr3_cr4(cr3_pa(v->arch.cr3), read_cr4()); + mapcache_override_current(v); + + /* Copy the OS image and free temporary buffer. */ +@@ -742,7 +744,7 @@ int __init dom0_construct_pv(struct domain *d, + (parms.virt_hypercall >= v_end) ) + { + mapcache_override_current(NULL); +- write_ptbase(current); ++ switch_cr3_cr4(current->arch.cr3, read_cr4()); + printk("Invalid HYPERCALL_PAGE field in ELF notes.\n"); + rc = -1; + goto out; +@@ -875,7 +877,7 @@ int __init dom0_construct_pv(struct domain *d, + + /* Return to idle domain's page tables. 
*/ + mapcache_override_current(NULL); +- write_ptbase(current); ++ switch_cr3_cr4(current->arch.cr3, read_cr4()); + + update_domain_wallclock_time(d); + +diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c +index 74e9e667d2..bdcbd38f15 100644 +--- a/xen/arch/x86/pv/domain.c ++++ b/xen/arch/x86/pv/domain.c +@@ -9,7 +9,54 @@ + #include <xen/lib.h> + #include <xen/sched.h> + ++#include <asm/cpufeature.h> ++#include <asm/invpcid.h> ++#include <asm/spec_ctrl.h> + #include <asm/pv/domain.h> ++#include <asm/shadow.h> ++ ++static __read_mostly enum { ++ PCID_OFF, ++ PCID_ALL, ++ PCID_XPTI, ++ PCID_NOXPTI ++} opt_pcid = PCID_XPTI; ++ ++static __init int parse_pcid(const char *s) ++{ ++ int rc = 0; ++ ++ switch ( parse_bool(s, NULL) ) ++ { ++ case 0: ++ opt_pcid = PCID_OFF; ++ break; ++ ++ case 1: ++ opt_pcid = PCID_ALL; ++ break; ++ ++ default: ++ switch ( parse_boolean("xpti", s, NULL) ) ++ { ++ case 0: ++ opt_pcid = PCID_NOXPTI; ++ break; ++ ++ case 1: ++ opt_pcid = PCID_XPTI; ++ break; ++ ++ default: ++ rc = -EINVAL; ++ break; ++ } ++ break; ++ } ++ ++ return rc; ++} ++custom_runtime_param("pcid", parse_pcid); + + /* Override macros from asm/page.h to make them work with mfn_t */ + #undef mfn_to_page +@@ -81,6 +128,9 @@ int switch_compat(struct domain *d) + + d->arch.x87_fip_width = 4; + ++ d->arch.pv_domain.xpti = false; ++ d->arch.pv_domain.pcid = false; ++ + return 0; + + undo_and_fail: +@@ -166,6 +216,8 @@ int pv_vcpu_initialise(struct vcpu *v) + + void pv_domain_destroy(struct domain *d) + { ++ pv_l1tf_domain_destroy(d); ++ + destroy_perdomain_mapping(d, GDT_LDT_VIRT_START, + GDT_LDT_MBYTES << (20 - PAGE_SHIFT)); + +@@ -187,6 +239,8 @@ int pv_domain_initialise(struct domain *d, unsigned int domcr_flags, + }; + int rc = -ENOMEM; + ++ pv_l1tf_domain_init(d); ++ + d->arch.pv_domain.gdt_ldt_l1tab = + alloc_xenheap_pages(0, MEMF_node(domain_to_node(d))); + if ( !d->arch.pv_domain.gdt_ldt_l1tab ) +@@ -212,6 +266,32 @@ int pv_domain_initialise(struct domain 
*d, unsigned int domcr_flags, + /* 64-bit PV guest by default. */ + d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0; + ++ d->arch.pv_domain.xpti = opt_xpti & (is_hardware_domain(d) ++ ? OPT_XPTI_DOM0 : OPT_XPTI_DOMU); ++ ++ if ( !is_pv_32bit_domain(d) && use_invpcid && cpu_has_pcid ) ++ switch ( opt_pcid ) ++ { ++ case PCID_OFF: ++ break; ++ ++ case PCID_ALL: ++ d->arch.pv_domain.pcid = true; ++ break; ++ ++ case PCID_XPTI: ++ d->arch.pv_domain.pcid = d->arch.pv_domain.xpti; ++ break; ++ ++ case PCID_NOXPTI: ++ d->arch.pv_domain.pcid = !d->arch.pv_domain.xpti; ++ break; ++ ++ default: ++ ASSERT_UNREACHABLE(); ++ break; ++ } ++ + return 0; + + fail: +@@ -239,13 +319,25 @@ void toggle_guest_mode(struct vcpu *v) + + void toggle_guest_pt(struct vcpu *v) + { ++ const struct domain *d = v->domain; ++ + if ( is_pv_32bit_vcpu(v) ) + return; + + v->arch.flags ^= TF_kernel_mode; + update_cr3(v); ++ if ( d->arch.pv_domain.xpti ) ++ { ++ struct cpu_info *cpu_info = get_cpu_info(); ++ ++ cpu_info->root_pgt_changed = true; ++ cpu_info->pv_cr3 = __pa(this_cpu(root_pgt)) | ++ (d->arch.pv_domain.pcid ++ ? get_pcid_bits(v, true) : 0); ++ } ++ + /* Don't flush user global mappings from the TLB. Don't tick TLB clock. 
*/ +- asm volatile ( "mov %0, %%cr3" : : "r" (v->arch.cr3) : "memory" ); ++ write_cr3(v->arch.cr3); + + if ( !(v->arch.flags & TF_kernel_mode) ) + return; +diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c +index 642ca312bf..c281936af0 100644 +--- a/xen/arch/x86/pv/emul-priv-op.c ++++ b/xen/arch/x86/pv/emul-priv-op.c +@@ -813,26 +813,6 @@ static int write_cr(unsigned int reg, unsigned long val, + return X86EMUL_UNHANDLEABLE; + } + +-static int read_dr(unsigned int reg, unsigned long *val, +- struct x86_emulate_ctxt *ctxt) +-{ +- unsigned long res = do_get_debugreg(reg); +- +- if ( IS_ERR_VALUE(res) ) +- return X86EMUL_UNHANDLEABLE; +- +- *val = res; +- +- return X86EMUL_OKAY; +-} +- +-static int write_dr(unsigned int reg, unsigned long val, +- struct x86_emulate_ctxt *ctxt) +-{ +- return do_set_debugreg(reg, val) == 0 +- ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE; +-} +- + static inline uint64_t guest_misc_enable(uint64_t val) + { + val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL | +@@ -906,9 +886,16 @@ static int read_msr(unsigned int reg, uint64_t *val, + return X86EMUL_OKAY; + + case MSR_EFER: +- *val = read_efer(); ++ /* Hide unknown bits, and unconditionally hide SVME from guests. */ ++ *val = read_efer() & EFER_KNOWN_MASK & ~EFER_SVME; ++ /* ++ * Hide the 64-bit features from 32-bit guests. SCE has ++ * vendor-dependent behaviour. ++ */ + if ( is_pv_32bit_domain(currd) ) +- *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE); ++ *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE | ++ (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ++ ? 
EFER_SCE : 0)); + return X86EMUL_OKAY; + + case MSR_K7_FID_VID_CTL: +@@ -1326,8 +1313,8 @@ static const struct x86_emulate_ops priv_op_ops = { + .read_segment = read_segment, + .read_cr = read_cr, + .write_cr = write_cr, +- .read_dr = read_dr, +- .write_dr = write_dr, ++ .read_dr = x86emul_read_dr, ++ .write_dr = x86emul_write_dr, + .read_msr = read_msr, + .write_msr = write_msr, + .cpuid = pv_emul_cpuid, +diff --git a/xen/arch/x86/pv/misc-hypercalls.c b/xen/arch/x86/pv/misc-hypercalls.c +index 5862130697..1619be7874 100644 +--- a/xen/arch/x86/pv/misc-hypercalls.c ++++ b/xen/arch/x86/pv/misc-hypercalls.c +@@ -30,22 +30,10 @@ long do_set_debugreg(int reg, unsigned long value) + + unsigned long do_get_debugreg(int reg) + { +- struct vcpu *curr = current; ++ unsigned long val; ++ int res = x86emul_read_dr(reg, &val, NULL); + +- switch ( reg ) +- { +- case 0 ... 3: +- case 6: +- return curr->arch.debugreg[reg]; +- case 7: +- return (curr->arch.debugreg[7] | +- curr->arch.debugreg[5]); +- case 4 ... 5: +- return ((curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ? +- curr->arch.debugreg[reg + 2] : 0); +- } +- +- return -EINVAL; ++ return res == X86EMUL_OKAY ? val : -ENODEV; + } + + long do_fpu_taskswitch(int set) +diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c +index 6b2976d3df..622bb7dff0 100644 +--- a/xen/arch/x86/pv/ro-page-fault.c ++++ b/xen/arch/x86/pv/ro-page-fault.c +@@ -29,6 +29,7 @@ + #include <asm/mm.h> + #include <asm/pci.h> + #include <asm/pv/mm.h> ++#include <asm/shadow.h> + + #include "emulate.h" + #include "mm.h" +@@ -127,6 +128,10 @@ static int ptwr_emulated_update(unsigned long addr, paddr_t old, paddr_t val, + + /* Check the new PTE. 
*/ + nl1e = l1e_from_intpte(val); ++ ++ if ( !(l1e_get_flags(nl1e) & _PAGE_PRESENT) && pv_l1tf_check_l1e(d, nl1e) ) ++ return X86EMUL_RETRY; ++ + switch ( ret = get_page_from_l1e(nl1e, d, d) ) + { + default: +diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c +index 482fe11669..1e9eecae04 100644 +--- a/xen/arch/x86/setup.c ++++ b/xen/arch/x86/setup.c +@@ -62,6 +62,14 @@ boolean_param("nosmp", opt_nosmp); + static unsigned int __initdata max_cpus; + integer_param("maxcpus", max_cpus); + ++int8_t __read_mostly opt_smt = -1; ++boolean_param("smt", opt_smt); ++ ++/* opt_invpcid: If false, don't use INVPCID instruction even if available. */ ++static bool __initdata opt_invpcid = true; ++boolean_param("invpcid", opt_invpcid); ++bool __read_mostly use_invpcid; ++ + unsigned long __read_mostly cr4_pv32_mask; + + /* **** Linux config option: propagated to domain0. */ +@@ -169,9 +177,6 @@ static int __init parse_smap_param(const char *s) + } + custom_param("smap", parse_smap_param); + +-static int8_t __initdata opt_xpti = -1; +-boolean_param("xpti", opt_xpti); +- + bool __read_mostly acpi_disabled; + bool __initdata acpi_force; + static char __initdata acpi_param[10] = ""; +@@ -663,7 +668,7 @@ void __init noreturn __start_xen(unsigned long mbi_p) + { + char *memmap_type = NULL; + char *cmdline, *kextra, *loader; +- unsigned int initrdidx, domcr_flags = DOMCRF_s3_integrity; ++ unsigned int initrdidx, num_parked = 0, domcr_flags = DOMCRF_s3_integrity; + multiboot_info_t *mbi; + module_t *mod; + unsigned long nr_pages, raw_max_page, modules_headroom, *module_map; +@@ -905,6 +910,18 @@ void __init noreturn __start_xen(unsigned long mbi_p) + /* Sanitise the raw E820 map to produce a final clean version. */ + max_page = raw_max_page = init_e820(memmap_type, &e820_raw); + ++ if ( !efi_enabled(EFI_BOOT) ) ++ { ++ /* ++ * Supplement the heuristics in l1tf_calculations() by assuming that ++ * anything referenced in the E820 may be cacheable. 
++ */ ++ l1tf_safe_maddr = ++ max(l1tf_safe_maddr, ++ ROUNDUP(e820_raw.map[e820_raw.nr_map - 1].addr + ++ e820_raw.map[e820_raw.nr_map - 1].size, PAGE_SIZE)); ++ } ++ + /* Create a temporary copy of the E820 map. */ + memcpy(&boot_e820, &e820, sizeof(e820)); + +@@ -1485,7 +1502,8 @@ void __init noreturn __start_xen(unsigned long mbi_p) + else + { + set_nr_cpu_ids(max_cpus); +- max_cpus = nr_cpu_ids; ++ if ( !max_cpus ) ++ max_cpus = nr_cpu_ids; + } + + if ( xen_guest ) +@@ -1539,25 +1557,12 @@ void __init noreturn __start_xen(unsigned long mbi_p) + + cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS; + +- if ( opt_xpti < 0 ) +- { +- uint64_t caps = 0; +- +- if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) +- caps = ARCH_CAPABILITIES_RDCL_NO; +- else if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) ) +- rdmsrl(MSR_ARCH_CAPABILITIES, caps); +- +- opt_xpti = !(caps & ARCH_CAPABILITIES_RDCL_NO); +- } +- if ( opt_xpti ) +- setup_clear_cpu_cap(X86_FEATURE_NO_XPTI); +- else +- setup_force_cpu_cap(X86_FEATURE_NO_XPTI); +- + if ( cpu_has_fsgsbase ) + set_in_cr4(X86_CR4_FSGSBASE); + ++ if ( opt_invpcid && cpu_has_invpcid ) ++ use_invpcid = true; ++ + init_speculation_mitigations(); + + init_idle_domain(); +@@ -1621,16 +1626,30 @@ void __init noreturn __start_xen(unsigned long mbi_p) + /* Set up node_to_cpumask based on cpu_to_node[]. 
*/ + numa_add_cpu(i); + +- if ( (num_online_cpus() < max_cpus) && !cpu_online(i) ) ++ if ( (park_offline_cpus || num_online_cpus() < max_cpus) && ++ !cpu_online(i) ) + { + int ret = cpu_up(i); + if ( ret != 0 ) + printk("Failed to bring up CPU %u (error %d)\n", i, ret); ++ else if ( num_online_cpus() > max_cpus || ++ (!opt_smt && ++ cpu_data[i].compute_unit_id == INVALID_CUID && ++ cpumask_weight(per_cpu(cpu_sibling_mask, i)) > 1) ) ++ { ++ ret = cpu_down(i); ++ if ( !ret ) ++ ++num_parked; ++ else ++ printk("Could not re-offline CPU%u (%d)\n", i, ret); ++ } + } + } + } + + printk("Brought up %ld CPUs\n", (long)num_online_cpus()); ++ if ( num_parked ) ++ printk(XENLOG_INFO "Parked %u CPUs\n", num_parked); + smp_cpus_done(); + + do_initcalls(); +@@ -1746,6 +1765,13 @@ void __init noreturn __start_xen(unsigned long mbi_p) + + setup_io_bitmap(dom0); + ++ if ( bsp_delay_spec_ctrl ) ++ { ++ get_cpu_info()->spec_ctrl_flags &= ~SCF_use_shadow; ++ barrier(); ++ wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl); ++ } ++ + /* Jump to the 1:1 virtual mappings of cpu0_stack. 
*/ + asm volatile ("mov %[stk], %%rsp; jmp %c[fn]" :: + [stk] "g" (__va(__pa(get_stack_bottom()))), +diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c +index fd6d254812..517ba2f70d 100644 +--- a/xen/arch/x86/smp.c ++++ b/xen/arch/x86/smp.c +@@ -208,7 +208,7 @@ void invalidate_interrupt(struct cpu_user_regs *regs) + ack_APIC_irq(); + perfc_incr(ipis); + if ( __sync_local_execstate() ) +- flags &= ~(FLUSH_TLB | FLUSH_TLB_GLOBAL); ++ flags &= ~(FLUSH_TLB | FLUSH_TLB_GLOBAL | FLUSH_ROOT_PGTBL); + flush_area_local(flush_va, flags); + cpumask_clear_cpu(smp_processor_id(), &flush_cpumask); + } +diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c +index e1d023428c..b0496eb66e 100644 +--- a/xen/arch/x86/smpboot.c ++++ b/xen/arch/x86/smpboot.c +@@ -69,6 +69,8 @@ static cpumask_t scratch_cpu0mask; + cpumask_t cpu_online_map __read_mostly; + EXPORT_SYMBOL(cpu_online_map); + ++bool __read_mostly park_offline_cpus; ++ + unsigned int __read_mostly nr_sockets; + cpumask_t **__read_mostly socket_cpumask; + static cpumask_t *secondary_socket_cpumask; +@@ -228,33 +230,41 @@ static void link_thread_siblings(int cpu1, int cpu2) + cpumask_set_cpu(cpu2, per_cpu(cpu_core_mask, cpu1)); + } + +-static void set_cpu_sibling_map(int cpu) ++static void set_cpu_sibling_map(unsigned int cpu) + { +- int i; ++ unsigned int i; + struct cpuinfo_x86 *c = cpu_data; + + cpumask_set_cpu(cpu, &cpu_sibling_setup_map); + + cpumask_set_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]); ++ cpumask_set_cpu(cpu, per_cpu(cpu_core_mask, cpu)); ++ cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu)); + + if ( c[cpu].x86_num_siblings > 1 ) + { + for_each_cpu ( i, &cpu_sibling_setup_map ) + { +- if ( cpu_has(c, X86_FEATURE_TOPOEXT) ) { +- if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) && +- (c[cpu].compute_unit_id == c[i].compute_unit_id) ) ++ if ( cpu == i || c[cpu].phys_proc_id != c[i].phys_proc_id ) ++ continue; ++ if ( c[cpu].compute_unit_id != INVALID_CUID && ++ c[i].compute_unit_id != INVALID_CUID ) ++ 
{ ++ if ( c[cpu].compute_unit_id == c[i].compute_unit_id ) ++ link_thread_siblings(cpu, i); ++ } ++ else if ( c[cpu].cpu_core_id != XEN_INVALID_CORE_ID && ++ c[i].cpu_core_id != XEN_INVALID_CORE_ID ) ++ { ++ if ( c[cpu].cpu_core_id == c[i].cpu_core_id ) + link_thread_siblings(cpu, i); +- } else if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) && +- (c[cpu].cpu_core_id == c[i].cpu_core_id) ) { +- link_thread_siblings(cpu, i); + } ++ else ++ printk(XENLOG_WARNING ++ "CPU%u: unclear relationship with CPU%u\n", ++ cpu, i); + } + } +- else +- { +- cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu)); +- } + + if ( c[cpu].x86_max_cores == 1 ) + { +@@ -330,8 +340,9 @@ void start_secondary(void *unused) + */ + spin_debug_disable(); + ++ get_cpu_info()->use_pv_cr3 = false; + get_cpu_info()->xen_cr3 = 0; +- get_cpu_info()->pv_cr3 = this_cpu(root_pgt) ? __pa(this_cpu(root_pgt)) : 0; ++ get_cpu_info()->pv_cr3 = 0; + + load_system_tables(); + +@@ -351,6 +362,14 @@ void start_secondary(void *unused) + else + microcode_resume_cpu(cpu); + ++ /* ++ * If MSR_SPEC_CTRL is available, apply Xen's default setting and discard ++ * any firmware settings. Note: MSR_SPEC_CTRL may only become available ++ * after loading microcode. ++ */ ++ if ( boot_cpu_has(X86_FEATURE_IBRSB) ) ++ wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl); ++ + if ( xen_guest ) + hypervisor_ap_setup(); + +@@ -870,11 +889,18 @@ static void cleanup_cpu_root_pgt(unsigned int cpu) + l2_pgentry_t *l2t = l3e_to_l2e(l3t[l3_table_offset(stub_linear)]); + l1_pgentry_t *l1t = l2e_to_l1e(l2t[l2_table_offset(stub_linear)]); + +- l1t[l2_table_offset(stub_linear)] = l1e_empty(); ++ l1t[l1_table_offset(stub_linear)] = l1e_empty(); + } + } + +-static void cpu_smpboot_free(unsigned int cpu) ++/* ++ * The 'remove' boolean controls whether a CPU is just getting offlined (and ++ * parked), or outright removed / offlined without parking. Parked CPUs need ++ * things like their stack, GDT, IDT, TSS, and per-CPU data still available. 
++ * A few other items, in particular CPU masks, are also retained, as it's ++ * difficult to prove that they're entirely unreferenced from parked CPUs. ++ */ ++static void cpu_smpboot_free(unsigned int cpu, bool remove) + { + unsigned int order, socket = cpu_to_socket(cpu); + struct cpuinfo_x86 *c = cpu_data; +@@ -885,15 +911,19 @@ static void cpu_smpboot_free(unsigned int cpu) + socket_cpumask[socket] = NULL; + } + +- c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID; +- c[cpu].cpu_core_id = XEN_INVALID_CORE_ID; +- c[cpu].compute_unit_id = INVALID_CUID; + cpumask_clear_cpu(cpu, &cpu_sibling_setup_map); + +- free_cpumask_var(per_cpu(cpu_sibling_mask, cpu)); +- free_cpumask_var(per_cpu(cpu_core_mask, cpu)); +- if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask ) +- free_cpumask_var(per_cpu(scratch_cpumask, cpu)); ++ if ( remove ) ++ { ++ c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID; ++ c[cpu].cpu_core_id = XEN_INVALID_CORE_ID; ++ c[cpu].compute_unit_id = INVALID_CUID; ++ ++ FREE_CPUMASK_VAR(per_cpu(cpu_sibling_mask, cpu)); ++ FREE_CPUMASK_VAR(per_cpu(cpu_core_mask, cpu)); ++ if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask ) ++ FREE_CPUMASK_VAR(per_cpu(scratch_cpumask, cpu)); ++ } + + cleanup_cpu_root_pgt(cpu); + +@@ -915,19 +945,21 @@ static void cpu_smpboot_free(unsigned int cpu) + } + + order = get_order_from_pages(NR_RESERVED_GDT_PAGES); +- free_xenheap_pages(per_cpu(gdt_table, cpu), order); ++ if ( remove ) ++ FREE_XENHEAP_PAGES(per_cpu(gdt_table, cpu), order); + + free_xenheap_pages(per_cpu(compat_gdt_table, cpu), order); + +- order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t)); +- free_xenheap_pages(idt_tables[cpu], order); +- idt_tables[cpu] = NULL; +- +- if ( stack_base[cpu] != NULL ) ++ if ( remove ) + { +- memguard_unguard_stack(stack_base[cpu]); +- free_xenheap_pages(stack_base[cpu], STACK_ORDER); +- stack_base[cpu] = NULL; ++ order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t)); ++ FREE_XENHEAP_PAGES(idt_tables[cpu], order); 
++ ++ if ( stack_base[cpu] ) ++ { ++ memguard_unguard_stack(stack_base[cpu]); ++ FREE_XENHEAP_PAGES(stack_base[cpu], STACK_ORDER); ++ } + } + } + +@@ -941,15 +973,17 @@ static int cpu_smpboot_alloc(unsigned int cpu) + if ( node != NUMA_NO_NODE ) + memflags = MEMF_node(node); + +- stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags); ++ if ( stack_base[cpu] == NULL ) ++ stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags); + if ( stack_base[cpu] == NULL ) + goto oom; + memguard_guard_stack(stack_base[cpu]); + + order = get_order_from_pages(NR_RESERVED_GDT_PAGES); +- per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order, memflags); ++ gdt = per_cpu(gdt_table, cpu) ?: alloc_xenheap_pages(order, memflags); + if ( gdt == NULL ) + goto oom; ++ per_cpu(gdt_table, cpu) = gdt; + memcpy(gdt, boot_cpu_gdt_table, NR_RESERVED_GDT_PAGES * PAGE_SIZE); + BUILD_BUG_ON(NR_CPUS > 0x10000); + gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu; +@@ -961,13 +995,15 @@ static int cpu_smpboot_alloc(unsigned int cpu) + gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu; + + order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t)); +- idt_tables[cpu] = alloc_xenheap_pages(order, memflags); ++ if ( idt_tables[cpu] == NULL ) ++ idt_tables[cpu] = alloc_xenheap_pages(order, memflags); + if ( idt_tables[cpu] == NULL ) + goto oom; + memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t)); + set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE); + set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE); + set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE); ++ set_ist(&idt_tables[cpu][TRAP_debug], IST_NONE); + + for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1); + i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i ) +@@ -989,13 +1025,13 @@ static int cpu_smpboot_alloc(unsigned int cpu) + (secondary_socket_cpumask = xzalloc(cpumask_t)) == NULL ) + goto oom; + +- if ( zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) && +- 
zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) && +- alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu)) ) ++ if ( cond_zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) && ++ cond_zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) && ++ cond_alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu)) ) + return 0; + + oom: +- cpu_smpboot_free(cpu); ++ cpu_smpboot_free(cpu, true); + return -ENOMEM; + } + +@@ -1012,9 +1048,10 @@ static int cpu_smpboot_callback( + break; + case CPU_UP_CANCELED: + case CPU_DEAD: +- cpu_smpboot_free(cpu); ++ cpu_smpboot_free(cpu, !park_offline_cpus); + break; +- default: ++ case CPU_REMOVE: ++ cpu_smpboot_free(cpu, true); + break; + } + +@@ -1047,7 +1084,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) + panic("Error %d setting up PV root page table\n", rc); + if ( per_cpu(root_pgt, 0) ) + { +- get_cpu_info()->pv_cr3 = __pa(per_cpu(root_pgt, 0)); ++ get_cpu_info()->pv_cr3 = 0; + + /* + * All entry points which may need to switch page tables have to start +@@ -1126,6 +1163,7 @@ void __init smp_prepare_boot_cpu(void) + per_cpu(scratch_cpumask, cpu) = &scratch_cpu0mask; + #endif + ++ get_cpu_info()->use_pv_cr3 = false; + get_cpu_info()->xen_cr3 = 0; + get_cpu_info()->pv_cr3 = 0; + } +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 3c7447bfe6..14e01faff9 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -19,13 +19,23 @@ + #include <xen/errno.h> + #include <xen/init.h> + #include <xen/lib.h> ++#include <xen/warning.h> + + #include <asm/microcode.h> + #include <asm/msr.h> + #include <asm/processor.h> ++#include <asm/pv/shim.h> ++#include <asm/setup.h> + #include <asm/spec_ctrl.h> + #include <asm/spec_ctrl_asm.h> + ++/* Cmdline controls for Xen's alternative blocks. 
*/ ++static bool __initdata opt_msr_sc_pv = true; ++static bool __initdata opt_msr_sc_hvm = true; ++static bool __initdata opt_rsb_pv = true; ++static bool __initdata opt_rsb_hvm = true; ++ ++/* Cmdline controls for Xen's speculative settings. */ + static enum ind_thunk { + THUNK_DEFAULT, /* Decide which thunk to use at boot time. */ + THUNK_NONE, /* Missing compiler support for thunks. */ +@@ -35,10 +45,18 @@ static enum ind_thunk { + THUNK_JMP, + } opt_thunk __initdata = THUNK_DEFAULT; + static int8_t __initdata opt_ibrs = -1; +-static bool __initdata opt_rsb_native = true; +-static bool __initdata opt_rsb_vmexit = true; + bool __read_mostly opt_ibpb = true; +-uint8_t __read_mostly default_bti_ist_info; ++bool __read_mostly opt_ssbd = false; ++int8_t __read_mostly opt_eager_fpu = -1; ++int8_t __read_mostly opt_l1d_flush = -1; ++ ++bool __initdata bsp_delay_spec_ctrl; ++uint8_t __read_mostly default_xen_spec_ctrl; ++uint8_t __read_mostly default_spec_ctrl_flags; ++ ++paddr_t __read_mostly l1tf_addr_mask, __read_mostly l1tf_safe_maddr; ++static bool __initdata cpu_has_bug_l1tf; ++static unsigned int __initdata l1d_maxphysaddr; + + static int __init parse_bti(const char *s) + { +@@ -68,9 +86,9 @@ static int __init parse_bti(const char *s) + else if ( (val = parse_boolean("ibpb", s, ss)) >= 0 ) + opt_ibpb = val; + else if ( (val = parse_boolean("rsb_native", s, ss)) >= 0 ) +- opt_rsb_native = val; ++ opt_rsb_pv = val; + else if ( (val = parse_boolean("rsb_vmexit", s, ss)) >= 0 ) +- opt_rsb_vmexit = val; ++ opt_rsb_hvm = val; + else + rc = -EINVAL; + +@@ -81,50 +99,244 @@ static int __init parse_bti(const char *s) + } + custom_param("bti", parse_bti); + +-static void __init print_details(enum ind_thunk thunk) ++static int __init parse_spec_ctrl(const char *s) ++{ ++ const char *ss; ++ int val, rc = 0; ++ ++ do { ++ ss = strchr(s, ','); ++ if ( !ss ) ++ ss = strchr(s, '\0'); ++ ++ /* Global and Xen-wide disable. 
*/ ++ val = parse_bool(s, ss); ++ if ( !val ) ++ { ++ opt_msr_sc_pv = false; ++ opt_msr_sc_hvm = false; ++ ++ opt_eager_fpu = 0; ++ ++ if ( opt_xpti < 0 ) ++ opt_xpti = 0; ++ ++ if ( opt_smt < 0 ) ++ opt_smt = 1; ++ ++ if ( opt_pv_l1tf < 0 ) ++ opt_pv_l1tf = 0; ++ ++ disable_common: ++ opt_rsb_pv = false; ++ opt_rsb_hvm = false; ++ ++ opt_thunk = THUNK_JMP; ++ opt_ibrs = 0; ++ opt_ibpb = false; ++ opt_ssbd = false; ++ opt_l1d_flush = 0; ++ } ++ else if ( val > 0 ) ++ rc = -EINVAL; ++ else if ( (val = parse_boolean("xen", s, ss)) >= 0 ) ++ { ++ if ( !val ) ++ goto disable_common; ++ ++ rc = -EINVAL; ++ } ++ ++ /* Xen's alternative blocks. */ ++ else if ( (val = parse_boolean("pv", s, ss)) >= 0 ) ++ { ++ opt_msr_sc_pv = val; ++ opt_rsb_pv = val; ++ } ++ else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) ++ { ++ opt_msr_sc_hvm = val; ++ opt_rsb_hvm = val; ++ } ++ else if ( (val = parse_boolean("msr-sc", s, ss)) >= 0 ) ++ { ++ opt_msr_sc_pv = val; ++ opt_msr_sc_hvm = val; ++ } ++ else if ( (val = parse_boolean("rsb", s, ss)) >= 0 ) ++ { ++ opt_rsb_pv = val; ++ opt_rsb_hvm = val; ++ } ++ ++ /* Xen's speculative sidechannel mitigation settings. 
*/ ++ else if ( !strncmp(s, "bti-thunk=", 10) ) ++ { ++ s += 10; ++ ++ if ( !strncmp(s, "retpoline", ss - s) ) ++ opt_thunk = THUNK_RETPOLINE; ++ else if ( !strncmp(s, "lfence", ss - s) ) ++ opt_thunk = THUNK_LFENCE; ++ else if ( !strncmp(s, "jmp", ss - s) ) ++ opt_thunk = THUNK_JMP; ++ else ++ rc = -EINVAL; ++ } ++ else if ( (val = parse_boolean("ibrs", s, ss)) >= 0 ) ++ opt_ibrs = val; ++ else if ( (val = parse_boolean("ibpb", s, ss)) >= 0 ) ++ opt_ibpb = val; ++ else if ( (val = parse_boolean("ssbd", s, ss)) >= 0 ) ++ opt_ssbd = val; ++ else if ( (val = parse_boolean("eager-fpu", s, ss)) >= 0 ) ++ opt_eager_fpu = val; ++ else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 ) ++ opt_l1d_flush = val; ++ else ++ rc = -EINVAL; ++ ++ s = ss + 1; ++ } while ( *ss ); ++ ++ return rc; ++} ++custom_param("spec-ctrl", parse_spec_ctrl); ++ ++int8_t __read_mostly opt_pv_l1tf = -1; ++ ++static __init int parse_pv_l1tf(const char *s) ++{ ++ const char *ss; ++ int val, rc = 0; ++ ++ /* Inhibit the defaults as an explicit choice has been given. */ ++ if ( opt_pv_l1tf == -1 ) ++ opt_pv_l1tf = 0; ++ ++ /* Interpret 'pv-l1tf' alone in its positive boolean form. */ ++ if ( *s == '\0' ) ++ opt_pv_l1tf = OPT_PV_L1TF_DOM0 | OPT_PV_L1TF_DOMU; ++ ++ do { ++ ss = strchr(s, ','); ++ if ( !ss ) ++ ss = strchr(s, '\0'); ++ ++ switch ( parse_bool(s, ss) ) ++ { ++ case 0: ++ opt_pv_l1tf = 0; ++ break; ++ ++ case 1: ++ opt_pv_l1tf = OPT_PV_L1TF_DOM0 | OPT_PV_L1TF_DOMU; ++ break; ++ ++ default: ++ if ( (val = parse_boolean("dom0", s, ss)) >= 0 ) ++ opt_pv_l1tf = ((opt_pv_l1tf & ~OPT_PV_L1TF_DOM0) | ++ (val ? OPT_PV_L1TF_DOM0 : 0)); ++ else if ( (val = parse_boolean("domu", s, ss)) >= 0 ) ++ opt_pv_l1tf = ((opt_pv_l1tf & ~OPT_PV_L1TF_DOMU) | ++ (val ? 
OPT_PV_L1TF_DOMU : 0)); ++ else ++ rc = -EINVAL; ++ break; ++ } ++ ++ s = ss + 1; ++ } while ( *ss ); ++ ++ return rc; ++} ++custom_param("pv-l1tf", parse_pv_l1tf); ++ ++static void __init print_details(enum ind_thunk thunk, uint64_t caps) + { + unsigned int _7d0 = 0, e8b = 0, tmp; +- uint64_t caps = 0; + + /* Collect diagnostics about available mitigations. */ + if ( boot_cpu_data.cpuid_level >= 7 ) + cpuid_count(7, 0, &tmp, &tmp, &tmp, &_7d0); + if ( boot_cpu_data.extended_cpuid_level >= 0x80000008 ) + cpuid(0x80000008, &tmp, &e8b, &tmp, &tmp); +- if ( _7d0 & cpufeat_mask(X86_FEATURE_ARCH_CAPS) ) +- rdmsrl(MSR_ARCH_CAPABILITIES, caps); + +- printk(XENLOG_DEBUG "Speculative mitigation facilities:\n"); ++ printk("Speculative mitigation facilities:\n"); + + /* Hardware features which pertain to speculative mitigations. */ +- printk(XENLOG_DEBUG " Hardware features:%s%s%s%s%s\n", ++ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s\n", + (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "", + (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "", ++ (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "", ++ (_7d0 & cpufeat_mask(X86_FEATURE_SSBD)) ? " SSBD" : "", + (e8b & cpufeat_mask(X86_FEATURE_IBPB)) ? " IBPB" : "", + (caps & ARCH_CAPABILITIES_IBRS_ALL) ? " IBRS_ALL" : "", +- (caps & ARCH_CAPABILITIES_RDCL_NO) ? " RDCL_NO" : ""); +- +- /* Compiled-in support which pertains to BTI mitigations. */ +- if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) ) +- printk(XENLOG_DEBUG " Compiled-in support: INDIRECT_THUNK\n"); +- +- printk("BTI mitigations: Thunk %s, Others:%s%s%s%s\n", ++ (caps & ARCH_CAPABILITIES_RDCL_NO) ? " RDCL_NO" : "", ++ (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", ++ (caps & ARCH_CAPS_SKIP_L1DFL) ? " SKIP_L1DFL": "", ++ (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : ""); ++ ++ /* Compiled-in support which pertains to mitigations. 
*/ ++ if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) ) ++ printk(" Compiled-in support:" ++#ifdef CONFIG_INDIRECT_THUNK ++ " INDIRECT_THUNK" ++#endif ++#ifdef CONFIG_SHADOW_PAGING ++ " SHADOW_PAGING" ++#endif ++ "\n"); ++ ++ /* Settings for Xen's protection, irrespective of guests. */ ++ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s\n", + thunk == THUNK_NONE ? "N/A" : + thunk == THUNK_RETPOLINE ? "RETPOLINE" : + thunk == THUNK_LFENCE ? "LFENCE" : + thunk == THUNK_JMP ? "JMP" : "?", +- boot_cpu_has(X86_FEATURE_XEN_IBRS_SET) ? " IBRS+" : +- boot_cpu_has(X86_FEATURE_XEN_IBRS_CLEAR) ? " IBRS-" : "", +- opt_ibpb ? " IBPB" : "", +- boot_cpu_has(X86_FEATURE_RSB_NATIVE) ? " RSB_NATIVE" : "", +- boot_cpu_has(X86_FEATURE_RSB_VMEXIT) ? " RSB_VMEXIT" : ""); +- +- printk("XPTI: %s\n", +- boot_cpu_has(X86_FEATURE_NO_XPTI) ? "disabled" : "enabled"); ++ !boot_cpu_has(X86_FEATURE_IBRSB) ? "No" : ++ (default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? "IBRS+" : "IBRS-", ++ !boot_cpu_has(X86_FEATURE_SSBD) ? "" : ++ (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-", ++ opt_ibpb ? " IBPB" : "", ++ opt_l1d_flush ? " L1D_FLUSH" : ""); ++ ++ /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */ ++ if ( cpu_has_bug_l1tf || opt_pv_l1tf ) ++ printk(" L1TF: believed%s vulnerable, maxphysaddr L1D %u, CPUID %u" ++ ", Safe address %"PRIx64"\n", ++ cpu_has_bug_l1tf ? "" : " not", ++ l1d_maxphysaddr, paddr_bits, l1tf_safe_maddr); ++ ++ /* ++ * Alternatives blocks for protecting against and/or virtualising ++ * mitigation support for guests. ++ */ ++ printk(" Support for VMs: PV:%s%s%s%s, HVM:%s%s%s%s\n", ++ (boot_cpu_has(X86_FEATURE_SC_MSR_PV) || ++ boot_cpu_has(X86_FEATURE_SC_RSB_PV) || ++ opt_eager_fpu) ? "" : " None", ++ boot_cpu_has(X86_FEATURE_SC_MSR_PV) ? " MSR_SPEC_CTRL" : "", ++ boot_cpu_has(X86_FEATURE_SC_RSB_PV) ? " RSB" : "", ++ opt_eager_fpu ? 
" EAGER_FPU" : "", ++ (boot_cpu_has(X86_FEATURE_SC_MSR_HVM) || ++ boot_cpu_has(X86_FEATURE_SC_RSB_HVM) || ++ opt_eager_fpu) ? "" : " None", ++ boot_cpu_has(X86_FEATURE_SC_MSR_HVM) ? " MSR_SPEC_CTRL" : "", ++ boot_cpu_has(X86_FEATURE_SC_RSB_HVM) ? " RSB" : "", ++ opt_eager_fpu ? " EAGER_FPU" : ""); ++ ++ printk(" XPTI (64-bit PV only): Dom0 %s, DomU %s\n", ++ opt_xpti & OPT_XPTI_DOM0 ? "enabled" : "disabled", ++ opt_xpti & OPT_XPTI_DOMU ? "enabled" : "disabled"); ++ ++ printk(" PV L1TF shadowing: Dom0 %s, DomU %s\n", ++ opt_pv_l1tf & OPT_PV_L1TF_DOM0 ? "enabled" : "disabled", ++ opt_pv_l1tf & OPT_PV_L1TF_DOMU ? "enabled" : "disabled"); + } + + /* Calculate whether Retpoline is known-safe on this CPU. */ +-static bool __init retpoline_safe(void) ++static bool __init retpoline_safe(uint64_t caps) + { + unsigned int ucode_rev = this_cpu(ucode_cpu_info).cpu_sig.rev; + +@@ -135,6 +347,13 @@ static bool __init retpoline_safe(void) + boot_cpu_data.x86 != 6 ) + return false; + ++ /* ++ * RSBA may be set by a hypervisor to indicate that we may move to a ++ * processor which isn't retpoline-safe. ++ */ ++ if ( caps & ARCH_CAPS_RSBA ) ++ return false; ++ + switch ( boot_cpu_data.x86_model ) + { + case 0x17: /* Penryn */ +@@ -161,26 +380,337 @@ static bool __init retpoline_safe(void) + * versions. + */ + case 0x3d: /* Broadwell */ +- return ucode_rev >= 0x28; ++ return ucode_rev >= 0x2a; + case 0x47: /* Broadwell H */ +- return ucode_rev >= 0x1b; ++ return ucode_rev >= 0x1d; + case 0x4f: /* Broadwell EP/EX */ +- return ucode_rev >= 0xb000025; ++ return ucode_rev >= 0xb000021; + case 0x56: /* Broadwell D */ +- return false; /* TBD. 
*/ ++ switch ( boot_cpu_data.x86_mask ) ++ { ++ case 2: return ucode_rev >= 0x15; ++ case 3: return ucode_rev >= 0x7000012; ++ case 4: return ucode_rev >= 0xf000011; ++ case 5: return ucode_rev >= 0xe000009; ++ default: ++ printk("Unrecognised CPU stepping %#x - assuming not reptpoline safe\n", ++ boot_cpu_data.x86_mask); ++ return false; ++ } ++ break; + + /* +- * Skylake and later processors are not retpoline-safe. ++ * Skylake, Kabylake and Cannonlake processors are not retpoline-safe. + */ ++ case 0x4e: ++ case 0x55: ++ case 0x5e: ++ case 0x66: ++ case 0x67: ++ case 0x8e: ++ case 0x9e: ++ return false; ++ + default: ++ printk("Unrecognised CPU model %#x - assuming not reptpoline safe\n", ++ boot_cpu_data.x86_model); + return false; + } + } + ++/* Calculate whether this CPU speculates past #NM */ ++static bool __init should_use_eager_fpu(void) ++{ ++ /* ++ * Assume all unrecognised processors are ok. This is only known to ++ * affect Intel Family 6 processors. ++ */ ++ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || ++ boot_cpu_data.x86 != 6 ) ++ return false; ++ ++ switch ( boot_cpu_data.x86_model ) ++ { ++ /* ++ * Core processors since at least Nehalem are vulnerable. ++ */ ++ case 0x1e: /* Nehalem */ ++ case 0x1f: /* Auburndale / Havendale */ ++ case 0x1a: /* Nehalem EP */ ++ case 0x2e: /* Nehalem EX */ ++ case 0x25: /* Westmere */ ++ case 0x2c: /* Westmere EP */ ++ case 0x2f: /* Westmere EX */ ++ case 0x2a: /* SandyBridge */ ++ case 0x2d: /* SandyBridge EP/EX */ ++ case 0x3a: /* IvyBridge */ ++ case 0x3e: /* IvyBridge EP/EX */ ++ case 0x3c: /* Haswell */ ++ case 0x3f: /* Haswell EX/EP */ ++ case 0x45: /* Haswell D */ ++ case 0x46: /* Haswell H */ ++ case 0x3d: /* Broadwell */ ++ case 0x47: /* Broadwell H */ ++ case 0x4f: /* Broadwell EP/EX */ ++ case 0x56: /* Broadwell D */ ++ case 0x4e: /* Skylake M */ ++ case 0x55: /* Skylake X */ ++ case 0x5e: /* Skylake D */ ++ case 0x66: /* Cannonlake */ ++ case 0x67: /* Cannonlake? 
*/ ++ case 0x8e: /* Kabylake M */ ++ case 0x9e: /* Kabylake D */ ++ return true; ++ ++ /* ++ * Atom processors are not vulnerable. ++ */ ++ case 0x1c: /* Pineview */ ++ case 0x26: /* Lincroft */ ++ case 0x27: /* Penwell */ ++ case 0x35: /* Cloverview */ ++ case 0x36: /* Cedarview */ ++ case 0x37: /* Baytrail / Valleyview (Silvermont) */ ++ case 0x4d: /* Avaton / Rangely (Silvermont) */ ++ case 0x4c: /* Cherrytrail / Brasswell */ ++ case 0x4a: /* Merrifield */ ++ case 0x5a: /* Moorefield */ ++ case 0x5c: /* Goldmont */ ++ case 0x5f: /* Denverton */ ++ case 0x7a: /* Gemini Lake */ ++ return false; ++ ++ /* ++ * Knights processors are not vulnerable. ++ */ ++ case 0x57: /* Knights Landing */ ++ case 0x85: /* Knights Mill */ ++ return false; ++ ++ default: ++ printk("Unrecognised CPU model %#x - assuming vulnerable to LazyFPU\n", ++ boot_cpu_data.x86_model); ++ return true; ++ } ++} ++ ++/* Calculate whether this CPU is vulnerable to L1TF. */ ++static __init void l1tf_calculations(uint64_t caps) ++{ ++ bool hit_default = false; ++ ++ l1d_maxphysaddr = paddr_bits; ++ ++ /* L1TF is only known to affect Intel Family 6 processors at this time. */ ++ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && ++ boot_cpu_data.x86 == 6 ) ++ { ++ switch ( boot_cpu_data.x86_model ) ++ { ++ /* ++ * Core processors since at least Penryn are vulnerable. 
++ */ ++ case 0x17: /* Penryn */ ++ case 0x1d: /* Dunnington */ ++ cpu_has_bug_l1tf = true; ++ break; ++ ++ case 0x1f: /* Auburndale / Havendale */ ++ case 0x1e: /* Nehalem */ ++ case 0x1a: /* Nehalem EP */ ++ case 0x2e: /* Nehalem EX */ ++ case 0x25: /* Westmere */ ++ case 0x2c: /* Westmere EP */ ++ case 0x2f: /* Westmere EX */ ++ cpu_has_bug_l1tf = true; ++ l1d_maxphysaddr = 44; ++ break; ++ ++ case 0x2a: /* SandyBridge */ ++ case 0x2d: /* SandyBridge EP/EX */ ++ case 0x3a: /* IvyBridge */ ++ case 0x3e: /* IvyBridge EP/EX */ ++ case 0x3c: /* Haswell */ ++ case 0x3f: /* Haswell EX/EP */ ++ case 0x45: /* Haswell D */ ++ case 0x46: /* Haswell H */ ++ case 0x3d: /* Broadwell */ ++ case 0x47: /* Broadwell H */ ++ case 0x4f: /* Broadwell EP/EX */ ++ case 0x56: /* Broadwell D */ ++ case 0x4e: /* Skylake M */ ++ case 0x55: /* Skylake X */ ++ case 0x5e: /* Skylake D */ ++ case 0x66: /* Cannonlake */ ++ case 0x67: /* Cannonlake? */ ++ case 0x8e: /* Kabylake M */ ++ case 0x9e: /* Kabylake D */ ++ cpu_has_bug_l1tf = true; ++ l1d_maxphysaddr = 46; ++ break; ++ ++ /* ++ * Atom processors are not vulnerable. ++ */ ++ case 0x1c: /* Pineview */ ++ case 0x26: /* Lincroft */ ++ case 0x27: /* Penwell */ ++ case 0x35: /* Cloverview */ ++ case 0x36: /* Cedarview */ ++ case 0x37: /* Baytrail / Valleyview (Silvermont) */ ++ case 0x4d: /* Avaton / Rangely (Silvermont) */ ++ case 0x4c: /* Cherrytrail / Brasswell */ ++ case 0x4a: /* Merrifield */ ++ case 0x5a: /* Moorefield */ ++ case 0x5c: /* Goldmont */ ++ case 0x5f: /* Denverton */ ++ case 0x7a: /* Gemini Lake */ ++ break; ++ ++ /* ++ * Knights processors are not vulnerable. ++ */ ++ case 0x57: /* Knights Landing */ ++ case 0x85: /* Knights Mill */ ++ break; ++ ++ default: ++ /* Defer printk() until we've accounted for RDCL_NO. */ ++ hit_default = true; ++ cpu_has_bug_l1tf = true; ++ break; ++ } ++ } ++ ++ /* Any processor advertising RDCL_NO should be not vulnerable to L1TF. 
*/ ++ if ( caps & ARCH_CAPABILITIES_RDCL_NO ) ++ cpu_has_bug_l1tf = false; ++ ++ if ( cpu_has_bug_l1tf && hit_default ) ++ printk("Unrecognised CPU model %#x - assuming vulnerable to L1TF\n", ++ boot_cpu_data.x86_model); ++ ++ /* ++ * L1TF safe address heuristics. These apply to the real hardware we are ++ * running on, and are best-effort-only if Xen is virtualised. ++ * ++ * The address mask which the L1D cache uses, which might be wider than ++ * the CPUID-reported maxphysaddr. ++ */ ++ l1tf_addr_mask = ((1ul << l1d_maxphysaddr) - 1) & PAGE_MASK; ++ ++ /* ++ * To be safe, l1tf_safe_maddr must be above the highest cacheable entity ++ * in system physical address space. However, to preserve space for ++ * paged-out metadata, it should be as low as possible above the highest ++ * cacheable address, so as to require fewer high-order bits being set. ++ * ++ * These heuristics are based on some guesswork to improve the likelihood ++ * of safety in the common case, including Linux's L1TF mitigation of ++ * inverting all address bits in a non-present PTE. ++ * ++ * - If L1D is wider than CPUID (Nehalem and later mobile/desktop/low end ++ * server), setting any address bit beyond CPUID maxphysaddr guarantees ++ * to make the PTE safe. This case doesn't require all the high-order ++ * bits being set, and doesn't require any other source of information ++ * for safety. ++ * ++ * - If L1D is the same as CPUID (Pre-Nehalem, or high end server), we ++ * must sacrifice high order bits from the real address space for ++ * safety. Therefore, make a blind guess that there is nothing ++ * cacheable in the top quarter of physical address space. ++ * ++ * It is exceedingly unlikely for machines to be populated with this ++ * much RAM (likely 512G on pre-Nehalem, 16T on Nehalem/Westmere, 64T on ++ * Sandybridge and later) due to the sheer volume of DIMMs this would ++ * actually take. 
++ * ++ * However, it is possible to find machines this large, so the "top ++ * quarter" guess is supplemented to push the limit higher if references ++ * to cacheable mappings (E820/SRAT/EFI/etc) are found above the top ++ * quarter boundary. ++ * ++ * Finally, this top quarter guess gives us a good chance of being safe ++ * when running virtualised (and the CPUID maxphysaddr hasn't been ++ * levelled for heterogeneous migration safety), where the safety ++ * consideration is still in terms of host details, but all E820/etc ++ * information is in terms of guest physical layout. ++ */ ++ l1tf_safe_maddr = max(l1tf_safe_maddr, ((l1d_maxphysaddr > paddr_bits) ++ ? (1ul << paddr_bits) ++ : (3ul << (paddr_bits - 2)))); ++} ++ ++int8_t __read_mostly opt_xpti = -1; ++ ++static __init void xpti_init_default(uint64_t caps) ++{ ++ if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) ++ caps = ARCH_CAPABILITIES_RDCL_NO; ++ ++ if ( caps & ARCH_CAPABILITIES_RDCL_NO ) ++ opt_xpti = 0; ++ else ++ opt_xpti = OPT_XPTI_DOM0 | OPT_XPTI_DOMU; ++} ++ ++static __init int parse_xpti(const char *s) ++{ ++ const char *ss; ++ int val, rc = 0; ++ ++ /* Inhibit the defaults as an explicit choice has been given. */ ++ if ( opt_xpti == -1 ) ++ opt_xpti = 0; ++ ++ /* Interpret 'xpti' alone in its positive boolean form. */ ++ if ( *s == '\0' ) ++ opt_xpti = OPT_XPTI_DOM0 | OPT_XPTI_DOMU; ++ ++ do { ++ ss = strchr(s, ','); ++ if ( !ss ) ++ ss = strchr(s, '\0'); ++ ++ switch ( parse_bool(s, ss) ) ++ { ++ case 0: ++ opt_xpti = 0; ++ break; ++ ++ case 1: ++ opt_xpti = OPT_XPTI_DOM0 | OPT_XPTI_DOMU; ++ break; ++ ++ default: ++ if ( !strcmp(s, "default") ) ++ opt_xpti = -1; ++ else if ( (val = parse_boolean("dom0", s, ss)) >= 0 ) ++ opt_xpti = (opt_xpti & ~OPT_XPTI_DOM0) | ++ (val ? OPT_XPTI_DOM0 : 0); ++ else if ( (val = parse_boolean("domu", s, ss)) >= 0 ) ++ opt_xpti = (opt_xpti & ~OPT_XPTI_DOMU) | ++ (val ? 
OPT_XPTI_DOMU : 0); ++ else ++ rc = -EINVAL; ++ break; ++ } ++ ++ s = ss + 1; ++ } while ( *ss ); ++ ++ return rc; ++} ++custom_param("xpti", parse_xpti); ++ + void __init init_speculation_mitigations(void) + { + enum ind_thunk thunk = THUNK_DEFAULT; +- bool ibrs = false; ++ bool use_spec_ctrl = false, ibrs = false; ++ uint64_t caps = 0; ++ ++ if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) ) ++ rdmsrl(MSR_ARCH_CAPABILITIES, caps); + + /* + * Has the user specified any custom BTI mitigations? If so, follow their +@@ -209,7 +739,7 @@ void __init init_speculation_mitigations(void) + * On Intel hardware, we'd like to use retpoline in preference to + * IBRS, but only if it is safe on this hardware. + */ +- else if ( retpoline_safe() ) ++ else if ( retpoline_safe(caps) ) + thunk = THUNK_RETPOLINE; + else if ( boot_cpu_has(X86_FEATURE_IBRSB) ) + ibrs = true; +@@ -246,21 +776,35 @@ void __init init_speculation_mitigations(void) + else if ( thunk == THUNK_JMP ) + setup_force_cpu_cap(X86_FEATURE_IND_THUNK_JMP); + ++ /* ++ * If we are on hardware supporting MSR_SPEC_CTRL, see about setting up ++ * the alternatives blocks so we can virtualise support for guests. ++ */ + if ( boot_cpu_has(X86_FEATURE_IBRSB) ) + { +- /* +- * Even if we've chosen to not have IBRS set in Xen context, we still +- * need the IBRS entry/exit logic to virtualise IBRS support for +- * guests. +- */ +- if ( ibrs ) +- setup_force_cpu_cap(X86_FEATURE_XEN_IBRS_SET); +- else +- setup_force_cpu_cap(X86_FEATURE_XEN_IBRS_CLEAR); ++ if ( opt_msr_sc_pv ) ++ { ++ use_spec_ctrl = true; ++ setup_force_cpu_cap(X86_FEATURE_SC_MSR_PV); ++ } ++ ++ if ( opt_msr_sc_hvm ) ++ { ++ use_spec_ctrl = true; ++ setup_force_cpu_cap(X86_FEATURE_SC_MSR_HVM); ++ } ++ ++ if ( use_spec_ctrl ) ++ default_spec_ctrl_flags |= SCF_ist_wrmsr; + +- default_bti_ist_info |= BTI_IST_WRMSR | ibrs; ++ if ( ibrs ) ++ default_xen_spec_ctrl |= SPEC_CTRL_IBRS; + } + ++ /* If we have SSBD available, see whether we should use it. 
*/ ++ if ( boot_cpu_has(X86_FEATURE_SSBD) && opt_ssbd ) ++ default_xen_spec_ctrl |= SPEC_CTRL_SSBD; ++ + /* + * PV guests can poison the RSB to any virtual address from which + * they can execute a call instruction. This is necessarily outside +@@ -274,33 +818,123 @@ void __init init_speculation_mitigations(void) + * If a processors speculates to 32bit PV guest kernel mappings, it is + * speculating in 64bit supervisor mode, and can leak data. + */ +- if ( opt_rsb_native ) ++ if ( opt_rsb_pv ) + { +- setup_force_cpu_cap(X86_FEATURE_RSB_NATIVE); +- default_bti_ist_info |= BTI_IST_RSB; ++ setup_force_cpu_cap(X86_FEATURE_SC_RSB_PV); ++ default_spec_ctrl_flags |= SCF_ist_rsb; + } + + /* + * HVM guests can always poison the RSB to point at Xen supervisor + * mappings. + */ +- if ( opt_rsb_vmexit ) +- setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); ++ if ( opt_rsb_hvm ) ++ setup_force_cpu_cap(X86_FEATURE_SC_RSB_HVM); + + /* Check we have hardware IBPB support before using it... */ + if ( !boot_cpu_has(X86_FEATURE_IBRSB) && !boot_cpu_has(X86_FEATURE_IBPB) ) + opt_ibpb = false; + +- /* (Re)init BSP state now that default_bti_ist_info has been calculated. */ ++ /* Check whether Eager FPU should be enabled by default. */ ++ if ( opt_eager_fpu == -1 ) ++ opt_eager_fpu = should_use_eager_fpu(); ++ ++ /* (Re)init BSP state now that default_spec_ctrl_flags has been calculated. */ + init_shadow_spec_ctrl_state(); + +- print_details(thunk); ++ /* If Xen is using any MSR_SPEC_CTRL settings, adjust the idle path. */ ++ if ( default_xen_spec_ctrl ) ++ setup_force_cpu_cap(X86_FEATURE_SC_MSR_IDLE); ++ ++ if ( opt_xpti == -1 ) ++ xpti_init_default(caps); ++ ++ if ( opt_xpti == 0 ) ++ setup_force_cpu_cap(X86_FEATURE_NO_XPTI); ++ else ++ setup_clear_cpu_cap(X86_FEATURE_NO_XPTI); ++ ++ l1tf_calculations(caps); ++ ++ /* ++ * By default, enable PV domU L1TF mitigations on all L1TF-vulnerable ++ * hardware, except when running in shim mode. 
++ * ++ * In shim mode, SHADOW is expected to be compiled out, and a malicious ++ * guest kernel can only attack the shim Xen, not the host Xen. ++ */ ++ if ( opt_pv_l1tf == -1 ) ++ { ++ if ( pv_shim || !cpu_has_bug_l1tf ) ++ opt_pv_l1tf = 0; ++ else ++ opt_pv_l1tf = OPT_PV_L1TF_DOMU; ++ } ++ ++ /* ++ * By default, enable L1D_FLUSH on L1TF-vulnerable hardware, unless ++ * instructed to skip the flush on vmentry by our outer hypervisor. ++ */ ++ if ( !boot_cpu_has(X86_FEATURE_L1D_FLUSH) ) ++ opt_l1d_flush = 0; ++ else if ( opt_l1d_flush == -1 ) ++ opt_l1d_flush = cpu_has_bug_l1tf && !(caps & ARCH_CAPS_SKIP_L1DFL); ++ ++ /* ++ * We do not disable HT by default on affected hardware. ++ * ++ * Firstly, if the user intends to use exclusively PV, or HVM shadow ++ * guests, HT isn't a concern and should remain fully enabled. Secondly, ++ * safety for HVM HAP guests can be arranged by the toolstack with core ++ * parking, pinning or cpupool configurations, including mixed setups. ++ * ++ * However, if we are on affected hardware, with HT enabled, and the user ++ * hasn't explicitly chosen whether to use HT or not, nag them to do so. ++ */ ++ if ( opt_smt == -1 && cpu_has_bug_l1tf && !pv_shim && ++ boot_cpu_data.x86_num_siblings > 1 ) ++ warning_add( ++ "Booted on L1TF-vulnerable hardware with SMT/Hyperthreading\n" ++ "enabled. Please assess your configuration and choose an\n" ++ "explicit 'smt=<bool>' setting. See XSA-273.\n"); ++ ++ print_details(thunk, caps); ++ ++ /* ++ * If MSR_SPEC_CTRL is available, apply Xen's default setting and discard ++ * any firmware settings. For performance reasons, when safe to do so, we ++ * delay applying non-zero settings until after dom0 has been constructed. ++ * ++ * "when safe to do so" is based on whether we are virtualised. A native ++ * boot won't have any other code running in a position to mount an ++ * attack. 
++ */ ++ if ( boot_cpu_has(X86_FEATURE_IBRSB) ) ++ { ++ bsp_delay_spec_ctrl = !cpu_has_hypervisor && default_xen_spec_ctrl; ++ ++ /* ++ * If delaying MSR_SPEC_CTRL setup, use the same mechanism as ++ * spec_ctrl_enter_idle(), by using a shadow value of zero. ++ */ ++ if ( bsp_delay_spec_ctrl ) ++ { ++ struct cpu_info *info = get_cpu_info(); ++ ++ info->shadow_spec_ctrl = 0; ++ barrier(); ++ info->spec_ctrl_flags |= SCF_use_shadow; ++ barrier(); ++ } ++ ++ wrmsrl(MSR_SPEC_CTRL, bsp_delay_spec_ctrl ? 0 : default_xen_spec_ctrl); ++ } + } + + static void __init __maybe_unused build_assertions(void) + { + /* The optimised assembly relies on this alias. */ +- BUILD_BUG_ON(BTI_IST_IBRS != SPEC_CTRL_IBRS); ++ BUILD_BUG_ON(SCF_use_shadow != 1); + } + + /* +diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c +index 166eb44fe2..2d70b45909 100644 +--- a/xen/arch/x86/srat.c ++++ b/xen/arch/x86/srat.c +@@ -20,6 +20,7 @@ + #include <xen/pfn.h> + #include <asm/e820.h> + #include <asm/page.h> ++#include <asm/spec_ctrl.h> + + static struct acpi_table_slit *__read_mostly acpi_slit; + +@@ -284,6 +285,11 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma) + if (!(ma->flags & ACPI_SRAT_MEM_ENABLED)) + return; + ++ start = ma->base_address; ++ end = start + ma->length; ++ /* Supplement the heuristics in l1tf_calculations(). 
*/ ++ l1tf_safe_maddr = max(l1tf_safe_maddr, ROUNDUP(end, PAGE_SIZE)); ++ + if (num_node_memblks >= NR_NODE_MEMBLKS) + { + dprintk(XENLOG_WARNING, +@@ -292,8 +298,6 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma) + return; + } + +- start = ma->base_address; +- end = start + ma->length; + pxm = ma->proximity_domain; + if (srat_rev < 2) + pxm &= 0xff; +diff --git a/xen/arch/x86/sysctl.c b/xen/arch/x86/sysctl.c +index 6ba823ca69..e726eee974 100644 +--- a/xen/arch/x86/sysctl.c ++++ b/xen/arch/x86/sysctl.c +@@ -23,6 +23,7 @@ + #include <asm/hvm/hvm.h> + #include <asm/hvm/support.h> + #include <asm/processor.h> ++#include <asm/setup.h> + #include <asm/smp.h> + #include <asm/numa.h> + #include <xen/nodemask.h> +@@ -48,14 +49,27 @@ static void l3_cache_get(void *arg) + + long cpu_up_helper(void *data) + { +- int cpu = (unsigned long)data; ++ unsigned int cpu = (unsigned long)data; + int ret = cpu_up(cpu); ++ + if ( ret == -EBUSY ) + { + /* On EBUSY, flush RCU work and have one more go. 
*/ + rcu_barrier(); + ret = cpu_up(cpu); + } ++ ++ if ( !ret && !opt_smt && ++ cpu_data[cpu].compute_unit_id == INVALID_CUID && ++ cpumask_weight(per_cpu(cpu_sibling_mask, cpu)) > 1 ) ++ { ++ ret = cpu_down_helper(data); ++ if ( ret ) ++ printk("Could not re-offline CPU%u (%d)\n", cpu, ret); ++ else ++ ret = -EPERM; ++ } ++ + return ret; + } + +diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c +index 906124331b..e8f85f716e 100644 +--- a/xen/arch/x86/traps.c ++++ b/xen/arch/x86/traps.c +@@ -96,8 +96,6 @@ string_param("nmi", opt_nmi); + DEFINE_PER_CPU(u64, efer); + static DEFINE_PER_CPU(unsigned long, last_extable_addr); + +-DEFINE_PER_CPU_READ_MOSTLY(u32, ler_msr); +- + DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, gdt_table); + DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, compat_gdt_table); + +@@ -117,6 +115,9 @@ integer_param("debug_stack_lines", debug_stack_lines); + static bool opt_ler; + boolean_param("ler", opt_ler); + ++/* LastExceptionFromIP on this hardware. Zero if LER is not in use. */ ++unsigned int __read_mostly ler_msr; ++ + #define stack_words_per_line 4 + #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp) + +@@ -325,13 +326,13 @@ static void show_guest_stack(struct vcpu *v, const struct cpu_user_regs *regs) + /* + * Notes for get_stack_trace_bottom() and get_stack_dump_bottom() + * +- * Stack pages 0, 1 and 2: ++ * Stack pages 0 - 3: + * These are all 1-page IST stacks. Each of these stacks have an exception + * frame and saved register state at the top. The interesting bound for a + * trace is the word adjacent to this, while the bound for a dump is the + * very top, including the exception frame. + * +- * Stack pages 3, 4 and 5: ++ * Stack pages 4 and 5: + * None of these are particularly interesting. With MEMORY_GUARD, page 5 is + * explicitly not present, so attempting to dump or trace it is + * counterproductive. 
Without MEMORY_GUARD, it is possible for a call chain +@@ -352,12 +353,12 @@ unsigned long get_stack_trace_bottom(unsigned long sp) + { + switch ( get_stack_page(sp) ) + { +- case 0 ... 2: ++ case 0 ... 3: + return ROUNDUP(sp, PAGE_SIZE) - + offsetof(struct cpu_user_regs, es) - sizeof(unsigned long); + + #ifndef MEMORY_GUARD +- case 3 ... 5: ++ case 4 ... 5: + #endif + case 6 ... 7: + return ROUNDUP(sp, STACK_SIZE) - +@@ -372,11 +373,11 @@ unsigned long get_stack_dump_bottom(unsigned long sp) + { + switch ( get_stack_page(sp) ) + { +- case 0 ... 2: ++ case 0 ... 3: + return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long); + + #ifndef MEMORY_GUARD +- case 3 ... 5: ++ case 4 ... 5: + #endif + case 6 ... 7: + return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long); +@@ -1722,7 +1723,21 @@ void do_device_not_available(struct cpu_user_regs *regs) + { + struct vcpu *curr = current; + +- BUG_ON(!guest_mode(regs)); ++ if ( !guest_mode(regs) ) ++ { ++ unsigned long fixup = search_exception_table(regs); ++ ++ gprintk(XENLOG_ERR, "#NM: %p [%ps] -> %p\n", ++ _p(regs->rip), _p(regs->rip), _p(fixup)); ++ /* ++ * We shouldn't be able to reach here, but for release builds have ++ * the recovery logic in place nevertheless. ++ */ ++ ASSERT_UNREACHABLE(); ++ BUG_ON(!fixup); ++ regs->rip = fixup; ++ return; ++ } + + vcpu_restore_fpu_lazy(curr); + +@@ -1748,26 +1763,51 @@ void write_efer(u64 val) + wrmsrl(MSR_EFER, val); + } + +-static void ler_enable(void) +-{ +- u64 debugctl; +- +- if ( !this_cpu(ler_msr) ) +- return; +- +- rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); +- wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | IA32_DEBUGCTLMSR_LBR); +-} +- + void do_debug(struct cpu_user_regs *regs) + { ++ unsigned long dr6; + struct vcpu *v = current; + ++ /* Stash dr6 as early as possible. 
*/ ++ dr6 = read_debugreg(6); ++ + if ( debugger_trap_entry(TRAP_debug, regs) ) + return; + ++ /* ++ * At the time of writing (March 2018), on the subject of %dr6: ++ * ++ * The Intel manual says: ++ * Certain debug exceptions may clear bits 0-3. The remaining contents ++ * of the DR6 register are never cleared by the processor. To avoid ++ * confusion in identifying debug exceptions, debug handlers should ++ * clear the register (except bit 16, which they should set) before ++ * returning to the interrupted task. ++ * ++ * The AMD manual says: ++ * Bits 15:13 of the DR6 register are not cleared by the processor and ++ * must be cleared by software after the contents have been read. ++ * ++ * Some bits are reserved set, some are reserved clear, and some bits ++ * which were previously reserved set are reused and cleared by hardware. ++ * For future compatibility, reset to the default value, which will allow ++ * us to spot any bit being changed by hardware to its non-default value. ++ */ ++ write_debugreg(6, X86_DR6_DEFAULT); ++ ++ /* #DB automatically disabled LBR. Reinstate it if debugging Xen. */ ++ if ( cpu_has_xen_lbr ) ++ wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR); ++ + if ( !guest_mode(regs) ) + { ++ /* ++ * !!! WARNING !!! ++ * ++ * %dr6 is mostly guest controlled at this point. Any decisions based ++ * on its value must be crosschecked with non-guest controlled state. ++ */ ++ + if ( regs->eflags & X86_EFLAGS_TF ) + { + /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */ +@@ -1776,7 +1816,7 @@ void do_debug(struct cpu_user_regs *regs) + { + if ( regs->rip == (unsigned long)sysenter_eflags_saved ) + regs->eflags &= ~X86_EFLAGS_TF; +- goto out; ++ return; + } + if ( !debugger_trap_fatal(TRAP_debug, regs) ) + { +@@ -1784,29 +1824,63 @@ void do_debug(struct cpu_user_regs *regs) + regs->eflags &= ~X86_EFLAGS_TF; + } + } +- else ++ ++ /* ++ * Check for fault conditions. 
General Detect, and instruction ++ * breakpoints are faults rather than traps, at which point attempting ++ * to ignore and continue will result in a livelock. ++ * ++ * However, on entering the #DB handler, hardware clears %dr7.gd for ++ * us (as confirmed by the earlier %dr6 accesses succeeding), meaning ++ * that a real General Detect exception is restartable. ++ * ++ * PV guests are not permitted to point %dr{0..3} at Xen linear ++ * addresses, and Instruction Breakpoints (being faults) don't get ++ * delayed by a MovSS shadow, so we should never encounter one in ++ * hypervisor context. ++ * ++ * If however we do, safety measures need to be enacted. Use a big ++ * hammer and clear all debug settings. ++ */ ++ if ( dr6 & (DR_TRAP3 | DR_TRAP2 | DR_TRAP1 | DR_TRAP0) ) + { +- /* +- * We ignore watchpoints when they trigger within Xen. This may +- * happen when a buffer is passed to us which previously had a +- * watchpoint set on it. No need to bump EIP; the only faulting +- * trap is an instruction breakpoint, which can't happen to us. +- */ +- WARN_ON(!search_exception_table(regs)); ++ unsigned int bp, dr7 = read_debugreg(7); ++ ++ for ( bp = 0; bp < 4; ++bp ) ++ { ++ if ( (dr6 & (1u << bp)) && /* Breakpoint triggered? */ ++ (dr7 & (3u << (bp * DR_ENABLE_SIZE))) && /* Enabled? */ ++ ((dr7 & (3u << ((bp * DR_CONTROL_SIZE) + /* Insn? */ ++ DR_CONTROL_SHIFT))) == DR_RW_EXECUTE) ) ++ { ++ ASSERT_UNREACHABLE(); ++ ++ printk(XENLOG_ERR ++ "Hit instruction breakpoint in Xen context\n"); ++ write_debugreg(7, 0); ++ break; ++ } ++ } + } +- goto out; ++ ++ /* ++ * Whatever caused this #DB should be restartable by this point. Note ++ * it and continue. Guests can trigger this in certain corner cases, ++ * so ensure the message is ratelimited. 
++ */ ++ gprintk(XENLOG_WARNING, ++ "Hit #DB in Xen context: %04x:%p [%ps], stk %04x:%p, dr6 %lx\n", ++ regs->cs, _p(regs->rip), _p(regs->rip), ++ regs->ss, _p(regs->rsp), dr6); ++ ++ return; + } + + /* Save debug status register where guest OS can peek at it */ +- v->arch.debugreg[6] = read_debugreg(6); ++ v->arch.debugreg[6] |= (dr6 & ~X86_DR6_DEFAULT); ++ v->arch.debugreg[6] &= (dr6 | ~X86_DR6_DEFAULT); + +- ler_enable(); + pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC); +- return; +- +- out: +- ler_enable(); +- return; + } + + static void __init noinline __set_intr_gate(unsigned int n, +@@ -1850,38 +1924,46 @@ void load_TR(void) + : "=m" (old_gdt) : "rm" (TSS_ENTRY << 3), "m" (tss_gdt) : "memory" ); + } + +-void percpu_traps_init(void) ++static unsigned int calc_ler_msr(void) + { +- subarch_percpu_traps_init(); +- +- if ( !opt_ler ) +- return; +- + switch ( boot_cpu_data.x86_vendor ) + { + case X86_VENDOR_INTEL: + switch ( boot_cpu_data.x86 ) + { + case 6: +- this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP; +- break; ++ return MSR_IA32_LASTINTFROMIP; ++ + case 15: +- this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP; +- break; ++ return MSR_P4_LER_FROM_LIP; + } + break; ++ + case X86_VENDOR_AMD: + switch ( boot_cpu_data.x86 ) + { + case 6: + case 0xf ... 
0x17: +- this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP; +- break; ++ return MSR_IA32_LASTINTFROMIP; + } + break; + } + +- ler_enable(); ++ return 0; ++} ++ ++void percpu_traps_init(void) ++{ ++ subarch_percpu_traps_init(); ++ ++ if ( !opt_ler ) ++ return; ++ ++ if ( !ler_msr && (ler_msr = calc_ler_msr()) ) ++ setup_force_cpu_cap(X86_FEATURE_XEN_LBR); ++ ++ if ( cpu_has_xen_lbr ) ++ wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR); + } + + void __init init_idt_traps(void) +@@ -1917,6 +1999,7 @@ void __init init_idt_traps(void) + set_ist(&idt_table[TRAP_double_fault], IST_DF); + set_ist(&idt_table[TRAP_nmi], IST_NMI); + set_ist(&idt_table[TRAP_machine_check], IST_MCE); ++ set_ist(&idt_table[TRAP_debug], IST_DB); + + /* CPU0 uses the master IDT. */ + idt_tables[0] = idt_table; +@@ -1984,6 +2067,12 @@ void activate_debugregs(const struct vcpu *curr) + } + } + ++/* ++ * Used by hypercalls and the emulator. ++ * -ENODEV => #UD ++ * -EINVAL => #GP Invalid bit ++ * -EPERM => #GP Valid bit, but not permitted to use ++ */ + long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value) + { + int i; +@@ -2015,7 +2104,17 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value) + if ( v == curr ) + write_debugreg(3, value); + break; ++ ++ case 4: ++ if ( v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE ) ++ return -ENODEV; ++ ++ /* Fallthrough */ + case 6: ++ /* The upper 32 bits are strictly reserved. */ ++ if ( value != (uint32_t)value ) ++ return -EINVAL; ++ + /* + * DR6: Bits 4-11,16-31 reserved (set to 1). + * Bit 12 reserved (set to 0). +@@ -2025,7 +2124,17 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value) + if ( v == curr ) + write_debugreg(6, value); + break; ++ ++ case 5: ++ if ( v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE ) ++ return -ENODEV; ++ ++ /* Fallthrough */ + case 7: ++ /* The upper 32 bits are strictly reserved. */ ++ if ( value != (uint32_t)value ) ++ return -EINVAL; ++ + /* + * DR7: Bit 10 reserved (set to 1). 
+ * Bits 11-12,14-15 reserved (set to 0). +@@ -2038,6 +2147,7 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value) + */ + if ( value & DR_GENERAL_DETECT ) + return -EPERM; ++ + /* DR7.{G,L}E = 0 => debugging disabled for this domain. */ + if ( value & DR7_ACTIVE_MASK ) + { +@@ -2066,11 +2176,15 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value) + !(v->arch.debugreg[7] & DR7_ACTIVE_MASK) ) + activate_debugregs(v); + } ++ else ++ /* Zero the emulated controls if %dr7 isn't active. */ ++ v->arch.debugreg[5] = 0; ++ + if ( v == curr ) + write_debugreg(7, value); + break; + default: +- return -EINVAL; ++ return -ENODEV; + } + + v->arch.debugreg[reg] = value; +diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c +index 13478d4fc1..10c243a039 100644 +--- a/xen/arch/x86/x86_64/asm-offsets.c ++++ b/xen/arch/x86/x86_64/asm-offsets.c +@@ -142,8 +142,10 @@ void __dummy__(void) + OFFSET(CPUINFO_xen_cr3, struct cpu_info, xen_cr3); + OFFSET(CPUINFO_pv_cr3, struct cpu_info, pv_cr3); + OFFSET(CPUINFO_shadow_spec_ctrl, struct cpu_info, shadow_spec_ctrl); +- OFFSET(CPUINFO_use_shadow_spec_ctrl, struct cpu_info, use_shadow_spec_ctrl); +- OFFSET(CPUINFO_bti_ist_info, struct cpu_info, bti_ist_info); ++ OFFSET(CPUINFO_xen_spec_ctrl, struct cpu_info, xen_spec_ctrl); ++ OFFSET(CPUINFO_spec_ctrl_flags, struct cpu_info, spec_ctrl_flags); ++ OFFSET(CPUINFO_root_pgt_changed, struct cpu_info, root_pgt_changed); ++ OFFSET(CPUINFO_use_pv_cr3, struct cpu_info, use_pv_cr3); + DEFINE(CPUINFO_sizeof, sizeof(struct cpu_info)); + BLANK(); + +diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S +index 75497bc292..1c4cd795d2 100644 +--- a/xen/arch/x86/x86_64/compat/entry.S ++++ b/xen/arch/x86/x86_64/compat/entry.S +@@ -39,6 +39,12 @@ ENTRY(compat_test_all_events) + leaq irq_stat+IRQSTAT_softirq_pending(%rip),%rcx + cmpl $0,(%rcx,%rax,1) + jne compat_process_softirqs ++ ++ /* Inject exception if 
pending. */ ++ lea VCPU_trap_bounce(%rbx), %rdx ++ testb $TBF_EXCEPTION, TRAPBOUNCE_flags(%rdx) ++ jnz .Lcompat_process_trapbounce ++ + testb $1,VCPU_mce_pending(%rbx) + jnz compat_process_mce + .Lcompat_test_guest_nmi: +@@ -68,15 +74,24 @@ compat_process_softirqs: + call do_softirq + jmp compat_test_all_events + ++ ALIGN ++/* %rbx: struct vcpu, %rdx: struct trap_bounce */ ++.Lcompat_process_trapbounce: ++ sti ++.Lcompat_bounce_exception: ++ call compat_create_bounce_frame ++ movb $0, TRAPBOUNCE_flags(%rdx) ++ jmp compat_test_all_events ++ + ALIGN + /* %rbx: struct vcpu */ + compat_process_mce: + testb $1 << VCPU_TRAP_MCE,VCPU_async_exception_mask(%rbx) + jnz .Lcompat_test_guest_nmi + sti +- movb $0,VCPU_mce_pending(%rbx) +- call set_guest_machinecheck_trapbounce +- testl %eax,%eax ++ movb $0, VCPU_mce_pending(%rbx) ++ call set_guest_machinecheck_trapbounce ++ test %al, %al + jz compat_test_all_events + movzbl VCPU_async_exception_mask(%rbx),%edx # save mask for the + movb %dl,VCPU_mce_old_mask(%rbx) # iret hypercall +@@ -88,11 +103,11 @@ compat_process_mce: + /* %rbx: struct vcpu */ + compat_process_nmi: + testb $1 << VCPU_TRAP_NMI,VCPU_async_exception_mask(%rbx) +- jnz compat_test_guest_events ++ jnz compat_test_guest_events + sti +- movb $0,VCPU_nmi_pending(%rbx) ++ movb $0, VCPU_nmi_pending(%rbx) + call set_guest_nmi_trapbounce +- testl %eax,%eax ++ test %al, %al + jz compat_test_all_events + movzbl VCPU_async_exception_mask(%rbx),%edx # save mask for the + movb %dl,VCPU_nmi_old_mask(%rbx) # iret hypercall +@@ -151,7 +166,7 @@ ENTRY(compat_restore_all_guest) + mov VCPUMSR_spec_ctrl_raw(%rax), %eax + + /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. 
*/ +- SPEC_CTRL_EXIT_TO_GUEST /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ ++ SPEC_CTRL_EXIT_TO_PV /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ + + RESTORE_ALL adj=8 compat=1 + .Lft0: iretq +@@ -189,15 +204,6 @@ ENTRY(cr4_pv32_restore) + xor %eax, %eax + ret + +-/* %rdx: trap_bounce, %rbx: struct vcpu */ +-ENTRY(compat_post_handle_exception) +- testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx) +- jz compat_test_all_events +-.Lcompat_bounce_exception: +- call compat_create_bounce_frame +- movb $0,TRAPBOUNCE_flags(%rdx) +- jmp compat_test_all_events +- + .section .text.entry, "ax", @progbits + + /* See lstar_enter for entry register state. */ +@@ -218,10 +224,9 @@ ENTRY(cstar_enter) + + GET_STACK_END(bx) + mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx +- neg %rcx ++ test %rcx, %rcx + jz .Lcstar_cr3_okay +- mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) +- neg %rcx ++ movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%rbx) + mov %rcx, %cr3 + movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) + .Lcstar_cr3_okay: +diff --git a/xen/arch/x86/x86_64/compat/mm.c b/xen/arch/x86/x86_64/compat/mm.c +index c2aa6f2fdb..02bc75b91e 100644 +--- a/xen/arch/x86/x86_64/compat/mm.c ++++ b/xen/arch/x86/x86_64/compat/mm.c +@@ -163,19 +163,6 @@ int compat_arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg) + return rc; + } + +-int compat_update_va_mapping(unsigned int va, u32 lo, u32 hi, +- unsigned int flags) +-{ +- return do_update_va_mapping(va, lo | ((u64)hi << 32), flags); +-} +- +-int compat_update_va_mapping_otherdomain(unsigned long va, u32 lo, u32 hi, +- unsigned long flags, +- domid_t domid) +-{ +- return do_update_va_mapping_otherdomain(va, lo | ((u64)hi << 32), flags, domid); +-} +- + DEFINE_XEN_GUEST_HANDLE(mmuext_op_compat_t); + + int compat_mmuext_op(XEN_GUEST_HANDLE_PARAM(void) arg, +diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S +index bdd33e727f..c163c31a60 100644 +--- a/xen/arch/x86/x86_64/entry.S ++++ b/xen/arch/x86/x86_64/entry.S +@@ -42,6 
+42,12 @@ test_all_events: + leaq irq_stat+IRQSTAT_softirq_pending(%rip), %rcx + cmpl $0, (%rcx, %rax, 1) + jne process_softirqs ++ ++ /* Inject exception if pending. */ ++ lea VCPU_trap_bounce(%rbx), %rdx ++ testb $TBF_EXCEPTION, TRAPBOUNCE_flags(%rdx) ++ jnz .Lprocess_trapbounce ++ + cmpb $0, VCPU_mce_pending(%rbx) + jne process_mce + .Ltest_guest_nmi: +@@ -69,6 +75,15 @@ process_softirqs: + call do_softirq + jmp test_all_events + ++ ALIGN ++/* %rbx: struct vcpu, %rdx struct trap_bounce */ ++.Lprocess_trapbounce: ++ sti ++.Lbounce_exception: ++ call create_bounce_frame ++ movb $0, TRAPBOUNCE_flags(%rdx) ++ jmp test_all_events ++ + ALIGN + /* %rbx: struct vcpu */ + process_mce: +@@ -77,7 +92,7 @@ process_mce: + sti + movb $0, VCPU_mce_pending(%rbx) + call set_guest_machinecheck_trapbounce +- test %eax, %eax ++ test %al, %al + jz test_all_events + movzbl VCPU_async_exception_mask(%rbx), %edx # save mask for the + movb %dl, VCPU_mce_old_mask(%rbx) # iret hypercall +@@ -93,7 +108,7 @@ process_nmi: + sti + movb $0, VCPU_nmi_pending(%rbx) + call set_guest_nmi_trapbounce +- test %eax, %eax ++ test %al, %al + jz test_all_events + movzbl VCPU_async_exception_mask(%rbx), %edx # save mask for the + movb %dl, VCPU_nmi_old_mask(%rbx) # iret hypercall +@@ -149,11 +164,15 @@ restore_all_guest: + mov VCPU_cr3(%rbx), %r9 + GET_STACK_END(dx) + mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rdi ++ test %rdi, %rdi ++ jz .Lrag_keep_cr3 ++ mov %rdi, %rax ++ cmpb $0, STACK_CPUINFO_FIELD(root_pgt_changed)(%rdx) ++ je .Lrag_copy_done ++ movb $0, STACK_CPUINFO_FIELD(root_pgt_changed)(%rdx) + movabs $PADDR_MASK & PAGE_MASK, %rsi + movabs $DIRECTMAP_VIRT_START, %rcx +- mov %rdi, %rax + and %rsi, %rdi +- jz .Lrag_keep_cr3 + and %r9, %rsi + add %rcx, %rdi + add %rcx, %rsi +@@ -168,20 +187,17 @@ restore_all_guest: + sub $(ROOT_PAGETABLE_FIRST_XEN_SLOT - \ + ROOT_PAGETABLE_LAST_XEN_SLOT - 1) * 8, %rdi + rep movsq +- mov STACK_CPUINFO_FIELD(cr4)(%rdx), %rdi ++.Lrag_copy_done: + mov %r9, 
STACK_CPUINFO_FIELD(xen_cr3)(%rdx) +- mov %rdi, %rsi +- and $~X86_CR4_PGE, %rdi +- mov %rdi, %cr4 ++ movb $1, STACK_CPUINFO_FIELD(use_pv_cr3)(%rdx) + mov %rax, %cr3 +- mov %rsi, %cr4 + .Lrag_keep_cr3: + + /* Restore stashed SPEC_CTRL value. */ + mov %r15d, %eax + + /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ +- SPEC_CTRL_EXIT_TO_GUEST /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ ++ SPEC_CTRL_EXIT_TO_PV /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ + + RESTORE_ALL + testw $TRAP_syscall,4(%rsp) +@@ -222,20 +238,10 @@ restore_all_xen: + * case we return to late PV exit code (from an NMI or #MC). + */ + GET_STACK_END(bx) +- mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rdx ++ cmpb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%rbx) ++UNLIKELY_START(ne, exit_cr3) + mov STACK_CPUINFO_FIELD(pv_cr3)(%rbx), %rax +- test %rdx, %rdx +- /* +- * Ideally the condition would be "nsz", but such doesn't exist, +- * so "g" will have to do. +- */ +-UNLIKELY_START(g, exit_cr3) +- mov %cr4, %rdi +- mov %rdi, %rsi +- and $~X86_CR4_PGE, %rdi +- mov %rdi, %cr4 + mov %rax, %cr3 +- mov %rsi, %cr4 + UNLIKELY_END(exit_cr3) + + /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ +@@ -276,10 +282,9 @@ ENTRY(lstar_enter) + + GET_STACK_END(bx) + mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx +- neg %rcx ++ test %rcx, %rcx + jz .Llstar_cr3_okay +- mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) +- neg %rcx ++ movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%rbx) + mov %rcx, %cr3 + movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) + .Llstar_cr3_okay: +@@ -313,10 +318,9 @@ GLOBAL(sysenter_eflags_saved) + /* PUSHF above has saved EFLAGS.IF clear (the caller had it set). 
*/ + orl $X86_EFLAGS_IF, UREGS_eflags(%rsp) + mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx +- neg %rcx ++ test %rcx, %rcx + jz .Lsyse_cr3_okay +- mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) +- neg %rcx ++ movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%rbx) + mov %rcx, %cr3 + movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) + .Lsyse_cr3_okay: +@@ -363,10 +367,9 @@ ENTRY(int80_direct_trap) + + GET_STACK_END(bx) + mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx +- neg %rcx ++ test %rcx, %rcx + jz .Lint80_cr3_okay +- mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) +- neg %rcx ++ movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%rbx) + mov %rcx, %cr3 + movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) + .Lint80_cr3_okay: +@@ -553,24 +556,24 @@ ENTRY(common_interrupt) + /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ + + mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx ++ mov STACK_CPUINFO_FIELD(use_pv_cr3)(%r14), %bl + mov %rcx, %r15 +- neg %rcx ++ test %rcx, %rcx + jz .Lintr_cr3_okay +- jns .Lintr_cr3_load +- mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14) +- neg %rcx +-.Lintr_cr3_load: ++ movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14) + mov %rcx, %cr3 + xor %ecx, %ecx + mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14) + testb $3, UREGS_cs(%rsp) + cmovnz %rcx, %r15 ++ cmovnz %rcx, %rbx + .Lintr_cr3_okay: + + CR4_PV32_RESTORE + movq %rsp,%rdi + callq do_IRQ + mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14) ++ mov %bl, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14) + jmp ret_from_intr + + ENTRY(page_fault) +@@ -585,18 +588,17 @@ GLOBAL(handle_exception) + /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. 
*/ + + mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx ++ mov STACK_CPUINFO_FIELD(use_pv_cr3)(%r14), %r13b + mov %rcx, %r15 +- neg %rcx ++ test %rcx, %rcx + jz .Lxcpt_cr3_okay +- jns .Lxcpt_cr3_load +- mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14) +- neg %rcx +-.Lxcpt_cr3_load: ++ movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14) + mov %rcx, %cr3 + xor %ecx, %ecx + mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14) + testb $3, UREGS_cs(%rsp) + cmovnz %rcx, %r15 ++ cmovnz %rcx, %r13 + .Lxcpt_cr3_okay: + + handle_exception_saved: +@@ -665,17 +667,12 @@ handle_exception_saved: + mov (%rdx, %rax, 8), %rdx + INDIRECT_CALL %rdx + mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14) ++ mov %r13b, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14) + testb $3,UREGS_cs(%rsp) + jz restore_all_xen +- leaq VCPU_trap_bounce(%rbx),%rdx + movq VCPU_domain(%rbx),%rax + testb $1,DOMAIN_is_32bit_pv(%rax) +- jnz compat_post_handle_exception +- testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx) +- jz test_all_events +-.Lbounce_exception: +- call create_bounce_frame +- movb $0,TRAPBOUNCE_flags(%rdx) ++ jnz compat_test_all_events + jmp test_all_events + + /* No special register assumptions. */ +@@ -698,6 +695,7 @@ exception_with_ints_disabled: + 1: movq UREGS_error_code(%rsp),%rax # ec/ev + movq %rax,UREGS_kernel_sizeof(%rsp) + mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14) ++ mov %r13b, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14) + jmp restore_all_xen # return to fixup code + + /* No special register assumptions. 
*/ +@@ -730,7 +728,7 @@ ENTRY(device_not_available) + ENTRY(debug) + pushq $0 + movl $TRAP_debug,4(%rsp) +- jmp handle_exception ++ jmp handle_ist_exception + + ENTRY(int3) + pushq $0 +@@ -785,10 +783,9 @@ ENTRY(double_fault) + mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rbx + test %rbx, %rbx + jz .Ldblf_cr3_okay +- jns .Ldblf_cr3_load +- neg %rbx +-.Ldblf_cr3_load: ++ movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14) + mov %rbx, %cr3 ++ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%r14) + .Ldblf_cr3_okay: + + movq %rsp,%rdi +@@ -816,13 +813,11 @@ handle_ist_exception: + /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ + + mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx ++ mov STACK_CPUINFO_FIELD(use_pv_cr3)(%r14), %bl + mov %rcx, %r15 +- neg %rcx ++ test %rcx, %rcx + jz .List_cr3_okay +- jns .List_cr3_load +- mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14) +- neg %rcx +-.List_cr3_load: ++ movb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14) + mov %rcx, %cr3 + movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%r14) + .List_cr3_okay: +@@ -835,6 +830,7 @@ handle_ist_exception: + * and copy the context to stack bottom. 
+ */ + xor %r15, %r15 ++ xor %ebx, %ebx + GET_CPUINFO_FIELD(guest_cpu_user_regs,di) + movq %rsp,%rsi + movl $UREGS_kernel_sizeof/8,%ecx +@@ -846,6 +842,7 @@ handle_ist_exception: + mov (%rdx, %rax, 8), %rdx + INDIRECT_CALL %rdx + mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14) ++ mov %bl, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14) + cmpb $TRAP_nmi,UREGS_entry_vector(%rsp) + jne ret_from_intr + +diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c +index 3652f5ff21..7e8c5de70d 100644 +--- a/xen/arch/x86/x86_64/traps.c ++++ b/xen/arch/x86/x86_64/traps.c +@@ -144,11 +144,12 @@ void show_registers(const struct cpu_user_regs *regs) + printk("CPU: %d\n", smp_processor_id()); + _show_registers(&fault_regs, fault_crs, context, v); + +- if ( this_cpu(ler_msr) && !guest_mode(regs) ) ++ if ( ler_msr && !guest_mode(regs) ) + { + u64 from, to; +- rdmsrl(this_cpu(ler_msr), from); +- rdmsrl(this_cpu(ler_msr) + 1, to); ++ ++ rdmsrl(ler_msr, from); ++ rdmsrl(ler_msr + 1, to); + printk("ler: %016lx -> %016lx\n", from, to); + } + } +diff --git a/xen/arch/x86/x86_emulate.c b/xen/arch/x86/x86_emulate.c +index c7ba221d11..9125c67c9e 100644 +--- a/xen/arch/x86/x86_emulate.c ++++ b/xen/arch/x86/x86_emulate.c +@@ -14,6 +14,7 @@ + #include <asm/processor.h> /* current_cpu_info */ + #include <asm/xstate.h> + #include <asm/amd.h> /* cpu_has_amd_erratum() */ ++#include <asm/debugreg.h> + + /* Avoid namespace pollution. */ + #undef cmpxchg +@@ -41,3 +42,75 @@ + }) + + #include "x86_emulate/x86_emulate.c" ++ ++/* Called with NULL ctxt in hypercall context. */ ++int x86emul_read_dr(unsigned int reg, unsigned long *val, ++ struct x86_emulate_ctxt *ctxt) ++{ ++ struct vcpu *curr = current; ++ ++ /* HVM support requires a bit more plumbing before it will work. */ ++ ASSERT(is_pv_vcpu(curr)); ++ ++ switch ( reg ) ++ { ++ case 0 ... 
3: ++ case 6: ++ *val = curr->arch.debugreg[reg]; ++ break; ++ ++ case 7: ++ *val = (curr->arch.debugreg[7] | ++ curr->arch.debugreg[5]); ++ break; ++ ++ case 4 ... 5: ++ if ( !(curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ) ++ { ++ *val = curr->arch.debugreg[reg + 2]; ++ break; ++ } ++ ++ /* Fallthrough */ ++ default: ++ if ( ctxt ) ++ x86_emul_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC, ctxt); ++ ++ return X86EMUL_EXCEPTION; ++ } ++ ++ return X86EMUL_OKAY; ++} ++ ++int x86emul_write_dr(unsigned int reg, unsigned long val, ++ struct x86_emulate_ctxt *ctxt) ++{ ++ struct vcpu *curr = current; ++ ++ /* HVM support requires a bit more plumbing before it will work. */ ++ ASSERT(is_pv_vcpu(curr)); ++ ++ switch ( set_debugreg(curr, reg, val) ) ++ { ++ case 0: ++ return X86EMUL_OKAY; ++ ++ case -ENODEV: ++ x86_emul_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC, ctxt); ++ return X86EMUL_EXCEPTION; ++ ++ default: ++ x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt); ++ return X86EMUL_EXCEPTION; ++ } ++} ++ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * tab-width: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +diff --git a/xen/arch/x86/x86_emulate/x86_emulate.h b/xen/arch/x86/x86_emulate/x86_emulate.h +index 0c8c80ad5a..9c2bb8157c 100644 +--- a/xen/arch/x86/x86_emulate/x86_emulate.h ++++ b/xen/arch/x86/x86_emulate/x86_emulate.h +@@ -662,6 +662,11 @@ static inline void x86_emulate_free_state(struct x86_emulate_state *state) {} + void x86_emulate_free_state(struct x86_emulate_state *state); + #endif + ++int x86emul_read_dr(unsigned int reg, unsigned long *val, ++ struct x86_emulate_ctxt *ctxt); ++int x86emul_write_dr(unsigned int reg, unsigned long val, ++ struct x86_emulate_ctxt *ctxt); ++ + #endif + + static inline void x86_emul_hw_exception( +diff --git a/xen/arch/x86/xstate.c b/xen/arch/x86/xstate.c +index 845208c189..ebd464e83a 100644 +--- a/xen/arch/x86/xstate.c ++++ b/xen/arch/x86/xstate.c +@@ -670,12 +670,17 @@ static bool 
valid_xcr0(u64 xcr0) + return !(xcr0 & XSTATE_BNDREGS) == !(xcr0 & XSTATE_BNDCSR); + } + +-int validate_xstate(u64 xcr0, u64 xcr0_accum, const struct xsave_hdr *hdr) ++int validate_xstate(const struct domain *d, uint64_t xcr0, uint64_t xcr0_accum, ++ const struct xsave_hdr *hdr) + { ++ const struct cpuid_policy *cp = d->arch.cpuid; ++ uint64_t xcr0_max = ++ ((uint64_t)cp->xstate.xcr0_high << 32) | cp->xstate.xcr0_low; + unsigned int i; + + if ( (hdr->xstate_bv & ~xcr0_accum) || + (xcr0 & ~xcr0_accum) || ++ (xcr0_accum & ~xcr0_max) || + !valid_xcr0(xcr0) || + !valid_xcr0(xcr0_accum) ) + return -EINVAL; +@@ -694,20 +699,40 @@ int validate_xstate(u64 xcr0, u64 xcr0_accum, const struct xsave_hdr *hdr) + int handle_xsetbv(u32 index, u64 new_bv) + { + struct vcpu *curr = current; ++ const struct cpuid_policy *cp = curr->domain->arch.cpuid; ++ uint64_t xcr0_max = ++ ((uint64_t)cp->xstate.xcr0_high << 32) | cp->xstate.xcr0_low; + u64 mask; + + if ( index != XCR_XFEATURE_ENABLED_MASK ) + return -EOPNOTSUPP; + +- if ( (new_bv & ~xfeature_mask) || !valid_xcr0(new_bv) ) ++ /* ++ * The CPUID logic shouldn't be able to hand out an XCR0 exceeding Xen's ++ * maximum features, but keep the check for robustness. ++ */ ++ if ( unlikely(xcr0_max & ~xfeature_mask) ) ++ { ++ gprintk(XENLOG_ERR, ++ "xcr0_max %016" PRIx64 " exceeds hardware max %016" PRIx64 "\n", ++ xcr0_max, xfeature_mask); ++ domain_crash(curr->domain); ++ ++ return -EINVAL; ++ } ++ ++ if ( (new_bv & ~xcr0_max) || !valid_xcr0(new_bv) ) + return -EINVAL; + +- /* XCR0.PKRU is disabled on PV mode. */ +- if ( is_pv_vcpu(curr) && (new_bv & XSTATE_PKRU) ) +- return -EOPNOTSUPP; ++ /* By this point, new_bv really should be accepted by hardware. 
*/ ++ if ( unlikely(!set_xcr0(new_bv)) ) ++ { ++ gprintk(XENLOG_ERR, "new_bv %016" PRIx64 " rejected by hardware\n", ++ new_bv); ++ domain_crash(curr->domain); + +- if ( !set_xcr0(new_bv) ) + return -EFAULT; ++ } + + mask = new_bv & ~curr->arch.xcr0_accum; + curr->arch.xcr0 = new_bv; +diff --git a/xen/common/cpu.c b/xen/common/cpu.c +index 6350f150bd..653a56b840 100644 +--- a/xen/common/cpu.c ++++ b/xen/common/cpu.c +@@ -67,12 +67,17 @@ void __init register_cpu_notifier(struct notifier_block *nb) + spin_unlock(&cpu_add_remove_lock); + } + +-static int take_cpu_down(void *unused) ++static void _take_cpu_down(void *unused) + { + void *hcpu = (void *)(long)smp_processor_id(); + int notifier_rc = notifier_call_chain(&cpu_chain, CPU_DYING, hcpu, NULL); + BUG_ON(notifier_rc != NOTIFY_DONE); + __cpu_disable(); ++} ++ ++static int take_cpu_down(void *arg) ++{ ++ _take_cpu_down(arg); + return 0; + } + +@@ -98,7 +103,9 @@ int cpu_down(unsigned int cpu) + goto fail; + } + +- if ( (err = stop_machine_run(take_cpu_down, NULL, cpu)) < 0 ) ++ if ( unlikely(system_state < SYS_STATE_active) ) ++ on_selected_cpus(cpumask_of(cpu), _take_cpu_down, NULL, true); ++ else if ( (err = stop_machine_run(take_cpu_down, NULL, cpu)) < 0 ) + goto fail; + + __cpu_die(cpu); +diff --git a/xen/common/cpupool.c b/xen/common/cpupool.c +index 999839444e..1e8edcbd57 100644 +--- a/xen/common/cpupool.c ++++ b/xen/common/cpupool.c +@@ -490,7 +490,7 @@ static int cpupool_cpu_add(unsigned int cpu) + cpumask_clear_cpu(cpu, &cpupool_locked_cpus); + cpumask_set_cpu(cpu, &cpupool_free_cpus); + +- if ( system_state == SYS_STATE_resume ) ++ if ( system_state == SYS_STATE_suspend || system_state == SYS_STATE_resume ) + { + struct cpupool **c; + +@@ -522,6 +522,7 @@ static int cpupool_cpu_add(unsigned int cpu) + * (or unplugging would have failed) and that is the default behavior + * anyway. 
+ */ ++ per_cpu(cpupool, cpu) = NULL; + ret = cpupool_assign_cpu_locked(cpupool0, cpu); + } + out: +diff --git a/xen/common/efi/boot.c b/xen/common/efi/boot.c +index 01d33004e0..f1d724bd06 100644 +--- a/xen/common/efi/boot.c ++++ b/xen/common/efi/boot.c +@@ -1304,6 +1304,8 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable) + + #ifndef CONFIG_ARM /* TODO - runtime service support */ + ++#include <asm/spec_ctrl.h> ++ + static bool __initdata efi_map_uc; + + static int __init parse_efi_param(const char *s) +@@ -1419,6 +1421,16 @@ void __init efi_init_memory(void) + desc->PhysicalStart, desc->PhysicalStart + len - 1, + desc->Type, desc->Attribute); + ++ if ( (desc->Attribute & (EFI_MEMORY_WB | EFI_MEMORY_WT)) || ++ (efi_bs_revision >= EFI_REVISION(2, 5) && ++ (desc->Attribute & EFI_MEMORY_WP)) ) ++ { ++ /* Supplement the heuristics in l1tf_calculations(). */ ++ l1tf_safe_maddr = ++ max(l1tf_safe_maddr, ++ ROUNDUP(desc->PhysicalStart + len, PAGE_SIZE)); ++ } ++ + if ( !efi_enabled(EFI_RS) || + (!(desc->Attribute & EFI_MEMORY_RUNTIME) && + (!map_bs || +diff --git a/xen/common/efi/runtime.c b/xen/common/efi/runtime.c +index c38f00a64b..9aa070e77c 100644 +--- a/xen/common/efi/runtime.c ++++ b/xen/common/efi/runtime.c +@@ -111,21 +111,23 @@ struct efi_rs_state efi_rs_enter(void) + asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); + } + +- write_cr3(virt_to_maddr(efi_l4_pgtable)); ++ switch_cr3_cr4(virt_to_maddr(efi_l4_pgtable), read_cr4()); + + return state; + } + + void efi_rs_leave(struct efi_rs_state *state) + { ++ struct vcpu *curr = current; ++ + if ( !state->cr3 ) + return; +- write_cr3(state->cr3); +- if ( is_pv_vcpu(current) && !is_idle_vcpu(current) ) ++ switch_cr3_cr4(state->cr3, read_cr4()); ++ if ( is_pv_vcpu(curr) && !is_idle_vcpu(curr) ) + { + struct desc_ptr gdt_desc = { + .limit = LAST_RESERVED_GDT_BYTE, +- .base = GDT_VIRT_START(current) ++ .base = GDT_VIRT_START(curr) + }; + + asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); +@@ -133,7 +135,7 
@@ void efi_rs_leave(struct efi_rs_state *state) + irq_exit(); + efi_rs_on_cpu = NR_CPUS; + spin_unlock(&efi_rs_lock); +- stts(); ++ vcpu_restore_fpu_nonlazy(curr, true); + } + + bool efi_rs_using_pgtables(void) +diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c +index 7650e6d449..64f58fc815 100644 +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -97,6 +97,45 @@ static unsigned int __read_mostly max_maptrack_frames = + DEFAULT_MAX_MAPTRACK_FRAMES; + integer_runtime_param("gnttab_max_maptrack_frames", max_maptrack_frames); + ++#ifndef GNTTAB_MAX_VERSION ++#define GNTTAB_MAX_VERSION 2 ++#endif ++ ++static unsigned int __read_mostly opt_gnttab_max_version = GNTTAB_MAX_VERSION; ++static bool __read_mostly opt_transitive_grants = true; ++ ++static int __init parse_gnttab(const char *s) ++{ ++ const char *ss, *e; ++ int val, rc = 0; ++ ++ do { ++ ss = strchr(s, ','); ++ if ( !ss ) ++ ss = strchr(s, '\0'); ++ ++ if ( !strncmp(s, "max-ver:", 8) || ++ !strncmp(s, "max_ver:", 8) ) /* Alias for original XSA-226 patch */ ++ { ++ long ver = simple_strtol(s + 8, &e, 10); ++ ++ if ( e == ss && ver >= 1 && ver <= 2 ) ++ opt_gnttab_max_version = ver; ++ else ++ rc = -EINVAL; ++ } ++ else if ( (val = parse_boolean("transitive", s, ss)) >= 0 ) ++ opt_transitive_grants = val; ++ else ++ rc = -EINVAL; ++ ++ s = ss + 1; ++ } while ( *ss ); ++ ++ return rc; ++} ++custom_param("gnttab", parse_gnttab); ++ + /* + * Note that the three values below are effectively part of the ABI, even if + * we don't need to make them a formal part of it: A guest suspended for +@@ -2725,7 +2764,8 @@ static int gnttab_copy_claim_buf(const struct gnttab_copy *op, + current->domain->domain_id, + buf->read_only, + &buf->frame, &buf->page, +- &buf->ptr.offset, &buf->len, true); ++ &buf->ptr.offset, &buf->len, ++ opt_transitive_grants); + if ( rc != GNTST_okay ) + goto out; + buf->ptr.u.ref = ptr->u.ref; +@@ -2927,6 +2967,10 @@ 
gnttab_set_version(XEN_GUEST_HANDLE_PARAM(gnttab_set_version_t) uop) + if ( op.version != 1 && op.version != 2 ) + goto out; + ++ res = -ENOSYS; ++ if ( op.version == 2 && opt_gnttab_max_version == 1 ) ++ goto out; /* Behave as before set_version was introduced. */ ++ + res = 0; + if ( gt->gt_version == op.version ) + goto out; +diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c +index 49b2a91751..6d6f2a0628 100644 +--- a/xen/common/page_alloc.c ++++ b/xen/common/page_alloc.c +@@ -1424,7 +1424,7 @@ static void free_heap_pages( + + page_list_del(predecessor, &heap(node, zone, order)); + +- /* Keep predecessor's first_dirty if it is already set. */ ++ /* Update predecessor's first_dirty if necessary. */ + if ( predecessor->u.free.first_dirty == INVALID_DIRTY_IDX && + pg->u.free.first_dirty != INVALID_DIRTY_IDX ) + predecessor->u.free.first_dirty = (1U << order) + +@@ -1445,6 +1445,12 @@ static void free_heap_pages( + + check_and_stop_scrub(successor); + ++ /* Update pg's first_dirty if necessary. 
*/ ++ if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX && ++ successor->u.free.first_dirty != INVALID_DIRTY_IDX ) ++ pg->u.free.first_dirty = (1U << order) + ++ successor->u.free.first_dirty; ++ + page_list_del(successor, &heap(node, zone, order)); + } + +diff --git a/xen/common/schedule.c b/xen/common/schedule.c +index b7884263f2..f21c3e5a64 100644 +--- a/xen/common/schedule.c ++++ b/xen/common/schedule.c +@@ -436,14 +436,9 @@ void sched_destroy_domain(struct domain *d) + cpupool_rm_domain(d); + } + +-void vcpu_sleep_nosync(struct vcpu *v) ++void vcpu_sleep_nosync_locked(struct vcpu *v) + { +- unsigned long flags; +- spinlock_t *lock; +- +- TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id); +- +- lock = vcpu_schedule_lock_irqsave(v, &flags); ++ ASSERT(spin_is_locked(per_cpu(schedule_data,v->processor).schedule_lock)); + + if ( likely(!vcpu_runnable(v)) ) + { +@@ -452,6 +447,18 @@ void vcpu_sleep_nosync(struct vcpu *v) + + SCHED_OP(vcpu_scheduler(v), sleep, v); + } ++} ++ ++void vcpu_sleep_nosync(struct vcpu *v) ++{ ++ unsigned long flags; ++ spinlock_t *lock; ++ ++ TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id); ++ ++ lock = vcpu_schedule_lock_irqsave(v, &flags); ++ ++ vcpu_sleep_nosync_locked(v); + + vcpu_schedule_unlock_irqrestore(lock, flags, v); + } +@@ -567,13 +574,54 @@ static void vcpu_move_nosched(struct vcpu *v, unsigned int new_cpu) + sched_move_irqs(v); + } + +-static void vcpu_migrate(struct vcpu *v) ++/* ++ * Initiating migration ++ * ++ * In order to migrate, we need the vcpu in question to have stopped ++ * running and had SCHED_OP(sleep) called (to take it off any ++ * runqueues, for instance); and if it is currently running, it needs ++ * to be scheduled out. Finally, we need to hold the scheduling locks ++ * for both the processor we're migrating from, and the processor ++ * we're migrating to. 
++ * ++ * In order to avoid deadlock while satisfying the final requirement, ++ * we must release any scheduling lock we hold, then try to grab both ++ * locks we want, then double-check to make sure that what we started ++ * to do hasn't been changed in the mean time. ++ * ++ * These steps are encapsulated in the following two functions; they ++ * should be called like this: ++ * ++ * lock = vcpu_schedule_lock_irq(v); ++ * vcpu_migrate_start(v); ++ * vcpu_schedule_unlock_irq(lock, v) ++ * vcpu_migrate_finish(v); ++ * ++ * vcpu_migrate_finish() will do the work now if it can, or simply ++ * return if it can't (because v is still running); in that case ++ * vcpu_migrate_finish() will be called by context_saved(). ++ */ ++void vcpu_migrate_start(struct vcpu *v) ++{ ++ set_bit(_VPF_migrating, &v->pause_flags); ++ vcpu_sleep_nosync_locked(v); ++} ++ ++static void vcpu_migrate_finish(struct vcpu *v) + { + unsigned long flags; + unsigned int old_cpu, new_cpu; + spinlock_t *old_lock, *new_lock; + bool_t pick_called = 0; + ++ /* ++ * If the vcpu is currently running, this will be handled by ++ * context_saved(); and in any case, if the bit is cleared, then ++ * someone else has already done the work so we don't need to. ++ */ ++ if ( v->is_running || !test_bit(_VPF_migrating, &v->pause_flags) ) ++ return; ++ + old_cpu = new_cpu = v->processor; + for ( ; ; ) + { +@@ -653,14 +701,11 @@ void vcpu_force_reschedule(struct vcpu *v) + spinlock_t *lock = vcpu_schedule_lock_irq(v); + + if ( v->is_running ) +- set_bit(_VPF_migrating, &v->pause_flags); ++ vcpu_migrate_start(v); ++ + vcpu_schedule_unlock_irq(lock, v); + +- if ( v->pause_flags & VPF_migrating ) +- { +- vcpu_sleep_nosync(v); +- vcpu_migrate(v); +- } ++ vcpu_migrate_finish(v); + } + + void restore_vcpu_affinity(struct domain *d) +@@ -812,10 +857,10 @@ int cpu_disable_scheduler(unsigned int cpu) + * * the scheduler will always fine a suitable solution, or + * things would have failed before getting in here. 
+ */ +- set_bit(_VPF_migrating, &v->pause_flags); ++ vcpu_migrate_start(v); + vcpu_schedule_unlock_irqrestore(lock, flags, v); +- vcpu_sleep_nosync(v); +- vcpu_migrate(v); ++ ++ vcpu_migrate_finish(v); + + /* + * The only caveat, in this case, is that if a vcpu active in +@@ -849,18 +894,14 @@ static int vcpu_set_affinity( + * Always ask the scheduler to re-evaluate placement + * when changing the affinity. + */ +- set_bit(_VPF_migrating, &v->pause_flags); ++ vcpu_migrate_start(v); + } + + vcpu_schedule_unlock_irq(lock, v); + + domain_update_node_affinity(v->domain); + +- if ( v->pause_flags & VPF_migrating ) +- { +- vcpu_sleep_nosync(v); +- vcpu_migrate(v); +- } ++ vcpu_migrate_finish(v); + + return ret; + } +@@ -1088,7 +1129,6 @@ int vcpu_pin_override(struct vcpu *v, int cpu) + { + cpumask_copy(v->cpu_hard_affinity, v->cpu_hard_affinity_saved); + v->affinity_broken = 0; +- set_bit(_VPF_migrating, &v->pause_flags); + ret = 0; + } + } +@@ -1101,20 +1141,18 @@ int vcpu_pin_override(struct vcpu *v, int cpu) + cpumask_copy(v->cpu_hard_affinity_saved, v->cpu_hard_affinity); + v->affinity_broken = 1; + cpumask_copy(v->cpu_hard_affinity, cpumask_of(cpu)); +- set_bit(_VPF_migrating, &v->pause_flags); + ret = 0; + } + } + ++ if ( ret == 0 ) ++ vcpu_migrate_start(v); ++ + vcpu_schedule_unlock_irq(lock, v); + + domain_update_node_affinity(v->domain); + +- if ( v->pause_flags & VPF_migrating ) +- { +- vcpu_sleep_nosync(v); +- vcpu_migrate(v); +- } ++ vcpu_migrate_finish(v); + + return ret; + } +@@ -1501,8 +1539,7 @@ void context_saved(struct vcpu *prev) + + SCHED_OP(vcpu_scheduler(prev), context_saved, prev); + +- if ( unlikely(prev->pause_flags & VPF_migrating) ) +- vcpu_migrate(prev); ++ vcpu_migrate_finish(prev); + } + + /* The scheduler timer: force a run through the scheduler */ +diff --git a/xen/common/tasklet.c b/xen/common/tasklet.c +index 0f0a6f8365..d4fea3151c 100644 +--- a/xen/common/tasklet.c ++++ b/xen/common/tasklet.c +@@ -156,6 +156,10 @@ void 
tasklet_kill(struct tasklet *t) + + spin_lock_irqsave(&tasklet_lock, flags); + ++ /* Cope with uninitialised tasklets. */ ++ if ( list_head_is_null(&t->list) ) ++ goto unlock; ++ + if ( !list_empty(&t->list) ) + { + BUG_ON(t->is_dead || t->is_running || (t->scheduled_on < 0)); +@@ -172,6 +176,7 @@ void tasklet_kill(struct tasklet *t) + spin_lock_irqsave(&tasklet_lock, flags); + } + ++ unlock: + spin_unlock_irqrestore(&tasklet_lock, flags); + } + +diff --git a/xen/include/asm-arm/arm32/system.h b/xen/include/asm-arm/arm32/system.h +index c617b40438..ab57abfbc5 100644 +--- a/xen/include/asm-arm/arm32/system.h ++++ b/xen/include/asm-arm/arm32/system.h +@@ -48,6 +48,24 @@ static inline int local_fiq_is_enabled(void) + return !(flags & PSR_FIQ_MASK); + } + ++#define CSDB ".inst 0xe320f014" ++ ++static inline unsigned long array_index_mask_nospec(unsigned long idx, ++ unsigned long sz) ++{ ++ unsigned long mask; ++ ++ asm volatile( "cmp %1, %2\n" ++ "sbc %0, %1, %1\n" ++ CSDB ++ : "=r" (mask) ++ : "r" (idx), "Ir" (sz) ++ : "cc" ); ++ ++ return mask; ++} ++#define array_index_mask_nospec array_index_mask_nospec ++ + #endif + /* + * Local variables: +diff --git a/xen/include/asm-arm/arm64/system.h b/xen/include/asm-arm/arm64/system.h +index 2e2ee212a1..2e36573ac6 100644 +--- a/xen/include/asm-arm/arm64/system.h ++++ b/xen/include/asm-arm/arm64/system.h +@@ -58,6 +58,28 @@ static inline int local_fiq_is_enabled(void) + return !(flags & PSR_FIQ_MASK); + } + ++#define csdb() asm volatile ( "hint #20" : : : "memory" ) ++ ++/* ++ * Generate a mask for array_index__nospec() that is ~0UL when 0 <= idx < sz ++ * and 0 otherwise. 
++ */ ++static inline unsigned long array_index_mask_nospec(unsigned long idx, ++ unsigned long sz) ++{ ++ unsigned long mask; ++ ++ asm volatile ( "cmp %1, %2\n" ++ "sbc %0, xzr, xzr\n" ++ : "=r" (mask) ++ : "r" (idx), "Ir" (sz) ++ : "cc" ); ++ csdb(); ++ ++ return mask; ++} ++#define array_index_mask_nospec array_index_mask_nospec ++ + #endif + /* + * Local variables: +diff --git a/xen/include/asm-arm/grant_table.h b/xen/include/asm-arm/grant_table.h +index 5b8994cbd5..619721f121 100644 +--- a/xen/include/asm-arm/grant_table.h ++++ b/xen/include/asm-arm/grant_table.h +@@ -7,6 +7,7 @@ + #include <xen/sched.h> + + #define INITIAL_NR_GRANT_FRAMES 1U ++#define GNTTAB_MAX_VERSION 1 + + struct grant_table_arch { + gfn_t *shared_gfn; +diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h +index 62465b20c7..ff6f969e74 100644 +--- a/xen/include/asm-x86/cpufeature.h ++++ b/xen/include/asm-x86/cpufeature.h +@@ -90,6 +90,7 @@ + #define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2) + #define cpu_has_smep boot_cpu_has(X86_FEATURE_SMEP) + #define cpu_has_bmi2 boot_cpu_has(X86_FEATURE_BMI2) ++#define cpu_has_invpcid boot_cpu_has(X86_FEATURE_INVPCID) + #define cpu_has_rtm boot_cpu_has(X86_FEATURE_RTM) + #define cpu_has_fpu_sel (!boot_cpu_has(X86_FEATURE_NO_FPU_SEL)) + #define cpu_has_mpx boot_cpu_has(X86_FEATURE_MPX) +@@ -106,6 +107,7 @@ + #define cpu_has_aperfmperf boot_cpu_has(X86_FEATURE_APERFMPERF) + #define cpu_has_lfence_dispatch boot_cpu_has(X86_FEATURE_LFENCE_DISPATCH) + #define cpu_has_no_xpti boot_cpu_has(X86_FEATURE_NO_XPTI) ++#define cpu_has_xen_lbr boot_cpu_has(X86_FEATURE_XEN_LBR) + + enum _cache_type { + CACHE_TYPE_NULL = 0, +diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h +index c9b1a48807..8e5cc53dde 100644 +--- a/xen/include/asm-x86/cpufeatures.h ++++ b/xen/include/asm-x86/cpufeatures.h +@@ -26,8 +26,10 @@ XEN_CPUFEATURE(LFENCE_DISPATCH, (FSCAPINTS+0)*32+12) /* lfence set as Dispatch S + 
XEN_CPUFEATURE(IND_THUNK_LFENCE,(FSCAPINTS+0)*32+13) /* Use IND_THUNK_LFENCE */ + XEN_CPUFEATURE(IND_THUNK_JMP, (FSCAPINTS+0)*32+14) /* Use IND_THUNK_JMP */ + XEN_CPUFEATURE(XEN_IBPB, (FSCAPINTS+0)*32+15) /* IBRSB || IBPB */ +-XEN_CPUFEATURE(XEN_IBRS_SET, (FSCAPINTS+0)*32+16) /* IBRSB && IRBS set in Xen */ +-XEN_CPUFEATURE(XEN_IBRS_CLEAR, (FSCAPINTS+0)*32+17) /* IBRSB && IBRS clear in Xen */ +-XEN_CPUFEATURE(RSB_NATIVE, (FSCAPINTS+0)*32+18) /* RSB overwrite needed for native */ +-XEN_CPUFEATURE(RSB_VMEXIT, (FSCAPINTS+0)*32+19) /* RSB overwrite needed for vmexit */ ++XEN_CPUFEATURE(SC_MSR_PV, (FSCAPINTS+0)*32+16) /* MSR_SPEC_CTRL used by Xen for PV */ ++XEN_CPUFEATURE(SC_MSR_HVM, (FSCAPINTS+0)*32+17) /* MSR_SPEC_CTRL used by Xen for HVM */ ++XEN_CPUFEATURE(SC_RSB_PV, (FSCAPINTS+0)*32+18) /* RSB overwrite needed for PV */ ++XEN_CPUFEATURE(SC_RSB_HVM, (FSCAPINTS+0)*32+19) /* RSB overwrite needed for HVM */ + XEN_CPUFEATURE(NO_XPTI, (FSCAPINTS+0)*32+20) /* XPTI mitigation not in use */ ++XEN_CPUFEATURE(SC_MSR_IDLE, (FSCAPINTS+0)*32+21) /* (SC_MSR_PV || SC_MSR_HVM) && default_xen_spec_ctrl */ ++XEN_CPUFEATURE(XEN_LBR, (FSCAPINTS+0)*32+22) /* Xen uses MSR_DEBUGCTL.LBR */ +diff --git a/xen/include/asm-x86/current.h b/xen/include/asm-x86/current.h +index 4678a0fcf5..9a137a1311 100644 +--- a/xen/include/asm-x86/current.h ++++ b/xen/include/asm-x86/current.h +@@ -44,20 +44,33 @@ struct cpu_info { + /* + * Of the two following fields the latter is being set to the CR3 value + * to be used on the given pCPU for loading whenever 64-bit PV guest +- * context is being entered. The value never changes once set. ++ * context is being entered. A value of zero indicates no setting of CR3 ++ * is to be performed. + * The former is the value to restore when re-entering Xen, if any. IOW +- * its value being zero means there's nothing to restore. 
However, its +- * value can also be negative, indicating to the exit-to-Xen code that +- * restoring is not necessary, but allowing any nested entry code paths +- * to still know the value to put back into CR3. ++ * its value being zero means there's nothing to restore. + */ + unsigned long xen_cr3; + unsigned long pv_cr3; + + /* See asm-x86/spec_ctrl_asm.h for usage. */ + unsigned int shadow_spec_ctrl; +- bool use_shadow_spec_ctrl; +- uint8_t bti_ist_info; ++ uint8_t xen_spec_ctrl; ++ uint8_t spec_ctrl_flags; ++ ++ /* ++ * The following field controls copying of the L4 page table of 64-bit ++ * PV guests to the per-cpu root page table on entering the guest context. ++ * If set the L4 page table is being copied to the root page table and ++ * the field will be reset. ++ */ ++ bool root_pgt_changed; ++ ++ /* ++ * use_pv_cr3 is set in case the value of pv_cr3 is to be written into ++ * CR3 when returning from an interrupt. The main use is when returning ++ * from a NMI or MCE to hypervisor code where pv_cr3 was active. ++ */ ++ bool use_pv_cr3; + + unsigned long __pad; + /* get_stack_bottom() must be 16-byte aligned */ +diff --git a/xen/include/asm-x86/debugreg.h b/xen/include/asm-x86/debugreg.h +index c57914efc6..b3b10eaf40 100644 +--- a/xen/include/asm-x86/debugreg.h ++++ b/xen/include/asm-x86/debugreg.h +@@ -24,6 +24,8 @@ + #define DR_STATUS_RESERVED_ZERO (~0xffffeffful) /* Reserved, read as zero */ + #define DR_STATUS_RESERVED_ONE 0xffff0ff0ul /* Reserved, read as one */ + ++#define X86_DR6_DEFAULT 0xffff0ff0ul /* Default %dr6 value. */ ++ + /* Now define a bunch of things for manipulating the control register. 
+ The top two bytes of the control register consist of 4 fields of 4 + bits - each field corresponds to one of the four debug registers, +diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h +index 4679d5477d..0fbd36bf48 100644 +--- a/xen/include/asm-x86/domain.h ++++ b/xen/include/asm-x86/domain.h +@@ -121,6 +121,11 @@ struct shadow_domain { + + /* Has this domain ever used HVMOP_pagetable_dying? */ + bool_t pagetable_dying_op; ++ ++#ifdef CONFIG_PV ++ /* PV L1 Terminal Fault mitigation. */ ++ struct tasklet pv_l1tf_tasklet; ++#endif /* CONFIG_PV */ + #endif + }; + +@@ -253,6 +258,13 @@ struct pv_domain + + atomic_t nr_l4_pages; + ++ /* XPTI active? */ ++ bool xpti; ++ /* Use PCID feature? */ ++ bool pcid; ++ /* Mitigate L1TF with shadow/crashing? */ ++ bool check_l1tf; ++ + /* map_domain_page() mapping cache. */ + struct mapcache_domain mapcache; + +@@ -564,6 +576,9 @@ struct arch_vcpu + * and thus should be saved/restored. */ + bool_t nonlazy_xstate_used; + ++ /* Restore all FPU state (lazy and non-lazy state) on context switch? */ ++ bool fully_eager_fpu; ++ + /* + * The SMAP check policy when updating runstate_guest(v) and the + * secondary system time. +@@ -612,18 +627,12 @@ void vcpu_show_registers(const struct vcpu *); + unsigned long pv_guest_cr4_fixup(const struct vcpu *, unsigned long guest_cr4); + + /* Convert between guest-visible and real CR4 values. */ +-#define pv_guest_cr4_to_real_cr4(v) \ +- (((v)->arch.pv_vcpu.ctrlreg[4] \ +- | (mmu_cr4_features \ +- & (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_SMEP | \ +- X86_CR4_SMAP | X86_CR4_OSXSAVE | \ +- X86_CR4_FSGSBASE)) \ +- | ((v)->domain->arch.vtsc ? 
X86_CR4_TSD : 0)) \ +- & ~X86_CR4_DE) ++unsigned long pv_guest_cr4_to_real_cr4(const struct vcpu *v); ++ + #define real_cr4_to_pv_guest_cr4(c) \ + ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD | \ + X86_CR4_OSXSAVE | X86_CR4_SMEP | \ +- X86_CR4_FSGSBASE | X86_CR4_SMAP)) ++ X86_CR4_FSGSBASE | X86_CR4_SMAP | X86_CR4_PCIDE)) + + #define domain_max_vcpus(d) (is_hvm_domain(d) ? HVM_MAX_VCPUS : MAX_VIRT_CPUS) + +diff --git a/xen/include/asm-x86/flushtlb.h b/xen/include/asm-x86/flushtlb.h +index 413db692e1..4a930448da 100644 +--- a/xen/include/asm-x86/flushtlb.h ++++ b/xen/include/asm-x86/flushtlb.h +@@ -84,7 +84,7 @@ static inline unsigned long read_cr3(void) + } + + /* Write pagetable base and implicitly tick the tlbflush clock. */ +-void write_cr3(unsigned long cr3); ++void switch_cr3_cr4(unsigned long cr3, unsigned long cr4); + + /* flush_* flag fields: */ + /* +@@ -101,6 +101,8 @@ void write_cr3(unsigned long cr3); + #define FLUSH_CACHE 0x400 + /* VA for the flush has a valid mapping */ + #define FLUSH_VA_VALID 0x800 ++ /* Flush the per-cpu root page table */ ++#define FLUSH_ROOT_PGTBL 0x2000 + + /* Flush local TLBs/caches. 
*/ + unsigned int flush_area_local(const void *va, unsigned int flags); +@@ -132,6 +134,12 @@ void flush_area_mask(const cpumask_t *, const void *va, unsigned int flags); + #define flush_tlb_one_all(v) \ + flush_tlb_one_mask(&cpu_online_map, v) + ++#define flush_root_pgtbl_domain(d) \ ++{ \ ++ if ( is_pv_domain(d) && (d)->arch.pv_domain.xpti ) \ ++ flush_mask((d)->domain_dirty_cpumask, FLUSH_ROOT_PGTBL); \ ++} ++ + static inline void flush_page_to_ram(unsigned long mfn, bool sync_icache) {} + static inline int invalidate_dcache_va_range(const void *p, + unsigned long size) +diff --git a/xen/include/asm-x86/hvm/irq.h b/xen/include/asm-x86/hvm/irq.h +index f756cb5a0d..1a52ec6045 100644 +--- a/xen/include/asm-x86/hvm/irq.h ++++ b/xen/include/asm-x86/hvm/irq.h +@@ -207,6 +207,9 @@ int hvm_set_pci_link_route(struct domain *d, u8 link, u8 isa_irq); + + int hvm_inject_msi(struct domain *d, uint64_t addr, uint32_t data); + ++/* Assert an IO APIC pin. */ ++int hvm_ioapic_assert(struct domain *d, unsigned int gsi, bool level); ++ + void hvm_maybe_deassert_evtchn_irq(void); + void hvm_assert_evtchn_irq(struct vcpu *v); + void hvm_set_callback_via(struct domain *d, uint64_t via); +diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h +index 8fb9e3ceee..e216c4ac35 100644 +--- a/xen/include/asm-x86/hvm/vmx/vmcs.h ++++ b/xen/include/asm-x86/hvm/vmx/vmcs.h +@@ -130,10 +130,18 @@ struct arch_vmx_struct { + uint64_t sfmask; + + struct vmx_msr_bitmap *msr_bitmap; +- unsigned int msr_count; ++ ++ /* ++ * Most accesses to the MSR host/guest load/save lists are in current ++ * context. However, the data can be modified by toolstack/migration ++ * actions. Remote access is only permitted for paused vcpus, and is ++ * protected under the domctl lock. 
++ */ + struct vmx_msr_entry *msr_area; +- unsigned int host_msr_count; + struct vmx_msr_entry *host_msr_area; ++ unsigned int msr_load_count; ++ unsigned int msr_save_count; ++ unsigned int host_msr_count; + + unsigned long eoi_exitmap_changed; + DECLARE_BITMAP(eoi_exit_bitmap, NR_VECTORS); +@@ -149,7 +157,7 @@ struct arch_vmx_struct { + /* Are we emulating rather than VMENTERing? */ + uint8_t vmx_emulate; + +- uint8_t lbr_fixup_enabled; ++ uint8_t lbr_flags; + + /* Bitmask of segments that we can't safely use in virtual 8086 mode */ + uint16_t vm86_segment_mask; +@@ -511,9 +519,6 @@ enum vmcs_field { + + #define VMCS_VPID_WIDTH 16 + +-#define VMX_GUEST_MSR 0 +-#define VMX_HOST_MSR 1 +- + /* VM Instruction error numbers */ + enum vmx_insn_errno + { +@@ -531,6 +536,67 @@ enum vmx_insn_errno + VMX_INSN_FAIL_INVALID = ~0, + }; + ++/* MSR load/save list infrastructure. */ ++enum vmx_msr_list_type { ++ VMX_MSR_HOST, /* MSRs loaded on VMExit. */ ++ VMX_MSR_GUEST, /* MSRs saved on VMExit, loaded on VMEntry. */ ++ VMX_MSR_GUEST_LOADONLY, /* MSRs loaded on VMEntry only. */ ++}; ++ ++/** ++ * Add an MSR to an MSR list (inserting space for the entry if necessary), and ++ * set the MSRs value. ++ * ++ * It is undefined behaviour to try and insert the same MSR into both the ++ * GUEST and GUEST_LOADONLY list. ++ * ++ * May fail if unable to allocate memory for the list, or the total number of ++ * entries exceeds the memory allocated. 
++ */ ++int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val, ++ enum vmx_msr_list_type type); ++ ++static inline int vmx_add_guest_msr(struct vcpu *v, uint32_t msr, uint64_t val) ++{ ++ return vmx_add_msr(v, msr, val, VMX_MSR_GUEST); ++} ++static inline int vmx_add_host_load_msr(struct vcpu *v, uint32_t msr, ++ uint64_t val) ++{ ++ return vmx_add_msr(v, msr, val, VMX_MSR_HOST); ++} ++ ++struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr, ++ enum vmx_msr_list_type type); ++ ++static inline int vmx_read_guest_msr(const struct vcpu *v, uint32_t msr, ++ uint64_t *val) ++{ ++ const struct vmx_msr_entry *ent = vmx_find_msr(v, msr, VMX_MSR_GUEST); ++ ++ if ( !ent ) ++ return -ESRCH; ++ ++ *val = ent->data; ++ ++ return 0; ++} ++ ++static inline int vmx_write_guest_msr(struct vcpu *v, uint32_t msr, ++ uint64_t val) ++{ ++ struct vmx_msr_entry *ent = vmx_find_msr(v, msr, VMX_MSR_GUEST); ++ ++ if ( !ent ) ++ return -ESRCH; ++ ++ ent->data = val; ++ ++ return 0; ++} ++ ++ ++/* MSR intercept bitmap infrastructure. 
*/ + enum vmx_msr_intercept_type { + VMX_MSR_R = 1, + VMX_MSR_W = 2, +@@ -541,10 +607,6 @@ void vmx_clear_msr_intercept(struct vcpu *v, unsigned int msr, + enum vmx_msr_intercept_type type); + void vmx_set_msr_intercept(struct vcpu *v, unsigned int msr, + enum vmx_msr_intercept_type type); +-int vmx_read_guest_msr(u32 msr, u64 *val); +-int vmx_write_guest_msr(u32 msr, u64 val); +-struct vmx_msr_entry *vmx_find_msr(u32 msr, int type); +-int vmx_add_msr(u32 msr, int type); + void vmx_vmcs_switch(paddr_t from, paddr_t to); + void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector); + void vmx_clear_eoi_exit_bitmap(struct vcpu *v, u8 vector); +@@ -559,15 +621,6 @@ void virtual_vmcs_vmwrite(const struct vcpu *, u32 encoding, u64 val); + enum vmx_insn_errno virtual_vmcs_vmwrite_safe(const struct vcpu *v, + u32 vmcs_encoding, u64 val); + +-static inline int vmx_add_guest_msr(u32 msr) +-{ +- return vmx_add_msr(msr, VMX_GUEST_MSR); +-} +-static inline int vmx_add_host_load_msr(u32 msr) +-{ +- return vmx_add_msr(msr, VMX_HOST_MSR); +-} +- + DECLARE_PER_CPU(bool_t, vmxon); + + bool_t vmx_vcpu_pml_enabled(const struct vcpu *v); +diff --git a/xen/include/asm-x86/hvm/vpt.h b/xen/include/asm-x86/hvm/vpt.h +index 21166edd06..0eb5ff632e 100644 +--- a/xen/include/asm-x86/hvm/vpt.h ++++ b/xen/include/asm-x86/hvm/vpt.h +@@ -44,6 +44,7 @@ struct periodic_time { + bool_t warned_timeout_too_short; + #define PTSRC_isa 1 /* ISA time source */ + #define PTSRC_lapic 2 /* LAPIC time source */ ++#define PTSRC_ioapic 3 /* IOAPIC time source */ + u8 source; /* PTSRC_ */ + u8 irq; + struct vcpu *vcpu; /* vcpu timer interrupt delivers to */ +diff --git a/xen/include/asm-x86/hypercall.h b/xen/include/asm-x86/hypercall.h +index b9f3ecf9a3..0e1092845b 100644 +--- a/xen/include/asm-x86/hypercall.h ++++ b/xen/include/asm-x86/hypercall.h +@@ -165,7 +165,7 @@ extern int compat_update_va_mapping( + unsigned int va, u32 lo, u32 hi, unsigned int flags); + + extern int compat_update_va_mapping_otherdomain( +- 
unsigned long va, u32 lo, u32 hi, unsigned long flags, domid_t domid); ++ unsigned int va, u32 lo, u32 hi, unsigned int flags, domid_t domid); + + DEFINE_XEN_GUEST_HANDLE(trap_info_compat_t); + extern int compat_set_trap_table(XEN_GUEST_HANDLE(trap_info_compat_t) traps); +diff --git a/xen/include/asm-x86/i387.h b/xen/include/asm-x86/i387.h +index 7cfa215d30..243de672eb 100644 +--- a/xen/include/asm-x86/i387.h ++++ b/xen/include/asm-x86/i387.h +@@ -28,7 +28,7 @@ struct ix87_env { + uint16_t fds, _res6; + }; + +-void vcpu_restore_fpu_eager(struct vcpu *v); ++void vcpu_restore_fpu_nonlazy(struct vcpu *v, bool need_stts); + void vcpu_restore_fpu_lazy(struct vcpu *v); + void vcpu_save_fpu(struct vcpu *v); + void save_fpu_enable(void); +diff --git a/xen/include/asm-x86/invpcid.h b/xen/include/asm-x86/invpcid.h +new file mode 100644 +index 0000000000..edd8b68706 +--- /dev/null ++++ b/xen/include/asm-x86/invpcid.h +@@ -0,0 +1,72 @@ ++#ifndef _ASM_X86_INVPCID_H_ ++#define _ASM_X86_INVPCID_H_ ++ ++#include <xen/types.h> ++ ++extern bool use_invpcid; ++ ++#define INVPCID_TYPE_INDIV_ADDR 0 ++#define INVPCID_TYPE_SINGLE_CTXT 1 ++#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 ++#define INVPCID_TYPE_ALL_NON_GLOBAL 3 ++ ++#define INVPCID_OPCODE ".byte 0x66, 0x0f, 0x38, 0x82\n" ++#define MODRM_ECX_01 ".byte 0x01\n" ++ ++static inline void invpcid(unsigned int pcid, unsigned long addr, ++ unsigned int type) ++{ ++ struct { ++ uint64_t pcid:12; ++ uint64_t reserved:52; ++ uint64_t addr; ++ } desc = { .pcid = pcid, .addr = addr }; ++ ++ asm volatile ( ++#ifdef HAVE_AS_INVPCID ++ "invpcid %[desc], %q[type]" ++ : /* No output */ ++ : [desc] "m" (desc), [type] "r" (type) ++#else ++ INVPCID_OPCODE MODRM_ECX_01 ++ : /* No output */ ++ : "a" (type), "c" (&desc) ++#endif ++ : "memory" ); ++} ++ ++/* Flush all mappings for a given PCID and addr, not including globals */ ++static inline void invpcid_flush_one(unsigned int pcid, unsigned long addr) ++{ ++ invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); 
++} ++ ++/* Flush all mappings for a given PCID, not including globals */ ++static inline void invpcid_flush_single_context(unsigned int pcid) ++{ ++ invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); ++} ++ ++/* Flush all mappings, including globals, for all PCIDs */ ++static inline void invpcid_flush_all(void) ++{ ++ invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); ++} ++ ++/* Flush all mappings for all PCIDs, excluding globals */ ++static inline void invpcid_flush_all_nonglobals(void) ++{ ++ invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); ++} ++ ++#endif /* _ASM_X86_INVPCID_H_ */ ++ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * tab-width: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h +index a8ceecf3e2..dfeba2821d 100644 +--- a/xen/include/asm-x86/msr-index.h ++++ b/xen/include/asm-x86/msr-index.h +@@ -31,10 +31,14 @@ + #define EFER_LMSLE (1<<_EFER_LMSLE) + #define EFER_FFXSE (1<<_EFER_FFXSE) + ++#define EFER_KNOWN_MASK (EFER_SCE | EFER_LME | EFER_LMA | EFER_NX | \ ++ EFER_SVME | EFER_LMSLE | EFER_FFXSE) ++ + /* Speculation Controls. */ + #define MSR_SPEC_CTRL 0x00000048 + #define SPEC_CTRL_IBRS (_AC(1, ULL) << 0) + #define SPEC_CTRL_STIBP (_AC(1, ULL) << 1) ++#define SPEC_CTRL_SSBD (_AC(1, ULL) << 2) + + #define MSR_PRED_CMD 0x00000049 + #define PRED_CMD_IBPB (_AC(1, ULL) << 0) +@@ -42,6 +46,12 @@ + #define MSR_ARCH_CAPABILITIES 0x0000010a + #define ARCH_CAPABILITIES_RDCL_NO (_AC(1, ULL) << 0) + #define ARCH_CAPABILITIES_IBRS_ALL (_AC(1, ULL) << 1) ++#define ARCH_CAPS_RSBA (_AC(1, ULL) << 2) ++#define ARCH_CAPS_SKIP_L1DFL (_AC(1, ULL) << 3) ++#define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4) ++ ++#define MSR_FLUSH_CMD 0x0000010b ++#define FLUSH_CMD_L1D (_AC(1, ULL) << 0) + + /* Intel MSRs. 
Some also available on other CPUs */ + #define MSR_IA32_PERFCTR0 0x000000c1 +diff --git a/xen/include/asm-x86/msr.h b/xen/include/asm-x86/msr.h +index 2c9277b6d5..22d5b7181e 100644 +--- a/xen/include/asm-x86/msr.h ++++ b/xen/include/asm-x86/msr.h +@@ -198,7 +198,7 @@ DECLARE_PER_CPU(u64, efer); + u64 read_efer(void); + void write_efer(u64 val); + +-DECLARE_PER_CPU(u32, ler_msr); ++extern unsigned int ler_msr; + + DECLARE_PER_CPU(uint32_t, tsc_aux); + +diff --git a/xen/include/asm-x86/nops.h b/xen/include/asm-x86/nops.h +index 37f9819e82..0016075616 100644 +--- a/xen/include/asm-x86/nops.h ++++ b/xen/include/asm-x86/nops.h +@@ -61,11 +61,12 @@ + #define ASM_NOP7 _ASM_MK_NOP(K8_NOP7) + #define ASM_NOP8 _ASM_MK_NOP(K8_NOP8) + ++#define ASM_NOP14 ASM_NOP8; ASM_NOP6 + #define ASM_NOP17 ASM_NOP8; ASM_NOP7; ASM_NOP2 +-#define ASM_NOP21 ASM_NOP8; ASM_NOP8; ASM_NOP5 + #define ASM_NOP24 ASM_NOP8; ASM_NOP8; ASM_NOP8 +-#define ASM_NOP29 ASM_NOP8; ASM_NOP8; ASM_NOP8; ASM_NOP5 +-#define ASM_NOP32 ASM_NOP8; ASM_NOP8; ASM_NOP8; ASM_NOP8 ++#define ASM_NOP25 ASM_NOP8; ASM_NOP8; ASM_NOP7; ASM_NOP2 ++#define ASM_NOP33 ASM_NOP8; ASM_NOP8; ASM_NOP8; ASM_NOP7; ASM_NOP2 ++#define ASM_NOP36 ASM_NOP8; ASM_NOP8; ASM_NOP8; ASM_NOP8; ASM_NOP4 + #define ASM_NOP40 ASM_NOP8; ASM_NOP8; ASM_NOP8; ASM_NOP8; ASM_NOP8 + + #define ASM_NOP_MAX 8 +diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h +index 5607ab4b1f..fc326fe616 100644 +--- a/xen/include/asm-x86/paging.h ++++ b/xen/include/asm-x86/paging.h +@@ -37,11 +37,14 @@ + + #define PG_SH_shift 20 + #define PG_HAP_shift 21 ++#define PG_SHF_shift 22 + /* We're in one of the shadow modes */ + #ifdef CONFIG_SHADOW_PAGING + #define PG_SH_enable (1U << PG_SH_shift) ++#define PG_SH_forced (1U << PG_SHF_shift) + #else + #define PG_SH_enable 0 ++#define PG_SH_forced 0 + #endif + #define PG_HAP_enable (1U << PG_HAP_shift) + +@@ -62,6 +65,7 @@ + + #define paging_mode_enabled(_d) (!!(_d)->arch.paging.mode) + #define paging_mode_shadow(_d) 
(!!((_d)->arch.paging.mode & PG_SH_enable)) ++#define paging_mode_sh_forced(_d) (!!((_d)->arch.paging.mode & PG_SH_forced)) + #define paging_mode_hap(_d) (!!((_d)->arch.paging.mode & PG_HAP_enable)) + + #define paging_mode_refcounts(_d) (!!((_d)->arch.paging.mode & PG_refcounts)) +diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h +index 80f8411355..90a2701d26 100644 +--- a/xen/include/asm-x86/processor.h ++++ b/xen/include/asm-x86/processor.h +@@ -293,6 +293,21 @@ static inline unsigned long read_cr2(void) + return cr2; + } + ++static inline void write_cr3(unsigned long val) ++{ ++ asm volatile ( "mov %0, %%cr3" : : "r" (val) : "memory" ); ++} ++ ++static inline unsigned long cr3_pa(unsigned long cr3) ++{ ++ return cr3 & X86_CR3_ADDR_MASK; ++} ++ ++static inline unsigned long cr3_pcid(unsigned long cr3) ++{ ++ return cr3 & X86_CR3_PCID_MASK; ++} ++ + static inline unsigned long read_cr4(void) + { + return get_cpu_info()->cr4; +@@ -300,6 +315,9 @@ static inline unsigned long read_cr4(void) + + static inline void write_cr4(unsigned long val) + { ++ /* No global pages in case of PCIDs enabled! */ ++ ASSERT(!(val & X86_CR4_PGE) || !(val & X86_CR4_PCIDE)); ++ + get_cpu_info()->cr4 = val; + asm volatile ( "mov %0,%%cr4" : : "r" (val) ); + } +@@ -329,12 +347,6 @@ static always_inline void set_in_cr4 (unsigned long mask) + write_cr4(read_cr4() | mask); + } + +-static always_inline void clear_in_cr4 (unsigned long mask) +-{ +- mmu_cr4_features &= ~mask; +- write_cr4(read_cr4() & ~mask); +-} +- + static inline unsigned int read_pkru(void) + { + unsigned int pkru; +@@ -445,7 +457,8 @@ struct __packed __cacheline_aligned tss_struct { + #define IST_DF 1UL + #define IST_NMI 2UL + #define IST_MCE 3UL +-#define IST_MAX 3UL ++#define IST_DB 4UL ++#define IST_MAX 4UL + + /* Set the interrupt stack table used by a particular interrupt + * descriptor table entry. 
*/ +diff --git a/xen/include/asm-x86/pv/domain.h b/xen/include/asm-x86/pv/domain.h +index acdf140fbd..6778e1bb75 100644 +--- a/xen/include/asm-x86/pv/domain.h ++++ b/xen/include/asm-x86/pv/domain.h +@@ -21,6 +21,37 @@ + #ifndef __X86_PV_DOMAIN_H__ + #define __X86_PV_DOMAIN_H__ + ++/* ++ * PCID values for the address spaces of 64-bit pv domains: ++ * ++ * We are using 4 PCID values for a 64 bit pv domain subject to XPTI: ++ * - hypervisor active and guest in kernel mode PCID 0 ++ * - hypervisor active and guest in user mode PCID 1 ++ * - guest active and in kernel mode PCID 2 ++ * - guest active and in user mode PCID 3 ++ * ++ * Without XPTI only 2 values are used: ++ * - guest in kernel mode PCID 0 ++ * - guest in user mode PCID 1 ++ */ ++ ++#define PCID_PV_PRIV 0x0000 /* Used for other domains, too. */ ++#define PCID_PV_USER 0x0001 ++#define PCID_PV_XPTI 0x0002 /* To be ORed to above values. */ ++ ++/* ++ * Return additional PCID specific cr3 bits. ++ * ++ * Note that X86_CR3_NOFLUSH will not be readable in cr3. Anyone consuming ++ * v->arch.cr3 should mask away X86_CR3_NOFLUSH and X86_CR3_PCIDMASK in case ++ * the value is used to address the root page table. ++ */ ++static inline unsigned long get_pcid_bits(const struct vcpu *v, bool is_xpti) ++{ ++ return X86_CR3_NOFLUSH | (is_xpti ? PCID_PV_XPTI : 0) | ++ ((v->arch.flags & TF_kernel_mode) ? 
PCID_PV_PRIV : PCID_PV_USER); ++} ++ + #ifdef CONFIG_PV + + void pv_vcpu_destroy(struct vcpu *v); +diff --git a/xen/include/asm-x86/setup.h b/xen/include/asm-x86/setup.h +index b68ec9de4d..ecfd0c2e7b 100644 +--- a/xen/include/asm-x86/setup.h ++++ b/xen/include/asm-x86/setup.h +@@ -66,6 +66,8 @@ extern uint8_t kbd_shift_flags; + extern unsigned long highmem_start; + #endif + ++extern int8_t opt_smt; ++ + #ifdef CONFIG_SHADOW_PAGING + extern bool opt_dom0_shadow; + #else +diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h +index 94a34fd16a..f40f411871 100644 +--- a/xen/include/asm-x86/shadow.h ++++ b/xen/include/asm-x86/shadow.h +@@ -29,6 +29,7 @@ + #include <asm/flushtlb.h> + #include <asm/paging.h> + #include <asm/p2m.h> ++#include <asm/spec_ctrl.h> + + /***************************************************************************** + * Macros to tell which shadow paging mode a domain is in*/ +@@ -115,6 +116,131 @@ static inline int shadow_domctl(struct domain *d, + + #endif /* CONFIG_SHADOW_PAGING */ + ++/* ++ * Mitigations for L1TF / CVE-2018-3620 for PV guests. ++ * ++ * We cannot alter an architecturally-legitimate PTE which a PV guest has ++ * chosen to write, as traditional paged-out metadata is L1TF-vulnerable. ++ * What we can do is force a PV guest which writes a vulnerable PTE into ++ * shadow mode, so Xen controls the pagetables which are reachable by the CPU ++ * pagewalk. ++ * ++ * The core of the L1TF vulnerability is that the address bits of the PTE ++ * (accounting for PSE and factoring in the level-relevant part of the linear ++ * access) are sent for an L1D lookup (to retrieve the next-level PTE, or ++ * eventual memory address) before the Present or reserved bits (which would ++ * cause a terminal fault) are accounted for. If an L1D hit occurs, the ++ * resulting data is available for potentially dependent instructions. 
++ * ++ * For Present PTEs, the PV type-count safety logic ensures that the address ++ * bits always point at a guest-accessible frame, which is safe WRT L1TF from ++ * Xen's point of view. In practice, a PV guest should be unable to set any ++ * reserved bits, so should be unable to create any present L1TF-vulnerable ++ * PTEs at all. ++ * ++ * Therefore, these safety checks apply to Not-Present PTEs only, where ++ * traditionally, Xen would have let the guest write any value it chose. ++ * ++ * The all-zero PTE potentially leaks mfn 0. All software on the system is ++ * expected to cooperate and not put any secrets there. In a Xen system, ++ * neither Xen nor dom0 are expected to touch mfn 0, as it typically contains ++ * the real mode IVT and Bios Data Area. Therefore, mfn 0 is considered safe. ++ * ++ * Any PTE whose address is higher than the maximum cacheable address is safe, ++ * as it won't get an L1D hit. ++ * ++ * Speculative superpages also need accounting for, as PSE is considered ++ * irrespective of Present. We disallow PSE being set, as it allows an ++ * attacker to leak 2M or 1G of data starting from mfn 0. Also, because of ++ * recursive/linear pagetables, we must consider PSE even at L4, as hardware ++ * will interpret an L4e as an L3e during a recursive walk. 
++ */ ++ ++static inline bool is_l1tf_safe_maddr(intpte_t pte) ++{ ++ paddr_t maddr = pte & l1tf_addr_mask; ++ ++ return maddr == 0 || maddr >= l1tf_safe_maddr; ++} ++ ++static inline bool pv_l1tf_check_pte(struct domain *d, unsigned int level, ++ intpte_t pte) ++{ ++ ASSERT(is_pv_domain(d)); ++ ASSERT(!(pte & _PAGE_PRESENT)); ++ ++ if ( d->arch.pv_domain.check_l1tf && !paging_mode_sh_forced(d) && ++ (((level > 1) && (pte & _PAGE_PSE)) || !is_l1tf_safe_maddr(pte)) ) ++ { ++#ifdef CONFIG_SHADOW_PAGING ++ struct tasklet *t = &d->arch.paging.shadow.pv_l1tf_tasklet; ++ ++ printk(XENLOG_G_WARNING ++ "d%d L1TF-vulnerable L%ue %016"PRIx64" - Shadowing\n", ++ d->domain_id, level, pte); ++ /* ++ * Safety consideration for accessing tasklet.scheduled_on without the ++ * tasklet lock. This is a singleshot tasklet with the side effect of ++ * setting PG_SH_forced (checked just above). Multiple vcpus can race ++ * to schedule the tasklet, but if we observe it scheduled anywhere, ++ * that is good enough. ++ */ ++ smp_rmb(); ++ if ( !tasklet_is_scheduled(t) ) ++ tasklet_schedule(t); ++#else ++ printk(XENLOG_G_ERR ++ "d%d L1TF-vulnerable L%ue %016"PRIx64" - Crashing\n", ++ d->domain_id, level, pte); ++ domain_crash(d); ++#endif ++ return true; ++ } ++ ++ return false; ++} ++ ++static inline bool pv_l1tf_check_l1e(struct domain *d, l1_pgentry_t l1e) ++{ ++ return pv_l1tf_check_pte(d, 1, l1e.l1); ++} ++ ++static inline bool pv_l1tf_check_l2e(struct domain *d, l2_pgentry_t l2e) ++{ ++ return pv_l1tf_check_pte(d, 2, l2e.l2); ++} ++ ++static inline bool pv_l1tf_check_l3e(struct domain *d, l3_pgentry_t l3e) ++{ ++ return pv_l1tf_check_pte(d, 3, l3e.l3); ++} ++ ++static inline bool pv_l1tf_check_l4e(struct domain *d, l4_pgentry_t l4e) ++{ ++ return pv_l1tf_check_pte(d, 4, l4e.l4); ++} ++ ++void pv_l1tf_tasklet(unsigned long data); ++ ++static inline void pv_l1tf_domain_init(struct domain *d) ++{ ++ d->arch.pv_domain.check_l1tf = ++ opt_pv_l1tf & (is_hardware_domain(d) ++ ? 
OPT_PV_L1TF_DOM0 : OPT_PV_L1TF_DOMU); ++ ++#if defined(CONFIG_SHADOW_PAGING) && defined(CONFIG_PV) ++ tasklet_init(&d->arch.paging.shadow.pv_l1tf_tasklet, ++ pv_l1tf_tasklet, (unsigned long)d); ++#endif ++} ++ ++static inline void pv_l1tf_domain_destroy(struct domain *d) ++{ ++#if defined(CONFIG_SHADOW_PAGING) && defined(CONFIG_PV) ++ tasklet_kill(&d->arch.paging.shadow.pv_l1tf_tasklet); ++#endif ++} ++ + /* Remove all shadows of the guest mfn. */ + static inline void shadow_remove_all_shadows(struct domain *d, mfn_t gmfn) + { +diff --git a/xen/include/asm-x86/smp.h b/xen/include/asm-x86/smp.h +index 4e5f673fec..09c55458df 100644 +--- a/xen/include/asm-x86/smp.h ++++ b/xen/include/asm-x86/smp.h +@@ -26,6 +26,8 @@ DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_mask); + DECLARE_PER_CPU(cpumask_var_t, cpu_core_mask); + DECLARE_PER_CPU(cpumask_var_t, scratch_cpumask); + ++extern bool park_offline_cpus; ++ + void smp_send_nmi_allbutself(void); + + void send_IPI_mask(const cpumask_t *, int vector); +diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h +index 5ab4ff3f68..8f8aad40bb 100644 +--- a/xen/include/asm-x86/spec_ctrl.h ++++ b/xen/include/asm-x86/spec_ctrl.h +@@ -27,14 +27,36 @@ + void init_speculation_mitigations(void); + + extern bool opt_ibpb; +-extern uint8_t default_bti_ist_info; ++extern bool opt_ssbd; ++extern int8_t opt_eager_fpu; ++extern int8_t opt_l1d_flush; ++ ++extern bool bsp_delay_spec_ctrl; ++extern uint8_t default_xen_spec_ctrl; ++extern uint8_t default_spec_ctrl_flags; ++ ++extern int8_t opt_xpti; ++#define OPT_XPTI_DOM0 0x01 ++#define OPT_XPTI_DOMU 0x02 ++ ++extern int8_t opt_pv_l1tf; ++#define OPT_PV_L1TF_DOM0 0x01 ++#define OPT_PV_L1TF_DOMU 0x02 ++ ++/* ++ * The L1D address mask, which might be wider than reported in CPUID, and the ++ * system physical address above which there are believed to be no cacheable ++ * memory regions, thus unable to leak data via the L1TF vulnerability. 
++ */ ++extern paddr_t l1tf_addr_mask, l1tf_safe_maddr; + + static inline void init_shadow_spec_ctrl_state(void) + { + struct cpu_info *info = get_cpu_info(); + +- info->shadow_spec_ctrl = info->use_shadow_spec_ctrl = 0; +- info->bti_ist_info = default_bti_ist_info; ++ info->shadow_spec_ctrl = 0; ++ info->xen_spec_ctrl = default_xen_spec_ctrl; ++ info->spec_ctrl_flags = default_spec_ctrl_flags; + } + + /* WARNING! `ret`, `call *`, `jmp *` not safe after this call. */ +@@ -48,24 +70,24 @@ static always_inline void spec_ctrl_enter_idle(struct cpu_info *info) + */ + info->shadow_spec_ctrl = val; + barrier(); +- info->use_shadow_spec_ctrl = true; ++ info->spec_ctrl_flags |= SCF_use_shadow; + barrier(); +- asm volatile ( ALTERNATIVE(ASM_NOP3, "wrmsr", X86_FEATURE_XEN_IBRS_SET) ++ asm volatile ( ALTERNATIVE(ASM_NOP3, "wrmsr", X86_FEATURE_SC_MSR_IDLE) + :: "a" (val), "c" (MSR_SPEC_CTRL), "d" (0) : "memory" ); + } + + /* WARNING! `ret`, `call *`, `jmp *` not safe before this call. */ + static always_inline void spec_ctrl_exit_idle(struct cpu_info *info) + { +- uint32_t val = SPEC_CTRL_IBRS; ++ uint32_t val = info->xen_spec_ctrl; + + /* + * Disable shadowing before updating the MSR. There are no SMP issues + * here; only local processor ordering concerns. 
+ */ +- info->use_shadow_spec_ctrl = false; ++ info->spec_ctrl_flags &= ~SCF_use_shadow; + barrier(); +- asm volatile ( ALTERNATIVE(ASM_NOP3, "wrmsr", X86_FEATURE_XEN_IBRS_SET) ++ asm volatile ( ALTERNATIVE(ASM_NOP3, "wrmsr", X86_FEATURE_SC_MSR_IDLE) + :: "a" (val), "c" (MSR_SPEC_CTRL), "d" (0) : "memory" ); + } + +diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h +index 1f2b6f3552..c659f3f146 100644 +--- a/xen/include/asm-x86/spec_ctrl_asm.h ++++ b/xen/include/asm-x86/spec_ctrl_asm.h +@@ -20,10 +20,10 @@ + #ifndef __X86_SPEC_CTRL_ASM_H__ + #define __X86_SPEC_CTRL_ASM_H__ + +-/* Encoding of the bottom bits in cpuinfo.bti_ist_info */ +-#define BTI_IST_IBRS (1 << 0) +-#define BTI_IST_WRMSR (1 << 1) +-#define BTI_IST_RSB (1 << 2) ++/* Encoding of cpuinfo.spec_ctrl_flags */ ++#define SCF_use_shadow (1 << 0) ++#define SCF_ist_wrmsr (1 << 1) ++#define SCF_ist_rsb (1 << 2) + + #ifdef __ASSEMBLY__ + #include <asm/msr-index.h> +@@ -50,20 +50,20 @@ + * after VMEXIT. The VMEXIT-specific code reads MSR_SPEC_CTRL and updates + * current before loading Xen's MSR_SPEC_CTRL setting. + * +- * Factor 2 is harder. We maintain a shadow_spec_ctrl value, and +- * use_shadow_spec_ctrl boolean per cpu. The synchronous use is: ++ * Factor 2 is harder. We maintain a shadow_spec_ctrl value, and a use_shadow ++ * boolean in the per cpu spec_ctrl_flags. 
The synchronous use is: + * + * 1) Store guest value in shadow_spec_ctrl +- * 2) Set use_shadow_spec_ctrl boolean ++ * 2) Set the use_shadow boolean + * 3) Load guest value into MSR_SPEC_CTRL + * 4) Exit to guest + * 5) Entry from guest +- * 6) Clear use_shadow_spec_ctrl boolean ++ * 6) Clear the use_shadow boolean + * 7) Load Xen's value into MSR_SPEC_CTRL + * + * The asynchronous use for interrupts/exceptions is: + * - Set/clear IBRS on entry to Xen +- * - On exit to Xen, check use_shadow_spec_ctrl ++ * - On exit to Xen, check use_shadow + * - If set, load shadow_spec_ctrl + * + * Therefore, an interrupt/exception which hits the synchronous path between +@@ -72,11 +72,14 @@ + * + * The following ASM fragments implement this algorithm. See their local + * comments for further details. +- * - SPEC_CTRL_ENTRY_FROM_VMEXIT ++ * - SPEC_CTRL_ENTRY_FROM_HVM + * - SPEC_CTRL_ENTRY_FROM_PV + * - SPEC_CTRL_ENTRY_FROM_INTR ++ * - SPEC_CTRL_ENTRY_FROM_INTR_IST ++ * - SPEC_CTRL_EXIT_TO_XEN_IST + * - SPEC_CTRL_EXIT_TO_XEN +- * - SPEC_CTRL_EXIT_TO_GUEST ++ * - SPEC_CTRL_EXIT_TO_PV ++ * - SPEC_CTRL_EXIT_TO_HVM + */ + + .macro DO_OVERWRITE_RSB tmp=rax +@@ -117,7 +120,7 @@ + mov %\tmp, %rsp /* Restore old %rsp */ + .endm + +-.macro DO_SPEC_CTRL_ENTRY_FROM_VMEXIT ibrs_val:req ++.macro DO_SPEC_CTRL_ENTRY_FROM_HVM + /* + * Requires %rbx=current, %rsp=regs/cpuinfo + * Clobbers %rax, %rcx, %rdx +@@ -135,14 +138,14 @@ + xor %edx, %edx + + /* Clear SPEC_CTRL shadowing *before* loading Xen's value. */ +- movb %dl, CPUINFO_use_shadow_spec_ctrl(%rsp) ++ andb $~SCF_use_shadow, CPUINFO_spec_ctrl_flags(%rsp) + + /* Load Xen's intended value. */ +- mov $\ibrs_val, %eax ++ movzbl CPUINFO_xen_spec_ctrl(%rsp), %eax + wrmsr + .endm + +-.macro DO_SPEC_CTRL_ENTRY maybexen:req ibrs_val:req ++.macro DO_SPEC_CTRL_ENTRY maybexen:req + /* + * Requires %rsp=regs (also cpuinfo if !maybexen) + * Requires %r14=stack_end (if maybexen) +@@ -161,16 +164,18 @@ + * block so calculate the position directly. 
+ */ + .if \maybexen ++ xor %eax, %eax + /* Branchless `if ( !xen ) clear_shadowing` */ + testb $3, UREGS_cs(%rsp) +- setz %al +- and %al, STACK_CPUINFO_FIELD(use_shadow_spec_ctrl)(%r14) ++ setnz %al ++ not %eax ++ and %al, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) ++ movzbl STACK_CPUINFO_FIELD(xen_spec_ctrl)(%r14), %eax + .else +- movb %dl, CPUINFO_use_shadow_spec_ctrl(%rsp) ++ andb $~SCF_use_shadow, CPUINFO_spec_ctrl_flags(%rsp) ++ movzbl CPUINFO_xen_spec_ctrl(%rsp), %eax + .endif + +- /* Load Xen's intended value. */ +- mov $\ibrs_val, %eax + wrmsr + .endm + +@@ -185,8 +190,8 @@ + */ + xor %edx, %edx + +- cmpb %dl, STACK_CPUINFO_FIELD(use_shadow_spec_ctrl)(%rbx) +- je .L\@_skip ++ testb $SCF_use_shadow, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) ++ jz .L\@_skip + + mov STACK_CPUINFO_FIELD(shadow_spec_ctrl)(%rbx), %eax + mov $MSR_SPEC_CTRL, %ecx +@@ -207,7 +212,7 @@ + mov %eax, CPUINFO_shadow_spec_ctrl(%rsp) + + /* Set SPEC_CTRL shadowing *before* loading the guest value. */ +- movb $1, CPUINFO_use_shadow_spec_ctrl(%rsp) ++ orb $SCF_use_shadow, CPUINFO_spec_ctrl_flags(%rsp) + + mov $MSR_SPEC_CTRL, %ecx + xor %edx, %edx +@@ -215,52 +220,47 @@ + .endm + + /* Use after a VMEXIT from an HVM guest. */ +-#define SPEC_CTRL_ENTRY_FROM_VMEXIT \ ++#define SPEC_CTRL_ENTRY_FROM_HVM \ + ALTERNATIVE __stringify(ASM_NOP40), \ +- DO_OVERWRITE_RSB, X86_FEATURE_RSB_VMEXIT; \ +- ALTERNATIVE_2 __stringify(ASM_NOP32), \ +- __stringify(DO_SPEC_CTRL_ENTRY_FROM_VMEXIT \ +- ibrs_val=SPEC_CTRL_IBRS), \ +- X86_FEATURE_XEN_IBRS_SET, \ +- __stringify(DO_SPEC_CTRL_ENTRY_FROM_VMEXIT \ +- ibrs_val=0), \ +- X86_FEATURE_XEN_IBRS_CLEAR ++ DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_HVM; \ ++ ALTERNATIVE __stringify(ASM_NOP36), \ ++ DO_SPEC_CTRL_ENTRY_FROM_HVM, X86_FEATURE_SC_MSR_HVM + + /* Use after an entry from PV context (syscall/sysenter/int80/int82/etc). 
*/ + #define SPEC_CTRL_ENTRY_FROM_PV \ + ALTERNATIVE __stringify(ASM_NOP40), \ +- DO_OVERWRITE_RSB, X86_FEATURE_RSB_NATIVE; \ +- ALTERNATIVE_2 __stringify(ASM_NOP21), \ +- __stringify(DO_SPEC_CTRL_ENTRY maybexen=0 \ +- ibrs_val=SPEC_CTRL_IBRS), \ +- X86_FEATURE_XEN_IBRS_SET, \ +- __stringify(DO_SPEC_CTRL_ENTRY maybexen=0 ibrs_val=0), \ +- X86_FEATURE_XEN_IBRS_CLEAR ++ DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV; \ ++ ALTERNATIVE __stringify(ASM_NOP25), \ ++ __stringify(DO_SPEC_CTRL_ENTRY maybexen=0), X86_FEATURE_SC_MSR_PV + + /* Use in interrupt/exception context. May interrupt Xen or PV context. */ + #define SPEC_CTRL_ENTRY_FROM_INTR \ + ALTERNATIVE __stringify(ASM_NOP40), \ +- DO_OVERWRITE_RSB, X86_FEATURE_RSB_NATIVE; \ +- ALTERNATIVE_2 __stringify(ASM_NOP29), \ +- __stringify(DO_SPEC_CTRL_ENTRY maybexen=1 \ +- ibrs_val=SPEC_CTRL_IBRS), \ +- X86_FEATURE_XEN_IBRS_SET, \ +- __stringify(DO_SPEC_CTRL_ENTRY maybexen=1 ibrs_val=0), \ +- X86_FEATURE_XEN_IBRS_CLEAR ++ DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV; \ ++ ALTERNATIVE __stringify(ASM_NOP33), \ ++ __stringify(DO_SPEC_CTRL_ENTRY maybexen=1), X86_FEATURE_SC_MSR_PV + + /* Use when exiting to Xen context. */ + #define SPEC_CTRL_EXIT_TO_XEN \ +- ALTERNATIVE_2 __stringify(ASM_NOP17), \ +- DO_SPEC_CTRL_EXIT_TO_XEN, X86_FEATURE_XEN_IBRS_SET, \ +- DO_SPEC_CTRL_EXIT_TO_XEN, X86_FEATURE_XEN_IBRS_CLEAR ++ ALTERNATIVE __stringify(ASM_NOP17), \ ++ DO_SPEC_CTRL_EXIT_TO_XEN, X86_FEATURE_SC_MSR_PV + +-/* Use when exiting to guest context. */ +-#define SPEC_CTRL_EXIT_TO_GUEST \ +- ALTERNATIVE_2 __stringify(ASM_NOP24), \ +- DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_XEN_IBRS_SET, \ +- DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_XEN_IBRS_CLEAR ++/* Use when exiting to PV guest context. */ ++#define SPEC_CTRL_EXIT_TO_PV \ ++ ALTERNATIVE __stringify(ASM_NOP24), \ ++ DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV + +-/* TODO: Drop these when the alternatives infrastructure is NMI/#MC safe. */ ++/* Use when exiting to HVM guest context. 
*/ ++#define SPEC_CTRL_EXIT_TO_HVM \ ++ ALTERNATIVE __stringify(ASM_NOP24), \ ++ DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_HVM ++ ++/* ++ * Use in IST interrupt/exception context. May interrupt Xen or PV context. ++ * Fine grain control of SCF_ist_wrmsr is needed for safety in the S3 resume ++ * path to avoid using MSR_SPEC_CTRL before the microcode introducing it has ++ * been reloaded. ++ */ + .macro SPEC_CTRL_ENTRY_FROM_INTR_IST + /* + * Requires %rsp=regs, %r14=stack_end +@@ -269,29 +269,27 @@ + * This is logical merge of DO_OVERWRITE_RSB and DO_SPEC_CTRL_ENTRY + * maybexen=1, but with conditionals rather than alternatives. + */ +- movzbl STACK_CPUINFO_FIELD(bti_ist_info)(%r14), %eax ++ movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %eax + +- testb $BTI_IST_RSB, %al ++ test $SCF_ist_rsb, %al + jz .L\@_skip_rsb + + DO_OVERWRITE_RSB tmp=rdx /* Clobbers %rcx/%rdx */ + + .L\@_skip_rsb: + +- testb $BTI_IST_WRMSR, %al ++ test $SCF_ist_wrmsr, %al + jz .L\@_skip_wrmsr + + xor %edx, %edx + testb $3, UREGS_cs(%rsp) +- setz %dl +- and %dl, STACK_CPUINFO_FIELD(use_shadow_spec_ctrl)(%r14) ++ setnz %dl ++ not %edx ++ and %dl, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) + +- /* +- * Load Xen's intended value. SPEC_CTRL_IBRS vs 0 is encoded in the +- * bottom bit of bti_ist_info, via a deliberate alias with BTI_IST_IBRS. +- */ ++ /* Load Xen's intended value. */ + mov $MSR_SPEC_CTRL, %ecx +- and $BTI_IST_IBRS, %eax ++ movzbl STACK_CPUINFO_FIELD(xen_spec_ctrl)(%r14), %eax + xor %edx, %edx + wrmsr + +@@ -309,12 +307,13 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): + UNLIKELY_END(\@_serialise) + .endm + ++/* Use when exiting to Xen in IST context. 
*/ + .macro SPEC_CTRL_EXIT_TO_XEN_IST + /* + * Requires %rbx=stack_end + * Clobbers %rax, %rcx, %rdx + */ +- testb $BTI_IST_WRMSR, STACK_CPUINFO_FIELD(bti_ist_info)(%rbx) ++ testb $SCF_ist_wrmsr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) + jz .L\@_skip + + DO_SPEC_CTRL_EXIT_TO_XEN +diff --git a/xen/include/asm-x86/system.h b/xen/include/asm-x86/system.h +index eb498f5e71..605768be12 100644 +--- a/xen/include/asm-x86/system.h ++++ b/xen/include/asm-x86/system.h +@@ -185,6 +185,30 @@ static always_inline unsigned long __xadd( + #define set_mb(var, value) do { xchg(&var, value); } while (0) + #define set_wmb(var, value) do { var = value; wmb(); } while (0) + ++/** ++ * array_index_mask_nospec() - generate a mask that is ~0UL when the ++ * bounds check succeeds and 0 otherwise ++ * @index: array element index ++ * @size: number of elements in array ++ * ++ * Returns: ++ * 0 - (index < size) ++ */ ++static inline unsigned long array_index_mask_nospec(unsigned long index, ++ unsigned long size) ++{ ++ unsigned long mask; ++ ++ asm volatile ( "cmp %[size], %[index]; sbb %[mask], %[mask];" ++ : [mask] "=r" (mask) ++ : [size] "g" (size), [index] "r" (index) ); ++ ++ return mask; ++} ++ ++/* Override default implementation in nospec.h. 
*/ ++#define array_index_mask_nospec array_index_mask_nospec ++ + #define local_irq_disable() asm volatile ( "cli" : : : "memory" ) + #define local_irq_enable() asm volatile ( "sti" : : : "memory" ) + +diff --git a/xen/include/asm-x86/x86-defns.h b/xen/include/asm-x86/x86-defns.h +index 70453e8dfb..10b366a07d 100644 +--- a/xen/include/asm-x86/x86-defns.h ++++ b/xen/include/asm-x86/x86-defns.h +@@ -42,6 +42,13 @@ + #define X86_CR0_CD 0x40000000 /* Cache Disable (RW) */ + #define X86_CR0_PG 0x80000000 /* Paging (RW) */ + ++/* ++ * Intel CPU flags in CR3 ++ */ ++#define X86_CR3_NOFLUSH (_AC(1, ULL) << 63) ++#define X86_CR3_ADDR_MASK (PAGE_MASK & PADDR_MASK) ++#define X86_CR3_PCID_MASK _AC(0x0fff, ULL) /* Mask for PCID */ ++ + /* + * Intel CPU features in CR4 + */ +diff --git a/xen/include/asm-x86/xstate.h b/xen/include/asm-x86/xstate.h +index d36f422b59..9ba2a04c74 100644 +--- a/xen/include/asm-x86/xstate.h ++++ b/xen/include/asm-x86/xstate.h +@@ -116,8 +116,9 @@ void xsave(struct vcpu *v, uint64_t mask); + void xrstor(struct vcpu *v, uint64_t mask); + void xstate_set_init(uint64_t mask); + bool xsave_enabled(const struct vcpu *v); +-int __must_check validate_xstate(u64 xcr0, u64 xcr0_accum, +- const struct xsave_hdr *); ++int __must_check validate_xstate(const struct domain *d, ++ uint64_t xcr0, uint64_t xcr0_accum, ++ const struct xsave_hdr *hdr); + int __must_check handle_xsetbv(u32 index, u64 new_bv); + void expand_xsave_states(struct vcpu *v, void *dest, unsigned int size); + void compress_xsave_states(struct vcpu *v, const void *src, unsigned int size); +diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h +index 8da5783f7a..6c82816fd3 100644 +--- a/xen/include/public/arch-x86/cpufeatureset.h ++++ b/xen/include/public/arch-x86/cpufeatureset.h +@@ -243,8 +243,10 @@ XEN_CPUFEATURE(IBPB, 8*32+12) /*A IBPB support only (no IBRS, used by + XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A AVX512 Neural Network Instructions 
*/ + XEN_CPUFEATURE(AVX512_4FMAPS, 9*32+ 3) /*A AVX512 Multiply Accumulation Single Precision */ + XEN_CPUFEATURE(IBRSB, 9*32+26) /*A IBRS and IBPB support (used by Intel) */ +-XEN_CPUFEATURE(STIBP, 9*32+27) /*A! STIBP */ ++XEN_CPUFEATURE(STIBP, 9*32+27) /*A STIBP */ ++XEN_CPUFEATURE(L1D_FLUSH, 9*32+28) /*S MSR_FLUSH_CMD and L1D flush. */ + XEN_CPUFEATURE(ARCH_CAPS, 9*32+29) /* IA32_ARCH_CAPABILITIES MSR */ ++XEN_CPUFEATURE(SSBD, 9*32+31) /*A MSR_SPEC_CTRL.SSBD available */ + + #endif /* XEN_CPUFEATURE */ + +diff --git a/xen/include/xen/compiler.h b/xen/include/xen/compiler.h +index 533a8ea0f3..a7e05681c9 100644 +--- a/xen/include/xen/compiler.h ++++ b/xen/include/xen/compiler.h +@@ -81,6 +81,9 @@ + #pragma GCC visibility push(hidden) + #endif + ++/* Make the optimizer believe the variable can be manipulated arbitrarily. */ ++#define OPTIMIZER_HIDE_VAR(var) __asm__ ( "" : "+g" (var) ) ++ + /* This macro obfuscates arithmetic on a variable address so that gcc + shouldn't recognize the original var, and make assumptions about it */ + /* +diff --git a/xen/include/xen/cpu.h b/xen/include/xen/cpu.h +index ffefc09f8e..2fe3ec05d8 100644 +--- a/xen/include/xen/cpu.h ++++ b/xen/include/xen/cpu.h +@@ -47,6 +47,8 @@ void register_cpu_notifier(struct notifier_block *nb); + #define CPU_DYING (0x0007 | NOTIFY_REVERSE) + /* CPU_DEAD: CPU is dead. */ + #define CPU_DEAD (0x0008 | NOTIFY_REVERSE) ++/* CPU_REMOVE: CPU was removed. */ ++#define CPU_REMOVE (0x0009 | NOTIFY_REVERSE) + + /* Perform CPU hotplug. May return -EAGAIN. 
*/ + int cpu_down(unsigned int cpu); +diff --git a/xen/include/xen/cpumask.h b/xen/include/xen/cpumask.h +index 3f340d619a..ee4399865a 100644 +--- a/xen/include/xen/cpumask.h ++++ b/xen/include/xen/cpumask.h +@@ -349,16 +349,35 @@ static inline bool_t alloc_cpumask_var(cpumask_var_t *mask) + return *mask != NULL; + } + ++static inline bool cond_alloc_cpumask_var(cpumask_var_t *mask) ++{ ++ if (*mask == NULL) ++ *mask = _xmalloc(nr_cpumask_bits / 8, sizeof(long)); ++ return *mask != NULL; ++} ++ + static inline bool_t zalloc_cpumask_var(cpumask_var_t *mask) + { + *(void **)mask = _xzalloc(nr_cpumask_bits / 8, sizeof(long)); + return *mask != NULL; + } + ++static inline bool cond_zalloc_cpumask_var(cpumask_var_t *mask) ++{ ++ if (*mask == NULL) ++ *mask = _xzalloc(nr_cpumask_bits / 8, sizeof(long)); ++ else ++ cpumask_clear(*mask); ++ return *mask != NULL; ++} ++ + static inline void free_cpumask_var(cpumask_var_t mask) + { + xfree(mask); + } ++ ++/* Free an allocated mask, and zero the pointer to it. 
*/ ++#define FREE_CPUMASK_VAR(m) XFREE(m) + #else + typedef cpumask_t cpumask_var_t[1]; + +@@ -366,16 +385,20 @@ static inline bool_t alloc_cpumask_var(cpumask_var_t *mask) + { + return 1; + } ++#define cond_alloc_cpumask_var alloc_cpumask_var + + static inline bool_t zalloc_cpumask_var(cpumask_var_t *mask) + { + cpumask_clear(*mask); + return 1; + } ++#define cond_zalloc_cpumask_var zalloc_cpumask_var + + static inline void free_cpumask_var(cpumask_var_t mask) + { + } ++ ++#define FREE_CPUMASK_VAR(m) free_cpumask_var(m) + #endif + + #if NR_CPUS > 1 +diff --git a/xen/include/xen/list.h b/xen/include/xen/list.h +index fa07d720ee..1387abb211 100644 +--- a/xen/include/xen/list.h ++++ b/xen/include/xen/list.h +@@ -51,6 +51,11 @@ static inline void INIT_LIST_HEAD(struct list_head *list) + list->prev = list; + } + ++static inline bool list_head_is_null(const struct list_head *list) ++{ ++ return !list->next && !list->prev; ++} ++ + /* + * Insert a new entry between two known consecutive entries. + * +diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h +index e813c07b22..fdcb90841a 100644 +--- a/xen/include/xen/mm.h ++++ b/xen/include/xen/mm.h +@@ -162,6 +162,14 @@ void free_xenheap_pages(void *v, unsigned int order); + bool scrub_free_pages(void); + #define alloc_xenheap_page() (alloc_xenheap_pages(0,0)) + #define free_xenheap_page(v) (free_xenheap_pages(v,0)) ++ ++/* Free an allocation, and zero the pointer to it. */ ++#define FREE_XENHEAP_PAGES(p, o) do { \ ++ free_xenheap_pages(p, o); \ ++ (p) = NULL; \ ++} while ( false ) ++#define FREE_XENHEAP_PAGE(p) FREE_XENHEAP_PAGES(p, 0) ++ + /* Map machine page range in Xen virtual address space. */ + int map_pages_to_xen( + unsigned long virt, +diff --git a/xen/include/xen/nospec.h b/xen/include/xen/nospec.h +new file mode 100644 +index 0000000000..48793996e8 +--- /dev/null ++++ b/xen/include/xen/nospec.h +@@ -0,0 +1,70 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* Copyright(c) 2018 Linus Torvalds. 
All rights reserved. */ ++/* Copyright(c) 2018 Alexei Starovoitov. All rights reserved. */ ++/* Copyright(c) 2018 Intel Corporation. All rights reserved. */ ++/* Copyright(c) 2018 Citrix Systems R&D Ltd. All rights reserved. */ ++ ++#ifndef XEN_NOSPEC_H ++#define XEN_NOSPEC_H ++ ++#include <asm/system.h> ++ ++/** ++ * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise ++ * @index: array element index ++ * @size: number of elements in array ++ * ++ * When @index is out of bounds (@index >= @size), the sign bit will be ++ * set. Extend the sign bit to all bits and invert, giving a result of ++ * zero for an out of bounds index, or ~0 if within bounds [0, @size). ++ */ ++#ifndef array_index_mask_nospec ++static inline unsigned long array_index_mask_nospec(unsigned long index, ++ unsigned long size) ++{ ++ /* ++ * Always calculate and emit the mask even if the compiler ++ * thinks the mask is not needed. The compiler does not take ++ * into account the value of @index under speculation. ++ */ ++ OPTIMIZER_HIDE_VAR(index); ++ return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1); ++} ++#endif ++ ++/* ++ * array_index_nospec - sanitize an array index after a bounds check ++ * ++ * For a code sequence like: ++ * ++ * if (index < size) { ++ * index = array_index_nospec(index, size); ++ * val = array[index]; ++ * } ++ * ++ * ...if the CPU speculates past the bounds check then ++ * array_index_nospec() will clamp the index within the range of [0, ++ * size). 
++ */ ++#define array_index_nospec(index, size) \ ++({ \ ++ typeof(index) _i = (index); \ ++ typeof(size) _s = (size); \ ++ unsigned long _mask = array_index_mask_nospec(_i, _s); \ ++ \ ++ BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \ ++ BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \ ++ \ ++ (typeof(_i)) (_i & _mask); \ ++}) ++ ++#endif /* XEN_NOSPEC_H */ ++ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h +index 2541ecb04f..eaa83dc97e 100644 +--- a/xen/include/xen/sched.h ++++ b/xen/include/xen/sched.h +@@ -796,7 +796,7 @@ static inline struct domain *next_domain_in_cpupool( + #define _VPF_parked 8 + #define VPF_parked (1UL<<_VPF_parked) + +-static inline int vcpu_runnable(struct vcpu *v) ++static inline bool vcpu_runnable(const struct vcpu *v) + { + return !(v->pause_flags | + atomic_read(&v->pause_count) | +diff --git a/xen/include/xen/tasklet.h b/xen/include/xen/tasklet.h +index 23d69c738e..bc9ddace6d 100644 +--- a/xen/include/xen/tasklet.h ++++ b/xen/include/xen/tasklet.h +@@ -50,6 +50,11 @@ static inline bool tasklet_work_to_do(unsigned int cpu) + TASKLET_scheduled); + } + ++static inline bool tasklet_is_scheduled(const struct tasklet *t) ++{ ++ return t->scheduled_on != -1; ++} ++ + void tasklet_schedule_on_cpu(struct tasklet *t, unsigned int cpu); + void tasklet_schedule(struct tasklet *t); + void do_tasklet(void); +diff --git a/xen/include/xen/xmalloc.h b/xen/include/xen/xmalloc.h +index cc2673d8ae..9aa5edf593 100644 +--- a/xen/include/xen/xmalloc.h ++++ b/xen/include/xen/xmalloc.h +@@ -26,6 +26,12 @@ + /* Free any of the above. */ + extern void xfree(void *); + ++/* Free an allocation, and zero the pointer to it. 
*/ ++#define XFREE(p) do { \ ++ xfree(p); \ ++ (p) = NULL; \ ++} while ( false ) ++ + /* Underlying functions */ + extern void *_xmalloc(unsigned long size, unsigned long align); + extern void *_xzalloc(unsigned long size, unsigned long align); +diff --git a/xen/tools/gen-cpuid.py b/xen/tools/gen-cpuid.py +index 613b909c3d..65526ff120 100755 +--- a/xen/tools/gen-cpuid.py ++++ b/xen/tools/gen-cpuid.py +@@ -257,10 +257,19 @@ def crunch_numbers(state): + AVX512BW, AVX512VL, AVX512VBMI, AVX512_4VNNIW, + AVX512_4FMAPS, AVX512_VPOPCNTDQ], + +- # Single Thread Indirect Branch Predictors enumerates a new bit in the +- # MSR enumerated by Indirect Branch Restricted Speculation/Indirect +- # Branch Prediction Barrier enumeration. +- IBRSB: [STIBP], ++ # The features: ++ # * Single Thread Indirect Branch Predictors ++ # * Speculative Store Bypass Disable ++ # ++ # enumerate new bits in MSR_SPEC_CTRL, which is enumerated by Indirect ++ # Branch Restricted Speculation/Indirect Branch Prediction Barrier. ++ # ++ # In practice, these features also enumerate the presense of ++ # MSR_SPEC_CTRL. However, no real hardware will exist with SSBD but ++ # not IBRSB, and we pass this MSR directly to guests. Treating them ++ # as dependent features simplifies Xen's logic, and prevents the guest ++ # from seeing implausible configurations. ++ IBRSB: [STIBP, SSBD], + } + + deep_features = tuple(sorted(deps.keys())) diff --git a/main/xen/xsa260-1.patch b/main/xen/xsa260-1.patch deleted file mode 100644 index 21da59cddd6..00000000000 --- a/main/xen/xsa260-1.patch +++ /dev/null @@ -1,72 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/traps: Fix %dr6 handing in #DB handler - -Most bits in %dr6 accumulate, rather than being set directly based on the -current source of #DB. Have the handler follow the manuals guidance, which -avoids leaking hypervisor debugging activities into guest context. - -This is part of XSA-260 / CVE-2018-8897. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> - ---- a/xen/arch/x86/traps.c 2018-04-13 15:29:36.006747135 +0200 -+++ b/xen/arch/x86/traps.c 2018-04-13 15:44:57.015516185 +0200 -@@ -1761,11 +1761,36 @@ static void ler_enable(void) - - void do_debug(struct cpu_user_regs *regs) - { -+ unsigned long dr6; - struct vcpu *v = current; - -+ /* Stash dr6 as early as possible. */ -+ dr6 = read_debugreg(6); -+ - if ( debugger_trap_entry(TRAP_debug, regs) ) - return; - -+ /* -+ * At the time of writing (March 2018), on the subject of %dr6: -+ * -+ * The Intel manual says: -+ * Certain debug exceptions may clear bits 0-3. The remaining contents -+ * of the DR6 register are never cleared by the processor. To avoid -+ * confusion in identifying debug exceptions, debug handlers should -+ * clear the register (except bit 16, which they should set) before -+ * returning to the interrupted task. -+ * -+ * The AMD manual says: -+ * Bits 15:13 of the DR6 register are not cleared by the processor and -+ * must be cleared by software after the contents have been read. -+ * -+ * Some bits are reserved set, some are reserved clear, and some bits -+ * which were previously reserved set are reused and cleared by hardware. -+ * For future compatibility, reset to the default value, which will allow -+ * us to spot any bit being changed by hardware to its non-default value. 
-+ */ -+ write_debugreg(6, X86_DR6_DEFAULT); -+ - if ( !guest_mode(regs) ) - { - if ( regs->eflags & X86_EFLAGS_TF ) -@@ -1798,7 +1823,8 @@ void do_debug(struct cpu_user_regs *regs - } - - /* Save debug status register where guest OS can peek at it */ -- v->arch.debugreg[6] = read_debugreg(6); -+ v->arch.debugreg[6] |= (dr6 & ~X86_DR6_DEFAULT); -+ v->arch.debugreg[6] &= (dr6 | ~X86_DR6_DEFAULT); - - ler_enable(); - pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC); ---- a/xen/include/asm-x86/debugreg.h 2015-02-11 09:36:29.000000000 +0100 -+++ b/xen/include/asm-x86/debugreg.h 2018-04-13 15:44:57.015516185 +0200 -@@ -24,6 +24,8 @@ - #define DR_STATUS_RESERVED_ZERO (~0xffffeffful) /* Reserved, read as zero */ - #define DR_STATUS_RESERVED_ONE 0xffff0ff0ul /* Reserved, read as one */ - -+#define X86_DR6_DEFAULT 0xffff0ff0ul /* Default %dr6 value. */ -+ - /* Now define a bunch of things for manipulating the control register. - The top two bytes of the control register consist of 4 fields of 4 - bits - each field corresponds to one of the four debug registers, diff --git a/main/xen/xsa260-2.patch b/main/xen/xsa260-2.patch deleted file mode 100644 index be71b2438f5..00000000000 --- a/main/xen/xsa260-2.patch +++ /dev/null @@ -1,110 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/pv: Move exception injection into {,compat_}test_all_events() - -This allows paths to jump straight to {,compat_}test_all_events() and have -injection of pending exceptions happen automatically, rather than requiring -all calling paths to handle exceptions themselves. - -The normal exception path is simplified as a result, and -compat_post_handle_exception() is removed entirely. - -This is part of XSA-260 / CVE-2018-8897. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> - ---- a/xen/arch/x86/x86_64/compat/entry.S -+++ b/xen/arch/x86/x86_64/compat/entry.S -@@ -39,6 +39,12 @@ ENTRY(compat_test_all_events) - leaq irq_stat+IRQSTAT_softirq_pending(%rip),%rcx - cmpl $0,(%rcx,%rax,1) - jne compat_process_softirqs -+ -+ /* Inject exception if pending. */ -+ lea VCPU_trap_bounce(%rbx), %rdx -+ testb $TBF_EXCEPTION, TRAPBOUNCE_flags(%rdx) -+ jnz .Lcompat_process_trapbounce -+ - testb $1,VCPU_mce_pending(%rbx) - jnz compat_process_mce - .Lcompat_test_guest_nmi: -@@ -68,6 +74,15 @@ compat_process_softirqs: - call do_softirq - jmp compat_test_all_events - -+ ALIGN -+/* %rbx: struct vcpu, %rdx: struct trap_bounce */ -+.Lcompat_process_trapbounce: -+ sti -+.Lcompat_bounce_exception: -+ call compat_create_bounce_frame -+ movb $0, TRAPBOUNCE_flags(%rdx) -+ jmp compat_test_all_events -+ - ALIGN - /* %rbx: struct vcpu */ - compat_process_mce: -@@ -189,15 +204,6 @@ ENTRY(cr4_pv32_restore) - xor %eax, %eax - ret - --/* %rdx: trap_bounce, %rbx: struct vcpu */ --ENTRY(compat_post_handle_exception) -- testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx) -- jz compat_test_all_events --.Lcompat_bounce_exception: -- call compat_create_bounce_frame -- movb $0,TRAPBOUNCE_flags(%rdx) -- jmp compat_test_all_events -- - .section .text.entry, "ax", @progbits - - /* See lstar_enter for entry register state. */ ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -42,6 +42,12 @@ test_all_events: - leaq irq_stat+IRQSTAT_softirq_pending(%rip), %rcx - cmpl $0, (%rcx, %rax, 1) - jne process_softirqs -+ -+ /* Inject exception if pending. 
*/ -+ lea VCPU_trap_bounce(%rbx), %rdx -+ testb $TBF_EXCEPTION, TRAPBOUNCE_flags(%rdx) -+ jnz .Lprocess_trapbounce -+ - cmpb $0, VCPU_mce_pending(%rbx) - jne process_mce - .Ltest_guest_nmi: -@@ -70,6 +76,15 @@ process_softirqs: - jmp test_all_events - - ALIGN -+/* %rbx: struct vcpu, %rdx struct trap_bounce */ -+.Lprocess_trapbounce: -+ sti -+.Lbounce_exception: -+ call create_bounce_frame -+ movb $0, TRAPBOUNCE_flags(%rdx) -+ jmp test_all_events -+ -+ ALIGN - /* %rbx: struct vcpu */ - process_mce: - testb $1 << VCPU_TRAP_MCE, VCPU_async_exception_mask(%rbx) -@@ -667,15 +682,9 @@ handle_exception_saved: - mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14) - testb $3,UREGS_cs(%rsp) - jz restore_all_xen -- leaq VCPU_trap_bounce(%rbx),%rdx - movq VCPU_domain(%rbx),%rax - testb $1,DOMAIN_is_32bit_pv(%rax) -- jnz compat_post_handle_exception -- testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx) -- jz test_all_events --.Lbounce_exception: -- call create_bounce_frame -- movb $0,TRAPBOUNCE_flags(%rdx) -+ jnz compat_test_all_events - jmp test_all_events - - /* No special register assumptions. */ diff --git a/main/xen/xsa260-3.patch b/main/xen/xsa260-3.patch deleted file mode 100644 index f0a0a5687dc..00000000000 --- a/main/xen/xsa260-3.patch +++ /dev/null @@ -1,138 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/traps: Use an Interrupt Stack Table for #DB - -PV guests can use architectural corner cases to cause #DB to be raised after -transitioning into supervisor mode. - -Use an interrupt stack table for #DB to prevent the exception being taken with -a guest controlled stack pointer. - -This is part of XSA-260 / CVE-2018-8897. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> - ---- a/xen/arch/x86/cpu/common.c -+++ b/xen/arch/x86/cpu/common.c -@@ -679,6 +679,7 @@ void load_system_tables(void) - [IST_MCE - 1] = stack_top + IST_MCE * PAGE_SIZE, - [IST_DF - 1] = stack_top + IST_DF * PAGE_SIZE, - [IST_NMI - 1] = stack_top + IST_NMI * PAGE_SIZE, -+ [IST_DB - 1] = stack_top + IST_DB * PAGE_SIZE, - - [IST_MAX ... ARRAY_SIZE(tss->ist) - 1] = - 0x8600111111111111ul, -@@ -706,6 +707,7 @@ void load_system_tables(void) - set_ist(&idt_tables[cpu][TRAP_double_fault], IST_DF); - set_ist(&idt_tables[cpu][TRAP_nmi], IST_NMI); - set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE); -+ set_ist(&idt_tables[cpu][TRAP_debug], IST_DB); - - /* - * Bottom-of-stack must be 16-byte aligned! ---- a/xen/arch/x86/hvm/svm/svm.c -+++ b/xen/arch/x86/hvm/svm/svm.c -@@ -1046,6 +1046,7 @@ static void svm_ctxt_switch_from(struct - set_ist(&idt_tables[cpu][TRAP_double_fault], IST_DF); - set_ist(&idt_tables[cpu][TRAP_nmi], IST_NMI); - set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE); -+ set_ist(&idt_tables[cpu][TRAP_debug], IST_DB); - } - - static void svm_ctxt_switch_to(struct vcpu *v) -@@ -1067,6 +1068,7 @@ static void svm_ctxt_switch_to(struct vc - set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE); - set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE); - set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE); -+ set_ist(&idt_tables[cpu][TRAP_debug], IST_NONE); - - svm_restore_dr(v); - ---- a/xen/arch/x86/smpboot.c -+++ b/xen/arch/x86/smpboot.c -@@ -964,6 +964,7 @@ static int cpu_smpboot_alloc(unsigned in - set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE); - set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE); - set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE); -+ set_ist(&idt_tables[cpu][TRAP_debug], IST_NONE); - - for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1); - i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i ) ---- 
a/xen/arch/x86/traps.c -+++ b/xen/arch/x86/traps.c -@@ -325,13 +325,13 @@ static void show_guest_stack(struct vcpu - /* - * Notes for get_stack_trace_bottom() and get_stack_dump_bottom() - * -- * Stack pages 0, 1 and 2: -+ * Stack pages 0 - 3: - * These are all 1-page IST stacks. Each of these stacks have an exception - * frame and saved register state at the top. The interesting bound for a - * trace is the word adjacent to this, while the bound for a dump is the - * very top, including the exception frame. - * -- * Stack pages 3, 4 and 5: -+ * Stack pages 4 and 5: - * None of these are particularly interesting. With MEMORY_GUARD, page 5 is - * explicitly not present, so attempting to dump or trace it is - * counterproductive. Without MEMORY_GUARD, it is possible for a call chain -@@ -352,12 +352,12 @@ unsigned long get_stack_trace_bottom(uns - { - switch ( get_stack_page(sp) ) - { -- case 0 ... 2: -+ case 0 ... 3: - return ROUNDUP(sp, PAGE_SIZE) - - offsetof(struct cpu_user_regs, es) - sizeof(unsigned long); - - #ifndef MEMORY_GUARD -- case 3 ... 5: -+ case 4 ... 5: - #endif - case 6 ... 7: - return ROUNDUP(sp, STACK_SIZE) - -@@ -372,11 +372,11 @@ unsigned long get_stack_dump_bottom(unsi - { - switch ( get_stack_page(sp) ) - { -- case 0 ... 2: -+ case 0 ... 3: - return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long); - - #ifndef MEMORY_GUARD -- case 3 ... 5: -+ case 4 ... 5: - #endif - case 6 ... 7: - return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long); -@@ -1943,6 +1943,7 @@ void __init init_idt_traps(void) - set_ist(&idt_table[TRAP_double_fault], IST_DF); - set_ist(&idt_table[TRAP_nmi], IST_NMI); - set_ist(&idt_table[TRAP_machine_check], IST_MCE); -+ set_ist(&idt_table[TRAP_debug], IST_DB); - - /* CPU0 uses the master IDT. 
*/ - idt_tables[0] = idt_table; ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -739,7 +739,7 @@ ENTRY(device_not_available) - ENTRY(debug) - pushq $0 - movl $TRAP_debug,4(%rsp) -- jmp handle_exception -+ jmp handle_ist_exception - - ENTRY(int3) - pushq $0 ---- a/xen/include/asm-x86/processor.h -+++ b/xen/include/asm-x86/processor.h -@@ -443,7 +443,8 @@ struct __packed __cacheline_aligned tss_ - #define IST_DF 1UL - #define IST_NMI 2UL - #define IST_MCE 3UL --#define IST_MAX 3UL -+#define IST_DB 4UL -+#define IST_MAX 4UL - - /* Set the interrupt stack table used by a particular interrupt - * descriptor table entry. */ diff --git a/main/xen/xsa260-4.patch b/main/xen/xsa260-4.patch deleted file mode 100644 index c2fa02d6e12..00000000000 --- a/main/xen/xsa260-4.patch +++ /dev/null @@ -1,72 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/traps: Fix handling of #DB exceptions in hypervisor context - -The WARN_ON() can be triggered by guest activities, and emits a full stack -trace without rate limiting. Swap it out for a ratelimited printk with just -enough information to work out what is going on. - -Not all #DB exceptions are traps, so blindly continuing is not a safe action -to take. We don't let PV guests select these settings in the real %dr7 to -begin with, but for added safety against unexpected situations, detect the -fault cases and crash in an obvious manner. - -This is part of XSA-260 / CVE-2018-8897. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> - ---- a/xen/arch/x86/traps.c -+++ b/xen/arch/x86/traps.c -@@ -1809,16 +1809,44 @@ void do_debug(struct cpu_user_regs *regs - regs->eflags &= ~X86_EFLAGS_TF; - } - } -- else -+ -+ /* -+ * Check for fault conditions. General Detect, and instruction -+ * breakpoints are faults rather than traps, at which point attempting -+ * to ignore and continue will result in a livelock. 
-+ */ -+ if ( dr6 & DR_GENERAL_DETECT ) -+ { -+ printk(XENLOG_ERR "Hit General Detect in Xen context\n"); -+ fatal_trap(regs, 0); -+ } -+ -+ if ( dr6 & (DR_TRAP3 | DR_TRAP2 | DR_TRAP1 | DR_TRAP0) ) - { -- /* -- * We ignore watchpoints when they trigger within Xen. This may -- * happen when a buffer is passed to us which previously had a -- * watchpoint set on it. No need to bump EIP; the only faulting -- * trap is an instruction breakpoint, which can't happen to us. -- */ -- WARN_ON(!search_exception_table(regs)); -+ unsigned int bp, dr7 = read_debugreg(7) >> DR_CONTROL_SHIFT; -+ -+ for ( bp = 0; bp < 4; ++bp ) -+ { -+ if ( (dr6 & (1u << bp)) && /* Breakpoint triggered? */ -+ ((dr7 & (3u << (bp * DR_CONTROL_SIZE))) == 0) /* Insn? */ ) -+ { -+ printk(XENLOG_ERR -+ "Hit instruction breakpoint in Xen context\n"); -+ fatal_trap(regs, 0); -+ } -+ } - } -+ -+ /* -+ * Whatever caused this #DB should be a trap. Note it and continue. -+ * Guests can trigger this in certain corner cases, so ensure the -+ * message is ratelimited. -+ */ -+ gprintk(XENLOG_WARNING, -+ "Hit #DB in Xen context: %04x:%p [%ps], stk %04x:%p, dr6 %lx\n", -+ regs->cs, _p(regs->rip), _p(regs->rip), -+ regs->ss, _p(regs->rsp), dr6); -+ - goto out; - } - diff --git a/main/xen/xsa261.patch b/main/xen/xsa261.patch deleted file mode 100644 index a51744b8d09..00000000000 --- a/main/xen/xsa261.patch +++ /dev/null @@ -1,279 +0,0 @@ -From: Xen Project Security Team <security@xenproject.org> -Subject: x86/vpt: add support for IO-APIC routed interrupts - -And modify the HPET code to make use of it. Currently HPET interrupts -are always treated as ISA and thus injected through the vPIC. This is -wrong because HPET interrupts when not in legacy mode should be -injected from the IO-APIC. 
- -To make things worse, the supported interrupt routing values are set -to [20..23], which clearly falls outside of the ISA range, thus -leading to an ASSERT in debug builds or memory corruption in non-debug -builds because the interrupt injection code will write out of the -bounds of the arch.hvm_domain.vpic array. - -Since the HPET interrupt source can change between ISA and IO-APIC -always destroy the timer before changing the mode, or else Xen risks -changing it while the timer is active. - -Note that vpt interrupt injection is racy in the sense that the -vIO-APIC RTE entry can be written by the guest in between the call to -pt_irq_masked and hvm_ioapic_assert, or the call to pt_update_irq and -pt_intr_post. Those are not deemed to be security issues, but rather -quirks of the current implementation. In the worse case the guest -might lose interrupts or get multiple interrupt vectors injected for -the same timer source. - -This is part of XSA-261. - -Address actual and potential compiler warnings. Fix formatting. - -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> ---- -Changes since v2: - - Move fallthrough comment to be just above the case label. - - Fix now stale comment in pt_update_irq. - - Use NR_ISAIRQS instead of 16. - - Expand commit message to mention the quirkiness of vpt interrupt - injection. - -Changes since v1: - - Simply usage of gsi in pt_irq_masked. - - Introduce hvm_ioapic_assert. - - Fix pt->source == PTSRC_isa in create_periodic_time. - ---- a/xen/arch/x86/hvm/hpet.c -+++ b/xen/arch/x86/hvm/hpet.c -@@ -264,13 +264,20 @@ static void hpet_set_timer(HPETState *h, - diff = (timer_is_32bit(h, tn) && (-diff > HPET_TINY_TIME_SPAN)) - ? 
(uint32_t)diff : 0; - -+ destroy_periodic_time(&h->pt[tn]); - if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) ) -+ { - /* if LegacyReplacementRoute bit is set, HPET specification requires - timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC, - timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */ - irq = (tn == 0) ? 0 : 8; -+ h->pt[tn].source = PTSRC_isa; -+ } - else -+ { - irq = timer_int_route(h, tn); -+ h->pt[tn].source = PTSRC_ioapic; -+ } - - /* - * diff is the time from now when the timer should fire, for a periodic ---- a/xen/arch/x86/hvm/irq.c -+++ b/xen/arch/x86/hvm/irq.c -@@ -41,6 +41,26 @@ static void assert_gsi(struct domain *d, - vioapic_irq_positive_edge(d, ioapic_gsi); - } - -+int hvm_ioapic_assert(struct domain *d, unsigned int gsi, bool level) -+{ -+ struct hvm_irq *hvm_irq = hvm_domain_irq(d); -+ int vector; -+ -+ if ( gsi >= hvm_irq->nr_gsis ) -+ { -+ ASSERT_UNREACHABLE(); -+ return -1; -+ } -+ -+ spin_lock(&d->arch.hvm_domain.irq_lock); -+ if ( !level || hvm_irq->gsi_assert_count[gsi]++ == 0 ) -+ assert_gsi(d, gsi); -+ vector = vioapic_get_vector(d, gsi); -+ spin_unlock(&d->arch.hvm_domain.irq_lock); -+ -+ return vector; -+} -+ - static void assert_irq(struct domain *d, unsigned ioapic_gsi, unsigned pic_irq) - { - assert_gsi(d, ioapic_gsi); ---- a/xen/arch/x86/hvm/vpt.c -+++ b/xen/arch/x86/hvm/vpt.c -@@ -107,31 +107,49 @@ static int pt_irq_vector(struct periodic - static int pt_irq_masked(struct periodic_time *pt) - { - struct vcpu *v = pt->vcpu; -- unsigned int gsi, isa_irq; -- int mask; -- uint8_t pic_imr; -+ unsigned int gsi = pt->irq; - -- if ( pt->source == PTSRC_lapic ) -+ switch ( pt->source ) -+ { -+ case PTSRC_lapic: - { - struct vlapic *vlapic = vcpu_vlapic(v); -+ - return (!vlapic_enabled(vlapic) || - (vlapic_get_reg(vlapic, APIC_LVTT) & APIC_LVT_MASKED)); - } - -- isa_irq = pt->irq; -- gsi = hvm_isa_irq_to_gsi(isa_irq); -- pic_imr = v->domain->arch.hvm_domain.vpic[isa_irq >> 3].imr; -- mask = 
vioapic_get_mask(v->domain, gsi); -- if ( mask < 0 ) -- { -- dprintk(XENLOG_WARNING, "d%u: invalid GSI (%u) for platform timer\n", -- v->domain->domain_id, gsi); -- domain_crash(v->domain); -- return -1; -+ case PTSRC_isa: -+ { -+ uint8_t pic_imr = v->domain->arch.hvm_domain.vpic[pt->irq >> 3].imr; -+ -+ /* Check if the interrupt is unmasked in the PIC. */ -+ if ( !(pic_imr & (1 << (pt->irq & 7))) && vlapic_accept_pic_intr(v) ) -+ return 0; -+ -+ gsi = hvm_isa_irq_to_gsi(pt->irq); -+ } -+ -+ /* Fallthrough to check if the interrupt is masked on the IO APIC. */ -+ case PTSRC_ioapic: -+ { -+ int mask = vioapic_get_mask(v->domain, gsi); -+ -+ if ( mask < 0 ) -+ { -+ dprintk(XENLOG_WARNING, -+ "d%d: invalid GSI (%u) for platform timer\n", -+ v->domain->domain_id, gsi); -+ domain_crash(v->domain); -+ return -1; -+ } -+ -+ return mask; -+ } - } - -- return (((pic_imr & (1 << (isa_irq & 7))) || !vlapic_accept_pic_intr(v)) && -- mask); -+ ASSERT_UNREACHABLE(); -+ return 1; - } - - static void pt_lock(struct periodic_time *pt) -@@ -252,7 +270,7 @@ int pt_update_irq(struct vcpu *v) - struct list_head *head = &v->arch.hvm_vcpu.tm_list; - struct periodic_time *pt, *temp, *earliest_pt; - uint64_t max_lag; -- int irq, is_lapic, pt_vector; -+ int irq, pt_vector = -1; - - spin_lock(&v->arch.hvm_vcpu.tm_lock); - -@@ -288,29 +306,26 @@ int pt_update_irq(struct vcpu *v) - - earliest_pt->irq_issued = 1; - irq = earliest_pt->irq; -- is_lapic = (earliest_pt->source == PTSRC_lapic); - - spin_unlock(&v->arch.hvm_vcpu.tm_lock); - -- /* -- * If periodic timer interrut is handled by lapic, its vector in -- * IRR is returned and used to set eoi_exit_bitmap for virtual -- * interrupt delivery case. Otherwise return -1 to do nothing. -- */ -- if ( is_lapic ) -+ switch ( earliest_pt->source ) - { -+ case PTSRC_lapic: -+ /* -+ * If periodic timer interrupt is handled by lapic, its vector in -+ * IRR is returned and used to set eoi_exit_bitmap for virtual -+ * interrupt delivery case. 
Otherwise return -1 to do nothing. -+ */ - vlapic_set_irq(vcpu_vlapic(v), irq, 0); - pt_vector = irq; -- } -- else -- { -+ break; -+ -+ case PTSRC_isa: - hvm_isa_irq_deassert(v->domain, irq); - if ( platform_legacy_irq(irq) && vlapic_accept_pic_intr(v) && - v->domain->arch.hvm_domain.vpic[irq >> 3].int_output ) -- { - hvm_isa_irq_assert(v->domain, irq, NULL); -- pt_vector = -1; -- } - else - { - pt_vector = hvm_isa_irq_assert(v->domain, irq, vioapic_get_vector); -@@ -321,6 +336,17 @@ int pt_update_irq(struct vcpu *v) - if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) ) - pt_vector = -1; - } -+ break; -+ -+ case PTSRC_ioapic: -+ /* -+ * NB: At the moment IO-APIC routed interrupts generated by vpt devices -+ * (HPET) are edge-triggered. -+ */ -+ pt_vector = hvm_ioapic_assert(v->domain, irq, false); -+ if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) ) -+ pt_vector = -1; -+ break; - } - - return pt_vector; -@@ -418,7 +444,14 @@ void create_periodic_time( - struct vcpu *v, struct periodic_time *pt, uint64_t delta, - uint64_t period, uint8_t irq, time_cb *cb, void *data) - { -- ASSERT(pt->source != 0); -+ if ( !pt->source || -+ (pt->irq >= NR_ISAIRQS && pt->source == PTSRC_isa) || -+ (pt->irq >= hvm_domain_irq(v->domain)->nr_gsis && -+ pt->source == PTSRC_ioapic) ) -+ { -+ ASSERT_UNREACHABLE(); -+ return; -+ } - - destroy_periodic_time(pt); - -@@ -498,7 +531,7 @@ static void pt_adjust_vcpu(struct period - { - int on_list; - -- ASSERT(pt->source == PTSRC_isa); -+ ASSERT(pt->source == PTSRC_isa || pt->source == PTSRC_ioapic); - - if ( pt->vcpu == NULL ) - return; ---- a/xen/include/asm-x86/hvm/irq.h -+++ b/xen/include/asm-x86/hvm/irq.h -@@ -207,6 +207,9 @@ int hvm_set_pci_link_route(struct domain - - int hvm_inject_msi(struct domain *d, uint64_t addr, uint32_t data); - -+/* Assert an IO APIC pin. 
*/ -+int hvm_ioapic_assert(struct domain *d, unsigned int gsi, bool level); -+ - void hvm_maybe_deassert_evtchn_irq(void); - void hvm_assert_evtchn_irq(struct vcpu *v); - void hvm_set_callback_via(struct domain *d, uint64_t via); ---- a/xen/include/asm-x86/hvm/vpt.h -+++ b/xen/include/asm-x86/hvm/vpt.h -@@ -44,6 +44,7 @@ struct periodic_time { - bool_t warned_timeout_too_short; - #define PTSRC_isa 1 /* ISA time source */ - #define PTSRC_lapic 2 /* LAPIC time source */ -+#define PTSRC_ioapic 3 /* IOAPIC time source */ - u8 source; /* PTSRC_ */ - u8 irq; - struct vcpu *vcpu; /* vcpu timer interrupt delivers to */ diff --git a/main/xen/xsa262-4.10.patch b/main/xen/xsa262-4.10.patch deleted file mode 100644 index ba9a8ffa22f..00000000000 --- a/main/xen/xsa262-4.10.patch +++ /dev/null @@ -1,76 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: x86/HVM: guard against emulator driving ioreq state in weird ways - -In the case where hvm_wait_for_io() calls wait_on_xen_event_channel(), -p->state ends up being read twice in succession: once to determine that -state != p->state, and then again at the top of the loop. This gives a -compromised emulator a chance to change the state back between the two -reads, potentially keeping Xen in a loop indefinitely. - -Instead: -* Read p->state once in each of the wait_on_xen_event_channel() tests, -* re-use that value the next time around, -* and insist that the states continue to transition "forward" (with the - exception of the transition to STATE_IOREQ_NONE). - -This is XSA-262. 
- -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: George Dunlap <george.dunlap@citrix.com> - ---- a/xen/arch/x86/hvm/ioreq.c -+++ b/xen/arch/x86/hvm/ioreq.c -@@ -87,14 +87,17 @@ static void hvm_io_assist(struct hvm_ior - - static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p) - { -+ unsigned int prev_state = STATE_IOREQ_NONE; -+ - while ( sv->pending ) - { - unsigned int state = p->state; - -- rmb(); -- switch ( state ) -+ smp_rmb(); -+ -+ recheck: -+ if ( unlikely(state == STATE_IOREQ_NONE) ) - { -- case STATE_IOREQ_NONE: - /* - * The only reason we should see this case is when an - * emulator is dying and it races with an I/O being -@@ -102,14 +105,30 @@ static bool hvm_wait_for_io(struct hvm_i - */ - hvm_io_assist(sv, ~0ul); - break; -+ } -+ -+ if ( unlikely(state < prev_state) ) -+ { -+ gdprintk(XENLOG_ERR, "Weird HVM ioreq state transition %u -> %u\n", -+ prev_state, state); -+ sv->pending = false; -+ domain_crash(sv->vcpu->domain); -+ return false; /* bail */ -+ } -+ -+ switch ( prev_state = state ) -+ { - case STATE_IORESP_READY: /* IORESP_READY -> NONE */ - p->state = STATE_IOREQ_NONE; - hvm_io_assist(sv, p->data); - break; - case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */ - case STATE_IOREQ_INPROCESS: -- wait_on_xen_event_channel(sv->ioreq_evtchn, p->state != state); -- break; -+ wait_on_xen_event_channel(sv->ioreq_evtchn, -+ ({ state = p->state; -+ smp_rmb(); -+ state != prev_state; })); -+ goto recheck; - default: - gdprintk(XENLOG_ERR, "Weird HVM iorequest state %u\n", state); - sv->pending = false; |