diff options
author | omni <omni+alpine@hack.org> | 2024-03-18 23:54:19 +0000 |
---|---|---|
committer | omni <omni+alpine@hack.org> | 2024-03-19 00:00:06 +0000 |
commit | 7bf0401d2127ae56b44e9a667fcc60191f37af7c (patch) | |
tree | 5a67e444f491dd637b6cbcb8fbfdd5ebbc928b7e | |
parent | d453d8a898509dbc616c4d467a9bf537b785ca13 (diff) |
main/xen: add mitigations for XSA-451, XSA-452 & XSA-453 3.16-stable
-rw-r--r-- | main/xen/APKBUILD | 39 | ||||
-rw-r--r-- | main/xen/xsa451-4.16.patch | 193 | ||||
-rw-r--r-- | main/xen/xsa452-4.16-1.patch | 304 | ||||
-rw-r--r-- | main/xen/xsa452-4.16-2.patch | 87 | ||||
-rw-r--r-- | main/xen/xsa452-4.16-3.patch | 135 | ||||
-rw-r--r-- | main/xen/xsa452-4.16-4.patch | 197 | ||||
-rw-r--r-- | main/xen/xsa452-4.16-5.patch | 237 | ||||
-rw-r--r-- | main/xen/xsa452-4.16-6.patch | 163 | ||||
-rw-r--r-- | main/xen/xsa452-4.16-7.patch | 299 | ||||
-rw-r--r-- | main/xen/xsa453-4.16-1.patch | 148 | ||||
-rw-r--r-- | main/xen/xsa453-4.16-2.patch | 49 | ||||
-rw-r--r-- | main/xen/xsa453-4.16-3.patch | 313 | ||||
-rw-r--r-- | main/xen/xsa453-4.16-4.patch | 113 | ||||
-rw-r--r-- | main/xen/xsa453-4.16-5.patch | 75 | ||||
-rw-r--r-- | main/xen/xsa453-4.16-6.patch | 382 | ||||
-rw-r--r-- | main/xen/xsa453-4.16-7.patch | 61 | ||||
-rw-r--r-- | main/xen/xsa453-4.16-8.patch | 201 |
17 files changed, 2995 insertions, 1 deletions
diff --git a/main/xen/APKBUILD b/main/xen/APKBUILD index bbf9359e7ed..6f385ecada9 100644 --- a/main/xen/APKBUILD +++ b/main/xen/APKBUILD @@ -2,7 +2,7 @@ # Maintainer: Natanael Copa <ncopa@alpinelinux.org> pkgname=xen pkgver=4.16.5 -pkgrel=6 +pkgrel=7 pkgdesc="Xen hypervisor" url="https://www.xenproject.org/" arch="x86_64 armv7 aarch64" # enable armv7 when builds with gcc8 @@ -45,6 +45,7 @@ options="!strip" # 0: # - CVE-2020-29568 XSA-349 # - CVE-2020-29569 XSA-350 +# - CVE-2023-46840 XSA-450 # 4.7.0-r0: # - CVE-2016-6258 XSA-182 # - CVE-2016-6259 XSA-183 @@ -354,6 +355,10 @@ options="!strip" # - CVE-2023-46837 XSA-447 # 4.16.5-r6: # - CVE-2023-46839 XSA-449 +# 4.16.5-r7: +# - CVE-2023-46841 XSA-451 +# - CVE-2023-28746 XSA-452 +# - CVE-2024-2193 XSA-453 case "$CARCH" in x86*) @@ -419,6 +424,22 @@ source="https://downloads.xenproject.org/release/xen/$pkgver/xen-$pkgver.tar.gz xsa446.patch xsa447-4.16.patch xsa449-4.16.patch + xsa451-4.16.patch + xsa452-4.16-1.patch + xsa452-4.16-2.patch + xsa452-4.16-3.patch + xsa452-4.16-4.patch + xsa452-4.16-5.patch + xsa452-4.16-6.patch + xsa452-4.16-7.patch + xsa453-4.16-1.patch + xsa453-4.16-2.patch + xsa453-4.16-3.patch + xsa453-4.16-4.patch + xsa453-4.16-5.patch + xsa453-4.16-6.patch + xsa453-4.16-7.patch + xsa453-4.16-8.patch mini-os-__divmoddi4.patch qemu-xen_paths.patch @@ -725,6 +746,22 @@ f3694e355ae921528591dfec79c187eb22890f5a294ad2f4d4e96ed0476aa290c9a993f30a51e2ca 229319de61f83d98b41ff7bf8ac944f7d5283f190ae54ed01087409b2cf42c141455b2a56c28898288db85780587803670671c1f5f446359a1d9767259f975d5 xsa446.patch 98ac1fd6b2755e4034d70f283253eb18011b81ecb78f6507629ff8144faf422008ec6c603b6f9727bc752f57f7d09f9fce3cde3127b006c6e4ef0aeab319647f xsa447-4.16.patch 4baf6b93eaa46b90a5784502857fcbe271a06bf433d2b58b47c6777e3bc0860d94bb4c4e8e93b9cb9295c475e6d030fd59b4d9b7efd57ad087108650c5022656 xsa449-4.16.patch 
+8a8228b9da87b4d10217c3d7a8091655feee5d43c7a370ea869e4cf9f7b5c679ef56ec530fa49dbac5b607c24687bd8810b33a040734f5750aa04917f6a8e250 xsa451-4.16.patch +64c8bf4350955dc2ef052713415d7dace6c7afcc35d18b9a0d648e25de4cb02783fcec3828fad5c33d2e482ca42e13936c14a8274ac34f0f9130905c8f010b3f xsa452-4.16-1.patch +16aaabfec8abf90c416c86c123c8f657c6d9e3f5a40cf558fe4b207f9fc6e95253697c0a9f4a8c9332782bd627de10bd8c930b2c31a93d02717795c36e36e4b1 xsa452-4.16-2.patch +a81677105f358974f8709fbee70d0ba16eee64d4261de69f012bc0ec33955cf2f76f4fc11a4a027ac6552d7b448fc18a5fd4f9bb58bcab819b9492376254a3c1 xsa452-4.16-3.patch +8344d55fea823ec8711d27fb6188ee0e8a613383f814a0f83046c7e822a72e6583cb94c0e7536c47d9a9209805492fa6a99505ae925f60f0ac68911a4fd06e54 xsa452-4.16-4.patch +dfe195a53d90ee4fd1e44ba5ab270ac01d8a241f279fe9c1c0977a0e5c3d4c1792ab9fa20bd37ece7fd494d00d9009f8a1922c3438b08eed852982e88ed85638 xsa452-4.16-5.patch +6c045a64ff119ec5e90cad0974230aca7cc94e7990a85136ec0fd38a051f6179139d75de95766d36e6d6330f48999721d37ba7102a2eab2850fff9c56b236abb xsa452-4.16-6.patch +f196f4e3877093d2892106778aaa61d318a65392adde7d38a8256ccf5e76eab4f8d833e3f7f119611598b26c190f197e85bd97ecf7ad73d362ec4e2b8302e37a xsa452-4.16-7.patch +23abb30d679285fff2687a0183425c30aa5214026b0d061d079911a471ffa823d48d6e3929e9180c583db8eab03bf871236b94c5ebbe9b94f016896474734abc xsa453-4.16-1.patch +2bf1ec8e49347f2fb305e1bb8d6b6be98b409409c93ce0d6a83751aea681be1548200acb111c242767033489595c040ffff32b31ae5e03dce21e5916cc59b470 xsa453-4.16-2.patch +a8be0ffce584d408ae865679ea131e76157ee73e83c878b3a36aa795cb75ef63b6c6ab74fd54f4b0f283f557299d4eb9d2dae4f526c3dd8b9b0c0ea4e4fd369e xsa453-4.16-3.patch +3c0cc89564ae76714304041a67612fe035617f7709c62a3d069c66b09228fa611881cedf81b7d9a405c9410477796cdfe62399b7f95790789b708b05311df912 xsa453-4.16-4.patch +5004450f8446c4e7dc9b3c40a5140aae9779a1cf36be1465b6e46a3b6b433acecc248b557713771129c6ec7a06080330bb9e9f4055f9787105ec2e118efbe893 xsa453-4.16-5.patch 
+ed7c23e0ae1ab9df5c6ddc581110345208f924071fa012f5471991fa26f35245c7c8cbb6d822d5c98a8f5a8bcf0b267243429b9b6e416201898a78a152de1d23 xsa453-4.16-6.patch +3559110c1f4b26b0714cdd0d33f644262e331d3fc6bf29377ad85194a74e2efdb58841cfe9f6b2932a853c3ff702d0e38e0bc9c6e25aa65c7236b236f17ebdd1 xsa453-4.16-7.patch +cf01db150adc7d737161cd9c2c607e85e05b0d5d74371b1bcbc2e87d52f38adaa17bb798dacfd690e63de2cf165421847c722eed0eef5474565b3fdd1c6b25e6 xsa453-4.16-8.patch 2e0b0fd23e6f10742a5517981e5171c6e88b0a93c83da701b296f5c0861d72c19782daab589a7eac3f9032152a0fc7eff7f5362db8fccc4859564a9aa82329cf gmp-4.3.2.tar.bz2 c2bc9ffc8583aeae71cee9ddcc4418969768d4e3764d47307da54f93981c0109fb07d84b061b3a3628bd00ba4d14a54742bc04848110eb3ae8ca25dbfbaabadb grub-0.97.tar.gz 1465b58279af1647f909450e394fe002ca165f0ff4a0254bfa9fe0e64316f50facdde2729d79a4e632565b4500cf4d6c74192ac0dd3bc9fe09129bbd67ba089d lwip-1.3.0.tar.gz diff --git a/main/xen/xsa451-4.16.patch b/main/xen/xsa451-4.16.patch new file mode 100644 index 00000000000..540ae0e0e5c --- /dev/null +++ b/main/xen/xsa451-4.16.patch @@ -0,0 +1,193 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: x86: account for shadow stack in exception-from-stub recovery + +Dealing with exceptions raised from within emulation stubs involves +discarding return address (replaced by exception related information). +Such discarding of course also requires removing the corresponding entry +from the shadow stack. + +Also amend the comment in fixup_exception_return(), to further clarify +why use of ptr[1] can't be an out-of-bounds access. + +This is CVE-2023-46841 / XSA-451. 
+ +Fixes: 209fb9919b50 ("x86/extable: Adjust extable handling to be shadow stack compatible") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> + +--- a/xen/arch/x86/extable.c ++++ b/xen/arch/x86/extable.c +@@ -86,26 +86,29 @@ search_one_extable(const struct exceptio + } + + unsigned long +-search_exception_table(const struct cpu_user_regs *regs) ++search_exception_table(const struct cpu_user_regs *regs, unsigned long *stub_ra) + { + const struct virtual_region *region = find_text_region(regs->rip); + unsigned long stub = this_cpu(stubs.addr); + + if ( region && region->ex ) ++ { ++ *stub_ra = 0; + return search_one_extable(region->ex, region->ex_end, regs->rip); ++ } + + if ( regs->rip >= stub + STUB_BUF_SIZE / 2 && + regs->rip < stub + STUB_BUF_SIZE && + regs->rsp > (unsigned long)regs && + regs->rsp < (unsigned long)get_cpu_info() ) + { +- unsigned long retptr = *(unsigned long *)regs->rsp; ++ unsigned long retaddr = *(unsigned long *)regs->rsp, fixup; + +- region = find_text_region(retptr); +- retptr = region && region->ex +- ? search_one_extable(region->ex, region->ex_end, retptr) +- : 0; +- if ( retptr ) ++ region = find_text_region(retaddr); ++ fixup = region && region->ex ++ ? search_one_extable(region->ex, region->ex_end, retaddr) ++ : 0; ++ if ( fixup ) + { + /* + * Put trap number and error code on the stack (in place of the +@@ -117,7 +120,8 @@ search_exception_table(const struct cpu_ + }; + + *(unsigned long *)regs->rsp = token.raw; +- return retptr; ++ *stub_ra = retaddr; ++ return fixup; + } + } + +--- a/xen/arch/x86/traps.c ++++ b/xen/arch/x86/traps.c +@@ -895,7 +895,7 @@ static void do_reserved_trap(struct cpu_ + } + + static void fixup_exception_return(struct cpu_user_regs *regs, +- unsigned long fixup) ++ unsigned long fixup, unsigned long stub_ra) + { + if ( IS_ENABLED(CONFIG_XEN_SHSTK) ) + { +@@ -912,7 +912,8 @@ static void fixup_exception_return(struc + /* + * Search for %rip. 
The shstk currently looks like this: + * +- * ... [Likely pointed to by SSP] ++ * tok [Supervisor token, == &tok | BUSY, only with FRED inactive] ++ * ... [Pointed to by SSP for most exceptions, empty in IST cases] + * %cs [== regs->cs] + * %rip [== regs->rip] + * SSP [Likely points to 3 slots higher, above %cs] +@@ -930,7 +931,56 @@ static void fixup_exception_return(struc + */ + if ( ptr[0] == regs->rip && ptr[1] == regs->cs ) + { ++ unsigned long primary_shstk = ++ (ssp & ~(STACK_SIZE - 1)) + ++ (PRIMARY_SHSTK_SLOT + 1) * PAGE_SIZE - 8; ++ + wrss(fixup, ptr); ++ ++ if ( !stub_ra ) ++ goto shstk_done; ++ ++ /* ++ * Stub recovery ought to happen only when the outer context ++ * was on the main shadow stack. We need to also "pop" the ++ * stub's return address from the interrupted context's shadow ++ * stack. That is, ++ * - if we're still on the main stack, we need to move the ++ * entire stack (up to and including the exception frame) ++ * up by one slot, incrementing the original SSP in the ++ * exception frame, ++ * - if we're on an IST stack, we need to increment the ++ * original SSP. ++ */ ++ BUG_ON((ptr[-1] ^ primary_shstk) >> PAGE_SHIFT); ++ ++ if ( (ssp ^ primary_shstk) >> PAGE_SHIFT ) ++ { ++ /* ++ * We're on an IST stack. First make sure the two return ++ * addresses actually match. Then increment the interrupted ++ * context's SSP. ++ */ ++ BUG_ON(stub_ra != *(unsigned long*)ptr[-1]); ++ wrss(ptr[-1] + 8, &ptr[-1]); ++ goto shstk_done; ++ } ++ ++ /* Make sure the two return addresses actually match. */ ++ BUG_ON(stub_ra != ptr[2]); ++ ++ /* Move exception frame, updating SSP there. */ ++ wrss(ptr[1], &ptr[2]); /* %cs */ ++ wrss(ptr[0], &ptr[1]); /* %rip */ ++ wrss(ptr[-1] + 8, &ptr[0]); /* SSP */ ++ ++ /* Move all newer entries. */ ++ while ( --ptr != _p(ssp) ) ++ wrss(ptr[-1], &ptr[0]); ++ ++ /* Finally account for our own stack having shifted up. 
*/ ++ asm volatile ( "incsspd %0" :: "r" (2) ); ++ + goto shstk_done; + } + } +@@ -951,7 +1001,8 @@ static void fixup_exception_return(struc + + static bool extable_fixup(struct cpu_user_regs *regs, bool print) + { +- unsigned long fixup = search_exception_table(regs); ++ unsigned long stub_ra = 0; ++ unsigned long fixup = search_exception_table(regs, &stub_ra); + + if ( unlikely(fixup == 0) ) + return false; +@@ -965,7 +1016,7 @@ static bool extable_fixup(struct cpu_use + vec_name(regs->entry_vector), regs->error_code, + _p(regs->rip), _p(regs->rip), _p(fixup)); + +- fixup_exception_return(regs, fixup); ++ fixup_exception_return(regs, fixup, stub_ra); + this_cpu(last_extable_addr) = regs->rip; + + return true; +@@ -1256,7 +1307,7 @@ void do_invalid_op(struct cpu_user_regs + void (*fn)(struct cpu_user_regs *) = bug_ptr(bug); + + fn(regs); +- fixup_exception_return(regs, (unsigned long)eip); ++ fixup_exception_return(regs, (unsigned long)eip, 0); + return; + } + +@@ -1277,7 +1328,7 @@ void do_invalid_op(struct cpu_user_regs + case BUGFRAME_warn: + printk("Xen WARN at %s%s:%d\n", prefix, filename, lineno); + show_execution_state(regs); +- fixup_exception_return(regs, (unsigned long)eip); ++ fixup_exception_return(regs, (unsigned long)eip, 0); + return; + + case BUGFRAME_bug: +--- a/xen/include/asm-x86/uaccess.h ++++ b/xen/include/asm-x86/uaccess.h +@@ -421,7 +421,8 @@ union stub_exception_token { + unsigned long raw; + }; + +-extern unsigned long search_exception_table(const struct cpu_user_regs *regs); ++extern unsigned long search_exception_table(const struct cpu_user_regs *regs, ++ unsigned long *stub_ra); + extern void sort_exception_tables(void); + extern void sort_exception_table(struct exception_table_entry *start, + const struct exception_table_entry *stop); diff --git a/main/xen/xsa452-4.16-1.patch b/main/xen/xsa452-4.16-1.patch new file mode 100644 index 00000000000..d86b5eff8e1 --- /dev/null +++ b/main/xen/xsa452-4.16-1.patch @@ -0,0 +1,304 @@ +From: 
Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/entry: Introduce EFRAME_* constants + +restore_all_guest() does a lot of manipulation of the stack after popping the +GPRs, and uses raw %rsp displacements to do so. Also, almost all entrypaths +use raw %rsp displacements prior to pushing GPRs. + +Provide better mnemonics, to aid readability and reduce the chance of errors +when editing. + +No functional change. The resulting binary is identical. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +(cherry picked from commit 37541208f119a9c552c6c6c3246ea61be0d44035) + +diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c +index 287dac101ad4..31fa63b77fd1 100644 +--- a/xen/arch/x86/x86_64/asm-offsets.c ++++ b/xen/arch/x86/x86_64/asm-offsets.c +@@ -51,6 +51,23 @@ void __dummy__(void) + OFFSET(UREGS_kernel_sizeof, struct cpu_user_regs, es); + BLANK(); + ++ /* ++ * EFRAME_* is for the entry/exit logic where %rsp is pointing at ++ * UREGS_error_code and GPRs are still/already guest values. 
++ */ ++#define OFFSET_EF(sym, mem) \ ++ DEFINE(sym, offsetof(struct cpu_user_regs, mem) - \ ++ offsetof(struct cpu_user_regs, error_code)) ++ ++ OFFSET_EF(EFRAME_entry_vector, entry_vector); ++ OFFSET_EF(EFRAME_rip, rip); ++ OFFSET_EF(EFRAME_cs, cs); ++ OFFSET_EF(EFRAME_eflags, eflags); ++ OFFSET_EF(EFRAME_rsp, rsp); ++ BLANK(); ++ ++#undef OFFSET_EF ++ + OFFSET(VCPU_processor, struct vcpu, processor); + OFFSET(VCPU_domain, struct vcpu, domain); + OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info); +diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S +index 253bb1688c4f..7c211314d885 100644 +--- a/xen/arch/x86/x86_64/compat/entry.S ++++ b/xen/arch/x86/x86_64/compat/entry.S +@@ -15,7 +15,7 @@ ENTRY(entry_int82) + ENDBR64 + ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP + pushq $0 +- movl $HYPERCALL_VECTOR, 4(%rsp) ++ movl $HYPERCALL_VECTOR, EFRAME_entry_vector(%rsp) + SAVE_ALL compat=1 /* DPL1 gate, restricted to 32bit PV guests only. */ + + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ +diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S +index 837a31b40524..10f11986d8b9 100644 +--- a/xen/arch/x86/x86_64/entry.S ++++ b/xen/arch/x86/x86_64/entry.S +@@ -190,15 +190,15 @@ restore_all_guest: + SPEC_CTRL_EXIT_TO_PV /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ + + RESTORE_ALL +- testw $TRAP_syscall,4(%rsp) ++ testw $TRAP_syscall, EFRAME_entry_vector(%rsp) + jz iret_exit_to_guest + +- movq 24(%rsp),%r11 # RFLAGS ++ mov EFRAME_eflags(%rsp), %r11 + andq $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), %r11 + orq $X86_EFLAGS_IF,%r11 + + /* Don't use SYSRET path if the return address is not canonical. 
*/ +- movq 8(%rsp),%rcx ++ mov EFRAME_rip(%rsp), %rcx + sarq $47,%rcx + incl %ecx + cmpl $1,%ecx +@@ -213,20 +213,20 @@ restore_all_guest: + ALTERNATIVE "", rag_clrssbsy, X86_FEATURE_XEN_SHSTK + #endif + +- movq 8(%rsp), %rcx # RIP +- cmpw $FLAT_USER_CS32,16(%rsp)# CS +- movq 32(%rsp),%rsp # RSP ++ mov EFRAME_rip(%rsp), %rcx ++ cmpw $FLAT_USER_CS32, EFRAME_cs(%rsp) ++ mov EFRAME_rsp(%rsp), %rsp + je 1f + sysretq + 1: sysretl + + ALIGN + .Lrestore_rcx_iret_exit_to_guest: +- movq 8(%rsp), %rcx # RIP ++ mov EFRAME_rip(%rsp), %rcx + /* No special register assumptions. */ + iret_exit_to_guest: +- andl $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), 24(%rsp) +- orl $X86_EFLAGS_IF,24(%rsp) ++ andl $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), EFRAME_eflags(%rsp) ++ orl $X86_EFLAGS_IF, EFRAME_eflags(%rsp) + addq $8,%rsp + .Lft0: iretq + _ASM_PRE_EXTABLE(.Lft0, handle_exception) +@@ -257,7 +257,7 @@ ENTRY(lstar_enter) + pushq $FLAT_KERNEL_CS64 + pushq %rcx + pushq $0 +- movl $TRAP_syscall, 4(%rsp) ++ movl $TRAP_syscall, EFRAME_entry_vector(%rsp) + SAVE_ALL + + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ +@@ -294,7 +294,7 @@ ENTRY(cstar_enter) + pushq $FLAT_USER_CS32 + pushq %rcx + pushq $0 +- movl $TRAP_syscall, 4(%rsp) ++ movl $TRAP_syscall, EFRAME_entry_vector(%rsp) + SAVE_ALL + + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ +@@ -335,7 +335,7 @@ GLOBAL(sysenter_eflags_saved) + pushq $3 /* ring 3 null cs */ + pushq $0 /* null rip */ + pushq $0 +- movl $TRAP_syscall, 4(%rsp) ++ movl $TRAP_syscall, EFRAME_entry_vector(%rsp) + SAVE_ALL + + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ +@@ -389,7 +389,7 @@ ENTRY(int80_direct_trap) + ENDBR64 + ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP + pushq $0 +- movl $0x80, 4(%rsp) ++ movl $0x80, EFRAME_entry_vector(%rsp) + SAVE_ALL + + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ +@@ -707,7 +707,7 @@ ENTRY(common_interrupt) + + ENTRY(page_fault) + 
ENDBR64 +- movl $TRAP_page_fault,4(%rsp) ++ movl $TRAP_page_fault, EFRAME_entry_vector(%rsp) + /* No special register assumptions. */ + GLOBAL(handle_exception) + ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP +@@ -849,90 +849,90 @@ FATAL_exception_with_ints_disabled: + ENTRY(divide_error) + ENDBR64 + pushq $0 +- movl $TRAP_divide_error,4(%rsp) ++ movl $TRAP_divide_error, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(coprocessor_error) + ENDBR64 + pushq $0 +- movl $TRAP_copro_error,4(%rsp) ++ movl $TRAP_copro_error, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(simd_coprocessor_error) + ENDBR64 + pushq $0 +- movl $TRAP_simd_error,4(%rsp) ++ movl $TRAP_simd_error, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(device_not_available) + ENDBR64 + pushq $0 +- movl $TRAP_no_device,4(%rsp) ++ movl $TRAP_no_device, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(debug) + ENDBR64 + pushq $0 +- movl $TRAP_debug,4(%rsp) ++ movl $TRAP_debug, EFRAME_entry_vector(%rsp) + jmp handle_ist_exception + + ENTRY(int3) + ENDBR64 + pushq $0 +- movl $TRAP_int3,4(%rsp) ++ movl $TRAP_int3, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(overflow) + ENDBR64 + pushq $0 +- movl $TRAP_overflow,4(%rsp) ++ movl $TRAP_overflow, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(bounds) + ENDBR64 + pushq $0 +- movl $TRAP_bounds,4(%rsp) ++ movl $TRAP_bounds, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(invalid_op) + ENDBR64 + pushq $0 +- movl $TRAP_invalid_op,4(%rsp) ++ movl $TRAP_invalid_op, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(invalid_TSS) + ENDBR64 +- movl $TRAP_invalid_tss,4(%rsp) ++ movl $TRAP_invalid_tss, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(segment_not_present) + ENDBR64 +- movl $TRAP_no_segment,4(%rsp) ++ movl $TRAP_no_segment, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(stack_segment) + ENDBR64 +- movl $TRAP_stack_error,4(%rsp) ++ movl $TRAP_stack_error, 
EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(general_protection) + ENDBR64 +- movl $TRAP_gp_fault,4(%rsp) ++ movl $TRAP_gp_fault, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(alignment_check) + ENDBR64 +- movl $TRAP_alignment_check,4(%rsp) ++ movl $TRAP_alignment_check, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(entry_CP) + ENDBR64 +- movl $X86_EXC_CP, 4(%rsp) ++ movl $X86_EXC_CP, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(double_fault) + ENDBR64 +- movl $TRAP_double_fault,4(%rsp) ++ movl $TRAP_double_fault, EFRAME_entry_vector(%rsp) + /* Set AC to reduce chance of further SMAP faults */ + ALTERNATIVE "", stac, X86_FEATURE_XEN_SMAP + SAVE_ALL +@@ -958,7 +958,7 @@ ENTRY(double_fault) + .pushsection .init.text, "ax", @progbits + ENTRY(early_page_fault) + ENDBR64 +- movl $TRAP_page_fault,4(%rsp) ++ movl $TRAP_page_fault, EFRAME_entry_vector(%rsp) + SAVE_ALL + movq %rsp,%rdi + call do_early_page_fault +@@ -968,7 +968,7 @@ ENTRY(early_page_fault) + ENTRY(nmi) + ENDBR64 + pushq $0 +- movl $TRAP_nmi,4(%rsp) ++ movl $TRAP_nmi, EFRAME_entry_vector(%rsp) + handle_ist_exception: + ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP + SAVE_ALL +@@ -1075,7 +1075,7 @@ handle_ist_exception: + ENTRY(machine_check) + ENDBR64 + pushq $0 +- movl $TRAP_machine_check,4(%rsp) ++ movl $TRAP_machine_check, EFRAME_entry_vector(%rsp) + jmp handle_ist_exception + + /* No op trap handler. Required for kexec crash path. */ +@@ -1112,7 +1112,7 @@ autogen_stubs: /* Automatically generated stubs. */ + 1: + ENDBR64 + pushq $0 +- movb $vec,4(%rsp) ++ movb $vec, EFRAME_entry_vector(%rsp) + jmp common_interrupt + + entrypoint 1b +@@ -1126,7 +1126,7 @@ autogen_stubs: /* Automatically generated stubs. */ + test $8,%spl /* 64bit exception frames are 16 byte aligned, but the word */ + jz 2f /* size is 8 bytes. Check whether the processor gave us an */ + pushq $0 /* error code, and insert an empty one if not. 
*/ +-2: movb $vec,4(%rsp) ++2: movb $vec, EFRAME_entry_vector(%rsp) + jmp handle_exception + + entrypoint 1b diff --git a/main/xen/xsa452-4.16-2.patch b/main/xen/xsa452-4.16-2.patch new file mode 100644 index 00000000000..5df731528a4 --- /dev/null +++ b/main/xen/xsa452-4.16-2.patch @@ -0,0 +1,87 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/cpu-policy: Allow for levelling of VERW side effects + +MD_CLEAR and FB_CLEAR need OR-ing across a migrate pool. Allow this, by +having them unconditinally set in max, with the host values reflected in +default. Annotate the bits as having special properies. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +(cherry picked from commit de17162cafd27f2865a3102a2ec0f386a02ed03d) + +diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c +index f38063b667b0..34f778dbafbb 100644 +--- a/xen/arch/x86/cpu-policy.c ++++ b/xen/arch/x86/cpu-policy.c +@@ -434,6 +434,16 @@ static void __init guest_common_max_feature_adjustments(uint32_t *fs) + __set_bit(X86_FEATURE_RSBA, fs); + __set_bit(X86_FEATURE_RRSBA, fs); + ++ /* ++ * These bits indicate that the VERW instruction may have gained ++ * scrubbing side effects. With pooling, they mean "you might migrate ++ * somewhere where scrubbing is necessary", and may need exposing on ++ * unaffected hardware. This is fine, because the VERW instruction ++ * has been around since the 286. ++ */ ++ __set_bit(X86_FEATURE_MD_CLEAR, fs); ++ __set_bit(X86_FEATURE_FB_CLEAR, fs); ++ + /* + * The Gather Data Sampling microcode mitigation (August 2023) has an + * adverse performance impact on the CLWB instruction on SKX/CLX/CPX. 
+@@ -468,6 +478,20 @@ static void __init guest_common_default_feature_adjustments(uint32_t *fs) + cpu_has_rdrand && !is_forced_cpu_cap(X86_FEATURE_RDRAND) ) + __clear_bit(X86_FEATURE_RDRAND, fs); + ++ /* ++ * These bits indicate that the VERW instruction may have gained ++ * scrubbing side effects. The max policy has them set for migration ++ * reasons, so reset the default policy back to the host values in ++ * case we're unaffected. ++ */ ++ __clear_bit(X86_FEATURE_MD_CLEAR, fs); ++ if ( cpu_has_md_clear ) ++ __set_bit(X86_FEATURE_MD_CLEAR, fs); ++ ++ __clear_bit(X86_FEATURE_FB_CLEAR, fs); ++ if ( cpu_has_fb_clear ) ++ __set_bit(X86_FEATURE_FB_CLEAR, fs); ++ + /* + * The Gather Data Sampling microcode mitigation (August 2023) has an + * adverse performance impact on the CLWB instruction on SKX/CLX/CPX. +diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h +index 1ac3d3a1f946..81ac4d76eea6 100644 +--- a/xen/include/asm-x86/cpufeature.h ++++ b/xen/include/asm-x86/cpufeature.h +@@ -134,6 +134,7 @@ + #define cpu_has_avx512_4fmaps boot_cpu_has(X86_FEATURE_AVX512_4FMAPS) + #define cpu_has_avx512_vp2intersect boot_cpu_has(X86_FEATURE_AVX512_VP2INTERSECT) + #define cpu_has_srbds_ctrl boot_cpu_has(X86_FEATURE_SRBDS_CTRL) ++#define cpu_has_md_clear boot_cpu_has(X86_FEATURE_MD_CLEAR) + #define cpu_has_rtm_always_abort boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) + #define cpu_has_tsx_force_abort boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT) + #define cpu_has_serialize boot_cpu_has(X86_FEATURE_SERIALIZE) +diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h +index 0ee1d1d90330..2906eaa6c290 100644 +--- a/xen/include/public/arch-x86/cpufeatureset.h ++++ b/xen/include/public/arch-x86/cpufeatureset.h +@@ -275,7 +275,7 @@ XEN_CPUFEATURE(AVX512_4FMAPS, 9*32+ 3) /*A AVX512 Multiply Accumulation Single + XEN_CPUFEATURE(FSRM, 9*32+ 4) /*A Fast Short REP MOVS */ + XEN_CPUFEATURE(AVX512_VP2INTERSECT, 9*32+8) /*a 
VP2INTERSECT{D,Q} insns */ + XEN_CPUFEATURE(SRBDS_CTRL, 9*32+ 9) /* MSR_MCU_OPT_CTRL and RNGDS_MITG_DIS. */ +-XEN_CPUFEATURE(MD_CLEAR, 9*32+10) /*A VERW clears microarchitectural buffers */ ++XEN_CPUFEATURE(MD_CLEAR, 9*32+10) /*!A VERW clears microarchitectural buffers */ + XEN_CPUFEATURE(RTM_ALWAYS_ABORT, 9*32+11) /*! June 2021 TSX defeaturing in microcode. */ + XEN_CPUFEATURE(TSX_FORCE_ABORT, 9*32+13) /* MSR_TSX_FORCE_ABORT.RTM_ABORT */ + XEN_CPUFEATURE(SERIALIZE, 9*32+14) /*a SERIALIZE insn */ +@@ -329,7 +329,7 @@ XEN_CPUFEATURE(DOITM, 16*32+12) /* Data Operand Invariant Timing + XEN_CPUFEATURE(SBDR_SSDP_NO, 16*32+13) /*A No Shared Buffer Data Read or Sideband Stale Data Propagation */ + XEN_CPUFEATURE(FBSDP_NO, 16*32+14) /*A No Fill Buffer Stale Data Propagation */ + XEN_CPUFEATURE(PSDP_NO, 16*32+15) /*A No Primary Stale Data Propagation */ +-XEN_CPUFEATURE(FB_CLEAR, 16*32+17) /*A Fill Buffers cleared by VERW */ ++XEN_CPUFEATURE(FB_CLEAR, 16*32+17) /*!A Fill Buffers cleared by VERW */ + XEN_CPUFEATURE(FB_CLEAR_CTRL, 16*32+18) /* MSR_OPT_CPU_CTRL.FB_CLEAR_DIS */ + XEN_CPUFEATURE(RRSBA, 16*32+19) /*! Restricted RSB Alternative */ + XEN_CPUFEATURE(BHI_NO, 16*32+20) /*A No Branch History Injection */ diff --git a/main/xen/xsa452-4.16-3.patch b/main/xen/xsa452-4.16-3.patch new file mode 100644 index 00000000000..bd15964146a --- /dev/null +++ b/main/xen/xsa452-4.16-3.patch @@ -0,0 +1,135 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/vmx: Perform VERW flushing later in the VMExit path + +Broken out of the following patch because this change is subtle enough on its +own. See it for the rational of why we're moving VERW. + +As for how, extend the trick already used to hold one condition in +flags (RESUME vs LAUNCH) through the POPing of GPRs. + +Move the MOV CR earlier. Intel specify flags to be undefined across it. + +Encode the two conditions we want using SF and PF. See the code comment for +exactly how. 
+ +Leave a comment to explain the lack of any content around +SPEC_CTRL_EXIT_TO_VMX, but leave the block in place. Sods law says if we +delete it, we'll need to reintroduce it. + +This is part of XSA-452 / CVE-2023-28746. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +(cherry picked from commit 475fa20b7384464210f42bad7195f87bd6f1c63f) + +diff --git a/xen/arch/x86/hvm/vmx/entry.S b/xen/arch/x86/hvm/vmx/entry.S +index 5f5de45a1309..cdde76e13892 100644 +--- a/xen/arch/x86/hvm/vmx/entry.S ++++ b/xen/arch/x86/hvm/vmx/entry.S +@@ -87,17 +87,39 @@ UNLIKELY_END(realmode) + + /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ + /* SPEC_CTRL_EXIT_TO_VMX Req: %rsp=regs/cpuinfo Clob: */ +- DO_SPEC_CTRL_COND_VERW ++ /* ++ * All speculation safety work happens to be elsewhere. VERW is after ++ * popping the GPRs, while restoring the guest MSR_SPEC_CTRL is left ++ * to the MSR load list. ++ */ + + mov VCPU_hvm_guest_cr2(%rbx),%rax ++ mov %rax, %cr2 ++ ++ /* ++ * We need to perform two conditional actions (VERW, and Resume vs ++ * Launch) after popping GPRs. With some cunning, we can encode both ++ * of these in eflags together. ++ * ++ * Parity is only calculated over the bottom byte of the answer, while ++ * Sign is simply the top bit. ++ * ++ * Therefore, the final OR instruction ends up producing: ++ * SF = VCPU_vmx_launched ++ * PF = !SCF_verw ++ */ ++ BUILD_BUG_ON(SCF_verw & ~0xff) ++ movzbl VCPU_vmx_launched(%rbx), %ecx ++ shl $31, %ecx ++ movzbl CPUINFO_spec_ctrl_flags(%rsp), %eax ++ and $SCF_verw, %eax ++ or %eax, %ecx + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp +- mov %rax,%cr2 +- cmpb $0,VCPU_vmx_launched(%rbx) + pop %rbx + pop %r11 + pop %r10 +@@ -108,7 +130,13 @@ UNLIKELY_END(realmode) + pop %rdx + pop %rsi + pop %rdi +- je .Lvmx_launch ++ ++ jpe .L_skip_verw ++ /* VERW clobbers ZF, but preserves all others, including SF. 
*/ ++ verw STK_REL(CPUINFO_verw_sel, CPUINFO_error_code)(%rsp) ++.L_skip_verw: ++ ++ jns .Lvmx_launch + + /*.Lvmx_resume:*/ + VMRESUME +diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c +index 31fa63b77fd1..a4e94d693024 100644 +--- a/xen/arch/x86/x86_64/asm-offsets.c ++++ b/xen/arch/x86/x86_64/asm-offsets.c +@@ -135,6 +135,7 @@ void __dummy__(void) + #endif + + OFFSET(CPUINFO_guest_cpu_user_regs, struct cpu_info, guest_cpu_user_regs); ++ OFFSET(CPUINFO_error_code, struct cpu_info, guest_cpu_user_regs.error_code); + OFFSET(CPUINFO_verw_sel, struct cpu_info, verw_sel); + OFFSET(CPUINFO_current_vcpu, struct cpu_info, current_vcpu); + OFFSET(CPUINFO_per_cpu_offset, struct cpu_info, per_cpu_offset); +diff --git a/xen/include/asm-x86/asm_defns.h b/xen/include/asm-x86/asm_defns.h +index d9431180cfba..abc6822b08c8 100644 +--- a/xen/include/asm-x86/asm_defns.h ++++ b/xen/include/asm-x86/asm_defns.h +@@ -81,6 +81,14 @@ register unsigned long current_stack_pointer asm("rsp"); + + #ifdef __ASSEMBLY__ + ++.macro BUILD_BUG_ON condstr, cond:vararg ++ .if \cond ++ .error "Condition \"\condstr\" not satisfied" ++ .endif ++.endm ++/* preprocessor macro to make error message more user friendly */ ++#define BUILD_BUG_ON(cond) BUILD_BUG_ON #cond, cond ++ + #ifdef HAVE_AS_QUOTED_SYM + #define SUBSECTION_LBL(tag) \ + .ifndef .L.tag; \ +diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h +index 0e69971f663f..e807ff6d1db2 100644 +--- a/xen/include/asm-x86/spec_ctrl_asm.h ++++ b/xen/include/asm-x86/spec_ctrl_asm.h +@@ -169,6 +169,13 @@ + #endif + .endm + ++/* ++ * Helper to improve the readibility of stack dispacements with %rsp in ++ * unusual positions. Both @field and @top_of_stack should be constants from ++ * the same object. @top_of_stack should be where %rsp is currently pointing. 
++ */ ++#define STK_REL(field, top_of_stk) ((field) - (top_of_stk)) ++ + .macro DO_SPEC_CTRL_COND_VERW + /* + * Requires %rsp=cpuinfo diff --git a/main/xen/xsa452-4.16-4.patch b/main/xen/xsa452-4.16-4.patch new file mode 100644 index 00000000000..59a124067df --- /dev/null +++ b/main/xen/xsa452-4.16-4.patch @@ -0,0 +1,197 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/spec-ctrl: Perform VERW flushing later in exit paths + +On parts vulnerable to RFDS, VERW's side effects are extended to scrub all +non-architectural entries in various Physical Register Files. To remove all +of Xen's values, the VERW must be after popping the GPRs. + +Rework SPEC_CTRL_COND_VERW to default to an CPUINFO_error_code %rsp position, +but with overrides for other contexts. Identify that it clobbers eflags; this +is particularly relevant for the SYSRET path. + +For the IST exit return to Xen, have the main SPEC_CTRL_EXIT_TO_XEN put a +shadow copy of spec_ctrl_flags, as GPRs can't be used at the point we want to +issue the VERW. + +This is part of XSA-452 / CVE-2023-28746. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +(cherry picked from commit 0a666cf2cd99df6faf3eebc81a1fc286e4eca4c7) + +diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c +index a4e94d693024..4cd5938d7b9d 100644 +--- a/xen/arch/x86/x86_64/asm-offsets.c ++++ b/xen/arch/x86/x86_64/asm-offsets.c +@@ -55,14 +55,22 @@ void __dummy__(void) + * EFRAME_* is for the entry/exit logic where %rsp is pointing at + * UREGS_error_code and GPRs are still/already guest values. + */ +-#define OFFSET_EF(sym, mem) \ ++#define OFFSET_EF(sym, mem, ...) 
\ + DEFINE(sym, offsetof(struct cpu_user_regs, mem) - \ +- offsetof(struct cpu_user_regs, error_code)) ++ offsetof(struct cpu_user_regs, error_code) __VA_ARGS__) + + OFFSET_EF(EFRAME_entry_vector, entry_vector); + OFFSET_EF(EFRAME_rip, rip); + OFFSET_EF(EFRAME_cs, cs); + OFFSET_EF(EFRAME_eflags, eflags); ++ ++ /* ++ * These aren't real fields. They're spare space, used by the IST ++ * exit-to-xen path. ++ */ ++ OFFSET_EF(EFRAME_shadow_scf, eflags, +4); ++ OFFSET_EF(EFRAME_shadow_sel, eflags, +6); ++ + OFFSET_EF(EFRAME_rsp, rsp); + BLANK(); + +@@ -136,6 +144,7 @@ void __dummy__(void) + + OFFSET(CPUINFO_guest_cpu_user_regs, struct cpu_info, guest_cpu_user_regs); + OFFSET(CPUINFO_error_code, struct cpu_info, guest_cpu_user_regs.error_code); ++ OFFSET(CPUINFO_rip, struct cpu_info, guest_cpu_user_regs.rip); + OFFSET(CPUINFO_verw_sel, struct cpu_info, verw_sel); + OFFSET(CPUINFO_current_vcpu, struct cpu_info, current_vcpu); + OFFSET(CPUINFO_per_cpu_offset, struct cpu_info, per_cpu_offset); +diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S +index 7c211314d885..3b2fbcd8733a 100644 +--- a/xen/arch/x86/x86_64/compat/entry.S ++++ b/xen/arch/x86/x86_64/compat/entry.S +@@ -161,6 +161,12 @@ ENTRY(compat_restore_all_guest) + SPEC_CTRL_EXIT_TO_PV /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ + + RESTORE_ALL adj=8 compat=1 ++ ++ /* Account for ev/ec having already been popped off the stack. 
*/ ++ SPEC_CTRL_COND_VERW \ ++ scf=STK_REL(CPUINFO_spec_ctrl_flags, CPUINFO_rip), \ ++ sel=STK_REL(CPUINFO_verw_sel, CPUINFO_rip) ++ + .Lft0: iretq + _ASM_PRE_EXTABLE(.Lft0, handle_exception) + +diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S +index 10f11986d8b9..9b1fa9ed192f 100644 +--- a/xen/arch/x86/x86_64/entry.S ++++ b/xen/arch/x86/x86_64/entry.S +@@ -214,6 +214,9 @@ restore_all_guest: + #endif + + mov EFRAME_rip(%rsp), %rcx ++ ++ SPEC_CTRL_COND_VERW /* Req: %rsp=eframe Clob: efl */ ++ + cmpw $FLAT_USER_CS32, EFRAME_cs(%rsp) + mov EFRAME_rsp(%rsp), %rsp + je 1f +@@ -227,6 +230,9 @@ restore_all_guest: + iret_exit_to_guest: + andl $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), EFRAME_eflags(%rsp) + orl $X86_EFLAGS_IF, EFRAME_eflags(%rsp) ++ ++ SPEC_CTRL_COND_VERW /* Req: %rsp=eframe Clob: efl */ ++ + addq $8,%rsp + .Lft0: iretq + _ASM_PRE_EXTABLE(.Lft0, handle_exception) +@@ -670,9 +676,22 @@ UNLIKELY_START(ne, exit_cr3) + UNLIKELY_END(exit_cr3) + + /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ +- SPEC_CTRL_EXIT_TO_XEN /* Req: %r12=ist_exit %r14=end, Clob: abcd */ ++ SPEC_CTRL_EXIT_TO_XEN /* Req: %r12=ist_exit %r14=end %rsp=regs, Clob: abcd */ + + RESTORE_ALL adj=8 ++ ++ /* ++ * When the CPU pushed this exception frame, it zero-extended eflags. ++ * For an IST exit, SPEC_CTRL_EXIT_TO_XEN stashed shadow copies of ++ * spec_ctrl_flags and verw_sel above eflags, as we can't use any GPRs, ++ * and we're at a random place on the stack, not in a CPUINFO block. ++ * ++ * Account for ev/ec having already been popped off the stack.
++ */ ++ SPEC_CTRL_COND_VERW \ ++ scf=STK_REL(EFRAME_shadow_scf, EFRAME_rip), \ ++ sel=STK_REL(EFRAME_shadow_sel, EFRAME_rip) ++ + iretq + + ENTRY(common_interrupt) +diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h +index e807ff6d1db2..6e7725c11f3a 100644 +--- a/xen/include/asm-x86/spec_ctrl_asm.h ++++ b/xen/include/asm-x86/spec_ctrl_asm.h +@@ -176,16 +176,23 @@ + */ + #define STK_REL(field, top_of_stk) ((field) - (top_of_stk)) + +-.macro DO_SPEC_CTRL_COND_VERW ++.macro SPEC_CTRL_COND_VERW \ ++ scf=STK_REL(CPUINFO_spec_ctrl_flags, CPUINFO_error_code), \ ++ sel=STK_REL(CPUINFO_verw_sel, CPUINFO_error_code) + /* +- * Requires %rsp=cpuinfo ++ * Requires \scf and \sel as %rsp-relative expressions ++ * Clobbers eflags ++ * ++ * VERW needs to run after guest GPRs have been restored, where only %rsp is ++ * good to use. Default to expecting %rsp pointing at CPUINFO_error_code. ++ * Contexts where this is not true must provide an alternative \scf and \sel. + * + * Issue a VERW for its flushing side effect, if indicated. This is a Spectre + * v1 gadget, but the IRET/VMEntry is serialising. 
+ */ +- testb $SCF_verw, CPUINFO_spec_ctrl_flags(%rsp) ++ testb $SCF_verw, \scf(%rsp) + jz .L\@_verw_skip +- verw CPUINFO_verw_sel(%rsp) ++ verw \sel(%rsp) + .L\@_verw_skip: + .endm + +@@ -303,8 +310,6 @@ + */ + ALTERNATIVE "", DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV + +- DO_SPEC_CTRL_COND_VERW +- + ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV + .endm + +@@ -384,7 +389,7 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): + */ + .macro SPEC_CTRL_EXIT_TO_XEN + /* +- * Requires %r12=ist_exit, %r14=stack_end ++ * Requires %r12=ist_exit, %r14=stack_end, %rsp=regs + * Clobbers %rax, %rbx, %rcx, %rdx + */ + movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %ebx +@@ -412,11 +417,18 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): + test %r12, %r12 + jz .L\@_skip_ist_exit + +- /* Logically DO_SPEC_CTRL_COND_VERW but without the %rsp=cpuinfo dependency */ +- testb $SCF_verw, %bl +- jz .L\@_skip_verw +- verw STACK_CPUINFO_FIELD(verw_sel)(%r14) +-.L\@_skip_verw: ++ /* ++ * Stash SCF and verw_sel above eflags in the case of an IST_exit. The ++ * VERW logic needs to run after guest GPRs have been restored; i.e. where ++ * we cannot use %r12 or %r14 for the purposes they have here. ++ * ++ * When the CPU pushed this exception frame, it zero-extended eflags. ++ * Therefore it is safe for the VERW logic to look at the stashed SCF ++ * outside of the ist_exit condition. Also, this stashing won't influence ++ * any other restore_all_guest() paths. 
++ */ ++ or $(__HYPERVISOR_DS32 << 16), %ebx ++ mov %ebx, UREGS_eflags + 4(%rsp) /* EFRAME_shadow_scf/sel */ + + ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV + diff --git a/main/xen/xsa452-4.16-5.patch b/main/xen/xsa452-4.16-5.patch new file mode 100644 index 00000000000..48ce9b02cf3 --- /dev/null +++ b/main/xen/xsa452-4.16-5.patch @@ -0,0 +1,237 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/spec-ctrl: Rename VERW related options + +VERW is going to be used for a 3rd purpose, and the existing nomenclature +didn't survive the Stale MMIO issues terribly well. + +Rename the command line option from `md-clear=` to `verw=`. This is more +consistent with other options which tend to be named based on what they're +doing, not which feature enumeration they use behind the scenes. Retain +`md-clear=` as a deprecated alias. + +Rename opt_md_clear_{pv,hvm} and opt_fb_clear_mmio to opt_verw_{pv,hvm,mmio}, +which has a side effect of making spec_ctrl_init_domain() rather clearer to +follow. + +No functional change. + +This is part of XSA-452 / CVE-2023-28746. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +(cherry picked from commit f7603ca252e4226739eb3129a5290ee3da3f8ea4) + +diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc +index a7a1362bac28..029002fa82d6 100644 +--- a/docs/misc/xen-command-line.pandoc ++++ b/docs/misc/xen-command-line.pandoc +@@ -2260,7 +2260,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). + + ### spec-ctrl (x86) + > `= List of [ <bool>, xen=<bool>, {pv,hvm}=<bool>, +-> {msr-sc,rsb,md-clear,ibpb-entry}=<bool>|{pv,hvm}=<bool>, ++> {msr-sc,rsb,verw,ibpb-entry}=<bool>|{pv,hvm}=<bool>, + > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd, + > eager-fpu,l1d-flush,branch-harden,srb-lock, + > unpriv-mmio,gds-mit,div-scrub}=<bool> ]` +@@ -2285,7 +2285,7 @@ in place for guests to use. 
+ + Use of a positive boolean value for either of these options is invalid. + +-The `pv=`, `hvm=`, `msr-sc=`, `rsb=`, `md-clear=` and `ibpb-entry=` options ++The `pv=`, `hvm=`, `msr-sc=`, `rsb=`, `verw=` and `ibpb-entry=` options + offer fine grained control over the primitives by Xen. These impact Xen's + ability to protect itself, and/or Xen's ability to virtualise support for + guests to use. +@@ -2302,11 +2302,12 @@ guests to use. + guests and if disabled, guests will be unable to use IBRS/STIBP/SSBD/etc. + * `rsb=` offers control over whether to overwrite the Return Stack Buffer / + Return Address Stack on entry to Xen and on idle. +-* `md-clear=` offers control over whether to use VERW to flush +- microarchitectural buffers on idle and exit from Xen. *Note: For +- compatibility with development versions of this fix, `mds=` is also accepted +- on Xen 4.12 and earlier as an alias. Consult vendor documentation in +- preference to here.* ++* `verw=` offers control over whether to use VERW for its scrubbing side ++ effects at appropriate privilege transitions. The exact side effects are ++ microarchitecture and microcode specific. *Note: `md-clear=` is accepted as ++ a deprecated alias. For compatibility with development versions of XSA-297, ++ `mds=` is also accepted on Xen 4.12 and earlier as an alias. Consult vendor ++ documentation in preference to here.* + * `ibpb-entry=` offers control over whether IBPB (Indirect Branch Prediction + Barrier) is used on entry to Xen. 
This is used by default on hardware + vulnerable to Branch Type Confusion, and hardware vulnerable to Speculative +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 6e82a126a3e2..292b5b1c7ba1 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -37,8 +37,8 @@ static bool __initdata opt_msr_sc_pv = true; + static bool __initdata opt_msr_sc_hvm = true; + static int8_t __initdata opt_rsb_pv = -1; + static bool __initdata opt_rsb_hvm = true; +-static int8_t __read_mostly opt_md_clear_pv = -1; +-static int8_t __read_mostly opt_md_clear_hvm = -1; ++static int8_t __read_mostly opt_verw_pv = -1; ++static int8_t __read_mostly opt_verw_hvm = -1; + + static int8_t __read_mostly opt_ibpb_entry_pv = -1; + static int8_t __read_mostly opt_ibpb_entry_hvm = -1; +@@ -77,7 +77,7 @@ static bool __initdata cpu_has_bug_mds; /* Any other M{LP,SB,FB}DS combination. + + static int8_t __initdata opt_srb_lock = -1; + static bool __initdata opt_unpriv_mmio; +-static bool __read_mostly opt_fb_clear_mmio; ++static bool __read_mostly opt_verw_mmio; + static int8_t __initdata opt_gds_mit = -1; + static int8_t __initdata opt_div_scrub = -1; + +@@ -119,8 +119,8 @@ static int __init parse_spec_ctrl(const char *s) + disable_common: + opt_rsb_pv = false; + opt_rsb_hvm = false; +- opt_md_clear_pv = 0; +- opt_md_clear_hvm = 0; ++ opt_verw_pv = 0; ++ opt_verw_hvm = 0; + opt_ibpb_entry_pv = 0; + opt_ibpb_entry_hvm = 0; + opt_ibpb_entry_dom0 = false; +@@ -151,14 +151,14 @@ static int __init parse_spec_ctrl(const char *s) + { + opt_msr_sc_pv = val; + opt_rsb_pv = val; +- opt_md_clear_pv = val; ++ opt_verw_pv = val; + opt_ibpb_entry_pv = val; + } + else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) + { + opt_msr_sc_hvm = val; + opt_rsb_hvm = val; +- opt_md_clear_hvm = val; ++ opt_verw_hvm = val; + opt_ibpb_entry_hvm = val; + } + else if ( (val = parse_boolean("msr-sc", s, ss)) != -1 ) +@@ -203,21 +203,22 @@ static int __init parse_spec_ctrl(const char *s) + 
break; + } + } +- else if ( (val = parse_boolean("md-clear", s, ss)) != -1 ) ++ else if ( (val = parse_boolean("verw", s, ss)) != -1 || ++ (val = parse_boolean("md-clear", s, ss)) != -1 ) + { + switch ( val ) + { + case 0: + case 1: +- opt_md_clear_pv = opt_md_clear_hvm = val; ++ opt_verw_pv = opt_verw_hvm = val; + break; + + case -2: +- s += strlen("md-clear="); ++ s += (*s == 'v') ? strlen("verw=") : strlen("md-clear="); + if ( (val = parse_boolean("pv", s, ss)) >= 0 ) +- opt_md_clear_pv = val; ++ opt_verw_pv = val; + else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) +- opt_md_clear_hvm = val; ++ opt_verw_hvm = val; + else + default: + rc = -EINVAL; +@@ -512,8 +513,8 @@ static void __init print_details(enum ind_thunk thunk) + opt_srb_lock ? " SRB_LOCK+" : " SRB_LOCK-", + opt_ibpb_ctxt_switch ? " IBPB-ctxt" : "", + opt_l1d_flush ? " L1D_FLUSH" : "", +- opt_md_clear_pv || opt_md_clear_hvm || +- opt_fb_clear_mmio ? " VERW" : "", ++ opt_verw_pv || opt_verw_hvm || ++ opt_verw_mmio ? " VERW" : "", + opt_div_scrub ? " DIV" : "", + opt_branch_harden ? " BRANCH_HARDEN" : ""); + +@@ -533,11 +534,11 @@ static void __init print_details(enum ind_thunk thunk) + (boot_cpu_has(X86_FEATURE_SC_MSR_HVM) || + boot_cpu_has(X86_FEATURE_SC_RSB_HVM) || + boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) || +- opt_eager_fpu || opt_md_clear_hvm) ? "" : " None", ++ opt_eager_fpu || opt_verw_hvm) ? "" : " None", + boot_cpu_has(X86_FEATURE_SC_MSR_HVM) ? " MSR_SPEC_CTRL" : "", + boot_cpu_has(X86_FEATURE_SC_RSB_HVM) ? " RSB" : "", + opt_eager_fpu ? " EAGER_FPU" : "", +- opt_md_clear_hvm ? " MD_CLEAR" : "", ++ opt_verw_hvm ? " VERW" : "", + boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) ? " IBPB-entry" : ""); + + #endif +@@ -546,11 +547,11 @@ static void __init print_details(enum ind_thunk thunk) + (boot_cpu_has(X86_FEATURE_SC_MSR_PV) || + boot_cpu_has(X86_FEATURE_SC_RSB_PV) || + boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) || +- opt_eager_fpu || opt_md_clear_pv) ? 
"" : " None", ++ opt_eager_fpu || opt_verw_pv) ? "" : " None", + boot_cpu_has(X86_FEATURE_SC_MSR_PV) ? " MSR_SPEC_CTRL" : "", + boot_cpu_has(X86_FEATURE_SC_RSB_PV) ? " RSB" : "", + opt_eager_fpu ? " EAGER_FPU" : "", +- opt_md_clear_pv ? " MD_CLEAR" : "", ++ opt_verw_pv ? " VERW" : "", + boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) ? " IBPB-entry" : ""); + + printk(" XPTI (64-bit PV only): Dom0 %s, DomU %s (with%s PCID)\n", +@@ -1479,8 +1480,8 @@ void spec_ctrl_init_domain(struct domain *d) + { + bool pv = is_pv_domain(d); + +- bool verw = ((pv ? opt_md_clear_pv : opt_md_clear_hvm) || +- (opt_fb_clear_mmio && is_iommu_enabled(d))); ++ bool verw = ((pv ? opt_verw_pv : opt_verw_hvm) || ++ (opt_verw_mmio && is_iommu_enabled(d))); + + bool ibpb = ((pv ? opt_ibpb_entry_pv : opt_ibpb_entry_hvm) && + (d->domain_id != 0 || opt_ibpb_entry_dom0)); +@@ -1838,19 +1839,20 @@ void __init init_speculation_mitigations(void) + * the return-to-guest path. + */ + if ( opt_unpriv_mmio ) +- opt_fb_clear_mmio = cpu_has_fb_clear; ++ opt_verw_mmio = cpu_has_fb_clear; + + /* + * By default, enable PV and HVM mitigations on MDS-vulnerable hardware. + * This will only be a token effort for MLPDS/MFBDS when HT is enabled, + * but it is somewhat better than nothing. + */ +- if ( opt_md_clear_pv == -1 ) +- opt_md_clear_pv = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && +- boot_cpu_has(X86_FEATURE_MD_CLEAR)); +- if ( opt_md_clear_hvm == -1 ) +- opt_md_clear_hvm = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && +- boot_cpu_has(X86_FEATURE_MD_CLEAR)); ++ if ( opt_verw_pv == -1 ) ++ opt_verw_pv = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && ++ cpu_has_md_clear); ++ ++ if ( opt_verw_hvm == -1 ) ++ opt_verw_hvm = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && ++ cpu_has_md_clear); + + /* + * Enable MDS/MMIO defences as applicable. The Idle blocks need using if +@@ -1863,12 +1865,12 @@ void __init init_speculation_mitigations(void) + * MDS mitigations. L1D_FLUSH is not safe for MMIO mitigations.) 
+ * + * After calculating the appropriate idle setting, simplify +- * opt_md_clear_hvm to mean just "should we VERW on the way into HVM ++ * opt_verw_hvm to mean just "should we VERW on the way into HVM + * guests", so spec_ctrl_init_domain() can calculate suitable settings. + */ +- if ( opt_md_clear_pv || opt_md_clear_hvm || opt_fb_clear_mmio ) ++ if ( opt_verw_pv || opt_verw_hvm || opt_verw_mmio ) + setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE); +- opt_md_clear_hvm &= !cpu_has_skip_l1dfl && !opt_l1d_flush; ++ opt_verw_hvm &= !cpu_has_skip_l1dfl && !opt_l1d_flush; + + /* + * Warn the user if they are on MLPDS/MFBDS-vulnerable hardware with HT diff --git a/main/xen/xsa452-4.16-6.patch b/main/xen/xsa452-4.16-6.patch new file mode 100644 index 00000000000..e7e9ff09589 --- /dev/null +++ b/main/xen/xsa452-4.16-6.patch @@ -0,0 +1,163 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/spec-ctrl: VERW-handling adjustments + +... before we add yet more complexity to this logic. Mostly expanded +comments, but with three minor changes. + +1) Introduce cpu_has_useful_md_clear to simplify later logic in this patch and + future ones. + +2) We only ever need SC_VERW_IDLE when SMT is active. If SMT isn't active, + then there's no re-partition of pipeline resources based on thread-idleness + to worry about. + +3) The logic to adjust HVM VERW based on L1D_FLUSH is unmaintainable and, as + it turns out, wrong. SKIP_L1DFL is just a hint bit, whereas opt_l1d_flush + is the relevant decision of whether to use L1D_FLUSH based on + susceptibility and user preference. + + Rewrite the logic so it can be followed, and incorporate the fact that when + FB_CLEAR is visible, L1D_FLUSH isn't a safe substitution. + +This is part of XSA-452 / CVE-2023-28746. 
+ +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Acked-by: Jan Beulich <jbeulich@suse.com> +(cherry picked from commit 1eb91a8a06230b4b64228c9a380194f8cfe6c5e2) + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 292b5b1c7ba1..2e80e0871642 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -1496,7 +1496,7 @@ void __init init_speculation_mitigations(void) + { + enum ind_thunk thunk = THUNK_DEFAULT; + bool has_spec_ctrl, ibrs = false, hw_smt_enabled; +- bool cpu_has_bug_taa, retpoline_safe; ++ bool cpu_has_bug_taa, cpu_has_useful_md_clear, retpoline_safe; + + hw_smt_enabled = check_smt_enabled(); + +@@ -1827,50 +1827,97 @@ void __init init_speculation_mitigations(void) + "enabled. Please assess your configuration and choose an\n" + "explicit 'smt=<bool>' setting. See XSA-273.\n"); + ++ /* ++ * A brief summary of VERW-related changes. ++ * ++ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/intel-analysis-microarchitectural-data-sampling.html ++ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/processor-mmio-stale-data-vulnerabilities.html ++ * ++ * Relevant ucodes: ++ * ++ * - May 2019, for MDS. Introduces the MD_CLEAR CPUID bit and VERW side ++ * effects to scrub Store/Load/Fill buffers as applicable. MD_CLEAR ++ * exists architecturally, even when the side effects have been removed. ++ * ++ * Use VERW to scrub on return-to-guest. Parts with L1D_FLUSH to ++ * mitigate L1TF have the same side effect, so no need to do both. ++ * ++ * Various Atoms suffer from Store-buffer sampling only. Store buffers ++ * are statically partitioned between non-idle threads, so scrubbing is ++ * wanted when going idle too. ++ * ++ * Load ports and Fill buffers are competitively shared between threads. ++ * SMT must be disabled for VERW scrubbing to be fully effective. 
++ * ++ * - November 2019, for TAA. Extended VERW side effects to TSX-enabled ++ * MDS_NO parts. ++ * ++ * - February 2022, for Client TSX de-feature. Removed VERW side effects ++ * from Client CPUs only. ++ * ++ * - May 2022, for MMIO Stale Data. (Re)introduced Fill Buffer scrubbing ++ * on all MMIO-affected parts which didn't already have it for MDS ++ * reasons, enumerating FB_CLEAR on those parts only. ++ * ++ * If FB_CLEAR is enumerated, L1D_FLUSH does not have the same scrubbing ++ * side effects as VERW and cannot be used in its place. ++ */ + mds_calculations(); + + /* +- * Parts which enumerate FB_CLEAR are those which are post-MDS_NO and have +- * reintroduced the VERW fill buffer flushing side effect because of a +- * susceptibility to FBSDP. ++ * Parts which enumerate FB_CLEAR are those with now-updated microcode ++ * which weren't susceptible to the original MFBDS (and therefore didn't ++ * have Fill Buffer scrubbing side effects to begin with, or were Client ++ * MDS_NO non-TAA_NO parts where the scrubbing was removed), but have had ++ * the scrubbing reintroduced because of a susceptibility to FBSDP. + * + * If unprivileged guests have (or will have) MMIO mappings, we can + * mitigate cross-domain leakage of fill buffer data by issuing VERW on +- * the return-to-guest path. ++ * the return-to-guest path. This is only a token effort if SMT is ++ * active. + */ + if ( opt_unpriv_mmio ) + opt_verw_mmio = cpu_has_fb_clear; + + /* +- * By default, enable PV and HVM mitigations on MDS-vulnerable hardware. +- * This will only be a token effort for MLPDS/MFBDS when HT is enabled, +- * but it is somewhat better than nothing. ++ * MD_CLEAR is enumerated architecturally forevermore, even after the ++ * scrubbing side effects have been removed. Create ourselves a version ++ * which expresses whether we think MD_CLEAR is having any useful side ++ * effect.
++ */ ++ cpu_has_useful_md_clear = (cpu_has_md_clear && ++ (cpu_has_bug_mds || cpu_has_bug_msbds_only)); ++ ++ /* ++ * By default, use VERW scrubbing on applicable hardware, if we think it's ++ * going to have an effect. This will only be a token effort for ++ * MLPDS/MFBDS when SMT is enabled. + */ + if ( opt_verw_pv == -1 ) +- opt_verw_pv = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && +- cpu_has_md_clear); ++ opt_verw_pv = cpu_has_useful_md_clear; + + if ( opt_verw_hvm == -1 ) +- opt_verw_hvm = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && +- cpu_has_md_clear); ++ opt_verw_hvm = cpu_has_useful_md_clear; + + /* +- * Enable MDS/MMIO defences as applicable. The Idle blocks need using if +- * either the PV or HVM MDS defences are used, or if we may give MMIO +- * access to untrusted guests. +- * +- * HVM is more complicated. The MD_CLEAR microcode extends L1D_FLUSH with +- * equivalent semantics to avoid needing to perform both flushes on the +- * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH (for +- * MDS mitigations. L1D_FLUSH is not safe for MMIO mitigations.) +- * +- * After calculating the appropriate idle setting, simplify +- * opt_verw_hvm to mean just "should we VERW on the way into HVM +- * guests", so spec_ctrl_init_domain() can calculate suitable settings. ++ * If SMT is active, and we're protecting against MDS or MMIO stale data, ++ * we need to scrub before going idle as well as on return to guest. ++ * Various pipeline resources are repartitioned amongst non-idle threads. 
+ */ +- if ( opt_verw_pv || opt_verw_hvm || opt_verw_mmio ) ++ if ( ((cpu_has_useful_md_clear && (opt_verw_pv || opt_verw_hvm)) || ++ opt_verw_mmio) && hw_smt_enabled ) + setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE); +- opt_verw_hvm &= !cpu_has_skip_l1dfl && !opt_l1d_flush; ++ ++ /* ++ * After calculating the appropriate idle setting, simplify opt_verw_hvm ++ * to mean just "should we VERW on the way into HVM guests", so ++ * spec_ctrl_init_domain() can calculate suitable settings. ++ * ++ * It is only safe to use L1D_FLUSH in place of VERW when MD_CLEAR is the ++ * only *_CLEAR we can see. ++ */ ++ if ( opt_l1d_flush && cpu_has_md_clear && !cpu_has_fb_clear ) ++ opt_verw_hvm = false; + + /* + * Warn the user if they are on MLPDS/MFBDS-vulnerable hardware with HT diff --git a/main/xen/xsa452-4.16-7.patch b/main/xen/xsa452-4.16-7.patch new file mode 100644 index 00000000000..9862522917d --- /dev/null +++ b/main/xen/xsa452-4.16-7.patch @@ -0,0 +1,299 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/spec-ctrl: Mitigation Register File Data Sampling + +RFDS affects Atom cores, also branded E-cores, between the Goldmont and +Gracemont microarchitectures. This includes Alder Lake and Raptor Lake hybrid +client systems which have a mix of Gracemont and other types of cores. + +Two new bits have been defined; RFDS_CLEAR to indicate VERW has more side +effects, and RFDS_NO to indicate that the system is unaffected. Plenty of +unaffected CPUs won't be getting RFDS_NO retrofitted in microcode, so we +synthesise it. Alder Lake and Raptor Lake Xeon-E's are unaffected due to +their platform configuration, and we must use the Hybrid CPUID bit to +distinguish them from their non-Xeon counterparts. + +Like MD_CLEAR and FB_CLEAR, RFDS_CLEAR needs OR-ing across a resource pool, so +set it in the max policies and reflect the host setting in default. + +This is part of XSA-452 / CVE-2023-28746.
+ +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +(cherry picked from commit fb5b6f6744713410c74cfc12b7176c108e3c9a31) + +diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c +index c55a6e767809..0c792679e594 100644 +--- a/tools/misc/xen-cpuid.c ++++ b/tools/misc/xen-cpuid.c +@@ -170,7 +170,7 @@ static const char *const str_7d0[32] = + [ 8] = "avx512-vp2intersect", [ 9] = "srbds-ctrl", + [10] = "md-clear", [11] = "rtm-always-abort", + /* 12 */ [13] = "tsx-force-abort", +- [14] = "serialize", ++ [14] = "serialize", [15] = "hybrid", + [16] = "tsxldtrk", + [18] = "pconfig", + [20] = "cet-ibt", +@@ -230,7 +230,8 @@ static const char *const str_m10Al[32] = + [20] = "bhi-no", [21] = "xapic-status", + /* 22 */ [23] = "ovrclk-status", + [24] = "pbrsb-no", [25] = "gds-ctrl", +- [26] = "gds-no", ++ [26] = "gds-no", [27] = "rfds-no", ++ [28] = "rfds-clear", + }; + + static const char *const str_m10Ah[32] = +diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c +index 34f778dbafbb..c872afda3e2b 100644 +--- a/xen/arch/x86/cpu-policy.c ++++ b/xen/arch/x86/cpu-policy.c +@@ -443,6 +443,7 @@ static void __init guest_common_max_feature_adjustments(uint32_t *fs) + */ + __set_bit(X86_FEATURE_MD_CLEAR, fs); + __set_bit(X86_FEATURE_FB_CLEAR, fs); ++ __set_bit(X86_FEATURE_RFDS_CLEAR, fs); + + /* + * The Gather Data Sampling microcode mitigation (August 2023) has an +@@ -492,6 +493,10 @@ static void __init guest_common_default_feature_adjustments(uint32_t *fs) + if ( cpu_has_fb_clear ) + __set_bit(X86_FEATURE_FB_CLEAR, fs); + ++ __clear_bit(X86_FEATURE_RFDS_CLEAR, fs); ++ if ( cpu_has_rfds_clear ) ++ __set_bit(X86_FEATURE_RFDS_CLEAR, fs); ++ + /* + * The Gather Data Sampling microcode mitigation (August 2023) has an + * adverse performance impact on the CLWB instruction on SKX/CLX/CPX. 
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 2e80e0871642..24bf98a018a0 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -432,7 +432,7 @@ static void __init print_details(enum ind_thunk thunk) + * Hardware read-only information, stating immunity to certain issues, or + * suggestions of which mitigation to use. + */ +- printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", ++ printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "", + (caps & ARCH_CAPS_EIBRS) ? " EIBRS" : "", + (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", +@@ -448,6 +448,7 @@ static void __init print_details(enum ind_thunk thunk) + (caps & ARCH_CAPS_FB_CLEAR) ? " FB_CLEAR" : "", + (caps & ARCH_CAPS_PBRSB_NO) ? " PBRSB_NO" : "", + (caps & ARCH_CAPS_GDS_NO) ? " GDS_NO" : "", ++ (caps & ARCH_CAPS_RFDS_NO) ? " RFDS_NO" : "", + (e8b & cpufeat_mask(X86_FEATURE_IBRS_ALWAYS)) ? " IBRS_ALWAYS" : "", + (e8b & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS)) ? " STIBP_ALWAYS" : "", + (e8b & cpufeat_mask(X86_FEATURE_IBRS_FAST)) ? " IBRS_FAST" : "", +@@ -458,7 +459,7 @@ static void __init print_details(enum ind_thunk thunk) + (e21a & cpufeat_mask(X86_FEATURE_SRSO_NO)) ? " SRSO_NO" : ""); + + /* Hardware features which need driving to mitigate issues. */ +- printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s\n", ++ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + (e8b & cpufeat_mask(X86_FEATURE_IBPB)) || + (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBPB" : "", + (e8b & cpufeat_mask(X86_FEATURE_IBRS)) || +@@ -476,6 +477,7 @@ static void __init print_details(enum ind_thunk thunk) + (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "", + (caps & ARCH_CAPS_FB_CLEAR_CTRL) ? " FB_CLEAR_CTRL" : "", + (caps & ARCH_CAPS_GDS_CTRL) ? " GDS_CTRL" : "", ++ (caps & ARCH_CAPS_RFDS_CLEAR) ? " RFDS_CLEAR" : "", + (e21a & cpufeat_mask(X86_FEATURE_SBPB)) ? 
" SBPB" : ""); + + /* Compiled-in support which pertains to mitigations. */ +@@ -1324,6 +1326,83 @@ static __init void mds_calculations(void) + } + } + ++/* ++ * Register File Data Sampling affects Atom cores from the Goldmont to ++ * Gracemont microarchitectures. The March 2024 microcode adds RFDS_NO to ++ * some but not all unaffected parts, and RFDS_CLEAR to affected parts still ++ * in support. ++ * ++ * Alder Lake and Raptor Lake client CPUs have a mix of P cores ++ * (Golden/Raptor Cove, not vulnerable) and E cores (Gracemont, ++ * vulnerable), and both enumerate RFDS_CLEAR. ++ * ++ * Both exist in a Xeon SKU, which has the E cores (Gracemont) disabled by ++ * platform configuration, and enumerate RFDS_NO. ++ * ++ * With older parts, or with out-of-date microcode, synthesise RFDS_NO when ++ * safe to do so. ++ * ++ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/register-file-data-sampling.html ++ */ ++static void __init rfds_calculations(void) ++{ ++ /* RFDS is only known to affect Intel Family 6 processors at this time. */ ++ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || ++ boot_cpu_data.x86 != 6 ) ++ return; ++ ++ /* ++ * If RFDS_NO or RFDS_CLEAR are visible, we've either got suitable ++ * microcode, or an RFDS-aware hypervisor is levelling us in a pool. ++ */ ++ if ( cpu_has_rfds_no || cpu_has_rfds_clear ) ++ return; ++ ++ /* If we're virtualised, don't attempt to synthesise RFDS_NO. */ ++ if ( cpu_has_hypervisor ) ++ return; ++ ++ /* ++ * Not all CPUs are expected to get a microcode update enumerating one of ++ * RFDS_{NO,CLEAR}, or we might have out-of-date microcode. 
++ */ ++ switch ( boot_cpu_data.x86_model ) ++ { ++ case 0x97: /* INTEL_FAM6_ALDERLAKE */ ++ case 0xB7: /* INTEL_FAM6_RAPTORLAKE */ ++ /* ++ * Alder Lake and Raptor Lake might be a client SKU (with the ++ * Gracemont cores active, and therefore vulnerable) or might be a ++ * server SKU (with the Gracemont cores disabled, and therefore not ++ * vulnerable). ++ * ++ * See if the CPU identifies as hybrid to distinguish the two cases. ++ */ ++ if ( !cpu_has_hybrid ) ++ break; ++ /* fallthrough */ ++ case 0x9A: /* INTEL_FAM6_ALDERLAKE_L */ ++ case 0xBA: /* INTEL_FAM6_RAPTORLAKE_P */ ++ case 0xBF: /* INTEL_FAM6_RAPTORLAKE_S */ ++ ++ case 0x5C: /* INTEL_FAM6_ATOM_GOLDMONT */ /* Apollo Lake */ ++ case 0x5F: /* INTEL_FAM6_ATOM_GOLDMONT_D */ /* Denverton */ ++ case 0x7A: /* INTEL_FAM6_ATOM_GOLDMONT_PLUS */ /* Gemini Lake */ ++ case 0x86: /* INTEL_FAM6_ATOM_TREMONT_D */ /* Snow Ridge / Parker Ridge */ ++ case 0x96: /* INTEL_FAM6_ATOM_TREMONT */ /* Elkhart Lake */ ++ case 0x9C: /* INTEL_FAM6_ATOM_TREMONT_L */ /* Jasper Lake */ ++ case 0xBE: /* INTEL_FAM6_ATOM_GRACEMONT */ /* Alder Lake N */ ++ return; ++ } ++ ++ /* ++ * We appear to be on an unaffected CPU which didn't enumerate RFDS_NO, ++ * perhaps because of its age or because of out-of-date microcode. ++ * Synthesise it.
++ */ ++ setup_force_cpu_cap(X86_FEATURE_RFDS_NO); ++} ++ + static bool __init cpu_has_gds(void) + { + /* +@@ -1832,6 +1911,7 @@ void __init init_speculation_mitigations(void) + * + * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/intel-analysis-microarchitectural-data-sampling.html + * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/processor-mmio-stale-data-vulnerabilities.html ++ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/register-file-data-sampling.html + * + * Relevant ucodes: + * +@@ -1861,8 +1941,12 @@ void __init init_speculation_mitigations(void) + * + * If FB_CLEAR is enumerated, L1D_FLUSH does not have the same scrubbing + * side effects as VERW and cannot be used in its place. ++ * ++ * - March 2024, for RFDS. Enumerate RFDS_CLEAR to mean that VERW now ++ * scrubs non-architectural entries from certain register files. + */ + mds_calculations(); ++ rfds_calculations(); + + /* + * Parts which enumerate FB_CLEAR are those with now-updated microcode +@@ -1894,15 +1978,19 @@ void __init init_speculation_mitigations(void) + * MLPDS/MFBDS when SMT is enabled. + */ + if ( opt_verw_pv == -1 ) +- opt_verw_pv = cpu_has_useful_md_clear; ++ opt_verw_pv = cpu_has_useful_md_clear || cpu_has_rfds_clear; + + if ( opt_verw_hvm == -1 ) +- opt_verw_hvm = cpu_has_useful_md_clear; ++ opt_verw_hvm = cpu_has_useful_md_clear || cpu_has_rfds_clear; + + /* + * If SMT is active, and we're protecting against MDS or MMIO stale data, + * we need to scrub before going idle as well as on return to guest. + * Various pipeline resources are repartitioned amongst non-idle threads. ++ * ++ * We don't need to scrub on idle for RFDS.
There are no affected cores ++ * which support SMT, despite there being affected cores in hybrid systems ++ * which have SMT elsewhere in the platform. + */ + if ( ((cpu_has_useful_md_clear && (opt_verw_pv || opt_verw_hvm)) || + opt_verw_mmio) && hw_smt_enabled ) +@@ -1916,7 +2004,8 @@ void __init init_speculation_mitigations(void) + * It is only safe to use L1D_FLUSH in place of VERW when MD_CLEAR is the + * only *_CLEAR we can see. + */ +- if ( opt_l1d_flush && cpu_has_md_clear && !cpu_has_fb_clear ) ++ if ( opt_l1d_flush && cpu_has_md_clear && !cpu_has_fb_clear && ++ !cpu_has_rfds_clear ) + opt_verw_hvm = false; + + /* +diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h +index 81ac4d76eea6..1869732bcb9b 100644 +--- a/xen/include/asm-x86/cpufeature.h ++++ b/xen/include/asm-x86/cpufeature.h +@@ -138,6 +138,7 @@ + #define cpu_has_rtm_always_abort boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) + #define cpu_has_tsx_force_abort boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT) + #define cpu_has_serialize boot_cpu_has(X86_FEATURE_SERIALIZE) ++#define cpu_has_hybrid boot_cpu_has(X86_FEATURE_HYBRID) + #define cpu_has_arch_caps boot_cpu_has(X86_FEATURE_ARCH_CAPS) + + /* CPUID level 0x00000007:1.eax */ +@@ -157,6 +158,8 @@ + #define cpu_has_rrsba boot_cpu_has(X86_FEATURE_RRSBA) + #define cpu_has_gds_ctrl boot_cpu_has(X86_FEATURE_GDS_CTRL) + #define cpu_has_gds_no boot_cpu_has(X86_FEATURE_GDS_NO) ++#define cpu_has_rfds_no boot_cpu_has(X86_FEATURE_RFDS_NO) ++#define cpu_has_rfds_clear boot_cpu_has(X86_FEATURE_RFDS_CLEAR) + + /* Synthesized. 
*/ + #define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) +diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h +index 8251b8258b79..eb6295d8a7a4 100644 +--- a/xen/include/asm-x86/msr-index.h ++++ b/xen/include/asm-x86/msr-index.h +@@ -77,6 +77,8 @@ + #define ARCH_CAPS_PBRSB_NO (_AC(1, ULL) << 24) + #define ARCH_CAPS_GDS_CTRL (_AC(1, ULL) << 25) + #define ARCH_CAPS_GDS_NO (_AC(1, ULL) << 26) ++#define ARCH_CAPS_RFDS_NO (_AC(1, ULL) << 27) ++#define ARCH_CAPS_RFDS_CLEAR (_AC(1, ULL) << 28) + + #define MSR_FLUSH_CMD 0x0000010b + #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) +diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h +index 2906eaa6c290..7a9d8d05d3fb 100644 +--- a/xen/include/public/arch-x86/cpufeatureset.h ++++ b/xen/include/public/arch-x86/cpufeatureset.h +@@ -279,6 +279,7 @@ XEN_CPUFEATURE(MD_CLEAR, 9*32+10) /*!A VERW clears microarchitectural buffe + XEN_CPUFEATURE(RTM_ALWAYS_ABORT, 9*32+11) /*! June 2021 TSX defeaturing in microcode. 
*/ + XEN_CPUFEATURE(TSX_FORCE_ABORT, 9*32+13) /* MSR_TSX_FORCE_ABORT.RTM_ABORT */ + XEN_CPUFEATURE(SERIALIZE, 9*32+14) /*a SERIALIZE insn */ ++XEN_CPUFEATURE(HYBRID, 9*32+15) /* Heterogeneous platform */ + XEN_CPUFEATURE(TSXLDTRK, 9*32+16) /*a TSX load tracking suspend/resume insns */ + XEN_CPUFEATURE(CET_IBT, 9*32+20) /* CET - Indirect Branch Tracking */ + XEN_CPUFEATURE(IBRSB, 9*32+26) /*A IBRS and IBPB support (used by Intel) */ +@@ -338,6 +339,8 @@ XEN_CPUFEATURE(OVRCLK_STATUS, 16*32+23) /* MSR_OVERCLOCKING_STATUS */ + XEN_CPUFEATURE(PBRSB_NO, 16*32+24) /*A No Post-Barrier RSB predictions */ + XEN_CPUFEATURE(GDS_CTRL, 16*32+25) /* MCU_OPT_CTRL.GDS_MIT_{DIS,LOCK} */ + XEN_CPUFEATURE(GDS_NO, 16*32+26) /*A No Gather Data Sampling */ ++XEN_CPUFEATURE(RFDS_NO, 16*32+27) /*A No Register File Data Sampling */ ++XEN_CPUFEATURE(RFDS_CLEAR, 16*32+28) /*!A Register File(s) cleared by VERW */ + + /* Intel-defined CPU features, MSR_ARCH_CAPS 0x10a.edx, word 17 */ + diff --git a/main/xen/xsa453-4.16-1.patch b/main/xen/xsa453-4.16-1.patch new file mode 100644 index 00000000000..c9fd8c21bb8 --- /dev/null +++ b/main/xen/xsa453-4.16-1.patch @@ -0,0 +1,148 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/paging: Delete update_cr3()'s do_locking parameter + +Nicola reports that the XSA-438 fix introduced new MISRA violations because of +some incidental tidying it tried to do. The parameter is useless, so resolve +the MISRA regression by removing it. + +hap_update_cr3() discards the parameter entirely, while sh_update_cr3() uses +it to distinguish internal and external callers and therefore whether the +paging lock should be taken. + +However, we have paging_lock_recursive() for this purpose, which also avoids +the ability for the shadow internal callers to accidentally not hold the lock. 
+ +Fixes: fb0ff49fe9f7 ("x86/shadow: defer releasing of PV's top-level shadow reference") +Reported-by: Nicola Vetrini <nicola.vetrini@bugseng.com> +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Release-acked-by: Henry Wang <Henry.Wang@arm.com> +(cherry picked from commit e71157d1ac2a7fbf413130663cf0a93ff9fbcf7e) + +diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c +index fa479d3d97b3..63c29da696dd 100644 +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -728,7 +728,7 @@ static bool_t hap_invlpg(struct vcpu *v, unsigned long linear) + return 1; + } + +-static pagetable_t hap_update_cr3(struct vcpu *v, bool do_locking, bool noflush) ++static pagetable_t hap_update_cr3(struct vcpu *v, bool noflush) + { + v->arch.hvm.hw_cr[3] = v->arch.hvm.guest_cr[3]; + hvm_update_guest_cr3(v, noflush); +@@ -818,7 +818,7 @@ static void hap_update_paging_modes(struct vcpu *v) + } + + /* CR3 is effectively updated by a mode change. Flush ASIDs, etc. */ +- hap_update_cr3(v, 0, false); ++ hap_update_cr3(v, false); + + unlock: + paging_unlock(d); +diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c +index 242b93537f9a..a8869a3fb7eb 100644 +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -2563,7 +2563,7 @@ static void sh_update_paging_modes(struct vcpu *v) + } + #endif /* OOS */ + +- v->arch.paging.mode->update_cr3(v, 0, false); ++ v->arch.paging.mode->update_cr3(v, false); + } + + void shadow_update_paging_modes(struct vcpu *v) +diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c +index cf3ded70e75e..78bb89f1ee04 100644 +--- a/xen/arch/x86/mm/shadow/multi.c ++++ b/xen/arch/x86/mm/shadow/multi.c +@@ -2499,7 +2499,7 @@ static int sh_page_fault(struct vcpu *v, + * In any case, in the PAE case, the ASSERT is not true; it can + * happen because of actions the guest is taking. 
*/ + #if GUEST_PAGING_LEVELS == 3 +- v->arch.paging.mode->update_cr3(v, 0, false); ++ v->arch.paging.mode->update_cr3(v, false); + #else + ASSERT(d->is_shutting_down); + #endif +@@ -3219,17 +3219,13 @@ sh_detach_old_tables(struct vcpu *v) + } + } + +-static pagetable_t +-sh_update_cr3(struct vcpu *v, bool do_locking, bool noflush) ++static pagetable_t sh_update_cr3(struct vcpu *v, bool noflush) + /* Updates vcpu->arch.cr3 after the guest has changed CR3. + * Paravirtual guests should set v->arch.guest_table (and guest_table_user, + * if appropriate). + * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works; + * this function will call hvm_update_guest_cr(v, 3) to tell them where the + * shadow tables are. +- * If do_locking != 0, assume we are being called from outside the +- * shadow code, and must take and release the paging lock; otherwise +- * that is the caller's responsibility. + */ + { + struct domain *d = v->domain; +@@ -3247,7 +3243,11 @@ sh_update_cr3(struct vcpu *v, bool do_locking, bool noflush) + return old_entry; + } + +- if ( do_locking ) paging_lock(v->domain); ++ /* ++ * This is used externally (with the paging lock not taken) and internally ++ * by the shadow code (with the lock already taken). ++ */ ++ paging_lock_recursive(v->domain); + + #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + /* Need to resync all the shadow entries on a TLB flush. 
Resync +@@ -3483,8 +3483,7 @@ sh_update_cr3(struct vcpu *v, bool do_locking, bool noflush) + shadow_sync_other_vcpus(v); + #endif + +- /* Release the lock, if we took it (otherwise it's the caller's problem) */ +- if ( do_locking ) paging_unlock(v->domain); ++ paging_unlock(v->domain); + + return old_entry; + } +diff --git a/xen/arch/x86/mm/shadow/none.c b/xen/arch/x86/mm/shadow/none.c +index 2a5fd409b2d8..003536980803 100644 +--- a/xen/arch/x86/mm/shadow/none.c ++++ b/xen/arch/x86/mm/shadow/none.c +@@ -52,7 +52,7 @@ static unsigned long _gva_to_gfn(struct vcpu *v, struct p2m_domain *p2m, + } + #endif + +-static pagetable_t _update_cr3(struct vcpu *v, bool do_locking, bool noflush) ++static pagetable_t _update_cr3(struct vcpu *v, bool noflush) + { + ASSERT_UNREACHABLE(); + return pagetable_null(); +diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h +index fceb208d3671..bd7c7008ae79 100644 +--- a/xen/include/asm-x86/paging.h ++++ b/xen/include/asm-x86/paging.h +@@ -138,8 +138,7 @@ struct paging_mode { + paddr_t ga, uint32_t *pfec, + unsigned int *page_order); + #endif +- pagetable_t (*update_cr3 )(struct vcpu *v, bool do_locking, +- bool noflush); ++ pagetable_t (*update_cr3 )(struct vcpu *v, bool noflush); + void (*update_paging_modes )(struct vcpu *v); + bool (*flush_tlb )(bool (*flush_vcpu)(void *ctxt, + struct vcpu *v), +@@ -317,7 +316,7 @@ static inline unsigned long paging_ga_to_gfn_cr3(struct vcpu *v, + * as the value to load into the host CR3 to schedule this vcpu */ + static inline pagetable_t paging_update_cr3(struct vcpu *v, bool noflush) + { +- return paging_get_hostmode(v)->update_cr3(v, 1, noflush); ++ return paging_get_hostmode(v)->update_cr3(v, noflush); + } + + /* Update all the things that are derived from the guest's CR0/CR3/CR4. 
+ diff --git a/main/xen/xsa453-4.16-2.patch b/main/xen/xsa453-4.16-2.patch new file mode 100644 index 00000000000..2f0c1b2084b --- /dev/null +++ b/main/xen/xsa453-4.16-2.patch @@ -0,0 +1,49 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: xen: Swap order of actions in the FREE*() macros + +Wherever possible, it is a good idea to NULL out the visible reference to an +object prior to freeing it. The FREE*() macros already collect together both +parts, making it easy to adjust. + +This has a marginal code generation improvement, as some of the calls to the +free() function can be tailcall optimised. + +No functional change. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Acked-by: Jan Beulich <jbeulich@suse.com> +(cherry picked from commit c4f427ec879e7c0df6d44d02561e8bee838a293e) + +diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h +index 3f5c296138cf..c0b77d563d80 100644 +--- a/xen/include/xen/mm.h ++++ b/xen/include/xen/mm.h +@@ -80,8 +80,9 @@ bool scrub_free_pages(void); + + /* Free an allocation, and zero the pointer to it. */ + #define FREE_XENHEAP_PAGES(p, o) do { \ +- free_xenheap_pages(p, o); \ ++ void *_ptr_ = (p); \ + (p) = NULL; \ ++ free_xenheap_pages(_ptr_, o); \ + } while ( false ) + #define FREE_XENHEAP_PAGE(p) FREE_XENHEAP_PAGES(p, 0) + +diff --git a/xen/include/xen/xmalloc.h b/xen/include/xen/xmalloc.h +index 16979a117c6a..d857298011c1 100644 +--- a/xen/include/xen/xmalloc.h ++++ b/xen/include/xen/xmalloc.h +@@ -66,9 +66,10 @@ + extern void xfree(void *); + + /* Free an allocation, and zero the pointer to it. 
*/ +-#define XFREE(p) do { \ +- xfree(p); \ +- (p) = NULL; \ ++#define XFREE(p) do { \ ++ void *_ptr_ = (p); \ ++ (p) = NULL; \ ++ xfree(_ptr_); \ + } while ( false ) + + /* Underlying functions */ diff --git a/main/xen/xsa453-4.16-3.patch b/main/xen/xsa453-4.16-3.patch new file mode 100644 index 00000000000..07ce4e78ac6 --- /dev/null +++ b/main/xen/xsa453-4.16-3.patch @@ -0,0 +1,313 @@ +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Subject: x86/spinlock: introduce support for blocking speculation into + critical regions + +Introduce a new Kconfig option to block speculation into lock protected +critical regions. The Kconfig option is enabled by default, but the mitigation +won't be engaged unless it's explicitly enabled in the command line using +`spec-ctrl=lock-harden`. + +Convert the spinlock acquire macros into always-inline functions, and introduce +a speculation barrier after the lock has been taken. Note the speculation +barrier is not placed inside the implementation of the spin lock functions, as +to prevent speculation from falling through the call to the lock functions +resulting in the barrier also being skipped. + +trylock variants are protected using a construct akin to the existing +evaluate_nospec(). + +This patch only implements the speculation barrier for x86. + +Note spin locks are the only locking primitive taken care in this change, +further locking primitives will be adjusted by separate changes. + +This is part of XSA-453 / CVE-2024-2193 + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +(cherry picked from commit 7ef0084418e188d05f338c3e028fbbe8b6924afa) + +diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc +index 029002fa82d6..33c32cfc1cbc 100644 +--- a/docs/misc/xen-command-line.pandoc ++++ b/docs/misc/xen-command-line.pandoc +@@ -2263,7 +2263,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). 
+ > {msr-sc,rsb,verw,ibpb-entry}=<bool>|{pv,hvm}=<bool>, + > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd, + > eager-fpu,l1d-flush,branch-harden,srb-lock, +-> unpriv-mmio,gds-mit,div-scrub}=<bool> ]` ++> unpriv-mmio,gds-mit,div-scrub,lock-harden}=<bool> ]` + + Controls for speculative execution sidechannel mitigations. By default, Xen + will pick the most appropriate mitigations based on compiled in support, +@@ -2388,6 +2388,11 @@ On all hardware, the `div-scrub=` option can be used to force or prevent Xen + from mitigating the DIV-leakage vulnerability. By default, Xen will mitigate + DIV-leakage on hardware believed to be vulnerable. + ++If Xen is compiled with `CONFIG_SPECULATIVE_HARDEN_LOCK`, the `lock-harden=` ++boolean can be used to force or prevent Xen from using speculation barriers to ++protect lock critical regions. This mitigation won't be engaged by default, ++and needs to be explicitly enabled on the command line. ++ + ### sync_console + > `= <boolean>` + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 24bf98a018a0..0a7af22a9b3c 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -63,6 +63,7 @@ int8_t __read_mostly opt_ibpb_ctxt_switch = -1; + int8_t __read_mostly opt_eager_fpu = -1; + int8_t __read_mostly opt_l1d_flush = -1; + static bool __initdata opt_branch_harden = true; ++static bool __initdata opt_lock_harden; + + bool __initdata bsp_delay_spec_ctrl; + uint8_t __read_mostly default_xen_spec_ctrl; +@@ -131,6 +132,7 @@ static int __init parse_spec_ctrl(const char *s) + opt_ssbd = false; + opt_l1d_flush = 0; + opt_branch_harden = false; ++ opt_lock_harden = false; + opt_srb_lock = 0; + opt_unpriv_mmio = false; + opt_gds_mit = 0; +@@ -282,6 +284,16 @@ static int __init parse_spec_ctrl(const char *s) + opt_l1d_flush = val; + else if ( (val = parse_boolean("branch-harden", s, ss)) >= 0 ) + opt_branch_harden = val; ++ else if ( (val = parse_boolean("lock-harden", s, ss)) >= 0 ) ++ { ++ if ( 
IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_LOCK) ) ++ opt_lock_harden = val; ++ else ++ { ++ no_config_param("SPECULATIVE_HARDEN_LOCK", "spec-ctrl", s, ss); ++ rc = -EINVAL; ++ } ++ } + else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 ) + opt_srb_lock = val; + else if ( (val = parse_boolean("unpriv-mmio", s, ss)) >= 0 ) +@@ -481,18 +493,22 @@ static void __init print_details(enum ind_thunk thunk) + (e21a & cpufeat_mask(X86_FEATURE_SBPB)) ? " SBPB" : ""); + + /* Compiled-in support which pertains to mitigations. */ +- if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) ) ++ if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) || ++ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_LOCK) ) + printk(" Compiled-in support:" + #ifdef CONFIG_INDIRECT_THUNK + " INDIRECT_THUNK" + #endif + #ifdef CONFIG_SHADOW_PAGING + " SHADOW_PAGING" ++#endif ++#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK ++ " HARDEN_LOCK" + #endif + "\n"); + + /* Settings for Xen's protection, irrespective of guests. */ +- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n", ++ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s%s\n", + thunk == THUNK_NONE ? "N/A" : + thunk == THUNK_RETPOLINE ? "RETPOLINE" : + thunk == THUNK_LFENCE ? "LFENCE" : +@@ -518,7 +534,8 @@ static void __init print_details(enum ind_thunk thunk) + opt_verw_pv || opt_verw_hvm || + opt_verw_mmio ? " VERW" : "", + opt_div_scrub ? " DIV" : "", +- opt_branch_harden ? " BRANCH_HARDEN" : ""); ++ opt_branch_harden ? " BRANCH_HARDEN" : "", ++ opt_lock_harden ? " LOCK_HARDEN" : ""); + + /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. 
*/ + if ( cpu_has_bug_l1tf || opt_pv_l1tf_hwdom || opt_pv_l1tf_domu ) +@@ -1889,6 +1906,9 @@ void __init init_speculation_mitigations(void) + if ( !opt_branch_harden ) + setup_force_cpu_cap(X86_FEATURE_SC_NO_BRANCH_HARDEN); + ++ if ( !opt_lock_harden ) ++ setup_force_cpu_cap(X86_FEATURE_SC_NO_LOCK_HARDEN); ++ + /* + * We do not disable HT by default on affected hardware. + * +diff --git a/xen/common/Kconfig b/xen/common/Kconfig +index c9f4b7f49240..01c70109f539 100644 +--- a/xen/common/Kconfig ++++ b/xen/common/Kconfig +@@ -161,6 +161,23 @@ config SPECULATIVE_HARDEN_GUEST_ACCESS + + If unsure, say Y. + ++config SPECULATIVE_HARDEN_LOCK ++ bool "Speculative lock context hardening" ++ default y ++ depends on X86 ++ help ++ Contemporary processors may use speculative execution as a ++ performance optimisation, but this can potentially be abused by an ++ attacker to leak data via speculative sidechannels. ++ ++ One source of data leakage is via speculative accesses to lock ++ critical regions. ++ ++ This option is disabled by default at run time, and needs to be ++ enabled on the command line. ++ ++ If unsure, say Y. ++ + endmenu + + config HYPFS +diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h +index 70b93b6b443f..7e8221fd85dd 100644 +--- a/xen/include/asm-x86/cpufeatures.h ++++ b/xen/include/asm-x86/cpufeatures.h +@@ -24,7 +24,7 @@ XEN_CPUFEATURE(APERFMPERF, X86_SYNTH( 8)) /* APERFMPERF */ + XEN_CPUFEATURE(MFENCE_RDTSC, X86_SYNTH( 9)) /* MFENCE synchronizes RDTSC */ + XEN_CPUFEATURE(XEN_SMEP, X86_SYNTH(10)) /* SMEP gets used by Xen itself */ + XEN_CPUFEATURE(XEN_SMAP, X86_SYNTH(11)) /* SMAP gets used by Xen itself */ +-/* Bit 12 - unused. 
*/ ++XEN_CPUFEATURE(SC_NO_LOCK_HARDEN, X86_SYNTH(12)) /* (Disable) Lock critical region hardening */ + XEN_CPUFEATURE(IND_THUNK_LFENCE, X86_SYNTH(13)) /* Use IND_THUNK_LFENCE */ + XEN_CPUFEATURE(IND_THUNK_JMP, X86_SYNTH(14)) /* Use IND_THUNK_JMP */ + XEN_CPUFEATURE(SC_NO_BRANCH_HARDEN, X86_SYNTH(15)) /* (Disable) Conditional branch hardening */ +diff --git a/xen/include/asm-x86/nospec.h b/xen/include/asm-x86/nospec.h +index 7150e76b87fb..0725839e1982 100644 +--- a/xen/include/asm-x86/nospec.h ++++ b/xen/include/asm-x86/nospec.h +@@ -38,6 +38,32 @@ static always_inline void block_speculation(void) + barrier_nospec_true(); + } + ++static always_inline void arch_block_lock_speculation(void) ++{ ++ alternative("lfence", "", X86_FEATURE_SC_NO_LOCK_HARDEN); ++} ++ ++/* Allow to insert a read memory barrier into conditionals */ ++static always_inline bool barrier_lock_true(void) ++{ ++ alternative("lfence #nospec-true", "", X86_FEATURE_SC_NO_LOCK_HARDEN); ++ return true; ++} ++ ++static always_inline bool barrier_lock_false(void) ++{ ++ alternative("lfence #nospec-false", "", X86_FEATURE_SC_NO_LOCK_HARDEN); ++ return false; ++} ++ ++static always_inline bool arch_lock_evaluate_nospec(bool condition) ++{ ++ if ( condition ) ++ return barrier_lock_true(); ++ else ++ return barrier_lock_false(); ++} ++ + #endif /* _ASM_X86_NOSPEC_H */ + + /* +diff --git a/xen/include/xen/nospec.h b/xen/include/xen/nospec.h +index 76255bc46efe..455284640396 100644 +--- a/xen/include/xen/nospec.h ++++ b/xen/include/xen/nospec.h +@@ -70,6 +70,21 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, + #define array_access_nospec(array, index) \ + (array)[array_index_nospec(index, ARRAY_SIZE(array))] + ++static always_inline void block_lock_speculation(void) ++{ ++#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK ++ arch_block_lock_speculation(); ++#endif ++} ++ ++static always_inline bool lock_evaluate_nospec(bool condition) ++{ ++#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK ++ return 
arch_lock_evaluate_nospec(condition); ++#endif ++ return condition; ++} ++ + #endif /* XEN_NOSPEC_H */ + + /* +diff --git a/xen/include/xen/spinlock.h b/xen/include/xen/spinlock.h +index 9fa4e600c1f7..efdb21ea9072 100644 +--- a/xen/include/xen/spinlock.h ++++ b/xen/include/xen/spinlock.h +@@ -1,6 +1,7 @@ + #ifndef __SPINLOCK_H__ + #define __SPINLOCK_H__ + ++#include <xen/nospec.h> + #include <xen/time.h> + #include <asm/system.h> + #include <asm/spinlock.h> +@@ -189,13 +190,30 @@ int _spin_trylock_recursive(spinlock_t *lock); + void _spin_lock_recursive(spinlock_t *lock); + void _spin_unlock_recursive(spinlock_t *lock); + +-#define spin_lock(l) _spin_lock(l) +-#define spin_lock_cb(l, c, d) _spin_lock_cb(l, c, d) +-#define spin_lock_irq(l) _spin_lock_irq(l) ++static always_inline void spin_lock(spinlock_t *l) ++{ ++ _spin_lock(l); ++ block_lock_speculation(); ++} ++ ++static always_inline void spin_lock_cb(spinlock_t *l, void (*c)(void *data), ++ void *d) ++{ ++ _spin_lock_cb(l, c, d); ++ block_lock_speculation(); ++} ++ ++static always_inline void spin_lock_irq(spinlock_t *l) ++{ ++ _spin_lock_irq(l); ++ block_lock_speculation(); ++} ++ + #define spin_lock_irqsave(l, f) \ + ({ \ + BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \ + ((f) = _spin_lock_irqsave(l)); \ ++ block_lock_speculation(); \ + }) + + #define spin_unlock(l) _spin_unlock(l) +@@ -203,7 +221,7 @@ void _spin_unlock_recursive(spinlock_t *lock); + #define spin_unlock_irqrestore(l, f) _spin_unlock_irqrestore(l, f) + + #define spin_is_locked(l) _spin_is_locked(l) +-#define spin_trylock(l) _spin_trylock(l) ++#define spin_trylock(l) lock_evaluate_nospec(_spin_trylock(l)) + + #define spin_trylock_irqsave(lock, flags) \ + ({ \ +@@ -224,8 +242,15 @@ void _spin_unlock_recursive(spinlock_t *lock); + * are any critical regions that cannot form part of such a set, they can use + * standard spin_[un]lock(). 
+ */ +-#define spin_trylock_recursive(l) _spin_trylock_recursive(l) +-#define spin_lock_recursive(l) _spin_lock_recursive(l) ++#define spin_trylock_recursive(l) \ ++ lock_evaluate_nospec(_spin_trylock_recursive(l)) ++ ++static always_inline void spin_lock_recursive(spinlock_t *l) ++{ ++ _spin_lock_recursive(l); ++ block_lock_speculation(); ++} ++ + #define spin_unlock_recursive(l) _spin_unlock_recursive(l) + + #endif /* __SPINLOCK_H__ */ diff --git a/main/xen/xsa453-4.16-4.patch b/main/xen/xsa453-4.16-4.patch new file mode 100644 index 00000000000..f53cc4703c1 --- /dev/null +++ b/main/xen/xsa453-4.16-4.patch @@ -0,0 +1,113 @@ +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Subject: rwlock: introduce support for blocking speculation into critical + regions + +Introduce inline wrappers as required and add direct calls to +block_lock_speculation() in order to prevent speculation into the rwlock +protected critical regions. + +Note the rwlock primitives are adjusted to use the non speculation safe variants +of the spinlock handlers, as a speculation barrier is added in the rwlock +calling wrappers. + +trylock variants are protected by using lock_evaluate_nospec(). + +This is part of XSA-453 / CVE-2024-2193 + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +(cherry picked from commit a1fb15f61692b1fa9945fc51f55471ace49cdd59) + +diff --git a/xen/common/rwlock.c b/xen/common/rwlock.c +index dadab372b5e1..2464f745485d 100644 +--- a/xen/common/rwlock.c ++++ b/xen/common/rwlock.c +@@ -34,8 +34,11 @@ void queue_read_lock_slowpath(rwlock_t *lock) + + /* + * Put the reader into the wait queue. ++ * ++ * Use the speculation unsafe helper, as it's the caller responsibility to ++ * issue a speculation barrier if required. 
+ */ +- spin_lock(&lock->lock); ++ _spin_lock(&lock->lock); + + /* + * At the head of the wait queue now, wait until the writer state +@@ -64,8 +67,13 @@ void queue_write_lock_slowpath(rwlock_t *lock) + { + u32 cnts; + +- /* Put the writer into the wait queue. */ +- spin_lock(&lock->lock); ++ /* ++ * Put the writer into the wait queue. ++ * ++ * Use the speculation unsafe helper, as it's the caller responsibility to ++ * issue a speculation barrier if required. ++ */ ++ _spin_lock(&lock->lock); + + /* Try to acquire the lock directly if no reader is present. */ + if ( !atomic_read(&lock->cnts) && +diff --git a/xen/include/xen/rwlock.h b/xen/include/xen/rwlock.h +index 0cc9167715b3..fd0458be94ae 100644 +--- a/xen/include/xen/rwlock.h ++++ b/xen/include/xen/rwlock.h +@@ -247,27 +247,49 @@ static inline int _rw_is_write_locked(rwlock_t *lock) + return (atomic_read(&lock->cnts) & _QW_WMASK) == _QW_LOCKED; + } + +-#define read_lock(l) _read_lock(l) +-#define read_lock_irq(l) _read_lock_irq(l) ++static always_inline void read_lock(rwlock_t *l) ++{ ++ _read_lock(l); ++ block_lock_speculation(); ++} ++ ++static always_inline void read_lock_irq(rwlock_t *l) ++{ ++ _read_lock_irq(l); ++ block_lock_speculation(); ++} ++ + #define read_lock_irqsave(l, f) \ + ({ \ + BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \ + ((f) = _read_lock_irqsave(l)); \ ++ block_lock_speculation(); \ + }) + + #define read_unlock(l) _read_unlock(l) + #define read_unlock_irq(l) _read_unlock_irq(l) + #define read_unlock_irqrestore(l, f) _read_unlock_irqrestore(l, f) +-#define read_trylock(l) _read_trylock(l) ++#define read_trylock(l) lock_evaluate_nospec(_read_trylock(l)) ++ ++static always_inline void write_lock(rwlock_t *l) ++{ ++ _write_lock(l); ++ block_lock_speculation(); ++} ++ ++static always_inline void write_lock_irq(rwlock_t *l) ++{ ++ _write_lock_irq(l); ++ block_lock_speculation(); ++} + +-#define write_lock(l) _write_lock(l) +-#define write_lock_irq(l) _write_lock_irq(l) + #define 
write_lock_irqsave(l, f) \ + ({ \ + BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \ + ((f) = _write_lock_irqsave(l)); \ ++ block_lock_speculation(); \ + }) +-#define write_trylock(l) _write_trylock(l) ++#define write_trylock(l) lock_evaluate_nospec(_write_trylock(l)) + + #define write_unlock(l) _write_unlock(l) + #define write_unlock_irq(l) _write_unlock_irq(l) diff --git a/main/xen/xsa453-4.16-5.patch b/main/xen/xsa453-4.16-5.patch new file mode 100644 index 00000000000..94b78eea116 --- /dev/null +++ b/main/xen/xsa453-4.16-5.patch @@ -0,0 +1,75 @@ +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Subject: percpu-rwlock: introduce support for blocking speculation into + critical regions + +Add direct calls to block_lock_speculation() where required in order to prevent +speculation into the lock protected critical regions. Also convert +_percpu_read_lock() from inline to always_inline. + +Note that _percpu_write_lock() has been modified the use the non speculation +safe of the locking primites, as a speculation is added unconditionally by the +calling wrapper. + +This is part of XSA-453 / CVE-2024-2193 + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +(cherry picked from commit f218daf6d3a3b847736d37c6a6b76031a0d08441) + +diff --git a/xen/common/rwlock.c b/xen/common/rwlock.c +index 2464f745485d..703276f4aa63 100644 +--- a/xen/common/rwlock.c ++++ b/xen/common/rwlock.c +@@ -125,8 +125,12 @@ void _percpu_write_lock(percpu_rwlock_t **per_cpudata, + /* + * First take the write lock to protect against other writers or slow + * path readers. ++ * ++ * Note we use the speculation unsafe variant of write_lock(), as the ++ * calling wrapper already adds a speculation barrier after the lock has ++ * been taken. + */ +- write_lock(&percpu_rwlock->rwlock); ++ _write_lock(&percpu_rwlock->rwlock); + + /* Now set the global variable so that readers start using read_lock. 
*/ + percpu_rwlock->writer_activating = 1; +diff --git a/xen/include/xen/rwlock.h b/xen/include/xen/rwlock.h +index fd0458be94ae..abe0804bf7d5 100644 +--- a/xen/include/xen/rwlock.h ++++ b/xen/include/xen/rwlock.h +@@ -326,8 +326,8 @@ static inline void _percpu_rwlock_owner_check(percpu_rwlock_t **per_cpudata, + #define percpu_rwlock_resource_init(l, owner) \ + (*(l) = (percpu_rwlock_t)PERCPU_RW_LOCK_UNLOCKED(&get_per_cpu_var(owner))) + +-static inline void _percpu_read_lock(percpu_rwlock_t **per_cpudata, +- percpu_rwlock_t *percpu_rwlock) ++static always_inline void _percpu_read_lock(percpu_rwlock_t **per_cpudata, ++ percpu_rwlock_t *percpu_rwlock) + { + /* Validate the correct per_cpudata variable has been provided. */ + _percpu_rwlock_owner_check(per_cpudata, percpu_rwlock); +@@ -362,6 +362,8 @@ static inline void _percpu_read_lock(percpu_rwlock_t **per_cpudata, + } + else + { ++ /* Other branch already has a speculation barrier in read_lock(). */ ++ block_lock_speculation(); + /* All other paths have implicit check_lock() calls via read_lock(). 
*/ + check_lock(&percpu_rwlock->rwlock.lock.debug, false); + } +@@ -410,8 +412,12 @@ static inline void _percpu_write_unlock(percpu_rwlock_t **per_cpudata, + _percpu_read_lock(&get_per_cpu_var(percpu), lock) + #define percpu_read_unlock(percpu, lock) \ + _percpu_read_unlock(&get_per_cpu_var(percpu), lock) +-#define percpu_write_lock(percpu, lock) \ +- _percpu_write_lock(&get_per_cpu_var(percpu), lock) ++ ++#define percpu_write_lock(percpu, lock) \ ++({ \ ++ _percpu_write_lock(&get_per_cpu_var(percpu), lock); \ ++ block_lock_speculation(); \ ++}) + #define percpu_write_unlock(percpu, lock) \ + _percpu_write_unlock(&get_per_cpu_var(percpu), lock) + diff --git a/main/xen/xsa453-4.16-6.patch b/main/xen/xsa453-4.16-6.patch new file mode 100644 index 00000000000..317f61823c7 --- /dev/null +++ b/main/xen/xsa453-4.16-6.patch @@ -0,0 +1,382 @@ +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Subject: locking: attempt to ensure lock wrappers are always inline + +In order to prevent the locking speculation barriers from being inside of +`call`ed functions that could be speculatively bypassed. + +While there also add an extra locking barrier to _mm_write_lock() in the branch +taken when the lock is already held. + +Note some functions are switched to use the unsafe variants (without speculation +barrier) of the locking primitives, but a speculation barrier is always added +to the exposed public lock wrapping helper. That's the case with +sched_spin_lock_double() or pcidevs_lock() for example. 
+ +This is part of XSA-453 / CVE-2024-2193 + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +(cherry picked from commit 197ecd838a2aaf959a469df3696d4559c4f8b762) + +diff --git a/xen/arch/x86/hvm/vpt.c b/xen/arch/x86/hvm/vpt.c +index 6fdc3e19fe8c..dd2de574cf18 100644 +--- a/xen/arch/x86/hvm/vpt.c ++++ b/xen/arch/x86/hvm/vpt.c +@@ -161,7 +161,7 @@ static int pt_irq_masked(struct periodic_time *pt) + * pt->vcpu field, because another thread holding the pt_migrate lock + * may already be spinning waiting for your vcpu lock. + */ +-static void pt_vcpu_lock(struct vcpu *v) ++static always_inline void pt_vcpu_lock(struct vcpu *v) + { + spin_lock(&v->arch.hvm.tm_lock); + } +@@ -180,9 +180,13 @@ static void pt_vcpu_unlock(struct vcpu *v) + * need to take an additional lock that protects against pt->vcpu + * changing. + */ +-static void pt_lock(struct periodic_time *pt) ++static always_inline void pt_lock(struct periodic_time *pt) + { +- read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); ++ /* ++ * Use the speculation unsafe variant for the first lock, as the following ++ * lock taking helper already includes a speculation barrier. 
++ */ ++ _read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); + spin_lock(&pt->vcpu->arch.hvm.tm_lock); + } + +diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h +index d6c073dc5cf5..cc635a440571 100644 +--- a/xen/arch/x86/mm/mm-locks.h ++++ b/xen/arch/x86/mm/mm-locks.h +@@ -88,8 +88,8 @@ static inline void _set_lock_level(int l) + this_cpu(mm_lock_level) = l; + } + +-static inline void _mm_lock(const struct domain *d, mm_lock_t *l, +- const char *func, int level, int rec) ++static always_inline void _mm_lock(const struct domain *d, mm_lock_t *l, ++ const char *func, int level, int rec) + { + if ( !((mm_locked_by_me(l)) && rec) ) + _check_lock_level(d, level); +@@ -139,8 +139,8 @@ static inline int mm_write_locked_by_me(mm_rwlock_t *l) + return (l->locker == get_processor_id()); + } + +-static inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l, +- const char *func, int level) ++static always_inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l, ++ const char *func, int level) + { + if ( !mm_write_locked_by_me(l) ) + { +@@ -151,6 +151,8 @@ static inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l, + l->unlock_level = _get_lock_level(); + _set_lock_level(_lock_level(d, level)); + } ++ else ++ block_speculation(); + l->recurse_count++; + } + +@@ -164,8 +166,8 @@ static inline void mm_write_unlock(mm_rwlock_t *l) + percpu_write_unlock(p2m_percpu_rwlock, &l->lock); + } + +-static inline void _mm_read_lock(const struct domain *d, mm_rwlock_t *l, +- int level) ++static always_inline void _mm_read_lock(const struct domain *d, mm_rwlock_t *l, ++ int level) + { + _check_lock_level(d, level); + percpu_read_lock(p2m_percpu_rwlock, &l->lock); +@@ -180,15 +182,15 @@ static inline void mm_read_unlock(mm_rwlock_t *l) + + /* This wrapper uses the line number to express the locking order below */ + #define declare_mm_lock(name) \ +- static inline void mm_lock_##name(const struct domain *d, mm_lock_t *l, \ +- const 
char *func, int rec) \ ++ static always_inline void mm_lock_##name( \ ++ const struct domain *d, mm_lock_t *l, const char *func, int rec) \ + { _mm_lock(d, l, func, MM_LOCK_ORDER_##name, rec); } + #define declare_mm_rwlock(name) \ +- static inline void mm_write_lock_##name(const struct domain *d, \ +- mm_rwlock_t *l, const char *func) \ ++ static always_inline void mm_write_lock_##name( \ ++ const struct domain *d, mm_rwlock_t *l, const char *func) \ + { _mm_write_lock(d, l, func, MM_LOCK_ORDER_##name); } \ +- static inline void mm_read_lock_##name(const struct domain *d, \ +- mm_rwlock_t *l) \ ++ static always_inline void mm_read_lock_##name(const struct domain *d, \ ++ mm_rwlock_t *l) \ + { _mm_read_lock(d, l, MM_LOCK_ORDER_##name); } + /* These capture the name of the calling function */ + #define mm_lock(name, d, l) mm_lock_##name(d, l, __func__, 0) +@@ -321,7 +323,7 @@ declare_mm_lock(altp2mlist) + #define MM_LOCK_ORDER_altp2m 40 + declare_mm_rwlock(altp2m); + +-static inline void p2m_lock(struct p2m_domain *p) ++static always_inline void p2m_lock(struct p2m_domain *p) + { + if ( p2m_is_altp2m(p) ) + mm_write_lock(altp2m, p->domain, &p->lock); +diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c +index a3c9d8a97423..c82628840864 100644 +--- a/xen/arch/x86/mm/p2m-pod.c ++++ b/xen/arch/x86/mm/p2m-pod.c +@@ -35,7 +35,7 @@ + #define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0) + + /* Enforce lock ordering when grabbing the "external" page_alloc lock */ +-static inline void lock_page_alloc(struct p2m_domain *p2m) ++static always_inline void lock_page_alloc(struct p2m_domain *p2m) + { + page_alloc_mm_pre_lock(p2m->domain); + spin_lock(&(p2m->domain->page_alloc_lock)); +diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c +index da88ad141a69..e5f4e68b8819 100644 +--- a/xen/common/event_channel.c ++++ b/xen/common/event_channel.c +@@ -57,7 +57,7 @@ + * just assume the event channel is free or unbound at the moment when the + * 
evtchn_read_trylock() returns false. + */ +-static inline void evtchn_write_lock(struct evtchn *evtchn) ++static always_inline void evtchn_write_lock(struct evtchn *evtchn) + { + write_lock(&evtchn->lock); + +@@ -324,7 +324,8 @@ static int evtchn_alloc_unbound(evtchn_alloc_unbound_t *alloc) + return rc; + } + +-static void double_evtchn_lock(struct evtchn *lchn, struct evtchn *rchn) ++static always_inline void double_evtchn_lock(struct evtchn *lchn, ++ struct evtchn *rchn) + { + ASSERT(lchn != rchn); + +diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c +index 76272b3c8add..9464cebdd6e4 100644 +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -398,7 +398,7 @@ static inline void act_set_gfn(struct active_grant_entry *act, gfn_t gfn) + + static DEFINE_PERCPU_RWLOCK_GLOBAL(grant_rwlock); + +-static inline void grant_read_lock(struct grant_table *gt) ++static always_inline void grant_read_lock(struct grant_table *gt) + { + percpu_read_lock(grant_rwlock, &gt->lock); + } +@@ -408,7 +408,7 @@ static inline void grant_read_unlock(struct grant_table *gt) + percpu_read_unlock(grant_rwlock, &gt->lock); + } + +-static inline void grant_write_lock(struct grant_table *gt) ++static always_inline void grant_write_lock(struct grant_table *gt) + { + percpu_write_lock(grant_rwlock, &gt->lock); + } +@@ -445,7 +445,7 @@ nr_active_grant_frames(struct grant_table *gt) + return num_act_frames_from_sha_frames(nr_grant_frames(gt)); + } + +-static inline struct active_grant_entry * ++static always_inline struct active_grant_entry * + active_entry_acquire(struct grant_table *t, grant_ref_t e) + { + struct active_grant_entry *act; +diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c +index 03ace41540d6..9e80ad4c7463 100644 +--- a/xen/common/sched/core.c ++++ b/xen/common/sched/core.c +@@ -348,23 +348,28 @@ uint64_t get_cpu_idle_time(unsigned int cpu) + * This avoids dead- or live-locks when this code is running on both + * cpus at the same time.
+ */ +-static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2, +- unsigned long *flags) ++static always_inline void sched_spin_lock_double( ++ spinlock_t *lock1, spinlock_t *lock2, unsigned long *flags) + { ++ /* ++ * In order to avoid extra overhead, use the locking primitives without the ++ * speculation barrier, and introduce a single barrier here. ++ */ + if ( lock1 == lock2 ) + { +- spin_lock_irqsave(lock1, *flags); ++ *flags = _spin_lock_irqsave(lock1); + } + else if ( lock1 < lock2 ) + { +- spin_lock_irqsave(lock1, *flags); +- spin_lock(lock2); ++ *flags = _spin_lock_irqsave(lock1); ++ _spin_lock(lock2); + } + else + { +- spin_lock_irqsave(lock2, *flags); +- spin_lock(lock1); ++ *flags = _spin_lock_irqsave(lock2); ++ _spin_lock(lock1); + } ++ block_lock_speculation(); + } + + static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2, +diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h +index 0527a8c70d1c..24a93dd0c123 100644 +--- a/xen/common/sched/private.h ++++ b/xen/common/sched/private.h +@@ -207,8 +207,24 @@ DECLARE_PER_CPU(cpumask_t, cpumask_scratch); + #define cpumask_scratch (&this_cpu(cpumask_scratch)) + #define cpumask_scratch_cpu(c) (&per_cpu(cpumask_scratch, c)) + ++/* ++ * Deal with _spin_lock_irqsave() returning the flags value instead of storing ++ * it in a passed parameter. ++ */ ++#define _sched_spinlock0(lock, irq) _spin_lock##irq(lock) ++#define _sched_spinlock1(lock, irq, arg) ({ \ ++ BUILD_BUG_ON(sizeof(arg) != sizeof(unsigned long)); \ ++ (arg) = _spin_lock##irq(lock); \ ++}) ++ ++#define _sched_spinlock__(nr) _sched_spinlock ## nr ++#define _sched_spinlock_(nr) _sched_spinlock__(nr) ++#define _sched_spinlock(lock, irq, args...) \ ++ _sched_spinlock_(count_args(args))(lock, irq, ## args) ++ + #define sched_lock(kind, param, cpu, irq, arg...) 
\ +-static inline spinlock_t *kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \ ++static always_inline spinlock_t \ ++*kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \ + { \ + for ( ; ; ) \ + { \ +@@ -220,10 +236,16 @@ static inline spinlock_t *kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \ + * \ + * It may also be the case that v->processor may change but the \ + * lock may be the same; this will succeed in that case. \ ++ * \ ++ * Use the speculation unsafe locking helper, there's a speculation \ ++ * barrier before returning to the caller. \ + */ \ +- spin_lock##irq(lock, ## arg); \ ++ _sched_spinlock(lock, irq, ## arg); \ + if ( likely(lock == get_sched_res(cpu)->schedule_lock) ) \ ++ { \ ++ block_lock_speculation(); \ + return lock; \ ++ } \ + spin_unlock##irq(lock, ## arg); \ + } \ + } +diff --git a/xen/common/timer.c b/xen/common/timer.c +index 1bb265ceea0e..dc831efc79e5 100644 +--- a/xen/common/timer.c ++++ b/xen/common/timer.c +@@ -240,7 +240,7 @@ static inline void deactivate_timer(struct timer *timer) + list_add(&timer->inactive, &per_cpu(timers, timer->cpu).inactive); + } + +-static inline bool_t timer_lock(struct timer *timer) ++static inline bool_t timer_lock_unsafe(struct timer *timer) + { + unsigned int cpu; + +@@ -254,7 +254,8 @@ static inline bool_t timer_lock(struct timer *timer) + rcu_read_unlock(&timer_cpu_read_lock); + return 0; + } +- spin_lock(&per_cpu(timers, cpu).lock); ++ /* Use the speculation unsafe variant, the wrapper has the barrier. 
*/ ++ _spin_lock(&per_cpu(timers, cpu).lock); + if ( likely(timer->cpu == cpu) ) + break; + spin_unlock(&per_cpu(timers, cpu).lock); +@@ -267,8 +268,9 @@ static inline bool_t timer_lock(struct timer *timer) + #define timer_lock_irqsave(t, flags) ({ \ + bool_t __x; \ + local_irq_save(flags); \ +- if ( !(__x = timer_lock(t)) ) \ ++ if ( !(__x = timer_lock_unsafe(t)) ) \ + local_irq_restore(flags); \ ++ block_lock_speculation(); \ + __x; \ + }) + +diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c +index 6fc27e7ede40..2fd663062ad5 100644 +--- a/xen/drivers/passthrough/pci.c ++++ b/xen/drivers/passthrough/pci.c +@@ -52,9 +52,10 @@ struct pci_seg { + + static spinlock_t _pcidevs_lock = SPIN_LOCK_UNLOCKED; + +-void pcidevs_lock(void) ++/* Do not use, as it has no speculation barrier, use pcidevs_lock() instead. */ ++void pcidevs_lock_unsafe(void) + { +- spin_lock_recursive(&_pcidevs_lock); ++ _spin_lock_recursive(&_pcidevs_lock); + } + + void pcidevs_unlock(void) +diff --git a/xen/include/asm-x86/irq.h b/xen/include/asm-x86/irq.h +index 7c825e9d9c0a..d4b2beda798d 100644 +--- a/xen/include/asm-x86/irq.h ++++ b/xen/include/asm-x86/irq.h +@@ -177,6 +177,7 @@ extern void irq_complete_move(struct irq_desc *); + + extern struct irq_desc *irq_desc; + ++/* Not speculation safe, only used for AP bringup. 
*/ + void lock_vector_lock(void); + void unlock_vector_lock(void); + +diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h +index 21c95e14fd6a..18924e69e7d0 100644 +--- a/xen/include/xen/event.h ++++ b/xen/include/xen/event.h +@@ -105,12 +105,12 @@ void notify_via_xen_event_channel(struct domain *ld, int lport); + #define bucket_from_port(d, p) \ + ((group_from_port(d, p))[((p) % EVTCHNS_PER_GROUP) / EVTCHNS_PER_BUCKET]) + +-static inline void evtchn_read_lock(struct evtchn *evtchn) ++static always_inline void evtchn_read_lock(struct evtchn *evtchn) + { + read_lock(&evtchn->lock); + } + +-static inline bool evtchn_read_trylock(struct evtchn *evtchn) ++static always_inline bool evtchn_read_trylock(struct evtchn *evtchn) + { + return read_trylock(&evtchn->lock); + } +diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h +index ac3880e686f8..3f1324e5de92 100644 +--- a/xen/include/xen/pci.h ++++ b/xen/include/xen/pci.h +@@ -147,8 +147,12 @@ struct pci_dev { + * devices, it also sync the access to the msi capability that is not + * interrupt handling related (the mask bit register). + */ +- +-void pcidevs_lock(void); ++void pcidevs_lock_unsafe(void); ++static always_inline void pcidevs_lock(void) ++{ ++ pcidevs_lock_unsafe(); ++ block_lock_speculation(); ++} + void pcidevs_unlock(void); + bool_t __must_check pcidevs_locked(void); + diff --git a/main/xen/xsa453-4.16-7.patch b/main/xen/xsa453-4.16-7.patch new file mode 100644 index 00000000000..8a32529d99e --- /dev/null +++ b/main/xen/xsa453-4.16-7.patch @@ -0,0 +1,61 @@ +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Subject: x86/mm: add speculation barriers to open coded locks + +Add a speculation barrier to the clearly identified open-coded lock taking +functions. + +Note that the memory sharing page_lock() replacement (_page_lock()) is left +as-is, as the code is experimental and not security supported. 
+ +This is part of XSA-453 / CVE-2024-2193 + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +(cherry picked from commit 42a572a38e22a97d86a4b648a22597628d5b42e4) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index ea024c145034..2bf1b709851a 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -2005,7 +2005,7 @@ static inline bool current_locked_page_ne_check(struct page_info *page) { + #define current_locked_page_ne_check(x) true + #endif + +-int page_lock(struct page_info *page) ++int page_lock_unsafe(struct page_info *page) + { + unsigned long x, nx; + +@@ -2066,7 +2066,7 @@ void page_unlock(struct page_info *page) + * l3t_lock(), so to avoid deadlock we must avoid grabbing them in + * reverse order. + */ +-static void l3t_lock(struct page_info *page) ++static always_inline void l3t_lock(struct page_info *page) + { + unsigned long x, nx; + +@@ -2075,6 +2075,8 @@ static void l3t_lock(struct page_info *page) + cpu_relax(); + nx = x | PGT_locked; + } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x ); ++ ++ block_lock_speculation(); + } + + static void l3t_unlock(struct page_info *page) +diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h +index cccef852b4de..73d5a98bec7e 100644 +--- a/xen/include/asm-x86/mm.h ++++ b/xen/include/asm-x86/mm.h +@@ -393,7 +393,9 @@ const struct platform_bad_page *get_platform_badpages(unsigned int *array_size); + * The use of PGT_locked in mem_sharing does not collide, since mem_sharing is + * only supported for hvm guests, which do not have PV PTEs updated. 
+ */ +-int page_lock(struct page_info *page); ++int page_lock_unsafe(struct page_info *page); ++#define page_lock(pg) lock_evaluate_nospec(page_lock_unsafe(pg)) ++ + void page_unlock(struct page_info *page); + + void put_page_type(struct page_info *page); diff --git a/main/xen/xsa453-4.16-8.patch b/main/xen/xsa453-4.16-8.patch new file mode 100644 index 00000000000..9a134fbcab5 --- /dev/null +++ b/main/xen/xsa453-4.16-8.patch @@ -0,0 +1,201 @@ +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Subject: x86: protect conditional lock taking from speculative execution + +Conditionally taken locks that use the pattern: + +if ( lock ) + spin_lock(...); + +Need an else branch in order to issue an speculation barrier in the else case, +just like it's done in case the lock needs to be acquired. + +eval_nospec() could be used on the condition itself, but that would result in a +double barrier on the branch where the lock is taken. + +Introduce a new pair of helpers, {gfn,spin}_lock_if() that can be used to +conditionally take a lock in a speculation safe way. 
+ +This is part of XSA-453 / CVE-2024-2193 + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +(cherry picked from commit 03cf7ca23e0e876075954c558485b267b7d02406) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 2bf1b709851a..16287e62af23 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -5000,8 +5000,7 @@ static l3_pgentry_t *virt_to_xen_l3e(unsigned long v) + if ( !l3t ) + return NULL; + UNMAP_DOMAIN_PAGE(l3t); +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) ) + { + l4_pgentry_t l4e = l4e_from_mfn(l3mfn, __PAGE_HYPERVISOR); +@@ -5038,8 +5037,7 @@ static l2_pgentry_t *virt_to_xen_l2e(unsigned long v) + return NULL; + } + UNMAP_DOMAIN_PAGE(l2t); +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) + { + l3e_write(pl3e, l3e_from_mfn(l2mfn, __PAGE_HYPERVISOR)); +@@ -5077,8 +5075,7 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v) + return NULL; + } + UNMAP_DOMAIN_PAGE(l1t); +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ) + { + l2e_write(pl2e, l2e_from_mfn(l1mfn, __PAGE_HYPERVISOR)); +@@ -5109,6 +5106,8 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v) + do { \ + if ( locking ) \ + l3t_lock(page); \ ++ else \ ++ block_lock_speculation(); \ + } while ( false ) + + #define L3T_UNLOCK(page) \ +@@ -5324,8 +5323,7 @@ int map_pages_to_xen( + if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL ) + flush_flags |= FLUSH_TLB_GLOBAL; + +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) && + (l3e_get_flags(*pl3e) & _PAGE_PSE) ) + { +@@ -5429,8 +5427,7 @@ int map_pages_to_xen( + if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL ) + flush_flags |= FLUSH_TLB_GLOBAL; + +- if ( 
locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) && + (l2e_get_flags(*pl2e) & _PAGE_PSE) ) + { +@@ -5471,8 +5468,7 @@ int map_pages_to_xen( + unsigned long base_mfn; + const l1_pgentry_t *l1t; + +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + + ol2e = *pl2e; + /* +@@ -5526,8 +5522,7 @@ int map_pages_to_xen( + unsigned long base_mfn; + const l2_pgentry_t *l2t; + +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + + ol3e = *pl3e; + /* +@@ -5671,8 +5666,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) + l3e_get_flags(*pl3e))); + UNMAP_DOMAIN_PAGE(l2t); + +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) && + (l3e_get_flags(*pl3e) & _PAGE_PSE) ) + { +@@ -5731,8 +5725,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) + l2e_get_flags(*pl2e) & ~_PAGE_PSE)); + UNMAP_DOMAIN_PAGE(l1t); + +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) && + (l2e_get_flags(*pl2e) & _PAGE_PSE) ) + { +@@ -5776,8 +5769,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) + */ + if ( (nf & _PAGE_PRESENT) || ((v != e) && (l1_table_offset(v) != 0)) ) + continue; +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + + /* + * L2E may be already cleared, or set to a superpage, by +@@ -5824,8 +5816,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) + if ( (nf & _PAGE_PRESENT) || + ((v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0)) ) + continue; +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + + /* + * L3E may be already cleared, or set to a superpage, by 
+diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h +index cc635a440571..7eee233b4cef 100644 +--- a/xen/arch/x86/mm/mm-locks.h ++++ b/xen/arch/x86/mm/mm-locks.h +@@ -347,6 +347,15 @@ static inline void p2m_unlock(struct p2m_domain *p) + #define p2m_locked_by_me(p) mm_write_locked_by_me(&(p)->lock) + #define gfn_locked_by_me(p,g) p2m_locked_by_me(p) + ++static always_inline void gfn_lock_if(bool condition, struct p2m_domain *p2m, ++ gfn_t gfn, unsigned int order) ++{ ++ if ( condition ) ++ gfn_lock(p2m, gfn, order); ++ else ++ block_lock_speculation(); ++} ++ + /* PoD lock (per-p2m-table) + * + * Protects private PoD data structs: entry and cache +diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c +index 2d41446a6902..ddd2f861c3c7 100644 +--- a/xen/arch/x86/mm/p2m.c ++++ b/xen/arch/x86/mm/p2m.c +@@ -514,9 +514,8 @@ mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn_l, + if ( q & P2M_UNSHARE ) + q |= P2M_ALLOC; + +- if ( locked ) +- /* Grab the lock here, don't release until put_gfn */ +- gfn_lock(p2m, gfn, 0); ++ /* Grab the lock here, don't release until put_gfn */ ++ gfn_lock_if(locked, p2m, gfn, 0); + + mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL); + +diff --git a/xen/include/xen/spinlock.h b/xen/include/xen/spinlock.h +index efdb21ea9072..8bffb3f4b610 100644 +--- a/xen/include/xen/spinlock.h ++++ b/xen/include/xen/spinlock.h +@@ -216,6 +216,14 @@ static always_inline void spin_lock_irq(spinlock_t *l) + block_lock_speculation(); \ + }) + ++/* Conditionally take a spinlock in a speculation safe way. */ ++static always_inline void spin_lock_if(bool condition, spinlock_t *l) ++{ ++ if ( condition ) ++ _spin_lock(l); ++ block_lock_speculation(); ++} ++ + #define spin_unlock(l) _spin_unlock(l) + #define spin_unlock_irq(l) _spin_unlock_irq(l) + #define spin_unlock_irqrestore(l, f) _spin_unlock_irqrestore(l, f) |