summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-12-20 11:31:37 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2025-12-20 11:31:37 -0800
commit072c0b4f0f9597c86ddb01fd39e784fda6b7a922 (patch)
treeb5e0fbd24fe85139c22b24192b764d56dfe1b1e4
parent255a918a943ba17bbd8801f5eb56c9083d1def7e (diff)
parent0499add8efd72456514c6218c062911ccc922a99 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull x86 kvm fixes from Paolo Bonzini: "x86 fixes. Everyone else is already in holiday mood apparently. - Add a missing 'break' to fix param parsing in the rseq selftest - Apply runtime updates to the _current_ CPUID when userspace is setting CPUID, e.g. as part of vCPU hotplug, to fix a false positive and to avoid dropping the pending update - Disallow toggling KVM_MEM_GUEST_MEMFD on an existing memslot, as it's not supported by KVM and leads to a use-after-free due to KVM failing to unbind the memslot from the previously-associated guest_memfd instance - Harden against similar KVM_MEM_GUEST_MEMFD goofs, and prepare for supporting flags-only changes on KVM_MEM_GUEST_MEMFD memlslots, e.g. for dirty logging - Set exit_code[63:32] to -1 (all 0xffs) when synthesizing a nested SVM_EXIT_ERR (a.k.a. VMEXIT_INVALID) #VMEXIT, as VMEXIT_INVALID is defined as -1ull (a 64-bit value) - Update SVI when activating APICv to fix a bug where a post-activation EOI for an in-service IRQ would effective be lost due to SVI being stale - Immediately refresh APICv controls (if necessary) on a nested VM-Exit instead of deferring the update via KVM_REQ_APICV_UPDATE, as the request is effectively ignored because KVM thinks the vCPU already has the correct APICv settings" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: KVM: nVMX: Immediately refresh APICv controls as needed on nested VM-Exit KVM: VMX: Update SVI during runtime APICv activation KVM: nSVM: Set exit_code_hi to -1 when synthesizing SVM_EXIT_ERR (failed VMRUN) KVM: nSVM: Clear exit_code_hi in VMCB when synthesizing nested VM-Exits KVM: Harden and prepare for modifying existing guest_memfd memslots KVM: Disallow toggling KVM_MEM_GUEST_MEMFD on an existing memslot KVM: selftests: Add a CPUID testcase for KVM_SET_CPUID2 with runtime updates KVM: x86: Apply runtime updates to current CPUID during KVM_SET_CPUID{,2} KVM: selftests: Add missing "break" in rseq_test's param parsing
-rw-r--r--arch/x86/kvm/cpuid.c11
-rw-r--r--arch/x86/kvm/svm/nested.c4
-rw-r--r--arch/x86/kvm/svm/svm.c2
-rw-r--r--arch/x86/kvm/svm/svm.h7
-rw-r--r--arch/x86/kvm/vmx/nested.c3
-rw-r--r--arch/x86/kvm/vmx/vmx.c9
-rw-r--r--arch/x86/kvm/x86.c7
-rw-r--r--tools/testing/selftests/kvm/rseq_test.c1
-rw-r--r--tools/testing/selftests/kvm/x86/cpuid_test.c15
-rw-r--r--virt/kvm/kvm_main.c17
10 files changed, 58 insertions, 18 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index d563a948318b..88a5426674a1 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -510,10 +510,17 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
int r;
/*
+ * Apply pending runtime CPUID updates to the current CPUID entries to
+ * avoid false positives due to mismatches on KVM-owned feature flags.
+ */
+ if (vcpu->arch.cpuid_dynamic_bits_dirty)
+ kvm_update_cpuid_runtime(vcpu);
+
+ /*
* Swap the existing (old) entries with the incoming (new) entries in
* order to massage the new entries, e.g. to account for dynamic bits
- * that KVM controls, without clobbering the current guest CPUID, which
- * KVM needs to preserve in order to unwind on failure.
+ * that KVM controls, without losing the current guest CPUID, which KVM
+ * needs to preserve in order to unwind on failure.
*
* Similarly, save the vCPU's current cpu_caps so that the capabilities
* can be updated alongside the CPUID entries when performing runtime
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index c81005b24522..ba0f11c68372 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -985,7 +985,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
if (!nested_vmcb_check_save(vcpu) ||
!nested_vmcb_check_controls(vcpu)) {
vmcb12->control.exit_code = SVM_EXIT_ERR;
- vmcb12->control.exit_code_hi = 0;
+ vmcb12->control.exit_code_hi = -1u;
vmcb12->control.exit_info_1 = 0;
vmcb12->control.exit_info_2 = 0;
goto out;
@@ -1018,7 +1018,7 @@ out_exit_err:
svm->soft_int_injected = false;
svm->vmcb->control.exit_code = SVM_EXIT_ERR;
- svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_code_hi = -1u;
svm->vmcb->control.exit_info_1 = 0;
svm->vmcb->control.exit_info_2 = 0;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index f56c2d895011..24d59ccfa40d 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2443,6 +2443,7 @@ static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
if (cr0 ^ val) {
svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+ svm->vmcb->control.exit_code_hi = 0;
ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
}
@@ -4617,6 +4618,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
if (static_cpu_has(X86_FEATURE_NRIPS))
vmcb->control.next_rip = info->next_rip;
vmcb->control.exit_code = icpt_info.exit_code;
+ vmcb->control.exit_code_hi = 0;
vmexit = nested_svm_exit_handled(svm);
ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 9e151dbdef25..01be93a53d07 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -761,9 +761,10 @@ int nested_svm_vmexit(struct vcpu_svm *svm);
static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code)
{
- svm->vmcb->control.exit_code = exit_code;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
+ svm->vmcb->control.exit_code = exit_code;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
return nested_svm_vmexit(svm);
}
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 40777278eabb..6137e5307d0f 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -19,6 +19,7 @@
#include "trace.h"
#include "vmx.h"
#include "smm.h"
+#include "x86_ops.h"
static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
@@ -5165,7 +5166,7 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
if (vmx->nested.update_vmcs01_apicv_status) {
vmx->nested.update_vmcs01_apicv_status = false;
- kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ vmx_refresh_apicv_exec_ctrl(vcpu);
}
if (vmx->nested.update_vmcs01_hwapic_isr) {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4cbe8c84b636..6b96f7aea20b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6937,15 +6937,6 @@ void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
* VM-Exit, otherwise L1 with run with a stale SVI.
*/
if (is_guest_mode(vcpu)) {
- /*
- * KVM is supposed to forward intercepted L2 EOIs to L1 if VID
- * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC.
- * Note, userspace can stuff state while L2 is active; assert
- * that VID is disabled if and only if the vCPU is in KVM_RUN
- * to avoid false positives if userspace is setting APIC state.
- */
- WARN_ON_ONCE(vcpu->wants_to_run &&
- nested_cpu_has_vid(get_vmcs12(vcpu)));
to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true;
return;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0c6d899d53dd..ff8812f3a129 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10886,9 +10886,16 @@ void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
* pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
* still active when the interrupt got accepted. Make sure
* kvm_check_and_inject_events() is called to check for that.
+ *
+ * Update SVI when APICv gets enabled, otherwise SVI won't reflect the
+ * highest bit in vISR and the next accelerated EOI in the guest won't
+ * be virtualized correctly (the CPU uses SVI to determine which vISR
+ * vector to clear).
*/
if (!apic->apicv_active)
kvm_make_request(KVM_REQ_EVENT, vcpu);
+ else
+ kvm_apic_update_hwapic_isr(vcpu);
out:
preempt_enable();
diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c
index 1375fca80bcd..f80ad6b47d16 100644
--- a/tools/testing/selftests/kvm/rseq_test.c
+++ b/tools/testing/selftests/kvm/rseq_test.c
@@ -215,6 +215,7 @@ int main(int argc, char *argv[])
switch (opt) {
case 'u':
skip_sanity_check = true;
+ break;
case 'l':
latency = atoi_paranoid(optarg);
break;
diff --git a/tools/testing/selftests/kvm/x86/cpuid_test.c b/tools/testing/selftests/kvm/x86/cpuid_test.c
index 7b3fda6842bc..f9ed14996977 100644
--- a/tools/testing/selftests/kvm/x86/cpuid_test.c
+++ b/tools/testing/selftests/kvm/x86/cpuid_test.c
@@ -155,6 +155,7 @@ struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct
static void set_cpuid_after_run(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *ent;
+ struct kvm_sregs sregs;
int rc;
u32 eax, ebx, x;
@@ -162,6 +163,20 @@ static void set_cpuid_after_run(struct kvm_vcpu *vcpu)
rc = __vcpu_set_cpuid(vcpu);
TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc);
+ /*
+ * Toggle CR4 bits that affect dynamic CPUID feature flags to verify
+ * setting unmodified CPUID succeeds with runtime CPUID updates.
+ */
+ vcpu_sregs_get(vcpu, &sregs);
+ if (kvm_cpu_has(X86_FEATURE_XSAVE))
+ sregs.cr4 ^= X86_CR4_OSXSAVE;
+ if (kvm_cpu_has(X86_FEATURE_PKU))
+ sregs.cr4 ^= X86_CR4_PKE;
+ vcpu_sregs_set(vcpu, &sregs);
+
+ rc = __vcpu_set_cpuid(vcpu);
+ TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc);
+
/* Changing CPU features is forbidden */
ent = vcpu_get_cpuid_entry(vcpu, 0x7);
ebx = ent->ebx;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5fcd401a5897..5b5b69c97665 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1749,6 +1749,12 @@ static void kvm_commit_memory_region(struct kvm *kvm,
kvm_free_memslot(kvm, old);
break;
case KVM_MR_MOVE:
+ /*
+ * Moving a guest_memfd memslot isn't supported, and will never
+ * be supported.
+ */
+ WARN_ON_ONCE(old->flags & KVM_MEM_GUEST_MEMFD);
+ fallthrough;
case KVM_MR_FLAGS_ONLY:
/*
* Free the dirty bitmap as needed; the below check encompasses
@@ -1758,6 +1764,15 @@ static void kvm_commit_memory_region(struct kvm *kvm,
kvm_destroy_dirty_bitmap(old);
/*
+ * Unbind the guest_memfd instance as needed; the @new slot has
+ * already created its own binding. TODO: Drop the WARN when
+ * dirty logging guest_memfd memslots is supported. Until then,
+ * flags-only changes on guest_memfd slots should be impossible.
+ */
+ if (WARN_ON_ONCE(old->flags & KVM_MEM_GUEST_MEMFD))
+ kvm_gmem_unbind(old);
+
+ /*
* The final quirk. Free the detached, old slot, but only its
* memory, not any metadata. Metadata, including arch specific
* data, may be reused by @new.
@@ -2086,7 +2101,7 @@ static int kvm_set_memory_region(struct kvm *kvm,
return -EINVAL;
if ((mem->userspace_addr != old->userspace_addr) ||
(npages != old->npages) ||
- ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
+ ((mem->flags ^ old->flags) & (KVM_MEM_READONLY | KVM_MEM_GUEST_MEMFD)))
return -EINVAL;
if (base_gfn != old->base_gfn)