From 1b9439c933b500cb24710bbd81fe56e9b0025b6f Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 24 Nov 2025 15:04:27 +1000 Subject: KVM: selftests: Add missing "break" in rseq_test's param parsing In commit 0297cdc12a87 ("KVM: selftests: Add option to rseq test to override /dev/cpu_dma_latency"), a 'break' is missed before the option 'l' in the argument parsing loop, which leads to an unexpected core dump in atoi_paranoid(). It tries to get the latency from non-existent argument. host$ ./rseq_test -u Random seed: 0x6b8b4567 Segmentation fault (core dumped) Add a 'break' before the option 'l' in the argument parsing loop to avoid the unexpected core dump. Fixes: 0297cdc12a87 ("KVM: selftests: Add option to rseq test to override /dev/cpu_dma_latency") Cc: stable@vger.kernel.org # v6.15+ Signed-off-by: Gavin Shan Link: https://patch.msgid.link/20251124050427.1924591-1-gshan@redhat.com [sean: describe code change in shortlog] Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/rseq_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c index 1375fca80bcd..f80ad6b47d16 100644 --- a/tools/testing/selftests/kvm/rseq_test.c +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -215,6 +215,7 @@ int main(int argc, char *argv[]) switch (opt) { case 'u': skip_sanity_check = true; + break; case 'l': latency = atoi_paranoid(optarg); break; -- cgit v1.2.3 From e2b43fb25243d502ad36b07bab9de09f4b76fff9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 1 Dec 2025 17:50:48 -0800 Subject: KVM: x86: Apply runtime updates to current CPUID during KVM_SET_CPUID{,2} When handling KVM_SET_CPUID{,2}, do runtime CPUID updates on the vCPU's current CPUID (and caps) prior to swapping in the incoming CPUID state so that KVM doesn't lose pending updates if the incoming CPUID is rejected, and to prevent a false failure on the equality check. Note, runtime updates are unconditionally performed on the incoming/new CPUID (and associated caps), i.e. clearing the dirty flag won't negatively affect the new CPUID. Fixes: 93da6af3ae56 ("KVM: x86: Defer runtime updates of dynamic CPUID bits until CPUID emulation") Reported-by: Igor Mammedov Closes: https://lore.kernel.org/all/20251128123202.68424a95@imammedo Cc: stable@vger.kernel.org Acked-by: Igor Mammedov Tested-by: Igor Mammedov Link: https://patch.msgid.link/20251202015049.1167490-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 52524e0ca97f..913ffb995279 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -509,11 +509,18 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, u32 vcpu_caps[NR_KVM_CPU_CAPS]; int r; + /* + * Apply pending runtime CPUID updates to the current CPUID entries to + * avoid false positives due to mismatches on KVM-owned feature flags. + */ + if (vcpu->arch.cpuid_dynamic_bits_dirty) + kvm_update_cpuid_runtime(vcpu); + /* * Swap the existing (old) entries with the incoming (new) entries in * order to massage the new entries, e.g. to account for dynamic bits - * that KVM controls, without clobbering the current guest CPUID, which - * KVM needs to preserve in order to unwind on failure. + * that KVM controls, without losing the current guest CPUID, which KVM + * needs to preserve in order to unwind on failure. * * Similarly, save the vCPU's current cpu_caps so that the capabilities * can be updated alongside the CPUID entries when performing runtime -- cgit v1.2.3 From 824d227324dcd328857b70e37b41780f02225729 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 1 Dec 2025 17:50:49 -0800 Subject: KVM: selftests: Add a CPUID testcase for KVM_SET_CPUID2 with runtime updates Add a CPUID testcase to verify that KVM allows KVM_SET_CPUID2 after (or in conjunction with) runtime updates. This is a regression test for the bug introduced by commit 93da6af3ae56 ("KVM: x86: Defer runtime updates of dynamic CPUID bits until CPUID emulation"), where KVM would incorrectly reject KVM_SET_CPUID due to a not handling a pending runtime update on the current CPUID, resulting in a false mismatch between the "old" and "new" CPUID entries. Link: https://lore.kernel.org/all/20251128123202.68424a95@imammedo Link: https://patch.msgid.link/20251202015049.1167490-3-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/cpuid_test.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/testing/selftests/kvm/x86/cpuid_test.c b/tools/testing/selftests/kvm/x86/cpuid_test.c index 7b3fda6842bc..f9ed14996977 100644 --- a/tools/testing/selftests/kvm/x86/cpuid_test.c +++ b/tools/testing/selftests/kvm/x86/cpuid_test.c @@ -155,6 +155,7 @@ struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct static void set_cpuid_after_run(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *ent; + struct kvm_sregs sregs; int rc; u32 eax, ebx, x; @@ -162,6 +163,20 @@ static void set_cpuid_after_run(struct kvm_vcpu *vcpu) rc = __vcpu_set_cpuid(vcpu); TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc); + /* + * Toggle CR4 bits that affect dynamic CPUID feature flags to verify + * setting unmodified CPUID succeeds with runtime CPUID updates. + */ + vcpu_sregs_get(vcpu, &sregs); + if (kvm_cpu_has(X86_FEATURE_XSAVE)) + sregs.cr4 ^= X86_CR4_OSXSAVE; + if (kvm_cpu_has(X86_FEATURE_PKU)) + sregs.cr4 ^= X86_CR4_PKE; + vcpu_sregs_set(vcpu, &sregs); + + rc = __vcpu_set_cpuid(vcpu); + TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc); + /* Changing CPU features is forbidden */ ent = vcpu_get_cpuid_entry(vcpu, 0x7); ebx = ent->ebx; -- cgit v1.2.3 From 9935df5333aa503a18de5071f53762b65c783c4c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 1 Dec 2025 18:03:33 -0800 Subject: KVM: Disallow toggling KVM_MEM_GUEST_MEMFD on an existing memslot Reject attempts to disable KVM_MEM_GUEST_MEMFD on a memslot that was initially created with a guest_memfd binding, as KVM doesn't support toggling KVM_MEM_GUEST_MEMFD on existing memslots. KVM prevents enabling KVM_MEM_GUEST_MEMFD, but doesn't prevent clearing the flag. Failure to reject the new memslot results in a use-after-free due to KVM not unbinding from the guest_memfd instance. Unbinding on a FLAGS_ONLY change is easy enough, and can/will be done as a hardening measure (in anticipation of KVM supporting dirty logging on guest_memfd at some point), but fixing the use-after-free would only address the immediate symptom. ================================================================== BUG: KASAN: slab-use-after-free in kvm_gmem_release+0x362/0x400 [kvm] Write of size 8 at addr ffff8881111ae908 by task repro/745 CPU: 7 UID: 1000 PID: 745 Comm: repro Not tainted 6.18.0-rc6-115d5de2eef3-next-kasan #3 NONE Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 Call Trace: dump_stack_lvl+0x51/0x60 print_report+0xcb/0x5c0 kasan_report+0xb4/0xe0 kvm_gmem_release+0x362/0x400 [kvm] __fput+0x2fa/0x9d0 task_work_run+0x12c/0x200 do_exit+0x6ae/0x2100 do_group_exit+0xa8/0x230 __x64_sys_exit_group+0x3a/0x50 x64_sys_call+0x737/0x740 do_syscall_64+0x5b/0x900 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7f581f2eac31 Allocated by task 745 on cpu 6 at 9.746971s: kasan_save_stack+0x20/0x40 kasan_save_track+0x13/0x50 __kasan_kmalloc+0x77/0x90 kvm_set_memory_region.part.0+0x652/0x1110 [kvm] kvm_vm_ioctl+0x14b0/0x3290 [kvm] __x64_sys_ioctl+0x129/0x1a0 do_syscall_64+0x5b/0x900 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Freed by task 745 on cpu 6 at 9.747467s: kasan_save_stack+0x20/0x40 kasan_save_track+0x13/0x50 __kasan_save_free_info+0x37/0x50 __kasan_slab_free+0x3b/0x60 kfree+0xf5/0x440 kvm_set_memslot+0x3c2/0x1160 [kvm] kvm_set_memory_region.part.0+0x86a/0x1110 [kvm] kvm_vm_ioctl+0x14b0/0x3290 [kvm] __x64_sys_ioctl+0x129/0x1a0 do_syscall_64+0x5b/0x900 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Reported-by: Alexander Potapenko Fixes: a7800aa80ea4 ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20251202020334.1171351-2-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cde1eddbaa91..7fea6ba91c1e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2085,7 +2085,7 @@ static int kvm_set_memory_region(struct kvm *kvm, return -EINVAL; if ((mem->userspace_addr != old->userspace_addr) || (npages != old->npages) || - ((mem->flags ^ old->flags) & KVM_MEM_READONLY)) + ((mem->flags ^ old->flags) & (KVM_MEM_READONLY | KVM_MEM_GUEST_MEMFD))) return -EINVAL; if (base_gfn != old->base_gfn) -- cgit v1.2.3 From af62fe2494da84eb01752282c8228c9bb3fe9f67 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 1 Dec 2025 18:03:34 -0800 Subject: KVM: Harden and prepare for modifying existing guest_memfd memslots Unbind guest_memfd memslots if KVM commits a MOVE or FLAGS_ONLY memslot change to harden against use-after-free, and to prepare for eventually supporting dirty logging on guest_memfd memslots, at which point FLAGS_ONLY changes will be expected/supported. Add two separate WARNs, once to yell if a guest_memfd memslot is moved (which KVM is never expected to allow/support), and again if the unbind() is triggered, to help detect uAPI goofs prior to deliberately allowing FLAGS_ONLY changes. Link: https://patch.msgid.link/20251202020334.1171351-3-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/kvm_main.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7fea6ba91c1e..32b6c6209b63 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1748,6 +1748,12 @@ static void kvm_commit_memory_region(struct kvm *kvm, kvm_free_memslot(kvm, old); break; case KVM_MR_MOVE: + /* + * Moving a guest_memfd memslot isn't supported, and will never + * be supported. + */ + WARN_ON_ONCE(old->flags & KVM_MEM_GUEST_MEMFD); + fallthrough; case KVM_MR_FLAGS_ONLY: /* * Free the dirty bitmap as needed; the below check encompasses @@ -1756,6 +1762,15 @@ static void kvm_commit_memory_region(struct kvm *kvm, if (old->dirty_bitmap && !new->dirty_bitmap) kvm_destroy_dirty_bitmap(old); + /* + * Unbind the guest_memfd instance as needed; the @new slot has + * already created its own binding. TODO: Drop the WARN when + * dirty logging guest_memfd memslots is supported. Until then, + * flags-only changes on guest_memfd slots should be impossible. + */ + if (WARN_ON_ONCE(old->flags & KVM_MEM_GUEST_MEMFD)) + kvm_gmem_unbind(old); + /* * The final quirk. Free the detached, old slot, but only its * memory, not any metadata. Metadata, including arch specific -- cgit v1.2.3 From da01f64e7470988f8607776aa7afa924208863fb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Nov 2025 14:56:13 -0800 Subject: KVM: nSVM: Clear exit_code_hi in VMCB when synthesizing nested VM-Exits Explicitly clear exit_code_hi in the VMCB when synthesizing "normal" nested VM-Exits, as the full exit code is a 64-bit value (spoiler alert), and all exit codes for non-failing VMRUN use only bits 31:0. Cc: Jim Mattson Cc: Yosry Ahmed Cc: stable@vger.kernel.org Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251113225621.1688428-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 2 ++ arch/x86/kvm/svm/svm.h | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9d29b2e7e855..eeeb4ae4c617 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2435,6 +2435,7 @@ static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu, if (cr0 ^ val) { svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; + svm->vmcb->control.exit_code_hi = 0; ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); } @@ -4611,6 +4612,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, if (static_cpu_has(X86_FEATURE_NRIPS)) vmcb->control.next_rip = info->next_rip; vmcb->control.exit_code = icpt_info.exit_code; + vmcb->control.exit_code_hi = 0; vmexit = nested_svm_exit_handled(svm); ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index dd78e6402345..e66a16e59b1a 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -764,9 +764,10 @@ int nested_svm_vmexit(struct vcpu_svm *svm); static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code) { - svm->vmcb->control.exit_code = exit_code; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; + svm->vmcb->control.exit_code = exit_code; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; return nested_svm_vmexit(svm); } -- cgit v1.2.3 From f402ecd7a8b6446547076f4bd24bd5d4dcc94481 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Nov 2025 14:56:14 -0800 Subject: KVM: nSVM: Set exit_code_hi to -1 when synthesizing SVM_EXIT_ERR (failed VMRUN) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set exit_code_hi to -1u as a temporary band-aid to fix a long-standing (effectively since KVM's inception) bug where KVM treats the exit code as a 32-bit value, when in reality it's a 64-bit value. Per the APM, offset 0x70 is a single 64-bit value: 070h 63:0 EXITCODE And a sane reading of the error values defined in "Table C-1. SVM Intercept Codes" is that negative values use the full 64 bits: –1 VMEXIT_INVALID Invalid guest state in VMCB. –2 VMEXIT_BUSYBUSY bit was set in the VMSA –3 VMEXIT_IDLE_REQUIREDThe sibling thread is not in an idle state -4 VMEXIT_INVALID_PMC Invalid PMC state And that interpretation is confirmed by testing on Milan and Turin (by setting bits in CR0[63:32] to generate VMEXIT_INVALID on VMRUN). Furthermore, Xen has treated exitcode as a 64-bit value since HVM support was adding in 2006 (see Xen commit d1bd157fbc ("Big merge the HVM full-virtualisation abstractions.")). Cc: Jim Mattson Cc: Yosry Ahmed Cc: stable@vger.kernel.org Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251113225621.1688428-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index da6e80b3ac35..143a0ef02b03 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -983,7 +983,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) if (!nested_vmcb_check_save(vcpu) || !nested_vmcb_check_controls(vcpu)) { vmcb12->control.exit_code = SVM_EXIT_ERR; - vmcb12->control.exit_code_hi = 0; + vmcb12->control.exit_code_hi = -1u; vmcb12->control.exit_info_1 = 0; vmcb12->control.exit_info_2 = 0; goto out; @@ -1016,7 +1016,7 @@ out_exit_err: svm->soft_int_injected = false; svm->vmcb->control.exit_code = SVM_EXIT_ERR; - svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_code_hi = -1u; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; -- cgit v1.2.3 From b2849bec936be642b5420801f902337f2507648e Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Fri, 5 Dec 2025 15:19:04 -0800 Subject: KVM: VMX: Update SVI during runtime APICv activation The APICv (apic->apicv_active) can be activated or deactivated at runtime, for instance, because of APICv inhibit reasons. Intel VMX employs different mechanisms to virtualize LAPIC based on whether APICv is active. When APICv is activated at runtime, GUEST_INTR_STATUS is used to configure and report the current pending IRR and ISR states. Unless a specific vector is explicitly included in EOI_EXIT_BITMAP, its EOI will not be trapped to KVM. Intel VMX automatically clears the corresponding ISR bit based on the GUEST_INTR_STATUS.SVI field. When APICv is deactivated at runtime, the VM_ENTRY_INTR_INFO_FIELD is used to specify the next interrupt vector to invoke upon VM-entry. The VMX IDT_VECTORING_INFO_FIELD is used to report un-invoked vectors on VM-exit. EOIs are always trapped to KVM, so the software can manually clear pending ISR bits. There are scenarios where, with APICv activated at runtime, a guest-issued EOI may not be able to clear the pending ISR bit. Taking vector 236 as an example, here is one scenario. 1. Suppose APICv is inactive. Vector 236 is pending in the IRR. 2. To handle KVM_REQ_EVENT, KVM moves vector 236 from the IRR to the ISR, and configures the VM_ENTRY_INTR_INFO_FIELD via vmx_inject_irq(). 3. After VM-entry, vector 236 is invoked through the guest IDT. At this point, the data in VM_ENTRY_INTR_INFO_FIELD is no longer valid. The guest interrupt handler for vector 236 is invoked. 4. Suppose a VM exit occurs very early in the guest interrupt handler, before the EOI is issued. 5. Nothing is reported through the IDT_VECTORING_INFO_FIELD because vector 236 has already been invoked in the guest. 6. Now, suppose APICv is activated. Before the next VM-entry, KVM calls kvm_vcpu_update_apicv() to activate APICv. 7. Unfortunately, GUEST_INTR_STATUS.SVI is not configured, although vector 236 is still pending in the ISR. 8. After VM-entry, the guest finally issues the EOI for vector 236. However, because SVI is not configured, vector 236 is not cleared. 9. ISR is stalled forever on vector 236. Here is another scenario. 1. Suppose APICv is inactive. Vector 236 is pending in the IRR. 2. To handle KVM_REQ_EVENT, KVM moves vector 236 from the IRR to the ISR, and configures the VM_ENTRY_INTR_INFO_FIELD via vmx_inject_irq(). 3. VM-exit occurs immediately after the next VM-entry. The vector 236 is not invoked through the guest IDT. Instead, it is saved to the IDT_VECTORING_INFO_FIELD during the VM-exit. 4. KVM calls kvm_queue_interrupt() to re-queue the un-invoked vector 236 into vcpu->arch.interrupt. A KVM_REQ_EVENT is requested. 5. Now, suppose APICv is activated. Before the next VM-entry, KVM calls kvm_vcpu_update_apicv() to activate APICv. 6. Although APICv is now active, KVM still uses the legacy VM_ENTRY_INTR_INFO_FIELD to re-inject vector 236. GUEST_INTR_STATUS.SVI is not configured. 7. After the next VM-entry, vector 236 is invoked through the guest IDT. Finally, an EOI occurs. However, due to the lack of GUEST_INTR_STATUS.SVI configuration, vector 236 is not cleared from the ISR. 8. ISR is stalled forever on vector 236. Using QEMU as an example, vector 236 is stuck in ISR forever. (qemu) info lapic 1 dumping local APIC state for CPU 1 LVT0 0x00010700 active-hi edge masked ExtINT (vec 0) LVT1 0x00010400 active-hi edge masked NMI LVTPC 0x00000400 active-hi edge NMI LVTERR 0x000000fe active-hi edge Fixed (vec 254) LVTTHMR 0x00010000 active-hi edge masked Fixed (vec 0) LVTT 0x000400ec active-hi edge tsc-deadline Fixed (vec 236) Timer DCR=0x0 (divide by 2) initial_count = 0 current_count = 0 SPIV 0x000001ff APIC enabled, focus=off, spurious vec 255 ICR 0x000000fd physical edge de-assert no-shorthand ICR2 0x00000000 cpu 0 (X2APIC ID) ESR 0x00000000 ISR 236 IRR 37(level) 236 The issue isn't applicable to AMD SVM as KVM simply writes vmcb01 directly irrespective of whether L1 (vmcs01) or L2 (vmcb02) is active (unlike VMX, there is no need/cost to switch between VMCBs). In addition, APICV_INHIBIT_REASON_IRQWIN ensures AMD SVM AVIC is not activated until the last interrupt is EOI'd. Fix the bug by configuring Intel VMX GUEST_INTR_STATUS.SVI if APICv is activated at runtime. Signed-off-by: Dongli Zhang Reviewed-by: Chao Gao Link: https://patch.msgid.link/20251110063212.34902-1-dongli.zhang@oracle.com [sean: call out that SVM writes vmcb01 directly, tweak comment] Link: https://patch.msgid.link/20251205231913.441872-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 9 --------- arch/x86/kvm/x86.c | 7 +++++++ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 91b6f2f3edc2..c3b9eb72b6f3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6886,15 +6886,6 @@ void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) * VM-Exit, otherwise L1 with run with a stale SVI. */ if (is_guest_mode(vcpu)) { - /* - * KVM is supposed to forward intercepted L2 EOIs to L1 if VID - * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC. - * Note, userspace can stuff state while L2 is active; assert - * that VID is disabled if and only if the vCPU is in KVM_RUN - * to avoid false positives if userspace is setting APIC state. - */ - WARN_ON_ONCE(vcpu->wants_to_run && - nested_cpu_has_vid(get_vmcs12(vcpu))); to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true; return; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c9c2aa6f4705..82036205945f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10877,9 +10877,16 @@ void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was * still active when the interrupt got accepted. Make sure * kvm_check_and_inject_events() is called to check for that. + * + * Update SVI when APICv gets enabled, otherwise SVI won't reflect the + * highest bit in vISR and the next accelerated EOI in the guest won't + * be virtualized correctly (the CPU uses SVI to determine which vISR + * vector to clear). */ if (!apic->apicv_active) kvm_make_request(KVM_REQ_EVENT, vcpu); + else + kvm_apic_update_hwapic_isr(vcpu); out: preempt_enable(); -- cgit v1.2.3 From 29763138830916f46daaa50e83e7f4f907a3236b Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Fri, 5 Dec 2025 15:19:05 -0800 Subject: KVM: nVMX: Immediately refresh APICv controls as needed on nested VM-Exit If an APICv status updated was pended while L2 was active, immediately refresh vmcs01's controls instead of pending KVM_REQ_APICV_UPDATE as kvm_vcpu_update_apicv() only calls into vendor code if a change is necessary. E.g. if APICv is inhibited, and then activated while L2 is running: kvm_vcpu_update_apicv() | -> __kvm_vcpu_update_apicv() | -> apic->apicv_active = true | -> vmx_refresh_apicv_exec_ctrl() | -> vmx->nested.update_vmcs01_apicv_status = true | -> return Then L2 exits to L1: __nested_vmx_vmexit() | -> kvm_make_request(KVM_REQ_APICV_UPDATE) vcpu_enter_guest(): KVM_REQ_APICV_UPDATE -> kvm_vcpu_update_apicv() | -> __kvm_vcpu_update_apicv() | -> return // because if (apic->apicv_active == activate) Reported-by: Chao Gao Closes: https://lore.kernel.org/all/aQ2jmnN8wUYVEawF@intel.com Fixes: 7c69661e225c ("KVM: nVMX: Defer APICv updates while L2 is active until L1 is active") Cc: stable@vger.kernel.org Signed-off-by: Dongli Zhang [sean: write changelog] Link: https://patch.msgid.link/20251205231913.441872-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bcea087b642f..1725c6a94f99 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -19,6 +19,7 @@ #include "trace.h" #include "vmx.h" #include "smm.h" +#include "x86_ops.h" static bool __read_mostly enable_shadow_vmcs = 1; module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); @@ -5216,7 +5217,7 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, if (vmx->nested.update_vmcs01_apicv_status) { vmx->nested.update_vmcs01_apicv_status = false; - kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); + vmx_refresh_apicv_exec_ctrl(vcpu); } if (vmx->nested.update_vmcs01_hwapic_isr) { -- cgit v1.2.3