aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
authorMark Brown <broonie@kernel.org>2026-05-29 22:46:54 +0100
committerMark Brown <broonie@kernel.org>2026-05-29 22:46:54 +0100
commitf624c17e475e17a1dbeef56ef624378f4e5a80b7 (patch)
treeaeae91c403e0713b4a67f8610040905e8c0786fd /arch
parent29fa9b4922922bd33cac81162dbf669bfb55a52d (diff)
parentd1568b1332b6b3b36b222c2868fc102727c12a34 (diff)
downloadlinux-next-history-f624c17e475e17a1dbeef56ef624378f4e5a80b7.tar.gz
Merge branch 'next' of https://github.com/kvm-x86/linux.git
# Conflicts: # arch/x86/include/asm/tdx.h
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h4
-rw-r--r--arch/x86/include/asm/kvm-x86-pmu-ops.h5
-rw-r--r--arch/x86/include/asm/kvm_host.h44
-rw-r--r--arch/x86/include/asm/msr-index.h1
-rw-r--r--arch/x86/include/asm/perf_event.h2
-rw-r--r--arch/x86/include/asm/tdx.h34
-rw-r--r--arch/x86/include/uapi/asm/kvm.h2
-rw-r--r--arch/x86/kvm/cpuid.c7
-rw-r--r--arch/x86/kvm/cpuid.h14
-rw-r--r--arch/x86/kvm/emulate.c20
-rw-r--r--arch/x86/kvm/kvm_emulate.h17
-rw-r--r--arch/x86/kvm/lapic.c21
-rw-r--r--arch/x86/kvm/lapic.h2
-rw-r--r--arch/x86/kvm/mmu/mmu.c2
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h36
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c275
-rw-r--r--arch/x86/kvm/pmu.c21
-rw-r--r--arch/x86/kvm/pmu.h44
-rw-r--r--arch/x86/kvm/svm/avic.c82
-rw-r--r--arch/x86/kvm/svm/nested.c145
-rw-r--r--arch/x86/kvm/svm/pmu.c42
-rw-r--r--arch/x86/kvm/svm/sev.c473
-rw-r--r--arch/x86/kvm/svm/svm.c55
-rw-r--r--arch/x86/kvm/svm/svm.h44
-rw-r--r--arch/x86/kvm/vmx/nested.c31
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c8
-rw-r--r--arch/x86/kvm/vmx/tdx.c282
-rw-r--r--arch/x86/kvm/vmx/vmx.c41
-rw-r--r--arch/x86/kvm/x86.c230
-rw-r--r--arch/x86/kvm/x86.h9
-rw-r--r--arch/x86/virt/hw.c15
-rw-r--r--arch/x86/virt/vmx/tdx/tdx.c64
32 files changed, 1211 insertions, 861 deletions
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index e4fca997ec797..83dc5086138b3 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -96,10 +96,8 @@ KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr)
KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
KVM_X86_OP_OPTIONAL_RET0(tdp_has_smep)
KVM_X86_OP(load_mmu_pgd)
-KVM_X86_OP_OPTIONAL(link_external_spt)
-KVM_X86_OP_OPTIONAL(set_external_spte)
+KVM_X86_OP_OPTIONAL_RET0(set_external_spte)
KVM_X86_OP_OPTIONAL(free_external_spt)
-KVM_X86_OP_OPTIONAL(remove_external_spte)
KVM_X86_OP(has_wbinvd_exit)
KVM_X86_OP(get_l2_tsc_offset)
KVM_X86_OP(get_l2_tsc_multiplier)
diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h
index d5452b3433b7d..4a223c2793e3f 100644
--- a/arch/x86/include/asm/kvm-x86-pmu-ops.h
+++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#if !defined(KVM_X86_PMU_OP) || \
- !defined(KVM_X86_PMU_OP_OPTIONAL)
+ !defined(KVM_X86_PMU_OP_OPTIONAL) || \
+ !defined(KVM_X86_PMU_OP_OPTIONAL_RET0)
#error Missing one or more KVM_X86_PMU_OP #defines
#else
@@ -23,6 +24,7 @@ KVM_X86_PMU_OP(init)
KVM_X86_PMU_OP_OPTIONAL(reset)
KVM_X86_PMU_OP_OPTIONAL(deliver_pmi)
KVM_X86_PMU_OP_OPTIONAL(cleanup)
+KVM_X86_PMU_OP_OPTIONAL_RET0(pmc_is_disabled_in_current_mode)
KVM_X86_PMU_OP_OPTIONAL(write_global_ctrl)
KVM_X86_PMU_OP(mediated_load)
@@ -31,3 +33,4 @@ KVM_X86_PMU_OP(mediated_put)
#undef KVM_X86_PMU_OP
#undef KVM_X86_PMU_OP_OPTIONAL
+#undef KVM_X86_PMU_OP_OPTIONAL_RET0
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8a53ca6195701..6ae7d539af909 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -284,6 +284,8 @@ enum x86_intercept_stage;
#define PFERR_GUEST_RMP_MASK BIT_ULL(31)
#define PFERR_GUEST_FINAL_MASK BIT_ULL(32)
#define PFERR_GUEST_PAGE_MASK BIT_ULL(33)
+#define PFERR_GUEST_FAULT_STAGE_MASK \
+ (PFERR_GUEST_FINAL_MASK | PFERR_GUEST_PAGE_MASK)
#define PFERR_GUEST_ENC_MASK BIT_ULL(34)
#define PFERR_GUEST_SIZEM_MASK BIT_ULL(35)
#define PFERR_GUEST_VMPL_MASK BIT_ULL(36)
@@ -484,7 +486,8 @@ struct kvm_mmu {
u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void (*inject_page_fault)(struct kvm_vcpu *vcpu,
- struct x86_exception *fault);
+ struct x86_exception *fault,
+ bool from_hardware);
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gpa_t gva_or_gpa, u64 access,
struct x86_exception *exception);
@@ -612,6 +615,8 @@ struct kvm_pmu {
DECLARE_BITMAP(pmc_counting_instructions, X86_PMC_IDX_MAX);
DECLARE_BITMAP(pmc_counting_branches, X86_PMC_IDX_MAX);
+ DECLARE_BITMAP(pmc_has_mode_specific_enables, X86_PMC_IDX_MAX);
+
u64 ds_area;
u64 pebs_enable;
u64 pebs_enable_rsvd;
@@ -1057,8 +1062,6 @@ struct kvm_vcpu_arch {
u16 vec;
u32 id;
u32 host_apf_flags;
- bool send_always;
- bool delivery_as_pf_vmexit;
bool pageready_pending;
} apf;
@@ -1441,6 +1444,7 @@ struct kvm_arch {
bool has_private_mem;
bool has_protected_state;
bool has_protected_eoi;
+ bool has_protected_pmu;
bool pre_fault_allowed;
struct hlist_head *mmu_page_hash;
struct list_head active_mmu_pages;
@@ -1523,6 +1527,7 @@ struct kvm_arch {
bool use_master_clock;
u64 master_kernel_ns;
u64 master_cycle_now;
+ struct ratelimit_state kvmclock_update_rs;
#ifdef CONFIG_KVM_HYPERV
struct kvm_hv hyperv;
@@ -1911,20 +1916,13 @@ struct kvm_x86_ops {
void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
int root_level);
- /* Update external mapping with page table link. */
- int (*link_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
- void *external_spt);
/* Update the external page table from spte getting set. */
- int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
- u64 mirror_spte);
+ int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, u64 old_spte,
+ u64 new_spte, enum pg_level level);
/* Update external page tables for page table about to be freed. */
- int (*free_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
- void *external_spt);
+ void (*free_external_spt)(struct kvm *kvm, struct kvm_mmu_page *sp);
- /* Update external page table from spte getting removed, and flush TLB. */
- void (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
- u64 mirror_spte);
bool (*has_wbinvd_exit)(void);
@@ -2065,7 +2063,6 @@ extern bool __read_mostly enable_device_posted_irqs;
extern struct kvm_x86_ops kvm_x86_ops;
#define kvm_x86_call(func) static_call(kvm_x86_##func)
-#define kvm_pmu_call(func) static_call(kvm_x86_pmu_##func)
#define KVM_X86_OP(func) \
DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func));
@@ -2306,10 +2303,18 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload);
void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr,
bool has_error_code, u32 error_code);
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
-void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
- struct x86_exception *fault);
-bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault,
+ bool from_hardware);
+void __kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault,
+ bool from_hardware);
+
+static inline void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ __kvm_inject_emulated_page_fault(vcpu, fault, false);
+}
+
bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr);
static inline int __kvm_irq_line_state(unsigned long *irq_state,
@@ -2550,7 +2555,8 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
KVM_X86_QUIRK_SLOT_ZAP_ALL | \
KVM_X86_QUIRK_STUFF_FEATURE_MSRS | \
KVM_X86_QUIRK_IGNORE_GUEST_PAT | \
- KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM)
+ KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM | \
+ KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT)
#define KVM_X86_CONDITIONAL_QUIRKS \
(KVM_X86_QUIRK_CD_NW_CLEARED | \
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 86554de9a3f52..18c4be75e9271 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -899,6 +899,7 @@
#define MSR_K7_HWCR_IRPERF_EN_BIT 30
#define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT)
#define MSR_K7_HWCR_CPUID_USER_DIS_BIT 35
+#define MSR_K7_HWCR_CPUID_USER_DIS BIT_ULL(MSR_K7_HWCR_CPUID_USER_DIS_BIT)
#define MSR_K7_FID_VID_CTL 0xc0010041
#define MSR_K7_FID_VID_STATUS 0xc0010042
#define MSR_K7_HWCR_CPB_DIS_BIT 25
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 752cb319d5eab..1eb13673e889f 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -60,6 +60,8 @@
#define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36)
#define AMD64_EVENTSEL_GUESTONLY (1ULL << 40)
#define AMD64_EVENTSEL_HOSTONLY (1ULL << 41)
+#define AMD64_EVENTSEL_HOST_GUEST_MASK \
+ (AMD64_EVENTSEL_HOSTONLY | AMD64_EVENTSEL_GUESTONLY)
#define AMD64_EVENTSEL_INT_CORE_SEL_SHIFT 37
#define AMD64_EVENTSEL_INT_CORE_SEL_MASK \
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index e5a9cf656c072..89e97d5761d89 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -6,6 +6,7 @@
#include <linux/init.h>
#include <linux/bits.h>
#include <linux/mmzone.h>
+#include <linux/kvm_types.h>
#include <asm/errno.h>
#include <asm/ptrace.h>
@@ -121,7 +122,7 @@ int tdx_guest_keyid_alloc(void);
u32 tdx_get_nr_guest_keyids(void);
void tdx_guest_keyid_free(unsigned int keyid);
-void tdx_quirk_reset_page(struct page *page);
+void tdx_quirk_reset_paddr(unsigned long base, unsigned long size);
struct tdx_td {
/* TD root structure: */
@@ -145,32 +146,17 @@ struct tdx_vp {
struct page **tdcx_pages;
};
-static inline u64 mk_keyed_paddr(u16 hkid, struct page *page)
-{
- u64 ret;
-
- ret = page_to_phys(page);
- /* KeyID bits are just above the physical address bits: */
- ret |= (u64)hkid << boot_cpu_data.x86_phys_bits;
-
- return ret;
-}
-
-static inline int pg_level_to_tdx_sept_level(enum pg_level level)
-{
- WARN_ON_ONCE(level == PG_LEVEL_NONE);
- return level - 1;
-}
-
void tdx_sys_disable(void);
u64 tdh_vp_enter(struct tdx_vp *vp, struct tdx_module_args *args);
u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page);
-u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2);
-u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, kvm_pfn_t pfn, struct page *source,
+ u64 *ext_err1, u64 *ext_err2);
+u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, enum pg_level level, struct page *page, u64 *ext_err1, u64 *ext_err2);
u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page);
-u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2);
-u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, enum pg_level level, kvm_pfn_t pfn,
+ u64 *ext_err1, u64 *ext_err2);
+u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, enum pg_level level, u64 *ext_err1, u64 *ext_err2);
u64 tdh_mng_key_config(struct tdx_td *td);
u64 tdh_mng_create(struct tdx_td *td, u16 hkid);
u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp);
@@ -186,10 +172,10 @@ u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data);
u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask);
u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size);
u64 tdh_mem_track(struct tdx_td *tdr);
-u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, enum pg_level level, u64 *ext_err1, u64 *ext_err2);
u64 tdh_phymem_cache_wb(bool resume);
u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td);
-u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page);
+u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, kvm_pfn_t pfn);
#else
static inline void tdx_init(void) { }
static inline u32 tdx_get_nr_guest_keyids(void) { return 0; }
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 5f2b30d0405c8..1585ec8040666 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -477,6 +477,7 @@ struct kvm_sync_regs {
#define KVM_X86_QUIRK_STUFF_FEATURE_MSRS (1 << 8)
#define KVM_X86_QUIRK_IGNORE_GUEST_PAT (1 << 9)
#define KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM (1 << 10)
+#define KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT (1 << 11)
#define KVM_STATE_NESTED_FORMAT_VMX 0
#define KVM_STATE_NESTED_FORMAT_SVM 1
@@ -532,6 +533,7 @@ struct kvm_svm_nested_state_data {
struct kvm_svm_nested_state_hdr {
__u64 vmcb_pa;
+ __u64 gpat;
};
/* for KVM_CAP_NESTED_STATE */
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index e69156b54cfff..8e5340dd26211 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -1248,7 +1248,7 @@ void kvm_initialize_cpu_caps(void)
F(AUTOIBRS),
EMULATED_F(NO_SMM_CTL_MSR),
/* PrefetchCtlMsr */
- /* GpOnUserCpuid */
+ EMULATED_F(GP_ON_USER_CPUID),
/* EPSF */
F(PREFETCHI),
F(AVX512_BMM),
@@ -2161,9 +2161,10 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
u32 eax, ebx, ecx, edx;
- if (!is_smm(vcpu) && cpuid_fault_enabled(vcpu) &&
- !kvm_require_cpl(vcpu, 0))
+ if (!kvm_is_cpuid_allowed(vcpu)) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
return 1;
+ }
eax = kvm_rax_read(vcpu);
ecx = kvm_rcx_read(vcpu);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 039b8e6f40baf..fc96ba86c644d 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -7,6 +7,8 @@
#include <asm/processor.h>
#include <uapi/asm/kvm_para.h>
+#include "smm.h"
+
extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
extern bool kvm_is_configuring_cpu_caps __read_mostly;
@@ -181,15 +183,17 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu)
return x86_stepping(best->eax);
}
-static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
+static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT;
+ return (vcpu->arch.msr_misc_features_enables &
+ MSR_MISC_FEATURES_ENABLES_CPUID_FAULT) ||
+ (vcpu->arch.msr_hwcr & MSR_K7_HWCR_CPUID_USER_DIS);
}
-static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu)
+static inline bool kvm_is_cpuid_allowed(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.msr_misc_features_enables &
- MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
+ return !cpuid_fault_enabled(vcpu) || is_smm(vcpu) ||
+ !kvm_x86_call(get_cpl)(vcpu);
}
static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8013dccb31102..585a8ceab220d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -540,8 +540,9 @@ static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
return X86EMUL_PROPAGATE_FAULT;
}
-static int emulate_db(struct x86_emulate_ctxt *ctxt)
+static int emulate_db(struct x86_emulate_ctxt *ctxt, unsigned long dr6)
{
+ ctxt->exception.dr6 = dr6;
return emulate_exception(ctxt, DB_VECTOR, 0, false);
}
@@ -3593,12 +3594,8 @@ static int em_sti(struct x86_emulate_ctxt *ctxt)
static int em_cpuid(struct x86_emulate_ctxt *ctxt)
{
u32 eax, ebx, ecx, edx;
- u64 msr = 0;
- ctxt->ops->get_msr(ctxt, MSR_MISC_FEATURES_ENABLES, &msr);
- if (!ctxt->ops->is_smm(ctxt) &&
- (msr & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT) &&
- ctxt->ops->cpl(ctxt))
+ if (!ctxt->ops->is_cpuid_allowed(ctxt))
return emulate_gp(ctxt, 0);
eax = reg_read(ctxt, VCPU_REGS_RAX);
@@ -3847,15 +3844,8 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt)
if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
return emulate_ud(ctxt);
- if (ctxt->ops->get_dr(ctxt, 7) & DR7_GD) {
- ulong dr6;
-
- dr6 = ctxt->ops->get_dr(ctxt, 6);
- dr6 &= ~DR_TRAP_BITS;
- dr6 |= DR6_BD | DR6_ACTIVE_LOW;
- ctxt->ops->set_dr(ctxt, 6, dr6);
- return emulate_db(ctxt);
- }
+ if (ctxt->ops->get_effective_dr7(ctxt) & DR7_GD)
+ return emulate_db(ctxt, DR6_BD);
return X86EMUL_CONTINUE;
}
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 0abff36d09942..3e375af15c035 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -22,9 +22,13 @@ enum x86_intercept_stage;
struct x86_exception {
u8 vector;
bool error_code_valid;
- u16 error_code;
+ u64 error_code;
bool nested_page_fault;
- u64 address; /* cr2 or nested page fault gpa */
+ union {
+ u64 address; /* cr2 or nested page fault gpa */
+ unsigned long dr6;
+ u64 payload;
+ };
u8 async_page_fault;
unsigned long exit_qualification;
};
@@ -211,6 +215,7 @@ struct x86_emulate_ops {
ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
int (*cpl)(struct x86_emulate_ctxt *ctxt);
+ ulong (*get_effective_dr7)(struct x86_emulate_ctxt *ctxt);
ulong (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr);
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
@@ -225,6 +230,7 @@ struct x86_emulate_ops {
struct x86_instruction_info *info,
enum x86_intercept_stage stage);
+ bool (*is_cpuid_allowed)(struct x86_emulate_ctxt *ctxt);
bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx,
u32 *ecx, u32 *edx, bool exact_only);
bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt);
@@ -520,13 +526,6 @@ enum x86_intercept {
nr_x86_intercepts
};
-/* Host execution mode. */
-#if defined(CONFIG_X86_32)
-#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
-#elif defined(CONFIG_X86_64)
-#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
-#endif
-
int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type);
bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
#define EMULATION_FAILED -1
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4078e624ca667..4e34f75e705da 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1730,7 +1730,7 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
#define APIC_REGS_MASK(first, count) \
(APIC_REG_MASK(first) * ((1ull << (count)) - 1))
-u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic)
+static u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic)
{
/* Leave bits '0' for reserved and write-only registers. */
u64 valid_reg_mask =
@@ -1766,7 +1766,24 @@ u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic)
return valid_reg_mask;
}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_readable_reg_mask);
+
+u64 kvm_x2apic_disable_read_intercept_reg_mask(struct kvm_vcpu *vcpu)
+{
+ if (WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
+ return 0;
+
+ /*
+ * TMMCT, a.k.a. the current APIC timer count, reads aren't accelerated
+ * by hardware (Intel or AMD) as the timer is emulated in software (by
+ * KVM), i.e. reads from the virtual APIC page would return garbage.
+ * Intercept RDMSR, as handling the fault-like APIC-access VM-Exit is
+ * more expensive than handling a RDMSR VM-Exit (the APIC-access exit
+ * requires slow emulation of the code stream).
+ */
+ return kvm_lapic_readable_reg_mask(vcpu->arch.apic) &
+ ~APIC_REG_MASK(APIC_TMCCT);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_x2apic_disable_read_intercept_reg_mask);
static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
void *data)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 274885af4ebc4..f763cd29a5082 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -156,7 +156,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len);
void kvm_lapic_exit(void);
-u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic);
+u64 kvm_x2apic_disable_read_intercept_reg_mask(struct kvm_vcpu *vcpu);
static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic)
{
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index db1b82eae4da7..a01a1faab2eef 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7022,7 +7022,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
write_lock(&kvm->mmu_lock);
- kvm_mmu_invalidate_begin(kvm);
+ kvm_mmu_invalidate_start(kvm);
kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 07100bbfc2701..df3ae0c7ec2c3 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -328,6 +328,12 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
const int write_fault = access & PFERR_WRITE_MASK;
const int user_fault = access & PFERR_USER_MASK;
const int fetch_fault = access & PFERR_FETCH_MASK;
+ /*
+ * Note! Track the error_code that's common to legacy shadow paging
+ * and NPT shadow paging as a u16 to guard against unintentionally
+ * setting any of bits 63:16. Architecturally, the #PF error code is
+ * 32 bits, and Intel CPUs don't support setting bits 31:16.
+ */
u16 errcode = 0;
gpa_t real_gpa;
gfn_t gfn;
@@ -391,16 +397,6 @@ retry_walk:
nested_access | PFERR_GUEST_PAGE_MASK,
&walker->fault, 0);
- /*
- * FIXME: This can happen if emulation (for of an INS/OUTS
- * instruction) triggers a nested page fault. The exit
- * qualification / exit info field will incorrectly have
- * "guest page access" as the nested page fault's cause,
- * instead of "guest page structure access". To fix this,
- * the x86_exception struct should be augmented with enough
- * information to fix the exit_qualification or exit_info_1
- * fields.
- */
if (unlikely(real_gpa == INVALID_GPA))
return 0;
@@ -506,7 +502,8 @@ error:
* [2:0] - Derive from the access bits. The exit_qualification might be
* out of date if it is serving an EPT misconfiguration.
* [5:3] - Calculated by the page walk of the guest EPT page tables
- * [7:11] - Derived from [7:11] of real exit_qualification
+ * [7:8] - Derived from "fault stage" access bits
+ * [9:11] - Derived from [9:11] of real exit_qualification
*
* The other bits are set to 0.
*/
@@ -521,12 +518,22 @@ error:
walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ;
/*
+ * KVM doesn't emulate features that access GPAs directly, e.g.
+ * Intel Processor Trace. Assume the GVA is always valid; when
+ * propagating faults from hardware, KVM will discard this info
+ * and use the EXIT_QUALIFICATION bits from the VMCS.
+ */
+ walker->fault.exit_qualification |= EPT_VIOLATION_GVA_IS_VALID;
+
+ /*
* Accesses to guest paging structures are either "reads" or
* "read+write" accesses, so consider them the latter if write_fault
* is true.
*/
if (access & PFERR_GUEST_PAGE_MASK)
walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ;
+ else
+ walker->fault.exit_qualification |= EPT_VIOLATION_GVA_TRANSLATED;
/*
* Note, pte_access holds the raw RWX bits from the EPTE, not
@@ -542,6 +549,11 @@ error:
walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
walker->fault.async_page_fault = false;
+#if PTTYPE != PTTYPE_EPT
+ if (walker->fault.nested_page_fault)
+ walker->fault.error_code |= access & PFERR_GUEST_FAULT_STAGE_MASK;
+#endif
+
trace_kvm_mmu_walker_error(walker->fault.error_code);
return 0;
}
@@ -807,7 +819,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
*/
if (!r) {
if (!fault->prefetch)
- kvm_inject_emulated_page_fault(vcpu, &walker.fault);
+ __kvm_inject_emulated_page_fault(vcpu, &walker.fault, true);
return RET_PF_RETRY;
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 5a2f8ce9a32b8..5b3041138301b 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -53,13 +53,18 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
rcu_barrier();
}
-static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
+static void __tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
- free_page((unsigned long)sp->external_spt);
free_page((unsigned long)sp->spt);
kmem_cache_free(mmu_page_header_cache, sp);
}
+static void tdp_mmu_free_unused_sp(struct kvm_mmu_page *sp)
+{
+ free_page((unsigned long)sp->external_spt);
+ __tdp_mmu_free_sp(sp);
+}
+
/*
* This is called through call_rcu in order to free TDP page table memory
* safely with respect to other kernel threads that may be operating on
@@ -73,7 +78,8 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
rcu_head);
- tdp_mmu_free_sp(sp);
+ WARN_ON_ONCE(sp->external_spt);
+ __tdp_mmu_free_sp(sp);
}
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
@@ -320,9 +326,9 @@ out_read_unlock:
}
}
-static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level,
- bool shared);
+static void handle_changed_spte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ gfn_t gfn, u64 old_spte, u64 new_spte,
+ int level, bool shared);
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
@@ -359,25 +365,6 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
-static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
- int level)
-{
- /*
- * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
- * PTs are removed in a special order, involving free_external_spt().
- * But remove_external_spte() will be called on non-leaf PTEs via
- * __tdp_mmu_zap_root(), so avoid the error the former would return
- * in this case.
- */
- if (!is_last_spte(old_spte, level))
- return;
-
- /* Zapping leaf spte is allowed only when write lock is held. */
- lockdep_assert_held_write(&kvm->mmu_lock);
-
- kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_spte);
-}
-
/**
* handle_removed_pt() - handle a page table removed from the TDP structure
*
@@ -471,86 +458,19 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
FROZEN_SPTE, level);
}
- handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
- old_spte, FROZEN_SPTE, level, shared);
-
- if (is_mirror_sp(sp)) {
- KVM_BUG_ON(shared, kvm);
- remove_external_spte(kvm, gfn, old_spte, level);
- }
+ handle_changed_spte(kvm, sp, gfn, old_spte, FROZEN_SPTE, level, shared);
}
- if (is_mirror_sp(sp) &&
- WARN_ON(kvm_x86_call(free_external_spt)(kvm, base_gfn, sp->role.level,
- sp->external_spt))) {
- /*
- * Failed to free page table page in mirror page table and
- * there is nothing to do further.
- * Intentionally leak the page to prevent the kernel from
- * accessing the encrypted page.
- */
- sp->external_spt = NULL;
- }
+ if (is_mirror_sp(sp))
+ kvm_x86_call(free_external_spt)(kvm, sp);
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
-static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
-{
- if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
- struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
-
- WARN_ON_ONCE(sp->role.level + 1 != level);
- WARN_ON_ONCE(sp->gfn != gfn);
- return sp->external_spt;
- }
-
- return NULL;
-}
-
-static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
- gfn_t gfn, u64 old_spte,
- u64 new_spte, int level)
-{
- bool was_present = is_shadow_present_pte(old_spte);
- bool is_present = is_shadow_present_pte(new_spte);
- bool is_leaf = is_present && is_last_spte(new_spte, level);
- int ret = 0;
-
- KVM_BUG_ON(was_present, kvm);
-
- lockdep_assert_held(&kvm->mmu_lock);
- /*
- * We need to lock out other updates to the SPTE until the external
- * page table has been modified. Use FROZEN_SPTE similar to
- * the zapping case.
- */
- if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE))
- return -EBUSY;
-
- /*
- * Use different call to either set up middle level
- * external page table, or leaf.
- */
- if (is_leaf) {
- ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte);
- } else {
- void *external_spt = get_external_spt(gfn, new_spte, level);
-
- KVM_BUG_ON(!external_spt, kvm);
- ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt);
- }
- if (ret)
- __kvm_tdp_mmu_write_spte(sptep, old_spte);
- else
- __kvm_tdp_mmu_write_spte(sptep, new_spte);
- return ret;
-}
-
/**
- * handle_changed_spte - handle bookkeeping associated with an SPTE change
+ * __handle_changed_spte - handle bookkeeping associated with an SPTE change
* @kvm: kvm instance
- * @as_id: the address space of the paging structure the SPTE was a part of
+ * @sp: the page table in which the SPTE resides
* @gfn: the base GFN that was mapped by the SPTE
* @old_spte: The value of the SPTE before the change
* @new_spte: The value of the SPTE after the change
@@ -563,15 +483,16 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp
* dirty logging updates are handled in common code, not here (see make_spte()
* and fast_pf_fix_direct_spte()).
*/
-static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level,
- bool shared)
+static int __handle_changed_spte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ gfn_t gfn, u64 old_spte, u64 new_spte,
+ int level, bool shared)
{
bool was_present = is_shadow_present_pte(old_spte);
bool is_present = is_shadow_present_pte(new_spte);
bool was_leaf = was_present && is_last_spte(old_spte, level);
bool is_leaf = is_present && is_last_spte(new_spte, level);
bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
+ int as_id = kvm_mmu_page_as_id(sp);
WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
WARN_ON_ONCE(level < PG_LEVEL_4K);
@@ -601,9 +522,7 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
}
if (old_spte == new_spte)
- return;
-
- trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
+ return 0;
if (is_leaf)
check_spte_writable_invariants(new_spte);
@@ -630,21 +549,45 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
"a temporary frozen SPTE.\n"
"as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
as_id, gfn, old_spte, new_spte, level);
- return;
- }
- if (is_leaf != was_leaf)
- kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
+ trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
+ return 0;
+ }
/*
* Recursively handle child PTs if the change removed a subtree from
* the paging structure. Note the WARN on the PFN changing without the
* SPTE being converted to a hugepage (leaf) or being zapped. Shadow
* pages are kernel allocations and should never be migrated.
+ *
+ * For the mirror page table, propagate all changes to the external SPTE
+ * (except zapping/promotion of non-leaf SPTEs) via the
+ * set_external_spte() op.
*/
if (was_present && !was_leaf &&
- (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
+ (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) {
handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
+ } else if (is_mirror_sp(sp)) {
+ int r;
+
+ r = kvm_x86_call(set_external_spte)(kvm, gfn, old_spte, new_spte, level);
+ if (r)
+ return r;
+ }
+ trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
+
+ if (is_leaf != was_leaf)
+ kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
+
+ return 0;
+}
+
+static void handle_changed_spte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ gfn_t gfn, u64 old_spte, u64 new_spte,
+ int level, bool shared)
+{
+ KVM_BUG_ON(__handle_changed_spte(kvm, sp, gfn, old_spte, new_spte,
+ level, shared), kvm);
}
static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
@@ -659,34 +602,15 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
*/
WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
- if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
- int ret;
-
- /*
- * Users of atomic zapping don't operate on mirror roots,
- * so don't handle it and bug the VM if it's seen.
- */
- if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
- return -EBUSY;
-
- ret = set_external_spte_present(kvm, iter->sptep, iter->gfn,
- iter->old_spte, new_spte, iter->level);
- if (ret)
- return ret;
- } else {
- u64 *sptep = rcu_dereference(iter->sptep);
-
- /*
- * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs
- * and does not hold the mmu_lock. On failure, i.e. if a
- * different logical CPU modified the SPTE, try_cmpxchg64()
- * updates iter->old_spte with the current value, so the caller
- * operates on fresh data, e.g. if it retries
- * tdp_mmu_set_spte_atomic()
- */
- if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
- return -EBUSY;
- }
+ /*
+ * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
+ * does not hold the mmu_lock. On failure, i.e. if a different logical
+ * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
+ * the current value, so the caller operates on fresh data, e.g. if it
+ * retries tdp_mmu_set_spte_atomic().
+ */
+ if (!try_cmpxchg64(rcu_dereference(iter->sptep), &iter->old_spte, new_spte))
+ return -EBUSY;
return 0;
}
@@ -712,24 +636,61 @@ static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter,
u64 new_spte)
{
+ struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
int ret;
lockdep_assert_held_read(&kvm->mmu_lock);
- ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
+ /* Should not set FROZEN_SPTE as a long-term value. */
+ KVM_MMU_WARN_ON(is_frozen_spte(new_spte));
+
+ /*
+ * Temporarily freeze the SPTE until the external PTE operation has
+ * completed, e.g. so that concurrent faults don't attempt to install a
+ * child PTE in the external page table before the parent PTE has been
+ * written.
+ */
+ if (is_mirror_sptep(iter->sptep))
+ ret = __tdp_mmu_set_spte_atomic(kvm, iter, FROZEN_SPTE);
+ else
+ ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
+
if (ret)
return ret;
- handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
- new_spte, iter->level, true);
-
- return 0;
+ /*
+ * Handle the change from iter->old_spte to new_spte.
+ *
+ * Note: for mirror page table, this means the updates of the external
+ * PTE, statistics, or updates of child SPTEs, child external PTEs and
+ * corresponding statistics are performed while the mirror SPTE is in
+ * frozen state (i.e., before the mirror SPTE is set to new_spte).
+ */
+ ret = __handle_changed_spte(kvm, sp, iter->gfn, iter->old_spte,
+ new_spte, iter->level, true);
+ /*
+ * Unfreeze the mirror SPTE. If updating the external SPTE failed,
+ * restore the old value so that the mirror SPTE isn't frozen in
+ * perpetuity, otherwise set the mirror SPTE to the new desired value.
+ */
+ if (is_mirror_sptep(iter->sptep)) {
+ if (ret)
+ __kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte);
+ else
+ __kvm_tdp_mmu_write_spte(iter->sptep, new_spte);
+ } else {
+ /*
+ * Bug the VM if handling the change failed, as failure is only
+ * allowed if KVM couldn't update the external SPTE.
+ */
+ KVM_BUG_ON(ret, kvm);
+ }
+ return ret;
}
/*
* tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
* @kvm: KVM instance
- * @as_id: Address space ID, i.e. regular vs. SMM
* @sptep: Pointer to the SPTE
* @old_spte: The current value of the SPTE
* @new_spte: The new value that will be set for the SPTE
@@ -739,9 +700,11 @@ static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
* Returns the old SPTE value, which _may_ be different than @old_spte if the
* SPTE had volatile bits.
*/
-static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
- u64 old_spte, u64 new_spte, gfn_t gfn, int level)
+static u64 tdp_mmu_set_spte(struct kvm *kvm, tdp_ptep_t sptep, u64 old_spte,
+ u64 new_spte, gfn_t gfn, int level)
{
+ struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(sptep));
+
lockdep_assert_held_write(&kvm->mmu_lock);
/*
@@ -755,16 +718,7 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
- handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
-
- /*
- * Users that do non-atomic setting of PTEs don't operate on mirror
- * roots, so don't handle it and bug the VM if it's seen.
- */
- if (is_mirror_sptep(sptep)) {
- KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
- remove_external_spte(kvm, gfn, old_spte, level);
- }
+ handle_changed_spte(kvm, sp, gfn, old_spte, new_spte, level, false);
return old_spte;
}
@@ -773,9 +727,8 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
u64 new_spte)
{
WARN_ON_ONCE(iter->yielded);
- iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
- iter->old_spte, new_spte,
- iter->gfn, iter->level);
+ iter->old_spte = tdp_mmu_set_spte(kvm, iter->sptep, iter->old_spte,
+ new_spte, iter->gfn, iter->level);
}
#define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \
@@ -1321,7 +1274,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* failed, e.g. because a different task modified the SPTE.
*/
if (r) {
- tdp_mmu_free_sp(sp);
+ tdp_mmu_free_unused_sp(sp);
goto retry;
}
@@ -1377,6 +1330,10 @@ static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
{
u64 new_spte;
+ /* TODO: Add support for aging external SPTEs, if necessary. */
+ if (WARN_ON_ONCE(is_mirror_sptep(iter->sptep)))
+ return;
+
if (spte_ad_enabled(iter->old_spte)) {
iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
shadow_accessed_mask);
@@ -1628,7 +1585,7 @@ retry:
* installs its own sp in place of the last sp we tried to split.
*/
if (sp)
- tdp_mmu_free_sp(sp);
+ tdp_mmu_free_unused_sp(sp);
return 0;
}
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index e218352e34231..b92dd2e583356 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -16,6 +16,7 @@
#include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
+#include <linux/moduleparam.h>
#include <asm/perf_event.h>
#include <asm/cpu_device_id.h>
#include "x86.h"
@@ -33,6 +34,15 @@ static struct x86_pmu_capability __read_mostly kvm_host_pmu;
struct x86_pmu_capability __read_mostly kvm_pmu_cap;
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_cap);
+/* Enable/disable PMU virtualization */
+bool __read_mostly enable_pmu = true;
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_pmu);
+module_param(enable_pmu, bool, 0444);
+
+/* Enable/disable mediated PMU virtualization. */
+bool __read_mostly enable_mediated_pmu;
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_mediated_pmu);
+
struct kvm_pmu_emulated_event_selectors {
u64 INSTRUCTIONS_RETIRED;
u64 BRANCH_INSTRUCTIONS_RETIRED;
@@ -88,7 +98,9 @@ static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;
DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func, \
*(((struct kvm_pmu_ops *)0)->func));
#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
+#define KVM_X86_PMU_OP_OPTIONAL_RET0 KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>
+EXPORT_STATIC_CALL_GPL(kvm_x86_pmu_pmc_is_disabled_in_current_mode);
void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
{
@@ -99,6 +111,9 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
#define KVM_X86_PMU_OP(func) \
WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
+#define KVM_X86_PMU_OP_OPTIONAL_RET0(func) \
+ static_call_update(kvm_x86_pmu_##func, (void *)kvm_pmu_ops.func ? : \
+ (void *)__static_call_return0);
#include <asm/kvm-x86-pmu-ops.h>
#undef __KVM_X86_PMU_OP
}
@@ -522,7 +537,7 @@ static bool pmc_is_event_allowed(struct kvm_pmc *pmc)
static void kvm_mediated_pmu_refresh_event_filter(struct kvm_pmc *pmc)
{
- bool allowed = pmc_is_event_allowed(pmc);
+ bool allowed = pmc_is_locally_enabled(pmc) && pmc_is_event_allowed(pmc);
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
if (pmc_is_gp(pmc)) {
@@ -670,6 +685,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
kvm_for_each_pmc(pmu, pmc, bit, bitmap)
kvm_pmu_recalc_pmc_emulation(pmu, pmc);
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_handle_event);
int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
{
@@ -879,7 +895,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (pmu->global_ctrl != data) {
diff = pmu->global_ctrl ^ data;
pmu->global_ctrl = data;
- reprogram_counters(pmu, diff);
+ kvm_pmu_request_counters_reprogram(pmu, diff);
}
/*
* Unconditionally forward writes to vendor code, i.e. to the
@@ -921,6 +937,7 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
pmu->need_cleanup = false;
bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);
+ bitmap_zero(pmu->pmc_has_mode_specific_enables, X86_PMC_IDX_MAX);
kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) {
pmc_stop_counter(pmc);
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 0925246731cb1..a5821d7c87f93 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -36,6 +36,7 @@ struct kvm_pmu_ops {
void (*reset)(struct kvm_vcpu *vcpu);
void (*deliver_pmi)(struct kvm_vcpu *vcpu);
void (*cleanup)(struct kvm_vcpu *vcpu);
+ bool (*pmc_is_disabled_in_current_mode)(struct kvm_pmc *pmc);
bool (*is_mediated_pmu_supported)(struct x86_pmu_capability *host_pmu);
void (*mediated_load)(struct kvm_vcpu *vcpu);
@@ -53,6 +54,17 @@ struct kvm_pmu_ops {
const u32 MSR_STRIDE;
};
+#define kvm_pmu_call(func) static_call(kvm_x86_pmu_##func)
+
+#define KVM_X86_PMU_OP(func) \
+ DECLARE_STATIC_CALL(kvm_x86_pmu_##func, *(((struct kvm_pmu_ops *)0)->func));
+#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
+#define KVM_X86_PMU_OP_OPTIONAL_RET0 KVM_X86_PMU_OP
+#include <asm/kvm-x86-pmu-ops.h>
+
+extern bool enable_pmu;
+extern bool enable_mediated_pmu;
+
void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops);
void kvm_handle_guest_mediated_pmi(void);
@@ -190,7 +202,13 @@ static inline bool pmc_is_locally_enabled(struct kvm_pmc *pmc)
pmc->idx - KVM_FIXED_PMC_BASE_IDX) &
(INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER);
- return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
+ if (!(pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE))
+ return false;
+
+ if (!test_bit(pmc->idx, pmu->pmc_has_mode_specific_enables))
+ return true;
+
+ return !kvm_pmu_call(pmc_is_disabled_in_current_mode)(pmc);
}
extern struct x86_pmu_capability kvm_pmu_cap;
@@ -198,6 +216,7 @@ extern struct x86_pmu_capability kvm_pmu_cap;
void kvm_init_pmu_capability(struct kvm_pmu_ops *pmu_ops);
void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc);
+void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc)
{
@@ -207,16 +226,24 @@ static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc)
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}
-static inline void reprogram_counters(struct kvm_pmu *pmu, u64 diff)
+static inline void __kvm_pmu_reprogram_counters(struct kvm_pmu *pmu,
+ u64 counters,
+ bool defer)
{
- int bit;
-
- if (!diff)
+ if (!counters)
return;
- for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
- set_bit(bit, pmu->reprogram_pmi);
- kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu));
+ atomic64_or(counters, &pmu->__reprogram_pmi);
+ if (defer)
+ kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu));
+ else
+ kvm_pmu_handle_event(pmu_to_vcpu(pmu));
+}
+
+static inline void kvm_pmu_request_counters_reprogram(struct kvm_pmu *pmu,
+ u64 counters)
+{
+ __kvm_pmu_reprogram_counters(pmu, counters, true);
}
/*
@@ -245,7 +272,6 @@ static inline bool kvm_pmu_is_fastpath_emulation_allowed(struct kvm_vcpu *vcpu)
}
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
-void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx);
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 993b551180fe9..0726f88e679aa 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -122,38 +122,8 @@ static u32 x2avic_max_physical_id;
static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm,
bool intercept)
{
- static const u32 x2avic_passthrough_msrs[] = {
- X2APIC_MSR(APIC_ID),
- X2APIC_MSR(APIC_LVR),
- X2APIC_MSR(APIC_TASKPRI),
- X2APIC_MSR(APIC_ARBPRI),
- X2APIC_MSR(APIC_PROCPRI),
- X2APIC_MSR(APIC_EOI),
- X2APIC_MSR(APIC_RRR),
- X2APIC_MSR(APIC_LDR),
- X2APIC_MSR(APIC_DFR),
- X2APIC_MSR(APIC_SPIV),
- X2APIC_MSR(APIC_ISR),
- X2APIC_MSR(APIC_TMR),
- X2APIC_MSR(APIC_IRR),
- X2APIC_MSR(APIC_ESR),
- X2APIC_MSR(APIC_ICR),
- X2APIC_MSR(APIC_ICR2),
-
- /*
- * Note! Always intercept LVTT, as TSC-deadline timer mode
- * isn't virtualized by hardware, and the CPU will generate a
- * #GP instead of a #VMEXIT.
- */
- X2APIC_MSR(APIC_LVTTHMR),
- X2APIC_MSR(APIC_LVTPC),
- X2APIC_MSR(APIC_LVT0),
- X2APIC_MSR(APIC_LVT1),
- X2APIC_MSR(APIC_LVTERR),
- X2APIC_MSR(APIC_TMICT),
- X2APIC_MSR(APIC_TMCCT),
- X2APIC_MSR(APIC_TDCR),
- };
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u64 rd_regs;
int i;
if (intercept == svm->x2avic_msrs_intercepted)
@@ -162,9 +132,16 @@ static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm,
if (!x2avic_enabled)
return;
- for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++)
- svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i],
- MSR_TYPE_RW, intercept);
+ rd_regs = kvm_x2apic_disable_read_intercept_reg_mask(vcpu);
+
+ for_each_set_bit(i, (unsigned long *)&rd_regs, BITS_PER_TYPE(rd_regs))
+ svm_set_intercept_for_msr(vcpu, APIC_BASE_MSR + i,
+ MSR_TYPE_R, intercept);
+
+ svm_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_W, intercept);
+ svm_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W, intercept);
+ svm_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W, intercept);
+ svm_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_W, intercept);
svm->x2avic_msrs_intercepted = intercept;
}
@@ -207,6 +184,35 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
/*
+ * Flush the TLB when enabling (x2)AVIC and when transitioning between
+ * xAVIC and x2AVIC, as the CPU may have inserted a TLB entry for the
+ * "wrong" mapping.
+ *
+ * KVM uses a per-VM "scratch" page to back the APIC memslot, because
+ * KVM also uses per-VM page tables *and* maintains the page table (NPT
+ * or shadow page) mappings for said memslot even if one or more vCPUs
+ * have their local APIC hardware-disabled or are in x2APIC mode, i.e.
+ * even if one or more vCPUs' APIC MMIO BAR is effectively disabled.
+ *
+ * If xAVIC is fully enabled, hardware ignores the physical address in
+ * KVM's page tables, i.e. in the leaf SPTE for the APIC memslot, and
+ * instead redirects the access to the AVIC backing page, i.e. to the
+ * vCPU's virtual APIC page. If xAVIC is not enabled (APIC is either
+ * hardware-disabled or in x2APIC mode), then guest accesses will use
+ * the page table mapping verbatim, i.e. will access the per-VM scratch
+ * page, as normal memory.
+ *
+ * In both cases, the CPU is allowed to cache TLB entries for the APIC
+ * base GPA. So, KVM needs to flush the TLB when enabling xAVIC, as
+ * accesses need to be redirected to the virtual APIC page, but the TLB
+ * may contain entries pointing at the scratch page. KVM also needs to
+ * flush the TLB when enabling x2AVIC, as accesses need to go to the
+ * scratch page, but the TLB may contain entries tagged as xAVIC, i.e.
+ * entries pointing to the vCPU's virtual APIC page.
+ */
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
+
+ /*
* Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
* accesses, while interrupt injection to a running vCPU can be
* achieved using AVIC doorbell. KVM disables the APIC access page
@@ -219,12 +225,6 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
/* Disabling MSR intercept for x2APIC registers */
avic_set_x2apic_msr_interception(svm, false);
} else {
- /*
- * Flush the TLB, the guest may have inserted a non-APIC
- * mapping into the TLB while AVIC was disabled.
- */
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
-
/* Enabling MSR intercept for x2APIC registers */
avic_set_x2apic_msr_interception(svm, true);
}
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 1bf3e4804ad0a..7ad4b4fb7a1c0 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -30,27 +30,42 @@
#include "lapic.h"
#include "svm.h"
#include "hyperv.h"
+#include "pmu.h"
#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
- struct x86_exception *fault)
+ struct x86_exception *fault,
+ bool from_hardware)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
+ u64 fault_stage;
- if (vmcb->control.exit_code != SVM_EXIT_NPF) {
- /*
- * TODO: track the cause of the nested page fault, and
- * correctly fill in the high bits of exit_info_1.
- */
- vmcb->control.exit_code = SVM_EXIT_NPF;
- vmcb->control.exit_info_1 = (1ULL << 32);
- vmcb->control.exit_info_2 = fault->address;
- }
+ /*
+ * For hardware NPF exits, the GUEST_FAULT_STAGE bits are only
+ * available in the hardware exit_info_1, since the guest_mmu
+ * walker doesn't know whether the faulting GPA was a page table
+ * page or final page from L2's perspective.
+ */
+ if (from_hardware)
+ fault_stage = vmcb->control.exit_info_1 &
+ PFERR_GUEST_FAULT_STAGE_MASK;
+ else
+ fault_stage = fault->error_code & PFERR_GUEST_FAULT_STAGE_MASK;
- vmcb->control.exit_info_1 &= ~0xffffffffULL;
- vmcb->control.exit_info_1 |= fault->error_code;
+ /*
+ * All nested page faults should be annotated as occurring on the
+ * final translation *or* the page walk. Arbitrarily choose "final"
+ * if KVM is buggy and enumerated both or neither.
+ */
+ if (WARN_ON_ONCE(hweight64(fault_stage) != 1))
+ fault_stage = PFERR_GUEST_FINAL_MASK;
+
+ vmcb->control.exit_code = SVM_EXIT_NPF;
+ vmcb->control.exit_info_1 = fault_stage |
+ (fault->error_code & ~PFERR_GUEST_FAULT_STAGE_MASK);
+ vmcb->control.exit_info_2 = fault->address;
nested_svm_vmexit(svm);
}
@@ -421,7 +436,8 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
/* Common checks that apply to both L1 and L2 state. */
static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu,
- struct vmcb_save_area_cached *save)
+ struct vmcb_save_area_cached *save,
+ bool check_gpat)
{
if (CC(!(save->efer & EFER_SVME)))
return false;
@@ -456,6 +472,15 @@ static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu,
if (CC(!kvm_valid_efer(vcpu, save->efer)))
return false;
+ /*
+ * If userspace contrives to get an invalid g_pat into vmcb02 by
+ * disabling KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT in a race with
+ * this check, it should be prepared for the KVM_EXIT_FAIL_ENTRY
+ * that will follow.
+ */
+ if (check_gpat && CC(!kvm_pat_valid(save->g_pat)))
+ return false;
+
return true;
}
@@ -463,7 +488,8 @@ int nested_svm_check_cached_vmcb12(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (!nested_vmcb_check_save(vcpu, &svm->nested.save) ||
+ if (!nested_vmcb_check_save(vcpu, &svm->nested.save,
+ l2_has_separate_pat(vcpu)) ||
!nested_vmcb_check_controls(vcpu, &svm->nested.ctl))
return -EINVAL;
@@ -576,6 +602,7 @@ static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to,
to->rax = from->rax;
to->cr2 = from->cr2;
+ to->g_pat = from->g_pat;
svm_copy_lbrs(to, from);
}
@@ -705,15 +732,6 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
return 0;
}
-void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
-{
- if (!svm->nested.vmcb02.ptr)
- return;
-
- /* FIXME: merge g_pat from vmcb01 and vmcb12. */
- svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
-}
-
static bool nested_vmcb12_has_lbrv(struct kvm_vcpu *vcpu)
{
return guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
@@ -729,9 +747,6 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm)
struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
struct kvm_vcpu *vcpu = &svm->vcpu;
- nested_vmcb02_compute_g_pat(svm);
- vmcb_mark_dirty(vmcb02, VMCB_NPT);
-
/* Load the nested guest state */
if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
new_vmcb12 = true;
@@ -762,6 +777,13 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm)
vmcb_mark_dirty(vmcb02, VMCB_CET);
}
+ if (l2_has_separate_pat(vcpu)) {
+ if (unlikely(new_vmcb12 || vmcb12_is_dirty(control, VMCB_NPT)))
+ vmcb_set_gpat(vmcb02, svm->nested.save.g_pat);
+ } else if (npt_enabled) {
+ vmcb_set_gpat(vmcb02, vcpu->arch.pat);
+ }
+
kvm_set_rflags(vcpu, save->rflags | X86_EFLAGS_FIXED);
svm_set_efer(vcpu, svm->nested.save.efer);
@@ -838,6 +860,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
/* Enter Guest-Mode */
enter_guest_mode(vcpu);
+ svm_pmu_handle_nested_transition(svm);
/*
* Filled at exit: exit_code, exit_info_1, exit_info_2, exit_int_info,
@@ -1119,16 +1142,22 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
}
ret = nested_svm_copy_vmcb12_to_cache(vcpu, vmcb12_gpa);
- if (ret) {
- if (ret == -EFAULT)
- return kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
+ if (ret == -EFAULT)
+ return kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
- /* Advance RIP past VMRUN as part of the nested #VMEXIT. */
- return kvm_skip_emulated_instruction(vcpu);
- }
+ /*
+ * At this point, VMRUN is guaranteed to not fault; advance RIP. If
+ * caching vmcb12 failed for other reasons, return immediately afterward
+ * as a nested #VMEXIT was already set up.
+ *
+ * FIXME: If TF is set on VMRUN, KVM should inject a #DB (or handle guest
+ * debugging) right after #VMEXIT; right now TF is simply ignored.
+ */
+ if (!svm_skip_emulated_instruction(vcpu))
+ return 0;
- /* At this point, VMRUN is guaranteed to not fault; advance RIP. */
- ret = kvm_skip_emulated_instruction(vcpu);
+ if (ret)
+ goto insn_retired;
/*
* Since vmcb01 is not in use, we can use it to store some of the L1
@@ -1158,7 +1187,13 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
nested_svm_vmexit(svm);
}
- return ret;
+insn_retired:
+ /*
+ * A successful VMRUN is counted by the PMU in guest mode, so only
+ * retire the instruction after potentially entering guest mode.
+ */
+ kvm_pmu_instruction_retired(vcpu);
+ return 1;
}
/* Copy state save area fields which are handled by VMRUN */
@@ -1242,6 +1277,9 @@ static int nested_svm_vmexit_update_vmcb12(struct kvm_vcpu *vcpu)
vmcb12->save.dr6 = svm->vcpu.arch.dr6;
vmcb12->save.cpl = vmcb02->save.cpl;
+ if (l2_has_separate_pat(vcpu))
+ vmcb12->save.g_pat = vmcb02->save.g_pat;
+
if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) {
vmcb12->save.s_cet = vmcb02->save.s_cet;
vmcb12->save.isst_addr = vmcb02->save.isst_addr;
@@ -1288,6 +1326,8 @@ void nested_svm_vmexit(struct vcpu_svm *svm)
/* Exit Guest-Mode */
leave_guest_mode(vcpu);
+ svm_pmu_handle_nested_transition(svm);
+
svm->nested.vmcb12_gpa = 0;
kvm_warn_on_nested_run_pending(vcpu);
@@ -1499,6 +1539,15 @@ void svm_leave_nested(struct kvm_vcpu *vcpu)
leave_guest_mode(vcpu);
+ /*
+ * Forcibly leaving nested mode is a non-architectural flow, so precision
+ * isn't a priority. Defer updating the PMU until the next vCPU
+ * run, potentially tolerating some imprecision to avoid poking
+ * into PMU state from arbitrary contexts (e.g. to avoid using
+ * stale state).
+ */
+ __svm_pmu_handle_nested_transition(svm, true);
+
svm_switch_vmcb(svm, &svm->vmcb01);
nested_svm_uninit_mmu_context(vcpu);
@@ -1851,6 +1900,9 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
/* First fill in the header and copy it out. */
if (is_guest_mode(vcpu)) {
kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa;
+ kvm_state.hdr.svm.gpat = 0;
+ if (l2_has_separate_pat(vcpu))
+ kvm_state.hdr.svm.gpat = svm->vmcb->save.g_pat;
kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE;
kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
@@ -1903,6 +1955,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
struct vmcb_save_area *save;
struct vmcb_save_area_cached save_cached;
struct vmcb_ctrl_area_cached ctl_cached;
+ bool use_separate_l2_pat;
unsigned long cr0;
int ret;
@@ -1967,15 +2020,29 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
/*
* Validate host state saved from before VMRUN (see
- * nested_svm_check_permissions).
+ * nested_svm_check_permissions). Note that the g_pat field is not
+ * validated, because (a) it may have been clobbered by SMM before
+ * KVM_GET_NESTED_STATE, and (b) it is not loaded at emulated
+ * #VMEXIT.
*/
__nested_copy_vmcb_save_to_cache(&save_cached, save);
if (!(save->cr0 & X86_CR0_PG) ||
!(save->cr0 & X86_CR0_PE) ||
(save->rflags & X86_EFLAGS_VM) ||
- !nested_vmcb_check_save(vcpu, &save_cached))
+ !nested_vmcb_check_save(vcpu, &save_cached, false))
goto out_free;
+ /*
+ * Validate gPAT when the shared PAT quirk is disabled (i.e. L2
+ * has its own gPAT). This is done separately from the
+ * vmcb_save_area_cached validation above, because gPAT is L2
+ * state, but the vmcb_save_area_cached is populated with L1 state.
+ */
+ use_separate_l2_pat = (ctl_cached.misc_ctl & SVM_MISC_ENABLE_NP) &&
+ !kvm_check_has_quirk(vcpu->kvm,
+ KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT);
+ if (use_separate_l2_pat && !kvm_pat_valid(kvm_state->hdr.svm.gpat))
+ goto out_free;
/*
* All checks done, we can enter guest mode. Userspace provides
@@ -2002,6 +2069,10 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
nested_copy_vmcb_control_to_cache(svm, ctl);
svm_switch_vmcb(svm, &svm->nested.vmcb02);
+
+ if (use_separate_l2_pat)
+ vmcb_set_gpat(svm->vmcb, kvm_state->hdr.svm.gpat);
+
nested_vmcb02_prepare_control(svm);
/*
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 7aa298eeb0721..c18286545a7ac 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -168,6 +168,12 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
pmc->eventsel = data;
pmc->eventsel_hw = (data & ~AMD64_EVENTSEL_HOSTONLY) |
AMD64_EVENTSEL_GUESTONLY;
+
+ if (data & AMD64_EVENTSEL_HOST_GUEST_MASK)
+ __set_bit(pmc->idx, pmu->pmc_has_mode_specific_enables);
+ else
+ __clear_bit(pmc->idx, pmu->pmc_has_mode_specific_enables);
+
kvm_pmu_request_counter_reprogram(pmc);
}
return 0;
@@ -207,7 +213,11 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
}
pmu->counter_bitmask[KVM_PMC_GP] = BIT_ULL(48) - 1;
+
pmu->reserved_bits = 0xfffffff000280000ull;
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SVM) && kvm_vcpu_has_mediated_pmu(vcpu))
+ pmu->reserved_bits &= ~AMD64_EVENTSEL_HOST_GUEST_MASK;
+
pmu->raw_event_mask = AMD64_RAW_EVENT_MASK;
/* not applicable to AMD; but clean them to prevent any fall out */
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
@@ -260,6 +270,37 @@ static void amd_mediated_pmu_put(struct kvm_vcpu *vcpu)
wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, pmu->global_status);
}
+static bool amd_pmc_is_disabled_in_current_mode(struct kvm_pmc *pmc)
+{
+ struct kvm_vcpu *vcpu = pmc->vcpu;
+ u64 host_guest_bits;
+
+ if (!kvm_vcpu_has_mediated_pmu(vcpu))
+ return false;
+
+ /* Common code is supposed to check the common enable bit */
+ if (WARN_ON_ONCE(!(pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE)))
+ return false;
+
+ /* If both bits are cleared, the counter is always enabled */
+ host_guest_bits = pmc->eventsel & AMD64_EVENTSEL_HOST_GUEST_MASK;
+ if (!host_guest_bits)
+ return false;
+
+ /* If EFER.SVME=0 and either bit is set, the counter is disabled */
+ if (!(vcpu->arch.efer & EFER_SVME))
+ return true;
+
+ /*
+ * If EFER.SVME=1, the counter is disabled iff only one of the bits is
+ * set AND the set bit doesn't match the vCPU mode.
+ */
+ if (host_guest_bits == AMD64_EVENTSEL_HOST_GUEST_MASK)
+ return false;
+
+ return !!(host_guest_bits & AMD64_EVENTSEL_GUESTONLY) != is_guest_mode(vcpu);
+}
+
struct kvm_pmu_ops amd_pmu_ops __initdata = {
.rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc,
.msr_idx_to_pmc = amd_msr_idx_to_pmc,
@@ -269,6 +310,7 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = {
.set_msr = amd_pmu_set_msr,
.refresh = amd_pmu_refresh,
.init = amd_pmu_init,
+ .pmc_is_disabled_in_current_mode = amd_pmc_is_disabled_in_current_mode,
.is_mediated_pmu_supported = amd_pmu_is_mediated_pmu_supported,
.mediated_load = amd_mediated_pmu_load,
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 18a7cddb097d2..e0a2edc1543a6 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -721,14 +721,50 @@ e_free_dh:
return ret;
}
+static int sev_check_pin_count(struct kvm *kvm, unsigned long npages)
+{
+ unsigned long total_npages, lock_limit;
+
+ total_npages = to_kvm_sev_info(kvm)->pages_locked + npages;
+ if (total_npages > totalram_pages())
+ return -EINVAL;
+
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if (total_npages > lock_limit && !capable(CAP_IPC_LOCK)) {
+ pr_err_ratelimited("SEV: %lu total pages would exceed the lock limit of %lu.\n",
+ total_npages, lock_limit);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int sev_pin_user_pages(struct kvm *kvm, unsigned long addr, int npages,
+ unsigned int gup_flags, struct page **pages)
+{
+ int npinned;
+
+ lockdep_assert_held(&kvm->lock);
+
+ npinned = pin_user_pages_fast(addr, npages, gup_flags, pages);
+ if (npinned != npages) {
+ if (npinned > 0)
+ unpin_user_pages(pages, npinned);
+ pr_err_ratelimited("SEV: Failure locking %u pages.\n", npages);
+ return -ENOMEM;
+ }
+
+ to_kvm_sev_info(kvm)->pages_locked += npages;
+ return 0;
+}
+
static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
unsigned long ulen, unsigned long *n,
unsigned int flags)
{
- struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
- unsigned long npages, total_npages, lock_limit;
+ unsigned long npages;
struct page **pages;
- int npinned, ret;
+ int ret;
lockdep_assert_held(&kvm->lock);
@@ -744,16 +780,9 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
if (npages > INT_MAX)
return ERR_PTR(-EINVAL);
- total_npages = sev->pages_locked + npages;
- if (total_npages > totalram_pages())
- return ERR_PTR(-EINVAL);
-
- lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- if (total_npages > lock_limit && !capable(CAP_IPC_LOCK)) {
- pr_err("SEV: %lu total pages would exceed the lock limit of %lu.\n",
- total_npages, lock_limit);
- return ERR_PTR(-ENOMEM);
- }
+ ret = sev_check_pin_count(kvm, npages);
+ if (ret)
+ return ERR_PTR(ret);
/*
* Don't WARN if the kernel (rightly) thinks the total size is absurd,
@@ -765,25 +794,14 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
if (!pages)
return ERR_PTR(-ENOMEM);
- /* Pin the user virtual address. */
- npinned = pin_user_pages_fast(uaddr, npages, flags, pages);
- if (npinned != npages) {
- pr_err("SEV: Failure locking %lu pages.\n", npages);
- ret = -ENOMEM;
- goto err;
+ ret = sev_pin_user_pages(kvm, uaddr, npages, flags, pages);
+ if (ret) {
+ kvfree(pages);
+ return ERR_PTR(ret);
}
*n = npages;
- sev->pages_locked = total_npages;
-
return pages;
-
-err:
- if (npinned > 0)
- unpin_user_pages(pages, npinned);
-
- kvfree(pages);
- return ERR_PTR(ret);
}
static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
@@ -794,6 +812,29 @@ static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
to_kvm_sev_info(kvm)->pages_locked -= npages;
}
+static struct page *sev_pin_page(struct kvm *kvm, unsigned long addr,
+ unsigned int flags)
+{
+ struct page *page;
+ int r;
+
+ r = sev_check_pin_count(kvm, 1);
+ if (r)
+ return ERR_PTR(r);
+
+ r = sev_pin_user_pages(kvm, addr, 1, flags, &page);
+ if (r)
+ return ERR_PTR(r);
+
+ return page;
+}
+
+static void sev_unpin_page(struct kvm *kvm, struct page *page)
+{
+ unpin_user_pages(&page, 1);
+ to_kvm_sev_info(kvm)->pages_locked -= 1;
+}
+
static void sev_clflush_pages(struct page *pages[], unsigned long npages)
{
uint8_t *page_virtual;
@@ -1197,160 +1238,115 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
return ret;
}
-static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
- unsigned long dst, int size,
- int *error, bool enc)
+static int sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src_pa,
+ unsigned long dst_pa, unsigned int size,
+ unsigned int ioctl, int *error)
{
- struct sev_data_dbg data;
-
- data.reserved = 0;
- data.handle = to_kvm_sev_info(kvm)->handle;
- data.dst_addr = dst;
- data.src_addr = src;
- data.len = size;
+ int cmd = ioctl == KVM_SEV_DBG_DECRYPT ? SEV_CMD_DBG_DECRYPT :
+ SEV_CMD_DBG_ENCRYPT;
+ struct sev_data_dbg data = {
+ .handle = to_kvm_sev_info(kvm)->handle,
+ .dst_addr = dst_pa,
+ .src_addr = src_pa,
+ .len = size,
+ };
- return sev_issue_cmd(kvm,
- enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
- &data, error);
+ return sev_issue_cmd(kvm, cmd, &data, error);
}
-static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
- unsigned long dst_paddr, int sz, int *err)
+static void *sev_dbg_crypt_slow_alloc(struct page *page, unsigned long __va,
+ unsigned int len, unsigned long *pa,
+ unsigned int *nr_bytes)
{
- int offset;
+ unsigned long va = ALIGN_DOWN(__va, 16);
+
+ /* The number of bytes to {de,en}crypt must be 16-byte aligned. */
+ *nr_bytes = round_up(len, 16);
/*
- * Its safe to read more than we are asked, caller should ensure that
- * destination has enough space.
+ * Increase the number of bytes to {de,en}crypt by one chunk (16 bytes)
+ * if the aligned address and length doesn't cover the unaligned range,
+ * e.g. if the address is unaligned _and_ the access will split a chunk
+ * at the tail.
*/
- offset = src_paddr & 15;
- src_paddr = round_down(src_paddr, 16);
- sz = round_up(sz + offset, 16);
-
- return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
-}
+ if (va + *nr_bytes < __va + len)
+ *nr_bytes += 16;
-static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
- void __user *dst_uaddr,
- unsigned long dst_paddr,
- int size, int *err)
-{
- struct page *tpage = NULL;
- int ret, offset;
+ *pa = __sme_page_pa(page) + (va & ~PAGE_MASK);
- /* if inputs are not 16-byte then use intermediate buffer */
- if (!IS_ALIGNED(dst_paddr, 16) ||
- !IS_ALIGNED(paddr, 16) ||
- !IS_ALIGNED(size, 16)) {
- tpage = (void *)alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!tpage)
- return -ENOMEM;
+ /*
+ * Sanity check that the new access won't split a page. This should
+ * never happen; just pretend the allocation failed.
+ */
+ if (WARN_ON_ONCE((*pa & PAGE_MASK) != ((*pa + *nr_bytes - 1) & PAGE_MASK)))
+ return NULL;
- dst_paddr = __sme_page_pa(tpage);
- }
+ return kmalloc(*nr_bytes, GFP_KERNEL);
+}
- ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
- if (ret)
- goto e_free;
+static int sev_dbg_decrypt_slow(struct kvm *kvm, unsigned long src,
+ struct page *src_p, unsigned long dst,
+ unsigned int len, int *err)
+{
+ unsigned int nr_bytes;
+ unsigned long src_pa;
+ void *buf;
+ int r;
- if (tpage) {
- offset = paddr & 15;
- if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size))
- ret = -EFAULT;
- }
+ buf = sev_dbg_crypt_slow_alloc(src_p, src, len, &src_pa, &nr_bytes);
+ if (!buf)
+ return -ENOMEM;
-e_free:
- if (tpage)
- __free_page(tpage);
+ r = sev_issue_dbg_cmd(kvm, src_pa, __sme_set(__pa(buf)),
+ nr_bytes, KVM_SEV_DBG_DECRYPT, err);
+ if (r)
+ goto out;
- return ret;
+ if (copy_to_user((void __user *)dst, buf + (src & 15), len))
+ r = -EFAULT;
+out:
+ kfree(buf);
+ return r;
}
-static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
- void __user *vaddr,
- unsigned long dst_paddr,
- void __user *dst_vaddr,
- int size, int *error)
+static int sev_dbg_encrypt_slow(struct kvm *kvm, unsigned long src,
+ unsigned long dst, struct page *dst_p,
+ unsigned int len, int *err)
{
- struct page *src_tpage = NULL;
- struct page *dst_tpage = NULL;
- int ret, len = size;
-
- /* If source buffer is not aligned then use an intermediate buffer */
- if (!IS_ALIGNED((unsigned long)vaddr, 16)) {
- src_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
- if (!src_tpage)
- return -ENOMEM;
+ unsigned int nr_bytes;
+ unsigned long dst_pa;
+ void *buf;
+ int r;
- if (copy_from_user(page_address(src_tpage), vaddr, size)) {
- __free_page(src_tpage);
- return -EFAULT;
- }
+ /* Decrypt the _destination_ to do a RMW on plaintext. */
+ buf = sev_dbg_crypt_slow_alloc(dst_p, dst, len, &dst_pa, &nr_bytes);
+ if (!buf)
+ return -ENOMEM;
- paddr = __sme_page_pa(src_tpage);
- }
+ r = sev_issue_dbg_cmd(kvm, dst_pa, __sme_set(__pa(buf)),
+ nr_bytes, KVM_SEV_DBG_DECRYPT, err);
+ if (r)
+ goto out;
/*
- * If destination buffer or length is not aligned then do read-modify-write:
- * - decrypt destination in an intermediate buffer
- * - copy the source buffer in an intermediate buffer
- * - use the intermediate buffer as source buffer
+ * Copy from the source into the intermediate buffer, and then
+ * re-encrypt the buffer into the destination.
*/
- if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
- int dst_offset;
-
- dst_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
- if (!dst_tpage) {
- ret = -ENOMEM;
- goto e_free;
- }
-
- ret = __sev_dbg_decrypt(kvm, dst_paddr,
- __sme_page_pa(dst_tpage), size, error);
- if (ret)
- goto e_free;
-
- /*
- * If source is kernel buffer then use memcpy() otherwise
- * copy_from_user().
- */
- dst_offset = dst_paddr & 15;
-
- if (src_tpage)
- memcpy(page_address(dst_tpage) + dst_offset,
- page_address(src_tpage), size);
- else {
- if (copy_from_user(page_address(dst_tpage) + dst_offset,
- vaddr, size)) {
- ret = -EFAULT;
- goto e_free;
- }
- }
-
- paddr = __sme_page_pa(dst_tpage);
- dst_paddr = round_down(dst_paddr, 16);
- len = round_up(size, 16);
- }
-
- ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
-
-e_free:
- if (src_tpage)
- __free_page(src_tpage);
- if (dst_tpage)
- __free_page(dst_tpage);
- return ret;
+ if (copy_from_user(buf + (dst & 15), (void __user *)src, len))
+ r = -EFAULT;
+ else
+ r = sev_issue_dbg_cmd(kvm, __sme_set(__pa(buf)), dst_pa,
+ nr_bytes, KVM_SEV_DBG_ENCRYPT, err);
+out:
+ kfree(buf);
+ return r;
}
-static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
+static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ unsigned int cmd)
{
- unsigned long vaddr, vaddr_end, next_vaddr;
- unsigned long dst_vaddr;
- struct page **src_p, **dst_p;
struct kvm_sev_dbg debug;
- unsigned long n;
- unsigned int size;
- int ret;
+ unsigned int i, len;
if (!sev_guest(kvm))
return -ENOTTY;
@@ -1358,27 +1354,38 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
if (copy_from_user(&debug, u64_to_user_ptr(argp->data), sizeof(debug)))
return -EFAULT;
- if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
+ if (!debug.len || !debug.src_uaddr || !debug.dst_uaddr)
return -EINVAL;
- if (!debug.dst_uaddr)
+
+ if (debug.src_uaddr + debug.len < debug.src_uaddr ||
+ debug.dst_uaddr + debug.len < debug.dst_uaddr)
return -EINVAL;
- vaddr = debug.src_uaddr;
- size = debug.len;
- vaddr_end = vaddr + size;
- dst_vaddr = debug.dst_uaddr;
+ for (i = 0; i < debug.len; i += len) {
+ unsigned long src = debug.src_uaddr + i;
+ unsigned long dst = debug.dst_uaddr + i;
+ unsigned long s_off = src & ~PAGE_MASK;
+ unsigned long d_off = dst & ~PAGE_MASK;
+ struct page *src_p, *dst_p;
+ int ret;
- for (; vaddr < vaddr_end; vaddr = next_vaddr) {
- int len, s_off, d_off;
+ /*
+ * Copy as many remaining bytes as possible while staying in a
+ * single page for both the source and destination.
+ */
+ len = min3(debug.len - i, PAGE_SIZE - s_off, PAGE_SIZE - d_off);
- /* lock userspace source and destination page */
- src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
+ /*
+ * Pin the source and destination pages; firmware operates on
+ * physical addresses.
+ */
+ src_p = sev_pin_page(kvm, src & PAGE_MASK, 0);
if (IS_ERR(src_p))
return PTR_ERR(src_p);
- dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE);
+ dst_p = sev_pin_page(kvm, dst & PAGE_MASK, FOLL_WRITE);
if (IS_ERR(dst_p)) {
- sev_unpin_memory(kvm, src_p, n);
+ sev_unpin_page(kvm, src_p);
return PTR_ERR(dst_p);
}
@@ -1387,43 +1394,28 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
* the pages; flush the destination too so that future accesses do not
* see stale data.
*/
- sev_clflush_pages(src_p, 1);
- sev_clflush_pages(dst_p, 1);
-
- /*
- * Since user buffer may not be page aligned, calculate the
- * offset within the page.
- */
- s_off = vaddr & ~PAGE_MASK;
- d_off = dst_vaddr & ~PAGE_MASK;
- len = min_t(size_t, (PAGE_SIZE - s_off), size);
+ sev_clflush_pages(&src_p, 1);
+ sev_clflush_pages(&dst_p, 1);
- if (dec)
- ret = __sev_dbg_decrypt_user(kvm,
- __sme_page_pa(src_p[0]) + s_off,
- (void __user *)dst_vaddr,
- __sme_page_pa(dst_p[0]) + d_off,
- len, &argp->error);
+ if (IS_ALIGNED(src, 16) && IS_ALIGNED(dst, 16) && IS_ALIGNED(len, 16))
+ ret = sev_issue_dbg_cmd(kvm,
+ __sme_page_pa(src_p) + s_off,
+ __sme_page_pa(dst_p) + d_off,
+ len, cmd, &argp->error);
+ else if (cmd == KVM_SEV_DBG_DECRYPT)
+ ret = sev_dbg_decrypt_slow(kvm, src, src_p, dst,
+ len, &argp->error);
else
- ret = __sev_dbg_encrypt_user(kvm,
- __sme_page_pa(src_p[0]) + s_off,
- (void __user *)vaddr,
- __sme_page_pa(dst_p[0]) + d_off,
- (void __user *)dst_vaddr,
- len, &argp->error);
+ ret = sev_dbg_encrypt_slow(kvm, src, dst, dst_p,
+ len, &argp->error);
- sev_unpin_memory(kvm, src_p, n);
- sev_unpin_memory(kvm, dst_p, n);
+ sev_unpin_page(kvm, src_p);
+ sev_unpin_page(kvm, dst_p);
if (ret)
- goto err;
-
- next_vaddr = vaddr + len;
- dst_vaddr = dst_vaddr + len;
- size -= len;
+ return ret;
}
-err:
- return ret;
+ return 0;
}
static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
@@ -1696,8 +1688,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
struct sev_data_send_update_data data;
struct kvm_sev_send_update_data params;
void *hdr, *trans_data;
- struct page **guest_page;
- unsigned long n;
+ struct page *guest_page;
int ret, offset;
if (!sev_guest(kvm))
@@ -1721,8 +1712,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
return -EINVAL;
/* Pin guest memory */
- guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
- PAGE_SIZE, &n, 0);
+ guest_page = sev_pin_page(kvm, params.guest_uaddr & PAGE_MASK, 0);
if (IS_ERR(guest_page))
return PTR_ERR(guest_page);
@@ -1743,7 +1733,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
data.trans_len = params.trans_len;
/* The SEND_UPDATE_DATA command requires C-bit to be always set. */
- data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+ data.guest_address = page_to_phys(guest_page) + offset;
data.guest_address |= sev_me_mask;
data.guest_len = params.guest_len;
data.handle = to_kvm_sev_info(kvm)->handle;
@@ -1770,8 +1760,7 @@ e_free_trans_data:
e_free_hdr:
kfree(hdr);
e_unpin:
- sev_unpin_memory(kvm, guest_page, n);
-
+ sev_unpin_page(kvm, guest_page);
return ret;
}
@@ -1876,8 +1865,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
struct kvm_sev_receive_update_data params;
struct sev_data_receive_update_data data;
void *hdr = NULL, *trans = NULL;
- struct page **guest_page;
- unsigned long n;
+ struct page *guest_page;
int ret, offset;
if (!sev_guest(kvm))
@@ -1914,8 +1902,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
data.trans_len = params.trans_len;
/* Pin guest memory */
- guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
- PAGE_SIZE, &n, FOLL_WRITE);
+ guest_page = sev_pin_page(kvm, params.guest_uaddr & PAGE_MASK, FOLL_WRITE);
if (IS_ERR(guest_page)) {
ret = PTR_ERR(guest_page);
goto e_free_trans;
@@ -1926,10 +1913,10 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
* encrypts the written data with the guest's key, and the cache may
* contain dirty, unencrypted data.
*/
- sev_clflush_pages(guest_page, n);
+ sev_clflush_pages(&guest_page, 1);
/* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */
- data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+ data.guest_address = page_to_phys(guest_page) + offset;
data.guest_address |= sev_me_mask;
data.guest_len = params.guest_len;
data.handle = to_kvm_sev_info(kvm)->handle;
@@ -1937,7 +1924,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
&argp->error);
- sev_unpin_memory(kvm, guest_page, n);
+ sev_unpin_page(kvm, guest_page);
e_free_trans:
kfree(trans);
@@ -2361,8 +2348,8 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
memcpy(dst_vaddr, src_vaddr, PAGE_SIZE);
- kunmap_local(src_vaddr);
kunmap_local(dst_vaddr);
+ kunmap_local(src_vaddr);
}
ret = rmp_make_private(pfn, gfn << PAGE_SHIFT, PG_LEVEL_4K,
@@ -2396,9 +2383,10 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
void *dst_vaddr = kmap_local_pfn(pfn);
memcpy(src_vaddr, dst_vaddr, PAGE_SIZE);
+ set_page_dirty(src_page);
- kunmap_local(src_vaddr);
kunmap_local(dst_vaddr);
+ kunmap_local(src_vaddr);
}
out:
@@ -2470,6 +2458,7 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
sev_populate_args.type = params.type;
count = kvm_gmem_populate(kvm, params.gfn_start, src, npages,
+ params.type == KVM_SEV_SNP_PAGE_TYPE_CPUID,
sev_gmem_post_populate, &sev_populate_args);
if (count < 0) {
argp->error = sev_populate_args.fw_error;
@@ -2690,10 +2679,8 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
r = sev_guest_status(kvm, &sev_cmd);
break;
case KVM_SEV_DBG_DECRYPT:
- r = sev_dbg_crypt(kvm, &sev_cmd, true);
- break;
case KVM_SEV_DBG_ENCRYPT:
- r = sev_dbg_crypt(kvm, &sev_cmd, false);
+ r = sev_dbg_crypt(kvm, &sev_cmd, sev_cmd.id);
break;
case KVM_SEV_LAUNCH_SECRET:
r = sev_launch_secret(kvm, &sev_cmd);
@@ -3014,18 +3001,14 @@ void sev_vm_destroy(struct kvm *kvm)
void __init sev_set_cpu_caps(void)
{
- if (sev_enabled) {
+ if (sev_enabled)
kvm_cpu_cap_set(X86_FEATURE_SEV);
- kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_VM);
- }
- if (sev_es_enabled) {
+
+ if (sev_es_enabled)
kvm_cpu_cap_set(X86_FEATURE_SEV_ES);
- kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM);
- }
- if (sev_snp_enabled) {
+
+ if (sev_snp_enabled)
kvm_cpu_cap_set(X86_FEATURE_SEV_SNP);
- kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM);
- }
}
static bool is_sev_snp_initialized(void)
@@ -3055,6 +3038,11 @@ out:
return initialized;
}
+static const char * __init sev_str_feature_state(bool is_supported, bool is_usable)
+{
+ return is_supported ? is_usable ? "enabled" : "unusable" : "disabled";
+}
+
void __init sev_hardware_setup(void)
{
unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
@@ -3062,6 +3050,7 @@ void __init sev_hardware_setup(void)
bool sev_snp_supported = false;
bool sev_es_supported = false;
bool sev_supported = false;
+ u32 vm_types = 0;
if (!sev_enabled || !npt_enabled || !nrips)
goto out;
@@ -3195,21 +3184,27 @@ out:
}
}
+ if (sev_supported && min_sev_asid <= max_sev_asid)
+ vm_types |= BIT(KVM_X86_SEV_VM);
+ if (sev_es_supported && min_sev_es_asid <= max_sev_es_asid)
+ vm_types |= BIT(KVM_X86_SEV_ES_VM);
+ if (sev_snp_supported)
+ vm_types |= BIT(KVM_X86_SNP_VM);
+ vm_types &= sev_firmware_supported_vm_types();
+
+ kvm_caps.supported_vm_types |= vm_types;
+
if (boot_cpu_has(X86_FEATURE_SEV))
pr_info("SEV %s (ASIDs %u - %u)\n",
- sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" :
- "unusable" :
- "disabled",
+ sev_str_feature_state(sev_supported, vm_types & BIT(KVM_X86_SEV_VM)),
min_sev_asid, max_sev_asid);
if (boot_cpu_has(X86_FEATURE_SEV_ES))
pr_info("SEV-ES %s (ASIDs %u - %u)\n",
- sev_es_supported ? min_sev_es_asid <= max_sev_es_asid ? "enabled" :
- "unusable" :
- "disabled",
+ sev_str_feature_state(sev_es_supported, vm_types & BIT(KVM_X86_SEV_ES_VM)),
min_sev_es_asid, max_sev_es_asid);
if (boot_cpu_has(X86_FEATURE_SEV_SNP))
pr_info("SEV-SNP %s (ASIDs %u - %u)\n",
- str_enabled_disabled(sev_snp_supported),
+ sev_str_feature_state(sev_snp_supported, vm_types & BIT(KVM_X86_SNP_VM)),
min_snp_asid, max_snp_asid);
sev_enabled = sev_supported;
@@ -3782,9 +3777,13 @@ static int snp_rmptable_psmash(kvm_pfn_t pfn)
static int snp_complete_psc_msr(struct kvm_vcpu *vcpu)
{
+ u64 hypercall_ret = READ_ONCE(vcpu->run->hypercall.ret);
struct vcpu_svm *svm = to_svm(vcpu);
- if (vcpu->run->hypercall.ret)
+ if (!kvm_is_valid_map_gpa_range_ret(hypercall_ret))
+ return -EINVAL;
+
+ if (hypercall_ret)
set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
else
set_ghcb_msr(svm, GHCB_MSR_PSC_RESP);
@@ -3875,10 +3874,14 @@ static void __snp_complete_one_psc(struct vcpu_svm *svm)
static int snp_complete_one_psc(struct kvm_vcpu *vcpu)
{
+ u64 hypercall_ret = READ_ONCE(vcpu->run->hypercall.ret);
struct vcpu_svm *svm = to_svm(vcpu);
struct psc_buffer *psc = svm->sev_es.ghcb_sa;
- if (vcpu->run->hypercall.ret) {
+ if (!kvm_is_valid_map_gpa_range_ret(hypercall_ret))
+ return -EINVAL;
+
+ if (hypercall_ret) {
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
return 1; /* resume guest */
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index adfa9ff48c573..a5ec2e66a80e5 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -266,6 +266,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
set_exception_intercept(svm, GP_VECTOR);
}
+ svm_pmu_handle_nested_transition(svm);
kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu);
}
@@ -334,7 +335,7 @@ done:
return 1;
}
-static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
return __svm_skip_emulated_instruction(vcpu, EMULTYPE_SKIP, true);
}
@@ -2798,6 +2799,20 @@ static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu,
!msr_write_intercepted(to_svm(vcpu), msr_info->index);
}
+static bool svm_pat_accesses_gpat(struct kvm_vcpu *vcpu, bool from_host)
+{
+ /*
+ * When KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT is disabled and nested
+ * NPT is enabled, L2 has a separate PAT from L1. Guest accesses
+ * to IA32_PAT while running L2 target L2's gPAT; host-initiated
+ * accesses always target L1's hPAT so that KVM_GET/SET_MSRS and
+ * KVM_GET/SET_NESTED_STATE are independent of each other and can
+ * be ordered arbitrarily during save and restore.
+ */
+ WARN_ON_ONCE(from_host && vcpu->wants_to_run);
+ return !from_host && is_guest_mode(vcpu) && l2_has_separate_pat(vcpu);
+}
+
static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -2914,6 +2929,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_AMD64_DE_CFG:
msr_info->data = svm->msr_decfg;
break;
+ case MSR_IA32_CR_PAT:
+ if (svm_pat_accesses_gpat(vcpu, msr_info->host_initiated)) {
+ msr_info->data = svm->vmcb->save.g_pat;
+ break;
+ }
+ return kvm_get_msr_common(vcpu, msr_info);
default:
return kvm_get_msr_common(vcpu, msr_info);
}
@@ -2997,14 +3018,23 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;
case MSR_IA32_CR_PAT:
+ if (svm_pat_accesses_gpat(vcpu, msr->host_initiated)) {
+ if (!kvm_pat_valid(data))
+ return 1;
+
+ vmcb_set_gpat(svm->vmcb, data);
+ break;
+ }
+
ret = kvm_set_msr_common(vcpu, msr);
if (ret)
break;
- svm->vmcb01.ptr->save.g_pat = data;
- if (is_guest_mode(vcpu))
- nested_vmcb02_compute_g_pat(svm);
- vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
+ if (npt_enabled) {
+ vmcb_set_gpat(svm->vmcb01.ptr, data);
+ if (is_guest_mode(vcpu) && !l2_has_separate_pat(vcpu))
+ vmcb_set_gpat(svm->vmcb, data);
+ }
break;
case MSR_IA32_SPEC_CTRL:
if (!msr->host_initiated &&
@@ -3675,13 +3705,8 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_run *kvm_run = vcpu->run;
- /* SEV-ES guests must use the CR write traps to track CR registers. */
- if (!is_sev_es_guest(vcpu)) {
- if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
- vcpu->arch.cr0 = svm->vmcb->save.cr0;
- if (npt_enabled)
- vcpu->arch.cr3 = svm->vmcb->save.cr3;
- }
+ if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE))
+ return 0;
if (is_guest_mode(vcpu)) {
int vmexit;
@@ -4536,11 +4561,17 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);
+ /* SEV-ES guests must use the CR write traps to track CR registers. */
if (!is_sev_es_guest(vcpu)) {
vcpu->arch.cr2 = svm->vmcb->save.cr2;
vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->arch.rip = svm->vmcb->save.rip;
+
+ if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
+ vcpu->arch.cr0 = svm->vmcb->save.cr0;
+ if (npt_enabled)
+ vcpu->arch.cr3 = svm->vmcb->save.cr3;
}
kvm_reset_dirty_registers(vcpu);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 19b80ef56e2b7..87c6b105deef6 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -24,6 +24,8 @@
#include "cpuid.h"
#include "kvm_cache_regs.h"
+#include "x86.h"
+#include "pmu.h"
/*
* Helpers to convert to/from physical addresses for pages whose address is
@@ -166,6 +168,7 @@ struct vmcb_save_area_cached {
u64 isst_addr;
u64 rax;
u64 cr2;
+ u64 g_pat;
u64 dbgctl;
u64 br_from;
u64 br_to;
@@ -464,6 +467,12 @@ static inline bool vmcb12_is_dirty(struct vmcb_ctrl_area_cached *control, int bi
return !test_bit(bit, (unsigned long *)&control->clean);
}
+static inline void vmcb_set_gpat(struct vmcb *vmcb, u64 data)
+{
+ vmcb->save.g_pat = data;
+ vmcb_mark_dirty(vmcb, VMCB_NPT);
+}
+
static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
{
return container_of(vcpu, struct vcpu_svm, vcpu);
@@ -641,6 +650,16 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
return svm->nested.ctl.misc_ctl & SVM_MISC_ENABLE_NP;
}
+static inline bool l2_has_separate_pat(struct kvm_vcpu *vcpu)
+{
+ /*
+ * If KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT is disabled while a vCPU
+ * is running, the L2 IA32_PAT semantics for that vCPU are undefined.
+ */
+ return nested_npt_enabled(to_svm(vcpu)) &&
+ !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT);
+}
+
static inline bool nested_vnmi_enabled(struct vcpu_svm *svm)
{
return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_VNMI) &&
@@ -814,6 +833,8 @@ static inline void svm_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
svm_set_intercept_for_msr(vcpu, msr, type, true);
}
+int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
+
/* nested.c */
#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
@@ -875,9 +896,30 @@ void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
struct vmcb_save_area *save);
void nested_sync_control_from_vmcb02(struct vcpu_svm *svm);
-void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm);
void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb);
+
+static inline void __svm_pmu_handle_nested_transition(struct vcpu_svm *svm,
+ bool defer)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(&svm->vcpu);
+ u64 counters = *(u64 *)pmu->pmc_has_mode_specific_enables;
+
+ __kvm_pmu_reprogram_counters(pmu, counters, defer);
+}
+
+static inline void svm_pmu_handle_nested_transition(struct vcpu_svm *svm)
+{
+ /*
+ * Do NOT defer reprogramming the counters by default. Instructions
+ * causing a state change are counted based on the _new_ CPU state
+ * (e.g. a successful VMRUN is counted in guest mode). Hence, the
+ * counters should be reprogrammed with the new state _before_ the
+ * instruction is potentially counted upon emulation completion.
+ */
+ __svm_pmu_handle_nested_transition(svm, false);
+}
+
extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 4690a4d23709d..30dcabc899a29 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -411,7 +411,8 @@ static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
}
static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
- struct x86_exception *fault)
+ struct x86_exception *fault,
+ bool from_hardware)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -444,13 +445,29 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
exit_qualification = 0;
} else {
u64 mask = EPT_VIOLATION_GVA_IS_VALID |
- EPT_VIOLATION_GVA_TRANSLATED;
+ EPT_VIOLATION_GVA_TRANSLATED;
+
if (vmx->nested.msrs.ept_caps & VMX_EPT_ADVANCED_VMEXIT_INFO_BIT)
mask |= EPT_VIOLATION_GVA_USER |
- EPT_VIOLATION_GVA_WRITABLE |
- EPT_VIOLATION_GVA_NX;
- exit_qualification = fault->exit_qualification;
- exit_qualification |= vmx_get_exit_qual(vcpu) & mask;
+ EPT_VIOLATION_GVA_WRITABLE |
+ EPT_VIOLATION_GVA_NX;
+
+ exit_qualification = fault->exit_qualification & ~mask;
+
+ /*
+ * Use the EXIT_QUALIFICATION from the VMCS if and only
+ * if the hardware VM-Exit from L2 was an EPT Violation.
+ * If the fault is synthesized, then EXIT_QUALIFICATION
+ * is stale and/or holds entirely different data. And
+ * conversely, KVM _must_ rely on EXIT_QUALIFICATION if
+ * the fault came from hardware, because KVM only sees
+ * and walks the faulting GPA.
+ */
+ if (from_hardware)
+ exit_qualification |= vmx_get_exit_qual(vcpu) & mask;
+ else
+ exit_qualification |= fault->exit_qualification & mask;
+
vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
}
@@ -6535,6 +6552,8 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
nested_evmcs_l2_tlb_flush_enabled(vcpu) &&
kvm_hv_is_tlb_flush_hcall(vcpu);
#endif
+ case EXIT_REASON_CPUID:
+ return !kvm_is_cpuid_allowed(vcpu);
default:
break;
}
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 74e0b01185b80..e741e6473b5e5 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -309,13 +309,15 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
*/
local_irq_disable();
if (lbr_desc->event->state == PERF_EVENT_STATE_ACTIVE) {
+ int err = 0;
+
if (read)
rdmsrq(index, msr_info->data);
else
- wrmsrq(index, msr_info->data);
+ err = wrmsrq_safe(index, msr_info->data);
__set_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use);
local_irq_enable();
- return true;
+ return !err;
}
clear_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use);
local_irq_enable();
@@ -392,7 +394,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (pmu->pebs_enable != data) {
diff = pmu->pebs_enable ^ data;
pmu->pebs_enable = data;
- reprogram_counters(pmu, diff);
+ kvm_pmu_request_counters_reprogram(pmu, diff);
}
break;
case MSR_IA32_DS_AREA:
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index b187ef9a6ae46..8a88de071a2ff 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -343,7 +343,7 @@ static int tdx_reclaim_page(struct page *page)
r = __tdx_reclaim_page(page);
if (!r)
- tdx_quirk_reset_page(page);
+ tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE);
return r;
}
@@ -587,7 +587,7 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm)
if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
return;
- tdx_quirk_reset_page(kvm_tdx->td.tdr_page);
+ tdx_quirk_reset_paddr(page_to_phys(kvm_tdx->td.tdr_page), PAGE_SIZE);
__free_page(kvm_tdx->td.tdr_page);
kvm_tdx->td.tdr_page = NULL;
@@ -629,6 +629,12 @@ int tdx_vm_init(struct kvm *kvm)
kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;
/*
+ * PMU support is provided by the TDX-Module (if enabled for the VM).
+ * From KVM's perspective, the VM doesn't have a virtual PMU.
+ */
+ kvm->arch.has_protected_pmu = true;
+
+ /*
* Because guest TD is protected, VMM can't parse the instruction in TD.
* Instead, guest uses MMIO hypercall. For unmodified device driver,
* #VE needs to be injected for MMIO and #VE handler in TD converts MMIO
@@ -1172,12 +1178,22 @@ static void __tdx_map_gpa(struct vcpu_tdx *tdx);
static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
{
+ u64 hypercall_ret = READ_ONCE(vcpu->run->hypercall.ret);
struct vcpu_tdx *tdx = to_tdx(vcpu);
+ long rc;
- if (vcpu->run->hypercall.ret) {
- tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
- tdx->vp_enter_args.r11 = tdx->map_gpa_next;
- return 1;
+ switch (hypercall_ret) {
+ case 0:
+ break;
+ case EAGAIN:
+ rc = TDVMCALL_STATUS_RETRY;
+ goto propagate_error;
+ case EINVAL:
+ rc = TDVMCALL_STATUS_INVALID_OPERAND;
+ goto propagate_error;
+ default:
+ WARN_ON_ONCE(kvm_is_valid_map_gpa_range_ret(hypercall_ret));
+ return -EINVAL;
}
tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN;
@@ -1190,13 +1206,17 @@ static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
* TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt().
*/
if (kvm_vcpu_has_events(vcpu)) {
- tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
- tdx->vp_enter_args.r11 = tdx->map_gpa_next;
- return 1;
+ rc = TDVMCALL_STATUS_RETRY;
+ goto propagate_error;
}
__tdx_map_gpa(tdx);
return 0;
+
+propagate_error:
+ tdvmcall_set_return_code(vcpu, rc);
+ tdx->vp_enter_args.r11 = tdx->map_gpa_next;
+ return 1;
}
static void __tdx_map_gpa(struct vcpu_tdx *tdx)
@@ -1614,8 +1634,8 @@ static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level,
KVM_BUG_ON(!kvm_tdx->page_add_src, kvm))
return -EIO;
- err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
- kvm_tdx->page_add_src, &entry, &level_state);
+ err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn, kvm_tdx->page_add_src,
+ &entry, &level_state);
if (unlikely(tdx_operand_busy(err)))
return -EBUSY;
@@ -1628,14 +1648,12 @@ static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level,
static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
enum pg_level level, kvm_pfn_t pfn)
{
- int tdx_level = pg_level_to_tdx_sept_level(level);
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- struct page *page = pfn_to_page(pfn);
gpa_t gpa = gfn_to_gpa(gfn);
u64 entry, level_state;
u64 err;
- err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
+ err = tdh_mem_page_aug(&kvm_tdx->td, gpa, level, pfn, &entry, &level_state);
if (unlikely(tdx_operand_busy(err)))
return -EBUSY;
@@ -1645,18 +1663,52 @@ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
return 0;
}
-static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, u64 mirror_spte)
+static struct page *tdx_spte_to_sept_pt(struct kvm *kvm, gfn_t gfn,
+ u64 new_spte, enum pg_level level)
+{
+ struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
+
+ if (KVM_BUG_ON(!sp->external_spt, kvm) ||
+ KVM_BUG_ON(sp->role.level + 1 != level, kvm) ||
+ KVM_BUG_ON(sp->gfn != gfn, kvm))
+ return NULL;
+
+ return virt_to_page(sp->external_spt);
+}
+
+static int tdx_sept_map_nonleaf_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, u64 new_spte)
+{
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u64 err, entry, level_state;
+ struct page *sept_pt;
+
+ sept_pt = tdx_spte_to_sept_pt(kvm, gfn, new_spte, level);
+ if (!sept_pt)
+ return -EIO;
+
+ err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, level, sept_pt,
+ &entry, &level_state);
+ if (unlikely(tdx_operand_busy(err)))
+ return -EBUSY;
+
+ if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm))
+ return -EIO;
+
+ return 0;
+}
+
+static int tdx_sept_map_leaf_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+ u64 new_spte)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- kvm_pfn_t pfn = spte_to_pfn(mirror_spte);
+ kvm_pfn_t pfn = spte_to_pfn(new_spte);
/* TODO: handle large pages. */
if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
return -EIO;
- WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) ||
- (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
+ WARN_ON_ONCE((new_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
/*
* Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory()
@@ -1676,25 +1728,6 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
return tdx_mem_page_aug(kvm, gfn, level, pfn);
}
-static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, void *private_spt)
-{
- int tdx_level = pg_level_to_tdx_sept_level(level);
- gpa_t gpa = gfn_to_gpa(gfn);
- struct page *page = virt_to_page(private_spt);
- u64 err, entry, level_state;
-
- err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
- &level_state);
- if (unlikely(tdx_operand_busy(err)))
- return -EBUSY;
-
- if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm))
- return -EIO;
-
- return 0;
-}
-
/*
* Ensure shared and private EPTs to be flushed on all vCPUs.
* tdh_mem_track() is the only caller that increases TD epoch. An increase in
@@ -1741,35 +1774,11 @@ static void tdx_track(struct kvm *kvm)
kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}
-static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, void *private_spt)
-{
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-
- /*
- * free_external_spt() is only called after hkid is freed when TD is
- * tearing down.
- * KVM doesn't (yet) zap page table pages in mirror page table while
- * TD is active, though guest pages mapped in mirror page table could be
- * zapped during TD is active, e.g. for shared <-> private conversion
- * and slot move/deletion.
- */
- if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
- return -EIO;
-
- /*
- * The HKID assigned to this TD was already freed and cache was
- * already flushed. We don't have to flush again.
- */
- return tdx_reclaim_page(virt_to_page(private_spt));
-}
-
-static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, u64 mirror_spte)
+static int tdx_sept_remove_leaf_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, u64 old_spte)
{
- struct page *page = pfn_to_page(spte_to_pfn(mirror_spte));
- int tdx_level = pg_level_to_tdx_sept_level(level);
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ kvm_pfn_t pfn = spte_to_pfn(old_spte);
gpa_t gpa = gfn_to_gpa(gfn);
u64 err, entry, level_state;
@@ -1781,16 +1790,16 @@ static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
* there can't be anything populated in the private EPT.
*/
if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
- return;
+ return -EIO;
/* TODO: handle large pages. */
if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
- return;
+ return -EIO;
err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa,
- tdx_level, &entry, &level_state);
+ level, &entry, &level_state);
if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm))
- return;
+ return -EIO;
/*
* TDX requires TLB tracking before dropping private page. Do
@@ -1804,15 +1813,82 @@ static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
* Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
*/
err = tdh_do_no_vcpus(tdh_mem_page_remove, kvm, &kvm_tdx->td, gpa,
- tdx_level, &entry, &level_state);
+ level, &entry, &level_state);
if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm))
- return;
+ return -EIO;
- err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
+ err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, pfn);
if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
- return;
+ return -EIO;
- tdx_quirk_reset_page(page);
+ tdx_quirk_reset_paddr(PFN_PHYS(pfn), PAGE_SIZE);
+ return 0;
+}
+
+/*
+ * Handle changes for
+ * (1) leaf SPTEs from non-present to present
+ * (2) non-leaf SPTEs from non-present to present
+ * (3) leaf SPTEs from present to non-present
+ *
+ * - (1) and (2) must be under shared mmu_lock. If (1) and (2) are under
+ * exclusive mmu_lock (currently impossible), contention errors may lead to
+ * KVM_BUG_ON() in handle_changed_spte(), e.g., due to tdx_mem_page_aug(),
+ * tdx_mem_page_add(), or tdh_mem_sept_add() contending with tdh_vp_enter()
+ * due to zero-step mitigation or contending with TDCALLs.
+ * - (3) must be under write mmu_lock. If (3) is under shared mmu_lock
+ * (currently impossible), warnings will be generated due to
+ * lockdep_assert_held_write() or TDX_BUG_ON() caused by concurrent BLOCK,
+ * TRACK, REMOVE.
+ * - Promotion/demotion is not yet supported.
+ */
+static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
+ u64 new_spte, enum pg_level level)
+{
+ lockdep_assert_held(&kvm->mmu_lock);
+
+ if (is_shadow_present_pte(old_spte))
+ return tdx_sept_remove_leaf_spte(kvm, gfn, level, old_spte);
+
+ if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
+ return -EIO;
+
+ if (!is_last_spte(new_spte, level))
+ return tdx_sept_map_nonleaf_spte(kvm, gfn, level, new_spte);
+
+ return tdx_sept_map_leaf_spte(kvm, gfn, level, new_spte);
+}
+
+/*
+ * Handle changes for non-leaf SPTEs from present to non-present.
+ * Must be under exclusive mmu_lock and cannot fail.
+ */
+static void tdx_sept_free_private_spt(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ /*
+ * KVM doesn't (yet) zap page table pages in mirror page table while
+ * TD is active, though guest pages mapped in mirror page table could be
+ * zapped during TD is active, e.g. for shared <-> private conversion
+ * and slot move/deletion.
+ *
+ * In other words, KVM should only free mirror page tables after the
+ * TD's hkid is freed, when the TD is being torn down.
+ *
+ * If the S-EPT PTE can't be removed for any reason, intentionally leak
+ * the page to prevent the kernel from accessing the encrypted page.
+ */
+ if (KVM_BUG_ON(is_hkid_assigned(to_kvm_tdx(kvm)), kvm) ||
+ tdx_reclaim_page(virt_to_page(sp->external_spt)))
+ goto out;
+
+ /*
+ * Immediately free the S-EPT page because RCU-time free is unnecessary
+ * after TDH.PHYMEM.PAGE.RECLAIM ensures there are no outstanding
+ * readers.
+ */
+ free_page((unsigned long)sp->external_spt);
+out:
+ sp->external_spt = NULL;
}
void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
@@ -2106,23 +2182,29 @@ bool tdx_has_emulated_msr(u32 index)
case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
/* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
case MSR_KVM_POLL_CONTROL:
+ /*
+ * Except for x2APIC registers that are virtualized by the CPU, which
+ * KVM can't emulate as KVM doesn't have access to the virtual APIC
+ * page, KVM emulates the same set of x2APIC registers for TDX versus
+ * non-TDX guests.
+ */
+ case X2APIC_MSR(APIC_ID):
+ case X2APIC_MSR(APIC_LVR):
+ case X2APIC_MSR(APIC_LDR):
+ case X2APIC_MSR(APIC_SPIV):
+ case X2APIC_MSR(APIC_ESR):
+ case X2APIC_MSR(APIC_LVTCMCI):
+ case X2APIC_MSR(APIC_ICR):
+ case X2APIC_MSR(APIC_LVTT):
+ case X2APIC_MSR(APIC_LVTTHMR):
+ case X2APIC_MSR(APIC_LVTPC):
+ case X2APIC_MSR(APIC_LVT0):
+ case X2APIC_MSR(APIC_LVT1):
+ case X2APIC_MSR(APIC_LVTERR):
+ case X2APIC_MSR(APIC_TMICT):
+ case X2APIC_MSR(APIC_TMCCT):
+ case X2APIC_MSR(APIC_TDCR):
return true;
- case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
- /*
- * x2APIC registers that are virtualized by the CPU can't be
- * emulated, KVM doesn't have access to the virtual APIC page.
- */
- switch (index) {
- case X2APIC_MSR(APIC_TASKPRI):
- case X2APIC_MSR(APIC_PROCPRI):
- case X2APIC_MSR(APIC_EOI):
- case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
- case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
- case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
- return false;
- default:
- return true;
- }
default:
return false;
}
@@ -2377,20 +2459,20 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
ret = -ENOMEM;
- tdr_page = alloc_page(GFP_KERNEL);
+ tdr_page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!tdr_page)
goto free_hkid;
kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
/* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */
kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
- tdcs_pages = kzalloc_objs(*kvm_tdx->td.tdcs_pages,
- kvm_tdx->td.tdcs_nr_pages);
+ tdcs_pages = kzalloc_objs(*kvm_tdx->td.tdcs_pages, kvm_tdx->td.tdcs_nr_pages,
+ GFP_KERNEL_ACCOUNT);
if (!tdcs_pages)
goto free_tdr;
for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
- tdcs_pages[i] = alloc_page(GFP_KERNEL);
+ tdcs_pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
if (!tdcs_pages[i])
goto free_tdcs;
}
@@ -2775,7 +2857,7 @@ void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
/*
- * TDX has called tdx_track() in tdx_sept_remove_private_spte() to
+ * TDX has called tdx_track() in tdx_sept_remove_leaf_spte() to
* ensure that private EPT will be flushed on the next TD enter. No need
* to call tdx_track() here again even when this callback is a result of
* zapping private EPT.
@@ -2865,7 +2947,7 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
int ret, i;
u64 err;
- page = alloc_page(GFP_KERNEL);
+ page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!page)
return -ENOMEM;
tdx->vp.tdvpr_page = page;
@@ -2878,14 +2960,14 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
tdx->vp.tdvpr_pa = page_to_phys(tdx->vp.tdvpr_page);
tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!tdx->vp.tdcx_pages) {
ret = -ENOMEM;
goto free_tdvpr;
}
for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
- page = alloc_page(GFP_KERNEL);
+ page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!page) {
ret = -ENOMEM;
goto free_tdcx;
@@ -3175,7 +3257,7 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
};
gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
u64_to_user_ptr(region.source_addr),
- 1, tdx_gmem_post_populate, &arg);
+ 1, false, tdx_gmem_post_populate, &arg);
if (gmem_ret < 0) {
ret = gmem_ret;
break;
@@ -3405,10 +3487,8 @@ int __init tdx_hardware_setup(void)
vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));
- vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
- vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
return 0;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e5cfe4d12c479..5f171d6943dd4 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1913,6 +1913,24 @@ void vmx_inject_exception(struct kvm_vcpu *vcpu)
u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /*
+ * When injecting a #DB, single-stepping is enabled in RFLAGS, and STI
+ * or MOV-SS blocking is active, set vmcs.PENDING_DBG_EXCEPTIONS.BS to
+ * prevent a false positive from VM-Entry consistency check. VM-Entry
+ * asserts that a single-step #DB _must_ be pending in this scenario,
+ * as the previous instruction cannot have toggled RFLAGS.TF 0=>1
+ * (because STI and POP/MOV don't modify RFLAGS), therefore the one
+ * instruction delay when activating single-step breakpoints must have
+ * already expired. However, the CPU isn't smart enough to peek at
+ * vmcs.VM_ENTRY_INTR_INFO_FIELD and so doesn't realize that yes, there
+ * is indeed a #DB pending/imminent.
+ */
+ if (ex->vector == DB_VECTOR &&
+ (vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
+ vmx_get_interrupt_shadow(vcpu))
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
+
kvm_deliver_exception_payload(vcpu, ex);
if (ex->has_error_code) {
@@ -4157,7 +4175,7 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
* mode, only the current timer count needs on-demand emulation by KVM.
*/
if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
- msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic);
+ msr_bitmap[read_idx] = ~kvm_x2apic_disable_read_intercept_reg_mask(vcpu);
else
msr_bitmap[read_idx] = ~0ull;
msr_bitmap[write_idx] = ~0ull;
@@ -4170,7 +4188,6 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
!(mode & MSR_BITMAP_MODE_X2APIC));
if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
- vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
if (enable_ipiv)
@@ -5496,26 +5513,9 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
* avoid single-step #DB and MTF updates, as ICEBP is
* higher priority. Note, skipping ICEBP still clears
* STI and MOVSS blocking.
- *
- * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
- * if single-step is enabled in RFLAGS and STI or MOVSS
- * blocking is active, as the CPU doesn't set the bit
- * on VM-Exit due to #DB interception. VM-Entry has a
- * consistency check that a single-step #DB is pending
- * in this scenario as the previous instruction cannot
- * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
- * don't modify RFLAGS), therefore the one instruction
- * delay when activating single-step breakpoints must
- * have already expired. Note, the CPU sets/clears BS
- * as appropriate for all other VM-Exits types.
*/
if (is_icebp(intr_info))
WARN_ON(!skip_emulated_instruction(vcpu));
- else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
- (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
- vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
- vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
return 1;
@@ -6716,6 +6716,9 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
if (enable_pml && !is_guest_mode(vcpu))
vmx_flush_pml_buffer(vcpu);
+ if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE))
+ return 0;
+
/*
* KVM should never reach this point with a pending nested VM-Enter.
* More specifically, short-circuiting VM-Entry to emulate L2 due to
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0ac9d94e51776..ae8e911ef4a1d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -133,7 +133,6 @@ static void process_nmi(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
static void store_regs(struct kvm_vcpu *vcpu);
static int sync_regs(struct kvm_vcpu *vcpu);
-static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu);
static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
@@ -152,6 +151,7 @@ struct kvm_x86_ops kvm_x86_ops __read_mostly;
#include <asm/kvm-x86-ops.h>
EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
+EXPORT_STATIC_CALL_GPL(kvm_x86_get_cpl);
static bool __read_mostly ignore_msrs = 0;
module_param(ignore_msrs, bool, 0644);
@@ -182,15 +182,6 @@ module_param(force_emulation_prefix, int, 0644);
int __read_mostly pi_inject_timer = -1;
module_param(pi_inject_timer, bint, 0644);
-/* Enable/disable PMU virtualization */
-bool __read_mostly enable_pmu = true;
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_pmu);
-module_param(enable_pmu, bool, 0444);
-
-/* Enable/disabled mediated PMU virtualization. */
-bool __read_mostly enable_mediated_pmu;
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_mediated_pmu);
-
bool __read_mostly eager_page_split = true;
module_param(eager_page_split, bool, 0644);
@@ -970,7 +961,8 @@ static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
EMULTYPE_COMPLETE_USER_EXIT);
}
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault,
+ bool from_hardware)
{
++vcpu->stat.pf_guest;
@@ -987,8 +979,9 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
fault->address);
}
-void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
- struct x86_exception *fault)
+void __kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault,
+ bool from_hardware)
{
struct kvm_mmu *fault_mmu;
WARN_ON_ONCE(fault->vector != PF_VECTOR);
@@ -1005,9 +998,9 @@ void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address,
KVM_MMU_ROOT_CURRENT);
- fault_mmu->inject_page_fault(vcpu, fault);
+ fault_mmu->inject_page_fault(vcpu, fault, from_hardware);
}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_inject_emulated_page_fault);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_inject_emulated_page_fault);
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
@@ -1021,18 +1014,6 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_queue_exception_e);
-/*
- * Checks if cpl <= required_cpl; if true, return true. Otherwise queue
- * a #GP and return false.
- */
-bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
-{
- if (kvm_x86_call(get_cpl)(vcpu) <= required_cpl)
- return true;
- kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
- return false;
-}
-
bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
{
if ((dr != 4 && dr != 5) || !kvm_is_cr4_bit_set(vcpu, X86_CR4_DE))
@@ -1043,11 +1024,16 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_require_dr);
-static bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
+static bool __kvm_pv_async_pf_enabled(u64 data)
{
u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
- return (vcpu->arch.apf.msr_en_val & mask) == mask;
+ return (data & mask) == mask;
+}
+
+static bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
+{
+ return __kvm_pv_async_pf_enabled(vcpu->arch.apf.msr_en_val);
}
static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
@@ -1601,6 +1587,14 @@ unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_dr);
+static unsigned long kvm_get_effective_dr7(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ return vcpu->arch.guest_debug_dr7;
+
+ return vcpu->arch.dr7;
+}
+
int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
{
u32 pmc = kvm_rcx_read(vcpu);
@@ -3648,23 +3642,19 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
if (!lapic_in_kernel(vcpu))
return data ? 1 : 0;
+ if (__kvm_pv_async_pf_enabled(data) &&
+ kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
+ sizeof(u64)))
+ return 1;
+
vcpu->arch.apf.msr_en_val = data;
- if (!kvm_pv_async_pf_enabled(vcpu)) {
+ if (__kvm_pv_async_pf_enabled(data)) {
+ kvm_async_pf_wakeup_all(vcpu);
+ } else {
kvm_clear_async_pf_completion_queue(vcpu);
kvm_async_pf_hash_reset(vcpu);
- return 0;
}
-
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
- sizeof(u64)))
- return 1;
-
- vcpu->arch.apf.send_always = (data & KVM_ASYNC_PF_SEND_ALWAYS);
- vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
-
- kvm_async_pf_wakeup_all(vcpu);
-
return 0;
}
@@ -4003,22 +3993,28 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_EFER:
return set_efer(vcpu, msr_info);
- case MSR_K7_HWCR:
- data &= ~(u64)0x40; /* ignore flush filter disable */
- data &= ~(u64)0x100; /* ignore ignne emulation enable */
- data &= ~(u64)0x8; /* ignore TLB cache disable */
-
+ case MSR_K7_HWCR: {
/*
* Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2
* through at least v6.6 whine if TscFreqSel is clear,
* depending on F/M/S.
*/
- if (data & ~(BIT_ULL(18) | BIT_ULL(24))) {
+ u64 valid = BIT_ULL(18) | BIT_ULL(24);
+
+ data &= ~(u64)0x40; /* ignore flush filter disable */
+ data &= ~(u64)0x100; /* ignore ignne emulation enable */
+ data &= ~(u64)0x8; /* ignore TLB cache disable */
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_GP_ON_USER_CPUID))
+ valid |= MSR_K7_HWCR_CPUID_USER_DIS;
+
+ if (data & ~valid) {
kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
vcpu->arch.msr_hwcr = data;
break;
+ }
case MSR_FAM10H_MMIO_CONF_BASE:
if (data != 0) {
kvm_pr_unimpl_wrmsr(vcpu, msr, data);
@@ -4265,7 +4261,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_MISC_FEATURES_ENABLES:
if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
(data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
- !supports_cpuid_fault(vcpu)))
+ !(vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT)))
return 1;
vcpu->arch.msr_misc_features_enables = data;
break;
@@ -5228,8 +5224,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
* On a host with synchronized TSC, there is no need to update
* kvmclock on vcpu->cpu migration
*/
- if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
- kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
+ if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) {
+ if (__ratelimit(&vcpu->kvm->arch.kvmclock_update_rs))
+ kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
+ else
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+
if (vcpu->cpu != cpu)
kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
vcpu->cpu = cpu;
@@ -6910,6 +6911,10 @@ disable_exits_unlock:
if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK))
break;
+ if (kvm->arch.has_protected_pmu &&
+ cap->args[0] != KVM_PMU_CAP_DISABLE)
+ break;
+
mutex_lock(&kvm->lock);
if (!kvm->created_vcpus && !kvm->arch.created_mediated_pmu) {
kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE);
@@ -8548,6 +8553,11 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
}
+static unsigned long emulator_get_effective_dr7(struct x86_emulate_ctxt *ctxt)
+{
+ return kvm_get_effective_dr7(emul_to_vcpu(ctxt));
+}
+
static unsigned long emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr)
{
return kvm_get_dr(emul_to_vcpu(ctxt), dr);
@@ -8805,6 +8815,11 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
&ctxt->exception);
}
+static bool emulator_is_cpuid_allowed(struct x86_emulate_ctxt *ctxt)
+{
+ return kvm_is_cpuid_allowed(emul_to_vcpu(ctxt));
+}
+
static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
bool exact_only)
@@ -8930,6 +8945,7 @@ static const struct x86_emulate_ops emulate_ops = {
.get_cr = emulator_get_cr,
.set_cr = emulator_set_cr,
.cpl = emulator_get_cpl,
+ .get_effective_dr7 = emulator_get_effective_dr7,
.get_dr = emulator_get_dr,
.set_dr = emulator_set_dr,
.set_msr_with_filter = emulator_set_msr_with_filter,
@@ -8941,6 +8957,7 @@ static const struct x86_emulate_ops emulate_ops = {
.wbinvd = emulator_wbinvd,
.fix_hypercall = emulator_fix_hypercall,
.intercept = emulator_intercept,
+ .is_cpuid_allowed = emulator_is_cpuid_allowed,
.get_cpuid = emulator_get_cpuid,
.guest_has_movbe = emulator_guest_has_movbe,
.guest_has_fxsr = emulator_guest_has_fxsr,
@@ -8976,17 +8993,36 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
}
}
-static void inject_emulated_exception(struct kvm_vcpu *vcpu)
+static int kvm_inject_emulated_db(struct kvm_vcpu *vcpu, unsigned long dr6)
{
- struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ struct kvm_run *kvm_run = vcpu->run;
- if (ctxt->exception.vector == PF_VECTOR)
- kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
- else if (ctxt->exception.error_code_valid)
- kvm_queue_exception_e(vcpu, ctxt->exception.vector,
- ctxt->exception.error_code);
+ if (vcpu->guest_debug & (KVM_GUESTDBG_USE_HW_BP | KVM_GUESTDBG_SINGLESTEP)) {
+ kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
+ kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ return 0;
+ }
+
+ kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
+ return 1;
+}
+
+static int inject_emulated_exception(struct kvm_vcpu *vcpu)
+{
+ struct x86_exception *ex = &vcpu->arch.emulate_ctxt->exception;
+
+ if (ex->vector == DB_VECTOR)
+ return kvm_inject_emulated_db(vcpu, ex->dr6);
+
+ if (ex->vector == PF_VECTOR)
+ kvm_inject_emulated_page_fault(vcpu, ex);
+ else if (ex->error_code_valid)
+ kvm_queue_exception_e(vcpu, ex->vector, ex->error_code);
else
- kvm_queue_exception(vcpu, ctxt->exception.vector);
+ kvm_queue_exception(vcpu, ex->vector);
+ return 1;
}
static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
@@ -9026,6 +9062,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
ctxt->interruptibility = 0;
ctxt->have_exception = false;
ctxt->exception.vector = -1;
+ ctxt->exception.payload = 0;
ctxt->perm_ok = false;
init_decode_cache(ctxt);
@@ -9243,21 +9280,6 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
return dr6;
}
-static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
-{
- struct kvm_run *kvm_run = vcpu->run;
-
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
- kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW;
- kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
- kvm_run->debug.arch.exception = DB_VECTOR;
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
- return 0;
- }
- kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
- return 1;
-}
-
int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
unsigned long rflags = kvm_x86_call(get_rflags)(vcpu);
@@ -9278,13 +9300,16 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
* that sets the TF flag".
*/
if (unlikely(rflags & X86_EFLAGS_TF))
- r = kvm_vcpu_do_singlestep(vcpu);
+ r = kvm_inject_emulated_db(vcpu, DR6_BS);
return r;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_skip_emulated_instruction);
static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)
{
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ return false;
+
if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF)
return true;
@@ -9301,6 +9326,8 @@ static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)
static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu,
int emulation_type, int *r)
{
+ unsigned long dr7 = kvm_get_effective_dr7(vcpu);
+
WARN_ON_ONCE(emulation_type & EMULTYPE_NO_DECODE);
/*
@@ -9321,34 +9348,14 @@ static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu,
EMULTYPE_TRAP_UD | EMULTYPE_VMWARE_GP | EMULTYPE_PF))
return false;
- if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
- (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
- struct kvm_run *kvm_run = vcpu->run;
- unsigned long eip = kvm_get_linear_rip(vcpu);
- u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
- vcpu->arch.guest_debug_dr7,
- vcpu->arch.eff_db);
-
- if (dr6 != 0) {
- kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
- kvm_run->debug.arch.pc = eip;
- kvm_run->debug.arch.exception = DB_VECTOR;
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
- *r = 0;
- return true;
- }
- }
-
- if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
+ if (unlikely(dr7 & DR7_BP_EN_MASK) &&
!kvm_is_code_breakpoint_inhibited(vcpu)) {
unsigned long eip = kvm_get_linear_rip(vcpu);
- u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
- vcpu->arch.dr7,
- vcpu->arch.db);
+ u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0, dr7,
+ vcpu->arch.eff_db);
- if (dr6 != 0) {
- kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
- *r = 1;
+ if (dr6) {
+ *r = kvm_inject_emulated_db(vcpu, dr6);
return true;
}
}
@@ -9494,8 +9501,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
*/
WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP);
- inject_emulated_exception(vcpu);
- return 1;
+ return inject_emulated_exception(vcpu);
}
return handle_emulation_failure(vcpu, emulation_type);
}
@@ -9590,8 +9596,7 @@ restart:
if (ctxt->have_exception) {
WARN_ON_ONCE(vcpu->mmio_needed && !vcpu->mmio_is_write);
vcpu->mmio_needed = false;
- r = 1;
- inject_emulated_exception(vcpu);
+ r = inject_emulated_exception(vcpu);
} else if (vcpu->arch.pio.count) {
if (!vcpu->arch.pio.in) {
/* FIXME: return into emulator if single-stepping. */
@@ -9634,7 +9639,7 @@ writeback:
kvm_pmu_branch_retired(vcpu);
kvm_rip_write(vcpu, ctxt->eip);
if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
- r = kvm_vcpu_do_singlestep(vcpu);
+ r = kvm_inject_emulated_db(vcpu, DR6_BS);
kvm_x86_call(update_emulated_instruction)(vcpu);
__kvm_set_rflags(vcpu, ctxt->eflags);
}
@@ -11588,9 +11593,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (vcpu->arch.apic_attention)
kvm_lapic_sync_from_vapic(vcpu);
- if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE))
- return 0;
-
r = kvm_x86_call(handle_exit)(vcpu, exit_fastpath);
return r;
@@ -13352,6 +13354,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
+ ratelimit_state_init(&kvm->arch.kvmclock_update_rs, HZ, 10);
+ ratelimit_set_flags(&kvm->arch.kvmclock_update_rs, RATELIMIT_MSG_ON_RELEASE);
kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
@@ -13361,7 +13365,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
kvm->arch.apic_bus_cycle_ns = APIC_BUS_CYCLE_NS_DEFAULT;
kvm->arch.guest_can_read_msr_platform_info = true;
- kvm->arch.enable_pmu = enable_pmu;
+ kvm->arch.enable_pmu = enable_pmu && !kvm->arch.has_protected_pmu;
#if IS_ENABLED(CONFIG_HYPERV)
spin_lock_init(&kvm->arch.hv_root_tdp_lock);
@@ -14006,7 +14010,7 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
if (!kvm_pv_async_pf_enabled(vcpu))
return false;
- if (!vcpu->arch.apf.send_always &&
+ if (!(vcpu->arch.apf.msr_en_val & KVM_ASYNC_PF_SEND_ALWAYS) &&
(vcpu->arch.guest_state_protected || !kvm_x86_call(get_cpl)(vcpu)))
return false;
@@ -14015,7 +14019,7 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
* L1 needs to opt into the special #PF vmexits that are
* used to deliver async page faults.
*/
- return vcpu->arch.apf.delivery_as_pf_vmexit;
+ return vcpu->arch.apf.msr_en_val & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
} else {
/*
* Play it safe in case the guest temporarily disables paging.
@@ -14059,7 +14063,7 @@ bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
fault.nested_page_fault = false;
fault.address = work->arch.token;
fault.async_page_fault = true;
- kvm_inject_page_fault(vcpu, &fault);
+ kvm_inject_page_fault(vcpu, &fault, false);
return true;
} else {
/*
@@ -14230,7 +14234,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c
fault.address = gva;
fault.async_page_fault = false;
}
- vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
+ vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault, true);
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_fixup_and_inject_pf_error);
@@ -14309,7 +14313,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
* the RAP (Return Address Predicator).
*/
if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
- kvm_register_is_dirty(vcpu, VCPU_REG_ERAPS);
+ kvm_register_mark_dirty(vcpu, VCPU_REG_ERAPS);
kvm_invalidate_pcid(vcpu, operand.pcid);
return kvm_skip_emulated_instruction(vcpu);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 38a905fa86de2..a49424f9c968e 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -490,9 +490,6 @@ fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu);
extern struct kvm_caps kvm_caps;
extern struct kvm_host_values kvm_host;
-extern bool enable_pmu;
-extern bool enable_mediated_pmu;
-
void kvm_setup_xss_caps(void);
/*
@@ -754,6 +751,12 @@ static inline void kvm_prepare_emulated_mmio_exit(struct kvm_vcpu *vcpu,
frag->data, vcpu->mmio_is_write);
}
+static inline bool kvm_is_valid_map_gpa_range_ret(u64 hypercall_ret)
+{
+ return !hypercall_ret || hypercall_ret == EINVAL ||
+ hypercall_ret == EAGAIN;
+}
+
static inline bool user_exit_on_hypercall(struct kvm *kvm, unsigned long hc_nr)
{
return kvm->arch.hypercall_exit_enabled & BIT(hc_nr);
diff --git a/arch/x86/virt/hw.c b/arch/x86/virt/hw.c
index f647557d38ac5..7e9091c640be0 100644
--- a/arch/x86/virt/hw.c
+++ b/arch/x86/virt/hw.c
@@ -49,7 +49,20 @@ static void x86_virt_invoke_kvm_emergency_callback(void)
{
cpu_emergency_virt_cb *kvm_callback;
- kvm_callback = rcu_dereference(kvm_emergency_callback);
+ /*
+ * RCU may not be watching the crashing CPU here, so rcu_dereference()
+ * triggers a suspicious-RCU-usage splat. In principle, a concurrent
+ * KVM module unload could race with this read; see commit 2baa33a8ddd6
+ * ("KVM: x86: Leave user-return notifier registered on reboot/shutdown")
+ * which notes that nothing prevents module unload during panic/reboot.
+ *
+ * However, taking a lock here would be riskier than the current race:
+ * the system is going down via NMI shootdown, and any lock could be
+ * held by an already-stopped CPU. Use rcu_dereference_raw() to silence
+ * the lockdep splat and accept the comically small remaining race;
+ * panic context inherently cannot guarantee complete correctness.
+ */
+ kvm_callback = rcu_dereference_raw(kvm_emergency_callback);
if (kvm_callback)
kvm_callback();
}
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index b15269b5941dc..42df8ea464c47 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -30,7 +30,6 @@
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
#include <linux/idr.h>
-#include <linux/kvm_types.h>
#include <asm/page.h>
#include <asm/special_insns.h>
#include <asm/msr-index.h>
@@ -709,7 +708,7 @@ err:
* to normal kernel memory. Systems with the X86_BUG_TDX_PW_MCE erratum need to
* do the conversion explicitly via MOVDIR64B.
*/
-static void tdx_quirk_reset_paddr(unsigned long base, unsigned long size)
+void tdx_quirk_reset_paddr(unsigned long base, unsigned long size)
{
const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
unsigned long phys, end;
@@ -728,12 +727,7 @@ static void tdx_quirk_reset_paddr(unsigned long base, unsigned long size)
*/
mb();
}
-
-void tdx_quirk_reset_page(struct page *page)
-{
- tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE);
-}
-EXPORT_SYMBOL_FOR_KVM(tdx_quirk_reset_page);
+EXPORT_SYMBOL_FOR_KVM(tdx_quirk_reset_paddr);
static __init void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr)
@@ -1634,6 +1628,17 @@ static void tdx_clflush_page(struct page *page)
clflush_cache_range(page_to_virt(page), PAGE_SIZE);
}
+static void tdx_clflush_pfn(kvm_pfn_t pfn)
+{
+ clflush_cache_range(__va(PFN_PHYS(pfn)), PAGE_SIZE);
+}
+
+static int pg_level_to_tdx_sept_level(enum pg_level level)
+{
+ WARN_ON_ONCE(level == PG_LEVEL_NONE);
+ return level - 1;
+}
+
noinstr u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args)
{
args->rcx = td->tdvpr_pa;
@@ -1654,17 +1659,18 @@ u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page)
}
EXPORT_SYMBOL_FOR_KVM(tdh_mng_addcx);
-u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2)
+u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, kvm_pfn_t pfn, struct page *source,
+ u64 *ext_err1, u64 *ext_err2)
{
struct tdx_module_args args = {
.rcx = gpa,
.rdx = tdx_tdr_pa(td),
- .r8 = page_to_phys(page),
+ .r8 = PFN_PHYS(pfn),
.r9 = page_to_phys(source),
};
u64 ret;
- tdx_clflush_page(page);
+ tdx_clflush_pfn(pfn);
ret = seamcall_ret(TDH_MEM_PAGE_ADD, &args);
*ext_err1 = args.rcx;
@@ -1674,10 +1680,11 @@ u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page
}
EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_add);
-u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
+u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, enum pg_level level,
+ struct page *page, u64 *ext_err1, u64 *ext_err2)
{
struct tdx_module_args args = {
- .rcx = gpa | level,
+ .rcx = gpa | pg_level_to_tdx_sept_level(level),
.rdx = tdx_tdr_pa(td),
.r8 = page_to_phys(page),
};
@@ -1705,16 +1712,17 @@ u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
}
EXPORT_SYMBOL_FOR_KVM(tdh_vp_addcx);
-u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
+u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, enum pg_level level,
+ kvm_pfn_t pfn, u64 *ext_err1, u64 *ext_err2)
{
struct tdx_module_args args = {
- .rcx = gpa | level,
+ .rcx = gpa | pg_level_to_tdx_sept_level(level),
.rdx = tdx_tdr_pa(td),
- .r8 = page_to_phys(page),
+ .r8 = PFN_PHYS(pfn),
};
u64 ret;
- tdx_clflush_page(page);
+ tdx_clflush_pfn(pfn);
ret = seamcall_ret(TDH_MEM_PAGE_AUG, &args);
*ext_err1 = args.rcx;
@@ -1724,10 +1732,11 @@ u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u
}
EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_aug);
-u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2)
+u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, enum pg_level level,
+ u64 *ext_err1, u64 *ext_err2)
{
struct tdx_module_args args = {
- .rcx = gpa | level,
+ .rcx = gpa | pg_level_to_tdx_sept_level(level),
.rdx = tdx_tdr_pa(td),
};
u64 ret;
@@ -1940,10 +1949,11 @@ u64 tdh_mem_track(struct tdx_td *td)
}
EXPORT_SYMBOL_FOR_KVM(tdh_mem_track);
-u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2)
+u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, enum pg_level level,
+ u64 *ext_err1, u64 *ext_err2)
{
struct tdx_module_args args = {
- .rcx = gpa | level,
+ .rcx = gpa | pg_level_to_tdx_sept_level(level),
.rdx = tdx_tdr_pa(td),
};
u64 ret;
@@ -1967,21 +1977,27 @@ u64 tdh_phymem_cache_wb(bool resume)
}
EXPORT_SYMBOL_FOR_KVM(tdh_phymem_cache_wb);
+static inline u64 mk_keyed_paddr(u16 hkid, kvm_pfn_t pfn)
+{
+ /* KeyID bits are just above the physical address bits. */
+ return PFN_PHYS(pfn) | ((u64)hkid << boot_cpu_data.x86_phys_bits);
+}
+
u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td)
{
struct tdx_module_args args = {};
- args.rcx = mk_keyed_paddr(tdx_global_keyid, td->tdr_page);
+ args.rcx = mk_keyed_paddr(tdx_global_keyid, page_to_pfn(td->tdr_page));
return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
}
EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_tdr);
-u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
+u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, kvm_pfn_t pfn)
{
struct tdx_module_args args = {};
- args.rcx = mk_keyed_paddr(hkid, page);
+ args.rcx = mk_keyed_paddr(hkid, pfn);
return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
}