diff options
| author | Mark Brown <broonie@kernel.org> | 2026-05-29 22:46:54 +0100 |
|---|---|---|
| committer | Mark Brown <broonie@kernel.org> | 2026-05-29 22:46:54 +0100 |
| commit | f624c17e475e17a1dbeef56ef624378f4e5a80b7 (patch) | |
| tree | aeae91c403e0713b4a67f8610040905e8c0786fd /arch | |
| parent | 29fa9b4922922bd33cac81162dbf669bfb55a52d (diff) | |
| parent | d1568b1332b6b3b36b222c2868fc102727c12a34 (diff) | |
| download | linux-next-history-f624c17e475e17a1dbeef56ef624378f4e5a80b7.tar.gz | |
Merge branch 'next' of https://github.com/kvm-x86/linux.git
# Conflicts:
# arch/x86/include/asm/tdx.h
Diffstat (limited to 'arch')
32 files changed, 1211 insertions, 861 deletions
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index e4fca997ec797..83dc5086138b3 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -96,10 +96,8 @@ KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr) KVM_X86_OP_OPTIONAL_RET0(get_mt_mask) KVM_X86_OP_OPTIONAL_RET0(tdp_has_smep) KVM_X86_OP(load_mmu_pgd) -KVM_X86_OP_OPTIONAL(link_external_spt) -KVM_X86_OP_OPTIONAL(set_external_spte) +KVM_X86_OP_OPTIONAL_RET0(set_external_spte) KVM_X86_OP_OPTIONAL(free_external_spt) -KVM_X86_OP_OPTIONAL(remove_external_spte) KVM_X86_OP(has_wbinvd_exit) KVM_X86_OP(get_l2_tsc_offset) KVM_X86_OP(get_l2_tsc_multiplier) diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index d5452b3433b7d..4a223c2793e3f 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #if !defined(KVM_X86_PMU_OP) || \ - !defined(KVM_X86_PMU_OP_OPTIONAL) + !defined(KVM_X86_PMU_OP_OPTIONAL) || \ + !defined(KVM_X86_PMU_OP_OPTIONAL_RET0) #error Missing one or more KVM_X86_PMU_OP #defines #else @@ -23,6 +24,7 @@ KVM_X86_PMU_OP(init) KVM_X86_PMU_OP_OPTIONAL(reset) KVM_X86_PMU_OP_OPTIONAL(deliver_pmi) KVM_X86_PMU_OP_OPTIONAL(cleanup) +KVM_X86_PMU_OP_OPTIONAL_RET0(pmc_is_disabled_in_current_mode) KVM_X86_PMU_OP_OPTIONAL(write_global_ctrl) KVM_X86_PMU_OP(mediated_load) @@ -31,3 +33,4 @@ KVM_X86_PMU_OP(mediated_put) #undef KVM_X86_PMU_OP #undef KVM_X86_PMU_OP_OPTIONAL +#undef KVM_X86_PMU_OP_OPTIONAL_RET0 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8a53ca6195701..6ae7d539af909 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -284,6 +284,8 @@ enum x86_intercept_stage; #define PFERR_GUEST_RMP_MASK BIT_ULL(31) #define PFERR_GUEST_FINAL_MASK BIT_ULL(32) #define PFERR_GUEST_PAGE_MASK BIT_ULL(33) +#define PFERR_GUEST_FAULT_STAGE_MASK \ + (PFERR_GUEST_FINAL_MASK | PFERR_GUEST_PAGE_MASK) #define PFERR_GUEST_ENC_MASK BIT_ULL(34) #define PFERR_GUEST_SIZEM_MASK BIT_ULL(35) #define PFERR_GUEST_VMPL_MASK BIT_ULL(36) @@ -484,7 +486,8 @@ struct kvm_mmu { u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index); int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void (*inject_page_fault)(struct kvm_vcpu *vcpu, - struct x86_exception *fault); + struct x86_exception *fault, + bool from_hardware); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gpa_t gva_or_gpa, u64 access, struct x86_exception *exception); @@ -612,6 +615,8 @@ struct kvm_pmu { DECLARE_BITMAP(pmc_counting_instructions, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_counting_branches, X86_PMC_IDX_MAX); + DECLARE_BITMAP(pmc_has_mode_specific_enables, X86_PMC_IDX_MAX); + u64 ds_area; u64 pebs_enable; u64 pebs_enable_rsvd; @@ -1057,8 +1062,6 @@ struct kvm_vcpu_arch { u16 vec; u32 id; u32 host_apf_flags; - bool send_always; - bool delivery_as_pf_vmexit; bool pageready_pending; } apf; @@ -1441,6 +1444,7 @@ struct kvm_arch { bool has_private_mem; bool has_protected_state; bool has_protected_eoi; + bool has_protected_pmu; bool pre_fault_allowed; struct hlist_head *mmu_page_hash; struct list_head active_mmu_pages; @@ -1523,6 +1527,7 @@ struct kvm_arch { bool use_master_clock; u64 master_kernel_ns; u64 master_cycle_now; + struct ratelimit_state kvmclock_update_rs; #ifdef CONFIG_KVM_HYPERV struct kvm_hv hyperv; @@ -1911,20 +1916,13 @@ struct kvm_x86_ops { void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); - /* Update external mapping with page table link. */ - int (*link_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level, - void *external_spt); /* Update the external page table from spte getting set. */ - int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level, - u64 mirror_spte); + int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, u64 old_spte, + u64 new_spte, enum pg_level level); /* Update external page tables for page table about to be freed. */ - int (*free_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level, - void *external_spt); + void (*free_external_spt)(struct kvm *kvm, struct kvm_mmu_page *sp); - /* Update external page table from spte getting removed, and flush TLB. */ - void (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level, - u64 mirror_spte); bool (*has_wbinvd_exit)(void); @@ -2065,7 +2063,6 @@ extern bool __read_mostly enable_device_posted_irqs; extern struct kvm_x86_ops kvm_x86_ops; #define kvm_x86_call(func) static_call(kvm_x86_##func) -#define kvm_pmu_call(func) static_call(kvm_x86_pmu_##func) #define KVM_X86_OP(func) \ DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func)); @@ -2306,10 +2303,18 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr, bool has_error_code, u32 error_code); -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); -void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, - struct x86_exception *fault); -bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); +void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault, + bool from_hardware); +void __kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault, + bool from_hardware); + +static inline void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + __kvm_inject_emulated_page_fault(vcpu, fault, false); +} + bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr); static inline int __kvm_irq_line_state(unsigned long *irq_state, @@ -2550,7 +2555,8 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_SLOT_ZAP_ALL | \ KVM_X86_QUIRK_STUFF_FEATURE_MSRS | \ KVM_X86_QUIRK_IGNORE_GUEST_PAT | \ - KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM) + KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM | \ + KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT) #define KVM_X86_CONDITIONAL_QUIRKS \ (KVM_X86_QUIRK_CD_NW_CLEARED | \ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 86554de9a3f52..18c4be75e9271 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -899,6 +899,7 @@ #define MSR_K7_HWCR_IRPERF_EN_BIT 30 #define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT) #define MSR_K7_HWCR_CPUID_USER_DIS_BIT 35 +#define MSR_K7_HWCR_CPUID_USER_DIS BIT_ULL(MSR_K7_HWCR_CPUID_USER_DIS_BIT) #define MSR_K7_FID_VID_CTL 0xc0010041 #define MSR_K7_FID_VID_STATUS 0xc0010042 #define MSR_K7_HWCR_CPB_DIS_BIT 25 diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 752cb319d5eab..1eb13673e889f 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -60,6 +60,8 @@ #define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36) #define AMD64_EVENTSEL_GUESTONLY (1ULL << 40) #define AMD64_EVENTSEL_HOSTONLY (1ULL << 41) +#define AMD64_EVENTSEL_HOST_GUEST_MASK \ + (AMD64_EVENTSEL_HOSTONLY | AMD64_EVENTSEL_GUESTONLY) #define AMD64_EVENTSEL_INT_CORE_SEL_SHIFT 37 #define AMD64_EVENTSEL_INT_CORE_SEL_MASK \ diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index e5a9cf656c072..89e97d5761d89 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -6,6 +6,7 @@ #include <linux/init.h> #include <linux/bits.h> #include <linux/mmzone.h> +#include <linux/kvm_types.h> #include <asm/errno.h> #include <asm/ptrace.h> @@ -121,7 +122,7 @@ int tdx_guest_keyid_alloc(void); u32 tdx_get_nr_guest_keyids(void); void tdx_guest_keyid_free(unsigned int keyid); -void tdx_quirk_reset_page(struct page *page); +void tdx_quirk_reset_paddr(unsigned long base, unsigned long size); struct tdx_td { /* TD root structure: */ @@ -145,32 +146,17 @@ struct tdx_vp { struct page **tdcx_pages; }; -static inline u64 mk_keyed_paddr(u16 hkid, struct page *page) -{ - u64 ret; - - ret = page_to_phys(page); - /* KeyID bits are just above the physical address bits: */ - ret |= (u64)hkid << boot_cpu_data.x86_phys_bits; - - return ret; -} - -static inline int pg_level_to_tdx_sept_level(enum pg_level level) -{ - WARN_ON_ONCE(level == PG_LEVEL_NONE); - return level - 1; -} - void tdx_sys_disable(void); u64 tdh_vp_enter(struct tdx_vp *vp, struct tdx_module_args *args); u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page); -u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2); -u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2); +u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, kvm_pfn_t pfn, struct page *source, + u64 *ext_err1, u64 *ext_err2); +u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, enum pg_level level, struct page *page, u64 *ext_err1, u64 *ext_err2); u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page); -u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2); -u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2); +u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, enum pg_level level, kvm_pfn_t pfn, + u64 *ext_err1, u64 *ext_err2); +u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, enum pg_level level, u64 *ext_err1, u64 *ext_err2); u64 tdh_mng_key_config(struct tdx_td *td); u64 tdh_mng_create(struct tdx_td *td, u16 hkid); u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp); @@ -186,10 +172,10 @@ u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data); u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask); u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size); u64 tdh_mem_track(struct tdx_td *tdr); -u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2); +u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, enum pg_level level, u64 *ext_err1, u64 *ext_err2); u64 tdh_phymem_cache_wb(bool resume); u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td); -u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page); +u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, kvm_pfn_t pfn); #else static inline void tdx_init(void) { } static inline u32 tdx_get_nr_guest_keyids(void) { return 0; } diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 5f2b30d0405c8..1585ec8040666 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -477,6 +477,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_STUFF_FEATURE_MSRS (1 << 8) #define KVM_X86_QUIRK_IGNORE_GUEST_PAT (1 << 9) #define KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM (1 << 10) +#define KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT (1 << 11) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 @@ -532,6 +533,7 @@ struct kvm_svm_nested_state_data { struct kvm_svm_nested_state_hdr { __u64 vmcb_pa; + __u64 gpat; }; /* for KVM_CAP_NESTED_STATE */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e69156b54cfff..8e5340dd26211 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1248,7 +1248,7 @@ void kvm_initialize_cpu_caps(void) F(AUTOIBRS), EMULATED_F(NO_SMM_CTL_MSR), /* PrefetchCtlMsr */ - /* GpOnUserCpuid */ + EMULATED_F(GP_ON_USER_CPUID), /* EPSF */ F(PREFETCHI), F(AVX512_BMM), @@ -2161,9 +2161,10 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { u32 eax, ebx, ecx, edx; - if (!is_smm(vcpu) && cpuid_fault_enabled(vcpu) && - !kvm_require_cpl(vcpu, 0)) + if (!kvm_is_cpuid_allowed(vcpu)) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return 1; + } eax = kvm_rax_read(vcpu); ecx = kvm_rcx_read(vcpu); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 039b8e6f40baf..fc96ba86c644d 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -7,6 +7,8 @@ #include <asm/processor.h> #include <uapi/asm/kvm_para.h> +#include "smm.h" + extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; extern bool kvm_is_configuring_cpu_caps __read_mostly; @@ -181,15 +183,17 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) return x86_stepping(best->eax); } -static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu) +static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu) { - return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT; + return (vcpu->arch.msr_misc_features_enables & + MSR_MISC_FEATURES_ENABLES_CPUID_FAULT) || + (vcpu->arch.msr_hwcr & MSR_K7_HWCR_CPUID_USER_DIS); } -static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu) +static inline bool kvm_is_cpuid_allowed(struct kvm_vcpu *vcpu) { - return vcpu->arch.msr_misc_features_enables & - MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; + return !cpuid_fault_enabled(vcpu) || is_smm(vcpu) || + !kvm_x86_call(get_cpl)(vcpu); } static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8013dccb31102..585a8ceab220d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -540,8 +540,9 @@ static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, return X86EMUL_PROPAGATE_FAULT; } -static int emulate_db(struct x86_emulate_ctxt *ctxt) +static int emulate_db(struct x86_emulate_ctxt *ctxt, unsigned long dr6) { + ctxt->exception.dr6 = dr6; return emulate_exception(ctxt, DB_VECTOR, 0, false); } @@ -3593,12 +3594,8 @@ static int em_sti(struct x86_emulate_ctxt *ctxt) static int em_cpuid(struct x86_emulate_ctxt *ctxt) { u32 eax, ebx, ecx, edx; - u64 msr = 0; - ctxt->ops->get_msr(ctxt, MSR_MISC_FEATURES_ENABLES, &msr); - if (!ctxt->ops->is_smm(ctxt) && - (msr & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT) && - ctxt->ops->cpl(ctxt)) + if (!ctxt->ops->is_cpuid_allowed(ctxt)) return emulate_gp(ctxt, 0); eax = reg_read(ctxt, VCPU_REGS_RAX); @@ -3847,15 +3844,8 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt) if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5)) return emulate_ud(ctxt); - if (ctxt->ops->get_dr(ctxt, 7) & DR7_GD) { - ulong dr6; - - dr6 = ctxt->ops->get_dr(ctxt, 6); - dr6 &= ~DR_TRAP_BITS; - dr6 |= DR6_BD | DR6_ACTIVE_LOW; - ctxt->ops->set_dr(ctxt, 6, dr6); - return emulate_db(ctxt); - } + if (ctxt->ops->get_effective_dr7(ctxt) & DR7_GD) + return emulate_db(ctxt, DR6_BD); return X86EMUL_CONTINUE; } diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 0abff36d09942..3e375af15c035 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -22,9 +22,13 @@ enum x86_intercept_stage; struct x86_exception { u8 vector; bool error_code_valid; - u16 error_code; + u64 error_code; bool nested_page_fault; - u64 address; /* cr2 or nested page fault gpa */ + union { + u64 address; /* cr2 or nested page fault gpa */ + unsigned long dr6; + u64 payload; + }; u8 async_page_fault; unsigned long exit_qualification; }; @@ -211,6 +215,7 @@ struct x86_emulate_ops { ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); int (*cpl)(struct x86_emulate_ctxt *ctxt); + ulong (*get_effective_dr7)(struct x86_emulate_ctxt *ctxt); ulong (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); @@ -225,6 +230,7 @@ struct x86_emulate_ops { struct x86_instruction_info *info, enum x86_intercept_stage stage); + bool (*is_cpuid_allowed)(struct x86_emulate_ctxt *ctxt); bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only); bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt); @@ -520,13 +526,6 @@ enum x86_intercept { nr_x86_intercepts }; -/* Host execution mode. */ -#if defined(CONFIG_X86_32) -#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 -#elif defined(CONFIG_X86_64) -#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 -#endif - int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type); bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_FAILED -1 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4078e624ca667..4e34f75e705da 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1730,7 +1730,7 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) #define APIC_REGS_MASK(first, count) \ (APIC_REG_MASK(first) * ((1ull << (count)) - 1)) -u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic) +static u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic) { /* Leave bits '0' for reserved and write-only registers. */ u64 valid_reg_mask = @@ -1766,7 +1766,24 @@ u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic) return valid_reg_mask; } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_readable_reg_mask); + +u64 kvm_x2apic_disable_read_intercept_reg_mask(struct kvm_vcpu *vcpu) +{ + if (WARN_ON_ONCE(!lapic_in_kernel(vcpu))) + return 0; + + /* + * TMMCT, a.k.a. the current APIC timer count, reads aren't accelerated + * by hardware (Intel or AMD) as the timer is emulated in software (by + * KVM), i.e. reads from the virtual APIC page would return garbage. + * Intercept RDMSR, as handling the fault-like APIC-access VM-Exit is + * more expensive than handling a RDMSR VM-Exit (the APIC-access exit + * requires slow emulation of the code stream). + */ + return kvm_lapic_readable_reg_mask(vcpu->arch.apic) & + ~APIC_REG_MASK(APIC_TMCCT); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_x2apic_disable_read_intercept_reg_mask); static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, void *data) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 274885af4ebc4..f763cd29a5082 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -156,7 +156,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); void kvm_lapic_exit(void); -u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic); +u64 kvm_x2apic_disable_read_intercept_reg_mask(struct kvm_vcpu *vcpu); static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic) { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index db1b82eae4da7..a01a1faab2eef 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7022,7 +7022,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) write_lock(&kvm->mmu_lock); - kvm_mmu_invalidate_begin(kvm); + kvm_mmu_invalidate_start(kvm); kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 07100bbfc2701..df3ae0c7ec2c3 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -328,6 +328,12 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; const int fetch_fault = access & PFERR_FETCH_MASK; + /* + * Note! Track the error_code that's common to legacy shadow paging + * and NPT shadow paging as a u16 to guard against unintentionally + * setting any of bits 63:16. Architecturally, the #PF error code is + * 32 bits, and Intel CPUs don't support settings bits 31:16. + */ u16 errcode = 0; gpa_t real_gpa; gfn_t gfn; @@ -391,16 +397,6 @@ retry_walk: nested_access | PFERR_GUEST_PAGE_MASK, &walker->fault, 0); - /* - * FIXME: This can happen if emulation (for of an INS/OUTS - * instruction) triggers a nested page fault. The exit - * qualification / exit info field will incorrectly have - * "guest page access" as the nested page fault's cause, - * instead of "guest page structure access". To fix this, - * the x86_exception struct should be augmented with enough - * information to fix the exit_qualification or exit_info_1 - * fields. - */ if (unlikely(real_gpa == INVALID_GPA)) return 0; @@ -506,7 +502,8 @@ error: * [2:0] - Derive from the access bits. The exit_qualification might be * out of date if it is serving an EPT misconfiguration. * [5:3] - Calculated by the page walk of the guest EPT page tables - * [7:11] - Derived from [7:11] of real exit_qualification + * [7:8] - Derived from "fault stage" access bits + * [9:11] - Derived from [9:11] of real exit_qualification * * The other bits are set to 0. */ @@ -521,12 +518,22 @@ error: walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ; /* + * KVM doesn't emulate features that access GPAs directly, e.g. + * Intel Processor Trace. Assume the GVA is always valid; when + * propagating faults from hardware, KVM will discard this info + * and use the EXIT_QUALIFICATION bits from the VMCS. + */ + walker->fault.exit_qualification |= EPT_VIOLATION_GVA_IS_VALID; + + /* * Accesses to guest paging structures are either "reads" or * "read+write" accesses, so consider them the latter if write_fault * is true. */ if (access & PFERR_GUEST_PAGE_MASK) walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ; + else + walker->fault.exit_qualification |= EPT_VIOLATION_GVA_TRANSLATED; /* * Note, pte_access holds the raw RWX bits from the EPTE, not @@ -542,6 +549,11 @@ error: walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; walker->fault.async_page_fault = false; +#if PTTYPE != PTTYPE_EPT + if (walker->fault.nested_page_fault) + walker->fault.error_code |= access & PFERR_GUEST_FAULT_STAGE_MASK; +#endif + trace_kvm_mmu_walker_error(walker->fault.error_code); return 0; } @@ -807,7 +819,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault */ if (!r) { if (!fault->prefetch) - kvm_inject_emulated_page_fault(vcpu, &walker.fault); + __kvm_inject_emulated_page_fault(vcpu, &walker.fault, true); return RET_PF_RETRY; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 5a2f8ce9a32b8..5b3041138301b 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -53,13 +53,18 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) rcu_barrier(); } -static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) +static void __tdp_mmu_free_sp(struct kvm_mmu_page *sp) { - free_page((unsigned long)sp->external_spt); free_page((unsigned long)sp->spt); kmem_cache_free(mmu_page_header_cache, sp); } +static void tdp_mmu_free_unused_sp(struct kvm_mmu_page *sp) +{ + free_page((unsigned long)sp->external_spt); + __tdp_mmu_free_sp(sp); +} + /* * This is called through call_rcu in order to free TDP page table memory * safely with respect to other kernel threads that may be operating on @@ -73,7 +78,8 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page, rcu_head); - tdp_mmu_free_sp(sp); + WARN_ON_ONCE(sp->external_spt); + __tdp_mmu_free_sp(sp); } void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) @@ -320,9 +326,9 @@ out_read_unlock: } } -static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level, - bool shared); +static void handle_changed_spte(struct kvm *kvm, struct kvm_mmu_page *sp, + gfn_t gfn, u64 old_spte, u64 new_spte, + int level, bool shared); static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { @@ -359,25 +365,6 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp) spin_unlock(&kvm->arch.tdp_mmu_pages_lock); } -static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte, - int level) -{ - /* - * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external - * PTs are removed in a special order, involving free_external_spt(). - * But remove_external_spte() will be called on non-leaf PTEs via - * __tdp_mmu_zap_root(), so avoid the error the former would return - * in this case. - */ - if (!is_last_spte(old_spte, level)) - return; - - /* Zapping leaf spte is allowed only when write lock is held. */ - lockdep_assert_held_write(&kvm->mmu_lock); - - kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_spte); -} - /** * handle_removed_pt() - handle a page table removed from the TDP structure * @@ -471,86 +458,19 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, FROZEN_SPTE, level); } - handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, - old_spte, FROZEN_SPTE, level, shared); - - if (is_mirror_sp(sp)) { - KVM_BUG_ON(shared, kvm); - remove_external_spte(kvm, gfn, old_spte, level); - } + handle_changed_spte(kvm, sp, gfn, old_spte, FROZEN_SPTE, level, shared); } - if (is_mirror_sp(sp) && - WARN_ON(kvm_x86_call(free_external_spt)(kvm, base_gfn, sp->role.level, - sp->external_spt))) { - /* - * Failed to free page table page in mirror page table and - * there is nothing to do further. - * Intentionally leak the page to prevent the kernel from - * accessing the encrypted page. - */ - sp->external_spt = NULL; - } + if (is_mirror_sp(sp)) + kvm_x86_call(free_external_spt)(kvm, sp); call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); } -static void *get_external_spt(gfn_t gfn, u64 new_spte, int level) -{ - if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) { - struct kvm_mmu_page *sp = spte_to_child_sp(new_spte); - - WARN_ON_ONCE(sp->role.level + 1 != level); - WARN_ON_ONCE(sp->gfn != gfn); - return sp->external_spt; - } - - return NULL; -} - -static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep, - gfn_t gfn, u64 old_spte, - u64 new_spte, int level) -{ - bool was_present = is_shadow_present_pte(old_spte); - bool is_present = is_shadow_present_pte(new_spte); - bool is_leaf = is_present && is_last_spte(new_spte, level); - int ret = 0; - - KVM_BUG_ON(was_present, kvm); - - lockdep_assert_held(&kvm->mmu_lock); - /* - * We need to lock out other updates to the SPTE until the external - * page table has been modified. Use FROZEN_SPTE similar to - * the zapping case. - */ - if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE)) - return -EBUSY; - - /* - * Use different call to either set up middle level - * external page table, or leaf. - */ - if (is_leaf) { - ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte); - } else { - void *external_spt = get_external_spt(gfn, new_spte, level); - - KVM_BUG_ON(!external_spt, kvm); - ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt); - } - if (ret) - __kvm_tdp_mmu_write_spte(sptep, old_spte); - else - __kvm_tdp_mmu_write_spte(sptep, new_spte); - return ret; -} - /** - * handle_changed_spte - handle bookkeeping associated with an SPTE change + * __handle_changed_spte - handle bookkeeping associated with an SPTE change * @kvm: kvm instance - * @as_id: the address space of the paging structure the SPTE was a part of + * @sp: the page table in which the SPTE resides * @gfn: the base GFN that was mapped by the SPTE * @old_spte: The value of the SPTE before the change * @new_spte: The value of the SPTE after the change @@ -563,15 +483,16 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp * dirty logging updates are handled in common code, not here (see make_spte() * and fast_pf_fix_direct_spte()). */ -static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level, - bool shared) +static int __handle_changed_spte(struct kvm *kvm, struct kvm_mmu_page *sp, + gfn_t gfn, u64 old_spte, u64 new_spte, + int level, bool shared) { bool was_present = is_shadow_present_pte(old_spte); bool is_present = is_shadow_present_pte(new_spte); bool was_leaf = was_present && is_last_spte(old_spte, level); bool is_leaf = is_present && is_last_spte(new_spte, level); bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); + int as_id = kvm_mmu_page_as_id(sp); WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL); WARN_ON_ONCE(level < PG_LEVEL_4K); @@ -601,9 +522,7 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, } if (old_spte == new_spte) - return; - - trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); + return 0; if (is_leaf) check_spte_writable_invariants(new_spte); @@ -630,21 +549,45 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, "a temporary frozen SPTE.\n" "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", as_id, gfn, old_spte, new_spte, level); - return; - } - if (is_leaf != was_leaf) - kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1); + trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); + return 0; + } /* * Recursively handle child PTs if the change removed a subtree from * the paging structure. Note the WARN on the PFN changing without the * SPTE being converted to a hugepage (leaf) or being zapped. Shadow * pages are kernel allocations and should never be migrated. + * + * For the mirror page table, propagate all changes to the external SPTE + * (except zapping/promotion of non-leaf SPTEs) via the + * set_external_spte() op. */ if (was_present && !was_leaf && - (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) + (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) { handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared); + } else if (is_mirror_sp(sp)) { + int r; + + r = kvm_x86_call(set_external_spte)(kvm, gfn, old_spte, new_spte, level); + if (r) + return r; + } + trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); + + if (is_leaf != was_leaf) + kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1); + + return 0; +} + +static void handle_changed_spte(struct kvm *kvm, struct kvm_mmu_page *sp, + gfn_t gfn, u64 old_spte, u64 new_spte, + int level, bool shared) +{ + KVM_BUG_ON(__handle_changed_spte(kvm, sp, gfn, old_spte, new_spte, + level, shared), kvm); } static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm, @@ -659,34 +602,15 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm, */ WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte)); - if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) { - int ret; - - /* - * Users of atomic zapping don't operate on mirror roots, - * so don't handle it and bug the VM if it's seen. - */ - if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm)) - return -EBUSY; - - ret = set_external_spte_present(kvm, iter->sptep, iter->gfn, - iter->old_spte, new_spte, iter->level); - if (ret) - return ret; - } else { - u64 *sptep = rcu_dereference(iter->sptep); - - /* - * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs - * and does not hold the mmu_lock. On failure, i.e. if a - * different logical CPU modified the SPTE, try_cmpxchg64() - * updates iter->old_spte with the current value, so the caller - * operates on fresh data, e.g. if it retries - * tdp_mmu_set_spte_atomic() - */ - if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) - return -EBUSY; - } + /* + * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and + * does not hold the mmu_lock. On failure, i.e. if a different logical + * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with + * the current value, so the caller operates on fresh data, e.g. if it + * retries tdp_mmu_set_spte_atomic(). + */ + if (!try_cmpxchg64(rcu_dereference(iter->sptep), &iter->old_spte, new_spte)) + return -EBUSY; return 0; } @@ -712,24 +636,61 @@ static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte) { + struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep)); int ret; lockdep_assert_held_read(&kvm->mmu_lock); - ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte); + /* Should not set FROZEN_SPTE as a long-term value. */ + KVM_MMU_WARN_ON(is_frozen_spte(new_spte)); + + /* + * Temporarily freeze the SPTE until the external PTE operation has + * completed, e.g. so that concurrent faults don't attempt to install a + * child PTE in the external page table before the parent PTE has been + * written. + */ + if (is_mirror_sptep(iter->sptep)) + ret = __tdp_mmu_set_spte_atomic(kvm, iter, FROZEN_SPTE); + else + ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte); + if (ret) return ret; - handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, - new_spte, iter->level, true); - - return 0; + /* + * Handle the change from iter->old_spte to new_spte. + * + * Note: for mirror page table, this means the updates of the external + * PTE, statistics, or updates of child SPTEs, child external PTEs and + * corresponding statistics are performed while the mirror SPTE is in + * frozen state (i.e., before the mirror SPTE is set to new_spte). + */ + ret = __handle_changed_spte(kvm, sp, iter->gfn, iter->old_spte, + new_spte, iter->level, true); + /* + * Unfreeze the mirror SPTE. If updating the external SPTE failed, + * restore the old value so that the mirror SPTE isn't frozen in + * perpetuity, otherwise set the mirror SPTE to the new desired value. + */ + if (is_mirror_sptep(iter->sptep)) { + if (ret) + __kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte); + else + __kvm_tdp_mmu_write_spte(iter->sptep, new_spte); + } else { + /* + * Bug the VM if handling the change failed, as failure is only + * allowed if KVM couldn't update the external SPTE. + */ + KVM_BUG_ON(ret, kvm); + } + return ret; } /* * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping * @kvm: KVM instance - * @as_id: Address space ID, i.e. regular vs. SMM * @sptep: Pointer to the SPTE * @old_spte: The current value of the SPTE * @new_spte: The new value that will be set for the SPTE @@ -739,9 +700,11 @@ static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm, * Returns the old SPTE value, which _may_ be different than @old_spte if the * SPTE had voldatile bits. */ -static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, - u64 old_spte, u64 new_spte, gfn_t gfn, int level) +static u64 tdp_mmu_set_spte(struct kvm *kvm, tdp_ptep_t sptep, u64 old_spte, + u64 new_spte, gfn_t gfn, int level) { + struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(sptep)); + lockdep_assert_held_write(&kvm->mmu_lock); /* @@ -755,16 +718,7 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); - handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); - - /* - * Users that do non-atomic setting of PTEs don't operate on mirror - * roots, so don't handle it and bug the VM if it's seen. - */ - if (is_mirror_sptep(sptep)) { - KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm); - remove_external_spte(kvm, gfn, old_spte, level); - } + handle_changed_spte(kvm, sp, gfn, old_spte, new_spte, level, false); return old_spte; } @@ -773,9 +727,8 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte) { WARN_ON_ONCE(iter->yielded); - iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, - iter->old_spte, new_spte, - iter->gfn, iter->level); + iter->old_spte = tdp_mmu_set_spte(kvm, iter->sptep, iter->old_spte, + new_spte, iter->gfn, iter->level); } #define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \ @@ -1321,7 +1274,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * failed, e.g. because a different task modified the SPTE. */ if (r) { - tdp_mmu_free_sp(sp); + tdp_mmu_free_unused_sp(sp); goto retry; } @@ -1377,6 +1330,10 @@ static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter) { u64 new_spte; + /* TODO: Add support for aging external SPTEs, if necessary. */ + if (WARN_ON_ONCE(is_mirror_sptep(iter->sptep))) + return; + if (spte_ad_enabled(iter->old_spte)) { iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep, shadow_accessed_mask); @@ -1628,7 +1585,7 @@ retry: * installs its own sp in place of the last sp we tried to split. */ if (sp) - tdp_mmu_free_sp(sp); + tdp_mmu_free_unused_sp(sp); return 0; } diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index e218352e34231..b92dd2e583356 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -16,6 +16,7 @@ #include <linux/perf_event.h> #include <linux/bsearch.h> #include <linux/sort.h> +#include <linux/moduleparam.h> #include <asm/perf_event.h> #include <asm/cpu_device_id.h> #include "x86.h" @@ -33,6 +34,15 @@ static struct x86_pmu_capability __read_mostly kvm_host_pmu; struct x86_pmu_capability __read_mostly kvm_pmu_cap; EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_cap); +/* Enable/disable PMU virtualization */ +bool __read_mostly enable_pmu = true; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_pmu); +module_param(enable_pmu, bool, 0444); + +/* Enable/disabled mediated PMU virtualization. */ +bool __read_mostly enable_mediated_pmu; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_mediated_pmu); + struct kvm_pmu_emulated_event_selectors { u64 INSTRUCTIONS_RETIRED; u64 BRANCH_INSTRUCTIONS_RETIRED; @@ -88,7 +98,9 @@ static struct kvm_pmu_ops kvm_pmu_ops __read_mostly; DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func, \ *(((struct kvm_pmu_ops *)0)->func)); #define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP +#define KVM_X86_PMU_OP_OPTIONAL_RET0 KVM_X86_PMU_OP #include <asm/kvm-x86-pmu-ops.h> +EXPORT_STATIC_CALL_GPL(kvm_x86_pmu_pmc_is_disabled_in_current_mode); void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) { @@ -99,6 +111,9 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) #define KVM_X86_PMU_OP(func) \ WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func) #define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP +#define KVM_X86_PMU_OP_OPTIONAL_RET0(func) \ + static_call_update(kvm_x86_pmu_##func, (void *)kvm_pmu_ops.func ? : \ + (void *)__static_call_return0); #include <asm/kvm-x86-pmu-ops.h> #undef __KVM_X86_PMU_OP } @@ -522,7 +537,7 @@ static bool pmc_is_event_allowed(struct kvm_pmc *pmc) static void kvm_mediated_pmu_refresh_event_filter(struct kvm_pmc *pmc) { - bool allowed = pmc_is_event_allowed(pmc); + bool allowed = pmc_is_locally_enabled(pmc) && pmc_is_event_allowed(pmc); struct kvm_pmu *pmu = pmc_to_pmu(pmc); if (pmc_is_gp(pmc)) { @@ -670,6 +685,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) kvm_for_each_pmc(pmu, pmc, bit, bitmap) kvm_pmu_recalc_pmc_emulation(pmu, pmc); } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_handle_event); int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx) { @@ -879,7 +895,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (pmu->global_ctrl != data) { diff = pmu->global_ctrl ^ data; pmu->global_ctrl = data; - reprogram_counters(pmu, diff); + kvm_pmu_request_counters_reprogram(pmu, diff); } /* * Unconditionally forward writes to vendor code, i.e. to the @@ -921,6 +937,7 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu) pmu->need_cleanup = false; bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX); + bitmap_zero(pmu->pmc_has_mode_specific_enables, X86_PMC_IDX_MAX); kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) { pmc_stop_counter(pmc); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 0925246731cb1..a5821d7c87f93 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -36,6 +36,7 @@ struct kvm_pmu_ops { void (*reset)(struct kvm_vcpu *vcpu); void (*deliver_pmi)(struct kvm_vcpu *vcpu); void (*cleanup)(struct kvm_vcpu *vcpu); + bool (*pmc_is_disabled_in_current_mode)(struct kvm_pmc *pmc); bool (*is_mediated_pmu_supported)(struct x86_pmu_capability *host_pmu); void (*mediated_load)(struct kvm_vcpu *vcpu); @@ -53,6 +54,17 @@ struct kvm_pmu_ops { const u32 MSR_STRIDE; }; +#define kvm_pmu_call(func) static_call(kvm_x86_pmu_##func) + +#define KVM_X86_PMU_OP(func) \ + DECLARE_STATIC_CALL(kvm_x86_pmu_##func, *(((struct kvm_pmu_ops *)0)->func)); +#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP +#define KVM_X86_PMU_OP_OPTIONAL_RET0 KVM_X86_PMU_OP +#include <asm/kvm-x86-pmu-ops.h> + +extern bool enable_pmu; +extern bool enable_mediated_pmu; + void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); void kvm_handle_guest_mediated_pmi(void); @@ -190,7 +202,13 @@ static inline bool pmc_is_locally_enabled(struct kvm_pmc *pmc) pmc->idx - KVM_FIXED_PMC_BASE_IDX) & (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER); - return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; + if (!(pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE)) + return false; + + if (!test_bit(pmc->idx, pmu->pmc_has_mode_specific_enables)) + return true; + + return !kvm_pmu_call(pmc_is_disabled_in_current_mode)(pmc); } extern struct x86_pmu_capability kvm_pmu_cap; @@ -198,6 +216,7 @@ extern struct x86_pmu_capability kvm_pmu_cap; void kvm_init_pmu_capability(struct kvm_pmu_ops *pmu_ops); void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc); +void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) { @@ -207,16 +226,24 @@ static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } -static inline void reprogram_counters(struct kvm_pmu *pmu, u64 diff) +static inline void __kvm_pmu_reprogram_counters(struct kvm_pmu *pmu, + u64 counters, + bool defer) { - int bit; - - if (!diff) + if (!counters) return; - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) - set_bit(bit, pmu->reprogram_pmi); - kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu)); + atomic64_or(counters, &pmu->__reprogram_pmi); + if (defer) + kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu)); + else + kvm_pmu_handle_event(pmu_to_vcpu(pmu)); +} + +static inline void kvm_pmu_request_counters_reprogram(struct kvm_pmu *pmu, + u64 counters) +{ + __kvm_pmu_reprogram_counters(pmu, counters, true); } /* @@ -245,7 +272,6 @@ static inline bool kvm_pmu_is_fastpath_emulation_allowed(struct kvm_vcpu *vcpu) } void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); -void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx); bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 993b551180fe9..0726f88e679aa 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -122,38 +122,8 @@ static u32 x2avic_max_physical_id; static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) { - static const u32 x2avic_passthrough_msrs[] = { - X2APIC_MSR(APIC_ID), - X2APIC_MSR(APIC_LVR), - X2APIC_MSR(APIC_TASKPRI), - X2APIC_MSR(APIC_ARBPRI), - X2APIC_MSR(APIC_PROCPRI), - X2APIC_MSR(APIC_EOI), - X2APIC_MSR(APIC_RRR), - X2APIC_MSR(APIC_LDR), - X2APIC_MSR(APIC_DFR), - X2APIC_MSR(APIC_SPIV), - X2APIC_MSR(APIC_ISR), - X2APIC_MSR(APIC_TMR), - X2APIC_MSR(APIC_IRR), - X2APIC_MSR(APIC_ESR), - X2APIC_MSR(APIC_ICR), - X2APIC_MSR(APIC_ICR2), - - /* - * Note! Always intercept LVTT, as TSC-deadline timer mode - * isn't virtualized by hardware, and the CPU will generate a - * #GP instead of a #VMEXIT. - */ - X2APIC_MSR(APIC_LVTTHMR), - X2APIC_MSR(APIC_LVTPC), - X2APIC_MSR(APIC_LVT0), - X2APIC_MSR(APIC_LVT1), - X2APIC_MSR(APIC_LVTERR), - X2APIC_MSR(APIC_TMICT), - X2APIC_MSR(APIC_TMCCT), - X2APIC_MSR(APIC_TDCR), - }; + struct kvm_vcpu *vcpu = &svm->vcpu; + u64 rd_regs; int i; if (intercept == svm->x2avic_msrs_intercepted) @@ -162,9 +132,16 @@ static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, if (!x2avic_enabled) return; - for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++) - svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i], - MSR_TYPE_RW, intercept); + rd_regs = kvm_x2apic_disable_read_intercept_reg_mask(vcpu); + + for_each_set_bit(i, (unsigned long *)&rd_regs, BITS_PER_TYPE(rd_regs)) + svm_set_intercept_for_msr(vcpu, APIC_BASE_MSR + i, + MSR_TYPE_R, intercept); + + svm_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_W, intercept); + svm_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W, intercept); + svm_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W, intercept); + svm_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_W, intercept); svm->x2avic_msrs_intercepted = intercept; } @@ -207,6 +184,35 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); /* + * Flush the TLB when enabling (x2)AVIC and when transitioning between + * xAVIC and x2AVIC, as the CPU may have inserted a TLB entry for the + * "wrong" mapping. + * + * KVM uses a per-VM "scratch" page to back the APIC memslot, because + * KVM also uses per-VM page tables *and* maintains the page table (NPT + * or shadow page) mappings for said memslot even if one or more vCPUs + * have their local APIC hardware-disabled or are in x2APIC mode, i.e. + * even if one or more vCPUs' APIC MMIO BAR is effectively disabled. + * + * If xAVIC is fully enabled, hardware ignores the physical address in + * KVM's page tables, i.e. in the leaf SPTE for the APIC memslot, and + * instead redirects the access to the AVIC backing page, i.e. to the + * vCPU's virtual APIC page. If xAVIC is not enabled (APIC is either + * hardware-disabled or in x2APIC mode), then guest accesses will use + * the page table mapping verbatim, i.e. will access the per-VM scratch + * page, as normal memory. + * + * In both cases, the CPU is allowed to cache TLB entries for the APIC + * base GPA. So, KVM needs to flush the TLB when enabling xAVIC, as + * accesses need to be redirected to the virtual APIC page, but the TLB + * may contain entries pointing at the scratch page. KVM also needs to + * flush the TLB when enabling x2AVIC, as accesses need to go to the + * scratch page, but the TLB may contain entries tagged as xAVIC, i.e. + * entries pointing to the vCPU's virtual APIC page. + */ + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu); + + /* * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR * accesses, while interrupt injection to a running vCPU can be * achieved using AVIC doorbell. KVM disables the APIC access page @@ -219,12 +225,6 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) /* Disabling MSR intercept for x2APIC registers */ avic_set_x2apic_msr_interception(svm, false); } else { - /* - * Flush the TLB, the guest may have inserted a non-APIC - * mapping into the TLB while AVIC was disabled. - */ - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu); - /* Enabling MSR intercept for x2APIC registers */ avic_set_x2apic_msr_interception(svm, true); } diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 1bf3e4804ad0a..7ad4b4fb7a1c0 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -30,27 +30,42 @@ #include "lapic.h" #include "svm.h" #include "hyperv.h" +#include "pmu.h" #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, - struct x86_exception *fault) + struct x86_exception *fault, + bool from_hardware) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; + u64 fault_stage; - if (vmcb->control.exit_code != SVM_EXIT_NPF) { - /* - * TODO: track the cause of the nested page fault, and - * correctly fill in the high bits of exit_info_1. - */ - vmcb->control.exit_code = SVM_EXIT_NPF; - vmcb->control.exit_info_1 = (1ULL << 32); - vmcb->control.exit_info_2 = fault->address; - } + /* + * For hardware NPF exits, the GUEST_FAULT_STAGE bits are only + * available in the hardware exit_info_1, since the guest_mmu + * walker doesn't know whether the faulting GPA was a page table + * page or final page from L2's perspective. + */ + if (from_hardware) + fault_stage = vmcb->control.exit_info_1 & + PFERR_GUEST_FAULT_STAGE_MASK; + else + fault_stage = fault->error_code & PFERR_GUEST_FAULT_STAGE_MASK; - vmcb->control.exit_info_1 &= ~0xffffffffULL; - vmcb->control.exit_info_1 |= fault->error_code; + /* + * All nested page faults should be annotated as occurring on the + * final translation *or* the page walk. Arbitrarily choose "final" + * if KVM is buggy and enumerated both or neither. + */ + if (WARN_ON_ONCE(hweight64(fault_stage) != 1)) + fault_stage = PFERR_GUEST_FINAL_MASK; + + vmcb->control.exit_code = SVM_EXIT_NPF; + vmcb->control.exit_info_1 = fault_stage | + (fault->error_code & ~PFERR_GUEST_FAULT_STAGE_MASK); + vmcb->control.exit_info_2 = fault->address; nested_svm_vmexit(svm); } @@ -421,7 +436,8 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, /* Common checks that apply to both L1 and L2 state. */ static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu, - struct vmcb_save_area_cached *save) + struct vmcb_save_area_cached *save, + bool check_gpat) { if (CC(!(save->efer & EFER_SVME))) return false; @@ -456,6 +472,15 @@ static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu, if (CC(!kvm_valid_efer(vcpu, save->efer))) return false; + /* + * If userspace contrives to get an invalid g_pat into vmcb02 by + * disabling KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT in a race with + * this check, it should be prepared for the KVM_EXIT_FAIL_ENTRY + * that will follow. + */ + if (check_gpat && CC(!kvm_pat_valid(save->g_pat))) + return false; + return true; } @@ -463,7 +488,8 @@ int nested_svm_check_cached_vmcb12(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (!nested_vmcb_check_save(vcpu, &svm->nested.save) || + if (!nested_vmcb_check_save(vcpu, &svm->nested.save, + l2_has_separate_pat(vcpu)) || !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) return -EINVAL; @@ -576,6 +602,7 @@ static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to, to->rax = from->rax; to->cr2 = from->cr2; + to->g_pat = from->g_pat; svm_copy_lbrs(to, from); } @@ -705,15 +732,6 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, return 0; } -void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm) -{ - if (!svm->nested.vmcb02.ptr) - return; - - /* FIXME: merge g_pat from vmcb01 and vmcb12. */ - svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat; -} - static bool nested_vmcb12_has_lbrv(struct kvm_vcpu *vcpu) { return guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && @@ -729,9 +747,6 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm) struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; struct kvm_vcpu *vcpu = &svm->vcpu; - nested_vmcb02_compute_g_pat(svm); - vmcb_mark_dirty(vmcb02, VMCB_NPT); - /* Load the nested guest state */ if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) { new_vmcb12 = true; @@ -762,6 +777,13 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm) vmcb_mark_dirty(vmcb02, VMCB_CET); } + if (l2_has_separate_pat(vcpu)) { + if (unlikely(new_vmcb12 || vmcb12_is_dirty(control, VMCB_NPT))) + vmcb_set_gpat(vmcb02, svm->nested.save.g_pat); + } else if (npt_enabled) { + vmcb_set_gpat(vmcb02, vcpu->arch.pat); + } + kvm_set_rflags(vcpu, save->rflags | X86_EFLAGS_FIXED); svm_set_efer(vcpu, svm->nested.save.efer); @@ -838,6 +860,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) /* Enter Guest-Mode */ enter_guest_mode(vcpu); + svm_pmu_handle_nested_transition(svm); /* * Filled at exit: exit_code, exit_info_1, exit_info_2, exit_int_info, @@ -1119,16 +1142,22 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) } ret = nested_svm_copy_vmcb12_to_cache(vcpu, vmcb12_gpa); - if (ret) { - if (ret == -EFAULT) - return kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); + if (ret == -EFAULT) + return kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); - /* Advance RIP past VMRUN as part of the nested #VMEXIT. */ - return kvm_skip_emulated_instruction(vcpu); - } + /* + * At this point, VMRUN is guaranteed to not fault; advance RIP. If + * caching vmcb12 failed for other reasons, return immediately afterward + * as a nested #VMEXIT was already set up. + * + * FIXME: If TF is set on VMRUN should inject a #DB (or handle guest + * debugging) right after #VMEXIT, right now it's just ignored. + */ + if (!svm_skip_emulated_instruction(vcpu)) + return 0; - /* At this point, VMRUN is guaranteed to not fault; advance RIP. */ - ret = kvm_skip_emulated_instruction(vcpu); + if (ret) + goto insn_retired; /* * Since vmcb01 is not in use, we can use it to store some of the L1 @@ -1158,7 +1187,13 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) nested_svm_vmexit(svm); } - return ret; +insn_retired: + /* + * A successful VMRUN is counted by the PMU in guest mode, so only + * retire the instruction after potentially entering guest mode. + */ + kvm_pmu_instruction_retired(vcpu); + return 1; } /* Copy state save area fields which are handled by VMRUN */ @@ -1242,6 +1277,9 @@ static int nested_svm_vmexit_update_vmcb12(struct kvm_vcpu *vcpu) vmcb12->save.dr6 = svm->vcpu.arch.dr6; vmcb12->save.cpl = vmcb02->save.cpl; + if (l2_has_separate_pat(vcpu)) + vmcb12->save.g_pat = vmcb02->save.g_pat; + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { vmcb12->save.s_cet = vmcb02->save.s_cet; vmcb12->save.isst_addr = vmcb02->save.isst_addr; @@ -1288,6 +1326,8 @@ void nested_svm_vmexit(struct vcpu_svm *svm) /* Exit Guest-Mode */ leave_guest_mode(vcpu); + svm_pmu_handle_nested_transition(svm); + svm->nested.vmcb12_gpa = 0; kvm_warn_on_nested_run_pending(vcpu); @@ -1499,6 +1539,15 @@ void svm_leave_nested(struct kvm_vcpu *vcpu) leave_guest_mode(vcpu); + /* + * Force leaving nested is a non-architectural flow so precision + * isn't a priority. Defer updating the PMU until the next vCPU + * run, potentially tolerating some imprecision to avoid poking + * into PMU state from arbitrary contexts (e.g. to avoid using + * stale state). + */ + __svm_pmu_handle_nested_transition(svm, true); + svm_switch_vmcb(svm, &svm->vmcb01); nested_svm_uninit_mmu_context(vcpu); @@ -1851,6 +1900,9 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu, /* First fill in the header and copy it out. */ if (is_guest_mode(vcpu)) { kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa; + kvm_state.hdr.svm.gpat = 0; + if (l2_has_separate_pat(vcpu)) + kvm_state.hdr.svm.gpat = svm->vmcb->save.g_pat; kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE; kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; @@ -1903,6 +1955,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, struct vmcb_save_area *save; struct vmcb_save_area_cached save_cached; struct vmcb_ctrl_area_cached ctl_cached; + bool use_separate_l2_pat; unsigned long cr0; int ret; @@ -1967,15 +2020,29 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, /* * Validate host state saved from before VMRUN (see - * nested_svm_check_permissions). + * nested_svm_check_permissions). Note that the g_pat field is not + * validated, because (a) it may have been clobbered by SMM before + * KVM_GET_NESTED_STATE, and (b) it is not loaded at emulated + * #VMEXIT. */ __nested_copy_vmcb_save_to_cache(&save_cached, save); if (!(save->cr0 & X86_CR0_PG) || !(save->cr0 & X86_CR0_PE) || (save->rflags & X86_EFLAGS_VM) || - !nested_vmcb_check_save(vcpu, &save_cached)) + !nested_vmcb_check_save(vcpu, &save_cached, false)) goto out_free; + /* + * Validate gPAT when the shared PAT quirk is disabled (i.e. L2 + * has its own gPAT). This is done separately from the + * vmcb_save_area_cached validation above, because gPAT is L2 + * state, but the vmcb_save_area_cached is populated with L1 state. + */ + use_separate_l2_pat = (ctl_cached.misc_ctl & SVM_MISC_ENABLE_NP) && + !kvm_check_has_quirk(vcpu->kvm, + KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT); + if (use_separate_l2_pat && !kvm_pat_valid(kvm_state->hdr.svm.gpat)) + goto out_free; /* * All checks done, we can enter guest mode. Userspace provides @@ -2002,6 +2069,10 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, nested_copy_vmcb_control_to_cache(svm, ctl); svm_switch_vmcb(svm, &svm->nested.vmcb02); + + if (use_separate_l2_pat) + vmcb_set_gpat(svm->vmcb, kvm_state->hdr.svm.gpat); + nested_vmcb02_prepare_control(svm); /* diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 7aa298eeb0721..c18286545a7ac 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -168,6 +168,12 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pmc->eventsel = data; pmc->eventsel_hw = (data & ~AMD64_EVENTSEL_HOSTONLY) | AMD64_EVENTSEL_GUESTONLY; + + if (data & AMD64_EVENTSEL_HOST_GUEST_MASK) + __set_bit(pmc->idx, pmu->pmc_has_mode_specific_enables); + else + __clear_bit(pmc->idx, pmu->pmc_has_mode_specific_enables); + kvm_pmu_request_counter_reprogram(pmc); } return 0; @@ -207,7 +213,11 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) } pmu->counter_bitmask[KVM_PMC_GP] = BIT_ULL(48) - 1; + pmu->reserved_bits = 0xfffffff000280000ull; + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SVM) && kvm_vcpu_has_mediated_pmu(vcpu)) + pmu->reserved_bits &= ~AMD64_EVENTSEL_HOST_GUEST_MASK; + pmu->raw_event_mask = AMD64_RAW_EVENT_MASK; /* not applicable to AMD; but clean them to prevent any fall out */ pmu->counter_bitmask[KVM_PMC_FIXED] = 0; @@ -260,6 +270,37 @@ static void amd_mediated_pmu_put(struct kvm_vcpu *vcpu) wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, pmu->global_status); } +static bool amd_pmc_is_disabled_in_current_mode(struct kvm_pmc *pmc) +{ + struct kvm_vcpu *vcpu = pmc->vcpu; + u64 host_guest_bits; + + if (!kvm_vcpu_has_mediated_pmu(vcpu)) + return false; + + /* Common code is supposed to check the common enable bit */ + if (WARN_ON_ONCE(!(pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE))) + return false; + + /* If both bits are cleared, the counter is always enabled */ + host_guest_bits = pmc->eventsel & AMD64_EVENTSEL_HOST_GUEST_MASK; + if (!host_guest_bits) + return false; + + /* If EFER.SVME=0 and either bit is set, the counter is disabled */ + if (!(vcpu->arch.efer & EFER_SVME)) + return true; + + /* + * If EFER.SVME=1, the counter is disabled iff only one of the bits is + * set AND the set bit doesn't match the vCPU mode. + */ + if (host_guest_bits == AMD64_EVENTSEL_HOST_GUEST_MASK) + return false; + + return !!(host_guest_bits & AMD64_EVENTSEL_GUESTONLY) != is_guest_mode(vcpu); +} + struct kvm_pmu_ops amd_pmu_ops __initdata = { .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = amd_msr_idx_to_pmc, @@ -269,6 +310,7 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .set_msr = amd_pmu_set_msr, .refresh = amd_pmu_refresh, .init = amd_pmu_init, + .pmc_is_disabled_in_current_mode = amd_pmc_is_disabled_in_current_mode, .is_mediated_pmu_supported = amd_pmu_is_mediated_pmu_supported, .mediated_load = amd_mediated_pmu_load, diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 18a7cddb097d2..e0a2edc1543a6 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -721,14 +721,50 @@ e_free_dh: return ret; } +static int sev_check_pin_count(struct kvm *kvm, unsigned long npages) +{ + unsigned long total_npages, lock_limit; + + total_npages = to_kvm_sev_info(kvm)->pages_locked + npages; + if (total_npages > totalram_pages()) + return -EINVAL; + + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (total_npages > lock_limit && !capable(CAP_IPC_LOCK)) { + pr_err_ratelimited("SEV: %lu total pages would exceed the lock limit of %lu.\n", + total_npages, lock_limit); + return -ENOMEM; + } + + return 0; +} + +static int sev_pin_user_pages(struct kvm *kvm, unsigned long addr, int npages, + unsigned int gup_flags, struct page **pages) +{ + int npinned; + + lockdep_assert_held(&kvm->lock); + + npinned = pin_user_pages_fast(addr, npages, gup_flags, pages); + if (npinned != npages) { + if (npinned > 0) + unpin_user_pages(pages, npinned); + pr_err_ratelimited("SEV: Failure locking %u pages.\n", npages); + return -ENOMEM; + } + + to_kvm_sev_info(kvm)->pages_locked += npages; + return 0; +} + static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, unsigned long ulen, unsigned long *n, unsigned int flags) { - struct kvm_sev_info *sev = to_kvm_sev_info(kvm); - unsigned long npages, total_npages, lock_limit; + unsigned long npages; struct page **pages; - int npinned, ret; + int ret; lockdep_assert_held(&kvm->lock); @@ -744,16 +780,9 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, if (npages > INT_MAX) return ERR_PTR(-EINVAL); - total_npages = sev->pages_locked + npages; - if (total_npages > totalram_pages()) - return ERR_PTR(-EINVAL); - - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (total_npages > lock_limit && !capable(CAP_IPC_LOCK)) { - pr_err("SEV: %lu total pages would exceed the lock limit of %lu.\n", - total_npages, lock_limit); - return ERR_PTR(-ENOMEM); - } + ret = sev_check_pin_count(kvm, npages); + if (ret) + return ERR_PTR(ret); /* * Don't WARN if the kernel (rightly) thinks the total size is absurd, @@ -765,25 +794,14 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, if (!pages) return ERR_PTR(-ENOMEM); - /* Pin the user virtual address. */ - npinned = pin_user_pages_fast(uaddr, npages, flags, pages); - if (npinned != npages) { - pr_err("SEV: Failure locking %lu pages.\n", npages); - ret = -ENOMEM; - goto err; + ret = sev_pin_user_pages(kvm, uaddr, npages, flags, pages); + if (ret) { + kvfree(pages); + return ERR_PTR(ret); } *n = npages; - sev->pages_locked = total_npages; - return pages; - -err: - if (npinned > 0) - unpin_user_pages(pages, npinned); - - kvfree(pages); - return ERR_PTR(ret); } static void sev_unpin_memory(struct kvm *kvm, struct page **pages, @@ -794,6 +812,29 @@ static void sev_unpin_memory(struct kvm *kvm, struct page **pages, to_kvm_sev_info(kvm)->pages_locked -= npages; } +static struct page *sev_pin_page(struct kvm *kvm, unsigned long addr, + unsigned int flags) +{ + struct page *page; + int r; + + r = sev_check_pin_count(kvm, 1); + if (r) + return ERR_PTR(r); + + r = sev_pin_user_pages(kvm, addr, 1, flags, &page); + if (r) + return ERR_PTR(r); + + return page; +} + +static void sev_unpin_page(struct kvm *kvm, struct page *page) +{ + unpin_user_pages(&page, 1); + to_kvm_sev_info(kvm)->pages_locked -= 1; +} + static void sev_clflush_pages(struct page *pages[], unsigned long npages) { uint8_t *page_virtual; @@ -1197,160 +1238,115 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) return ret; } -static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, - unsigned long dst, int size, - int *error, bool enc) +static int sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src_pa, + unsigned long dst_pa, unsigned int size, + unsigned int ioctl, int *error) { - struct sev_data_dbg data; - - data.reserved = 0; - data.handle = to_kvm_sev_info(kvm)->handle; - data.dst_addr = dst; - data.src_addr = src; - data.len = size; + int cmd = ioctl == KVM_SEV_DBG_DECRYPT ? SEV_CMD_DBG_DECRYPT : + SEV_CMD_DBG_ENCRYPT; + struct sev_data_dbg data = { + .handle = to_kvm_sev_info(kvm)->handle, + .dst_addr = dst_pa, + .src_addr = src_pa, + .len = size, + }; - return sev_issue_cmd(kvm, - enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT, - &data, error); + return sev_issue_cmd(kvm, cmd, &data, error); } -static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr, - unsigned long dst_paddr, int sz, int *err) +static void *sev_dbg_crypt_slow_alloc(struct page *page, unsigned long __va, + unsigned int len, unsigned long *pa, + unsigned int *nr_bytes) { - int offset; + unsigned long va = ALIGN_DOWN(__va, 16); + + /* The number of bytes to {de,en}crypt must be 16-byte aligned. */ + *nr_bytes = round_up(len, 16); /* - * Its safe to read more than we are asked, caller should ensure that - * destination has enough space. + * Increase the number of bytes to {de,en}crypt by one chunk (16 bytes) + * if the aligned address and length doesn't cover the unaligned range, + * e.g. if the address is unaligned _and_ the access will split a chunk + * at the tail. */ - offset = src_paddr & 15; - src_paddr = round_down(src_paddr, 16); - sz = round_up(sz + offset, 16); - - return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false); -} + if (va + *nr_bytes < __va + len) + *nr_bytes += 16; -static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr, - void __user *dst_uaddr, - unsigned long dst_paddr, - int size, int *err) -{ - struct page *tpage = NULL; - int ret, offset; + *pa = __sme_page_pa(page) + (va & ~PAGE_MASK); - /* if inputs are not 16-byte then use intermediate buffer */ - if (!IS_ALIGNED(dst_paddr, 16) || - !IS_ALIGNED(paddr, 16) || - !IS_ALIGNED(size, 16)) { - tpage = (void *)alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!tpage) - return -ENOMEM; + /* + * Sanity check that the new access won't split a page. This should + * never happen; just pretend the allocation failed. + */ + if (WARN_ON_ONCE((*pa & PAGE_MASK) != ((*pa + *nr_bytes - 1) & PAGE_MASK))) + return NULL; - dst_paddr = __sme_page_pa(tpage); - } + return kmalloc(*nr_bytes, GFP_KERNEL); +} - ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err); - if (ret) - goto e_free; +static int sev_dbg_decrypt_slow(struct kvm *kvm, unsigned long src, + struct page *src_p, unsigned long dst, + unsigned int len, int *err) +{ + unsigned int nr_bytes; + unsigned long src_pa; + void *buf; + int r; - if (tpage) { - offset = paddr & 15; - if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size)) - ret = -EFAULT; - } + buf = sev_dbg_crypt_slow_alloc(src_p, src, len, &src_pa, &nr_bytes); + if (!buf) + return -ENOMEM; -e_free: - if (tpage) - __free_page(tpage); + r = sev_issue_dbg_cmd(kvm, src_pa, __sme_set(__pa(buf)), + nr_bytes, KVM_SEV_DBG_DECRYPT, err); + if (r) + goto out; - return ret; + if (copy_to_user((void __user *)dst, buf + (src & 15), len)) + r = -EFAULT; +out: + kfree(buf); + return r; } -static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr, - void __user *vaddr, - unsigned long dst_paddr, - void __user *dst_vaddr, - int size, int *error) +static int sev_dbg_encrypt_slow(struct kvm *kvm, unsigned long src, + unsigned long dst, struct page *dst_p, + unsigned int len, int *err) { - struct page *src_tpage = NULL; - struct page *dst_tpage = NULL; - int ret, len = size; - - /* If source buffer is not aligned then use an intermediate buffer */ - if (!IS_ALIGNED((unsigned long)vaddr, 16)) { - src_tpage = alloc_page(GFP_KERNEL_ACCOUNT); - if (!src_tpage) - return -ENOMEM; + unsigned int nr_bytes; + unsigned long dst_pa; + void *buf; + int r; - if (copy_from_user(page_address(src_tpage), vaddr, size)) { - __free_page(src_tpage); - return -EFAULT; - } + /* Decrypt the _destination_ to do a RMW on plaintext. */ + buf = sev_dbg_crypt_slow_alloc(dst_p, dst, len, &dst_pa, &nr_bytes); + if (!buf) + return -ENOMEM; - paddr = __sme_page_pa(src_tpage); - } + r = sev_issue_dbg_cmd(kvm, dst_pa, __sme_set(__pa(buf)), + nr_bytes, KVM_SEV_DBG_DECRYPT, err); + if (r) + goto out; /* - * If destination buffer or length is not aligned then do read-modify-write: - * - decrypt destination in an intermediate buffer - * - copy the source buffer in an intermediate buffer - * - use the intermediate buffer as source buffer + * Copy from the source into the intermediate buffer, and then + * re-encrypt the buffer into the destination. */ - if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) { - int dst_offset; - - dst_tpage = alloc_page(GFP_KERNEL_ACCOUNT); - if (!dst_tpage) { - ret = -ENOMEM; - goto e_free; - } - - ret = __sev_dbg_decrypt(kvm, dst_paddr, - __sme_page_pa(dst_tpage), size, error); - if (ret) - goto e_free; - - /* - * If source is kernel buffer then use memcpy() otherwise - * copy_from_user(). - */ - dst_offset = dst_paddr & 15; - - if (src_tpage) - memcpy(page_address(dst_tpage) + dst_offset, - page_address(src_tpage), size); - else { - if (copy_from_user(page_address(dst_tpage) + dst_offset, - vaddr, size)) { - ret = -EFAULT; - goto e_free; - } - } - - paddr = __sme_page_pa(dst_tpage); - dst_paddr = round_down(dst_paddr, 16); - len = round_up(size, 16); - } - - ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true); - -e_free: - if (src_tpage) - __free_page(src_tpage); - if (dst_tpage) - __free_page(dst_tpage); - return ret; + if (copy_from_user(buf + (dst & 15), (void __user *)src, len)) + r = -EFAULT; + else + r = sev_issue_dbg_cmd(kvm, __sme_set(__pa(buf)), dst_pa, + nr_bytes, KVM_SEV_DBG_ENCRYPT, err); +out: + kfree(buf); + return r; } -static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) +static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, + unsigned int cmd) { - unsigned long vaddr, vaddr_end, next_vaddr; - unsigned long dst_vaddr; - struct page **src_p, **dst_p; struct kvm_sev_dbg debug; - unsigned long n; - unsigned int size; - int ret; + unsigned int i, len; if (!sev_guest(kvm)) return -ENOTTY; @@ -1358,27 +1354,38 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) if (copy_from_user(&debug, u64_to_user_ptr(argp->data), sizeof(debug))) return -EFAULT; - if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr) + if (!debug.len || !debug.src_uaddr || !debug.dst_uaddr) return -EINVAL; - if (!debug.dst_uaddr) + + if (debug.src_uaddr + debug.len < debug.src_uaddr || + debug.dst_uaddr + debug.len < debug.dst_uaddr) return -EINVAL; - vaddr = debug.src_uaddr; - size = debug.len; - vaddr_end = vaddr + size; - dst_vaddr = debug.dst_uaddr; + for (i = 0; i < debug.len; i += len) { + unsigned long src = debug.src_uaddr + i; + unsigned long dst = debug.dst_uaddr + i; + unsigned long s_off = src & ~PAGE_MASK; + unsigned long d_off = dst & ~PAGE_MASK; + struct page *src_p, *dst_p; + int ret; - for (; vaddr < vaddr_end; vaddr = next_vaddr) { - int len, s_off, d_off; + /* + * Copy as many remaining bytes as possible while staying in a + * single page for both the source and destination. + */ + len = min3(debug.len - i, PAGE_SIZE - s_off, PAGE_SIZE - d_off); - /* lock userspace source and destination page */ - src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0); + /* + * Pin the source and destination pages; firmware operates on + * physical addresses. + */ + src_p = sev_pin_page(kvm, src & PAGE_MASK, 0); if (IS_ERR(src_p)) return PTR_ERR(src_p); - dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE); + dst_p = sev_pin_page(kvm, dst & PAGE_MASK, FOLL_WRITE); if (IS_ERR(dst_p)) { - sev_unpin_memory(kvm, src_p, n); + sev_unpin_page(kvm, src_p); return PTR_ERR(dst_p); } @@ -1387,43 +1394,28 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) * the pages; flush the destination too so that future accesses do not * see stale data. */ - sev_clflush_pages(src_p, 1); - sev_clflush_pages(dst_p, 1); - - /* - * Since user buffer may not be page aligned, calculate the - * offset within the page. - */ - s_off = vaddr & ~PAGE_MASK; - d_off = dst_vaddr & ~PAGE_MASK; - len = min_t(size_t, (PAGE_SIZE - s_off), size); + sev_clflush_pages(&src_p, 1); + sev_clflush_pages(&dst_p, 1); - if (dec) - ret = __sev_dbg_decrypt_user(kvm, - __sme_page_pa(src_p[0]) + s_off, - (void __user *)dst_vaddr, - __sme_page_pa(dst_p[0]) + d_off, - len, &argp->error); + if (IS_ALIGNED(src, 16) && IS_ALIGNED(dst, 16) && IS_ALIGNED(len, 16)) + ret = sev_issue_dbg_cmd(kvm, + __sme_page_pa(src_p) + s_off, + __sme_page_pa(dst_p) + d_off, + len, cmd, &argp->error); + else if (cmd == KVM_SEV_DBG_DECRYPT) + ret = sev_dbg_decrypt_slow(kvm, src, src_p, dst, + len, &argp->error); else - ret = __sev_dbg_encrypt_user(kvm, - __sme_page_pa(src_p[0]) + s_off, - (void __user *)vaddr, - __sme_page_pa(dst_p[0]) + d_off, - (void __user *)dst_vaddr, - len, &argp->error); + ret = sev_dbg_encrypt_slow(kvm, src, dst, dst_p, + len, &argp->error); - sev_unpin_memory(kvm, src_p, n); - sev_unpin_memory(kvm, dst_p, n); + sev_unpin_page(kvm, src_p); + sev_unpin_page(kvm, dst_p); if (ret) - goto err; - - next_vaddr = vaddr + len; - dst_vaddr = dst_vaddr + len; - size -= len; + return ret; } -err: - return ret; + return 0; } static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) @@ -1696,8 +1688,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) struct sev_data_send_update_data data; struct kvm_sev_send_update_data params; void *hdr, *trans_data; - struct page **guest_page; - unsigned long n; + struct page *guest_page; int ret, offset; if (!sev_guest(kvm)) @@ -1721,8 +1712,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) return -EINVAL; /* Pin guest memory */ - guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, - PAGE_SIZE, &n, 0); + guest_page = sev_pin_page(kvm, params.guest_uaddr & PAGE_MASK, 0); if (IS_ERR(guest_page)) return PTR_ERR(guest_page); @@ -1743,7 +1733,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) data.trans_len = params.trans_len; /* The SEND_UPDATE_DATA command requires C-bit to be always set. */ - data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; + data.guest_address = page_to_phys(guest_page) + offset; data.guest_address |= sev_me_mask; data.guest_len = params.guest_len; data.handle = to_kvm_sev_info(kvm)->handle; @@ -1770,8 +1760,7 @@ e_free_trans_data: e_free_hdr: kfree(hdr); e_unpin: - sev_unpin_memory(kvm, guest_page, n); - + sev_unpin_page(kvm, guest_page); return ret; } @@ -1876,8 +1865,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) struct kvm_sev_receive_update_data params; struct sev_data_receive_update_data data; void *hdr = NULL, *trans = NULL; - struct page **guest_page; - unsigned long n; + struct page *guest_page; int ret, offset; if (!sev_guest(kvm)) @@ -1914,8 +1902,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) data.trans_len = params.trans_len; /* Pin guest memory */ - guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, - PAGE_SIZE, &n, FOLL_WRITE); + guest_page = sev_pin_page(kvm, params.guest_uaddr & PAGE_MASK, FOLL_WRITE); if (IS_ERR(guest_page)) { ret = PTR_ERR(guest_page); goto e_free_trans; @@ -1926,10 +1913,10 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) * encrypts the written data with the guest's key, and the cache may * contain dirty, unencrypted data. */ - sev_clflush_pages(guest_page, n); + sev_clflush_pages(&guest_page, 1); /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */ - data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; + data.guest_address = page_to_phys(guest_page) + offset; data.guest_address |= sev_me_mask; data.guest_len = params.guest_len; data.handle = to_kvm_sev_info(kvm)->handle; @@ -1937,7 +1924,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data, &argp->error); - sev_unpin_memory(kvm, guest_page, n); + sev_unpin_page(kvm, guest_page); e_free_trans: kfree(trans); @@ -2361,8 +2348,8 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, memcpy(dst_vaddr, src_vaddr, PAGE_SIZE); - kunmap_local(src_vaddr); kunmap_local(dst_vaddr); + kunmap_local(src_vaddr); } ret = rmp_make_private(pfn, gfn << PAGE_SHIFT, PG_LEVEL_4K, @@ -2396,9 +2383,10 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, void *dst_vaddr = kmap_local_pfn(pfn); memcpy(src_vaddr, dst_vaddr, PAGE_SIZE); + set_page_dirty(src_page); - kunmap_local(src_vaddr); kunmap_local(dst_vaddr); + kunmap_local(src_vaddr); } out: @@ -2470,6 +2458,7 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) sev_populate_args.type = params.type; count = kvm_gmem_populate(kvm, params.gfn_start, src, npages, + params.type == KVM_SEV_SNP_PAGE_TYPE_CPUID, sev_gmem_post_populate, &sev_populate_args); if (count < 0) { argp->error = sev_populate_args.fw_error; @@ -2690,10 +2679,8 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) r = sev_guest_status(kvm, &sev_cmd); break; case KVM_SEV_DBG_DECRYPT: - r = sev_dbg_crypt(kvm, &sev_cmd, true); - break; case KVM_SEV_DBG_ENCRYPT: - r = sev_dbg_crypt(kvm, &sev_cmd, false); + r = sev_dbg_crypt(kvm, &sev_cmd, sev_cmd.id); break; case KVM_SEV_LAUNCH_SECRET: r = sev_launch_secret(kvm, &sev_cmd); @@ -3014,18 +3001,14 @@ void sev_vm_destroy(struct kvm *kvm) void __init sev_set_cpu_caps(void) { - if (sev_enabled) { + if (sev_enabled) kvm_cpu_cap_set(X86_FEATURE_SEV); - kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_VM); - } - if (sev_es_enabled) { + + if (sev_es_enabled) kvm_cpu_cap_set(X86_FEATURE_SEV_ES); - kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM); - } - if (sev_snp_enabled) { + + if (sev_snp_enabled) kvm_cpu_cap_set(X86_FEATURE_SEV_SNP); - kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM); - } } static bool is_sev_snp_initialized(void) @@ -3055,6 +3038,11 @@ out: return initialized; } +static const char * __init sev_str_feature_state(bool is_supported, bool is_usable) +{ + return is_supported ? is_usable ? "enabled" : "unusable" : "disabled"; +} + void __init sev_hardware_setup(void) { unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; @@ -3062,6 +3050,7 @@ void __init sev_hardware_setup(void) bool sev_snp_supported = false; bool sev_es_supported = false; bool sev_supported = false; + u32 vm_types = 0; if (!sev_enabled || !npt_enabled || !nrips) goto out; @@ -3195,21 +3184,27 @@ out: } } + if (sev_supported && min_sev_asid <= max_sev_asid) + vm_types |= BIT(KVM_X86_SEV_VM); + if (sev_es_supported && min_sev_es_asid <= max_sev_es_asid) + vm_types |= BIT(KVM_X86_SEV_ES_VM); + if (sev_snp_supported) + vm_types |= BIT(KVM_X86_SNP_VM); + vm_types &= sev_firmware_supported_vm_types(); + + kvm_caps.supported_vm_types |= vm_types; + if (boot_cpu_has(X86_FEATURE_SEV)) pr_info("SEV %s (ASIDs %u - %u)\n", - sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" : - "unusable" : - "disabled", + sev_str_feature_state(sev_supported, vm_types & BIT(KVM_X86_SEV_VM)), min_sev_asid, max_sev_asid); if (boot_cpu_has(X86_FEATURE_SEV_ES)) pr_info("SEV-ES %s (ASIDs %u - %u)\n", - sev_es_supported ? min_sev_es_asid <= max_sev_es_asid ? "enabled" : - "unusable" : - "disabled", + sev_str_feature_state(sev_es_supported, vm_types & BIT(KVM_X86_SEV_ES_VM)), min_sev_es_asid, max_sev_es_asid); if (boot_cpu_has(X86_FEATURE_SEV_SNP)) pr_info("SEV-SNP %s (ASIDs %u - %u)\n", - str_enabled_disabled(sev_snp_supported), + sev_str_feature_state(sev_snp_supported, vm_types & BIT(KVM_X86_SNP_VM)), min_snp_asid, max_snp_asid); sev_enabled = sev_supported; @@ -3782,9 +3777,13 @@ static int snp_rmptable_psmash(kvm_pfn_t pfn) static int snp_complete_psc_msr(struct kvm_vcpu *vcpu) { + u64 hypercall_ret = READ_ONCE(vcpu->run->hypercall.ret); struct vcpu_svm *svm = to_svm(vcpu); - if (vcpu->run->hypercall.ret) + if (!kvm_is_valid_map_gpa_range_ret(hypercall_ret)) + return -EINVAL; + + if (hypercall_ret) set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); else set_ghcb_msr(svm, GHCB_MSR_PSC_RESP); @@ -3875,10 +3874,14 @@ static void __snp_complete_one_psc(struct vcpu_svm *svm) static int snp_complete_one_psc(struct kvm_vcpu *vcpu) { + u64 hypercall_ret = READ_ONCE(vcpu->run->hypercall.ret); struct vcpu_svm *svm = to_svm(vcpu); struct psc_buffer *psc = svm->sev_es.ghcb_sa; - if (vcpu->run->hypercall.ret) { + if (!kvm_is_valid_map_gpa_range_ret(hypercall_ret)) + return -EINVAL; + + if (hypercall_ret) { snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); return 1; /* resume guest */ } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index adfa9ff48c573..a5ec2e66a80e5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -266,6 +266,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) set_exception_intercept(svm, GP_VECTOR); } + svm_pmu_handle_nested_transition(svm); kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); } @@ -334,7 +335,7 @@ done: return 1; } -static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) +int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { return __svm_skip_emulated_instruction(vcpu, EMULTYPE_SKIP, true); } @@ -2798,6 +2799,20 @@ static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, !msr_write_intercepted(to_svm(vcpu), msr_info->index); } +static bool svm_pat_accesses_gpat(struct kvm_vcpu *vcpu, bool from_host) +{ + /* + * When KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT is disabled and nested + * NPT is enabled, L2 has a separate PAT from L1. Guest accesses + * to IA32_PAT while running L2 target L2's gPAT; host-initiated + * accesses always target L1's hPAT so that KVM_GET/SET_MSRS and + * KVM_GET/SET_NESTED_STATE are independent of each other and can + * be ordered arbitrarily during save and restore. + */ + WARN_ON_ONCE(from_host && vcpu->wants_to_run); + return !from_host && is_guest_mode(vcpu) && l2_has_separate_pat(vcpu); +} + static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2914,6 +2929,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_AMD64_DE_CFG: msr_info->data = svm->msr_decfg; break; + case MSR_IA32_CR_PAT: + if (svm_pat_accesses_gpat(vcpu, msr_info->host_initiated)) { + msr_info->data = svm->vmcb->save.g_pat; + break; + } + return kvm_get_msr_common(vcpu, msr_info); default: return kvm_get_msr_common(vcpu, msr_info); } @@ -2997,14 +3018,23 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_CR_PAT: + if (svm_pat_accesses_gpat(vcpu, msr->host_initiated)) { + if (!kvm_pat_valid(data)) + return 1; + + vmcb_set_gpat(svm->vmcb, data); + break; + } + ret = kvm_set_msr_common(vcpu, msr); if (ret) break; - svm->vmcb01.ptr->save.g_pat = data; - if (is_guest_mode(vcpu)) - nested_vmcb02_compute_g_pat(svm); - vmcb_mark_dirty(svm->vmcb, VMCB_NPT); + if (npt_enabled) { + vmcb_set_gpat(svm->vmcb01.ptr, data); + if (is_guest_mode(vcpu) && !l2_has_separate_pat(vcpu)) + vmcb_set_gpat(svm->vmcb, data); + } break; case MSR_IA32_SPEC_CTRL: if (!msr->host_initiated && @@ -3675,13 +3705,8 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) struct vcpu_svm *svm = to_svm(vcpu); struct kvm_run *kvm_run = vcpu->run; - /* SEV-ES guests must use the CR write traps to track CR registers. */ - if (!is_sev_es_guest(vcpu)) { - if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE)) - vcpu->arch.cr0 = svm->vmcb->save.cr0; - if (npt_enabled) - vcpu->arch.cr3 = svm->vmcb->save.cr3; - } + if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE)) + return 0; if (is_guest_mode(vcpu)) { int vmexit; @@ -4536,11 +4561,17 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) x86_spec_ctrl_restore_host(svm->virt_spec_ctrl); + /* SEV-ES guests must use the CR write traps to track CR registers. */ if (!is_sev_es_guest(vcpu)) { vcpu->arch.cr2 = svm->vmcb->save.cr2; vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.rip = svm->vmcb->save.rip; + + if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE)) + vcpu->arch.cr0 = svm->vmcb->save.cr0; + if (npt_enabled) + vcpu->arch.cr3 = svm->vmcb->save.cr3; } kvm_reset_dirty_registers(vcpu); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 19b80ef56e2b7..87c6b105deef6 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -24,6 +24,8 @@ #include "cpuid.h" #include "kvm_cache_regs.h" +#include "x86.h" +#include "pmu.h" /* * Helpers to convert to/from physical addresses for pages whose address is @@ -166,6 +168,7 @@ struct vmcb_save_area_cached { u64 isst_addr; u64 rax; u64 cr2; + u64 g_pat; u64 dbgctl; u64 br_from; u64 br_to; @@ -464,6 +467,12 @@ static inline bool vmcb12_is_dirty(struct vmcb_ctrl_area_cached *control, int bi return !test_bit(bit, (unsigned long *)&control->clean); } +static inline void vmcb_set_gpat(struct vmcb *vmcb, u64 data) +{ + vmcb->save.g_pat = data; + vmcb_mark_dirty(vmcb, VMCB_NPT); +} + static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_svm, vcpu); @@ -641,6 +650,16 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) return svm->nested.ctl.misc_ctl & SVM_MISC_ENABLE_NP; } +static inline bool l2_has_separate_pat(struct kvm_vcpu *vcpu) +{ + /* + * If KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT is disabled while a vCPU + * is running, the L2 IA32_PAT semantics for that vCPU are undefined. + */ + return nested_npt_enabled(to_svm(vcpu)) && + !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT); +} + static inline bool nested_vnmi_enabled(struct vcpu_svm *svm) { return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_VNMI) && @@ -814,6 +833,8 @@ static inline void svm_enable_intercept_for_msr(struct kvm_vcpu *vcpu, svm_set_intercept_for_msr(vcpu, msr, type, true); } +int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu); + /* nested.c */ #define NESTED_EXIT_HOST 0 /* Exit handled on host level */ @@ -875,9 +896,30 @@ void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm, struct vmcb_save_area *save); void nested_sync_control_from_vmcb02(struct vcpu_svm *svm); -void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm); void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb); + +static inline void __svm_pmu_handle_nested_transition(struct vcpu_svm *svm, + bool defer) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(&svm->vcpu); + u64 counters = *(u64 *)pmu->pmc_has_mode_specific_enables; + + __kvm_pmu_reprogram_counters(pmu, counters, defer); +} + +static inline void svm_pmu_handle_nested_transition(struct vcpu_svm *svm) +{ + /* + * Do NOT defer reprogramming the counters by default. Instructions + * causing a state change are counted based on the _new_ CPU state + * (e.g. a successful VMRUN is counted in guest mode). Hence, the + * counters should be reprogrammed with the new state _before_ the + * instruction is potentially counted upon emulation completion. + */ + __svm_pmu_handle_nested_transition(svm, false); +} + extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 4690a4d23709d..30dcabc899a29 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -411,7 +411,8 @@ static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, } static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, - struct x86_exception *fault) + struct x86_exception *fault, + bool from_hardware) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -444,13 +445,29 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, exit_qualification = 0; } else { u64 mask = EPT_VIOLATION_GVA_IS_VALID | - EPT_VIOLATION_GVA_TRANSLATED; + EPT_VIOLATION_GVA_TRANSLATED; + if (vmx->nested.msrs.ept_caps & VMX_EPT_ADVANCED_VMEXIT_INFO_BIT) mask |= EPT_VIOLATION_GVA_USER | - EPT_VIOLATION_GVA_WRITABLE | - EPT_VIOLATION_GVA_NX; - exit_qualification = fault->exit_qualification; - exit_qualification |= vmx_get_exit_qual(vcpu) & mask; + EPT_VIOLATION_GVA_WRITABLE | + EPT_VIOLATION_GVA_NX; + + exit_qualification = fault->exit_qualification & ~mask; + + /* + * Use the EXIT_QUALIFICATION from the VMCS if and only + * if the hardware VM-Exit from L2 was an EPT Violation. + * If the fault is synthesized, then EXIT_QUALIFICATION + * is stale and/or holds entirely different data. And + * conversely, KVM _must_ rely on EXIT_QUALIFICATION if + * the fault came from hardware, because KVM only sees + * and walks the faulting GPA. + */ + if (from_hardware) + exit_qualification |= vmx_get_exit_qual(vcpu) & mask; + else + exit_qualification |= fault->exit_qualification & mask; + vm_exit_reason = EXIT_REASON_EPT_VIOLATION; } @@ -6535,6 +6552,8 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, nested_evmcs_l2_tlb_flush_enabled(vcpu) && kvm_hv_is_tlb_flush_hcall(vcpu); #endif + case EXIT_REASON_CPUID: + return !kvm_is_cpuid_allowed(vcpu); default: break; } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 74e0b01185b80..e741e6473b5e5 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -309,13 +309,15 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu, */ local_irq_disable(); if (lbr_desc->event->state == PERF_EVENT_STATE_ACTIVE) { + int err = 0; + if (read) rdmsrq(index, msr_info->data); else - wrmsrq(index, msr_info->data); + err = wrmsrq_safe(index, msr_info->data); __set_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use); local_irq_enable(); - return true; + return !err; } clear_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use); local_irq_enable(); @@ -392,7 +394,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (pmu->pebs_enable != data) { diff = pmu->pebs_enable ^ data; pmu->pebs_enable = data; - reprogram_counters(pmu, diff); + kvm_pmu_request_counters_reprogram(pmu, diff); } break; case MSR_IA32_DS_AREA: diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index b187ef9a6ae46..8a88de071a2ff 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -343,7 +343,7 @@ static int tdx_reclaim_page(struct page *page) r = __tdx_reclaim_page(page); if (!r) - tdx_quirk_reset_page(page); + tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE); return r; } @@ -587,7 +587,7 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm) if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm)) return; - tdx_quirk_reset_page(kvm_tdx->td.tdr_page); + tdx_quirk_reset_paddr(page_to_phys(kvm_tdx->td.tdr_page), PAGE_SIZE); __free_page(kvm_tdx->td.tdr_page); kvm_tdx->td.tdr_page = NULL; @@ -629,6 +629,12 @@ int tdx_vm_init(struct kvm *kvm) kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT; /* + * PMU support is provided by the TDX-Module (if enabled for the VM). + * From KVM's perspective, the VM doesn't have a virtual PMU. + */ + kvm->arch.has_protected_pmu = true; + + /* * Because guest TD is protected, VMM can't parse the instruction in TD. * Instead, guest uses MMIO hypercall. For unmodified device driver, * #VE needs to be injected for MMIO and #VE handler in TD converts MMIO @@ -1172,12 +1178,22 @@ static void __tdx_map_gpa(struct vcpu_tdx *tdx); static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu) { + u64 hypercall_ret = READ_ONCE(vcpu->run->hypercall.ret); struct vcpu_tdx *tdx = to_tdx(vcpu); + long rc; - if (vcpu->run->hypercall.ret) { - tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); - tdx->vp_enter_args.r11 = tdx->map_gpa_next; - return 1; + switch (hypercall_ret) { + case 0: + break; + case EAGAIN: + rc = TDVMCALL_STATUS_RETRY; + goto propagate_error; + case EINVAL: + rc = TDVMCALL_STATUS_INVALID_OPERAND; + goto propagate_error; + default: + WARN_ON_ONCE(kvm_is_valid_map_gpa_range_ret(hypercall_ret)); + return -EINVAL; } tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN; @@ -1190,13 +1206,17 @@ static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu) * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt(). */ if (kvm_vcpu_has_events(vcpu)) { - tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY); - tdx->vp_enter_args.r11 = tdx->map_gpa_next; - return 1; + rc = TDVMCALL_STATUS_RETRY; + goto propagate_error; } __tdx_map_gpa(tdx); return 0; + +propagate_error: + tdvmcall_set_return_code(vcpu, rc); + tdx->vp_enter_args.r11 = tdx->map_gpa_next; + return 1; } static void __tdx_map_gpa(struct vcpu_tdx *tdx) @@ -1614,8 +1634,8 @@ static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level, KVM_BUG_ON(!kvm_tdx->page_add_src, kvm)) return -EIO; - err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), - kvm_tdx->page_add_src, &entry, &level_state); + err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn, kvm_tdx->page_add_src, + &entry, &level_state); if (unlikely(tdx_operand_busy(err))) return -EBUSY; @@ -1628,14 +1648,12 @@ static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level, static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, enum pg_level level, kvm_pfn_t pfn) { - int tdx_level = pg_level_to_tdx_sept_level(level); struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - struct page *page = pfn_to_page(pfn); gpa_t gpa = gfn_to_gpa(gfn); u64 entry, level_state; u64 err; - err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state); + err = tdh_mem_page_aug(&kvm_tdx->td, gpa, level, pfn, &entry, &level_state); if (unlikely(tdx_operand_busy(err))) return -EBUSY; @@ -1645,18 +1663,52 @@ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, return 0; } -static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, - enum pg_level level, u64 mirror_spte) +static struct page *tdx_spte_to_sept_pt(struct kvm *kvm, gfn_t gfn, + u64 new_spte, enum pg_level level) +{ + struct kvm_mmu_page *sp = spte_to_child_sp(new_spte); + + if (KVM_BUG_ON(!sp->external_spt, kvm) || + KVM_BUG_ON(sp->role.level + 1 != level, kvm) || + KVM_BUG_ON(sp->gfn != gfn, kvm)) + return NULL; + + return virt_to_page(sp->external_spt); +} + +static int tdx_sept_map_nonleaf_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, u64 new_spte) +{ + gpa_t gpa = gfn_to_gpa(gfn); + u64 err, entry, level_state; + struct page *sept_pt; + + sept_pt = tdx_spte_to_sept_pt(kvm, gfn, new_spte, level); + if (!sept_pt) + return -EIO; + + err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, level, sept_pt, + &entry, &level_state); + if (unlikely(tdx_operand_busy(err))) + return -EBUSY; + + if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm)) + return -EIO; + + return 0; +} + +static int tdx_sept_map_leaf_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level, + u64 new_spte) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - kvm_pfn_t pfn = spte_to_pfn(mirror_spte); + kvm_pfn_t pfn = spte_to_pfn(new_spte); /* TODO: handle large pages. */ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) return -EIO; - WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) || - (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK); + WARN_ON_ONCE((new_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK); /* * Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory() @@ -1676,25 +1728,6 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, return tdx_mem_page_aug(kvm, gfn, level, pfn); } -static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, - enum pg_level level, void *private_spt) -{ - int tdx_level = pg_level_to_tdx_sept_level(level); - gpa_t gpa = gfn_to_gpa(gfn); - struct page *page = virt_to_page(private_spt); - u64 err, entry, level_state; - - err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry, - &level_state); - if (unlikely(tdx_operand_busy(err))) - return -EBUSY; - - if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm)) - return -EIO; - - return 0; -} - /* * Ensure shared and private EPTs to be flushed on all vCPUs. * tdh_mem_track() is the only caller that increases TD epoch. An increase in @@ -1741,35 +1774,11 @@ static void tdx_track(struct kvm *kvm) kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); } -static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, - enum pg_level level, void *private_spt) -{ - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - - /* - * free_external_spt() is only called after hkid is freed when TD is - * tearing down. - * KVM doesn't (yet) zap page table pages in mirror page table while - * TD is active, though guest pages mapped in mirror page table could be - * zapped during TD is active, e.g. for shared <-> private conversion - * and slot move/deletion. - */ - if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm)) - return -EIO; - - /* - * The HKID assigned to this TD was already freed and cache was - * already flushed. We don't have to flush again. - */ - return tdx_reclaim_page(virt_to_page(private_spt)); -} - -static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, - enum pg_level level, u64 mirror_spte) +static int tdx_sept_remove_leaf_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, u64 old_spte) { - struct page *page = pfn_to_page(spte_to_pfn(mirror_spte)); - int tdx_level = pg_level_to_tdx_sept_level(level); struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + kvm_pfn_t pfn = spte_to_pfn(old_spte); gpa_t gpa = gfn_to_gpa(gfn); u64 err, entry, level_state; @@ -1781,16 +1790,16 @@ static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, * there can't be anything populated in the private EPT. */ if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) - return; + return -EIO; /* TODO: handle large pages. */ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) - return; + return -EIO; err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa, - tdx_level, &entry, &level_state); + level, &entry, &level_state); if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm)) - return; + return -EIO; /* * TDX requires TLB tracking before dropping private page. Do @@ -1804,15 +1813,82 @@ static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs. */ err = tdh_do_no_vcpus(tdh_mem_page_remove, kvm, &kvm_tdx->td, gpa, - tdx_level, &entry, &level_state); + level, &entry, &level_state); if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm)) - return; + return -EIO; - err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page); + err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, pfn); if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm)) - return; + return -EIO; - tdx_quirk_reset_page(page); + tdx_quirk_reset_paddr(PFN_PHYS(pfn), PAGE_SIZE); + return 0; +} + +/* + * Handle changes for + * (1) leaf SPTEs from non-present to present + * (2) non-leaf SPTEs from non-present to present + * (3) leaf SPTEs from present to non-present + * + * - (1) and (2) must be under shared mmu_lock. If (1) and (2) are under + * exclusive mmu_lock (currently impossible), contention errors may lead to + * KVM_BUG_ON() in handle_changed_spte(), e.g., due to tdx_mem_page_aug(), + * tdx_mem_page_add(), or tdh_mem_sept_add() contending with tdh_vp_enter() + * due to zero-step mitigation or contending with TDCALLs. + * - (3) must be under write mmu_lock. If (3) is under shared mmu_lock + * (currently impossible), warnings will be generated due to + * lockdep_assert_held_write() or TDX_BUG_ON() caused by concurrent BLOCK, + * TRACK, REMOVE. + * - Promotion/demotion is not yet supported. + */ +static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte, + u64 new_spte, enum pg_level level) +{ + lockdep_assert_held(&kvm->mmu_lock); + + if (is_shadow_present_pte(old_spte)) + return tdx_sept_remove_leaf_spte(kvm, gfn, level, old_spte); + + if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm)) + return -EIO; + + if (!is_last_spte(new_spte, level)) + return tdx_sept_map_nonleaf_spte(kvm, gfn, level, new_spte); + + return tdx_sept_map_leaf_spte(kvm, gfn, level, new_spte); +} + +/* + * Handle changes for non-leaf SPTEs from present to non-present. + * Must be under exclusive mmu_lock and cannot fail. + */ +static void tdx_sept_free_private_spt(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + /* + * KVM doesn't (yet) zap page table pages in mirror page table while + * TD is active, though guest pages mapped in mirror page table could be + * zapped during TD is active, e.g. for shared <-> private conversion + * and slot move/deletion. + * + * In other words, KVM should only free mirror page tables after the + * TD's hkid is freed, when the TD is being torn down. + * + * If the S-EPT PTE can't be removed for any reason, intentionally leak + * the page to prevent the kernel from accessing the encrypted page. + */ + if (KVM_BUG_ON(is_hkid_assigned(to_kvm_tdx(kvm)), kvm) || + tdx_reclaim_page(virt_to_page(sp->external_spt))) + goto out; + + /* + * Immediately free the S-EPT page because RCU-time free is unnecessary + * after TDH.PHYMEM.PAGE.RECLAIM ensures there are no outstanding + * readers. + */ + free_page((unsigned long)sp->external_spt); +out: + sp->external_spt = NULL; } void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, @@ -2106,23 +2182,29 @@ bool tdx_has_emulated_msr(u32 index) case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: /* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */ case MSR_KVM_POLL_CONTROL: + /* + * Except for x2APIC registers that are virtualized by the CPU, which + * KVM can't emulate as KVM doesn't have access to the virtual APIC + * page, KVM emulates the same set of x2APIC registers for TDX versus + * non-TDX guests. + */ + case X2APIC_MSR(APIC_ID): + case X2APIC_MSR(APIC_LVR): + case X2APIC_MSR(APIC_LDR): + case X2APIC_MSR(APIC_SPIV): + case X2APIC_MSR(APIC_ESR): + case X2APIC_MSR(APIC_LVTCMCI): + case X2APIC_MSR(APIC_ICR): + case X2APIC_MSR(APIC_LVTT): + case X2APIC_MSR(APIC_LVTTHMR): + case X2APIC_MSR(APIC_LVTPC): + case X2APIC_MSR(APIC_LVT0): + case X2APIC_MSR(APIC_LVT1): + case X2APIC_MSR(APIC_LVTERR): + case X2APIC_MSR(APIC_TMICT): + case X2APIC_MSR(APIC_TMCCT): + case X2APIC_MSR(APIC_TDCR): return true; - case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: - /* - * x2APIC registers that are virtualized by the CPU can't be - * emulated, KVM doesn't have access to the virtual APIC page. - */ - switch (index) { - case X2APIC_MSR(APIC_TASKPRI): - case X2APIC_MSR(APIC_PROCPRI): - case X2APIC_MSR(APIC_EOI): - case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR): - case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR): - case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR): - return false; - default: - return true; - } default: return false; } @@ -2377,20 +2459,20 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, ret = -ENOMEM; - tdr_page = alloc_page(GFP_KERNEL); + tdr_page = alloc_page(GFP_KERNEL_ACCOUNT); if (!tdr_page) goto free_hkid; kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE; /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */ kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1; - tdcs_pages = kzalloc_objs(*kvm_tdx->td.tdcs_pages, - kvm_tdx->td.tdcs_nr_pages); + tdcs_pages = kzalloc_objs(*kvm_tdx->td.tdcs_pages, kvm_tdx->td.tdcs_nr_pages, + GFP_KERNEL_ACCOUNT); if (!tdcs_pages) goto free_tdr; for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { - tdcs_pages[i] = alloc_page(GFP_KERNEL); + tdcs_pages[i] = alloc_page(GFP_KERNEL_ACCOUNT); if (!tdcs_pages[i]) goto free_tdcs; } @@ -2775,7 +2857,7 @@ void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) void tdx_flush_tlb_all(struct kvm_vcpu *vcpu) { /* - * TDX has called tdx_track() in tdx_sept_remove_private_spte() to + * TDX has called tdx_track() in tdx_sept_remove_leaf_spte() to * ensure that private EPT will be flushed on the next TD enter. No need * to call tdx_track() here again even when this callback is a result of * zapping private EPT. @@ -2865,7 +2947,7 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) int ret, i; u64 err; - page = alloc_page(GFP_KERNEL); + page = alloc_page(GFP_KERNEL_ACCOUNT); if (!page) return -ENOMEM; tdx->vp.tdvpr_page = page; @@ -2878,14 +2960,14 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) tdx->vp.tdvpr_pa = page_to_phys(tdx->vp.tdvpr_page); tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!tdx->vp.tdcx_pages) { ret = -ENOMEM; goto free_tdvpr; } for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { - page = alloc_page(GFP_KERNEL); + page = alloc_page(GFP_KERNEL_ACCOUNT); if (!page) { ret = -ENOMEM; goto free_tdcx; @@ -3175,7 +3257,7 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c }; gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa), u64_to_user_ptr(region.source_addr), - 1, tdx_gmem_post_populate, &arg); + 1, false, tdx_gmem_post_populate, &arg); if (gmem_ret < 0) { ret = gmem_ret; break; @@ -3405,10 +3487,8 @@ int __init tdx_hardware_setup(void) vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx)); - vt_x86_ops.link_external_spt = tdx_sept_link_private_spt; vt_x86_ops.set_external_spte = tdx_sept_set_private_spte; vt_x86_ops.free_external_spt = tdx_sept_free_private_spt; - vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte; vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt; return 0; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e5cfe4d12c479..5f171d6943dd4 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1913,6 +1913,24 @@ void vmx_inject_exception(struct kvm_vcpu *vcpu) u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; struct vcpu_vmx *vmx = to_vmx(vcpu); + /* + * When injecting a #DB, single-stepping is enabled in RFLAGS, and STI + * or MOV-SS blocking is active, set vmcs.PENDING_DBG_EXCEPTIONS.BS to + * prevent a false positive from VM-Entry consistency check. VM-Entry + * asserts that a single-step #DB _must_ be pending in this scenario, + * as the previous instruction cannot have toggled RFLAGS.TF 0=>1 + * (because STI and POP/MOV don't modify RFLAGS), therefore the one + * instruction delay when activating single-step breakpoints must have + * already expired. However, the CPU isn't smart enough to peek at + * vmcs.VM_ENTRY_INTR_INFO_FIELD and so doesn't realize that yes, there + * is indeed a #DB pending/imminent. + */ + if (ex->vector == DB_VECTOR && + (vmx_get_rflags(vcpu) & X86_EFLAGS_TF) && + vmx_get_interrupt_shadow(vcpu)) + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS); + kvm_deliver_exception_payload(vcpu, ex); if (ex->has_error_code) { @@ -4157,7 +4175,7 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) * mode, only the current timer count needs on-demand emulation by KVM. */ if (mode & MSR_BITMAP_MODE_X2APIC_APICV) - msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic); + msr_bitmap[read_idx] = ~kvm_x2apic_disable_read_intercept_reg_mask(vcpu); else msr_bitmap[read_idx] = ~0ull; msr_bitmap[write_idx] = ~0ull; @@ -4170,7 +4188,6 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) !(mode & MSR_BITMAP_MODE_X2APIC)); if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { - vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); if (enable_ipiv) @@ -5496,26 +5513,9 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) * avoid single-step #DB and MTF updates, as ICEBP is * higher priority. Note, skipping ICEBP still clears * STI and MOVSS blocking. - * - * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS - * if single-step is enabled in RFLAGS and STI or MOVSS - * blocking is active, as the CPU doesn't set the bit - * on VM-Exit due to #DB interception. VM-Entry has a - * consistency check that a single-step #DB is pending - * in this scenario as the previous instruction cannot - * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV - * don't modify RFLAGS), therefore the one instruction - * delay when activating single-step breakpoints must - * have already expired. Note, the CPU sets/clears BS - * as appropriate for all other VM-Exits types. */ if (is_icebp(intr_info)) WARN_ON(!skip_emulated_instruction(vcpu)); - else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) && - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS))) - vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, - vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS); kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); return 1; @@ -6716,6 +6716,9 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (enable_pml && !is_guest_mode(vcpu)) vmx_flush_pml_buffer(vcpu); + if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE)) + return 0; + /* * KVM should never reach this point with a pending nested VM-Enter. * More specifically, short-circuiting VM-Entry to emulate L2 due to diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0ac9d94e51776..ae8e911ef4a1d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -133,7 +133,6 @@ static void process_nmi(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); static int sync_regs(struct kvm_vcpu *vcpu); -static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu); static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); @@ -152,6 +151,7 @@ struct kvm_x86_ops kvm_x86_ops __read_mostly; #include <asm/kvm-x86-ops.h> EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits); EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg); +EXPORT_STATIC_CALL_GPL(kvm_x86_get_cpl); static bool __read_mostly ignore_msrs = 0; module_param(ignore_msrs, bool, 0644); @@ -182,15 +182,6 @@ module_param(force_emulation_prefix, int, 0644); int __read_mostly pi_inject_timer = -1; module_param(pi_inject_timer, bint, 0644); -/* Enable/disable PMU virtualization */ -bool __read_mostly enable_pmu = true; -EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_pmu); -module_param(enable_pmu, bool, 0444); - -/* Enable/disabled mediated PMU virtualization. */ -bool __read_mostly enable_mediated_pmu; -EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_mediated_pmu); - bool __read_mostly eager_page_split = true; module_param(eager_page_split, bool, 0644); @@ -970,7 +961,8 @@ static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err) EMULTYPE_COMPLETE_USER_EXIT); } -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) +void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault, + bool from_hardware) { ++vcpu->stat.pf_guest; @@ -987,8 +979,9 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) fault->address); } -void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, - struct x86_exception *fault) +void __kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault, + bool from_hardware) { struct kvm_mmu *fault_mmu; WARN_ON_ONCE(fault->vector != PF_VECTOR); @@ -1005,9 +998,9 @@ void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address, KVM_MMU_ROOT_CURRENT); - fault_mmu->inject_page_fault(vcpu, fault); + fault_mmu->inject_page_fault(vcpu, fault, from_hardware); } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_inject_emulated_page_fault); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_inject_emulated_page_fault); void kvm_inject_nmi(struct kvm_vcpu *vcpu) { @@ -1021,18 +1014,6 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_queue_exception_e); -/* - * Checks if cpl <= required_cpl; if true, return true. Otherwise queue - * a #GP and return false. - */ -bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) -{ - if (kvm_x86_call(get_cpl)(vcpu) <= required_cpl) - return true; - kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return false; -} - bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) { if ((dr != 4 && dr != 5) || !kvm_is_cr4_bit_set(vcpu, X86_CR4_DE)) @@ -1043,11 +1024,16 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_require_dr); -static bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) +static bool __kvm_pv_async_pf_enabled(u64 data) { u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; - return (vcpu->arch.apf.msr_en_val & mask) == mask; + return (data & mask) == mask; +} + +static bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) +{ + return __kvm_pv_async_pf_enabled(vcpu->arch.apf.msr_en_val); } static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) @@ -1601,6 +1587,14 @@ unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_dr); +static unsigned long kvm_get_effective_dr7(struct kvm_vcpu *vcpu) +{ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + return vcpu->arch.guest_debug_dr7; + + return vcpu->arch.dr7; +} + int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) { u32 pmc = kvm_rcx_read(vcpu); @@ -3648,23 +3642,19 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) if (!lapic_in_kernel(vcpu)) return data ? 1 : 0; + if (__kvm_pv_async_pf_enabled(data) && + kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, + sizeof(u64))) + return 1; + vcpu->arch.apf.msr_en_val = data; - if (!kvm_pv_async_pf_enabled(vcpu)) { + if (__kvm_pv_async_pf_enabled(data)) { + kvm_async_pf_wakeup_all(vcpu); + } else { kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); - return 0; } - - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, - sizeof(u64))) - return 1; - - vcpu->arch.apf.send_always = (data & KVM_ASYNC_PF_SEND_ALWAYS); - vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; - - kvm_async_pf_wakeup_all(vcpu); - return 0; } @@ -4003,22 +3993,28 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_EFER: return set_efer(vcpu, msr_info); - case MSR_K7_HWCR: - data &= ~(u64)0x40; /* ignore flush filter disable */ - data &= ~(u64)0x100; /* ignore ignne emulation enable */ - data &= ~(u64)0x8; /* ignore TLB cache disable */ - + case MSR_K7_HWCR: { /* * Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2 * through at least v6.6 whine if TscFreqSel is clear, * depending on F/M/S. */ - if (data & ~(BIT_ULL(18) | BIT_ULL(24))) { + u64 valid = BIT_ULL(18) | BIT_ULL(24); + + data &= ~(u64)0x40; /* ignore flush filter disable */ + data &= ~(u64)0x100; /* ignore ignne emulation enable */ + data &= ~(u64)0x8; /* ignore TLB cache disable */ + + if (guest_cpu_cap_has(vcpu, X86_FEATURE_GP_ON_USER_CPUID)) + valid |= MSR_K7_HWCR_CPUID_USER_DIS; + + if (data & ~valid) { kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } vcpu->arch.msr_hwcr = data; break; + } case MSR_FAM10H_MMIO_CONF_BASE: if (data != 0) { kvm_pr_unimpl_wrmsr(vcpu, msr, data); @@ -4265,7 +4261,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_MISC_FEATURES_ENABLES: if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT || (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT && - !supports_cpuid_fault(vcpu))) + !(vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT))) return 1; vcpu->arch.msr_misc_features_enables = data; break; @@ -5228,8 +5224,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * On a host with synchronized TSC, there is no need to update * kvmclock on vcpu->cpu migration */ - if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) - kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); + if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) { + if (__ratelimit(&vcpu->kvm->arch.kvmclock_update_rs)) + kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); + else + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + } + if (vcpu->cpu != cpu) kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu); vcpu->cpu = cpu; @@ -6910,6 +6911,10 @@ disable_exits_unlock: if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK)) break; + if (kvm->arch.has_protected_pmu && + cap->args[0] != KVM_PMU_CAP_DISABLE) + break; + mutex_lock(&kvm->lock); if (!kvm->created_vcpus && !kvm->arch.created_mediated_pmu) { kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE); @@ -8548,6 +8553,11 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt)); } +static unsigned long emulator_get_effective_dr7(struct x86_emulate_ctxt *ctxt) +{ + return kvm_get_effective_dr7(emul_to_vcpu(ctxt)); +} + static unsigned long emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr) { return kvm_get_dr(emul_to_vcpu(ctxt), dr); @@ -8805,6 +8815,11 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, &ctxt->exception); } +static bool emulator_is_cpuid_allowed(struct x86_emulate_ctxt *ctxt) +{ + return kvm_is_cpuid_allowed(emul_to_vcpu(ctxt)); +} + static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only) @@ -8930,6 +8945,7 @@ static const struct x86_emulate_ops emulate_ops = { .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, .cpl = emulator_get_cpl, + .get_effective_dr7 = emulator_get_effective_dr7, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, .set_msr_with_filter = emulator_set_msr_with_filter, @@ -8941,6 +8957,7 @@ static const struct x86_emulate_ops emulate_ops = { .wbinvd = emulator_wbinvd, .fix_hypercall = emulator_fix_hypercall, .intercept = emulator_intercept, + .is_cpuid_allowed = emulator_is_cpuid_allowed, .get_cpuid = emulator_get_cpuid, .guest_has_movbe = emulator_guest_has_movbe, .guest_has_fxsr = emulator_guest_has_fxsr, @@ -8976,17 +8993,36 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) } } -static void inject_emulated_exception(struct kvm_vcpu *vcpu) +static int kvm_inject_emulated_db(struct kvm_vcpu *vcpu, unsigned long dr6) { - struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + struct kvm_run *kvm_run = vcpu->run; - if (ctxt->exception.vector == PF_VECTOR) - kvm_inject_emulated_page_fault(vcpu, &ctxt->exception); - else if (ctxt->exception.error_code_valid) - kvm_queue_exception_e(vcpu, ctxt->exception.vector, - ctxt->exception.error_code); + if (vcpu->guest_debug & (KVM_GUESTDBG_USE_HW_BP | KVM_GUESTDBG_SINGLESTEP)) { + kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; + kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + return 0; + } + + kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); + return 1; +} + +static int inject_emulated_exception(struct kvm_vcpu *vcpu) +{ + struct x86_exception *ex = &vcpu->arch.emulate_ctxt->exception; + + if (ex->vector == DB_VECTOR) + return kvm_inject_emulated_db(vcpu, ex->dr6); + + if (ex->vector == PF_VECTOR) + kvm_inject_emulated_page_fault(vcpu, ex); + else if (ex->error_code_valid) + kvm_queue_exception_e(vcpu, ex->vector, ex->error_code); else - kvm_queue_exception(vcpu, ctxt->exception.vector); + kvm_queue_exception(vcpu, ex->vector); + return 1; } static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu) @@ -9026,6 +9062,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) ctxt->interruptibility = 0; ctxt->have_exception = false; ctxt->exception.vector = -1; + ctxt->exception.payload = 0; ctxt->perm_ok = false; init_decode_cache(ctxt); @@ -9243,21 +9280,6 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, return dr6; } -static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) -{ - struct kvm_run *kvm_run = vcpu->run; - - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW; - kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); - kvm_run->debug.arch.exception = DB_VECTOR; - kvm_run->exit_reason = KVM_EXIT_DEBUG; - return 0; - } - kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS); - return 1; -} - int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rflags = kvm_x86_call(get_rflags)(vcpu); @@ -9278,13 +9300,16 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) * that sets the TF flag". */ if (unlikely(rflags & X86_EFLAGS_TF)) - r = kvm_vcpu_do_singlestep(vcpu); + r = kvm_inject_emulated_db(vcpu, DR6_BS); return r; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_skip_emulated_instruction); static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu) { + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + return false; + if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF) return true; @@ -9301,6 +9326,8 @@ static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu) static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int emulation_type, int *r) { + unsigned long dr7 = kvm_get_effective_dr7(vcpu); + WARN_ON_ONCE(emulation_type & EMULTYPE_NO_DECODE); /* @@ -9321,34 +9348,14 @@ static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, EMULTYPE_TRAP_UD | EMULTYPE_VMWARE_GP | EMULTYPE_PF)) return false; - if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && - (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) { - struct kvm_run *kvm_run = vcpu->run; - unsigned long eip = kvm_get_linear_rip(vcpu); - u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0, - vcpu->arch.guest_debug_dr7, - vcpu->arch.eff_db); - - if (dr6 != 0) { - kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; - kvm_run->debug.arch.pc = eip; - kvm_run->debug.arch.exception = DB_VECTOR; - kvm_run->exit_reason = KVM_EXIT_DEBUG; - *r = 0; - return true; - } - } - - if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) && + if (unlikely(dr7 & DR7_BP_EN_MASK) && !kvm_is_code_breakpoint_inhibited(vcpu)) { unsigned long eip = kvm_get_linear_rip(vcpu); - u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0, - vcpu->arch.dr7, - vcpu->arch.db); + u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0, dr7, + vcpu->arch.eff_db); - if (dr6 != 0) { - kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); - *r = 1; + if (dr6) { + *r = kvm_inject_emulated_db(vcpu, dr6); return true; } } @@ -9494,8 +9501,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, */ WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR || exception_type(ctxt->exception.vector) == EXCPT_TRAP); - inject_emulated_exception(vcpu); - return 1; + return inject_emulated_exception(vcpu); } return handle_emulation_failure(vcpu, emulation_type); } @@ -9590,8 +9596,7 @@ restart: if (ctxt->have_exception) { WARN_ON_ONCE(vcpu->mmio_needed && !vcpu->mmio_is_write); vcpu->mmio_needed = false; - r = 1; - inject_emulated_exception(vcpu); + r = inject_emulated_exception(vcpu); } else if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) { /* FIXME: return into emulator if single-stepping. */ @@ -9634,7 +9639,7 @@ writeback: kvm_pmu_branch_retired(vcpu); kvm_rip_write(vcpu, ctxt->eip); if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) - r = kvm_vcpu_do_singlestep(vcpu); + r = kvm_inject_emulated_db(vcpu, DR6_BS); kvm_x86_call(update_emulated_instruction)(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); } @@ -11588,9 +11593,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.apic_attention) kvm_lapic_sync_from_vapic(vcpu); - if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE)) - return 0; - r = kvm_x86_call(handle_exit)(vcpu, exit_fastpath); return r; @@ -13352,6 +13354,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock); + ratelimit_state_init(&kvm->arch.kvmclock_update_rs, HZ, 10); + ratelimit_set_flags(&kvm->arch.kvmclock_update_rs, RATELIMIT_MSG_ON_RELEASE); kvm->arch.kvmclock_offset = -get_kvmclock_base_ns(); raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); @@ -13361,7 +13365,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz; kvm->arch.apic_bus_cycle_ns = APIC_BUS_CYCLE_NS_DEFAULT; kvm->arch.guest_can_read_msr_platform_info = true; - kvm->arch.enable_pmu = enable_pmu; + kvm->arch.enable_pmu = enable_pmu && !kvm->arch.has_protected_pmu; #if IS_ENABLED(CONFIG_HYPERV) spin_lock_init(&kvm->arch.hv_root_tdp_lock); @@ -14006,7 +14010,7 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) if (!kvm_pv_async_pf_enabled(vcpu)) return false; - if (!vcpu->arch.apf.send_always && + if (!(vcpu->arch.apf.msr_en_val & KVM_ASYNC_PF_SEND_ALWAYS) && (vcpu->arch.guest_state_protected || !kvm_x86_call(get_cpl)(vcpu))) return false; @@ -14015,7 +14019,7 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) * L1 needs to opt into the special #PF vmexits that are * used to deliver async page faults. */ - return vcpu->arch.apf.delivery_as_pf_vmexit; + return vcpu->arch.apf.msr_en_val & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; } else { /* * Play it safe in case the guest temporarily disables paging. @@ -14059,7 +14063,7 @@ bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, fault.nested_page_fault = false; fault.address = work->arch.token; fault.async_page_fault = true; - kvm_inject_page_fault(vcpu, &fault); + kvm_inject_page_fault(vcpu, &fault, false); return true; } else { /* @@ -14230,7 +14234,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c fault.address = gva; fault.async_page_fault = false; } - vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault); + vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault, true); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_fixup_and_inject_pf_error); @@ -14309,7 +14313,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) * the RAP (Return Address Predicator). */ if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS)) - kvm_register_is_dirty(vcpu, VCPU_REG_ERAPS); + kvm_register_mark_dirty(vcpu, VCPU_REG_ERAPS); kvm_invalidate_pcid(vcpu, operand.pcid); return kvm_skip_emulated_instruction(vcpu); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 38a905fa86de2..a49424f9c968e 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -490,9 +490,6 @@ fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu); extern struct kvm_caps kvm_caps; extern struct kvm_host_values kvm_host; -extern bool enable_pmu; -extern bool enable_mediated_pmu; - void kvm_setup_xss_caps(void); /* @@ -754,6 +751,12 @@ static inline void kvm_prepare_emulated_mmio_exit(struct kvm_vcpu *vcpu, frag->data, vcpu->mmio_is_write); } +static inline bool kvm_is_valid_map_gpa_range_ret(u64 hypercall_ret) +{ + return !hypercall_ret || hypercall_ret == EINVAL || + hypercall_ret == EAGAIN; +} + static inline bool user_exit_on_hypercall(struct kvm *kvm, unsigned long hc_nr) { return kvm->arch.hypercall_exit_enabled & BIT(hc_nr); diff --git a/arch/x86/virt/hw.c b/arch/x86/virt/hw.c index f647557d38ac5..7e9091c640be0 100644 --- a/arch/x86/virt/hw.c +++ b/arch/x86/virt/hw.c @@ -49,7 +49,20 @@ static void x86_virt_invoke_kvm_emergency_callback(void) { cpu_emergency_virt_cb *kvm_callback; - kvm_callback = rcu_dereference(kvm_emergency_callback); + /* + * RCU may not be watching the crashing CPU here, so rcu_dereference() + * triggers a suspicious-RCU-usage splat. In principle, a concurrent + * KVM module unload could race with this read; see commit 2baa33a8ddd6 + * ("KVM: x86: Leave user-return notifier registered on reboot/shutdown") + * which notes that nothing prevents module unload during panic/reboot. + * + * However, taking a lock here would be riskier than the current race: + * the system is going down via NMI shootdown, and any lock could be + * held by an already-stopped CPU. Use rcu_dereference_raw() to silence + * the lockdep splat and accept the comically small remaining race; + * panic context inherently cannot guarantee complete correctness. + */ + kvm_callback = rcu_dereference_raw(kvm_emergency_callback); if (kvm_callback) kvm_callback(); } diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index b15269b5941dc..42df8ea464c47 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -30,7 +30,6 @@ #include <linux/suspend.h> #include <linux/syscore_ops.h> #include <linux/idr.h> -#include <linux/kvm_types.h> #include <asm/page.h> #include <asm/special_insns.h> #include <asm/msr-index.h> @@ -709,7 +708,7 @@ err: * to normal kernel memory. Systems with the X86_BUG_TDX_PW_MCE erratum need to * do the conversion explicitly via MOVDIR64B. */ -static void tdx_quirk_reset_paddr(unsigned long base, unsigned long size) +void tdx_quirk_reset_paddr(unsigned long base, unsigned long size) { const void *zero_page = (const void *)page_address(ZERO_PAGE(0)); unsigned long phys, end; @@ -728,12 +727,7 @@ static void tdx_quirk_reset_paddr(unsigned long base, unsigned long size) */ mb(); } - -void tdx_quirk_reset_page(struct page *page) -{ - tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE); -} -EXPORT_SYMBOL_FOR_KVM(tdx_quirk_reset_page); +EXPORT_SYMBOL_FOR_KVM(tdx_quirk_reset_paddr); static __init void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr) @@ -1634,6 +1628,17 @@ static void tdx_clflush_page(struct page *page) clflush_cache_range(page_to_virt(page), PAGE_SIZE); } +static void tdx_clflush_pfn(kvm_pfn_t pfn) +{ + clflush_cache_range(__va(PFN_PHYS(pfn)), PAGE_SIZE); +} + +static int pg_level_to_tdx_sept_level(enum pg_level level) +{ + WARN_ON_ONCE(level == PG_LEVEL_NONE); + return level - 1; +} + noinstr u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args) { args->rcx = td->tdvpr_pa; @@ -1654,17 +1659,18 @@ u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page) } EXPORT_SYMBOL_FOR_KVM(tdh_mng_addcx); -u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2) +u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, kvm_pfn_t pfn, struct page *source, + u64 *ext_err1, u64 *ext_err2) { struct tdx_module_args args = { .rcx = gpa, .rdx = tdx_tdr_pa(td), - .r8 = page_to_phys(page), + .r8 = PFN_PHYS(pfn), .r9 = page_to_phys(source), }; u64 ret; - tdx_clflush_page(page); + tdx_clflush_pfn(pfn); ret = seamcall_ret(TDH_MEM_PAGE_ADD, &args); *ext_err1 = args.rcx; @@ -1674,10 +1680,11 @@ u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page } EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_add); -u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) +u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, enum pg_level level, + struct page *page, u64 *ext_err1, u64 *ext_err2) { struct tdx_module_args args = { - .rcx = gpa | level, + .rcx = gpa | pg_level_to_tdx_sept_level(level), .rdx = tdx_tdr_pa(td), .r8 = page_to_phys(page), }; @@ -1705,16 +1712,17 @@ u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page) } EXPORT_SYMBOL_FOR_KVM(tdh_vp_addcx); -u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) +u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, enum pg_level level, + kvm_pfn_t pfn, u64 *ext_err1, u64 *ext_err2) { struct tdx_module_args args = { - .rcx = gpa | level, + .rcx = gpa | pg_level_to_tdx_sept_level(level), .rdx = tdx_tdr_pa(td), - .r8 = page_to_phys(page), + .r8 = PFN_PHYS(pfn), }; u64 ret; - tdx_clflush_page(page); + tdx_clflush_pfn(pfn); ret = seamcall_ret(TDH_MEM_PAGE_AUG, &args); *ext_err1 = args.rcx; @@ -1724,10 +1732,11 @@ u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u } EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_aug); -u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2) +u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, enum pg_level level, + u64 *ext_err1, u64 *ext_err2) { struct tdx_module_args args = { - .rcx = gpa | level, + .rcx = gpa | pg_level_to_tdx_sept_level(level), .rdx = tdx_tdr_pa(td), }; u64 ret; @@ -1940,10 +1949,11 @@ u64 tdh_mem_track(struct tdx_td *td) } EXPORT_SYMBOL_FOR_KVM(tdh_mem_track); -u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2) +u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, enum pg_level level, + u64 *ext_err1, u64 *ext_err2) { struct tdx_module_args args = { - .rcx = gpa | level, + .rcx = gpa | pg_level_to_tdx_sept_level(level), .rdx = tdx_tdr_pa(td), }; u64 ret; @@ -1967,21 +1977,27 @@ u64 tdh_phymem_cache_wb(bool resume) } EXPORT_SYMBOL_FOR_KVM(tdh_phymem_cache_wb); +static inline u64 mk_keyed_paddr(u16 hkid, kvm_pfn_t pfn) +{ + /* KeyID bits are just above the physical address bits. */ + return PFN_PHYS(pfn) | ((u64)hkid << boot_cpu_data.x86_phys_bits); +} + u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td) { struct tdx_module_args args = {}; - args.rcx = mk_keyed_paddr(tdx_global_keyid, td->tdr_page); + args.rcx = mk_keyed_paddr(tdx_global_keyid, page_to_pfn(td->tdr_page)); return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); } EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_tdr); -u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page) +u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, kvm_pfn_t pfn) { struct tdx_module_args args = {}; - args.rcx = mk_keyed_paddr(hkid, page); + args.rcx = mk_keyed_paddr(hkid, pfn); return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); } |
