diff options
| author | Mark Brown <broonie@kernel.org> | 2026-05-29 22:46:46 +0100 |
|---|---|---|
| committer | Mark Brown <broonie@kernel.org> | 2026-05-29 22:46:46 +0100 |
| commit | f08a9de96912e608e3f2d2c7c8c12fbdc6b98208 (patch) | |
| tree | 603f9be5717308bc74729857e55a12913436360c /arch | |
| parent | 10f1a9c9db76b998263c755761741b6101946a6e (diff) | |
| parent | b7fbe9a1bf9ee6c967ef77d366ca58c35fcf1887 (diff) | |
| download | linux-next-history-f08a9de96912e608e3f2d2c7c8c12fbdc6b98208.tar.gz | |
Merge branch 'next' of git://git.kernel.org/pub/scm/virt/kvm/kvm.git
Diffstat (limited to 'arch')
33 files changed, 887 insertions, 573 deletions
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 7b572bc24265c..1b4a48bff18f7 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -379,6 +379,7 @@ #define X86_FEATURE_AVIC (15*32+13) /* "avic" Virtual Interrupt Controller */ #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* "v_vmsave_vmload" Virtual VMSAVE VMLOAD */ #define X86_FEATURE_VGIF (15*32+16) /* "vgif" Virtual GIF */ +#define X86_FEATURE_GMET (15*32+17) /* Guest Mode Execution Trap */ #define X86_FEATURE_X2AVIC (15*32+18) /* "x2avic" Virtual x2apic */ #define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */ #define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */ diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 3776cf5382a26..e4fca997ec797 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -94,6 +94,7 @@ KVM_X86_OP_OPTIONAL(sync_pir_to_irr) KVM_X86_OP_OPTIONAL_RET0(set_tss_addr) KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr) KVM_X86_OP_OPTIONAL_RET0(get_mt_mask) +KVM_X86_OP_OPTIONAL_RET0(tdp_has_smep) KVM_X86_OP(load_mmu_pgd) KVM_X86_OP_OPTIONAL(link_external_spt) KVM_X86_OP_OPTIONAL(set_external_spte) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c470e40a00aa4..8a53ca6195701 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -191,11 +191,12 @@ enum kvm_reg { VCPU_REGS_R14 = __VCPU_REGS_R14, VCPU_REGS_R15 = __VCPU_REGS_R15, #endif - VCPU_REGS_RIP, - NR_VCPU_REGS, + NR_VCPU_GENERAL_PURPOSE_REGS, - VCPU_EXREG_PDPTR = NR_VCPU_REGS, - VCPU_EXREG_CR0, + VCPU_REG_RIP = NR_VCPU_GENERAL_PURPOSE_REGS, + + VCPU_REG_PDPTR, + VCPU_REG_CR0, /* * Alias AMD's ERAPS (not a real register) to CR3 so that common code * can trigger emulation of the RAP (Return Address Predictor) with @@ -203,13 +204,15 @@ enum kvm_reg { * is cleared on writes to CR3, i.e. marking CR3 dirty will naturally * mark ERAPS dirty as well. */ - VCPU_EXREG_CR3, - VCPU_EXREG_ERAPS = VCPU_EXREG_CR3, - VCPU_EXREG_CR4, - VCPU_EXREG_RFLAGS, - VCPU_EXREG_SEGMENTS, - VCPU_EXREG_EXIT_INFO_1, - VCPU_EXREG_EXIT_INFO_2, + VCPU_REG_CR3, + VCPU_REG_ERAPS = VCPU_REG_CR3, + VCPU_REG_CR4, + VCPU_REG_RFLAGS, + VCPU_REG_SEGMENTS, + VCPU_REG_EXIT_INFO_1, + VCPU_REG_EXIT_INFO_2, + + NR_VCPU_TOTAL_REGS, }; enum { @@ -328,11 +331,11 @@ struct kvm_kernel_irq_routing_entry; * the number of unique SPs that can theoretically be created is 2^n, where n * is the number of bits that are used to compute the role. * - * But, even though there are 20 bits in the mask below, not all combinations + * But, even though there are 21 bits in the mask below, not all combinations * of modes and flags are possible: * * - invalid shadow pages are not accounted, mirror pages are not shadowed, - * so the bits are effectively 18. + * so the bits are effectively 19. * * - quadrant will only be used if has_4_byte_gpte=1 (non-PAE paging); * execonly and ad_disabled are only used for nested EPT which has @@ -343,11 +346,11 @@ struct kvm_kernel_irq_routing_entry; * paging has exactly one upper level, making level completely redundant * when has_4_byte_gpte=1. * - * - on top of this, smep_andnot_wp and smap_andnot_wp are only set if - * cr0_wp=0, therefore these three bits only give rise to 5 possibilities. + * - on top of this, smap_andnot_wp is only set if cr0_wp=0, + * therefore these two bits only give rise to 3 possibilities. * * Therefore, the maximum number of possible upper-level shadow pages for a - * single gfn is a bit less than 2^13. + * single gfn is a bit less than 2^14. */ union kvm_mmu_page_role { u32 word; @@ -356,17 +359,26 @@ union kvm_mmu_page_role { unsigned has_4_byte_gpte:1; unsigned quadrant:2; unsigned direct:1; - unsigned access:3; + unsigned access:4; unsigned invalid:1; unsigned efer_nx:1; unsigned cr0_wp:1; - unsigned smep_andnot_wp:1; unsigned smap_andnot_wp:1; unsigned ad_disabled:1; unsigned guest_mode:1; unsigned passthrough:1; unsigned is_mirror:1; - unsigned :4; + + /* + * cr4_smep is also set for EPT MBEC. Because it affects + * which pages are considered non-present (bit 10 additionally + * must be zero if MBEC is on) it has to be in the base role. + * It also has to be in the base role for AMD GMET because + * kernel-executable pages need to have U=0 with GMET enabled. + */ + unsigned cr4_smep:1; + + unsigned:3; /* * This is left at the top of the word so that @@ -392,10 +404,10 @@ union kvm_mmu_page_role { * tables (because KVM doesn't support Protection Keys with shadow paging), and * CR0.PG, CR4.PAE, and CR4.PSE are indirectly reflected in role.level. * - * Note, SMEP and SMAP are not redundant with sm*p_andnot_wp in the page role. - * If CR0.WP=1, KVM can reuse shadow pages for the guest regardless of SMEP and - * SMAP, but the MMU's permission checks for software walks need to be SMEP and - * SMAP aware regardless of CR0.WP. + * Note, SMAP is not redundant with smap_andnot_wp in the page role. If + * CR0.WP=1, KVM can reuse shadow pages for the guest regardless of SMAP, + * but the MMU's permission checks for software walks need to be SMAP + * aware regardless of CR0.WP. */ union kvm_mmu_extended_role { u32 word; @@ -405,9 +417,15 @@ union kvm_mmu_extended_role { unsigned int cr4_pse:1; unsigned int cr4_pke:1; unsigned int cr4_smap:1; - unsigned int cr4_smep:1; unsigned int cr4_la57:1; unsigned int efer_lma:1; + + /* + * True if either CR4.SMEP or EFER.NXE are set. For AMD NPT + * this is the "real" host CR4.SMEP whereas cr4_smep is + * actually GMET. + */ + unsigned int has_pferr_fetch:1; }; }; @@ -492,7 +510,7 @@ struct kvm_mmu { * Byte index: page fault error code [4:1] * Bit index: pte permissions in ACC_* format */ - u8 permissions[16]; + u16 permissions[16]; u64 *pae_root; u64 *pml4_root; @@ -799,9 +817,10 @@ struct kvm_vcpu_arch { * rip and regs accesses must go through * kvm_{register,rip}_{read,write} functions. */ - unsigned long regs[NR_VCPU_REGS]; - u32 regs_avail; - u32 regs_dirty; + unsigned long regs[NR_VCPU_GENERAL_PURPOSE_REGS]; + unsigned long rip; + DECLARE_BITMAP(regs_avail, NR_VCPU_TOTAL_REGS); + DECLARE_BITMAP(regs_dirty, NR_VCPU_TOTAL_REGS); unsigned long cr0; unsigned long cr0_guest_owned_bits; @@ -1887,6 +1906,7 @@ struct kvm_x86_ops { int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); u8 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); + bool (*tdp_has_smep)(struct kvm *kvm); void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); @@ -2010,6 +2030,10 @@ struct kvm_x86_nested_ops { struct kvm_nested_state *kvm_state); bool (*get_nested_state_pages)(struct kvm_vcpu *vcpu); int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa); + gpa_t (*translate_nested_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, + u64 access, + struct x86_exception *exception, + u64 pte_access); int (*enable_evmcs)(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index bcfeb5e7c0edf..aa63431ba92c3 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -243,6 +243,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_MISC_ENABLE_NP BIT(0) #define SVM_MISC_ENABLE_SEV BIT(1) #define SVM_MISC_ENABLE_SEV_ES BIT(2) +#define SVM_MISC_ENABLE_GMET BIT(3) #define SVM_MISC2_ENABLE_V_LBR BIT_ULL(0) #define SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE BIT_ULL(1) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 49d8551d285d9..3f1b3096ff040 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -536,6 +536,7 @@ enum vmcs_field { #define VMX_EPT_1GB_PAGE_BIT (1ull << 17) #define VMX_EPT_INVEPT_BIT (1ull << 20) #define VMX_EPT_AD_BIT (1ull << 21) +#define VMX_EPT_ADVANCED_VMEXIT_INFO_BIT (1ull << 22) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) @@ -561,10 +562,12 @@ enum vmcs_field { #define VMX_EPT_ACCESS_BIT (1ull << 8) #define VMX_EPT_DIRTY_BIT (1ull << 9) #define VMX_EPT_SUPPRESS_VE_BIT (1ull << 63) + #define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \ VMX_EPT_WRITABLE_MASK | \ VMX_EPT_EXECUTABLE_MASK) #define VMX_EPT_MT_MASK (7ull << VMX_EPT_MT_EPTE_SHIFT) +#define VMX_EPT_USER_EXECUTABLE_MASK (1ull << 10) static inline u8 vmx_eptp_page_walk_level(u64 eptp) { @@ -609,17 +612,24 @@ enum vm_entry_failure_code { #define EPT_VIOLATION_PROT_READ BIT(3) #define EPT_VIOLATION_PROT_WRITE BIT(4) #define EPT_VIOLATION_PROT_EXEC BIT(5) -#define EPT_VIOLATION_EXEC_FOR_RING3_LIN BIT(6) +#define EPT_VIOLATION_PROT_USER_EXEC BIT(6) #define EPT_VIOLATION_PROT_MASK (EPT_VIOLATION_PROT_READ | \ EPT_VIOLATION_PROT_WRITE | \ - EPT_VIOLATION_PROT_EXEC) + EPT_VIOLATION_PROT_EXEC | \ + EPT_VIOLATION_PROT_USER_EXEC) #define EPT_VIOLATION_GVA_IS_VALID BIT(7) #define EPT_VIOLATION_GVA_TRANSLATED BIT(8) +#define EPT_VIOLATION_GVA_USER BIT(9) +#define EPT_VIOLATION_GVA_WRITABLE BIT(10) +#define EPT_VIOLATION_GVA_NX BIT(11) #define EPT_VIOLATION_RWX_TO_PROT(__epte) (((__epte) & VMX_EPT_RWX_MASK) << 3) +#define EPT_VIOLATION_USER_EXEC_TO_PROT(__epte) (((__epte) & VMX_EPT_USER_EXECUTABLE_MASK) >> 4) static_assert(EPT_VIOLATION_RWX_TO_PROT(VMX_EPT_RWX_MASK) == (EPT_VIOLATION_PROT_READ | EPT_VIOLATION_PROT_WRITE | EPT_VIOLATION_PROT_EXEC)); +static_assert(EPT_VIOLATION_USER_EXEC_TO_PROT(VMX_EPT_USER_EXECUTABLE_MASK) == + (EPT_VIOLATION_PROT_USER_EXEC)); /* * Exit Qualifications for NOTIFY VM EXIT diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 4438ecac9a89b..015c6947b462e 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2041,7 +2041,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) * read with kvm_read_guest(). */ if (!hc->fast && mmu_is_nested(vcpu)) { - hc->ingpa = translate_nested_gpa(vcpu, hc->ingpa, 0, NULL); + hc->ingpa = kvm_x86_ops.nested_ops->translate_nested_gpa( + vcpu, hc->ingpa, + PFERR_GUEST_FINAL_MASK, NULL, 0); if (unlikely(hc->ingpa == INVALID_GPA)) return HV_STATUS_INVALID_HYPERCALL_INPUT; } diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c index 24a710d373238..36ac61724dd7f 100644 --- a/arch/x86/kvm/kvm-asm-offsets.c +++ b/arch/x86/kvm/kvm-asm-offsets.c @@ -24,6 +24,7 @@ static void __used common(void) if (IS_ENABLED(CONFIG_KVM_INTEL)) { BLANK(); + OFFSET(VMX_vcpu_arch_regs, vcpu_vmx, vcpu.arch.regs); OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl); } } diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 8ddb01191d6f6..2ae492ad6412b 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -67,29 +67,29 @@ static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu, enum kvm_reg reg) { kvm_assert_register_caching_allowed(vcpu); - return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + return test_bit(reg, vcpu->arch.regs_avail); } static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu, enum kvm_reg reg) { kvm_assert_register_caching_allowed(vcpu); - return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); + return test_bit(reg, vcpu->arch.regs_dirty); } static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu, enum kvm_reg reg) { kvm_assert_register_caching_allowed(vcpu); - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(reg, vcpu->arch.regs_avail); } static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, enum kvm_reg reg) { kvm_assert_register_caching_allowed(vcpu); - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); + __set_bit(reg, vcpu->arch.regs_avail); + __set_bit(reg, vcpu->arch.regs_dirty); } /* @@ -102,7 +102,29 @@ static __always_inline bool kvm_register_test_and_mark_available(struct kvm_vcpu enum kvm_reg reg) { kvm_assert_register_caching_allowed(vcpu); - return arch___test_and_set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + return arch___test_and_set_bit(reg, vcpu->arch.regs_avail); +} + +static __always_inline void kvm_clear_available_registers(struct kvm_vcpu *vcpu, + unsigned long clear_mask) +{ + BUILD_BUG_ON(sizeof(clear_mask) != sizeof(vcpu->arch.regs_avail[0])); + BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.regs_avail) != 1); + + /* + * Note the bitwise-AND! In practice, a straight write would also work + * as KVM initializes the mask to all ones and never clears registers + * that are eagerly synchronized. Using a bitwise-AND adds a bit of + * sanity checking as incorrectly marking an eagerly sync'd register + * unavailable will generate a WARN due to an unexpected cache request. + */ + vcpu->arch.regs_avail[0] &= ~clear_mask; +} + +static __always_inline void kvm_reset_dirty_registers(struct kvm_vcpu *vcpu) +{ + BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.regs_dirty) != 1); + vcpu->arch.regs_dirty[0] = 0; } /* @@ -112,7 +134,7 @@ static __always_inline bool kvm_register_test_and_mark_available(struct kvm_vcpu */ static inline unsigned long kvm_register_read_raw(struct kvm_vcpu *vcpu, int reg) { - if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS)) + if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_GENERAL_PURPOSE_REGS)) return 0; if (!kvm_register_is_available(vcpu, reg)) @@ -124,7 +146,7 @@ static inline unsigned long kvm_register_read_raw(struct kvm_vcpu *vcpu, int reg static inline void kvm_register_write_raw(struct kvm_vcpu *vcpu, int reg, unsigned long val) { - if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS)) + if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_GENERAL_PURPOSE_REGS)) return; vcpu->arch.regs[reg] = val; @@ -133,12 +155,16 @@ static inline void kvm_register_write_raw(struct kvm_vcpu *vcpu, int reg, static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu) { - return kvm_register_read_raw(vcpu, VCPU_REGS_RIP); + if (!kvm_register_is_available(vcpu, VCPU_REG_RIP)) + kvm_x86_call(cache_reg)(vcpu, VCPU_REG_RIP); + + return vcpu->arch.rip; } static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) { - kvm_register_write_raw(vcpu, VCPU_REGS_RIP, val); + vcpu->arch.rip = val; + kvm_register_mark_dirty(vcpu, VCPU_REG_RIP); } static inline unsigned long kvm_rsp_read(struct kvm_vcpu *vcpu) @@ -155,8 +181,8 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) { might_sleep(); /* on svm */ - if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR)) - kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_PDPTR); + if (!kvm_register_is_available(vcpu, VCPU_REG_PDPTR)) + kvm_x86_call(cache_reg)(vcpu, VCPU_REG_PDPTR); return vcpu->arch.walk_mmu->pdptrs[index]; } @@ -170,8 +196,8 @@ static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) { ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; if ((tmask & vcpu->arch.cr0_guest_owned_bits) && - !kvm_register_is_available(vcpu, VCPU_EXREG_CR0)) - kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR0); + !kvm_register_is_available(vcpu, VCPU_REG_CR0)) + kvm_x86_call(cache_reg)(vcpu, VCPU_REG_CR0); return vcpu->arch.cr0 & mask; } @@ -192,8 +218,8 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) { ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; if ((tmask & vcpu->arch.cr4_guest_owned_bits) && - !kvm_register_is_available(vcpu, VCPU_EXREG_CR4)) - kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR4); + !kvm_register_is_available(vcpu, VCPU_REG_CR4)) + kvm_x86_call(cache_reg)(vcpu, VCPU_REG_CR4); return vcpu->arch.cr4 & mask; } @@ -207,8 +233,8 @@ static __always_inline bool kvm_is_cr4_bit_set(struct kvm_vcpu *vcpu, static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu) { - if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) - kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR3); + if (!kvm_register_is_available(vcpu, VCPU_REG_CR3)) + kvm_x86_call(cache_reg)(vcpu, VCPU_REG_CR3); return vcpu->arch.cr3; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 830f46145692a..ddf4e467c071e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -37,6 +37,13 @@ extern bool __read_mostly enable_mmio_caching; #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 +#define ACC_READ_MASK PT_PRESENT_MASK +#define ACC_WRITE_MASK PT_WRITABLE_MASK +#define ACC_USER_MASK PT_USER_MASK /* non EPT */ +#define ACC_USER_EXEC_MASK ACC_USER_MASK /* EPT only */ +#define ACC_EXEC_MASK 8 +#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK | ACC_READ_MASK) + #define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PSE | X86_CR4_PAE | X86_CR4_LA57 | \ X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE) @@ -76,19 +83,24 @@ static inline gfn_t kvm_mmu_max_gfn(void) return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1; } +static inline bool mmu_has_mbec(struct kvm_mmu *mmu) +{ + return mmu->root_role.cr4_smep; +} + u8 kvm_mmu_get_max_tdp_level(void); void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value); void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask); -void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only); +void kvm_mmu_set_ept_masks(bool has_ad_bits); void kvm_init_mmu(struct kvm_vcpu *vcpu); -void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, - unsigned long cr4, u64 efer, gpa_t nested_cr3); +void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr4, + u64 efer, gpa_t nested_cr3, u64 misc_ctl); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, int huge_page_level, bool accessed_dirty, - gpa_t new_eptp); + bool mbec, gpa_t new_eptp); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len); @@ -288,17 +300,17 @@ static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count) atomic64_add(count, &kvm->stat.pages[level - 1]); } -gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access, - struct x86_exception *exception); - static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gpa_t gpa, u64 access, - struct x86_exception *exception) + struct x86_exception *exception, + u64 pte_access) { if (mmu != &vcpu->arch.nested_mmu) return gpa; - return translate_nested_gpa(vcpu, gpa, access, exception); + return kvm_x86_ops.nested_ops->translate_nested_gpa(vcpu, gpa, access, + exception, + pte_access); } static inline bool kvm_has_mirrored_tdp(const struct kvm *kvm) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 91843e9224d04..db1b82eae4da7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -56,6 +56,7 @@ #include <asm/io.h> #include <asm/set_memory.h> #include <asm/spec-ctrl.h> +#include <asm/svm.h> #include <asm/vmx.h> #include "trace.h" @@ -230,13 +231,18 @@ static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \ } BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse); -BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep); +BUILD_MMU_ROLE_ACCESSOR(base, cr4, smep); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57); BUILD_MMU_ROLE_ACCESSOR(base, efer, nx); BUILD_MMU_ROLE_ACCESSOR(ext, efer, lma); +static inline bool has_pferr_fetch(struct kvm_mmu *mmu) +{ + return mmu->cpu_role.ext.has_pferr_fetch; +} + static inline bool is_cr0_pg(struct kvm_mmu *mmu) { return mmu->cpu_role.base.level > 0; @@ -2023,7 +2029,7 @@ static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) */ const union kvm_mmu_page_role sync_role_ign = { .level = 0xf, - .access = 0x7, + .access = ACC_ALL, .quadrant = 0x3, .passthrough = 0x1, }; @@ -3459,12 +3465,13 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; - int ret; + int ret, access; gfn_t base_gfn = fault->gfn; kvm_mmu_hugepage_adjust(vcpu, fault); - trace_kvm_mmu_spte_requested(fault); + access = vcpu->arch.mmu->root_role.access; + trace_kvm_mmu_spte_requested(fault, access); for_each_shadow_entry(vcpu, fault->addr, it) { /* * We cannot overwrite existing page tables with an NX @@ -3477,7 +3484,7 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (it.level == fault->goal_level) break; - sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL); + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, access); if (sp == ERR_PTR(-EEXIST)) continue; @@ -3490,7 +3497,7 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (WARN_ON_ONCE(it.level != fault->goal_level)) return -EFAULT; - ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL, + ret = mmu_set_spte(vcpu, fault->slot, it.sptep, access, base_gfn, fault->pfn, fault); if (ret == RET_PF_SPURIOUS) return ret; @@ -4361,7 +4368,14 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, { if (exception) exception->error_code = 0; - return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception); + /* + * EPT MBEC uses the effective access bits from the PTE to distinguish + * user and supervisor accesses, and treats every linear address as a + * user-mode address if CR0.PG=0. Therefore *include* ACC_USER_MASK in + * the last argument to kvm_translate_gpa (which NPT does not use). + */ + return kvm_translate_gpa(vcpu, mmu, vaddr, access | PFERR_GUEST_FINAL_MASK, + exception, ACC_ALL); } static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) @@ -5497,7 +5511,7 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, static inline bool boot_cpu_is_amd(void) { WARN_ON_ONCE(!tdp_enabled); - return shadow_x_mask == 0; + return shadow_xs_mask == 0; } /* @@ -5542,55 +5556,106 @@ reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly) max_huge_page_level); } -#define BYTE_MASK(access) \ - ((1 & (access) ? 2 : 0) | \ - (2 & (access) ? 4 : 0) | \ - (3 & (access) ? 8 : 0) | \ - (4 & (access) ? 16 : 0) | \ - (5 & (access) ? 32 : 0) | \ - (6 & (access) ? 64 : 0) | \ - (7 & (access) ? 128 : 0)) - +/* + * Build a mask with all combinations of PTE access rights that + * include the given access bit. The mask can be queried with + * "mask & (1 << access)", where access is a combination of + * ACC_* bits. + * + * By mixing and matching multiple masks returned by ACC_BITS_MASK, + * update_permission_bitmask() builds what is effectively a + * two-dimensional array of bools. The second dimension is + * provided by individual bits of permissions[pfec >> 1], and + * logical &, | and ~ operations operate on all the 16 possible + * combinations of ACC_* bits. + */ +#define ACC_BITS_MASK(access) \ + ((1 & (access) ? 1 << 1 : 0) | \ + (2 & (access) ? 1 << 2 : 0) | \ + (3 & (access) ? 1 << 3 : 0) | \ + (4 & (access) ? 1 << 4 : 0) | \ + (5 & (access) ? 1 << 5 : 0) | \ + (6 & (access) ? 1 << 6 : 0) | \ + (7 & (access) ? 1 << 7 : 0) | \ + (8 & (access) ? 1 << 8 : 0) | \ + (9 & (access) ? 1 << 9 : 0) | \ + (10 & (access) ? 1 << 10 : 0) | \ + (11 & (access) ? 1 << 11 : 0) | \ + (12 & (access) ? 1 << 12 : 0) | \ + (13 & (access) ? 1 << 13 : 0) | \ + (14 & (access) ? 1 << 14 : 0) | \ + (15 & (access) ? 1 << 15 : 0)) -static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept) +static void update_permission_bitmask(struct kvm_mmu *mmu, bool tdp, bool ept) { - unsigned byte; + unsigned index; - const u8 x = BYTE_MASK(ACC_EXEC_MASK); - const u8 w = BYTE_MASK(ACC_WRITE_MASK); - const u8 u = BYTE_MASK(ACC_USER_MASK); + const u16 w = ACC_BITS_MASK(ACC_WRITE_MASK); + const u16 r = ACC_BITS_MASK(ACC_READ_MASK); bool cr4_smep = is_cr4_smep(mmu); bool cr4_smap = is_cr4_smap(mmu); bool cr0_wp = is_cr0_wp(mmu); bool efer_nx = is_efer_nx(mmu); - for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { - unsigned pfec = byte << 1; + /* + * In hardware, page fault error codes are generated (as the name + * suggests) on any kind of page fault. permission_fault() and + * paging_tmpl.h already use the same bits after a successful page + * table walk, to indicate the kind of access being performed. + * + * However, PFERR_PRESENT_MASK and PFERR_RSVD_MASK are never set here, + * exactly because the page walk is successful. PFERR_PRESENT_MASK is + * removed by the shift, while PFERR_RSVD_MASK is repurposed in + * permission_fault() to indicate accesses that are *not* subject to + * SMAP restrictions. + */ + for (index = 0; index < ARRAY_SIZE(mmu->permissions); ++index) { + unsigned pfec = index << 1; /* - * Each "*f" variable has a 1 bit for each UWX value + * Each "*f" variable has a 1 bit for each ACC_* combo * that causes a fault with the given PFEC. */ + /* Faults from reads to non-readable pages */ + u16 rf = (pfec & (PFERR_WRITE_MASK|PFERR_FETCH_MASK)) ? 0 : (u16)~r; /* Faults from writes to non-writable pages */ - u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0; + u16 wf = (pfec & PFERR_WRITE_MASK) ? (u16)~w : 0; /* Faults from user mode accesses to supervisor pages */ - u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0; - /* Faults from fetches of non-executable pages*/ - u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0; - /* Faults from kernel mode fetches of user pages */ - u8 smepf = 0; + u16 uf = 0; + /* Faults from fetches of non-executable pages */ + u16 ff = 0; /* Faults from kernel mode accesses of user pages */ - u8 smapf = 0; + u16 smapf = 0; + + if (ept) { + const u16 xs = ACC_BITS_MASK(ACC_EXEC_MASK); + const u16 xu = ACC_BITS_MASK(ACC_USER_EXEC_MASK); + + if (pfec & PFERR_FETCH_MASK) { + /* Ignore XU unless MBEC is enabled. */ + if (cr4_smep) + ff = pfec & PFERR_USER_MASK ? (u16)~xu : (u16)~xs; + else + ff = (u16)~xs; + } + } else { + const u16 x = ACC_BITS_MASK(ACC_EXEC_MASK); + const u16 u = ACC_BITS_MASK(ACC_USER_MASK); - if (!ept) { /* Faults from kernel mode accesses to user pages */ - u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u; + u16 kf = (pfec & PFERR_USER_MASK) ? 0 : u; - /* Not really needed: !nx will cause pte.nx to fault */ - if (!efer_nx) - ff = 0; + /* + * For NPT GMET, U=0 does not affect reads and writes. Fetches + * are handled below via cr4_smep. + */ + if (!(tdp && cr4_smep)) + uf = (pfec & PFERR_USER_MASK) ? (u16)~u : 0; + + if (efer_nx) + ff |= (pfec & PFERR_FETCH_MASK) ? (u16)~x : 0; /* Allow supervisor writes if !cr0.wp */ if (!cr0_wp) @@ -5598,7 +5663,7 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept) /* Disallow supervisor fetches of user code if cr4.smep */ if (cr4_smep) - smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0; + ff |= (pfec & PFERR_FETCH_MASK) ? kf : 0; /* * SMAP:kernel-mode data accesses from user-mode @@ -5611,16 +5676,15 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept) * - The access is supervisor mode * - If implicit supervisor access or X86_EFLAGS_AC is clear * - * Here, we cover the first four conditions. - * The fifth is computed dynamically in permission_fault(); - * PFERR_RSVD_MASK bit will be set in PFEC if the access is - * *not* subject to SMAP restrictions. + * Here, we cover the first four conditions. The fifth + * is computed dynamically in permission_fault() and + * communicated by setting PFERR_RSVD_MASK. */ if (cr4_smap) smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf; } - mmu->permissions[byte] = ff | uf | wf | smepf | smapf; + mmu->permissions[index] = ff | uf | wf | rf | smapf; } } @@ -5699,7 +5763,7 @@ static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu, return; reset_guest_rsvds_bits_mask(vcpu, mmu); - update_permission_bitmask(mmu, false); + update_permission_bitmask(mmu, mmu == &vcpu->arch.guest_mmu, false); update_pkru_bitmask(mmu); } @@ -5734,7 +5798,7 @@ static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu, role.base.efer_nx = ____is_efer_nx(regs); role.base.cr0_wp = ____is_cr0_wp(regs); - role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs); + role.base.cr4_smep = ____is_cr4_smep(regs); role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs); role.base.has_4_byte_gpte = !____is_cr4_pae(regs); @@ -5746,7 +5810,6 @@ static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu, else role.base.level = PT32_ROOT_LEVEL; - role.ext.cr4_smep = ____is_cr4_smep(regs); role.ext.cr4_smap = ____is_cr4_smap(regs); role.ext.cr4_pse = ____is_cr4_pse(regs); @@ -5754,6 +5817,8 @@ static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu, role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs); role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs); role.ext.efer_lma = ____is_efer_lma(regs); + + role.ext.has_pferr_fetch = role.base.efer_nx | role.base.cr4_smep; return role; } @@ -5803,8 +5868,8 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, { union kvm_mmu_page_role role = {0}; - role.access = ACC_ALL; role.cr0_wp = true; + role.cr4_smep = kvm_x86_call(tdp_has_smep)(vcpu->kvm); role.efer_nx = true; role.smm = cpu_role.base.smm; role.guest_mode = cpu_role.base.guest_mode; @@ -5813,6 +5878,11 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, role.direct = true; role.has_4_byte_gpte = false; + /* All TDP pages are supervisor-executable */ + role.access = ACC_ALL; + if (role.cr4_smep && shadow_user_mask) + role.access &= ~ACC_USER_MASK; + return role; } @@ -5892,13 +5962,13 @@ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, shadow_mmu_init_context(vcpu, context, cpu_role, root_role); } -void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, - unsigned long cr4, u64 efer, gpa_t nested_cr3) +void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr4, + u64 efer, gpa_t nested_cr3, u64 misc_ctl) { struct kvm_mmu *context = &vcpu->arch.guest_mmu; struct kvm_mmu_role_regs regs = { - .cr0 = cr0, - .cr4 = cr4 & ~X86_CR4_PKE, + .cr0 = X86_CR0_PG | X86_CR0_WP, + .cr4 = cr4 & ~(X86_CR4_PKE | X86_CR4_SMAP), .efer = efer, }; union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, ®s); @@ -5906,6 +5976,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, /* NPT requires CR0.PG=1. */ WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode); + cpu_role.base.cr4_smep = (misc_ctl & SVM_MISC_ENABLE_GMET) != 0; root_role = cpu_role.base; root_role.level = kvm_mmu_get_tdp_level(vcpu); @@ -5920,7 +5991,7 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_shadow_npt_mmu); static union kvm_cpu_role kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, - bool execonly, u8 level) + bool execonly, u8 level, bool mbec) { union kvm_cpu_role role = {0}; @@ -5930,6 +6001,7 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, */ WARN_ON_ONCE(is_smm(vcpu)); role.base.level = level; + role.base.cr4_smep = mbec; role.base.has_4_byte_gpte = false; role.base.direct = false; role.base.ad_disabled = !accessed_dirty; @@ -5945,13 +6017,13 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, int huge_page_level, bool accessed_dirty, - gpa_t new_eptp) + bool mbec, gpa_t new_eptp) { struct kvm_mmu *context = &vcpu->arch.guest_mmu; u8 level = vmx_eptp_page_walk_level(new_eptp); union kvm_cpu_role new_mode = kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, - execonly, level); + execonly, level, mbec); if (new_mode.as_u64 != context->cpu_role.as_u64) { /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */ @@ -5962,7 +6034,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->gva_to_gpa = ept_gva_to_gpa; context->sync_spte = ept_sync_spte; - update_permission_bitmask(context, true); + update_permission_bitmask(context, true, true); context->pkru_mask = 0; reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level); reset_ept_shadow_zero_bits_mask(context, execonly); diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index 764e3015d021b..fa01719baf8d4 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -25,7 +25,8 @@ #define KVM_MMU_PAGE_PRINTK() ({ \ const char *saved_ptr = trace_seq_buffer_ptr(p); \ static const char *access_str[] = { \ - "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \ + "----", "r---", "-w--", "rw--", "--u-", "r-u-", "-wu-", "rwu-", \ + "---x", "r--x", "-w-x", "rw-x", "--ux", "r-ux", "-wux", "rwux" \ }; \ union kvm_mmu_page_role role; \ \ @@ -356,8 +357,8 @@ TRACE_EVENT( __entry->sptep = virt_to_phys(sptep); __entry->level = level; __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK); - __entry->x = is_executable_pte(__entry->spte); - __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1; + __entry->x = (__entry->spte & (shadow_xs_mask | shadow_nx_mask)) == shadow_xs_mask; + __entry->u = !!(__entry->spte & (shadow_xu_mask | shadow_user_mask)); ), TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx", @@ -365,30 +366,32 @@ TRACE_EVENT( __entry->r ? "r" : "-", __entry->spte & PT_WRITABLE_MASK ? "w" : "-", __entry->x ? "x" : "-", - __entry->u == -1 ? "" : (__entry->u ? "u" : "-"), + __entry->u ? "u" : "-", __entry->level, __entry->sptep ) ); TRACE_EVENT( kvm_mmu_spte_requested, - TP_PROTO(struct kvm_page_fault *fault), - TP_ARGS(fault), + TP_PROTO(struct kvm_page_fault *fault, u8 access), + TP_ARGS(fault, access), TP_STRUCT__entry( __field(u64, gfn) __field(u64, pfn) __field(u8, level) + __field(u8, access) ), TP_fast_assign( __entry->gfn = fault->gfn; __entry->pfn = fault->pfn | (fault->gfn & (KVM_PAGES_PER_HPAGE(fault->goal_level) - 1)); __entry->level = fault->goal_level; + __entry->access = access; ), - TP_printk("gfn %llx pfn %llx level %d", - __entry->gfn, __entry->pfn, __entry->level + TP_printk("gfn %llx pfn %llx level %d access %x", + __entry->gfn, __entry->pfn, __entry->level, __entry->access ) ); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 901cd2bd40b84..07100bbfc2701 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -124,12 +124,17 @@ static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *acce *access &= mask; } -static inline int FNAME(is_present_gpte)(unsigned long pte) +static inline int FNAME(is_present_gpte)(struct kvm_mmu *mmu, + unsigned long pte) { #if PTTYPE != PTTYPE_EPT return pte & PT_PRESENT_MASK; #else - return pte & 7; + /* + * For EPT, an entry is present if any of bits 2:0 are set. + * With mode-based execute control, bit 10 also indicates presence. + */ + return pte & (7 | (mmu_has_mbec(mmu) ? VMX_EPT_USER_EXECUTABLE_MASK : 0)); #endif } @@ -152,7 +157,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, u64 gpte) { - if (!FNAME(is_present_gpte)(gpte)) + if (!FNAME(is_present_gpte)(vcpu->arch.mmu, gpte)) goto no_present; /* Prefetch only accessed entries (unless A/D bits are disabled). */ @@ -170,25 +175,31 @@ no_present: return true; } -/* - * For PTTYPE_EPT, a page table can be executable but not readable - * on supported processors. Therefore, set_spte does not automatically - * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK - * to signify readability since it isn't used in the EPT case - */ static inline unsigned FNAME(gpte_access)(u64 gpte) { unsigned access; + /* + * Set bits in ACC_*_MASK even if they might not be used in the + * actual checks. For example, if EFER.NX is clear permission_fault() + * will ignore ACC_EXEC_MASK, and if MBEC is disabled it will + * ignore ACC_USER_EXEC_MASK. + */ #if PTTYPE == PTTYPE_EPT access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) | ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | - ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0); + ((gpte & VMX_EPT_READABLE_MASK) ? ACC_READ_MASK : 0) | + ((gpte & VMX_EPT_USER_EXECUTABLE_MASK) ? ACC_USER_EXEC_MASK : 0); #else - BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK); - BUILD_BUG_ON(ACC_EXEC_MASK != 1); + /* + * P is set here, so the page is always readable and W/U/!NX represent + * allowed accesses. + */ + BUILD_BUG_ON(ACC_READ_MASK != PT_PRESENT_MASK); + BUILD_BUG_ON(ACC_WRITE_MASK != PT_WRITABLE_MASK); + BUILD_BUG_ON(ACC_USER_MASK != PT_USER_MASK); + BUILD_BUG_ON(ACC_EXEC_MASK & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK)); access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK); - /* Combine NX with P (which is set here) to get ACC_EXEC_MASK. */ - access ^= (gpte >> PT64_NX_SHIFT); + access |= gpte & PT64_NX_MASK ? 0 : ACC_EXEC_MASK; #endif return access; @@ -332,7 +343,7 @@ retry_walk: if (walker->level == PT32E_ROOT_LEVEL) { pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); - if (!FNAME(is_present_gpte)(pte)) + if (!FNAME(is_present_gpte)(mmu, pte)) goto error; --walker->level; } @@ -377,7 +388,8 @@ retry_walk: walker->pte_gpa[walker->level - 1] = pte_gpa; real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn), - nested_access, &walker->fault); + nested_access | PFERR_GUEST_PAGE_MASK, + &walker->fault, 0); /* * FIXME: This can happen if emulation (for of an INS/OUTS @@ -414,7 +426,7 @@ retry_walk: */ pte_access = pt_access & (pte ^ walk_nx_mask); - if (unlikely(!FNAME(is_present_gpte)(pte))) + if (unlikely(!FNAME(is_present_gpte)(mmu, pte))) goto error; if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) { @@ -445,7 +457,9 @@ retry_walk: gfn += pse36_gfn_delta(pte); #endif - real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault); + real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), + access | PFERR_GUEST_FINAL_MASK, + &walker->fault, walker->pte_access); if (real_gpa == INVALID_GPA) return 0; @@ -475,7 +489,7 @@ retry_walk: error: errcode |= write_fault | user_fault; - if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu))) + if (fetch_fault && has_pferr_fetch(mmu)) errcode |= PFERR_FETCH_MASK; walker->fault.vector = PF_VECTOR; @@ -492,7 +506,7 @@ error: * [2:0] - Derive from the access bits. The exit_qualification might be * out of date if it is serving an EPT misconfiguration. * [5:3] - Calculated by the page walk of the guest EPT page tables - * [7:8] - Derived from [7:8] of real exit_qualification + * [7:11] - Derived from [7:11] of real exit_qualification * * The other bits are set to 0. */ @@ -501,16 +515,27 @@ error: if (write_fault) walker->fault.exit_qualification |= EPT_VIOLATION_ACC_WRITE; - if (user_fault) - walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ; - if (fetch_fault) + else if (fetch_fault) walker->fault.exit_qualification |= EPT_VIOLATION_ACC_INSTR; + else + walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ; + + /* + * Accesses to guest paging structures are either "reads" or + * "read+write" accesses, so consider them the latter if write_fault + * is true. + */ + if (access & PFERR_GUEST_PAGE_MASK) + walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ; /* * Note, pte_access holds the raw RWX bits from the EPTE, not * ACC_*_MASK flags! */ walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access); + if (mmu_has_mbec(mmu)) + walker->fault.exit_qualification |= + EPT_VIOLATION_USER_EXEC_TO_PROT(pte_access); } #endif walker->fault.address = addr; @@ -709,7 +734,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, */ kvm_mmu_hugepage_adjust(vcpu, fault); - trace_kvm_mmu_spte_requested(fault); + trace_kvm_mmu_spte_requested(fault, gw->pte_access); for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { /* diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 4e753386c8d46..72d2394e089c9 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -30,8 +30,9 @@ bool __read_mostly kvm_ad_enabled; u64 __read_mostly shadow_host_writable_mask; u64 __read_mostly shadow_mmu_writable_mask; u64 __read_mostly shadow_nx_mask; -u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ u64 __read_mostly shadow_user_mask; +u64 __read_mostly shadow_xs_mask; /* mutual exclusive with nx_mask and user_mask */ +u64 __read_mostly shadow_xu_mask; /* mutual exclusive with nx_mask and user_mask */ u64 __read_mostly shadow_accessed_mask; u64 __read_mostly shadow_dirty_mask; u64 __read_mostly shadow_mmio_value; @@ -195,12 +196,6 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int is_host_mmio = -1; bool wrprot = false; - /* - * For the EPT case, shadow_present_mask has no RWX bits set if - * exec-only page table entries are supported. In that case, - * ACC_USER_MASK and shadow_user_mask are used to represent - * read access. See FNAME(gpte_access) in paging_tmpl.h. - */ WARN_ON_ONCE((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE); if (sp->role.ad_disabled) @@ -224,18 +219,26 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * would tie make_spte() further to vCPU/MMU state, and add complexity * just to optimize a mode that is anything but performance critical. */ - if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && - is_nx_huge_page_enabled(vcpu->kvm)) { + if (level > PG_LEVEL_4K && is_nx_huge_page_enabled(vcpu->kvm)) { pte_access &= ~ACC_EXEC_MASK; + if (shadow_xu_mask) + pte_access &= ~ACC_USER_EXEC_MASK; } - if (pte_access & ACC_EXEC_MASK) - spte |= shadow_x_mask; - else - spte |= shadow_nx_mask; + if (pte_access & ACC_READ_MASK) + spte |= PT_PRESENT_MASK; /* or VMX_EPT_READABLE_MASK */ - if (pte_access & ACC_USER_MASK) - spte |= shadow_user_mask; + if (shadow_nx_mask) { + if (!(pte_access & ACC_EXEC_MASK)) + spte |= shadow_nx_mask; + if (pte_access & ACC_USER_MASK) + spte |= shadow_user_mask; + } else { + if (pte_access & ACC_EXEC_MASK) + spte |= shadow_xs_mask; + if (pte_access & ACC_USER_EXEC_MASK) + spte |= shadow_xu_mask; + } if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; @@ -318,14 +321,18 @@ static u64 modify_spte_protections(u64 spte, u64 set, u64 clear) return spte; } -static u64 make_spte_executable(u64 spte) +static u64 change_spte_executable(u64 spte, u8 access) { - return modify_spte_protections(spte, shadow_x_mask, shadow_nx_mask); -} + u64 set, clear; -static u64 make_spte_nonexecutable(u64 spte) -{ - return modify_spte_protections(spte, shadow_nx_mask, shadow_x_mask); + if (shadow_nx_mask) + set = (access & ACC_EXEC_MASK) ? 0 : shadow_nx_mask; + else + set = + (access & ACC_EXEC_MASK ? shadow_xs_mask : 0) | + (access & ACC_USER_EXEC_MASK ? shadow_xu_mask : 0); + clear = set ^ (shadow_nx_mask | shadow_xs_mask | shadow_xu_mask); + return modify_spte_protections(spte, set, clear); } /* @@ -357,8 +364,8 @@ u64 make_small_spte(struct kvm *kvm, u64 huge_spte, * the page executable as the NX hugepage mitigation no longer * applies. */ - if ((role.access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(kvm)) - child_spte = make_spte_executable(child_spte); + if (is_nx_huge_page_enabled(kvm)) + child_spte = change_spte_executable(child_spte, role.access); } return child_spte; @@ -380,7 +387,7 @@ u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level) huge_spte &= KVM_HPAGE_MASK(level) | ~PAGE_MASK; if (is_nx_huge_page_enabled(kvm)) - huge_spte = make_spte_nonexecutable(huge_spte); + huge_spte = change_spte_executable(huge_spte, 0); return huge_spte; } @@ -390,7 +397,8 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) u64 spte = SPTE_MMU_PRESENT_MASK; spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_me_value; + PT_PRESENT_MASK /* or VMX_EPT_READABLE_MASK */ | + shadow_user_mask | shadow_xs_mask | shadow_xu_mask | shadow_me_value; if (ad_disabled) spte |= SPTE_TDP_AD_DISABLED; @@ -490,20 +498,37 @@ void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_me_spte_mask); -void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) +void kvm_mmu_set_ept_masks(bool has_ad_bits) { kvm_ad_enabled = has_ad_bits; - shadow_user_mask = VMX_EPT_READABLE_MASK; + shadow_user_mask = 0; shadow_accessed_mask = VMX_EPT_ACCESS_BIT; shadow_dirty_mask = VMX_EPT_DIRTY_BIT; shadow_nx_mask = 0ull; - shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; - /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */ - shadow_present_mask = - (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT; + shadow_xs_mask = VMX_EPT_EXECUTABLE_MASK; + + /* + * The MMU always maps ACC_EXEC_MASK and ACC_USER_EXEC_MASK to the + * XS and XU bits of shadow EPT entries, regardless of whether MBEC + * is available on the host or enabled in the VMCS. + * + * For the non-nested case, pages are mapped with ACC_EXEC_MASK + * and ACC_USER_EXEC_MASK set in tandem, so XS == XU and the + * host's MBEC setting does not matter. On hardware without MBEC + * the XU bit is reserved-as-ignored, and setting it does no harm. + * + * For nested EPT, when MBEC is disabled by L1, correctness relies + * on (a) ignoring bit 10 of the gPTE in is_present_gpte(), rather + * than treating it as a present bit, and (b) permission_fault() + * using an mmu->permissions[] array that effectively ignores + * ACC_USER_EXEC_MASK. Bit 10 of the gPTE does end up mirrored + * in the sPTEs but is ignored because L2 runs with MBEC disabled. + */ + shadow_xu_mask = VMX_EPT_USER_EXECUTABLE_MASK; + shadow_present_mask = VMX_EPT_SUPPRESS_VE_BIT; - shadow_acc_track_mask = VMX_EPT_RWX_MASK; + shadow_acc_track_mask = VMX_EPT_RWX_MASK | VMX_EPT_USER_EXECUTABLE_MASK; shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; @@ -551,7 +576,8 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_accessed_mask = PT_ACCESSED_MASK; shadow_dirty_mask = PT_DIRTY_MASK; shadow_nx_mask = PT64_NX_MASK; - shadow_x_mask = 0; + shadow_xs_mask = 0; + shadow_xu_mask = 0; shadow_present_mask = PT_PRESENT_MASK; shadow_acc_track_mask = 0; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 8c0ffa2cded69..13eea94dd2128 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -18,9 +18,19 @@ #define SPTE_MMU_PRESENT_MASK BIT_ULL(11) /* + * The ignored high bits are allocated as follows: + * - bits 52, 54: saved X-R bits for access tracking when EPT does not have A/D + * - bits 53 (EPT only): host writable + * - bits 55 (EPT only): MMU-writable + * - bits 56-59: unused + * - bits 60-61: type of A/D tracking + * - bits 62 (EPT only): saved XU bit for disabled AD + */ + +/* * TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also * be restricted to using write-protection (for L2 when CPU dirty logging, i.e. - * PML, is enabled). Use bits 52 and 53 to hold the type of A/D tracking that + * PML, is enabled). Use bits 60 and 61 to hold the type of A/D tracking that * is must be employed for a given TDP SPTE. * * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE @@ -29,7 +39,7 @@ * TDP with CPU dirty logging (PML). If NPT ever gains PML-like support, it * must be restricted to 64-bit KVM. */ -#define SPTE_TDP_AD_SHIFT 52 +#define SPTE_TDP_AD_SHIFT 60 #define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT) #define SPTE_TDP_AD_ENABLED (0ULL << SPTE_TDP_AD_SHIFT) #define SPTE_TDP_AD_DISABLED (1ULL << SPTE_TDP_AD_SHIFT) @@ -42,18 +52,6 @@ static_assert(SPTE_TDP_AD_ENABLED == 0); #define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) #endif -#define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ - | shadow_x_mask | shadow_nx_mask | shadow_me_mask) - -#define ACC_EXEC_MASK 1 -#define ACC_WRITE_MASK PT_WRITABLE_MASK -#define ACC_USER_MASK PT_USER_MASK -#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) - -/* The mask for the R/X bits in EPT PTEs */ -#define SPTE_EPT_READABLE_MASK 0x1ull -#define SPTE_EPT_EXECUTABLE_MASK 0x4ull - #define SPTE_LEVEL_BITS 9 #define SPTE_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS) #define SPTE_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS) @@ -66,9 +64,10 @@ static_assert(SPTE_TDP_AD_ENABLED == 0); * restored only when a write is attempted to the page. This mask obviously * must not overlap the A/D type mask. */ -#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \ - SPTE_EPT_EXECUTABLE_MASK) -#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54 +#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (VMX_EPT_READABLE_MASK | \ + VMX_EPT_EXECUTABLE_MASK | \ + VMX_EPT_USER_EXECUTABLE_MASK) +#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 52 #define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK)); @@ -87,8 +86,8 @@ static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK)); * to not overlap the A/D type mask or the saved access bits of access-tracked * SPTEs when A/D bits are disabled. */ -#define EPT_SPTE_HOST_WRITABLE BIT_ULL(57) -#define EPT_SPTE_MMU_WRITABLE BIT_ULL(58) +#define EPT_SPTE_HOST_WRITABLE BIT_ULL(53) +#define EPT_SPTE_MMU_WRITABLE BIT_ULL(55) static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK)); static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK)); @@ -99,11 +98,11 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); #undef SHADOW_ACC_TRACK_SAVED_MASK /* - * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of + * Due to limited space in PTEs, the MMIO generation is an 18 bit subset of * the memslots generation and is derived as follows: * - * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10 - * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62 + * Bits 0-6 of the MMIO generation are propagated to spte bits 3-9 + * Bits 7-17 of the MMIO generation are propagated to spte bits 52-62 * * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in * the MMIO generation number, as doing so would require stealing a bit from @@ -114,7 +113,7 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); */ #define MMIO_SPTE_GEN_LOW_START 3 -#define MMIO_SPTE_GEN_LOW_END 10 +#define MMIO_SPTE_GEN_LOW_END 9 #define MMIO_SPTE_GEN_HIGH_START 52 #define MMIO_SPTE_GEN_HIGH_END 62 @@ -136,7 +135,8 @@ static_assert(!(SPTE_MMU_PRESENT_MASK & * and so they're off-limits for generation; additional checks ensure the mask * doesn't overlap legal PA bits), and bit 63 (carved out for future usage). */ -#define SPTE_MMIO_ALLOWED_MASK (BIT_ULL(63) | GENMASK_ULL(51, 12) | GENMASK_ULL(2, 0)) +#define SPTE_MMIO_ALLOWED_MASK (BIT_ULL(63) | GENMASK_ULL(51, 12) | \ + BIT_ULL(10) | GENMASK_ULL(2, 0)) static_assert(!(SPTE_MMIO_ALLOWED_MASK & (SPTE_MMU_PRESENT_MASK | MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK))); @@ -144,7 +144,7 @@ static_assert(!(SPTE_MMIO_ALLOWED_MASK & #define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1) /* remember to adjust the comment above as well if you change these */ -static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11); +static_assert(MMIO_SPTE_GEN_LOW_BITS == 7 && MMIO_SPTE_GEN_HIGH_BITS == 11); #define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0) #define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS) @@ -179,8 +179,9 @@ extern bool __read_mostly kvm_ad_enabled; extern u64 __read_mostly shadow_host_writable_mask; extern u64 __read_mostly shadow_mmu_writable_mask; extern u64 __read_mostly shadow_nx_mask; -extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ extern u64 __read_mostly shadow_user_mask; +extern u64 __read_mostly shadow_xs_mask; /* mutual exclusive with nx_mask and user_mask */ +extern u64 __read_mostly shadow_xu_mask; /* mutual exclusive with nx_mask and user_mask */ extern u64 __read_mostly shadow_accessed_mask; extern u64 __read_mostly shadow_dirty_mask; extern u64 __read_mostly shadow_mmio_value; @@ -220,10 +221,11 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; * * Only used by the TDP MMU. */ -#define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL) +#define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x1a0ULL) -/* Frozen SPTEs must not be misconstrued as shadow present PTEs. */ -static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK)); +/* Frozen SPTEs must not be misconstrued as shadow or MMU present PTEs. */ +static_assert(!(FROZEN_SPTE & (SPTE_MMU_PRESENT_MASK | + VMX_EPT_RWX_MASK | VMX_EPT_USER_EXECUTABLE_MASK))); static inline bool is_frozen_spte(u64 spte) { @@ -357,7 +359,13 @@ static inline bool is_last_spte(u64 pte, int level) static inline bool is_executable_pte(u64 spte) { - return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; + /* + * For now, return true if either the XS or XU bit is set + * This function is only used for fast_page_fault, + * which never processes shadow EPT, and regular page + * tables always have XS==XU. + */ + return (spte & (shadow_xs_mask | shadow_xu_mask | shadow_nx_mask)) != shadow_nx_mask; } static inline kvm_pfn_t spte_to_pfn(u64 pte) @@ -387,6 +395,8 @@ static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte) { + if (pte & VMX_EPT_USER_EXECUTABLE_MASK) + pte |= VMX_EPT_EXECUTABLE_MASK; return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7b1102d26f9cd..5a2f8ce9a32b8 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1185,9 +1185,9 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, } if (unlikely(!fault->slot)) - new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); + new_spte = make_mmio_spte(vcpu, iter->gfn, sp->role.access); else - wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, + wrprot = make_spte(vcpu, sp, fault->slot, sp->role.access, iter->gfn, fault->pfn, iter->old_spte, fault->prefetch, false, fault->map_writable, &new_spte); @@ -1272,7 +1272,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) kvm_mmu_hugepage_adjust(vcpu, fault); - trace_kvm_mmu_spte_requested(fault); + trace_kvm_mmu_spte_requested(fault, root->role.access); rcu_read_lock(); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index b340dc9991adb..1bf3e4804ad0a 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -93,9 +93,10 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) * when called via KVM_SET_NESTED_STATE, that state may _not_ match current * vCPU state. CR0.WP is explicitly ignored, while CR0.PG is required. */ - kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4, + kvm_init_shadow_npt_mmu(vcpu, svm->vmcb01.ptr->save.cr4, svm->vmcb01.ptr->save.efer, - svm->nested.ctl.nested_cr3); + svm->nested.ctl.nested_cr3, + svm->nested.ctl.misc_ctl); vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3; vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit; @@ -498,11 +499,14 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, nested_svm_sanitize_intercept(vcpu, to, SKINIT); nested_svm_sanitize_intercept(vcpu, to, RDPRU); - /* Always clear SVM_MISC_ENABLE_NP if the guest cannot use NPTs */ + /* Always clear misc_ctl bits that the guest cannot use */ to->misc_ctl = from->misc_ctl; if (!guest_cpu_cap_has(vcpu, X86_FEATURE_NPT)) to->misc_ctl &= ~SVM_MISC_ENABLE_NP; + if (!gmet_enabled || !guest_cpu_cap_has(vcpu, X86_FEATURE_GMET)) + to->misc_ctl &= ~SVM_MISC_ENABLE_GMET; + to->iopm_base_pa = from->iopm_base_pa & PAGE_MASK; to->msrpm_base_pa = from->msrpm_base_pa & PAGE_MASK; to->tsc_offset = from->tsc_offset; @@ -866,7 +870,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) * the latter, L1 runs L2 with shadow page tables that translate L2 GVAs * to L1 GPAs, so the same NPTs can be used for L1 and L2. */ - vmcb02->control.misc_ctl = vmcb01->control.misc_ctl & SVM_MISC_ENABLE_NP; + vmcb02->control.misc_ctl = vmcb01->control.misc_ctl & (SVM_MISC_ENABLE_NP | SVM_MISC_ENABLE_GMET); vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa; vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP); @@ -903,9 +907,13 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) /* Also overwritten later if necessary. */ vmcb02->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; - /* nested_cr3. */ - if (nested_npt_enabled(svm)) + /* Use vmcb01 MMU and format if guest does not use nNPT */ + if (nested_npt_enabled(svm)) { + vmcb02->control.misc_ctl &= ~SVM_MISC_ENABLE_GMET; + vmcb02->control.misc_ctl |= (svm->nested.ctl.misc_ctl & SVM_MISC_ENABLE_GMET); + nested_svm_init_mmu_context(vcpu); + } vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(vcpu->arch.l1_tsc_offset, vmcb12_ctrl->tsc_offset, @@ -2056,8 +2064,26 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) return true; } +static gpa_t svm_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, + u64 access, + struct x86_exception *exception, + u64 pte_access) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_mmu *mmu = vcpu->arch.mmu; + + BUG_ON(!mmu_is_nested(vcpu)); + + /* Non-GMET walks are always user-walks */ + if (!(svm->nested.ctl.misc_ctl & SVM_MISC_ENABLE_GMET)) + access |= PFERR_USER_MASK; + + return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception); +} + struct kvm_x86_nested_ops svm_nested_ops = { .leave_nested = svm_leave_nested, + .translate_nested_gpa = svm_translate_nested_gpa, .is_exception_vmexit = nested_svm_is_exception_vmexit, .check_events = svm_check_nested_events, .triple_fault = nested_svm_triple_fault, diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index e107f368ed2dd..18a7cddb097d2 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -968,7 +968,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->r14 = svm->vcpu.arch.regs[VCPU_REGS_R14]; save->r15 = svm->vcpu.arch.regs[VCPU_REGS_R15]; #endif - save->rip = svm->vcpu.arch.regs[VCPU_REGS_RIP]; + save->rip = svm->vcpu.arch.rip; /* Sync some non-GPR registers before encrypting */ save->xcr0 = svm->vcpu.arch.xcr0; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d38a21be099d6..adfa9ff48c573 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -51,6 +51,7 @@ #include "trace.h" +#include "vmenter.h" #include "svm.h" #include "svm_ops.h" @@ -139,6 +140,9 @@ module_param(pause_filter_count_max, ushort, 0444); bool __ro_after_init npt_enabled = true; module_param_named(npt, npt_enabled, bool, 0444); +bool gmet_enabled = true; +module_param_named(gmet, gmet_enabled, bool, 0444); + /* allow nested virtualization in KVM/SVM */ static int __ro_after_init nested = true; module_param(nested, int, 0444); @@ -662,7 +666,7 @@ static void clr_dr_intercepts(struct vcpu_svm *svm) svm_mark_intercepts_dirty(svm); } -static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) +static bool msr_write_intercepted(struct vcpu_svm *svm, u32 msr) { /* * For non-nested case: @@ -673,8 +677,7 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) * If the L02 MSR bitmap does not intercept the MSR, then we need to * save it. */ - void *msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm : - to_svm(vcpu)->msrpm; + void *msrpm = is_guest_mode(&svm->vcpu) ? svm->nested.msrpm : svm->msrpm; return svm_test_msr_bitmap_write(msrpm, msr); } @@ -1221,6 +1224,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event) save->g_pat = vcpu->arch.pat; save->cr3 = 0; } + + if (gmet_enabled) + control->misc_ctl |= SVM_MISC_ENABLE_GMET; + svm->current_vmcb->asid_generation = 0; svm->asid = 0; @@ -1529,7 +1536,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) kvm_register_mark_available(vcpu, reg); switch (reg) { - case VCPU_EXREG_PDPTR: + case VCPU_REG_PDPTR: /* * When !npt_enabled, mmu->pdptrs[] is already available since * it is always updated per SDM when moving to CRs. @@ -1998,6 +2005,18 @@ static int npf_interception(struct kvm_vcpu *vcpu) } } + if (!is_sev_es_guest(vcpu) && + (svm->vmcb->control.misc_ctl & SVM_MISC_ENABLE_GMET) && + (error_code & PFERR_FETCH_MASK)) { + /* + * Work around errata 1218: EXITINFO1[2] May Be Incorrectly Set + * When GMET (Guest Mode Execute Trap extension) is Enabled + */ + error_code |= PFERR_USER_MASK; + if (svm_get_cpl(vcpu) != 3) + error_code &= ~PFERR_USER_MASK; + } + if (is_sev_snp_guest(vcpu) && (error_code & PFERR_GUEST_ENC_MASK)) error_code |= PFERR_PRIVATE_ACCESS; @@ -2776,7 +2795,7 @@ static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, { return is_sev_es_guest(vcpu) && vcpu->arch.guest_state_protected && msr_info->index != MSR_IA32_XSS && - !msr_write_intercepted(vcpu, msr_info->index); + !msr_write_intercepted(to_svm(vcpu), msr_info->index); } static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) @@ -4191,7 +4210,7 @@ static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) static void svm_flush_tlb_guest(struct kvm_vcpu *vcpu) { - kvm_register_mark_dirty(vcpu, VCPU_EXREG_ERAPS); + kvm_register_mark_dirty(vcpu, VCPU_REG_ERAPS); svm_flush_tlb_asid(vcpu); } @@ -4390,7 +4409,7 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) return EXIT_FASTPATH_NONE; } -static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted) +static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, unsigned enter_flags) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); struct vcpu_svm *svm = to_svm(vcpu); @@ -4412,10 +4431,10 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in amd_clear_divider(); if (is_sev_es_guest(vcpu)) - __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted, + __svm_sev_es_vcpu_run(svm, enter_flags, sev_es_host_save_area(sd)); else - __svm_vcpu_run(svm, spec_ctrl_intercepted); + __svm_vcpu_run(svm, enter_flags); raw_local_irq_disable(); @@ -4426,13 +4445,16 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) { bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; struct vcpu_svm *svm = to_svm(vcpu); - bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); + unsigned enter_flags = 0; + + if (!msr_write_intercepted(svm, MSR_IA32_SPEC_CTRL)) + enter_flags |= KVM_ENTER_SAVE_SPEC_CTRL; trace_kvm_entry(vcpu, force_immediate_exit); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; + svm->vmcb->save.rip = vcpu->arch.rip; /* * Disable singlestep if we're injecting an interrupt/exception. @@ -4469,7 +4491,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) svm->vmcb->save.cr2 = vcpu->arch.cr2; if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS) && - kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS)) + kvm_register_is_dirty(vcpu, VCPU_REG_ERAPS)) svm->vmcb->control.erap_ctl |= ERAP_CONTROL_CLEAR_RAP; svm_fixup_nested_rips(vcpu); @@ -4509,7 +4531,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) x86_spec_ctrl_set_guest(svm->virt_spec_ctrl); - svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted); + svm_vcpu_enter_exit(vcpu, enter_flags); if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) x86_spec_ctrl_restore_host(svm->virt_spec_ctrl); @@ -4518,9 +4540,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) vcpu->arch.cr2 = svm->vmcb->save.cr2; vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; - vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; + vcpu->arch.rip = svm->vmcb->save.rip; } - vcpu->arch.regs_dirty = 0; + kvm_reset_dirty_registers(vcpu); if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); @@ -4566,9 +4588,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); - vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; + kvm_clear_available_registers(vcpu, SVM_REGS_LAZY_LOAD_SET); - if (!msr_write_intercepted(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL)) + if (!msr_write_intercepted(svm, MSR_AMD64_PERF_CNTR_GLOBAL_CTL)) rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, vcpu_to_pmu(vcpu)->global_ctrl); trace_kvm_exit(vcpu, KVM_ISA_SVM); @@ -4624,6 +4646,11 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xd9; } +static bool svm_tdp_has_smep(struct kvm *kvm) +{ + return gmet_enabled; +} + /* * The kvm parameter can be NULL (module initialization, or invocation before * VM creation). Be sure to check the kvm parameter before using it. @@ -4958,7 +4985,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; + svm->vmcb->save.rip = vcpu->arch.rip; nested_svm_simple_vmexit(svm, SVM_EXIT_SW); @@ -5367,6 +5394,7 @@ struct kvm_x86_ops svm_x86_ops __initdata = { .write_tsc_multiplier = svm_write_tsc_multiplier, .load_mmu_pgd = svm_load_mmu_pgd, + .tdp_has_smep = svm_tdp_has_smep, .check_intercept = svm_check_intercept, .handle_exit_irqoff = svm_handle_exit_irqoff, @@ -5491,6 +5519,9 @@ static __init void svm_set_cpu_caps(void) if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD); + if (gmet_enabled) + kvm_cpu_cap_set(X86_FEATURE_GMET); + if (vgif) kvm_cpu_cap_set(X86_FEATURE_VGIF); @@ -5600,6 +5631,9 @@ static __init int svm_hardware_setup(void) if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; + if (!npt_enabled || !boot_cpu_has(X86_FEATURE_GMET)) + gmet_enabled = false; + /* Force VM NPT level equal to the host's paging level */ kvm_configure_mmu(npt_enabled, get_npt_level(), get_npt_level(), PG_LEVEL_1G); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index a10668d17a16a..19b80ef56e2b7 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -44,6 +44,7 @@ static inline struct page *__sme_pa_to_page(unsigned long pa) #define IOPM_SIZE PAGE_SIZE * 3 #define MSRPM_SIZE PAGE_SIZE * 2 +extern bool gmet_enabled; extern bool npt_enabled; extern int nrips; extern int vgif; @@ -484,7 +485,7 @@ static inline bool svm_is_vmrun_failure(u64 exit_code) * KVM_REQ_LOAD_MMU_PGD is always requested when the cached vcpu->arch.cr3 * is changed. svm_load_mmu_pgd() then syncs the new CR3 value into the VMCB. */ -#define SVM_REGS_LAZY_LOAD_SET (1 << VCPU_EXREG_PDPTR) +#define SVM_REGS_LAZY_LOAD_SET (BIT(VCPU_REG_PDPTR)) static inline void __vmcb_set_intercept(unsigned long *intercepts, u32 bit) { @@ -1007,9 +1008,9 @@ static inline void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_sa /* vmenter.S */ -void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted, +void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, unsigned int flags, struct sev_es_save_area *hostsa); -void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); +void __svm_vcpu_run(struct vcpu_svm *svm, unsigned int flags); #define DEFINE_KVM_GHCB_ACCESSORS(field) \ static __always_inline u64 kvm_ghcb_get_##field(struct vcpu_svm *svm) \ diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index d47c5c93c9913..f523d9e498398 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -7,6 +7,7 @@ #include <asm/kvm_vcpu_regs.h> #include <asm/nospec-branch.h> #include "kvm-asm-offsets.h" +#include "vmenter.h" #define WORD_SIZE (BITS_PER_LONG / 8) @@ -39,38 +40,6 @@ ALTERNATIVE_2 "", \ "jmp 800f", X86_FEATURE_MSR_SPEC_CTRL, \ "", X86_FEATURE_V_SPEC_CTRL -801: -.endm -.macro RESTORE_GUEST_SPEC_CTRL_BODY -800: - /* - * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the - * host's, write the MSR. This is kept out-of-line so that the common - * case does not have to jump. - * - * IMPORTANT: To avoid RSB underflow attacks and any other nastiness, - * there must not be any returns or indirect branches between this code - * and vmentry. - */ -#ifdef CONFIG_X86_64 - mov SVM_spec_ctrl(%rdi), %rdx - cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx - je 801b - movl %edx, %eax - shr $32, %rdx -#else - mov SVM_spec_ctrl(%edi), %eax - mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx - xor %eax, %ecx - mov SVM_spec_ctrl + 4(%edi), %edx - mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %esi - xor %edx, %esi - or %esi, %ecx - je 801b -#endif - mov $MSR_IA32_SPEC_CTRL, %ecx - wrmsr - jmp 801b .endm .macro RESTORE_HOST_SPEC_CTRL @@ -78,42 +47,6 @@ ALTERNATIVE_2 "", \ "jmp 900f", X86_FEATURE_MSR_SPEC_CTRL, \ "", X86_FEATURE_V_SPEC_CTRL -901: -.endm -.macro RESTORE_HOST_SPEC_CTRL_BODY spec_ctrl_intercepted:req -900: - /* Same for after vmexit. */ - mov $MSR_IA32_SPEC_CTRL, %ecx - - /* - * Load the value that the guest had written into MSR_IA32_SPEC_CTRL, - * if it was not intercepted during guest execution. - */ - cmpb $0, \spec_ctrl_intercepted - jnz 998f - rdmsr - movl %eax, SVM_spec_ctrl(%_ASM_DI) - movl %edx, SVM_spec_ctrl + 4(%_ASM_DI) -998: - /* Now restore the host value of the MSR if different from the guest's. */ -#ifdef CONFIG_X86_64 - mov PER_CPU_VAR(x86_spec_ctrl_current), %rdx - cmp SVM_spec_ctrl(%rdi), %rdx - je 901b - movl %edx, %eax - shr $32, %rdx -#else - mov PER_CPU_VAR(x86_spec_ctrl_current), %eax - mov SVM_spec_ctrl(%edi), %esi - xor %eax, %esi - mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edx - mov SVM_spec_ctrl + 4(%edi), %edi - xor %edx, %edi - or %edi, %esi - je 901b -#endif - wrmsr - jmp 901b .endm #define SVM_CLEAR_CPU_BUFFERS \ @@ -121,8 +54,8 @@ /** * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode - * @svm: struct vcpu_svm * - * @spec_ctrl_intercepted: bool + * @svm: struct vcpu_svm * + * @enter_flags: u32 */ SYM_FUNC_START(__svm_vcpu_run) push %_ASM_BP @@ -162,6 +95,7 @@ SYM_FUNC_START(__svm_vcpu_run) /* Clobbers RAX, RCX, RDX (and ESI on 32-bit), consumes RDI (@svm). */ RESTORE_GUEST_SPEC_CTRL +801: /* * Use a single vmcb (vmcb01 because it's always valid) for @@ -242,6 +176,7 @@ SYM_FUNC_START(__svm_vcpu_run) * and RSP (pointer to @spec_ctrl_intercepted). */ RESTORE_HOST_SPEC_CTRL +901: /* * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be @@ -278,7 +213,7 @@ SYM_FUNC_START(__svm_vcpu_run) xor %r15d, %r15d #endif - /* "Pop" @spec_ctrl_intercepted. */ + /* "Pop" @enter_flags. */ pop %_ASM_BX pop %_ASM_BX @@ -295,8 +230,12 @@ SYM_FUNC_START(__svm_vcpu_run) pop %_ASM_BP RET - RESTORE_GUEST_SPEC_CTRL_BODY - RESTORE_HOST_SPEC_CTRL_BODY (%_ASM_SP) +800: + RESTORE_GUEST_SPEC_CTRL_BODY SVM_spec_ctrl(%_ASM_DI), 801b + jmp 801b +900: + RESTORE_HOST_SPEC_CTRL_BODY SVM_spec_ctrl(%_ASM_DI), (%_ASM_SP), 901b + jmp 901b 10: cmpb $0, _ASM_RIP(virt_rebooting) jne 2b @@ -335,8 +274,8 @@ SYM_FUNC_END(__svm_vcpu_run) /** * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode - * @svm: struct vcpu_svm * - * @spec_ctrl_intercepted: bool + * @svm: struct vcpu_svm * + * @enter_flags: u32 */ SYM_FUNC_START(__svm_sev_es_vcpu_run) FRAME_BEGIN @@ -355,13 +294,14 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) /* * Save volatile registers that hold arguments that are needed after - * #VMEXIT (RDI=@svm and RSI=@spec_ctrl_intercepted). + * #VMEXIT (RDI=@svm and RSI=@enter_flags). */ mov %rdi, SEV_ES_RDI (%rdx) mov %rsi, SEV_ES_RSI (%rdx) /* Clobbers RAX, RCX, and RDX (@hostsa), consumes RDI (@svm). */ RESTORE_GUEST_SPEC_CTRL +801: /* Get svm->current_vmcb->pa into RAX. */ mov SVM_current_vmcb(%rdi), %rax @@ -376,8 +316,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT - /* Clobbers RAX, RCX, RDX, consumes RDI (@svm) and RSI (@spec_ctrl_intercepted). */ + /* Clobbers RAX, RCX, RDX, consumes RDI (@svm) and RSI (@enter_flags). */ RESTORE_HOST_SPEC_CTRL +901: /* * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be @@ -391,8 +332,12 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) FRAME_END RET - RESTORE_GUEST_SPEC_CTRL_BODY - RESTORE_HOST_SPEC_CTRL_BODY %sil +800: + RESTORE_GUEST_SPEC_CTRL_BODY SVM_spec_ctrl(%_ASM_DI), 801b + jmp 801b +900: + RESTORE_HOST_SPEC_CTRL_BODY SVM_spec_ctrl(%_ASM_DI), %esi, 901b + jmp 901b 3: cmpb $0, virt_rebooting(%rip) jne 2b diff --git a/arch/x86/kvm/vmenter.h b/arch/x86/kvm/vmenter.h new file mode 100644 index 0000000000000..ba3f71449c62c --- /dev/null +++ b/arch/x86/kvm/vmenter.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMENTER_H +#define __KVM_X86_VMENTER_H + +#define KVM_ENTER_VMRESUME BIT(0) +#define KVM_ENTER_SAVE_SPEC_CTRL BIT(1) +#define KVM_ENTER_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(2) + +#ifdef __ASSEMBLER__ +.macro RESTORE_GUEST_SPEC_CTRL_BODY guest_spec_ctrl:req, label:req + /* + * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the + * host's, write the MSR. This is kept out-of-line so that the common + * case does not have to jump. + * + * IMPORTANT: To avoid RSB underflow attacks and any other nastiness, + * there must not be any returns or indirect branches between this code + * and vmentry. + */ +#ifdef CONFIG_X86_64 + mov \guest_spec_ctrl, %rdx + cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx + je \label + movl %edx, %eax + shr $32, %rdx +#else + mov \guest_spec_ctrl, %eax + mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx + xor %eax, %ecx + mov 4 + \guest_spec_ctrl, %edx + mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %esi + xor %edx, %esi + or %esi, %ecx + je \label +#endif + mov $MSR_IA32_SPEC_CTRL, %ecx + wrmsr +.endm + +.macro RESTORE_HOST_SPEC_CTRL_BODY guest_spec_ctrl:req, enter_flags:req, label:req + /* Same for after vmexit. */ + mov $MSR_IA32_SPEC_CTRL, %ecx + + /* + * Load the value that the guest had written into MSR_IA32_SPEC_CTRL, + * if it was not intercepted during guest execution. + */ + testl $KVM_ENTER_SAVE_SPEC_CTRL, \enter_flags + jz 998f + rdmsr + movl %eax, \guest_spec_ctrl + movl %edx, 4 + \guest_spec_ctrl +998: + /* Now restore the host value of the MSR if different from the guest's. */ +#ifdef CONFIG_X86_64 + mov PER_CPU_VAR(x86_spec_ctrl_current), %rdx + cmp \guest_spec_ctrl, %rdx + /* + * For legacy IBRS, the IBRS bit always needs to be written after + * transitioning from a less privileged predictor mode, regardless of + * whether the guest/host values differ. + */ + ALTERNATIVE __stringify(je \label), "", X86_FEATURE_KERNEL_IBRS + movl %edx, %eax + shr $32, %rdx +#else + mov PER_CPU_VAR(x86_spec_ctrl_current), %eax + mov \guest_spec_ctrl, %esi + xor %eax, %esi + mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edx + mov 4 + \guest_spec_ctrl, %edi + xor %edx, %edi + or %edi, %esi + ALTERNATIVE __stringify(je \label), "", X86_FEATURE_KERNEL_IBRS +#endif + wrmsr +.endm + +#endif /* __ASSEMBLER__ */ +#endif /* __KVM_X86_VMENTER_H */ diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 31568274d8bb0..810119167f798 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -16,6 +16,7 @@ extern bool __read_mostly enable_unrestricted_guest; extern bool __read_mostly enable_ept_ad_bits; extern bool __read_mostly enable_cet; extern bool __read_mostly enable_pml; +extern bool __read_mostly enable_mbec; extern int __read_mostly pt_mode; #define PT_MODE_SYSTEM 0 @@ -301,11 +302,6 @@ static inline bool cpu_has_vmx_flexpriority(void) cpu_has_vmx_virtualize_apic_accesses(); } -static inline bool cpu_has_vmx_ept_execute_only(void) -{ - return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; -} - static inline bool cpu_has_vmx_ept_4levels(void) { return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; @@ -412,4 +408,10 @@ static inline bool cpu_has_notify_vmexit(void) SECONDARY_EXEC_NOTIFY_VM_EXITING; } +static inline bool cpu_has_ept_mbec(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_MODE_BASED_EPT_EXEC; +} + #endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h index 412d0829d7a21..08005676702c2 100644 --- a/arch/x86/kvm/vmx/common.h +++ b/arch/x86/kvm/vmx/common.h @@ -85,22 +85,30 @@ static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa, { u64 error_code; - /* Is it a read fault? */ - error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) - ? PFERR_USER_MASK : 0; /* Is it a write fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) + error_code = (exit_qualification & EPT_VIOLATION_ACC_WRITE) ? PFERR_WRITE_MASK : 0; /* Is it a fetch fault? */ error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) ? PFERR_FETCH_MASK : 0; - /* ept page table entry is present? */ - error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK) + /* ept page table entry is present? */ + error_code |= (exit_qualification & + (EPT_VIOLATION_PROT_MASK & ~EPT_VIOLATION_PROT_USER_EXEC)) ? PFERR_PRESENT_MASK : 0; - if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID) - error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? - PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; + if (mmu_has_mbec(vcpu->arch.mmu)) + error_code |= (exit_qualification & EPT_VIOLATION_PROT_USER_EXEC) + ? PFERR_PRESENT_MASK : 0; + + if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID) { + if (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) { + error_code |= PFERR_GUEST_FINAL_MASK; + if (exit_qualification & EPT_VIOLATION_GVA_USER) + error_code |= PFERR_USER_MASK; + } else { + error_code |= PFERR_GUEST_PAGE_MASK; + } + } if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) error_code |= PFERR_PRIVATE_ACCESS; diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.h b/arch/x86/kvm/vmx/hyperv_evmcs.h index fc7c4e7bd1bfb..bc08fe40590e9 100644 --- a/arch/x86/kvm/vmx/hyperv_evmcs.h +++ b/arch/x86/kvm/vmx/hyperv_evmcs.h @@ -87,6 +87,7 @@ SECONDARY_EXEC_PT_CONCEAL_VMX | \ SECONDARY_EXEC_BUS_LOCK_DETECTION | \ SECONDARY_EXEC_NOTIFY_VM_EXITING | \ + SECONDARY_EXEC_MODE_BASED_EPT_EXEC | \ SECONDARY_EXEC_ENCLS_EXITING) #define EVMCS1_SUPPORTED_3RDEXEC (0ULL) diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index dbebddf648be7..83d9921277eac 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -755,6 +755,14 @@ static int vt_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) return vmx_set_identity_map_addr(kvm, ident_addr); } +static bool vt_tdp_has_smep(struct kvm *kvm) +{ + if (is_td(kvm)) + return false; + + return vmx_tdp_has_smep(kvm); +} + static u64 vt_get_l2_tsc_offset(struct kvm_vcpu *vcpu) { /* TDX doesn't support L2 guest at the moment. */ @@ -966,6 +974,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .set_tss_addr = vt_op(set_tss_addr), .set_identity_map_addr = vt_op(set_identity_map_addr), .get_mt_mask = vmx_get_mt_mask, + .tdp_has_smep = vt_op(tdp_has_smep), .get_exit_info = vt_op(get_exit_info), .get_entry_info = vt_op(get_entry_info), diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 3fe88f29be7a9..4690a4d23709d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -310,13 +310,13 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) vmx_sync_vmcs_host_state(vmx, prev); put_cpu(); - vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET; + kvm_clear_available_registers(vcpu, VMX_REGS_LAZY_LOAD_SET); /* * All lazily updated registers will be reloaded from VMCS12 on both * vmentry and vmexit. */ - vcpu->arch.regs_dirty = 0; + kvm_reset_dirty_registers(vcpu); } static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu) @@ -443,10 +443,14 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; exit_qualification = 0; } else { + u64 mask = EPT_VIOLATION_GVA_IS_VALID | + EPT_VIOLATION_GVA_TRANSLATED; + if (vmx->nested.msrs.ept_caps & VMX_EPT_ADVANCED_VMEXIT_INFO_BIT) + mask |= EPT_VIOLATION_GVA_USER | + EPT_VIOLATION_GVA_WRITABLE | + EPT_VIOLATION_GVA_NX; exit_qualification = fault->exit_qualification; - exit_qualification |= vmx_get_exit_qual(vcpu) & - (EPT_VIOLATION_GVA_IS_VALID | - EPT_VIOLATION_GVA_TRANSLATED); + exit_qualification |= vmx_get_exit_qual(vcpu) & mask; vm_exit_reason = EXIT_REASON_EPT_VIOLATION; } @@ -465,6 +469,13 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, vmcs12->guest_physical_address = fault->address; } +static inline bool nested_ept_mbec_enabled(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC); +} + static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -473,6 +484,7 @@ static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level, nested_ept_ad_enabled(vcpu), + nested_ept_mbec_enabled(vcpu), nested_ept_get_eptp(vcpu)); } @@ -1189,7 +1201,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, } vcpu->arch.cr3 = cr3; - kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); + kvm_register_mark_dirty(vcpu, VCPU_REG_CR3); /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ kvm_init_mmu(vcpu); @@ -2440,6 +2452,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_ENABLE_VMFUNC | + SECONDARY_EXEC_MODE_BASED_EPT_EXEC | SECONDARY_EXEC_DESC); if (nested_cpu_has(vmcs12, @@ -4972,7 +4985,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) nested_ept_uninit_mmu_context(vcpu); vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + kvm_register_mark_available(vcpu, VCPU_REG_CR3); /* * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs @@ -5074,7 +5087,7 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, kvm_service_local_tlb_flush_requests(vcpu); /* - * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between + * VCPU_REG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are * up-to-date before switching to L1. */ @@ -7239,7 +7252,8 @@ static void nested_vmx_setup_secondary_ctls(u32 ept_caps, VMX_EPT_PAGE_WALK_5_BIT | VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT | - VMX_EPT_EXECUTE_ONLY_BIT; + VMX_EPT_EXECUTE_ONLY_BIT | + VMX_EPT_ADVANCED_VMEXIT_INFO_BIT; msrs->ept_caps &= ept_caps; msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | @@ -7251,6 +7265,9 @@ static void nested_vmx_setup_secondary_ctls(u32 ept_caps, msrs->ept_caps |= VMX_EPT_AD_BIT; } + if (enable_mbec) + msrs->secondary_ctls_high |= + SECONDARY_EXEC_MODE_BASED_EPT_EXEC; /* * Advertise EPTP switching irrespective of hardware support, * KVM emulates it in software so long as VMFUNC is supported. @@ -7438,8 +7455,29 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) return 0; } +static gpa_t vmx_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, + u64 access, + struct x86_exception *exception, + u64 pte_access) +{ + struct kvm_mmu *mmu = vcpu->arch.mmu; + + BUG_ON(!mmu_is_nested(vcpu)); + + /* + * MBEC differentiates based on the effective U/S bit of + * the guest page tables; not the processor CPL. + */ + access &= ~PFERR_USER_MASK; + if ((pte_access & ACC_USER_MASK) && (access & PFERR_GUEST_FINAL_MASK)) + access |= PFERR_USER_MASK; + + return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception); +} + struct kvm_x86_nested_ops vmx_nested_ops = { .leave_nested = vmx_leave_nested, + .translate_nested_gpa = vmx_translate_nested_gpa, .is_exception_vmexit = nested_vmx_is_exception_vmexit, .check_events = vmx_check_nested_events, .has_events = vmx_has_nested_events, diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h deleted file mode 100644 index 6a87a12135fb5..0000000000000 --- a/arch/x86/kvm/vmx/run_flags.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __KVM_X86_VMX_RUN_FLAGS_H -#define __KVM_X86_VMX_RUN_FLAGS_H - -#define VMX_RUN_VMRESUME BIT(0) -#define VMX_RUN_SAVE_SPEC_CTRL BIT(1) -#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(2) - -#endif /* __KVM_X86_VMX_RUN_FLAGS_H */ diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index ed12805bbb444..b187ef9a6ae46 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1003,23 +1003,23 @@ static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) return EXIT_FASTPATH_NONE; } -#define TDX_REGS_AVAIL_SET (BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \ - BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \ - BIT_ULL(VCPU_REGS_RAX) | \ - BIT_ULL(VCPU_REGS_RBX) | \ - BIT_ULL(VCPU_REGS_RCX) | \ - BIT_ULL(VCPU_REGS_RDX) | \ - BIT_ULL(VCPU_REGS_RBP) | \ - BIT_ULL(VCPU_REGS_RSI) | \ - BIT_ULL(VCPU_REGS_RDI) | \ - BIT_ULL(VCPU_REGS_R8) | \ - BIT_ULL(VCPU_REGS_R9) | \ - BIT_ULL(VCPU_REGS_R10) | \ - BIT_ULL(VCPU_REGS_R11) | \ - BIT_ULL(VCPU_REGS_R12) | \ - BIT_ULL(VCPU_REGS_R13) | \ - BIT_ULL(VCPU_REGS_R14) | \ - BIT_ULL(VCPU_REGS_R15)) +#define TDX_REGS_AVAIL_SET (BIT(VCPU_REG_EXIT_INFO_1) | \ + BIT(VCPU_REG_EXIT_INFO_2) | \ + BIT(VCPU_REGS_RAX) | \ + BIT(VCPU_REGS_RBX) | \ + BIT(VCPU_REGS_RCX) | \ + BIT(VCPU_REGS_RDX) | \ + BIT(VCPU_REGS_RBP) | \ + BIT(VCPU_REGS_RSI) | \ + BIT(VCPU_REGS_RDI) | \ + BIT(VCPU_REGS_R8) | \ + BIT(VCPU_REGS_R9) | \ + BIT(VCPU_REGS_R10) | \ + BIT(VCPU_REGS_R11) | \ + BIT(VCPU_REGS_R12) | \ + BIT(VCPU_REGS_R13) | \ + BIT(VCPU_REGS_R14) | \ + BIT(VCPU_REGS_R15)) static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu) { @@ -1088,7 +1088,7 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) tdx_load_host_xsave_state(vcpu); - vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET; + kvm_clear_available_registers(vcpu, ~TDX_REGS_AVAIL_SET); if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) return EXIT_FASTPATH_NONE; @@ -1835,7 +1835,7 @@ static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcp if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION) return false; - return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN); + return !(eq & EPT_VIOLATION_PROT_MASK); } static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index ff1f254a0ef4e..196a1f481881c 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -7,28 +7,28 @@ #include <asm/percpu.h> #include <asm/segment.h> #include "kvm-asm-offsets.h" -#include "run_flags.h" +#include "vmenter.h" #define WORD_SIZE (BITS_PER_LONG / 8) -#define VCPU_RAX __VCPU_REGS_RAX * WORD_SIZE -#define VCPU_RCX __VCPU_REGS_RCX * WORD_SIZE -#define VCPU_RDX __VCPU_REGS_RDX * WORD_SIZE -#define VCPU_RBX __VCPU_REGS_RBX * WORD_SIZE +#define VCPU_RAX (VMX_vcpu_arch_regs + __VCPU_REGS_RAX * WORD_SIZE) +#define VCPU_RCX (VMX_vcpu_arch_regs + __VCPU_REGS_RCX * WORD_SIZE) +#define VCPU_RDX (VMX_vcpu_arch_regs + __VCPU_REGS_RDX * WORD_SIZE) +#define VCPU_RBX (VMX_vcpu_arch_regs + __VCPU_REGS_RBX * WORD_SIZE) /* Intentionally omit RSP as it's context switched by hardware */ -#define VCPU_RBP __VCPU_REGS_RBP * WORD_SIZE -#define VCPU_RSI __VCPU_REGS_RSI * WORD_SIZE -#define VCPU_RDI __VCPU_REGS_RDI * WORD_SIZE +#define VCPU_RBP (VMX_vcpu_arch_regs + __VCPU_REGS_RBP * WORD_SIZE) +#define VCPU_RSI (VMX_vcpu_arch_regs + __VCPU_REGS_RSI * WORD_SIZE) +#define VCPU_RDI (VMX_vcpu_arch_regs + __VCPU_REGS_RDI * WORD_SIZE) #ifdef CONFIG_X86_64 -#define VCPU_R8 __VCPU_REGS_R8 * WORD_SIZE -#define VCPU_R9 __VCPU_REGS_R9 * WORD_SIZE -#define VCPU_R10 __VCPU_REGS_R10 * WORD_SIZE -#define VCPU_R11 __VCPU_REGS_R11 * WORD_SIZE -#define VCPU_R12 __VCPU_REGS_R12 * WORD_SIZE -#define VCPU_R13 __VCPU_REGS_R13 * WORD_SIZE -#define VCPU_R14 __VCPU_REGS_R14 * WORD_SIZE -#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE +#define VCPU_R8 (VMX_vcpu_arch_regs + __VCPU_REGS_R8 * WORD_SIZE) +#define VCPU_R9 (VMX_vcpu_arch_regs + __VCPU_REGS_R9 * WORD_SIZE) +#define VCPU_R10 (VMX_vcpu_arch_regs + __VCPU_REGS_R10 * WORD_SIZE) +#define VCPU_R11 (VMX_vcpu_arch_regs + __VCPU_REGS_R11 * WORD_SIZE) +#define VCPU_R12 (VMX_vcpu_arch_regs + __VCPU_REGS_R12 * WORD_SIZE) +#define VCPU_R13 (VMX_vcpu_arch_regs + __VCPU_REGS_R13 * WORD_SIZE) +#define VCPU_R14 (VMX_vcpu_arch_regs + __VCPU_REGS_R14 * WORD_SIZE) +#define VCPU_R15 (VMX_vcpu_arch_regs + __VCPU_REGS_R15 * WORD_SIZE) #endif .section .noinstr.text, "ax" @@ -36,10 +36,9 @@ /** * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode * @vmx: struct vcpu_vmx * - * @regs: unsigned long * (to guest registers) - * @flags: VMX_RUN_VMRESUME: use VMRESUME instead of VMLAUNCH - * VMX_RUN_SAVE_SPEC_CTRL: save guest SPEC_CTRL into vmx->spec_ctrl - * VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO: vCPU can access host MMIO + * @flags: KVM_ENTER_VMRESUME: use VMRESUME instead of VMLAUNCH + * KVM_ENTER_SAVE_SPEC_CTRL: save guest SPEC_CTRL into vmx->spec_ctrl + * KVM_ENTER_CLEAR_CPU_BUFFERS_FOR_MMIO: vCPU can access host MMIO * * Returns: * 0 on VM-Exit, 1 on VM-Fail @@ -62,76 +61,46 @@ SYM_FUNC_START(__vmx_vcpu_run) push %_ASM_ARG1 /* Save @flags (used for VMLAUNCH vs. VMRESUME and mitigations). */ - push %_ASM_ARG3 - - /* - * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and - * @regs is needed after VM-Exit to save the guest's register values. - */ push %_ASM_ARG2 lea (%_ASM_SP), %_ASM_ARG2 call vmx_update_host_rsp - ALTERNATIVE "jmp .Lspec_ctrl_done", "", X86_FEATURE_MSR_SPEC_CTRL + /* Reload @vmx, _ASM_ARG1 may be modified by vmx_update_host_rsp(). */ + mov WORD_SIZE(%_ASM_SP), %_ASM_DI /* - * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the - * host's, write the MSR. - * - * IMPORTANT: To avoid RSB underflow attacks and any other nastiness, - * there must not be any returns or indirect branches between this code - * and vmentry. + * Unlike AMD there's no V_SPEC_CTRL here, so do not leave the body + * out of line. Clobbers RAX, RCX, RDX, RSI. */ - mov 2*WORD_SIZE(%_ASM_SP), %_ASM_DI -#ifdef CONFIG_X86_64 - mov VMX_spec_ctrl(%rdi), %rdx - cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx - je .Lspec_ctrl_done - movl %edx, %eax - shr $32, %rdx -#else - mov VMX_spec_ctrl(%edi), %eax - mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx - xor %eax, %ecx - mov VMX_spec_ctrl + 4(%edi), %edx - mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edi - xor %edx, %edi - or %edi, %ecx - je .Lspec_ctrl_done -#endif - mov $MSR_IA32_SPEC_CTRL, %ecx - wrmsr - -.Lspec_ctrl_done: + ALTERNATIVE "jmp .Lspec_ctrl_guest_done", "", X86_FEATURE_MSR_SPEC_CTRL + RESTORE_GUEST_SPEC_CTRL_BODY VMX_spec_ctrl(%_ASM_DI), .Lspec_ctrl_guest_done +.Lspec_ctrl_guest_done: /* * Since vmentry is serializing on affected CPUs, there's no need for * an LFENCE to stop speculation from skipping the wrmsr. */ - /* Load @regs to RAX. */ - mov (%_ASM_SP), %_ASM_AX - /* Load guest registers. Don't clobber flags. */ - mov VCPU_RCX(%_ASM_AX), %_ASM_CX - mov VCPU_RDX(%_ASM_AX), %_ASM_DX - mov VCPU_RBX(%_ASM_AX), %_ASM_BX - mov VCPU_RBP(%_ASM_AX), %_ASM_BP - mov VCPU_RSI(%_ASM_AX), %_ASM_SI - mov VCPU_RDI(%_ASM_AX), %_ASM_DI + mov VCPU_RAX(%_ASM_DI), %_ASM_AX + mov VCPU_RCX(%_ASM_DI), %_ASM_CX + mov VCPU_RDX(%_ASM_DI), %_ASM_DX + mov VCPU_RBX(%_ASM_DI), %_ASM_BX + mov VCPU_RBP(%_ASM_DI), %_ASM_BP + mov VCPU_RSI(%_ASM_DI), %_ASM_SI #ifdef CONFIG_X86_64 - mov VCPU_R8 (%_ASM_AX), %r8 - mov VCPU_R9 (%_ASM_AX), %r9 - mov VCPU_R10(%_ASM_AX), %r10 - mov VCPU_R11(%_ASM_AX), %r11 - mov VCPU_R12(%_ASM_AX), %r12 - mov VCPU_R13(%_ASM_AX), %r13 - mov VCPU_R14(%_ASM_AX), %r14 - mov VCPU_R15(%_ASM_AX), %r15 + mov VCPU_R8 (%_ASM_DI), %r8 + mov VCPU_R9 (%_ASM_DI), %r9 + mov VCPU_R10(%_ASM_DI), %r10 + mov VCPU_R11(%_ASM_DI), %r11 + mov VCPU_R12(%_ASM_DI), %r12 + mov VCPU_R13(%_ASM_DI), %r13 + mov VCPU_R14(%_ASM_DI), %r14 + mov VCPU_R15(%_ASM_DI), %r15 #endif - /* Load guest RAX. This kills the @regs pointer! */ - mov VCPU_RAX(%_ASM_AX), %_ASM_AX + /* Load guest RDI. This kills the @vmx pointer! */ + mov VCPU_RDI(%_ASM_DI), %_ASM_DI /* * Note, ALTERNATIVE_2 works in reverse order. If CLEAR_CPU_BUF_VM is @@ -140,7 +109,7 @@ SYM_FUNC_START(__vmx_vcpu_run) * do VERW. Else, do nothing (no mitigations needed/enabled). */ ALTERNATIVE_2 "", \ - __stringify(testl $VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO, WORD_SIZE(%_ASM_SP); \ + __stringify(testl $KVM_ENTER_CLEAR_CPU_BUFFERS_FOR_MMIO, (%_ASM_SP); \ jz .Lskip_mmio_verw; \ VERW; \ .Lskip_mmio_verw:), \ @@ -148,7 +117,7 @@ SYM_FUNC_START(__vmx_vcpu_run) __stringify(VERW), X86_FEATURE_CLEAR_CPU_BUF_VM /* Check @flags to see if VMLAUNCH or VMRESUME is needed. */ - testl $VMX_RUN_VMRESUME, WORD_SIZE(%_ASM_SP) + testl $KVM_ENTER_VMRESUME, (%_ASM_SP) jz .Lvmlaunch /* @@ -180,38 +149,35 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL) UNWIND_HINT_RESTORE ENDBR - /* Temporarily save guest's RAX. */ - push %_ASM_AX + /* Temporarily save guest's RDI. */ + push %_ASM_DI - /* Reload @regs to RAX. */ - mov WORD_SIZE(%_ASM_SP), %_ASM_AX + /* Reload @vmx to RDI. */ + mov 2*WORD_SIZE(%_ASM_SP), %_ASM_DI - /* Save all guest registers, including RAX from the stack */ - pop VCPU_RAX(%_ASM_AX) - mov %_ASM_CX, VCPU_RCX(%_ASM_AX) - mov %_ASM_DX, VCPU_RDX(%_ASM_AX) - mov %_ASM_BX, VCPU_RBX(%_ASM_AX) - mov %_ASM_BP, VCPU_RBP(%_ASM_AX) - mov %_ASM_SI, VCPU_RSI(%_ASM_AX) - mov %_ASM_DI, VCPU_RDI(%_ASM_AX) + /* Save all guest registers, including RDI from the stack */ + mov %_ASM_AX, VCPU_RAX(%_ASM_DI) + mov %_ASM_CX, VCPU_RCX(%_ASM_DI) + mov %_ASM_DX, VCPU_RDX(%_ASM_DI) + mov %_ASM_BX, VCPU_RBX(%_ASM_DI) + mov %_ASM_BP, VCPU_RBP(%_ASM_DI) + mov %_ASM_SI, VCPU_RSI(%_ASM_DI) + pop VCPU_RDI(%_ASM_DI) #ifdef CONFIG_X86_64 - mov %r8, VCPU_R8 (%_ASM_AX) - mov %r9, VCPU_R9 (%_ASM_AX) - mov %r10, VCPU_R10(%_ASM_AX) - mov %r11, VCPU_R11(%_ASM_AX) - mov %r12, VCPU_R12(%_ASM_AX) - mov %r13, VCPU_R13(%_ASM_AX) - mov %r14, VCPU_R14(%_ASM_AX) - mov %r15, VCPU_R15(%_ASM_AX) + mov %r8, VCPU_R8 (%_ASM_DI) + mov %r9, VCPU_R9 (%_ASM_DI) + mov %r10, VCPU_R10(%_ASM_DI) + mov %r11, VCPU_R11(%_ASM_DI) + mov %r12, VCPU_R12(%_ASM_DI) + mov %r13, VCPU_R13(%_ASM_DI) + mov %r14, VCPU_R14(%_ASM_DI) + mov %r15, VCPU_R15(%_ASM_DI) #endif /* Clear return value to indicate VM-Exit (as opposed to VM-Fail). */ xor %ebx, %ebx .Lclear_regs: - /* Discard @regs. The register is irrelevant, it just can't be RBX. */ - pop %_ASM_AX - /* * Clear all general purpose registers except RSP and RBX to prevent * speculative use of the guest's values, even those that are reloaded @@ -254,16 +220,32 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL) FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT,\ X86_FEATURE_RSB_VMEXIT_LITE - pop %_ASM_ARG2 /* @flags */ - pop %_ASM_ARG1 /* @vmx */ + /* Clobbers RAX, RCX, RDX, RSI. */ + ALTERNATIVE "jmp .Lspec_ctrl_host_done", "", X86_FEATURE_MSR_SPEC_CTRL + mov WORD_SIZE(%_ASM_SP), %_ASM_DI + RESTORE_HOST_SPEC_CTRL_BODY VMX_spec_ctrl(%_ASM_DI), (%_ASM_SP), .Lspec_ctrl_host_done +.Lspec_ctrl_host_done: - call vmx_spec_ctrl_restore_host + /* + * Halt speculation past a conditional wrmsr. Intel's eIBRS + * guarantees that the guest cannot control the RSB "once IBRS is + * set", but in the eIBRS case speculative execution past the 'je' + * can go all the way to the RET below while MSR_IA32_SPEC_CTRL + * still holds the guest value. + */ + ALTERNATIVE_2 "", "lfence", X86_FEATURE_MSR_SPEC_CTRL, \ + "", X86_FEATURE_KERNEL_IBRS CLEAR_BRANCH_HISTORY_VMEXIT /* Put return value in AX */ mov %_ASM_BX, %_ASM_AX + /* Pop our saved arguments from the stack */ + pop %_ASM_BX + pop %_ASM_BX + + /* ... and then the callee-save registers */ pop %_ASM_BX #ifdef CONFIG_X86_64 pop %r12 diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ede773ce065a7..e5cfe4d12c479 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -74,6 +74,7 @@ #include "x86_ops.h" #include "smm.h" #include "vmx_onhyperv.h" +#include "vmenter.h" #include "posted_intr.h" #include "mmu/spte.h" @@ -118,6 +119,9 @@ module_param(emulate_invalid_guest_state, bool, 0444); static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, 0444); +bool __read_mostly enable_mbec = 1; +module_param_named(mbec, enable_mbec, bool, 0444); + module_param(enable_apicv, bool, 0444); module_param(enable_ipiv, bool, 0444); @@ -847,8 +851,8 @@ static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, bool ret; u32 mask = 1 << (seg * SEG_FIELD_NR + field); - if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) { - kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS); + if (!kvm_register_is_available(&vmx->vcpu, VCPU_REG_SEGMENTS)) { + kvm_register_mark_available(&vmx->vcpu, VCPU_REG_SEGMENTS); vmx->segment_cache.bitmask = 0; } ret = vmx->segment_cache.bitmask & mask; @@ -968,12 +972,12 @@ static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr); } -unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) +unsigned int __vmx_vcpu_enter_flags(struct vcpu_vmx *vmx) { unsigned int flags = 0; if (vmx->loaded_vmcs->launched) - flags |= VMX_RUN_VMRESUME; + flags |= KVM_ENTER_VMRESUME; /* * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free @@ -981,11 +985,11 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) * it after vmexit and store it in vmx->spec_ctrl. */ if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) - flags |= VMX_RUN_SAVE_SPEC_CTRL; + flags |= KVM_ENTER_SAVE_SPEC_CTRL; if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) && kvm_vcpu_can_access_host_mmio(&vmx->vcpu)) - flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO; + flags |= KVM_ENTER_CLEAR_CPU_BUFFERS_FOR_MMIO; return flags; } @@ -1613,8 +1617,8 @@ unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long rflags, save_rflags; - if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) { - kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); + if (!kvm_register_is_available(vcpu, VCPU_REG_RFLAGS)) { + kvm_register_mark_available(vcpu, VCPU_REG_RFLAGS); rflags = vmcs_readl(GUEST_RFLAGS); if (vmx->rmode.vm86_active) { rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; @@ -1637,7 +1641,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) * if L1 runs L2 as a restricted guest. */ if (is_unrestricted_guest(vcpu)) { - kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); + kvm_register_mark_available(vcpu, VCPU_REG_RFLAGS); vmx->rflags = rflags; vmcs_writel(GUEST_RFLAGS, rflags); return; @@ -2608,20 +2612,20 @@ void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) case VCPU_REGS_RSP: vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); break; - case VCPU_REGS_RIP: - vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); + case VCPU_REG_RIP: + vcpu->arch.rip = vmcs_readl(GUEST_RIP); break; - case VCPU_EXREG_PDPTR: + case VCPU_REG_PDPTR: if (enable_ept) ept_save_pdptrs(vcpu); break; - case VCPU_EXREG_CR0: + case VCPU_REG_CR0: guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; vcpu->arch.cr0 &= ~guest_owned_bits; vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; break; - case VCPU_EXREG_CR3: + case VCPU_REG_CR3: /* * When intercepting CR3 loads, e.g. for shadowing paging, KVM's * CR3 is loaded into hardware, not the guest's CR3. @@ -2629,7 +2633,7 @@ void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); break; - case VCPU_EXREG_CR4: + case VCPU_REG_CR4: guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; vcpu->arch.cr4 &= ~guest_owned_bits; @@ -2777,6 +2781,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, return -EIO; vmx_cap->ept = 0; + _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC; _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; } if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && @@ -2790,6 +2795,16 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, vmx_cap->vpid = 0; } + /* + * Virtualizing MBEC requires advanced vmexit information in order to + * distinguish supervisor and user accesses. For simplicity and clarity + * disable MBEC entirely if advanced vmexit information is not available, + * this way mbec=1 in the kvm_intel module parameters implies availability + * to nested guests as well. + */ + if (!(vmx_cap->ept & VMX_EPT_ADVANCED_VMEXIT_INFO_BIT)) + _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC; + if (!cpu_has_sgx()) _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING; @@ -3354,7 +3369,7 @@ void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR)) + if (!kvm_register_is_dirty(vcpu, VCPU_REG_PDPTR)) return; if (is_pae_paging(vcpu)) { @@ -3377,7 +3392,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); - kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR); + kvm_register_mark_available(vcpu, VCPU_REG_PDPTR); } #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ @@ -3420,7 +3435,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); + kvm_register_mark_available(vcpu, VCPU_REG_CR0); #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME) { @@ -3438,8 +3453,8 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) * (correctly) stop reading vmcs.GUEST_CR3 because it thinks * KVM's CR3 is installed. */ - if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) - vmx_cache_reg(vcpu, VCPU_EXREG_CR3); + if (!kvm_register_is_available(vcpu, VCPU_REG_CR3)) + vmx_cache_reg(vcpu, VCPU_REG_CR3); /* * When running with EPT but not unrestricted guest, KVM must @@ -3476,7 +3491,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG. */ if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG)) - kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); + kvm_register_mark_dirty(vcpu, VCPU_REG_CR3); } /* depends on vcpu->arch.cr0 to be set to a new value */ @@ -3505,7 +3520,7 @@ void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) if (!enable_unrestricted_guest && !is_paging(vcpu)) guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; - else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3)) + else if (kvm_register_is_dirty(vcpu, VCPU_REG_CR3)) guest_cr3 = vcpu->arch.cr3; else /* vmcs.GUEST_CR3 is already up-to-date. */ update_guest_cr3 = false; @@ -3565,7 +3580,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) } vcpu->arch.cr4 = cr4; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); + kvm_register_mark_available(vcpu, VCPU_REG_CR4); if (!enable_unrestricted_guest) { if (enable_ept) { @@ -4746,6 +4761,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) */ exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC; + if (!enable_mbec) + exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC; + /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, * in vmx_set_cr4. */ exec_control &= ~SECONDARY_EXEC_DESC; @@ -5032,7 +5050,7 @@ void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); vmx_segment_cache_clear(vmx); - kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); + kvm_register_mark_available(vcpu, VCPU_REG_SEGMENTS); vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); @@ -7410,31 +7428,6 @@ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } -void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, - unsigned int flags) -{ - u64 hostval = this_cpu_read(x86_spec_ctrl_current); - - if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) - return; - - if (flags & VMX_RUN_SAVE_SPEC_CTRL) - vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL); - - /* - * If the guest/host SPEC_CTRL values differ, restore the host value. - * - * For legacy IBRS, the IBRS bit always needs to be written after - * transitioning from a less privileged predictor mode, regardless of - * whether the guest/host values differ. - */ - if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || - vmx->spec_ctrl != hostval) - native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval); - - barrier_nospec(); -} - static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, bool force_immediate_exit) { @@ -7488,11 +7481,10 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, if (vcpu->arch.cr2 != native_read_cr2()) native_write_cr2(vcpu->arch.cr2); - vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, - flags); + vmx->fail = __vmx_vcpu_run(vmx, flags); vcpu->arch.cr2 = native_read_cr2(); - vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET; + kvm_clear_available_registers(vcpu, VMX_REGS_LAZY_LOAD_SET); vmx->idt_vectoring_info = 0; @@ -7534,9 +7526,9 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE; vmx->vt.exit_reason.failed_vmentry = 1; - kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); + kvm_register_mark_available(vcpu, VCPU_REG_EXIT_INFO_1); vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT; - kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); + kvm_register_mark_available(vcpu, VCPU_REG_EXIT_INFO_2); vmx->vt.exit_intr_info = 0; return EXIT_FASTPATH_NONE; } @@ -7556,9 +7548,9 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP)) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); - if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) - vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); - vcpu->arch.regs_dirty = 0; + if (kvm_register_is_dirty(vcpu, VCPU_REG_RIP)) + vmcs_writel(GUEST_RIP, vcpu->arch.rip); + kvm_reset_dirty_registers(vcpu); if (run_flags & KVM_RUN_LOAD_GUEST_DR6) set_debugreg(vcpu->arch.dr6, 6); @@ -7607,7 +7599,7 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) kvm_wait_lapic_expire(vcpu); /* The actual VMENTER/EXIT is in the .noinstr.text section. */ - vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx)); + vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_enter_flags(vmx)); /* All fields are clean at this point */ if (kvm_is_using_evmcs()) { @@ -8645,6 +8637,8 @@ __init int vmx_hardware_setup(void) if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) enable_ept_ad_bits = 0; + if (!cpu_has_ept_mbec() || !enable_ept) + enable_mbec = 0; if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) enable_unrestricted_guest = 0; @@ -8706,8 +8700,7 @@ __init int vmx_hardware_setup(void) set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ if (enable_ept) - kvm_mmu_set_ept_masks(enable_ept_ad_bits, - cpu_has_vmx_ept_execute_only()); + kvm_mmu_set_ept_masks(enable_ept_ad_bits); else vt_x86_ops.get_mt_mask = NULL; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index db84e8001da58..daedf663c0a9c 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -15,7 +15,6 @@ #include "vmcs.h" #include "vmx_ops.h" #include "../cpuid.h" -#include "run_flags.h" #include "../mmu.h" #include "common.h" @@ -317,7 +316,7 @@ static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) { struct vcpu_vt *vt = to_vt(vcpu); - if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1) && + if (!kvm_register_test_and_mark_available(vcpu, VCPU_REG_EXIT_INFO_1) && !WARN_ON_ONCE(is_td_vcpu(vcpu))) vt->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -328,7 +327,7 @@ static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) { struct vcpu_vt *vt = to_vt(vcpu); - if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2) && + if (!kvm_register_test_and_mark_available(vcpu, VCPU_REG_EXIT_INFO_2) && !WARN_ON_ONCE(is_td_vcpu(vcpu))) vt->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); @@ -368,10 +367,8 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr); void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu); void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); -void vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, unsigned int flags); -unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx); -bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, - unsigned int flags); +unsigned int __vmx_vcpu_enter_flags(struct vcpu_vmx *vmx); +bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned int flags); void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set); @@ -567,6 +564,7 @@ static inline u8 vmx_get_rvi(void) SECONDARY_EXEC_ENABLE_VMFUNC | \ SECONDARY_EXEC_BUS_LOCK_DETECTION | \ SECONDARY_EXEC_NOTIFY_VM_EXITING | \ + SECONDARY_EXEC_MODE_BASED_EPT_EXEC | \ SECONDARY_EXEC_ENCLS_EXITING | \ SECONDARY_EXEC_EPT_VIOLATION_VE) @@ -620,16 +618,16 @@ BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64) * cache on demand. Other registers not listed here are synced to * the cache immediately after VM-Exit. */ -#define VMX_REGS_LAZY_LOAD_SET ((1 << VCPU_REGS_RIP) | \ - (1 << VCPU_REGS_RSP) | \ - (1 << VCPU_EXREG_RFLAGS) | \ - (1 << VCPU_EXREG_PDPTR) | \ - (1 << VCPU_EXREG_SEGMENTS) | \ - (1 << VCPU_EXREG_CR0) | \ - (1 << VCPU_EXREG_CR3) | \ - (1 << VCPU_EXREG_CR4) | \ - (1 << VCPU_EXREG_EXIT_INFO_1) | \ - (1 << VCPU_EXREG_EXIT_INFO_2)) +#define VMX_REGS_LAZY_LOAD_SET (BIT(VCPU_REGS_RSP) | \ + BIT(VCPU_REG_RIP) | \ + BIT(VCPU_REG_RFLAGS) | \ + BIT(VCPU_REG_PDPTR) | \ + BIT(VCPU_REG_SEGMENTS) | \ + BIT(VCPU_REG_CR0) | \ + BIT(VCPU_REG_CR3) | \ + BIT(VCPU_REG_CR4) | \ + BIT(VCPU_REG_EXIT_INFO_1) | \ + BIT(VCPU_REG_EXIT_INFO_2)) static inline unsigned long vmx_l1_guest_owned_cr0_bits(void) { diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index d09abeac2b56a..4098580742462 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -4,6 +4,7 @@ #include <linux/kvm_host.h> +#include "capabilities.h" #include "x86.h" __init int vmx_hardware_setup(void); @@ -104,6 +105,11 @@ int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr); u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); +static inline bool vmx_tdp_has_smep(struct kvm *kvm) +{ + return enable_mbec; +} + void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code); void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c1a72d749084f..0ac9d94e51776 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1072,7 +1072,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) * to an L1 GPA. */ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn), - PFERR_USER_MASK | PFERR_WRITE_MASK, NULL); + PFERR_USER_MASK | PFERR_WRITE_MASK | + PFERR_GUEST_PAGE_MASK, NULL, 0); if (real_gpa == INVALID_GPA) return 0; @@ -1090,14 +1091,14 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) } /* - * Marking VCPU_EXREG_PDPTR dirty doesn't work for !tdp_enabled. + * Marking VCPU_REG_PDPTR dirty doesn't work for !tdp_enabled. * Shadow page roots need to be reconstructed instead. */ if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs))) kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT); memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); - kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + kvm_register_mark_dirty(vcpu, VCPU_REG_PDPTR); kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); vcpu->arch.pdptrs_from_userspace = false; @@ -1478,7 +1479,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) kvm_mmu_new_pgd(vcpu, cr3); vcpu->arch.cr3 = cr3; - kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); + kvm_register_mark_dirty(vcpu, VCPU_REG_CR3); /* Do not call post_set_cr3, we do not get here for confidential guests. */ handle_tlb_flush: @@ -7847,21 +7848,6 @@ void kvm_get_segment(struct kvm_vcpu *vcpu, kvm_x86_call(get_segment)(vcpu, var, seg); } -gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access, - struct x86_exception *exception) -{ - struct kvm_mmu *mmu = vcpu->arch.mmu; - gpa_t t_gpa; - - BUG_ON(!mmu_is_nested(vcpu)); - - /* NPT walks are always user-walks */ - access |= PFERR_USER_MASK; - t_gpa = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception); - - return t_gpa; -} - gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { @@ -12473,7 +12459,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, vcpu->arch.cr2 = sregs->cr2; *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; vcpu->arch.cr3 = sregs->cr3; - kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); + kvm_register_mark_dirty(vcpu, VCPU_REG_CR3); kvm_x86_call(post_set_cr3)(vcpu, sregs->cr3); kvm_set_cr8(vcpu, sregs->cr8); @@ -12566,7 +12552,7 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) for (i = 0; i < 4 ; i++) kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]); - kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + kvm_register_mark_dirty(vcpu, VCPU_REG_PDPTR); mmu_reset_needed = 1; vcpu->arch.pdptrs_from_userspace = true; } @@ -12836,8 +12822,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) int r; vcpu->arch.last_vmentry_cpu = -1; - vcpu->arch.regs_avail = ~0; - vcpu->arch.regs_dirty = ~0; + bitmap_fill(vcpu->arch.regs_avail, NR_VCPU_TOTAL_REGS); + bitmap_fill(vcpu->arch.regs_dirty, NR_VCPU_TOTAL_REGS); kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm); @@ -13111,7 +13097,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_rip_write(vcpu, 0xfff0); vcpu->arch.cr3 = 0; - kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); + kvm_register_mark_dirty(vcpu, VCPU_REG_CR3); /* * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions @@ -14323,7 +14309,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) * the RAP (Return Address Predicator). */ if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS)) - kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS); + kvm_register_is_dirty(vcpu, VCPU_REG_ERAPS); kvm_invalidate_pcid(vcpu, operand.pcid); return kvm_skip_emulated_instruction(vcpu); @@ -14339,7 +14325,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) fallthrough; case INVPCID_TYPE_ALL_INCL_GLOBAL: /* - * Don't bother marking VCPU_EXREG_ERAPS dirty, SVM will take + * Don't bother marking VCPU_REG_ERAPS dirty, SVM will take * care of doing so when emulating the full guest TLB flush * (the RAP is cleared on all implicit TLB flushes). */ |
