aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
authorMark Brown <broonie@kernel.org>2026-05-29 22:46:46 +0100
committerMark Brown <broonie@kernel.org>2026-05-29 22:46:46 +0100
commitf08a9de96912e608e3f2d2c7c8c12fbdc6b98208 (patch)
tree603f9be5717308bc74729857e55a12913436360c /arch
parent10f1a9c9db76b998263c755761741b6101946a6e (diff)
parentb7fbe9a1bf9ee6c967ef77d366ca58c35fcf1887 (diff)
downloadlinux-next-history-f08a9de96912e608e3f2d2c7c8c12fbdc6b98208.tar.gz
Merge branch 'next' of git://git.kernel.org/pub/scm/virt/kvm/kvm.git
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/include/asm/cpufeatures.h1
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h80
-rw-r--r--arch/x86/include/asm/svm.h1
-rw-r--r--arch/x86/include/asm/vmx.h14
-rw-r--r--arch/x86/kvm/hyperv.c4
-rw-r--r--arch/x86/kvm/kvm-asm-offsets.c1
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h62
-rw-r--r--arch/x86/kvm/mmu.h30
-rw-r--r--arch/x86/kvm/mmu/mmu.c182
-rw-r--r--arch/x86/kvm/mmu/mmutrace.h19
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h73
-rw-r--r--arch/x86/kvm/mmu/spte.c92
-rw-r--r--arch/x86/kvm/mmu/spte.h70
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c6
-rw-r--r--arch/x86/kvm/svm/nested.c38
-rw-r--r--arch/x86/kvm/svm/sev.c2
-rw-r--r--arch/x86/kvm/svm/svm.c70
-rw-r--r--arch/x86/kvm/svm/svm.h7
-rw-r--r--arch/x86/kvm/svm/vmenter.S103
-rw-r--r--arch/x86/kvm/vmenter.h80
-rw-r--r--arch/x86/kvm/vmx/capabilities.h12
-rw-r--r--arch/x86/kvm/vmx/common.h26
-rw-r--r--arch/x86/kvm/vmx/hyperv_evmcs.h1
-rw-r--r--arch/x86/kvm/vmx/main.c9
-rw-r--r--arch/x86/kvm/vmx/nested.c56
-rw-r--r--arch/x86/kvm/vmx/run_flags.h9
-rw-r--r--arch/x86/kvm/vmx/tdx.c38
-rw-r--r--arch/x86/kvm/vmx/vmenter.S184
-rw-r--r--arch/x86/kvm/vmx/vmx.c113
-rw-r--r--arch/x86/kvm/vmx/vmx.h32
-rw-r--r--arch/x86/kvm/vmx/x86_ops.h6
-rw-r--r--arch/x86/kvm/x86.c38
33 files changed, 887 insertions, 573 deletions
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 7b572bc24265c..1b4a48bff18f7 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -379,6 +379,7 @@
#define X86_FEATURE_AVIC (15*32+13) /* "avic" Virtual Interrupt Controller */
#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* "v_vmsave_vmload" Virtual VMSAVE VMLOAD */
#define X86_FEATURE_VGIF (15*32+16) /* "vgif" Virtual GIF */
+#define X86_FEATURE_GMET (15*32+17) /* Guest Mode Execution Trap */
#define X86_FEATURE_X2AVIC (15*32+18) /* "x2avic" Virtual x2apic */
#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */
#define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 3776cf5382a26..e4fca997ec797 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -94,6 +94,7 @@ KVM_X86_OP_OPTIONAL(sync_pir_to_irr)
KVM_X86_OP_OPTIONAL_RET0(set_tss_addr)
KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr)
KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
+KVM_X86_OP_OPTIONAL_RET0(tdp_has_smep)
KVM_X86_OP(load_mmu_pgd)
KVM_X86_OP_OPTIONAL(link_external_spt)
KVM_X86_OP_OPTIONAL(set_external_spte)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c470e40a00aa4..8a53ca6195701 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -191,11 +191,12 @@ enum kvm_reg {
VCPU_REGS_R14 = __VCPU_REGS_R14,
VCPU_REGS_R15 = __VCPU_REGS_R15,
#endif
- VCPU_REGS_RIP,
- NR_VCPU_REGS,
+ NR_VCPU_GENERAL_PURPOSE_REGS,
- VCPU_EXREG_PDPTR = NR_VCPU_REGS,
- VCPU_EXREG_CR0,
+ VCPU_REG_RIP = NR_VCPU_GENERAL_PURPOSE_REGS,
+
+ VCPU_REG_PDPTR,
+ VCPU_REG_CR0,
/*
* Alias AMD's ERAPS (not a real register) to CR3 so that common code
* can trigger emulation of the RAP (Return Address Predictor) with
@@ -203,13 +204,15 @@ enum kvm_reg {
* is cleared on writes to CR3, i.e. marking CR3 dirty will naturally
* mark ERAPS dirty as well.
*/
- VCPU_EXREG_CR3,
- VCPU_EXREG_ERAPS = VCPU_EXREG_CR3,
- VCPU_EXREG_CR4,
- VCPU_EXREG_RFLAGS,
- VCPU_EXREG_SEGMENTS,
- VCPU_EXREG_EXIT_INFO_1,
- VCPU_EXREG_EXIT_INFO_2,
+ VCPU_REG_CR3,
+ VCPU_REG_ERAPS = VCPU_REG_CR3,
+ VCPU_REG_CR4,
+ VCPU_REG_RFLAGS,
+ VCPU_REG_SEGMENTS,
+ VCPU_REG_EXIT_INFO_1,
+ VCPU_REG_EXIT_INFO_2,
+
+ NR_VCPU_TOTAL_REGS,
};
enum {
@@ -328,11 +331,11 @@ struct kvm_kernel_irq_routing_entry;
* the number of unique SPs that can theoretically be created is 2^n, where n
* is the number of bits that are used to compute the role.
*
- * But, even though there are 20 bits in the mask below, not all combinations
+ * But, even though there are 21 bits in the mask below, not all combinations
* of modes and flags are possible:
*
* - invalid shadow pages are not accounted, mirror pages are not shadowed,
- * so the bits are effectively 18.
+ * so the bits are effectively 19.
*
* - quadrant will only be used if has_4_byte_gpte=1 (non-PAE paging);
* execonly and ad_disabled are only used for nested EPT which has
@@ -343,11 +346,11 @@ struct kvm_kernel_irq_routing_entry;
* paging has exactly one upper level, making level completely redundant
* when has_4_byte_gpte=1.
*
- * - on top of this, smep_andnot_wp and smap_andnot_wp are only set if
- * cr0_wp=0, therefore these three bits only give rise to 5 possibilities.
+ * - on top of this, smap_andnot_wp is only set if cr0_wp=0,
+ * therefore these two bits only give rise to 3 possibilities.
*
* Therefore, the maximum number of possible upper-level shadow pages for a
- * single gfn is a bit less than 2^13.
+ * single gfn is a bit less than 2^14.
*/
union kvm_mmu_page_role {
u32 word;
@@ -356,17 +359,26 @@ union kvm_mmu_page_role {
unsigned has_4_byte_gpte:1;
unsigned quadrant:2;
unsigned direct:1;
- unsigned access:3;
+ unsigned access:4;
unsigned invalid:1;
unsigned efer_nx:1;
unsigned cr0_wp:1;
- unsigned smep_andnot_wp:1;
unsigned smap_andnot_wp:1;
unsigned ad_disabled:1;
unsigned guest_mode:1;
unsigned passthrough:1;
unsigned is_mirror:1;
- unsigned :4;
+
+ /*
+ * cr4_smep is also set for EPT MBEC. Because it affects
+ * which pages are considered non-present (bit 10 additionally
+ * must be zero if MBEC is on) it has to be in the base role.
+ * It also has to be in the base role for AMD GMET because
+ * kernel-executable pages need to have U=0 with GMET enabled.
+ */
+ unsigned cr4_smep:1;
+
+ unsigned:3;
/*
* This is left at the top of the word so that
@@ -392,10 +404,10 @@ union kvm_mmu_page_role {
* tables (because KVM doesn't support Protection Keys with shadow paging), and
* CR0.PG, CR4.PAE, and CR4.PSE are indirectly reflected in role.level.
*
- * Note, SMEP and SMAP are not redundant with sm*p_andnot_wp in the page role.
- * If CR0.WP=1, KVM can reuse shadow pages for the guest regardless of SMEP and
- * SMAP, but the MMU's permission checks for software walks need to be SMEP and
- * SMAP aware regardless of CR0.WP.
+ * Note, SMAP is not redundant with smap_andnot_wp in the page role. If
+ * CR0.WP=1, KVM can reuse shadow pages for the guest regardless of SMAP,
+ * but the MMU's permission checks for software walks need to be SMAP
+ * aware regardless of CR0.WP.
*/
union kvm_mmu_extended_role {
u32 word;
@@ -405,9 +417,15 @@ union kvm_mmu_extended_role {
unsigned int cr4_pse:1;
unsigned int cr4_pke:1;
unsigned int cr4_smap:1;
- unsigned int cr4_smep:1;
unsigned int cr4_la57:1;
unsigned int efer_lma:1;
+
+ /*
+ * True if either CR4.SMEP or EFER.NXE is set. For AMD NPT
+ * this reflects the "real" host CR4.SMEP, whereas cr4_smep
+ * actually tracks GMET.
+ */
+ unsigned int has_pferr_fetch:1;
};
};
@@ -492,7 +510,7 @@ struct kvm_mmu {
* Byte index: page fault error code [4:1]
* Bit index: pte permissions in ACC_* format
*/
- u8 permissions[16];
+ u16 permissions[16];
u64 *pae_root;
u64 *pml4_root;
@@ -799,9 +817,10 @@ struct kvm_vcpu_arch {
* rip and regs accesses must go through
* kvm_{register,rip}_{read,write} functions.
*/
- unsigned long regs[NR_VCPU_REGS];
- u32 regs_avail;
- u32 regs_dirty;
+ unsigned long regs[NR_VCPU_GENERAL_PURPOSE_REGS];
+ unsigned long rip;
+ DECLARE_BITMAP(regs_avail, NR_VCPU_TOTAL_REGS);
+ DECLARE_BITMAP(regs_dirty, NR_VCPU_TOTAL_REGS);
unsigned long cr0;
unsigned long cr0_guest_owned_bits;
@@ -1887,6 +1906,7 @@ struct kvm_x86_ops {
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
u8 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+ bool (*tdp_has_smep)(struct kvm *kvm);
void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
int root_level);
@@ -2010,6 +2030,10 @@ struct kvm_x86_nested_ops {
struct kvm_nested_state *kvm_state);
bool (*get_nested_state_pages)(struct kvm_vcpu *vcpu);
int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
+ gpa_t (*translate_nested_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa,
+ u64 access,
+ struct x86_exception *exception,
+ u64 pte_access);
int (*enable_evmcs)(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version);
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index bcfeb5e7c0edf..aa63431ba92c3 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -243,6 +243,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_MISC_ENABLE_NP BIT(0)
#define SVM_MISC_ENABLE_SEV BIT(1)
#define SVM_MISC_ENABLE_SEV_ES BIT(2)
+#define SVM_MISC_ENABLE_GMET BIT(3)
#define SVM_MISC2_ENABLE_V_LBR BIT_ULL(0)
#define SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE BIT_ULL(1)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 49d8551d285d9..3f1b3096ff040 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -536,6 +536,7 @@ enum vmcs_field {
#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
#define VMX_EPT_INVEPT_BIT (1ull << 20)
#define VMX_EPT_AD_BIT (1ull << 21)
+#define VMX_EPT_ADVANCED_VMEXIT_INFO_BIT (1ull << 22)
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
@@ -561,10 +562,12 @@ enum vmcs_field {
#define VMX_EPT_ACCESS_BIT (1ull << 8)
#define VMX_EPT_DIRTY_BIT (1ull << 9)
#define VMX_EPT_SUPPRESS_VE_BIT (1ull << 63)
+
#define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \
VMX_EPT_WRITABLE_MASK | \
VMX_EPT_EXECUTABLE_MASK)
#define VMX_EPT_MT_MASK (7ull << VMX_EPT_MT_EPTE_SHIFT)
+#define VMX_EPT_USER_EXECUTABLE_MASK (1ull << 10)
static inline u8 vmx_eptp_page_walk_level(u64 eptp)
{
@@ -609,17 +612,24 @@ enum vm_entry_failure_code {
#define EPT_VIOLATION_PROT_READ BIT(3)
#define EPT_VIOLATION_PROT_WRITE BIT(4)
#define EPT_VIOLATION_PROT_EXEC BIT(5)
-#define EPT_VIOLATION_EXEC_FOR_RING3_LIN BIT(6)
+#define EPT_VIOLATION_PROT_USER_EXEC BIT(6)
#define EPT_VIOLATION_PROT_MASK (EPT_VIOLATION_PROT_READ | \
EPT_VIOLATION_PROT_WRITE | \
- EPT_VIOLATION_PROT_EXEC)
+ EPT_VIOLATION_PROT_EXEC | \
+ EPT_VIOLATION_PROT_USER_EXEC)
#define EPT_VIOLATION_GVA_IS_VALID BIT(7)
#define EPT_VIOLATION_GVA_TRANSLATED BIT(8)
+#define EPT_VIOLATION_GVA_USER BIT(9)
+#define EPT_VIOLATION_GVA_WRITABLE BIT(10)
+#define EPT_VIOLATION_GVA_NX BIT(11)
#define EPT_VIOLATION_RWX_TO_PROT(__epte) (((__epte) & VMX_EPT_RWX_MASK) << 3)
+#define EPT_VIOLATION_USER_EXEC_TO_PROT(__epte) (((__epte) & VMX_EPT_USER_EXECUTABLE_MASK) >> 4)
static_assert(EPT_VIOLATION_RWX_TO_PROT(VMX_EPT_RWX_MASK) ==
(EPT_VIOLATION_PROT_READ | EPT_VIOLATION_PROT_WRITE | EPT_VIOLATION_PROT_EXEC));
+static_assert(EPT_VIOLATION_USER_EXEC_TO_PROT(VMX_EPT_USER_EXECUTABLE_MASK) ==
+ (EPT_VIOLATION_PROT_USER_EXEC));
/*
* Exit Qualifications for NOTIFY VM EXIT
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 4438ecac9a89b..015c6947b462e 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -2041,7 +2041,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
* read with kvm_read_guest().
*/
if (!hc->fast && mmu_is_nested(vcpu)) {
- hc->ingpa = translate_nested_gpa(vcpu, hc->ingpa, 0, NULL);
+ hc->ingpa = kvm_x86_ops.nested_ops->translate_nested_gpa(
+ vcpu, hc->ingpa,
+ PFERR_GUEST_FINAL_MASK, NULL, 0);
if (unlikely(hc->ingpa == INVALID_GPA))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c
index 24a710d373238..36ac61724dd7f 100644
--- a/arch/x86/kvm/kvm-asm-offsets.c
+++ b/arch/x86/kvm/kvm-asm-offsets.c
@@ -24,6 +24,7 @@ static void __used common(void)
if (IS_ENABLED(CONFIG_KVM_INTEL)) {
BLANK();
+ OFFSET(VMX_vcpu_arch_regs, vcpu_vmx, vcpu.arch.regs);
OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl);
}
}
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 8ddb01191d6f6..2ae492ad6412b 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -67,29 +67,29 @@ static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
{
kvm_assert_register_caching_allowed(vcpu);
- return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ return test_bit(reg, vcpu->arch.regs_avail);
}
static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
{
kvm_assert_register_caching_allowed(vcpu);
- return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+ return test_bit(reg, vcpu->arch.regs_dirty);
}
static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
{
kvm_assert_register_caching_allowed(vcpu);
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ __set_bit(reg, vcpu->arch.regs_avail);
}
static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
{
kvm_assert_register_caching_allowed(vcpu);
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+ __set_bit(reg, vcpu->arch.regs_avail);
+ __set_bit(reg, vcpu->arch.regs_dirty);
}
/*
@@ -102,7 +102,29 @@ static __always_inline bool kvm_register_test_and_mark_available(struct kvm_vcpu
enum kvm_reg reg)
{
kvm_assert_register_caching_allowed(vcpu);
- return arch___test_and_set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ return arch___test_and_set_bit(reg, vcpu->arch.regs_avail);
+}
+
+static __always_inline void kvm_clear_available_registers(struct kvm_vcpu *vcpu,
+ unsigned long clear_mask)
+{
+ BUILD_BUG_ON(sizeof(clear_mask) != sizeof(vcpu->arch.regs_avail[0]));
+ BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.regs_avail) != 1);
+
+ /*
+ * Note the bitwise-AND! In practice, a straight write would also work
+ * as KVM initializes the mask to all ones and never clears registers
+ * that are eagerly synchronized. Using a bitwise-AND adds a bit of
+ * sanity checking as incorrectly marking an eagerly sync'd register
+ * unavailable will generate a WARN due to an unexpected cache request.
+ */
+ vcpu->arch.regs_avail[0] &= ~clear_mask;
+}
+
+static __always_inline void kvm_reset_dirty_registers(struct kvm_vcpu *vcpu)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.regs_dirty) != 1);
+ vcpu->arch.regs_dirty[0] = 0;
}
/*
@@ -112,7 +134,7 @@ static __always_inline bool kvm_register_test_and_mark_available(struct kvm_vcpu
*/
static inline unsigned long kvm_register_read_raw(struct kvm_vcpu *vcpu, int reg)
{
- if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
+ if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_GENERAL_PURPOSE_REGS))
return 0;
if (!kvm_register_is_available(vcpu, reg))
@@ -124,7 +146,7 @@ static inline unsigned long kvm_register_read_raw(struct kvm_vcpu *vcpu, int reg
static inline void kvm_register_write_raw(struct kvm_vcpu *vcpu, int reg,
unsigned long val)
{
- if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
+ if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_GENERAL_PURPOSE_REGS))
return;
vcpu->arch.regs[reg] = val;
@@ -133,12 +155,16 @@ static inline void kvm_register_write_raw(struct kvm_vcpu *vcpu, int reg,
static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
{
- return kvm_register_read_raw(vcpu, VCPU_REGS_RIP);
+ if (!kvm_register_is_available(vcpu, VCPU_REG_RIP))
+ kvm_x86_call(cache_reg)(vcpu, VCPU_REG_RIP);
+
+ return vcpu->arch.rip;
}
static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
{
- kvm_register_write_raw(vcpu, VCPU_REGS_RIP, val);
+ vcpu->arch.rip = val;
+ kvm_register_mark_dirty(vcpu, VCPU_REG_RIP);
}
static inline unsigned long kvm_rsp_read(struct kvm_vcpu *vcpu)
@@ -155,8 +181,8 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
{
might_sleep(); /* on svm */
- if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
- kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_PDPTR);
+ if (!kvm_register_is_available(vcpu, VCPU_REG_PDPTR))
+ kvm_x86_call(cache_reg)(vcpu, VCPU_REG_PDPTR);
return vcpu->arch.walk_mmu->pdptrs[index];
}
@@ -170,8 +196,8 @@ static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
{
ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
if ((tmask & vcpu->arch.cr0_guest_owned_bits) &&
- !kvm_register_is_available(vcpu, VCPU_EXREG_CR0))
- kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR0);
+ !kvm_register_is_available(vcpu, VCPU_REG_CR0))
+ kvm_x86_call(cache_reg)(vcpu, VCPU_REG_CR0);
return vcpu->arch.cr0 & mask;
}
@@ -192,8 +218,8 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
{
ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
if ((tmask & vcpu->arch.cr4_guest_owned_bits) &&
- !kvm_register_is_available(vcpu, VCPU_EXREG_CR4))
- kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR4);
+ !kvm_register_is_available(vcpu, VCPU_REG_CR4))
+ kvm_x86_call(cache_reg)(vcpu, VCPU_REG_CR4);
return vcpu->arch.cr4 & mask;
}
@@ -207,8 +233,8 @@ static __always_inline bool kvm_is_cr4_bit_set(struct kvm_vcpu *vcpu,
static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
{
- if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
- kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR3);
+ if (!kvm_register_is_available(vcpu, VCPU_REG_CR3))
+ kvm_x86_call(cache_reg)(vcpu, VCPU_REG_CR3);
return vcpu->arch.cr3;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 830f46145692a..ddf4e467c071e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -37,6 +37,13 @@ extern bool __read_mostly enable_mmio_caching;
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3
+#define ACC_READ_MASK PT_PRESENT_MASK
+#define ACC_WRITE_MASK PT_WRITABLE_MASK
+#define ACC_USER_MASK PT_USER_MASK /* non EPT */
+#define ACC_USER_EXEC_MASK ACC_USER_MASK /* EPT only */
+#define ACC_EXEC_MASK 8
+#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK | ACC_READ_MASK)
+
#define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PSE | X86_CR4_PAE | X86_CR4_LA57 | \
X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE)
@@ -76,19 +83,24 @@ static inline gfn_t kvm_mmu_max_gfn(void)
return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1;
}
+static inline bool mmu_has_mbec(struct kvm_mmu *mmu)
+{
+ return mmu->root_role.cr4_smep;
+}
+
u8 kvm_mmu_get_max_tdp_level(void);
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value);
void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask);
-void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
+void kvm_mmu_set_ept_masks(bool has_ad_bits);
void kvm_init_mmu(struct kvm_vcpu *vcpu);
-void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
- unsigned long cr4, u64 efer, gpa_t nested_cr3);
+void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr4,
+ u64 efer, gpa_t nested_cr3, u64 misc_ctl);
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
int huge_page_level, bool accessed_dirty,
- gpa_t new_eptp);
+ bool mbec, gpa_t new_eptp);
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len);
@@ -288,17 +300,17 @@ static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count)
atomic64_add(count, &kvm->stat.pages[level - 1]);
}
-gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access,
- struct x86_exception *exception);
-
static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu,
struct kvm_mmu *mmu,
gpa_t gpa, u64 access,
- struct x86_exception *exception)
+ struct x86_exception *exception,
+ u64 pte_access)
{
if (mmu != &vcpu->arch.nested_mmu)
return gpa;
- return translate_nested_gpa(vcpu, gpa, access, exception);
+ return kvm_x86_ops.nested_ops->translate_nested_gpa(vcpu, gpa, access,
+ exception,
+ pte_access);
}
static inline bool kvm_has_mirrored_tdp(const struct kvm *kvm)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 91843e9224d04..db1b82eae4da7 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -56,6 +56,7 @@
#include <asm/io.h>
#include <asm/set_memory.h>
#include <asm/spec-ctrl.h>
+#include <asm/svm.h>
#include <asm/vmx.h>
#include "trace.h"
@@ -230,13 +231,18 @@ static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \
}
BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse);
-BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep);
+BUILD_MMU_ROLE_ACCESSOR(base, cr4, smep);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57);
BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
BUILD_MMU_ROLE_ACCESSOR(ext, efer, lma);
+static inline bool has_pferr_fetch(struct kvm_mmu *mmu)
+{
+ return mmu->cpu_role.ext.has_pferr_fetch;
+}
+
static inline bool is_cr0_pg(struct kvm_mmu *mmu)
{
return mmu->cpu_role.base.level > 0;
@@ -2023,7 +2029,7 @@ static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
*/
const union kvm_mmu_page_role sync_role_ign = {
.level = 0xf,
- .access = 0x7,
+ .access = ACC_ALL,
.quadrant = 0x3,
.passthrough = 0x1,
};
@@ -3459,12 +3465,13 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
- int ret;
+ int ret, access;
gfn_t base_gfn = fault->gfn;
kvm_mmu_hugepage_adjust(vcpu, fault);
- trace_kvm_mmu_spte_requested(fault);
+ access = vcpu->arch.mmu->root_role.access;
+ trace_kvm_mmu_spte_requested(fault, access);
for_each_shadow_entry(vcpu, fault->addr, it) {
/*
* We cannot overwrite existing page tables with an NX
@@ -3477,7 +3484,7 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (it.level == fault->goal_level)
break;
- sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
+ sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, access);
if (sp == ERR_PTR(-EEXIST))
continue;
@@ -3490,7 +3497,7 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (WARN_ON_ONCE(it.level != fault->goal_level))
return -EFAULT;
- ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
+ ret = mmu_set_spte(vcpu, fault->slot, it.sptep, access,
base_gfn, fault->pfn, fault);
if (ret == RET_PF_SPURIOUS)
return ret;
@@ -4361,7 +4368,14 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
{
if (exception)
exception->error_code = 0;
- return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
+ /*
+ * EPT MBEC uses the effective access bits from the PTE to distinguish
+ * user and supervisor accesses, and treats every linear address as a
+ * user-mode address if CR0.PG=0. Therefore *include* ACC_USER_MASK in
+ * the last argument to kvm_translate_gpa (which NPT does not use).
+ */
+ return kvm_translate_gpa(vcpu, mmu, vaddr, access | PFERR_GUEST_FINAL_MASK,
+ exception, ACC_ALL);
}
static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
@@ -5497,7 +5511,7 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
static inline bool boot_cpu_is_amd(void)
{
WARN_ON_ONCE(!tdp_enabled);
- return shadow_x_mask == 0;
+ return shadow_xs_mask == 0;
}
/*
@@ -5542,55 +5556,106 @@ reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
max_huge_page_level);
}
-#define BYTE_MASK(access) \
- ((1 & (access) ? 2 : 0) | \
- (2 & (access) ? 4 : 0) | \
- (3 & (access) ? 8 : 0) | \
- (4 & (access) ? 16 : 0) | \
- (5 & (access) ? 32 : 0) | \
- (6 & (access) ? 64 : 0) | \
- (7 & (access) ? 128 : 0))
-
+/*
+ * Build a mask with all combinations of PTE access rights that
+ * include the given access bit. The mask can be queried with
+ * "mask & (1 << access)", where access is a combination of
+ * ACC_* bits.
+ *
+ * By mixing and matching multiple masks returned by ACC_BITS_MASK,
+ * update_permission_bitmask() builds what is effectively a
+ * two-dimensional array of bools. The second dimension is
+ * provided by individual bits of permissions[pfec >> 1], and
+ * bitwise &, | and ~ operations act on all 16 possible
+ * combinations of ACC_* bits at once.
+ */
+#define ACC_BITS_MASK(access) \
+ ((1 & (access) ? 1 << 1 : 0) | \
+ (2 & (access) ? 1 << 2 : 0) | \
+ (3 & (access) ? 1 << 3 : 0) | \
+ (4 & (access) ? 1 << 4 : 0) | \
+ (5 & (access) ? 1 << 5 : 0) | \
+ (6 & (access) ? 1 << 6 : 0) | \
+ (7 & (access) ? 1 << 7 : 0) | \
+ (8 & (access) ? 1 << 8 : 0) | \
+ (9 & (access) ? 1 << 9 : 0) | \
+ (10 & (access) ? 1 << 10 : 0) | \
+ (11 & (access) ? 1 << 11 : 0) | \
+ (12 & (access) ? 1 << 12 : 0) | \
+ (13 & (access) ? 1 << 13 : 0) | \
+ (14 & (access) ? 1 << 14 : 0) | \
+ (15 & (access) ? 1 << 15 : 0))
-static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
+static void update_permission_bitmask(struct kvm_mmu *mmu, bool tdp, bool ept)
{
- unsigned byte;
+ unsigned index;
- const u8 x = BYTE_MASK(ACC_EXEC_MASK);
- const u8 w = BYTE_MASK(ACC_WRITE_MASK);
- const u8 u = BYTE_MASK(ACC_USER_MASK);
+ const u16 w = ACC_BITS_MASK(ACC_WRITE_MASK);
+ const u16 r = ACC_BITS_MASK(ACC_READ_MASK);
bool cr4_smep = is_cr4_smep(mmu);
bool cr4_smap = is_cr4_smap(mmu);
bool cr0_wp = is_cr0_wp(mmu);
bool efer_nx = is_efer_nx(mmu);
- for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
- unsigned pfec = byte << 1;
+ /*
+ * In hardware, page fault error codes are generated (as the name
+ * suggests) on any kind of page fault. permission_fault() and
+ * paging_tmpl.h already use the same bits after a successful page
+ * table walk, to indicate the kind of access being performed.
+ *
+ * However, PFERR_PRESENT_MASK and PFERR_RSVD_MASK are never set here,
+ * exactly because the page walk is successful. PFERR_PRESENT_MASK is
+ * removed by the shift, while PFERR_RSVD_MASK is repurposed in
+ * permission_fault() to indicate accesses that are *not* subject to
+ * SMAP restrictions.
+ */
+ for (index = 0; index < ARRAY_SIZE(mmu->permissions); ++index) {
+ unsigned pfec = index << 1;
/*
- * Each "*f" variable has a 1 bit for each UWX value
+ * Each "*f" variable has a 1 bit for each ACC_* combo
* that causes a fault with the given PFEC.
*/
+ /* Faults from reads to non-readable pages */
+ u16 rf = (pfec & (PFERR_WRITE_MASK|PFERR_FETCH_MASK)) ? 0 : (u16)~r;
/* Faults from writes to non-writable pages */
- u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
+ u16 wf = (pfec & PFERR_WRITE_MASK) ? (u16)~w : 0;
/* Faults from user mode accesses to supervisor pages */
- u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
- /* Faults from fetches of non-executable pages*/
- u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
- /* Faults from kernel mode fetches of user pages */
- u8 smepf = 0;
+ u16 uf = 0;
+ /* Faults from fetches of non-executable pages */
+ u16 ff = 0;
/* Faults from kernel mode accesses of user pages */
- u8 smapf = 0;
+ u16 smapf = 0;
+
+ if (ept) {
+ const u16 xs = ACC_BITS_MASK(ACC_EXEC_MASK);
+ const u16 xu = ACC_BITS_MASK(ACC_USER_EXEC_MASK);
+
+ if (pfec & PFERR_FETCH_MASK) {
+ /* Ignore XU unless MBEC is enabled. */
+ if (cr4_smep)
+ ff = pfec & PFERR_USER_MASK ? (u16)~xu : (u16)~xs;
+ else
+ ff = (u16)~xs;
+ }
+ } else {
+ const u16 x = ACC_BITS_MASK(ACC_EXEC_MASK);
+ const u16 u = ACC_BITS_MASK(ACC_USER_MASK);
- if (!ept) {
/* Faults from kernel mode accesses to user pages */
- u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
+ u16 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
- /* Not really needed: !nx will cause pte.nx to fault */
- if (!efer_nx)
- ff = 0;
+ /*
+ * For NPT GMET, U=0 does not affect reads and writes. Fetches
+ * are handled below via cr4_smep.
+ */
+ if (!(tdp && cr4_smep))
+ uf = (pfec & PFERR_USER_MASK) ? (u16)~u : 0;
+
+ if (efer_nx)
+ ff |= (pfec & PFERR_FETCH_MASK) ? (u16)~x : 0;
/* Allow supervisor writes if !cr0.wp */
if (!cr0_wp)
@@ -5598,7 +5663,7 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
/* Disallow supervisor fetches of user code if cr4.smep */
if (cr4_smep)
- smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
+ ff |= (pfec & PFERR_FETCH_MASK) ? kf : 0;
/*
* SMAP:kernel-mode data accesses from user-mode
@@ -5611,16 +5676,15 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
* - The access is supervisor mode
* - If implicit supervisor access or X86_EFLAGS_AC is clear
*
- * Here, we cover the first four conditions.
- * The fifth is computed dynamically in permission_fault();
- * PFERR_RSVD_MASK bit will be set in PFEC if the access is
- * *not* subject to SMAP restrictions.
+ * Here, we cover the first four conditions. The fifth
+ * is computed dynamically in permission_fault(), which sets
+ * PFERR_RSVD_MASK if the access is *not* subject to SMAP.
*/
if (cr4_smap)
smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
}
- mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
+ mmu->permissions[index] = ff | uf | wf | rf | smapf;
}
}
@@ -5699,7 +5763,7 @@ static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
return;
reset_guest_rsvds_bits_mask(vcpu, mmu);
- update_permission_bitmask(mmu, false);
+ update_permission_bitmask(mmu, mmu == &vcpu->arch.guest_mmu, false);
update_pkru_bitmask(mmu);
}
@@ -5734,7 +5798,7 @@ static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
role.base.efer_nx = ____is_efer_nx(regs);
role.base.cr0_wp = ____is_cr0_wp(regs);
- role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
+ role.base.cr4_smep = ____is_cr4_smep(regs);
role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
@@ -5746,7 +5810,6 @@ static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
else
role.base.level = PT32_ROOT_LEVEL;
- role.ext.cr4_smep = ____is_cr4_smep(regs);
role.ext.cr4_smap = ____is_cr4_smap(regs);
role.ext.cr4_pse = ____is_cr4_pse(regs);
@@ -5754,6 +5817,8 @@ static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
role.ext.efer_lma = ____is_efer_lma(regs);
+
+ role.ext.has_pferr_fetch = role.base.efer_nx | role.base.cr4_smep;
return role;
}
@@ -5803,8 +5868,8 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
{
union kvm_mmu_page_role role = {0};
- role.access = ACC_ALL;
role.cr0_wp = true;
+ role.cr4_smep = kvm_x86_call(tdp_has_smep)(vcpu->kvm);
role.efer_nx = true;
role.smm = cpu_role.base.smm;
role.guest_mode = cpu_role.base.guest_mode;
@@ -5813,6 +5878,11 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
role.direct = true;
role.has_4_byte_gpte = false;
+ /* All TDP pages are supervisor-executable */
+ role.access = ACC_ALL;
+ if (role.cr4_smep && shadow_user_mask)
+ role.access &= ~ACC_USER_MASK;
+
return role;
}
@@ -5892,13 +5962,13 @@ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
}
-void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
- unsigned long cr4, u64 efer, gpa_t nested_cr3)
+void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr4,
+ u64 efer, gpa_t nested_cr3, u64 misc_ctl)
{
struct kvm_mmu *context = &vcpu->arch.guest_mmu;
struct kvm_mmu_role_regs regs = {
- .cr0 = cr0,
- .cr4 = cr4 & ~X86_CR4_PKE,
+ .cr0 = X86_CR0_PG | X86_CR0_WP,
+ .cr4 = cr4 & ~(X86_CR4_PKE | X86_CR4_SMAP),
.efer = efer,
};
union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
@@ -5906,6 +5976,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
/* NPT requires CR0.PG=1. */
WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode);
+ cpu_role.base.cr4_smep = (misc_ctl & SVM_MISC_ENABLE_GMET) != 0;
root_role = cpu_role.base;
root_role.level = kvm_mmu_get_tdp_level(vcpu);
@@ -5920,7 +5991,7 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_shadow_npt_mmu);
static union kvm_cpu_role
kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
- bool execonly, u8 level)
+ bool execonly, u8 level, bool mbec)
{
union kvm_cpu_role role = {0};
@@ -5930,6 +6001,7 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
*/
WARN_ON_ONCE(is_smm(vcpu));
role.base.level = level;
+ role.base.cr4_smep = mbec;
role.base.has_4_byte_gpte = false;
role.base.direct = false;
role.base.ad_disabled = !accessed_dirty;
@@ -5945,13 +6017,13 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
int huge_page_level, bool accessed_dirty,
- gpa_t new_eptp)
+ bool mbec, gpa_t new_eptp)
{
struct kvm_mmu *context = &vcpu->arch.guest_mmu;
u8 level = vmx_eptp_page_walk_level(new_eptp);
union kvm_cpu_role new_mode =
kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
- execonly, level);
+ execonly, level, mbec);
if (new_mode.as_u64 != context->cpu_role.as_u64) {
/* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
@@ -5962,7 +6034,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->gva_to_gpa = ept_gva_to_gpa;
context->sync_spte = ept_sync_spte;
- update_permission_bitmask(context, true);
+ update_permission_bitmask(context, true, true);
context->pkru_mask = 0;
reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
reset_ept_shadow_zero_bits_mask(context, execonly);
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index 764e3015d021b..fa01719baf8d4 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -25,7 +25,8 @@
#define KVM_MMU_PAGE_PRINTK() ({ \
const char *saved_ptr = trace_seq_buffer_ptr(p); \
static const char *access_str[] = { \
- "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \
+ "----", "r---", "-w--", "rw--", "--u-", "r-u-", "-wu-", "rwu-", \
+ "---x", "r--x", "-w-x", "rw-x", "--ux", "r-ux", "-wux", "rwux" \
}; \
union kvm_mmu_page_role role; \
\
@@ -356,8 +357,8 @@ TRACE_EVENT(
__entry->sptep = virt_to_phys(sptep);
__entry->level = level;
__entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
- __entry->x = is_executable_pte(__entry->spte);
- __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
+ __entry->x = (__entry->spte & (shadow_xs_mask | shadow_nx_mask)) == shadow_xs_mask;
+ __entry->u = !!(__entry->spte & (shadow_xu_mask | shadow_user_mask));
),
TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
@@ -365,30 +366,32 @@ TRACE_EVENT(
__entry->r ? "r" : "-",
__entry->spte & PT_WRITABLE_MASK ? "w" : "-",
__entry->x ? "x" : "-",
- __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
+ __entry->u ? "u" : "-",
__entry->level, __entry->sptep
)
);
TRACE_EVENT(
kvm_mmu_spte_requested,
- TP_PROTO(struct kvm_page_fault *fault),
- TP_ARGS(fault),
+ TP_PROTO(struct kvm_page_fault *fault, u8 access),
+ TP_ARGS(fault, access),
TP_STRUCT__entry(
__field(u64, gfn)
__field(u64, pfn)
__field(u8, level)
+ __field(u8, access)
),
TP_fast_assign(
__entry->gfn = fault->gfn;
__entry->pfn = fault->pfn | (fault->gfn & (KVM_PAGES_PER_HPAGE(fault->goal_level) - 1));
__entry->level = fault->goal_level;
+ __entry->access = access;
),
- TP_printk("gfn %llx pfn %llx level %d",
- __entry->gfn, __entry->pfn, __entry->level
+ TP_printk("gfn %llx pfn %llx level %d access %x",
+ __entry->gfn, __entry->pfn, __entry->level, __entry->access
)
);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 901cd2bd40b84..07100bbfc2701 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -124,12 +124,17 @@ static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *acce
*access &= mask;
}
-static inline int FNAME(is_present_gpte)(unsigned long pte)
+static inline int FNAME(is_present_gpte)(struct kvm_mmu *mmu,
+ unsigned long pte)
{
#if PTTYPE != PTTYPE_EPT
return pte & PT_PRESENT_MASK;
#else
- return pte & 7;
+ /*
+ * For EPT, an entry is present if any of bits 2:0 are set.
+ * With mode-based execute control, bit 10 also indicates presence.
+ */
+ return pte & (7 | (mmu_has_mbec(mmu) ? VMX_EPT_USER_EXECUTABLE_MASK : 0));
#endif
}
@@ -152,7 +157,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *spte,
u64 gpte)
{
- if (!FNAME(is_present_gpte)(gpte))
+ if (!FNAME(is_present_gpte)(vcpu->arch.mmu, gpte))
goto no_present;
/* Prefetch only accessed entries (unless A/D bits are disabled). */
@@ -170,25 +175,31 @@ no_present:
return true;
}
-/*
- * For PTTYPE_EPT, a page table can be executable but not readable
- * on supported processors. Therefore, set_spte does not automatically
- * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
- * to signify readability since it isn't used in the EPT case
- */
static inline unsigned FNAME(gpte_access)(u64 gpte)
{
unsigned access;
+ /*
+ * Set bits in ACC_*_MASK even if they might not be used in the
+ * actual checks. For example, if EFER.NX is clear permission_fault()
+ * will ignore ACC_EXEC_MASK, and if MBEC is disabled it will
+ * ignore ACC_USER_EXEC_MASK.
+ */
#if PTTYPE == PTTYPE_EPT
access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
- ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
+ ((gpte & VMX_EPT_READABLE_MASK) ? ACC_READ_MASK : 0) |
+ ((gpte & VMX_EPT_USER_EXECUTABLE_MASK) ? ACC_USER_EXEC_MASK : 0);
#else
- BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
- BUILD_BUG_ON(ACC_EXEC_MASK != 1);
+ /*
+ * P is set here, so the page is always readable and W/U/!NX represent
+ * allowed accesses.
+ */
+ BUILD_BUG_ON(ACC_READ_MASK != PT_PRESENT_MASK);
+ BUILD_BUG_ON(ACC_WRITE_MASK != PT_WRITABLE_MASK);
+ BUILD_BUG_ON(ACC_USER_MASK != PT_USER_MASK);
+ BUILD_BUG_ON(ACC_EXEC_MASK & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK));
access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK);
- /* Combine NX with P (which is set here) to get ACC_EXEC_MASK. */
- access ^= (gpte >> PT64_NX_SHIFT);
+ access |= gpte & PT64_NX_MASK ? 0 : ACC_EXEC_MASK;
#endif
return access;
@@ -332,7 +343,7 @@ retry_walk:
if (walker->level == PT32E_ROOT_LEVEL) {
pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
trace_kvm_mmu_paging_element(pte, walker->level);
- if (!FNAME(is_present_gpte)(pte))
+ if (!FNAME(is_present_gpte)(mmu, pte))
goto error;
--walker->level;
}
@@ -377,7 +388,8 @@ retry_walk:
walker->pte_gpa[walker->level - 1] = pte_gpa;
real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn),
- nested_access, &walker->fault);
+ nested_access | PFERR_GUEST_PAGE_MASK,
+ &walker->fault, 0);
/*
* FIXME: This can happen if emulation (of an INS/OUTS
@@ -414,7 +426,7 @@ retry_walk:
*/
pte_access = pt_access & (pte ^ walk_nx_mask);
- if (unlikely(!FNAME(is_present_gpte)(pte)))
+ if (unlikely(!FNAME(is_present_gpte)(mmu, pte)))
goto error;
if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) {
@@ -445,7 +457,9 @@ retry_walk:
gfn += pse36_gfn_delta(pte);
#endif
- real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault);
+ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn),
+ access | PFERR_GUEST_FINAL_MASK,
+ &walker->fault, walker->pte_access);
if (real_gpa == INVALID_GPA)
return 0;
@@ -475,7 +489,7 @@ retry_walk:
error:
errcode |= write_fault | user_fault;
- if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu)))
+ if (fetch_fault && has_pferr_fetch(mmu))
errcode |= PFERR_FETCH_MASK;
walker->fault.vector = PF_VECTOR;
@@ -492,7 +506,7 @@ error:
* [2:0] - Derive from the access bits. The exit_qualification might be
* out of date if it is serving an EPT misconfiguration.
* [5:3] - Calculated by the page walk of the guest EPT page tables
- * [7:8] - Derived from [7:8] of real exit_qualification
+ * [7:11] - Derived from [7:11] of real exit_qualification
*
* The other bits are set to 0.
*/
@@ -501,16 +515,27 @@ error:
if (write_fault)
walker->fault.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
- if (user_fault)
- walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ;
- if (fetch_fault)
+ else if (fetch_fault)
walker->fault.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
+ else
+ walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ;
+
+ /*
+ * Accesses to guest paging structures are either "reads" or
+ * "read+write" accesses, so consider them the latter if write_fault
+ * is true.
+ */
+ if (access & PFERR_GUEST_PAGE_MASK)
+ walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ;
/*
* Note, pte_access holds the raw RWX bits from the EPTE, not
* ACC_*_MASK flags!
*/
walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access);
+ if (mmu_has_mbec(mmu))
+ walker->fault.exit_qualification |=
+ EPT_VIOLATION_USER_EXEC_TO_PROT(pte_access);
}
#endif
walker->fault.address = addr;
@@ -709,7 +734,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
*/
kvm_mmu_hugepage_adjust(vcpu, fault);
- trace_kvm_mmu_spte_requested(fault);
+ trace_kvm_mmu_spte_requested(fault, gw->pte_access);
for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
/*
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 4e753386c8d46..72d2394e089c9 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -30,8 +30,9 @@ bool __read_mostly kvm_ad_enabled;
u64 __read_mostly shadow_host_writable_mask;
u64 __read_mostly shadow_mmu_writable_mask;
u64 __read_mostly shadow_nx_mask;
-u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
u64 __read_mostly shadow_user_mask;
+u64 __read_mostly shadow_xs_mask; /* mutual exclusive with nx_mask and user_mask */
+u64 __read_mostly shadow_xu_mask; /* mutual exclusive with nx_mask and user_mask */
u64 __read_mostly shadow_accessed_mask;
u64 __read_mostly shadow_dirty_mask;
u64 __read_mostly shadow_mmio_value;
@@ -195,12 +196,6 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
int is_host_mmio = -1;
bool wrprot = false;
- /*
- * For the EPT case, shadow_present_mask has no RWX bits set if
- * exec-only page table entries are supported. In that case,
- * ACC_USER_MASK and shadow_user_mask are used to represent
- * read access. See FNAME(gpte_access) in paging_tmpl.h.
- */
WARN_ON_ONCE((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE);
if (sp->role.ad_disabled)
@@ -224,18 +219,26 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
* would tie make_spte() further to vCPU/MMU state, and add complexity
* just to optimize a mode that is anything but performance critical.
*/
- if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
- is_nx_huge_page_enabled(vcpu->kvm)) {
+ if (level > PG_LEVEL_4K && is_nx_huge_page_enabled(vcpu->kvm)) {
pte_access &= ~ACC_EXEC_MASK;
+ if (shadow_xu_mask)
+ pte_access &= ~ACC_USER_EXEC_MASK;
}
- if (pte_access & ACC_EXEC_MASK)
- spte |= shadow_x_mask;
- else
- spte |= shadow_nx_mask;
+ if (pte_access & ACC_READ_MASK)
+ spte |= PT_PRESENT_MASK; /* or VMX_EPT_READABLE_MASK */
- if (pte_access & ACC_USER_MASK)
- spte |= shadow_user_mask;
+ if (shadow_nx_mask) {
+ if (!(pte_access & ACC_EXEC_MASK))
+ spte |= shadow_nx_mask;
+ if (pte_access & ACC_USER_MASK)
+ spte |= shadow_user_mask;
+ } else {
+ if (pte_access & ACC_EXEC_MASK)
+ spte |= shadow_xs_mask;
+ if (pte_access & ACC_USER_EXEC_MASK)
+ spte |= shadow_xu_mask;
+ }
if (level > PG_LEVEL_4K)
spte |= PT_PAGE_SIZE_MASK;
@@ -318,14 +321,18 @@ static u64 modify_spte_protections(u64 spte, u64 set, u64 clear)
return spte;
}
-static u64 make_spte_executable(u64 spte)
+static u64 change_spte_executable(u64 spte, u8 access)
{
- return modify_spte_protections(spte, shadow_x_mask, shadow_nx_mask);
-}
+ u64 set, clear;
-static u64 make_spte_nonexecutable(u64 spte)
-{
- return modify_spte_protections(spte, shadow_nx_mask, shadow_x_mask);
+ if (shadow_nx_mask)
+ set = (access & ACC_EXEC_MASK) ? 0 : shadow_nx_mask;
+ else
+ set =
+ (access & ACC_EXEC_MASK ? shadow_xs_mask : 0) |
+ (access & ACC_USER_EXEC_MASK ? shadow_xu_mask : 0);
+ clear = set ^ (shadow_nx_mask | shadow_xs_mask | shadow_xu_mask);
+ return modify_spte_protections(spte, set, clear);
}
/*
@@ -357,8 +364,8 @@ u64 make_small_spte(struct kvm *kvm, u64 huge_spte,
* the page executable as the NX hugepage mitigation no longer
* applies.
*/
- if ((role.access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(kvm))
- child_spte = make_spte_executable(child_spte);
+ if (is_nx_huge_page_enabled(kvm))
+ child_spte = change_spte_executable(child_spte, role.access);
}
return child_spte;
@@ -380,7 +387,7 @@ u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level)
huge_spte &= KVM_HPAGE_MASK(level) | ~PAGE_MASK;
if (is_nx_huge_page_enabled(kvm))
- huge_spte = make_spte_nonexecutable(huge_spte);
+ huge_spte = change_spte_executable(huge_spte, 0);
return huge_spte;
}
@@ -390,7 +397,8 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
u64 spte = SPTE_MMU_PRESENT_MASK;
spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
- shadow_user_mask | shadow_x_mask | shadow_me_value;
+ PT_PRESENT_MASK /* or VMX_EPT_READABLE_MASK */ |
+ shadow_user_mask | shadow_xs_mask | shadow_xu_mask | shadow_me_value;
if (ad_disabled)
spte |= SPTE_TDP_AD_DISABLED;
@@ -490,20 +498,37 @@ void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask)
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_me_spte_mask);
-void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
+void kvm_mmu_set_ept_masks(bool has_ad_bits)
{
kvm_ad_enabled = has_ad_bits;
- shadow_user_mask = VMX_EPT_READABLE_MASK;
+ shadow_user_mask = 0;
shadow_accessed_mask = VMX_EPT_ACCESS_BIT;
shadow_dirty_mask = VMX_EPT_DIRTY_BIT;
shadow_nx_mask = 0ull;
- shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
- /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */
- shadow_present_mask =
- (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT;
+ shadow_xs_mask = VMX_EPT_EXECUTABLE_MASK;
+
+ /*
+ * The MMU always maps ACC_EXEC_MASK and ACC_USER_EXEC_MASK to the
+ * XS and XU bits of shadow EPT entries, regardless of whether MBEC
+ * is available on the host or enabled in the VMCS.
+ *
+ * For the non-nested case, pages are mapped with ACC_EXEC_MASK
+ * and ACC_USER_EXEC_MASK set in tandem, so XS == XU and the
+ * host's MBEC setting does not matter. On hardware without MBEC
+ * the XU bit is reserved-as-ignored, and setting it does no harm.
+ *
+ * For nested EPT, when MBEC is disabled by L1, correctness relies
+ * on (a) ignoring bit 10 of the gPTE in is_present_gpte(), rather
+ * than treating it as a present bit, and (b) permission_fault()
+ * using an mmu->permissions[] array that effectively ignores
+ * ACC_USER_EXEC_MASK. Bit 10 of the gPTE does end up mirrored
+ * in the sPTEs but is ignored because L2 runs with MBEC disabled.
+ */
+ shadow_xu_mask = VMX_EPT_USER_EXECUTABLE_MASK;
+ shadow_present_mask = VMX_EPT_SUPPRESS_VE_BIT;
- shadow_acc_track_mask = VMX_EPT_RWX_MASK;
+ shadow_acc_track_mask = VMX_EPT_RWX_MASK | VMX_EPT_USER_EXECUTABLE_MASK;
shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE;
@@ -551,7 +576,8 @@ void kvm_mmu_reset_all_pte_masks(void)
shadow_accessed_mask = PT_ACCESSED_MASK;
shadow_dirty_mask = PT_DIRTY_MASK;
shadow_nx_mask = PT64_NX_MASK;
- shadow_x_mask = 0;
+ shadow_xs_mask = 0;
+ shadow_xu_mask = 0;
shadow_present_mask = PT_PRESENT_MASK;
shadow_acc_track_mask = 0;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 8c0ffa2cded69..13eea94dd2128 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -18,9 +18,19 @@
#define SPTE_MMU_PRESENT_MASK BIT_ULL(11)
/*
+ * The ignored high bits are allocated as follows:
+ * - bits 52, 54: saved X-R bits for access tracking when EPT does not have A/D
+ * - bit 53 (EPT only): host writable
+ * - bit 55 (EPT only): MMU-writable
+ * - bits 56-59: unused
+ * - bits 60-61: type of A/D tracking
+ * - bit 62 (EPT only): saved XU bit for access tracking when EPT does not have A/D
+ */
+
+/*
* TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also
* be restricted to using write-protection (for L2 when CPU dirty logging, i.e.
- * PML, is enabled). Use bits 52 and 53 to hold the type of A/D tracking that
+ * PML, is enabled). Use bits 60 and 61 to hold the type of A/D tracking that
* must be employed for a given TDP SPTE.
*
* Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE
@@ -29,7 +39,7 @@
* TDP with CPU dirty logging (PML). If NPT ever gains PML-like support, it
* must be restricted to 64-bit KVM.
*/
-#define SPTE_TDP_AD_SHIFT 52
+#define SPTE_TDP_AD_SHIFT 60
#define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT)
#define SPTE_TDP_AD_ENABLED (0ULL << SPTE_TDP_AD_SHIFT)
#define SPTE_TDP_AD_DISABLED (1ULL << SPTE_TDP_AD_SHIFT)
@@ -42,18 +52,6 @@ static_assert(SPTE_TDP_AD_ENABLED == 0);
#define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#endif
-#define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
- | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
-
-#define ACC_EXEC_MASK 1
-#define ACC_WRITE_MASK PT_WRITABLE_MASK
-#define ACC_USER_MASK PT_USER_MASK
-#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
-
-/* The mask for the R/X bits in EPT PTEs */
-#define SPTE_EPT_READABLE_MASK 0x1ull
-#define SPTE_EPT_EXECUTABLE_MASK 0x4ull
-
#define SPTE_LEVEL_BITS 9
#define SPTE_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS)
#define SPTE_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS)
@@ -66,9 +64,10 @@ static_assert(SPTE_TDP_AD_ENABLED == 0);
* restored only when a write is attempted to the page. This mask obviously
* must not overlap the A/D type mask.
*/
-#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \
- SPTE_EPT_EXECUTABLE_MASK)
-#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54
+#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (VMX_EPT_READABLE_MASK | \
+ VMX_EPT_EXECUTABLE_MASK | \
+ VMX_EPT_USER_EXECUTABLE_MASK)
+#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 52
#define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
@@ -87,8 +86,8 @@ static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
* to not overlap the A/D type mask or the saved access bits of access-tracked
* SPTEs when A/D bits are disabled.
*/
-#define EPT_SPTE_HOST_WRITABLE BIT_ULL(57)
-#define EPT_SPTE_MMU_WRITABLE BIT_ULL(58)
+#define EPT_SPTE_HOST_WRITABLE BIT_ULL(53)
+#define EPT_SPTE_MMU_WRITABLE BIT_ULL(55)
static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK));
static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK));
@@ -99,11 +98,11 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
#undef SHADOW_ACC_TRACK_SAVED_MASK
/*
- * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
+ * Due to limited space in PTEs, the MMIO generation is an 18 bit subset of
* the memslots generation and is derived as follows:
*
- * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10
- * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62
+ * Bits 0-6 of the MMIO generation are propagated to spte bits 3-9
+ * Bits 7-17 of the MMIO generation are propagated to spte bits 52-62
*
* The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
* the MMIO generation number, as doing so would require stealing a bit from
@@ -114,7 +113,7 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
*/
#define MMIO_SPTE_GEN_LOW_START 3
-#define MMIO_SPTE_GEN_LOW_END 10
+#define MMIO_SPTE_GEN_LOW_END 9
#define MMIO_SPTE_GEN_HIGH_START 52
#define MMIO_SPTE_GEN_HIGH_END 62
@@ -136,7 +135,8 @@ static_assert(!(SPTE_MMU_PRESENT_MASK &
* and so they're off-limits for generation; additional checks ensure the mask
* doesn't overlap legal PA bits), and bit 63 (carved out for future usage).
*/
-#define SPTE_MMIO_ALLOWED_MASK (BIT_ULL(63) | GENMASK_ULL(51, 12) | GENMASK_ULL(2, 0))
+#define SPTE_MMIO_ALLOWED_MASK (BIT_ULL(63) | GENMASK_ULL(51, 12) | \
+ BIT_ULL(10) | GENMASK_ULL(2, 0))
static_assert(!(SPTE_MMIO_ALLOWED_MASK &
(SPTE_MMU_PRESENT_MASK | MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));
@@ -144,7 +144,7 @@ static_assert(!(SPTE_MMIO_ALLOWED_MASK &
#define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)
/* remember to adjust the comment above as well if you change these */
-static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
+static_assert(MMIO_SPTE_GEN_LOW_BITS == 7 && MMIO_SPTE_GEN_HIGH_BITS == 11);
#define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0)
#define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
@@ -179,8 +179,9 @@ extern bool __read_mostly kvm_ad_enabled;
extern u64 __read_mostly shadow_host_writable_mask;
extern u64 __read_mostly shadow_mmu_writable_mask;
extern u64 __read_mostly shadow_nx_mask;
-extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
extern u64 __read_mostly shadow_user_mask;
+extern u64 __read_mostly shadow_xs_mask; /* mutual exclusive with nx_mask and user_mask */
+extern u64 __read_mostly shadow_xu_mask; /* mutual exclusive with nx_mask and user_mask */
extern u64 __read_mostly shadow_accessed_mask;
extern u64 __read_mostly shadow_dirty_mask;
extern u64 __read_mostly shadow_mmio_value;
@@ -220,10 +221,11 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
*
* Only used by the TDP MMU.
*/
-#define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL)
+#define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x1a0ULL)
-/* Frozen SPTEs must not be misconstrued as shadow present PTEs. */
-static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK));
+/* Frozen SPTEs must not be misconstrued as shadow or MMU present PTEs. */
+static_assert(!(FROZEN_SPTE & (SPTE_MMU_PRESENT_MASK |
+ VMX_EPT_RWX_MASK | VMX_EPT_USER_EXECUTABLE_MASK)));
static inline bool is_frozen_spte(u64 spte)
{
@@ -357,7 +359,13 @@ static inline bool is_last_spte(u64 pte, int level)
static inline bool is_executable_pte(u64 spte)
{
- return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
+ /*
+ * For now, return true if either the XS or XU bit is set.
+ * This function is only used for fast_page_fault,
+ * which never processes shadow EPT, and regular page
+ * tables always have XS==XU.
+ */
+ return (spte & (shadow_xs_mask | shadow_xu_mask | shadow_nx_mask)) != shadow_nx_mask;
}
static inline kvm_pfn_t spte_to_pfn(u64 pte)
@@ -387,6 +395,8 @@ static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check,
static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check,
u64 pte)
{
+ if (pte & VMX_EPT_USER_EXECUTABLE_MASK)
+ pte |= VMX_EPT_EXECUTABLE_MASK;
return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7b1102d26f9cd..5a2f8ce9a32b8 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1185,9 +1185,9 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
}
if (unlikely(!fault->slot))
- new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
+ new_spte = make_mmio_spte(vcpu, iter->gfn, sp->role.access);
else
- wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
+ wrprot = make_spte(vcpu, sp, fault->slot, sp->role.access, iter->gfn,
fault->pfn, iter->old_spte, fault->prefetch,
false, fault->map_writable, &new_spte);
@@ -1272,7 +1272,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
kvm_mmu_hugepage_adjust(vcpu, fault);
- trace_kvm_mmu_spte_requested(fault);
+ trace_kvm_mmu_spte_requested(fault, root->role.access);
rcu_read_lock();
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index b340dc9991adb..1bf3e4804ad0a 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -93,9 +93,10 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
* when called via KVM_SET_NESTED_STATE, that state may _not_ match current
* vCPU state. CR0.WP is explicitly ignored, while CR0.PG is required.
*/
- kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
+ kvm_init_shadow_npt_mmu(vcpu, svm->vmcb01.ptr->save.cr4,
svm->vmcb01.ptr->save.efer,
- svm->nested.ctl.nested_cr3);
+ svm->nested.ctl.nested_cr3,
+ svm->nested.ctl.misc_ctl);
vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
@@ -498,11 +499,14 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
nested_svm_sanitize_intercept(vcpu, to, SKINIT);
nested_svm_sanitize_intercept(vcpu, to, RDPRU);
- /* Always clear SVM_MISC_ENABLE_NP if the guest cannot use NPTs */
+ /* Always clear misc_ctl bits that the guest cannot use */
to->misc_ctl = from->misc_ctl;
if (!guest_cpu_cap_has(vcpu, X86_FEATURE_NPT))
to->misc_ctl &= ~SVM_MISC_ENABLE_NP;
+ if (!gmet_enabled || !guest_cpu_cap_has(vcpu, X86_FEATURE_GMET))
+ to->misc_ctl &= ~SVM_MISC_ENABLE_GMET;
+
to->iopm_base_pa = from->iopm_base_pa & PAGE_MASK;
to->msrpm_base_pa = from->msrpm_base_pa & PAGE_MASK;
to->tsc_offset = from->tsc_offset;
@@ -866,7 +870,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
* the latter, L1 runs L2 with shadow page tables that translate L2 GVAs
* to L1 GPAs, so the same NPTs can be used for L1 and L2.
*/
- vmcb02->control.misc_ctl = vmcb01->control.misc_ctl & SVM_MISC_ENABLE_NP;
+ vmcb02->control.misc_ctl = vmcb01->control.misc_ctl & (SVM_MISC_ENABLE_NP | SVM_MISC_ENABLE_GMET);
vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP);
@@ -903,9 +907,13 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
/* Also overwritten later if necessary. */
vmcb02->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
- /* nested_cr3. */
- if (nested_npt_enabled(svm))
+ /* Without nNPT, keep the vmcb01 MMU and format; with nNPT, use L1's GMET setting */
+ if (nested_npt_enabled(svm)) {
+ vmcb02->control.misc_ctl &= ~SVM_MISC_ENABLE_GMET;
+ vmcb02->control.misc_ctl |= (svm->nested.ctl.misc_ctl & SVM_MISC_ENABLE_GMET);
+
nested_svm_init_mmu_context(vcpu);
+ }
vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(vcpu->arch.l1_tsc_offset,
vmcb12_ctrl->tsc_offset,
@@ -2056,8 +2064,26 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
return true;
}
+static gpa_t svm_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa,
+ u64 access,
+ struct x86_exception *exception,
+ u64 pte_access)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+
+ BUG_ON(!mmu_is_nested(vcpu));
+
+ /* Non-GMET walks are always user-walks */
+ if (!(svm->nested.ctl.misc_ctl & SVM_MISC_ENABLE_GMET))
+ access |= PFERR_USER_MASK;
+
+ return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
+}
+
struct kvm_x86_nested_ops svm_nested_ops = {
.leave_nested = svm_leave_nested,
+ .translate_nested_gpa = svm_translate_nested_gpa,
.is_exception_vmexit = nested_svm_is_exception_vmexit,
.check_events = svm_check_nested_events,
.triple_fault = nested_svm_triple_fault,
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index e107f368ed2dd..18a7cddb097d2 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -968,7 +968,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
save->r14 = svm->vcpu.arch.regs[VCPU_REGS_R14];
save->r15 = svm->vcpu.arch.regs[VCPU_REGS_R15];
#endif
- save->rip = svm->vcpu.arch.regs[VCPU_REGS_RIP];
+ save->rip = svm->vcpu.arch.rip;
/* Sync some non-GPR registers before encrypting */
save->xcr0 = svm->vcpu.arch.xcr0;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d38a21be099d6..adfa9ff48c573 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -51,6 +51,7 @@
#include "trace.h"
+#include "vmenter.h"
#include "svm.h"
#include "svm_ops.h"
@@ -139,6 +140,9 @@ module_param(pause_filter_count_max, ushort, 0444);
bool __ro_after_init npt_enabled = true;
module_param_named(npt, npt_enabled, bool, 0444);
+bool gmet_enabled = true;
+module_param_named(gmet, gmet_enabled, bool, 0444);
+
/* allow nested virtualization in KVM/SVM */
static int __ro_after_init nested = true;
module_param(nested, int, 0444);
@@ -662,7 +666,7 @@ static void clr_dr_intercepts(struct vcpu_svm *svm)
svm_mark_intercepts_dirty(svm);
}
-static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+static bool msr_write_intercepted(struct vcpu_svm *svm, u32 msr)
{
/*
* For non-nested case:
@@ -673,8 +677,7 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
* If the L02 MSR bitmap does not intercept the MSR, then we need to
* save it.
*/
- void *msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm :
- to_svm(vcpu)->msrpm;
+ void *msrpm = is_guest_mode(&svm->vcpu) ? svm->nested.msrpm : svm->msrpm;
return svm_test_msr_bitmap_write(msrpm, msr);
}
@@ -1221,6 +1224,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event)
save->g_pat = vcpu->arch.pat;
save->cr3 = 0;
}
+
+ if (gmet_enabled)
+ control->misc_ctl |= SVM_MISC_ENABLE_GMET;
+
svm->current_vmcb->asid_generation = 0;
svm->asid = 0;
@@ -1529,7 +1536,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
kvm_register_mark_available(vcpu, reg);
switch (reg) {
- case VCPU_EXREG_PDPTR:
+ case VCPU_REG_PDPTR:
/*
* When !npt_enabled, mmu->pdptrs[] is already available since
* it is always updated per SDM when moving to CRs.
@@ -1998,6 +2005,18 @@ static int npf_interception(struct kvm_vcpu *vcpu)
}
}
+ if (!is_sev_es_guest(vcpu) &&
+ (svm->vmcb->control.misc_ctl & SVM_MISC_ENABLE_GMET) &&
+ (error_code & PFERR_FETCH_MASK)) {
+ /*
+ * Work around erratum 1218: EXITINFO1[2] May Be Incorrectly Set
+ * When GMET (Guest Mode Execute Trap extension) is Enabled
+ */
+ error_code |= PFERR_USER_MASK;
+ if (svm_get_cpl(vcpu) != 3)
+ error_code &= ~PFERR_USER_MASK;
+ }
+
if (is_sev_snp_guest(vcpu) && (error_code & PFERR_GUEST_ENC_MASK))
error_code |= PFERR_PRIVATE_ACCESS;
@@ -2776,7 +2795,7 @@ static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu,
{
return is_sev_es_guest(vcpu) && vcpu->arch.guest_state_protected &&
msr_info->index != MSR_IA32_XSS &&
- !msr_write_intercepted(vcpu, msr_info->index);
+ !msr_write_intercepted(to_svm(vcpu), msr_info->index);
}
static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -4191,7 +4210,7 @@ static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
static void svm_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
- kvm_register_mark_dirty(vcpu, VCPU_EXREG_ERAPS);
+ kvm_register_mark_dirty(vcpu, VCPU_REG_ERAPS);
svm_flush_tlb_asid(vcpu);
}
@@ -4390,7 +4409,7 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
return EXIT_FASTPATH_NONE;
}
-static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, unsigned enter_flags)
{
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
struct vcpu_svm *svm = to_svm(vcpu);
@@ -4412,10 +4431,10 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
amd_clear_divider();
if (is_sev_es_guest(vcpu))
- __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted,
+ __svm_sev_es_vcpu_run(svm, enter_flags,
sev_es_host_save_area(sd));
else
- __svm_vcpu_run(svm, spec_ctrl_intercepted);
+ __svm_vcpu_run(svm, enter_flags);
raw_local_irq_disable();
@@ -4426,13 +4445,16 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
struct vcpu_svm *svm = to_svm(vcpu);
- bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
+ unsigned enter_flags = 0;
+
+ if (!msr_write_intercepted(svm, MSR_IA32_SPEC_CTRL))
+ enter_flags |= KVM_ENTER_SAVE_SPEC_CTRL;
trace_kvm_entry(vcpu, force_immediate_exit);
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
- svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+ svm->vmcb->save.rip = vcpu->arch.rip;
/*
* Disable singlestep if we're injecting an interrupt/exception.
@@ -4469,7 +4491,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
svm->vmcb->save.cr2 = vcpu->arch.cr2;
if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS) &&
- kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS))
+ kvm_register_is_dirty(vcpu, VCPU_REG_ERAPS))
svm->vmcb->control.erap_ctl |= ERAP_CONTROL_CLEAR_RAP;
svm_fixup_nested_rips(vcpu);
@@ -4509,7 +4531,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
x86_spec_ctrl_set_guest(svm->virt_spec_ctrl);
- svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
+ svm_vcpu_enter_exit(vcpu, enter_flags);
if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);
@@ -4518,9 +4540,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
vcpu->arch.cr2 = svm->vmcb->save.cr2;
vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
- vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
+ vcpu->arch.rip = svm->vmcb->save.rip;
}
- vcpu->arch.regs_dirty = 0;
+ kvm_reset_dirty_registers(vcpu);
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
@@ -4566,9 +4588,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
vcpu->arch.apf.host_apf_flags =
kvm_read_and_reset_apf_flags();
- vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
+ kvm_clear_available_registers(vcpu, SVM_REGS_LAZY_LOAD_SET);
- if (!msr_write_intercepted(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL))
+ if (!msr_write_intercepted(svm, MSR_AMD64_PERF_CNTR_GLOBAL_CTL))
rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, vcpu_to_pmu(vcpu)->global_ctrl);
trace_kvm_exit(vcpu, KVM_ISA_SVM);
@@ -4624,6 +4646,11 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xd9;
}
+static bool svm_tdp_has_smep(struct kvm *kvm)
+{
+ return gmet_enabled;
+}
+
/*
* The kvm parameter can be NULL (module initialization, or invocation before
* VM creation). Be sure to check the kvm parameter before using it.
@@ -4958,7 +4985,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
- svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+ svm->vmcb->save.rip = vcpu->arch.rip;
nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
@@ -5367,6 +5394,7 @@ struct kvm_x86_ops svm_x86_ops __initdata = {
.write_tsc_multiplier = svm_write_tsc_multiplier,
.load_mmu_pgd = svm_load_mmu_pgd,
+ .tdp_has_smep = svm_tdp_has_smep,
.check_intercept = svm_check_intercept,
.handle_exit_irqoff = svm_handle_exit_irqoff,
@@ -5491,6 +5519,9 @@ static __init void svm_set_cpu_caps(void)
if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
+ if (gmet_enabled)
+ kvm_cpu_cap_set(X86_FEATURE_GMET);
+
if (vgif)
kvm_cpu_cap_set(X86_FEATURE_VGIF);
@@ -5600,6 +5631,9 @@ static __init int svm_hardware_setup(void)
if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
+ if (!npt_enabled || !boot_cpu_has(X86_FEATURE_GMET))
+ gmet_enabled = false;
+
/* Force VM NPT level equal to the host's paging level */
kvm_configure_mmu(npt_enabled, get_npt_level(),
get_npt_level(), PG_LEVEL_1G);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index a10668d17a16a..19b80ef56e2b7 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -44,6 +44,7 @@ static inline struct page *__sme_pa_to_page(unsigned long pa)
#define IOPM_SIZE PAGE_SIZE * 3
#define MSRPM_SIZE PAGE_SIZE * 2
+extern bool gmet_enabled;
extern bool npt_enabled;
extern int nrips;
extern int vgif;
@@ -484,7 +485,7 @@ static inline bool svm_is_vmrun_failure(u64 exit_code)
* KVM_REQ_LOAD_MMU_PGD is always requested when the cached vcpu->arch.cr3
* is changed. svm_load_mmu_pgd() then syncs the new CR3 value into the VMCB.
*/
-#define SVM_REGS_LAZY_LOAD_SET (1 << VCPU_EXREG_PDPTR)
+#define SVM_REGS_LAZY_LOAD_SET (BIT(VCPU_REG_PDPTR))
static inline void __vmcb_set_intercept(unsigned long *intercepts, u32 bit)
{
@@ -1007,9 +1008,9 @@ static inline void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_sa
/* vmenter.S */
-void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted,
+void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, unsigned int flags,
struct sev_es_save_area *hostsa);
-void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
+void __svm_vcpu_run(struct vcpu_svm *svm, unsigned int flags);
#define DEFINE_KVM_GHCB_ACCESSORS(field) \
static __always_inline u64 kvm_ghcb_get_##field(struct vcpu_svm *svm) \
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index d47c5c93c9913..f523d9e498398 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -7,6 +7,7 @@
#include <asm/kvm_vcpu_regs.h>
#include <asm/nospec-branch.h>
#include "kvm-asm-offsets.h"
+#include "vmenter.h"
#define WORD_SIZE (BITS_PER_LONG / 8)
@@ -39,38 +40,6 @@
ALTERNATIVE_2 "", \
"jmp 800f", X86_FEATURE_MSR_SPEC_CTRL, \
"", X86_FEATURE_V_SPEC_CTRL
-801:
-.endm
-.macro RESTORE_GUEST_SPEC_CTRL_BODY
-800:
- /*
- * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
- * host's, write the MSR. This is kept out-of-line so that the common
- * case does not have to jump.
- *
- * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
- * there must not be any returns or indirect branches between this code
- * and vmentry.
- */
-#ifdef CONFIG_X86_64
- mov SVM_spec_ctrl(%rdi), %rdx
- cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx
- je 801b
- movl %edx, %eax
- shr $32, %rdx
-#else
- mov SVM_spec_ctrl(%edi), %eax
- mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx
- xor %eax, %ecx
- mov SVM_spec_ctrl + 4(%edi), %edx
- mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %esi
- xor %edx, %esi
- or %esi, %ecx
- je 801b
-#endif
- mov $MSR_IA32_SPEC_CTRL, %ecx
- wrmsr
- jmp 801b
.endm
.macro RESTORE_HOST_SPEC_CTRL
@@ -78,42 +47,6 @@
ALTERNATIVE_2 "", \
"jmp 900f", X86_FEATURE_MSR_SPEC_CTRL, \
"", X86_FEATURE_V_SPEC_CTRL
-901:
-.endm
-.macro RESTORE_HOST_SPEC_CTRL_BODY spec_ctrl_intercepted:req
-900:
- /* Same for after vmexit. */
- mov $MSR_IA32_SPEC_CTRL, %ecx
-
- /*
- * Load the value that the guest had written into MSR_IA32_SPEC_CTRL,
- * if it was not intercepted during guest execution.
- */
- cmpb $0, \spec_ctrl_intercepted
- jnz 998f
- rdmsr
- movl %eax, SVM_spec_ctrl(%_ASM_DI)
- movl %edx, SVM_spec_ctrl + 4(%_ASM_DI)
-998:
- /* Now restore the host value of the MSR if different from the guest's. */
-#ifdef CONFIG_X86_64
- mov PER_CPU_VAR(x86_spec_ctrl_current), %rdx
- cmp SVM_spec_ctrl(%rdi), %rdx
- je 901b
- movl %edx, %eax
- shr $32, %rdx
-#else
- mov PER_CPU_VAR(x86_spec_ctrl_current), %eax
- mov SVM_spec_ctrl(%edi), %esi
- xor %eax, %esi
- mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edx
- mov SVM_spec_ctrl + 4(%edi), %edi
- xor %edx, %edi
- or %edi, %esi
- je 901b
-#endif
- wrmsr
- jmp 901b
.endm
#define SVM_CLEAR_CPU_BUFFERS \
@@ -121,8 +54,8 @@
/**
* __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
- * @svm: struct vcpu_svm *
- * @spec_ctrl_intercepted: bool
+ * @svm: struct vcpu_svm *
+ * @enter_flags: u32
*/
SYM_FUNC_START(__svm_vcpu_run)
push %_ASM_BP
@@ -162,6 +95,7 @@ SYM_FUNC_START(__svm_vcpu_run)
/* Clobbers RAX, RCX, RDX (and ESI on 32-bit), consumes RDI (@svm). */
RESTORE_GUEST_SPEC_CTRL
+801:
/*
* Use a single vmcb (vmcb01 because it's always valid) for
@@ -242,6 +176,7 @@ SYM_FUNC_START(__svm_vcpu_run)
* and RSP (pointer to @enter_flags).
*/
RESTORE_HOST_SPEC_CTRL
+901:
/*
* Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
@@ -278,7 +213,7 @@ SYM_FUNC_START(__svm_vcpu_run)
xor %r15d, %r15d
#endif
- /* "Pop" @spec_ctrl_intercepted. */
+ /* "Pop" @enter_flags. */
pop %_ASM_BX
pop %_ASM_BX
@@ -295,8 +230,12 @@ SYM_FUNC_START(__svm_vcpu_run)
pop %_ASM_BP
RET
- RESTORE_GUEST_SPEC_CTRL_BODY
- RESTORE_HOST_SPEC_CTRL_BODY (%_ASM_SP)
+800:
+ RESTORE_GUEST_SPEC_CTRL_BODY SVM_spec_ctrl(%_ASM_DI), 801b
+ jmp 801b
+900:
+ RESTORE_HOST_SPEC_CTRL_BODY SVM_spec_ctrl(%_ASM_DI), (%_ASM_SP), 901b
+ jmp 901b
10: cmpb $0, _ASM_RIP(virt_rebooting)
jne 2b
@@ -335,8 +274,8 @@ SYM_FUNC_END(__svm_vcpu_run)
/**
* __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode
- * @svm: struct vcpu_svm *
- * @spec_ctrl_intercepted: bool
+ * @svm: struct vcpu_svm *
+ * @enter_flags: u32
*/
SYM_FUNC_START(__svm_sev_es_vcpu_run)
FRAME_BEGIN
@@ -355,13 +294,14 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
/*
* Save volatile registers that hold arguments that are needed after
- * #VMEXIT (RDI=@svm and RSI=@spec_ctrl_intercepted).
+ * #VMEXIT (RDI=@svm and RSI=@enter_flags).
*/
mov %rdi, SEV_ES_RDI (%rdx)
mov %rsi, SEV_ES_RSI (%rdx)
/* Clobbers RAX, RCX, and RDX (@hostsa), consumes RDI (@svm). */
RESTORE_GUEST_SPEC_CTRL
+801:
/* Get svm->current_vmcb->pa into RAX. */
mov SVM_current_vmcb(%rdi), %rax
@@ -376,8 +316,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT
- /* Clobbers RAX, RCX, RDX, consumes RDI (@svm) and RSI (@spec_ctrl_intercepted). */
+ /* Clobbers RAX, RCX, RDX, consumes RDI (@svm) and RSI (@enter_flags). */
RESTORE_HOST_SPEC_CTRL
+901:
/*
* Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
@@ -391,8 +332,12 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
FRAME_END
RET
- RESTORE_GUEST_SPEC_CTRL_BODY
- RESTORE_HOST_SPEC_CTRL_BODY %sil
+800:
+ RESTORE_GUEST_SPEC_CTRL_BODY SVM_spec_ctrl(%_ASM_DI), 801b
+ jmp 801b
+900:
+ RESTORE_HOST_SPEC_CTRL_BODY SVM_spec_ctrl(%_ASM_DI), %esi, 901b
+ jmp 901b
3: cmpb $0, virt_rebooting(%rip)
jne 2b
diff --git a/arch/x86/kvm/vmenter.h b/arch/x86/kvm/vmenter.h
new file mode 100644
index 0000000000000..ba3f71449c62c
--- /dev/null
+++ b/arch/x86/kvm/vmenter.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMENTER_H
+#define __KVM_X86_VMENTER_H
+
+#define KVM_ENTER_VMRESUME BIT(0)
+#define KVM_ENTER_SAVE_SPEC_CTRL BIT(1)
+#define KVM_ENTER_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(2)
+
+#ifdef __ASSEMBLER__
+.macro RESTORE_GUEST_SPEC_CTRL_BODY guest_spec_ctrl:req, label:req
+ /*
+ * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
+ * host's, write the MSR. This is kept out-of-line so that the common
+ * case does not have to jump.
+ *
+ * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
+ * there must not be any returns or indirect branches between this code
+ * and vmentry.
+ */
+#ifdef CONFIG_X86_64
+ mov \guest_spec_ctrl, %rdx
+ cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx
+ je \label
+ movl %edx, %eax
+ shr $32, %rdx
+#else
+ mov \guest_spec_ctrl, %eax
+ mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx
+ xor %eax, %ecx
+ mov 4 + \guest_spec_ctrl, %edx
+ mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %esi
+ xor %edx, %esi
+ or %esi, %ecx
+ je \label
+#endif
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+ wrmsr
+.endm
+
+.macro RESTORE_HOST_SPEC_CTRL_BODY guest_spec_ctrl:req, enter_flags:req, label:req
+ /* Same for after vmexit. */
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+
+ /*
+ * Load the value that the guest had written into MSR_IA32_SPEC_CTRL,
+ * if it was not intercepted during guest execution.
+ */
+ testl $KVM_ENTER_SAVE_SPEC_CTRL, \enter_flags
+ jz 998f
+ rdmsr
+ movl %eax, \guest_spec_ctrl
+ movl %edx, 4 + \guest_spec_ctrl
+998:
+ /* Now restore the host value of the MSR if different from the guest's. */
+#ifdef CONFIG_X86_64
+ mov PER_CPU_VAR(x86_spec_ctrl_current), %rdx
+ cmp \guest_spec_ctrl, %rdx
+ /*
+ * For legacy IBRS, the IBRS bit always needs to be written after
+ * transitioning from a less privileged predictor mode, regardless of
+ * whether the guest/host values differ.
+ */
+ ALTERNATIVE __stringify(je \label), "", X86_FEATURE_KERNEL_IBRS
+ movl %edx, %eax
+ shr $32, %rdx
+#else
+ mov PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ mov \guest_spec_ctrl, %esi
+ xor %eax, %esi
+ mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edx
+ mov 4 + \guest_spec_ctrl, %edi
+ xor %edx, %edi
+ or %edi, %esi
+ ALTERNATIVE __stringify(je \label), "", X86_FEATURE_KERNEL_IBRS
+#endif
+ wrmsr
+.endm
+
+#endif /* __ASSEMBLER__ */
+#endif /* __KVM_X86_VMENTER_H */
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 31568274d8bb0..810119167f798 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -16,6 +16,7 @@ extern bool __read_mostly enable_unrestricted_guest;
extern bool __read_mostly enable_ept_ad_bits;
extern bool __read_mostly enable_cet;
extern bool __read_mostly enable_pml;
+extern bool __read_mostly enable_mbec;
extern int __read_mostly pt_mode;
#define PT_MODE_SYSTEM 0
@@ -301,11 +302,6 @@ static inline bool cpu_has_vmx_flexpriority(void)
cpu_has_vmx_virtualize_apic_accesses();
}
-static inline bool cpu_has_vmx_ept_execute_only(void)
-{
- return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
-}
-
static inline bool cpu_has_vmx_ept_4levels(void)
{
return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
@@ -412,4 +408,10 @@ static inline bool cpu_has_notify_vmexit(void)
SECONDARY_EXEC_NOTIFY_VM_EXITING;
}
+static inline bool cpu_has_ept_mbec(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
+}
+
#endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
index 412d0829d7a21..08005676702c2 100644
--- a/arch/x86/kvm/vmx/common.h
+++ b/arch/x86/kvm/vmx/common.h
@@ -85,22 +85,30 @@ static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa,
{
u64 error_code;
- /* Is it a read fault? */
- error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
- ? PFERR_USER_MASK : 0;
/* Is it a write fault? */
- error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
+ error_code = (exit_qualification & EPT_VIOLATION_ACC_WRITE)
? PFERR_WRITE_MASK : 0;
/* Is it a fetch fault? */
error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
? PFERR_FETCH_MASK : 0;
- /* ept page table entry is present? */
- error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK)
+ /* Is the EPT page table entry present? */
+ error_code |= (exit_qualification &
+ (EPT_VIOLATION_PROT_MASK & ~EPT_VIOLATION_PROT_USER_EXEC))
? PFERR_PRESENT_MASK : 0;
- if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID)
- error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
- PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
+ if (mmu_has_mbec(vcpu->arch.mmu))
+ error_code |= (exit_qualification & EPT_VIOLATION_PROT_USER_EXEC)
+ ? PFERR_PRESENT_MASK : 0;
+
+ if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID) {
+ if (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) {
+ error_code |= PFERR_GUEST_FINAL_MASK;
+ if (exit_qualification & EPT_VIOLATION_GVA_USER)
+ error_code |= PFERR_USER_MASK;
+ } else {
+ error_code |= PFERR_GUEST_PAGE_MASK;
+ }
+ }
if (vt_is_tdx_private_gpa(vcpu->kvm, gpa))
error_code |= PFERR_PRIVATE_ACCESS;
diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.h b/arch/x86/kvm/vmx/hyperv_evmcs.h
index fc7c4e7bd1bfb..bc08fe40590e9 100644
--- a/arch/x86/kvm/vmx/hyperv_evmcs.h
+++ b/arch/x86/kvm/vmx/hyperv_evmcs.h
@@ -87,6 +87,7 @@
SECONDARY_EXEC_PT_CONCEAL_VMX | \
SECONDARY_EXEC_BUS_LOCK_DETECTION | \
SECONDARY_EXEC_NOTIFY_VM_EXITING | \
+ SECONDARY_EXEC_MODE_BASED_EPT_EXEC | \
SECONDARY_EXEC_ENCLS_EXITING)
#define EVMCS1_SUPPORTED_3RDEXEC (0ULL)
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index dbebddf648be7..83d9921277eac 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -755,6 +755,14 @@ static int vt_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
return vmx_set_identity_map_addr(kvm, ident_addr);
}
+static bool vt_tdp_has_smep(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ return false;
+
+ return vmx_tdp_has_smep(kvm);
+}
+
static u64 vt_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
{
/* TDX doesn't support L2 guest at the moment. */
@@ -966,6 +974,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.set_tss_addr = vt_op(set_tss_addr),
.set_identity_map_addr = vt_op(set_identity_map_addr),
.get_mt_mask = vmx_get_mt_mask,
+ .tdp_has_smep = vt_op(tdp_has_smep),
.get_exit_info = vt_op(get_exit_info),
.get_entry_info = vt_op(get_entry_info),
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 3fe88f29be7a9..4690a4d23709d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -310,13 +310,13 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
vmx_sync_vmcs_host_state(vmx, prev);
put_cpu();
- vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;
+ kvm_clear_available_registers(vcpu, VMX_REGS_LAZY_LOAD_SET);
/*
* All lazily updated registers will be reloaded from VMCS12 on both
* vmentry and vmexit.
*/
- vcpu->arch.regs_dirty = 0;
+ kvm_reset_dirty_registers(vcpu);
}
static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu)
@@ -443,10 +443,14 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
exit_qualification = 0;
} else {
+ u64 mask = EPT_VIOLATION_GVA_IS_VALID |
+ EPT_VIOLATION_GVA_TRANSLATED;
+ if (vmx->nested.msrs.ept_caps & VMX_EPT_ADVANCED_VMEXIT_INFO_BIT)
+ mask |= EPT_VIOLATION_GVA_USER |
+ EPT_VIOLATION_GVA_WRITABLE |
+ EPT_VIOLATION_GVA_NX;
exit_qualification = fault->exit_qualification;
- exit_qualification |= vmx_get_exit_qual(vcpu) &
- (EPT_VIOLATION_GVA_IS_VALID |
- EPT_VIOLATION_GVA_TRANSLATED);
+ exit_qualification |= vmx_get_exit_qual(vcpu) & mask;
vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
}
@@ -465,6 +469,13 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
vmcs12->guest_physical_address = fault->address;
}
+static inline bool nested_ept_mbec_enabled(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC);
+}
+
static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -473,6 +484,7 @@ static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
nested_ept_ad_enabled(vcpu),
+ nested_ept_mbec_enabled(vcpu),
nested_ept_get_eptp(vcpu));
}
@@ -1189,7 +1201,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
}
vcpu->arch.cr3 = cr3;
- kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+ kvm_register_mark_dirty(vcpu, VCPU_REG_CR3);
/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
kvm_init_mmu(vcpu);
@@ -2440,6 +2452,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_ENABLE_VMFUNC |
+ SECONDARY_EXEC_MODE_BASED_EPT_EXEC |
SECONDARY_EXEC_DESC);
if (nested_cpu_has(vmcs12,
@@ -4972,7 +4985,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
nested_ept_uninit_mmu_context(vcpu);
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
- kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
+ kvm_register_mark_available(vcpu, VCPU_REG_CR3);
/*
* Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
@@ -5074,7 +5087,7 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
kvm_service_local_tlb_flush_requests(vcpu);
/*
- * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
+ * VCPU_REG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
* now and the new vmentry. Ensure that the VMCS02 PDPTR fields are
* up-to-date before switching to L1.
*/
@@ -7239,7 +7252,8 @@ static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
VMX_EPT_PAGE_WALK_5_BIT |
VMX_EPTP_WB_BIT |
VMX_EPT_INVEPT_BIT |
- VMX_EPT_EXECUTE_ONLY_BIT;
+ VMX_EPT_EXECUTE_ONLY_BIT |
+ VMX_EPT_ADVANCED_VMEXIT_INFO_BIT;
msrs->ept_caps &= ept_caps;
msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
@@ -7251,6 +7265,9 @@ static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
msrs->ept_caps |= VMX_EPT_AD_BIT;
}
+ if (enable_mbec)
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
/*
* Advertise EPTP switching irrespective of hardware support,
* KVM emulates it in software so long as VMFUNC is supported.
@@ -7438,8 +7455,29 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
return 0;
}
+static gpa_t vmx_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa,
+ u64 access,
+ struct x86_exception *exception,
+ u64 pte_access)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+
+ BUG_ON(!mmu_is_nested(vcpu));
+
+ /*
+ * MBEC differentiates based on the effective U/S bit of
+ * the guest page tables, not the processor CPL.
+ */
+ access &= ~PFERR_USER_MASK;
+ if ((pte_access & ACC_USER_MASK) && (access & PFERR_GUEST_FINAL_MASK))
+ access |= PFERR_USER_MASK;
+
+ return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
+}
+
struct kvm_x86_nested_ops vmx_nested_ops = {
.leave_nested = vmx_leave_nested,
+ .translate_nested_gpa = vmx_translate_nested_gpa,
.is_exception_vmexit = nested_vmx_is_exception_vmexit,
.check_events = vmx_check_nested_events,
.has_events = vmx_has_nested_events,
diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h
deleted file mode 100644
index 6a87a12135fb5..0000000000000
--- a/arch/x86/kvm/vmx/run_flags.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __KVM_X86_VMX_RUN_FLAGS_H
-#define __KVM_X86_VMX_RUN_FLAGS_H
-
-#define VMX_RUN_VMRESUME BIT(0)
-#define VMX_RUN_SAVE_SPEC_CTRL BIT(1)
-#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(2)
-
-#endif /* __KVM_X86_VMX_RUN_FLAGS_H */
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index ed12805bbb444..b187ef9a6ae46 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1003,23 +1003,23 @@ static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
return EXIT_FASTPATH_NONE;
}
-#define TDX_REGS_AVAIL_SET (BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
- BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
- BIT_ULL(VCPU_REGS_RAX) | \
- BIT_ULL(VCPU_REGS_RBX) | \
- BIT_ULL(VCPU_REGS_RCX) | \
- BIT_ULL(VCPU_REGS_RDX) | \
- BIT_ULL(VCPU_REGS_RBP) | \
- BIT_ULL(VCPU_REGS_RSI) | \
- BIT_ULL(VCPU_REGS_RDI) | \
- BIT_ULL(VCPU_REGS_R8) | \
- BIT_ULL(VCPU_REGS_R9) | \
- BIT_ULL(VCPU_REGS_R10) | \
- BIT_ULL(VCPU_REGS_R11) | \
- BIT_ULL(VCPU_REGS_R12) | \
- BIT_ULL(VCPU_REGS_R13) | \
- BIT_ULL(VCPU_REGS_R14) | \
- BIT_ULL(VCPU_REGS_R15))
+#define TDX_REGS_AVAIL_SET (BIT(VCPU_REG_EXIT_INFO_1) | \
+ BIT(VCPU_REG_EXIT_INFO_2) | \
+ BIT(VCPU_REGS_RAX) | \
+ BIT(VCPU_REGS_RBX) | \
+ BIT(VCPU_REGS_RCX) | \
+ BIT(VCPU_REGS_RDX) | \
+ BIT(VCPU_REGS_RBP) | \
+ BIT(VCPU_REGS_RSI) | \
+ BIT(VCPU_REGS_RDI) | \
+ BIT(VCPU_REGS_R8) | \
+ BIT(VCPU_REGS_R9) | \
+ BIT(VCPU_REGS_R10) | \
+ BIT(VCPU_REGS_R11) | \
+ BIT(VCPU_REGS_R12) | \
+ BIT(VCPU_REGS_R13) | \
+ BIT(VCPU_REGS_R14) | \
+ BIT(VCPU_REGS_R15))
static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
@@ -1088,7 +1088,7 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
tdx_load_host_xsave_state(vcpu);
- vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
+ kvm_clear_available_registers(vcpu, ~TDX_REGS_AVAIL_SET);
if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
return EXIT_FASTPATH_NONE;
@@ -1835,7 +1835,7 @@ static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcp
if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
return false;
- return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
+ return !(eq & EPT_VIOLATION_PROT_MASK);
}
static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index ff1f254a0ef4e..196a1f481881c 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -7,28 +7,28 @@
#include <asm/percpu.h>
#include <asm/segment.h>
#include "kvm-asm-offsets.h"
-#include "run_flags.h"
+#include "vmenter.h"
#define WORD_SIZE (BITS_PER_LONG / 8)
-#define VCPU_RAX __VCPU_REGS_RAX * WORD_SIZE
-#define VCPU_RCX __VCPU_REGS_RCX * WORD_SIZE
-#define VCPU_RDX __VCPU_REGS_RDX * WORD_SIZE
-#define VCPU_RBX __VCPU_REGS_RBX * WORD_SIZE
+#define VCPU_RAX (VMX_vcpu_arch_regs + __VCPU_REGS_RAX * WORD_SIZE)
+#define VCPU_RCX (VMX_vcpu_arch_regs + __VCPU_REGS_RCX * WORD_SIZE)
+#define VCPU_RDX (VMX_vcpu_arch_regs + __VCPU_REGS_RDX * WORD_SIZE)
+#define VCPU_RBX (VMX_vcpu_arch_regs + __VCPU_REGS_RBX * WORD_SIZE)
/* Intentionally omit RSP as it's context switched by hardware */
-#define VCPU_RBP __VCPU_REGS_RBP * WORD_SIZE
-#define VCPU_RSI __VCPU_REGS_RSI * WORD_SIZE
-#define VCPU_RDI __VCPU_REGS_RDI * WORD_SIZE
+#define VCPU_RBP (VMX_vcpu_arch_regs + __VCPU_REGS_RBP * WORD_SIZE)
+#define VCPU_RSI (VMX_vcpu_arch_regs + __VCPU_REGS_RSI * WORD_SIZE)
+#define VCPU_RDI (VMX_vcpu_arch_regs + __VCPU_REGS_RDI * WORD_SIZE)
#ifdef CONFIG_X86_64
-#define VCPU_R8 __VCPU_REGS_R8 * WORD_SIZE
-#define VCPU_R9 __VCPU_REGS_R9 * WORD_SIZE
-#define VCPU_R10 __VCPU_REGS_R10 * WORD_SIZE
-#define VCPU_R11 __VCPU_REGS_R11 * WORD_SIZE
-#define VCPU_R12 __VCPU_REGS_R12 * WORD_SIZE
-#define VCPU_R13 __VCPU_REGS_R13 * WORD_SIZE
-#define VCPU_R14 __VCPU_REGS_R14 * WORD_SIZE
-#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE
+#define VCPU_R8 (VMX_vcpu_arch_regs + __VCPU_REGS_R8 * WORD_SIZE)
+#define VCPU_R9 (VMX_vcpu_arch_regs + __VCPU_REGS_R9 * WORD_SIZE)
+#define VCPU_R10 (VMX_vcpu_arch_regs + __VCPU_REGS_R10 * WORD_SIZE)
+#define VCPU_R11 (VMX_vcpu_arch_regs + __VCPU_REGS_R11 * WORD_SIZE)
+#define VCPU_R12 (VMX_vcpu_arch_regs + __VCPU_REGS_R12 * WORD_SIZE)
+#define VCPU_R13 (VMX_vcpu_arch_regs + __VCPU_REGS_R13 * WORD_SIZE)
+#define VCPU_R14 (VMX_vcpu_arch_regs + __VCPU_REGS_R14 * WORD_SIZE)
+#define VCPU_R15 (VMX_vcpu_arch_regs + __VCPU_REGS_R15 * WORD_SIZE)
#endif
.section .noinstr.text, "ax"
@@ -36,10 +36,9 @@
/**
* __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode
* @vmx: struct vcpu_vmx *
- * @regs: unsigned long * (to guest registers)
- * @flags: VMX_RUN_VMRESUME: use VMRESUME instead of VMLAUNCH
- * VMX_RUN_SAVE_SPEC_CTRL: save guest SPEC_CTRL into vmx->spec_ctrl
- * VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO: vCPU can access host MMIO
+ * @flags: KVM_ENTER_VMRESUME: use VMRESUME instead of VMLAUNCH
+ * KVM_ENTER_SAVE_SPEC_CTRL: save guest SPEC_CTRL into vmx->spec_ctrl
+ * KVM_ENTER_CLEAR_CPU_BUFFERS_FOR_MMIO: vCPU can access host MMIO
*
* Returns:
* 0 on VM-Exit, 1 on VM-Fail
@@ -62,76 +61,46 @@ SYM_FUNC_START(__vmx_vcpu_run)
push %_ASM_ARG1
/* Save @flags (used for VMLAUNCH vs. VMRESUME and mitigations). */
- push %_ASM_ARG3
-
- /*
- * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and
- * @regs is needed after VM-Exit to save the guest's register values.
- */
push %_ASM_ARG2
lea (%_ASM_SP), %_ASM_ARG2
call vmx_update_host_rsp
- ALTERNATIVE "jmp .Lspec_ctrl_done", "", X86_FEATURE_MSR_SPEC_CTRL
+ /* Reload @vmx, _ASM_ARG1 may be modified by vmx_update_host_rsp(). */
+ mov WORD_SIZE(%_ASM_SP), %_ASM_DI
/*
- * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
- * host's, write the MSR.
- *
- * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
- * there must not be any returns or indirect branches between this code
- * and vmentry.
+ * Unlike AMD, there's no V_SPEC_CTRL here, so do not leave the body
+ * out of line. Clobbers RAX, RCX, RDX (and ESI on 32-bit).
*/
- mov 2*WORD_SIZE(%_ASM_SP), %_ASM_DI
-#ifdef CONFIG_X86_64
- mov VMX_spec_ctrl(%rdi), %rdx
- cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx
- je .Lspec_ctrl_done
- movl %edx, %eax
- shr $32, %rdx
-#else
- mov VMX_spec_ctrl(%edi), %eax
- mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx
- xor %eax, %ecx
- mov VMX_spec_ctrl + 4(%edi), %edx
- mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edi
- xor %edx, %edi
- or %edi, %ecx
- je .Lspec_ctrl_done
-#endif
- mov $MSR_IA32_SPEC_CTRL, %ecx
- wrmsr
-
-.Lspec_ctrl_done:
+ ALTERNATIVE "jmp .Lspec_ctrl_guest_done", "", X86_FEATURE_MSR_SPEC_CTRL
+ RESTORE_GUEST_SPEC_CTRL_BODY VMX_spec_ctrl(%_ASM_DI), .Lspec_ctrl_guest_done
+.Lspec_ctrl_guest_done:
/*
* Since vmentry is serializing on affected CPUs, there's no need for
* an LFENCE to stop speculation from skipping the wrmsr.
*/
- /* Load @regs to RAX. */
- mov (%_ASM_SP), %_ASM_AX
-
/* Load guest registers. Don't clobber flags. */
- mov VCPU_RCX(%_ASM_AX), %_ASM_CX
- mov VCPU_RDX(%_ASM_AX), %_ASM_DX
- mov VCPU_RBX(%_ASM_AX), %_ASM_BX
- mov VCPU_RBP(%_ASM_AX), %_ASM_BP
- mov VCPU_RSI(%_ASM_AX), %_ASM_SI
- mov VCPU_RDI(%_ASM_AX), %_ASM_DI
+ mov VCPU_RAX(%_ASM_DI), %_ASM_AX
+ mov VCPU_RCX(%_ASM_DI), %_ASM_CX
+ mov VCPU_RDX(%_ASM_DI), %_ASM_DX
+ mov VCPU_RBX(%_ASM_DI), %_ASM_BX
+ mov VCPU_RBP(%_ASM_DI), %_ASM_BP
+ mov VCPU_RSI(%_ASM_DI), %_ASM_SI
#ifdef CONFIG_X86_64
- mov VCPU_R8 (%_ASM_AX), %r8
- mov VCPU_R9 (%_ASM_AX), %r9
- mov VCPU_R10(%_ASM_AX), %r10
- mov VCPU_R11(%_ASM_AX), %r11
- mov VCPU_R12(%_ASM_AX), %r12
- mov VCPU_R13(%_ASM_AX), %r13
- mov VCPU_R14(%_ASM_AX), %r14
- mov VCPU_R15(%_ASM_AX), %r15
+ mov VCPU_R8 (%_ASM_DI), %r8
+ mov VCPU_R9 (%_ASM_DI), %r9
+ mov VCPU_R10(%_ASM_DI), %r10
+ mov VCPU_R11(%_ASM_DI), %r11
+ mov VCPU_R12(%_ASM_DI), %r12
+ mov VCPU_R13(%_ASM_DI), %r13
+ mov VCPU_R14(%_ASM_DI), %r14
+ mov VCPU_R15(%_ASM_DI), %r15
#endif
- /* Load guest RAX. This kills the @regs pointer! */
- mov VCPU_RAX(%_ASM_AX), %_ASM_AX
+ /* Load guest RDI. This kills the @vmx pointer! */
+ mov VCPU_RDI(%_ASM_DI), %_ASM_DI
/*
* Note, ALTERNATIVE_2 works in reverse order. If CLEAR_CPU_BUF_VM is
@@ -140,7 +109,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
* do VERW. Else, do nothing (no mitigations needed/enabled).
*/
ALTERNATIVE_2 "", \
- __stringify(testl $VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO, WORD_SIZE(%_ASM_SP); \
+ __stringify(testl $KVM_ENTER_CLEAR_CPU_BUFFERS_FOR_MMIO, (%_ASM_SP); \
jz .Lskip_mmio_verw; \
VERW; \
.Lskip_mmio_verw:), \
@@ -148,7 +117,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
__stringify(VERW), X86_FEATURE_CLEAR_CPU_BUF_VM
/* Check @flags to see if VMLAUNCH or VMRESUME is needed. */
- testl $VMX_RUN_VMRESUME, WORD_SIZE(%_ASM_SP)
+ testl $KVM_ENTER_VMRESUME, (%_ASM_SP)
jz .Lvmlaunch
/*
@@ -180,38 +149,35 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL)
UNWIND_HINT_RESTORE
ENDBR
- /* Temporarily save guest's RAX. */
- push %_ASM_AX
+ /* Temporarily save guest's RDI. */
+ push %_ASM_DI
- /* Reload @regs to RAX. */
- mov WORD_SIZE(%_ASM_SP), %_ASM_AX
+ /* Reload @vmx to RDI. */
+ mov 2*WORD_SIZE(%_ASM_SP), %_ASM_DI
- /* Save all guest registers, including RAX from the stack */
- pop VCPU_RAX(%_ASM_AX)
- mov %_ASM_CX, VCPU_RCX(%_ASM_AX)
- mov %_ASM_DX, VCPU_RDX(%_ASM_AX)
- mov %_ASM_BX, VCPU_RBX(%_ASM_AX)
- mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
- mov %_ASM_SI, VCPU_RSI(%_ASM_AX)
- mov %_ASM_DI, VCPU_RDI(%_ASM_AX)
+ /* Save all guest registers, including RDI from the stack */
+ mov %_ASM_AX, VCPU_RAX(%_ASM_DI)
+ mov %_ASM_CX, VCPU_RCX(%_ASM_DI)
+ mov %_ASM_DX, VCPU_RDX(%_ASM_DI)
+ mov %_ASM_BX, VCPU_RBX(%_ASM_DI)
+ mov %_ASM_BP, VCPU_RBP(%_ASM_DI)
+ mov %_ASM_SI, VCPU_RSI(%_ASM_DI)
+ pop VCPU_RDI(%_ASM_DI)
#ifdef CONFIG_X86_64
- mov %r8, VCPU_R8 (%_ASM_AX)
- mov %r9, VCPU_R9 (%_ASM_AX)
- mov %r10, VCPU_R10(%_ASM_AX)
- mov %r11, VCPU_R11(%_ASM_AX)
- mov %r12, VCPU_R12(%_ASM_AX)
- mov %r13, VCPU_R13(%_ASM_AX)
- mov %r14, VCPU_R14(%_ASM_AX)
- mov %r15, VCPU_R15(%_ASM_AX)
+ mov %r8, VCPU_R8 (%_ASM_DI)
+ mov %r9, VCPU_R9 (%_ASM_DI)
+ mov %r10, VCPU_R10(%_ASM_DI)
+ mov %r11, VCPU_R11(%_ASM_DI)
+ mov %r12, VCPU_R12(%_ASM_DI)
+ mov %r13, VCPU_R13(%_ASM_DI)
+ mov %r14, VCPU_R14(%_ASM_DI)
+ mov %r15, VCPU_R15(%_ASM_DI)
#endif
/* Clear return value to indicate VM-Exit (as opposed to VM-Fail). */
xor %ebx, %ebx
.Lclear_regs:
- /* Discard @regs. The register is irrelevant, it just can't be RBX. */
- pop %_ASM_AX
-
/*
* Clear all general purpose registers except RSP and RBX to prevent
* speculative use of the guest's values, even those that are reloaded
@@ -254,16 +220,32 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL)
FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT,\
X86_FEATURE_RSB_VMEXIT_LITE
- pop %_ASM_ARG2 /* @flags */
- pop %_ASM_ARG1 /* @vmx */
+ /* Clobbers RAX, RCX, RDX, RSI. */
+ ALTERNATIVE "jmp .Lspec_ctrl_host_done", "", X86_FEATURE_MSR_SPEC_CTRL
+ mov WORD_SIZE(%_ASM_SP), %_ASM_DI
+ RESTORE_HOST_SPEC_CTRL_BODY VMX_spec_ctrl(%_ASM_DI), (%_ASM_SP), .Lspec_ctrl_host_done
+.Lspec_ctrl_host_done:
- call vmx_spec_ctrl_restore_host
+ /*
+ * Halt speculation past a conditional wrmsr. Intel's eIBRS
+ * guarantees that the guest cannot control the RSB "once IBRS is
+ * set", but in the eIBRS case speculative execution past the 'je'
+ * can go all the way to the RET below while MSR_IA32_SPEC_CTRL
+ * still holds the guest value.
+ */
+ ALTERNATIVE_2 "", "lfence", X86_FEATURE_MSR_SPEC_CTRL, \
+ "", X86_FEATURE_KERNEL_IBRS
CLEAR_BRANCH_HISTORY_VMEXIT
/* Put return value in AX */
mov %_ASM_BX, %_ASM_AX
+ /* Pop our saved arguments from the stack */
+ pop %_ASM_BX
+ pop %_ASM_BX
+
+ /* ... and then the callee-save registers */
pop %_ASM_BX
#ifdef CONFIG_X86_64
pop %r12
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index ede773ce065a7..e5cfe4d12c479 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -74,6 +74,7 @@
#include "x86_ops.h"
#include "smm.h"
#include "vmx_onhyperv.h"
+#include "vmenter.h"
#include "posted_intr.h"
#include "mmu/spte.h"
@@ -118,6 +119,9 @@ module_param(emulate_invalid_guest_state, bool, 0444);
static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, 0444);
+bool __read_mostly enable_mbec = 1;
+module_param_named(mbec, enable_mbec, bool, 0444);
+
module_param(enable_apicv, bool, 0444);
module_param(enable_ipiv, bool, 0444);
@@ -847,8 +851,8 @@ static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
bool ret;
u32 mask = 1 << (seg * SEG_FIELD_NR + field);
- if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
- kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
+ if (!kvm_register_is_available(&vmx->vcpu, VCPU_REG_SEGMENTS)) {
+ kvm_register_mark_available(&vmx->vcpu, VCPU_REG_SEGMENTS);
vmx->segment_cache.bitmask = 0;
}
ret = vmx->segment_cache.bitmask & mask;
@@ -968,12 +972,12 @@ static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr);
}
-unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
+unsigned int __vmx_vcpu_enter_flags(struct vcpu_vmx *vmx)
{
unsigned int flags = 0;
if (vmx->loaded_vmcs->launched)
- flags |= VMX_RUN_VMRESUME;
+ flags |= KVM_ENTER_VMRESUME;
/*
* If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
@@ -981,11 +985,11 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
* it after vmexit and store it in vmx->spec_ctrl.
*/
if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
- flags |= VMX_RUN_SAVE_SPEC_CTRL;
+ flags |= KVM_ENTER_SAVE_SPEC_CTRL;
if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) &&
kvm_vcpu_can_access_host_mmio(&vmx->vcpu))
- flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;
+ flags |= KVM_ENTER_CLEAR_CPU_BUFFERS_FOR_MMIO;
return flags;
}
@@ -1613,8 +1617,8 @@ unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long rflags, save_rflags;
- if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
- kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
+ if (!kvm_register_is_available(vcpu, VCPU_REG_RFLAGS)) {
+ kvm_register_mark_available(vcpu, VCPU_REG_RFLAGS);
rflags = vmcs_readl(GUEST_RFLAGS);
if (vmx->rmode.vm86_active) {
rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
@@ -1637,7 +1641,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
* if L1 runs L2 as a restricted guest.
*/
if (is_unrestricted_guest(vcpu)) {
- kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
+ kvm_register_mark_available(vcpu, VCPU_REG_RFLAGS);
vmx->rflags = rflags;
vmcs_writel(GUEST_RFLAGS, rflags);
return;
@@ -2608,20 +2612,20 @@ void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
case VCPU_REGS_RSP:
vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
break;
- case VCPU_REGS_RIP:
- vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
+ case VCPU_REG_RIP:
+ vcpu->arch.rip = vmcs_readl(GUEST_RIP);
break;
- case VCPU_EXREG_PDPTR:
+ case VCPU_REG_PDPTR:
if (enable_ept)
ept_save_pdptrs(vcpu);
break;
- case VCPU_EXREG_CR0:
+ case VCPU_REG_CR0:
guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
vcpu->arch.cr0 &= ~guest_owned_bits;
vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
break;
- case VCPU_EXREG_CR3:
+ case VCPU_REG_CR3:
/*
* When intercepting CR3 loads, e.g. for shadowing paging, KVM's
* CR3 is loaded into hardware, not the guest's CR3.
@@ -2629,7 +2633,7 @@ void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
break;
- case VCPU_EXREG_CR4:
+ case VCPU_REG_CR4:
guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
vcpu->arch.cr4 &= ~guest_owned_bits;
@@ -2777,6 +2781,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
return -EIO;
vmx_cap->ept = 0;
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
}
if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
@@ -2790,6 +2795,16 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
vmx_cap->vpid = 0;
}
+ /*
+ * Virtualizing MBEC requires advanced vmexit information in order to
+ * distinguish supervisor and user accesses. For simplicity and clarity,
+ * disable MBEC entirely if advanced vmexit information is not available;
+ * this way, mbec=1 in the kvm_intel module parameters implies availability
+ * to nested guests as well.
+ */
+ if (!(vmx_cap->ept & VMX_EPT_ADVANCED_VMEXIT_INFO_BIT))
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
+
if (!cpu_has_sgx())
_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;
@@ -3354,7 +3369,7 @@ void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
+ if (!kvm_register_is_dirty(vcpu, VCPU_REG_PDPTR))
return;
if (is_pae_paging(vcpu)) {
@@ -3377,7 +3392,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
- kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
+ kvm_register_mark_available(vcpu, VCPU_REG_PDPTR);
}
#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
@@ -3420,7 +3435,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmcs_writel(CR0_READ_SHADOW, cr0);
vmcs_writel(GUEST_CR0, hw_cr0);
vcpu->arch.cr0 = cr0;
- kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
+ kvm_register_mark_available(vcpu, VCPU_REG_CR0);
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME) {
@@ -3438,8 +3453,8 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
* (correctly) stop reading vmcs.GUEST_CR3 because it thinks
* KVM's CR3 is installed.
*/
- if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
- vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
+ if (!kvm_register_is_available(vcpu, VCPU_REG_CR3))
+ vmx_cache_reg(vcpu, VCPU_REG_CR3);
/*
* When running with EPT but not unrestricted guest, KVM must
@@ -3476,7 +3491,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
* GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
*/
if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
- kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+ kvm_register_mark_dirty(vcpu, VCPU_REG_CR3);
}
/* depends on vcpu->arch.cr0 to be set to a new value */
@@ -3505,7 +3520,7 @@ void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
if (!enable_unrestricted_guest && !is_paging(vcpu))
guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
- else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
+ else if (kvm_register_is_dirty(vcpu, VCPU_REG_CR3))
guest_cr3 = vcpu->arch.cr3;
else /* vmcs.GUEST_CR3 is already up-to-date. */
update_guest_cr3 = false;
@@ -3565,7 +3580,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
}
vcpu->arch.cr4 = cr4;
- kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
+ kvm_register_mark_available(vcpu, VCPU_REG_CR4);
if (!enable_unrestricted_guest) {
if (enable_ept) {
@@ -4746,6 +4761,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
*/
exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
+ if (!enable_mbec)
+ exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
+
/* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
* in vmx_set_cr4. */
exec_control &= ~SECONDARY_EXEC_DESC;
@@ -5032,7 +5050,7 @@ void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
vmx_segment_cache_clear(vmx);
- kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
+ kvm_register_mark_available(vcpu, VCPU_REG_SEGMENTS);
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
@@ -7410,31 +7428,6 @@ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
}
}
-void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
- unsigned int flags)
-{
- u64 hostval = this_cpu_read(x86_spec_ctrl_current);
-
- if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
- return;
-
- if (flags & VMX_RUN_SAVE_SPEC_CTRL)
- vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL);
-
- /*
- * If the guest/host SPEC_CTRL values differ, restore the host value.
- *
- * For legacy IBRS, the IBRS bit always needs to be written after
- * transitioning from a less privileged predictor mode, regardless of
- * whether the guest/host values differ.
- */
- if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
- vmx->spec_ctrl != hostval)
- native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval);
-
- barrier_nospec();
-}
-
static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
bool force_immediate_exit)
{
@@ -7488,11 +7481,10 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
if (vcpu->arch.cr2 != native_read_cr2())
native_write_cr2(vcpu->arch.cr2);
- vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
- flags);
+ vmx->fail = __vmx_vcpu_run(vmx, flags);
vcpu->arch.cr2 = native_read_cr2();
- vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
+ kvm_clear_available_registers(vcpu, VMX_REGS_LAZY_LOAD_SET);
vmx->idt_vectoring_info = 0;
@@ -7534,9 +7526,9 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE;
vmx->vt.exit_reason.failed_vmentry = 1;
- kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
+ kvm_register_mark_available(vcpu, VCPU_REG_EXIT_INFO_1);
vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT;
- kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
+ kvm_register_mark_available(vcpu, VCPU_REG_EXIT_INFO_2);
vmx->vt.exit_intr_info = 0;
return EXIT_FASTPATH_NONE;
}
@@ -7556,9 +7548,9 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
- if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
- vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
- vcpu->arch.regs_dirty = 0;
+ if (kvm_register_is_dirty(vcpu, VCPU_REG_RIP))
+ vmcs_writel(GUEST_RIP, vcpu->arch.rip);
+ kvm_reset_dirty_registers(vcpu);
if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
set_debugreg(vcpu->arch.dr6, 6);
@@ -7607,7 +7599,7 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
kvm_wait_lapic_expire(vcpu);
/* The actual VMENTER/EXIT is in the .noinstr.text section. */
- vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));
+ vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_enter_flags(vmx));
/* All fields are clean at this point */
if (kvm_is_using_evmcs()) {
@@ -8645,6 +8637,8 @@ __init int vmx_hardware_setup(void)
if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
enable_ept_ad_bits = 0;
+ if (!cpu_has_ept_mbec() || !enable_ept)
+ enable_mbec = 0;
if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
enable_unrestricted_guest = 0;
@@ -8706,8 +8700,7 @@ __init int vmx_hardware_setup(void)
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
if (enable_ept)
- kvm_mmu_set_ept_masks(enable_ept_ad_bits,
- cpu_has_vmx_ept_execute_only());
+ kvm_mmu_set_ept_masks(enable_ept_ad_bits);
else
vt_x86_ops.get_mt_mask = NULL;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index db84e8001da58..daedf663c0a9c 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -15,7 +15,6 @@
#include "vmcs.h"
#include "vmx_ops.h"
#include "../cpuid.h"
-#include "run_flags.h"
#include "../mmu.h"
#include "common.h"
@@ -317,7 +316,7 @@ static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
{
struct vcpu_vt *vt = to_vt(vcpu);
- if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1) &&
+ if (!kvm_register_test_and_mark_available(vcpu, VCPU_REG_EXIT_INFO_1) &&
!WARN_ON_ONCE(is_td_vcpu(vcpu)))
vt->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -328,7 +327,7 @@ static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
{
struct vcpu_vt *vt = to_vt(vcpu);
- if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2) &&
+ if (!kvm_register_test_and_mark_available(vcpu, VCPU_REG_EXIT_INFO_2) &&
!WARN_ON_ONCE(is_td_vcpu(vcpu)))
vt->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
@@ -368,10 +367,8 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr);
void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu);
void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
-void vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, unsigned int flags);
-unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx);
-bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs,
- unsigned int flags);
+unsigned int __vmx_vcpu_enter_flags(struct vcpu_vmx *vmx);
+bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned int flags);
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set);
@@ -567,6 +564,7 @@ static inline u8 vmx_get_rvi(void)
SECONDARY_EXEC_ENABLE_VMFUNC | \
SECONDARY_EXEC_BUS_LOCK_DETECTION | \
SECONDARY_EXEC_NOTIFY_VM_EXITING | \
+ SECONDARY_EXEC_MODE_BASED_EPT_EXEC | \
SECONDARY_EXEC_ENCLS_EXITING | \
SECONDARY_EXEC_EPT_VIOLATION_VE)
@@ -620,16 +618,16 @@ BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64)
* cache on demand. Other registers not listed here are synced to
* the cache immediately after VM-Exit.
*/
-#define VMX_REGS_LAZY_LOAD_SET ((1 << VCPU_REGS_RIP) | \
- (1 << VCPU_REGS_RSP) | \
- (1 << VCPU_EXREG_RFLAGS) | \
- (1 << VCPU_EXREG_PDPTR) | \
- (1 << VCPU_EXREG_SEGMENTS) | \
- (1 << VCPU_EXREG_CR0) | \
- (1 << VCPU_EXREG_CR3) | \
- (1 << VCPU_EXREG_CR4) | \
- (1 << VCPU_EXREG_EXIT_INFO_1) | \
- (1 << VCPU_EXREG_EXIT_INFO_2))
+#define VMX_REGS_LAZY_LOAD_SET (BIT(VCPU_REGS_RSP) | \
+ BIT(VCPU_REG_RIP) | \
+ BIT(VCPU_REG_RFLAGS) | \
+ BIT(VCPU_REG_PDPTR) | \
+ BIT(VCPU_REG_SEGMENTS) | \
+ BIT(VCPU_REG_CR0) | \
+ BIT(VCPU_REG_CR3) | \
+ BIT(VCPU_REG_CR4) | \
+ BIT(VCPU_REG_EXIT_INFO_1) | \
+ BIT(VCPU_REG_EXIT_INFO_2))
static inline unsigned long vmx_l1_guest_owned_cr0_bits(void)
{
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index d09abeac2b56a..4098580742462 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -4,6 +4,7 @@
#include <linux/kvm_host.h>
+#include "capabilities.h"
#include "x86.h"
__init int vmx_hardware_setup(void);
@@ -104,6 +105,11 @@ int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr);
u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+static inline bool vmx_tdp_has_smep(struct kvm *kvm)
+{
+ return enable_mbec;
+}
+
void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code);
void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c1a72d749084f..0ac9d94e51776 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1072,7 +1072,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
* to an L1 GPA.
*/
real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn),
- PFERR_USER_MASK | PFERR_WRITE_MASK, NULL);
+ PFERR_USER_MASK | PFERR_WRITE_MASK |
+ PFERR_GUEST_PAGE_MASK, NULL, 0);
if (real_gpa == INVALID_GPA)
return 0;
@@ -1090,14 +1091,14 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
}
/*
- * Marking VCPU_EXREG_PDPTR dirty doesn't work for !tdp_enabled.
+ * Marking VCPU_REG_PDPTR dirty doesn't work for !tdp_enabled.
* Shadow page roots need to be reconstructed instead.
*/
if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)))
kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT);
memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
- kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+ kvm_register_mark_dirty(vcpu, VCPU_REG_PDPTR);
kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
vcpu->arch.pdptrs_from_userspace = false;
@@ -1478,7 +1479,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
kvm_mmu_new_pgd(vcpu, cr3);
vcpu->arch.cr3 = cr3;
- kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+ kvm_register_mark_dirty(vcpu, VCPU_REG_CR3);
/* Do not call post_set_cr3, we do not get here for confidential guests. */
handle_tlb_flush:
@@ -7847,21 +7848,6 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
kvm_x86_call(get_segment)(vcpu, var, seg);
}
-gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access,
- struct x86_exception *exception)
-{
- struct kvm_mmu *mmu = vcpu->arch.mmu;
- gpa_t t_gpa;
-
- BUG_ON(!mmu_is_nested(vcpu));
-
- /* NPT walks are always user-walks */
- access |= PFERR_USER_MASK;
- t_gpa = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
-
- return t_gpa;
-}
-
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
@@ -12473,7 +12459,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
vcpu->arch.cr2 = sregs->cr2;
*mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
- kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+ kvm_register_mark_dirty(vcpu, VCPU_REG_CR3);
kvm_x86_call(post_set_cr3)(vcpu, sregs->cr3);
kvm_set_cr8(vcpu, sregs->cr8);
@@ -12566,7 +12552,7 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
for (i = 0; i < 4 ; i++)
kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
- kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+ kvm_register_mark_dirty(vcpu, VCPU_REG_PDPTR);
mmu_reset_needed = 1;
vcpu->arch.pdptrs_from_userspace = true;
}
@@ -12836,8 +12822,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
int r;
vcpu->arch.last_vmentry_cpu = -1;
- vcpu->arch.regs_avail = ~0;
- vcpu->arch.regs_dirty = ~0;
+ bitmap_fill(vcpu->arch.regs_avail, NR_VCPU_TOTAL_REGS);
+ bitmap_fill(vcpu->arch.regs_dirty, NR_VCPU_TOTAL_REGS);
kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
@@ -13111,7 +13097,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_rip_write(vcpu, 0xfff0);
vcpu->arch.cr3 = 0;
- kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+ kvm_register_mark_dirty(vcpu, VCPU_REG_CR3);
/*
* CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions
@@ -14323,7 +14309,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
* the RAP (Return Address Predictor).
*/
if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
- kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS);
+ kvm_register_mark_dirty(vcpu, VCPU_REG_ERAPS);
kvm_invalidate_pcid(vcpu, operand.pcid);
return kvm_skip_emulated_instruction(vcpu);
@@ -14339,7 +14325,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
fallthrough;
case INVPCID_TYPE_ALL_INCL_GLOBAL:
/*
- * Don't bother marking VCPU_EXREG_ERAPS dirty, SVM will take
+ * Don't bother marking VCPU_REG_ERAPS dirty, SVM will take
* care of doing so when emulating the full guest TLB flush
* (the RAP is cleared on all implicit TLB flushes).
*/