diff options
| -rw-r--r-- | Documentation/virt/kvm/api.rst | 26 | ||||
| -rw-r--r-- | arch/x86/include/asm/kvm-x86-pmu-ops.h | 5 | ||||
| -rw-r--r-- | arch/x86/include/asm/kvm_host.h | 6 | ||||
| -rw-r--r-- | arch/x86/include/asm/perf_event.h | 2 | ||||
| -rw-r--r-- | arch/x86/include/uapi/asm/kvm.h | 2 | ||||
| -rw-r--r-- | arch/x86/kvm/lapic.c | 21 | ||||
| -rw-r--r-- | arch/x86/kvm/lapic.h | 2 | ||||
| -rw-r--r-- | arch/x86/kvm/pmu.c | 21 | ||||
| -rw-r--r-- | arch/x86/kvm/pmu.h | 44 | ||||
| -rw-r--r-- | arch/x86/kvm/svm/avic.c | 47 | ||||
| -rw-r--r-- | arch/x86/kvm/svm/nested.c | 107 | ||||
| -rw-r--r-- | arch/x86/kvm/svm/pmu.c | 42 | ||||
| -rw-r--r-- | arch/x86/kvm/svm/svm.c | 40 | ||||
| -rw-r--r-- | arch/x86/kvm/svm/svm.h | 44 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx/pmu_intel.c | 2 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 3 | ||||
| -rw-r--r-- | arch/x86/kvm/x86.c | 9 | ||||
| -rw-r--r-- | arch/x86/kvm/x86.h | 3 |
18 files changed, 328 insertions, 98 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 1a5a82ab0d661..801ca33f07e3a 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4944,10 +4944,13 @@ Errors: #define KVM_STATE_NESTED_FORMAT_SVM 1 #define KVM_STATE_NESTED_VMX_VMCS_SIZE 0x1000 + #define KVM_STATE_NESTED_SVM_VMCB_SIZE 0x1000 #define KVM_STATE_NESTED_VMX_SMM_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_VMX_SMM_VMXON 0x00000002 + #define KVM_STATE_NESTED_GIF_SET 0x00000100 + #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001 struct kvm_vmx_nested_state_hdr { @@ -4962,11 +4965,20 @@ Errors: __u64 preemption_timer_deadline; }; + struct kvm_svm_nested_state_hdr { + __u64 vmcb_pa; + __u64 gpat; + }; + struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; }; + struct kvm_svm_nested_state_data { + __u8 vmcb12[KVM_STATE_NESTED_SVM_VMCB_SIZE]; + }; + This ioctl copies the vcpu's nested virtualization state from the kernel to userspace. @@ -8553,6 +8565,20 @@ KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM By default, KVM relaxes the consisten bit to be cleared. Note that the vmcs02 bit is still completely controlled by the host, regardless of the quirk setting. + +KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT By default, KVM for nested SVM guests + shares the IA32_PAT MSR between L1 and + L2. This is legacy behavior and does + not match the AMD architecture + specification. When this quirk is + disabled and nested paging (NPT) is + enabled for L2, KVM correctly + virtualizes a separate guest PAT + register for L2, using the g_pat + field in the VMCB. When NPT is + disabled for L2, L1 and L2 continue + to share the IA32_PAT MSR regardless + of the quirk setting. 
======================================== ================================================ 7.32 KVM_CAP_MAX_VCPU_ID diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index d5452b3433b7d..4a223c2793e3f 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #if !defined(KVM_X86_PMU_OP) || \ - !defined(KVM_X86_PMU_OP_OPTIONAL) + !defined(KVM_X86_PMU_OP_OPTIONAL) || \ + !defined(KVM_X86_PMU_OP_OPTIONAL_RET0) #error Missing one or more KVM_X86_PMU_OP #defines #else @@ -23,6 +24,7 @@ KVM_X86_PMU_OP(init) KVM_X86_PMU_OP_OPTIONAL(reset) KVM_X86_PMU_OP_OPTIONAL(deliver_pmi) KVM_X86_PMU_OP_OPTIONAL(cleanup) +KVM_X86_PMU_OP_OPTIONAL_RET0(pmc_is_disabled_in_current_mode) KVM_X86_PMU_OP_OPTIONAL(write_global_ctrl) KVM_X86_PMU_OP(mediated_load) @@ -31,3 +33,4 @@ KVM_X86_PMU_OP(mediated_put) #undef KVM_X86_PMU_OP #undef KVM_X86_PMU_OP_OPTIONAL +#undef KVM_X86_PMU_OP_OPTIONAL_RET0 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 31666b81e60b1..6ae7d539af909 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -615,6 +615,8 @@ struct kvm_pmu { DECLARE_BITMAP(pmc_counting_instructions, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_counting_branches, X86_PMC_IDX_MAX); + DECLARE_BITMAP(pmc_has_mode_specific_enables, X86_PMC_IDX_MAX); + u64 ds_area; u64 pebs_enable; u64 pebs_enable_rsvd; @@ -2061,7 +2063,6 @@ extern bool __read_mostly enable_device_posted_irqs; extern struct kvm_x86_ops kvm_x86_ops; #define kvm_x86_call(func) static_call(kvm_x86_##func) -#define kvm_pmu_call(func) static_call(kvm_x86_pmu_##func) #define KVM_X86_OP(func) \ DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func)); @@ -2554,7 +2555,8 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_SLOT_ZAP_ALL | \ KVM_X86_QUIRK_STUFF_FEATURE_MSRS | \ 
KVM_X86_QUIRK_IGNORE_GUEST_PAT | \ - KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM) + KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM | \ + KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT) #define KVM_X86_CONDITIONAL_QUIRKS \ (KVM_X86_QUIRK_CD_NW_CLEARED | \ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 752cb319d5eab..1eb13673e889f 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -60,6 +60,8 @@ #define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36) #define AMD64_EVENTSEL_GUESTONLY (1ULL << 40) #define AMD64_EVENTSEL_HOSTONLY (1ULL << 41) +#define AMD64_EVENTSEL_HOST_GUEST_MASK \ + (AMD64_EVENTSEL_HOSTONLY | AMD64_EVENTSEL_GUESTONLY) #define AMD64_EVENTSEL_INT_CORE_SEL_SHIFT 37 #define AMD64_EVENTSEL_INT_CORE_SEL_MASK \ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 5f2b30d0405c8..1585ec8040666 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -477,6 +477,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_STUFF_FEATURE_MSRS (1 << 8) #define KVM_X86_QUIRK_IGNORE_GUEST_PAT (1 << 9) #define KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM (1 << 10) +#define KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT (1 << 11) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 @@ -532,6 +533,7 @@ struct kvm_svm_nested_state_data { struct kvm_svm_nested_state_hdr { __u64 vmcb_pa; + __u64 gpat; }; /* for KVM_CAP_NESTED_STATE */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4078e624ca667..4e34f75e705da 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1730,7 +1730,7 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) #define APIC_REGS_MASK(first, count) \ (APIC_REG_MASK(first) * ((1ull << (count)) - 1)) -u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic) +static u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic) { /* Leave bits '0' for reserved and write-only registers. 
*/ u64 valid_reg_mask = @@ -1766,7 +1766,24 @@ u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic) return valid_reg_mask; } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_readable_reg_mask); + +u64 kvm_x2apic_disable_read_intercept_reg_mask(struct kvm_vcpu *vcpu) +{ + if (WARN_ON_ONCE(!lapic_in_kernel(vcpu))) + return 0; + + /* + * TMCCT, a.k.a. the current APIC timer count, reads aren't accelerated + * by hardware (Intel or AMD) as the timer is emulated in software (by + * KVM), i.e. reads from the virtual APIC page would return garbage. + * Intercept RDMSR, as handling the fault-like APIC-access VM-Exit is + * more expensive than handling a RDMSR VM-Exit (the APIC-access exit + * requires slow emulation of the code stream). + */ + return kvm_lapic_readable_reg_mask(vcpu->arch.apic) & + ~APIC_REG_MASK(APIC_TMCCT); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_x2apic_disable_read_intercept_reg_mask); static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, void *data) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 274885af4ebc4..f763cd29a5082 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -156,7 +156,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); void kvm_lapic_exit(void); -u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic); +u64 kvm_x2apic_disable_read_intercept_reg_mask(struct kvm_vcpu *vcpu); static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic) { diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index e218352e34231..b92dd2e583356 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -16,6 +16,7 @@ #include <linux/perf_event.h> #include <linux/bsearch.h> #include <linux/sort.h> +#include <linux/moduleparam.h> #include <asm/perf_event.h> #include <asm/cpu_device_id.h> #include "x86.h" @@ -33,6 +34,15 @@ static struct x86_pmu_capability __read_mostly kvm_host_pmu; struct
x86_pmu_capability __read_mostly kvm_pmu_cap; EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_cap); +/* Enable/disable PMU virtualization */ +bool __read_mostly enable_pmu = true; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_pmu); +module_param(enable_pmu, bool, 0444); + +/* Enable/disable mediated PMU virtualization. */ +bool __read_mostly enable_mediated_pmu; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_mediated_pmu); + struct kvm_pmu_emulated_event_selectors { u64 INSTRUCTIONS_RETIRED; u64 BRANCH_INSTRUCTIONS_RETIRED; @@ -88,7 +98,9 @@ static struct kvm_pmu_ops kvm_pmu_ops __read_mostly; DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func, \ *(((struct kvm_pmu_ops *)0)->func)); #define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP +#define KVM_X86_PMU_OP_OPTIONAL_RET0 KVM_X86_PMU_OP #include <asm/kvm-x86-pmu-ops.h> +EXPORT_STATIC_CALL_GPL(kvm_x86_pmu_pmc_is_disabled_in_current_mode); void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) { @@ -99,6 +111,9 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) #define KVM_X86_PMU_OP(func) \ WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func) #define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP +#define KVM_X86_PMU_OP_OPTIONAL_RET0(func) \ + static_call_update(kvm_x86_pmu_##func, (void *)kvm_pmu_ops.func ? 
: \ + (void *)__static_call_return0); #include <asm/kvm-x86-pmu-ops.h> #undef __KVM_X86_PMU_OP } @@ -522,7 +537,7 @@ static bool pmc_is_event_allowed(struct kvm_pmc *pmc) static void kvm_mediated_pmu_refresh_event_filter(struct kvm_pmc *pmc) { - bool allowed = pmc_is_event_allowed(pmc); + bool allowed = pmc_is_locally_enabled(pmc) && pmc_is_event_allowed(pmc); struct kvm_pmu *pmu = pmc_to_pmu(pmc); if (pmc_is_gp(pmc)) { @@ -670,6 +685,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) kvm_for_each_pmc(pmu, pmc, bit, bitmap) kvm_pmu_recalc_pmc_emulation(pmu, pmc); } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_handle_event); int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx) { @@ -879,7 +895,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (pmu->global_ctrl != data) { diff = pmu->global_ctrl ^ data; pmu->global_ctrl = data; - reprogram_counters(pmu, diff); + kvm_pmu_request_counters_reprogram(pmu, diff); } /* * Unconditionally forward writes to vendor code, i.e. 
to the @@ -921,6 +937,7 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu) pmu->need_cleanup = false; bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX); + bitmap_zero(pmu->pmc_has_mode_specific_enables, X86_PMC_IDX_MAX); kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) { pmc_stop_counter(pmc); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 0925246731cb1..a5821d7c87f93 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -36,6 +36,7 @@ struct kvm_pmu_ops { void (*reset)(struct kvm_vcpu *vcpu); void (*deliver_pmi)(struct kvm_vcpu *vcpu); void (*cleanup)(struct kvm_vcpu *vcpu); + bool (*pmc_is_disabled_in_current_mode)(struct kvm_pmc *pmc); bool (*is_mediated_pmu_supported)(struct x86_pmu_capability *host_pmu); void (*mediated_load)(struct kvm_vcpu *vcpu); @@ -53,6 +54,17 @@ struct kvm_pmu_ops { const u32 MSR_STRIDE; }; +#define kvm_pmu_call(func) static_call(kvm_x86_pmu_##func) + +#define KVM_X86_PMU_OP(func) \ + DECLARE_STATIC_CALL(kvm_x86_pmu_##func, *(((struct kvm_pmu_ops *)0)->func)); +#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP +#define KVM_X86_PMU_OP_OPTIONAL_RET0 KVM_X86_PMU_OP +#include <asm/kvm-x86-pmu-ops.h> + +extern bool enable_pmu; +extern bool enable_mediated_pmu; + void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); void kvm_handle_guest_mediated_pmi(void); @@ -190,7 +202,13 @@ static inline bool pmc_is_locally_enabled(struct kvm_pmc *pmc) pmc->idx - KVM_FIXED_PMC_BASE_IDX) & (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER); - return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; + if (!(pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE)) + return false; + + if (!test_bit(pmc->idx, pmu->pmc_has_mode_specific_enables)) + return true; + + return !kvm_pmu_call(pmc_is_disabled_in_current_mode)(pmc); } extern struct x86_pmu_capability kvm_pmu_cap; @@ -198,6 +216,7 @@ extern struct x86_pmu_capability kvm_pmu_cap; void kvm_init_pmu_capability(struct kvm_pmu_ops *pmu_ops); void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct 
kvm_pmc *pmc); +void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) { @@ -207,16 +226,24 @@ static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } -static inline void reprogram_counters(struct kvm_pmu *pmu, u64 diff) +static inline void __kvm_pmu_reprogram_counters(struct kvm_pmu *pmu, + u64 counters, + bool defer) { - int bit; - - if (!diff) + if (!counters) return; - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) - set_bit(bit, pmu->reprogram_pmi); - kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu)); + atomic64_or(counters, &pmu->__reprogram_pmi); + if (defer) + kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu)); + else + kvm_pmu_handle_event(pmu_to_vcpu(pmu)); +} + +static inline void kvm_pmu_request_counters_reprogram(struct kvm_pmu *pmu, + u64 counters) +{ + __kvm_pmu_reprogram_counters(pmu, counters, true); } /* @@ -245,7 +272,6 @@ static inline bool kvm_pmu_is_fastpath_emulation_allowed(struct kvm_vcpu *vcpu) } void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); -void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx); bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index e8bd60156941b..b7083cd692ad8 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -122,38 +122,8 @@ static u32 x2avic_max_physical_id; static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) { - static const u32 x2avic_passthrough_msrs[] = { - X2APIC_MSR(APIC_ID), - X2APIC_MSR(APIC_LVR), - X2APIC_MSR(APIC_TASKPRI), - X2APIC_MSR(APIC_ARBPRI), - X2APIC_MSR(APIC_PROCPRI), - X2APIC_MSR(APIC_EOI), - X2APIC_MSR(APIC_RRR), - X2APIC_MSR(APIC_LDR), - X2APIC_MSR(APIC_DFR), - X2APIC_MSR(APIC_SPIV), - 
X2APIC_MSR(APIC_ISR), - X2APIC_MSR(APIC_TMR), - X2APIC_MSR(APIC_IRR), - X2APIC_MSR(APIC_ESR), - X2APIC_MSR(APIC_ICR), - X2APIC_MSR(APIC_ICR2), - - /* - * Note! Always intercept LVTT, as TSC-deadline timer mode - * isn't virtualized by hardware, and the CPU will generate a - * #GP instead of a #VMEXIT. - */ - X2APIC_MSR(APIC_LVTTHMR), - X2APIC_MSR(APIC_LVTPC), - X2APIC_MSR(APIC_LVT0), - X2APIC_MSR(APIC_LVT1), - X2APIC_MSR(APIC_LVTERR), - X2APIC_MSR(APIC_TMICT), - X2APIC_MSR(APIC_TMCCT), - X2APIC_MSR(APIC_TDCR), - }; + struct kvm_vcpu *vcpu = &svm->vcpu; + u64 rd_regs; int i; if (intercept == svm->x2avic_msrs_intercepted) @@ -162,9 +132,16 @@ static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, if (!x2avic_enabled) return; - for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++) - svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i], - MSR_TYPE_RW, intercept); + rd_regs = kvm_x2apic_disable_read_intercept_reg_mask(vcpu); + + for_each_set_bit(i, (unsigned long *)&rd_regs, BITS_PER_TYPE(rd_regs)) + svm_set_intercept_for_msr(vcpu, APIC_BASE_MSR + i, + MSR_TYPE_R, intercept); + + svm_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_W, intercept); + svm_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W, intercept); + svm_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W, intercept); + svm_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_W, intercept); svm->x2avic_msrs_intercepted = intercept; } diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 80df0d040bb8b..7ad4b4fb7a1c0 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -30,6 +30,7 @@ #include "lapic.h" #include "svm.h" #include "hyperv.h" +#include "pmu.h" #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK @@ -435,7 +436,8 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, /* Common checks that apply to both L1 and L2 state. 
*/ static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu, - struct vmcb_save_area_cached *save) + struct vmcb_save_area_cached *save, + bool check_gpat) { if (CC(!(save->efer & EFER_SVME))) return false; @@ -470,6 +472,15 @@ static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu, if (CC(!kvm_valid_efer(vcpu, save->efer))) return false; + /* + * If userspace contrives to get an invalid g_pat into vmcb02 by + * disabling KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT in a race with + * this check, it should be prepared for the KVM_EXIT_FAIL_ENTRY + * that will follow. + */ + if (check_gpat && CC(!kvm_pat_valid(save->g_pat))) + return false; + return true; } @@ -477,7 +488,8 @@ int nested_svm_check_cached_vmcb12(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (!nested_vmcb_check_save(vcpu, &svm->nested.save) || + if (!nested_vmcb_check_save(vcpu, &svm->nested.save, + l2_has_separate_pat(vcpu)) || !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) return -EINVAL; @@ -590,6 +602,7 @@ static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to, to->rax = from->rax; to->cr2 = from->cr2; + to->g_pat = from->g_pat; svm_copy_lbrs(to, from); } @@ -719,15 +732,6 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, return 0; } -void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm) -{ - if (!svm->nested.vmcb02.ptr) - return; - - /* FIXME: merge g_pat from vmcb01 and vmcb12. 
*/ - svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat; -} - static bool nested_vmcb12_has_lbrv(struct kvm_vcpu *vcpu) { return guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && @@ -743,9 +747,6 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm) struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; struct kvm_vcpu *vcpu = &svm->vcpu; - nested_vmcb02_compute_g_pat(svm); - vmcb_mark_dirty(vmcb02, VMCB_NPT); - /* Load the nested guest state */ if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) { new_vmcb12 = true; @@ -776,6 +777,13 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm) vmcb_mark_dirty(vmcb02, VMCB_CET); } + if (l2_has_separate_pat(vcpu)) { + if (unlikely(new_vmcb12 || vmcb12_is_dirty(control, VMCB_NPT))) + vmcb_set_gpat(vmcb02, svm->nested.save.g_pat); + } else if (npt_enabled) { + vmcb_set_gpat(vmcb02, vcpu->arch.pat); + } + kvm_set_rflags(vcpu, save->rflags | X86_EFLAGS_FIXED); svm_set_efer(vcpu, svm->nested.save.efer); @@ -852,6 +860,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) /* Enter Guest-Mode */ enter_guest_mode(vcpu); + svm_pmu_handle_nested_transition(svm); /* * Filled at exit: exit_code, exit_info_1, exit_info_2, exit_int_info, @@ -1133,16 +1142,22 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) } ret = nested_svm_copy_vmcb12_to_cache(vcpu, vmcb12_gpa); - if (ret) { - if (ret == -EFAULT) - return kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); + if (ret == -EFAULT) + return kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); - /* Advance RIP past VMRUN as part of the nested #VMEXIT. */ - return kvm_skip_emulated_instruction(vcpu); - } + /* + * At this point, VMRUN is guaranteed to not fault; advance RIP. If + * caching vmcb12 failed for other reasons, return immediately afterward + * as a nested #VMEXIT was already set up. + * + * FIXME: If TF is set on VMRUN should inject a #DB (or handle guest + * debugging) right after #VMEXIT, right now it's just ignored. 
+ */ + if (!svm_skip_emulated_instruction(vcpu)) + return 0; - /* At this point, VMRUN is guaranteed to not fault; advance RIP. */ - ret = kvm_skip_emulated_instruction(vcpu); + if (ret) + goto insn_retired; /* * Since vmcb01 is not in use, we can use it to store some of the L1 @@ -1172,7 +1187,13 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) nested_svm_vmexit(svm); } - return ret; +insn_retired: + /* + * A successful VMRUN is counted by the PMU in guest mode, so only + * retire the instruction after potentially entering guest mode. + */ + kvm_pmu_instruction_retired(vcpu); + return 1; } /* Copy state save area fields which are handled by VMRUN */ @@ -1256,6 +1277,9 @@ static int nested_svm_vmexit_update_vmcb12(struct kvm_vcpu *vcpu) vmcb12->save.dr6 = svm->vcpu.arch.dr6; vmcb12->save.cpl = vmcb02->save.cpl; + if (l2_has_separate_pat(vcpu)) + vmcb12->save.g_pat = vmcb02->save.g_pat; + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { vmcb12->save.s_cet = vmcb02->save.s_cet; vmcb12->save.isst_addr = vmcb02->save.isst_addr; @@ -1302,6 +1326,8 @@ void nested_svm_vmexit(struct vcpu_svm *svm) /* Exit Guest-Mode */ leave_guest_mode(vcpu); + svm_pmu_handle_nested_transition(svm); + svm->nested.vmcb12_gpa = 0; kvm_warn_on_nested_run_pending(vcpu); @@ -1513,6 +1539,15 @@ void svm_leave_nested(struct kvm_vcpu *vcpu) leave_guest_mode(vcpu); + /* + * Force leaving nested is a non-architectural flow so precision + * isn't a priority. Defer updating the PMU until the next vCPU + * run, potentially tolerating some imprecision to avoid poking + * into PMU state from arbitrary contexts (e.g. to avoid using + * stale state). + */ + __svm_pmu_handle_nested_transition(svm, true); + svm_switch_vmcb(svm, &svm->vmcb01); nested_svm_uninit_mmu_context(vcpu); @@ -1865,6 +1900,9 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu, /* First fill in the header and copy it out. 
*/ if (is_guest_mode(vcpu)) { kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa; + kvm_state.hdr.svm.gpat = 0; + if (l2_has_separate_pat(vcpu)) + kvm_state.hdr.svm.gpat = svm->vmcb->save.g_pat; kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE; kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; @@ -1917,6 +1955,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, struct vmcb_save_area *save; struct vmcb_save_area_cached save_cached; struct vmcb_ctrl_area_cached ctl_cached; + bool use_separate_l2_pat; unsigned long cr0; int ret; @@ -1981,15 +2020,29 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, /* * Validate host state saved from before VMRUN (see - * nested_svm_check_permissions). + * nested_svm_check_permissions). Note that the g_pat field is not + * validated, because (a) it may have been clobbered by SMM before + * KVM_GET_NESTED_STATE, and (b) it is not loaded at emulated + * #VMEXIT. */ __nested_copy_vmcb_save_to_cache(&save_cached, save); if (!(save->cr0 & X86_CR0_PG) || !(save->cr0 & X86_CR0_PE) || (save->rflags & X86_EFLAGS_VM) || - !nested_vmcb_check_save(vcpu, &save_cached)) + !nested_vmcb_check_save(vcpu, &save_cached, false)) goto out_free; + /* + * Validate gPAT when the shared PAT quirk is disabled (i.e. L2 + * has its own gPAT). This is done separately from the + * vmcb_save_area_cached validation above, because gPAT is L2 + * state, but the vmcb_save_area_cached is populated with L1 state. + */ + use_separate_l2_pat = (ctl_cached.misc_ctl & SVM_MISC_ENABLE_NP) && + !kvm_check_has_quirk(vcpu->kvm, + KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT); + if (use_separate_l2_pat && !kvm_pat_valid(kvm_state->hdr.svm.gpat)) + goto out_free; /* * All checks done, we can enter guest mode. 
Userspace provides @@ -2016,6 +2069,10 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, nested_copy_vmcb_control_to_cache(svm, ctl); svm_switch_vmcb(svm, &svm->nested.vmcb02); + + if (use_separate_l2_pat) + vmcb_set_gpat(svm->vmcb, kvm_state->hdr.svm.gpat); + nested_vmcb02_prepare_control(svm); /* diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 7aa298eeb0721..c18286545a7ac 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -168,6 +168,12 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pmc->eventsel = data; pmc->eventsel_hw = (data & ~AMD64_EVENTSEL_HOSTONLY) | AMD64_EVENTSEL_GUESTONLY; + + if (data & AMD64_EVENTSEL_HOST_GUEST_MASK) + __set_bit(pmc->idx, pmu->pmc_has_mode_specific_enables); + else + __clear_bit(pmc->idx, pmu->pmc_has_mode_specific_enables); + kvm_pmu_request_counter_reprogram(pmc); } return 0; @@ -207,7 +213,11 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) } pmu->counter_bitmask[KVM_PMC_GP] = BIT_ULL(48) - 1; + pmu->reserved_bits = 0xfffffff000280000ull; + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SVM) && kvm_vcpu_has_mediated_pmu(vcpu)) + pmu->reserved_bits &= ~AMD64_EVENTSEL_HOST_GUEST_MASK; + pmu->raw_event_mask = AMD64_RAW_EVENT_MASK; /* not applicable to AMD; but clean them to prevent any fall out */ pmu->counter_bitmask[KVM_PMC_FIXED] = 0; @@ -260,6 +270,37 @@ static void amd_mediated_pmu_put(struct kvm_vcpu *vcpu) wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, pmu->global_status); } +static bool amd_pmc_is_disabled_in_current_mode(struct kvm_pmc *pmc) +{ + struct kvm_vcpu *vcpu = pmc->vcpu; + u64 host_guest_bits; + + if (!kvm_vcpu_has_mediated_pmu(vcpu)) + return false; + + /* Common code is supposed to check the common enable bit */ + if (WARN_ON_ONCE(!(pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE))) + return false; + + /* If both bits are cleared, the counter is always enabled */ + host_guest_bits = pmc->eventsel & AMD64_EVENTSEL_HOST_GUEST_MASK; + if 
(!host_guest_bits) + return false; + + /* If EFER.SVME=0 and either bit is set, the counter is disabled */ + if (!(vcpu->arch.efer & EFER_SVME)) + return true; + + /* + * If EFER.SVME=1, the counter is disabled iff only one of the bits is + * set AND the set bit doesn't match the vCPU mode. + */ + if (host_guest_bits == AMD64_EVENTSEL_HOST_GUEST_MASK) + return false; + + return !!(host_guest_bits & AMD64_EVENTSEL_GUESTONLY) != is_guest_mode(vcpu); +} + struct kvm_pmu_ops amd_pmu_ops __initdata = { .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = amd_msr_idx_to_pmc, @@ -269,6 +310,7 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .set_msr = amd_pmu_set_msr, .refresh = amd_pmu_refresh, .init = amd_pmu_init, + .pmc_is_disabled_in_current_mode = amd_pmc_is_disabled_in_current_mode, .is_mediated_pmu_supported = amd_pmu_is_mediated_pmu_supported, .mediated_load = amd_mediated_pmu_load, diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d5b9426d6c06e..717af5c4d0571 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -265,6 +265,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) set_exception_intercept(svm, GP_VECTOR); } + svm_pmu_handle_nested_transition(svm); kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); } @@ -333,7 +334,7 @@ done: return 1; } -static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) +int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { return __svm_skip_emulated_instruction(vcpu, EMULTYPE_SKIP, true); } @@ -2797,6 +2798,20 @@ static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, !msr_write_intercepted(to_svm(vcpu), msr_info->index); } +static bool svm_pat_accesses_gpat(struct kvm_vcpu *vcpu, bool from_host) +{ + /* + * When KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT is disabled and nested + * NPT is enabled, L2 has a separate PAT from L1. 
Guest accesses + * to IA32_PAT while running L2 target L2's gPAT; host-initiated + * accesses always target L1's hPAT so that KVM_GET/SET_MSRS and + * KVM_GET/SET_NESTED_STATE are independent of each other and can + * be ordered arbitrarily during save and restore. + */ + WARN_ON_ONCE(from_host && vcpu->wants_to_run); + return !from_host && is_guest_mode(vcpu) && l2_has_separate_pat(vcpu); +} + static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2913,6 +2928,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_AMD64_DE_CFG: msr_info->data = svm->msr_decfg; break; + case MSR_IA32_CR_PAT: + if (svm_pat_accesses_gpat(vcpu, msr_info->host_initiated)) { + msr_info->data = svm->vmcb->save.g_pat; + break; + } + return kvm_get_msr_common(vcpu, msr_info); default: return kvm_get_msr_common(vcpu, msr_info); } @@ -2996,14 +3017,23 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_CR_PAT: + if (svm_pat_accesses_gpat(vcpu, msr->host_initiated)) { + if (!kvm_pat_valid(data)) + return 1; + + vmcb_set_gpat(svm->vmcb, data); + break; + } + ret = kvm_set_msr_common(vcpu, msr); if (ret) break; - svm->vmcb01.ptr->save.g_pat = data; - if (is_guest_mode(vcpu)) - nested_vmcb02_compute_g_pat(svm); - vmcb_mark_dirty(svm->vmcb, VMCB_NPT); + if (npt_enabled) { + vmcb_set_gpat(svm->vmcb01.ptr, data); + if (is_guest_mode(vcpu) && !l2_has_separate_pat(vcpu)) + vmcb_set_gpat(svm->vmcb, data); + } break; case MSR_IA32_SPEC_CTRL: if (!msr->host_initiated && diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 19b80ef56e2b7..87c6b105deef6 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -24,6 +24,8 @@ #include "cpuid.h" #include "kvm_cache_regs.h" +#include "x86.h" +#include "pmu.h" /* * Helpers to convert to/from physical addresses for pages whose address is @@ -166,6 +168,7 @@ struct vmcb_save_area_cached { u64 isst_addr; u64 
rax; u64 cr2; + u64 g_pat; u64 dbgctl; u64 br_from; u64 br_to; @@ -464,6 +467,12 @@ static inline bool vmcb12_is_dirty(struct vmcb_ctrl_area_cached *control, int bi return !test_bit(bit, (unsigned long *)&control->clean); } +static inline void vmcb_set_gpat(struct vmcb *vmcb, u64 data) +{ + vmcb->save.g_pat = data; + vmcb_mark_dirty(vmcb, VMCB_NPT); +} + static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_svm, vcpu); @@ -641,6 +650,16 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) return svm->nested.ctl.misc_ctl & SVM_MISC_ENABLE_NP; } +static inline bool l2_has_separate_pat(struct kvm_vcpu *vcpu) +{ + /* + * If KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT is disabled while a vCPU + * is running, the L2 IA32_PAT semantics for that vCPU are undefined. + */ + return nested_npt_enabled(to_svm(vcpu)) && + !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT); +} + static inline bool nested_vnmi_enabled(struct vcpu_svm *svm) { return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_VNMI) && @@ -814,6 +833,8 @@ static inline void svm_enable_intercept_for_msr(struct kvm_vcpu *vcpu, svm_set_intercept_for_msr(vcpu, msr, type, true); } +int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu); + /* nested.c */ #define NESTED_EXIT_HOST 0 /* Exit handled on host level */ @@ -875,9 +896,30 @@ void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm, struct vmcb_save_area *save); void nested_sync_control_from_vmcb02(struct vcpu_svm *svm); -void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm); void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb); + +static inline void __svm_pmu_handle_nested_transition(struct vcpu_svm *svm, + bool defer) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(&svm->vcpu); + u64 counters = *(u64 *)pmu->pmc_has_mode_specific_enables; + + __kvm_pmu_reprogram_counters(pmu, counters, defer); +} + +static 
inline void svm_pmu_handle_nested_transition(struct vcpu_svm *svm) +{ + /* + * Do NOT defer reprogramming the counters by default. Instructions + * causing a state change are counted based on the _new_ CPU state + * (e.g. a successful VMRUN is counted in guest mode). Hence, the + * counters should be reprogrammed with the new state _before_ the + * instruction is potentially counted upon emulation completion. + */ + __svm_pmu_handle_nested_transition(svm, false); +} + extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 27eb76e6b6a03..9bd77843d8da2 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -391,7 +391,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (pmu->pebs_enable != data) { diff = pmu->pebs_enable ^ data; pmu->pebs_enable = data; - reprogram_counters(pmu, diff); + kvm_pmu_request_counters_reprogram(pmu, diff); } break; case MSR_IA32_DS_AREA: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cd528c8ea1409..cbc2034d7924a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4174,7 +4174,7 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) * mode, only the current timer count needs on-demand emulation by KVM. 
*/ if (mode & MSR_BITMAP_MODE_X2APIC_APICV) - msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic); + msr_bitmap[read_idx] = ~kvm_x2apic_disable_read_intercept_reg_mask(vcpu); else msr_bitmap[read_idx] = ~0ull; msr_bitmap[write_idx] = ~0ull; @@ -4187,7 +4187,6 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) !(mode & MSR_BITMAP_MODE_X2APIC)); if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { - vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); if (enable_ipiv) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 54c552efb59e5..e6f1dd84f22dd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -182,15 +182,6 @@ module_param(force_emulation_prefix, int, 0644); int __read_mostly pi_inject_timer = -1; module_param(pi_inject_timer, bint, 0644); -/* Enable/disable PMU virtualization */ -bool __read_mostly enable_pmu = true; -EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_pmu); -module_param(enable_pmu, bool, 0444); - -/* Enable/disabled mediated PMU virtualization. */ -bool __read_mostly enable_mediated_pmu; -EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_mediated_pmu); - bool __read_mostly eager_page_split = true; module_param(eager_page_split, bool, 0644); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index aa7d5b757fb54..a49424f9c968e 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -490,9 +490,6 @@ fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu); extern struct kvm_caps kvm_caps; extern struct kvm_host_values kvm_host; -extern bool enable_pmu; -extern bool enable_mediated_pmu; - void kvm_setup_xss_caps(void); /* |
