diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-16 06:26:12 +0530 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-16 06:26:12 +0530 |
| commit | b0820861748f43759bbf4c319ed1277cff3c5921 (patch) | |
| tree | c77f30fc402607316a59656f1b08977e6cf57ee8 /arch | |
| parent | 7d36844ae7b0a9ef61dd345c8dfe1d47a5199a53 (diff) | |
| parent | 2b9ad7a6154e0938b9458691536296dd0224942d (diff) | |
| download | ath-b0820861748f43759bbf4c319ed1277cff3c5921.tar.gz | |
Merge tag 'x86_tdx_for_7.2-rc1' of gitolite.kernel.org:pub/scm/linux/kernel/git/tip/tip
Pull x86 TDX updates from Dave Hansen:
"There are a few cleanups, and some changes that should allow TDX and
kexec to coexist nicely.
The biggest change, however, is support for updating the TDX module
after boot, just like CPU microcode. TDX users really want this
because it lets them do security updates without tearing things down
and rebooting.
- Add TDX module update support
- Make kexec and TDX finally play nice together
- Put TDX error codes into a single header"
* tag 'x86_tdx_for_7.2-rc1' of gitolite.kernel.org:pub/scm/linux/kernel/git/tip/tip: (30 commits)
x86/virt/tdx: Document TDX module update
x86/virt/tdx: Enable TDX module runtime updates
x86/virt/tdx: Refresh TDX module version after update
coco/tdx-host: Lock out module updates when reading version
x86/virt/seamldr: Add module update locking
x86/virt/tdx: Restore TDX module state
x86/virt/seamldr: Initialize the newly-installed TDX module
x86/virt/seamldr: Install a new TDX module
x86/virt/tdx: Reset software states during TDX module shutdown
x86/virt/seamldr: Shut down the current TDX module
x86/virt/seamldr: Abort updates after a failed step
x86/virt/seamldr: Introduce skeleton for TDX module updates
x86/virt/seamldr: Allocate and populate a module update request
coco/tdx-host: Implement firmware upload sysfs ABI for TDX module updates
coco/tdx-host: Don't expose P-SEAMLDR information on CPUs with erratum
coco/tdx-host: Expose P-SEAMLDR information via sysfs
x86/virt/seamldr: Add a helper to retrieve P-SEAMLDR information
x86/virt/seamldr: Introduce a wrapper for P-SEAMLDR SEAMCALLs
coco/tdx-host: Expose TDX module version
coco/tdx-host: Introduce a "tdx_host" device
...
Diffstat (limited to 'arch')
| -rw-r--r-- | arch/x86/include/asm/cpufeatures.h | 1 | ||||
| -rw-r--r-- | arch/x86/include/asm/seamldr.h | 38 | ||||
| -rw-r--r-- | arch/x86/include/asm/shared/tdx.h | 1 | ||||
| -rw-r--r-- | arch/x86/include/asm/shared/tdx_errno.h (renamed from arch/x86/kvm/vmx/tdx_errno.h) | 8 | ||||
| -rw-r--r-- | arch/x86/include/asm/tdx.h | 70 | ||||
| -rw-r--r-- | arch/x86/include/asm/tdx_global_metadata.h | 4 | ||||
| -rw-r--r-- | arch/x86/include/asm/vmx.h | 1 | ||||
| -rw-r--r-- | arch/x86/kernel/crash.c | 2 | ||||
| -rw-r--r-- | arch/x86/kernel/machine_kexec_64.c | 16 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx/tdx.c | 10 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx/tdx.h | 1 | ||||
| -rw-r--r-- | arch/x86/virt/vmx/tdx/Makefile | 2 | ||||
| -rw-r--r-- | arch/x86/virt/vmx/tdx/seamcall_internal.h | 109 | ||||
| -rw-r--r-- | arch/x86/virt/vmx/tdx/seamldr.c | 368 | ||||
| -rw-r--r-- | arch/x86/virt/vmx/tdx/tdx.c | 219 | ||||
| -rw-r--r-- | arch/x86/virt/vmx/tdx/tdx.h | 9 | ||||
| -rw-r--r-- | arch/x86/virt/vmx/tdx/tdx_global_metadata.c | 17 |
17 files changed, 716 insertions, 160 deletions
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 1d506e5d6f46a..7b572bc24265c 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -573,4 +573,5 @@ #define X86_BUG_ITS_NATIVE_ONLY X86_BUG( 1*32+ 8) /* "its_native_only" CPU is affected by ITS, VMX is not affected */ #define X86_BUG_TSA X86_BUG( 1*32+ 9) /* "tsa" CPU is affected by Transient Scheduler Attacks */ #define X86_BUG_VMSCAPE X86_BUG( 1*32+10) /* "vmscape" CPU is affected by VMSCAPE attacks from guests */ +#define X86_BUG_SEAMRET_INVD_VMCS X86_BUG( 1*32+11) /* "seamret_invd_vmcs" SEAMRET from P-SEAMLDR clears the current VMCS */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/seamldr.h b/arch/x86/include/asm/seamldr.h new file mode 100644 index 0000000000000..cfc6a1b1a440b --- /dev/null +++ b/arch/x86/include/asm/seamldr.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SEAMLDR_H +#define _ASM_X86_SEAMLDR_H + +#include <linux/types.h> + +/* + * This is the "SEAMLDR_INFO" data structure defined in the + * "SEAM Loader (SEAMLDR) Interface Specification". + * + * Must be aligned to a 256-byte boundary. 
+ */ +struct seamldr_info { + u32 version; + u32 attributes; + u32 vendor_id; + u32 build_date; + u16 build_num; + u16 minor_version; + u16 major_version; + u16 update_version; + u32 acm_x2apicid; + u32 num_remaining_updates; + u8 seam_info[128]; + u8 seam_ready; + u8 seam_debug; + u8 p_seam_ready; + u8 reserved[93]; +} __packed __aligned(256); + +static_assert(sizeof(struct seamldr_info) == 256); + +int seamldr_get_info(struct seamldr_info *seamldr_info); +int seamldr_install_module(const u8 *data, u32 data_len); +void seamldr_lock_module_update(void); +void seamldr_unlock_module_update(void); + +#endif /* _ASM_X86_SEAMLDR_H */ diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 049638e3da743..f20e91d7ac35b 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -4,6 +4,7 @@ #include <linux/bits.h> #include <linux/types.h> +#include <asm/shared/tdx_errno.h> #define TDX_HYPERCALL_STANDARD 0 diff --git a/arch/x86/kvm/vmx/tdx_errno.h b/arch/x86/include/asm/shared/tdx_errno.h index 6ff4672c41810..ee411b360e20d 100644 --- a/arch/x86/kvm/vmx/tdx_errno.h +++ b/arch/x86/include/asm/shared/tdx_errno.h @@ -1,8 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* architectural status code for SEAMCALL */ - -#ifndef __KVM_X86_TDX_ERRNO_H -#define __KVM_X86_TDX_ERRNO_H +#ifndef _ASM_X86_SHARED_TDX_ERRNO_H +#define _ASM_X86_SHARED_TDX_ERRNO_H #define TDX_SEAMCALL_STATUS_MASK 0xFFFFFFFF00000000ULL @@ -14,6 +13,7 @@ #define TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE 0x6000000500000000ULL #define TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE 0x6000000700000000ULL #define TDX_INTERRUPTED_RESUMABLE 0x8000000300000000ULL +#define TDX_SYS_BUSY 0x8000020200000000ULL #define TDX_OPERAND_INVALID 0xC000010000000000ULL #define TDX_OPERAND_BUSY 0x8000020000000000ULL #define TDX_PREVIOUS_TLB_EPOCH_BUSY 0x8000020100000000ULL @@ -37,4 +37,4 @@ #define TDX_OPERAND_ID_SEPT 0x92 #define TDX_OPERAND_ID_TD_EPOCH 0xa9 -#endif /* 
__KVM_X86_TDX_ERRNO_H */ +#endif /* _ASM_X86_SHARED_TDX_ERRNO_H */ diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index a149740b24e8b..e5a9cf656c072 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -32,6 +32,10 @@ #define TDX_SUCCESS 0ULL #define TDX_RND_NO_ENTROPY 0x8000020300000000ULL +/* Bit definitions of TDX_FEATURES0 metadata field */ +#define TDX_FEATURES0_TD_PRESERVING BIT_ULL(1) +#define TDX_FEATURES0_NO_RBP_MOD BIT_ULL(18) + #ifndef __ASSEMBLER__ #include <uapi/asm/mce.h> @@ -39,6 +43,12 @@ #include <linux/pgtable.h> /* + * TDX module and P-SEAMLDR version convention: "major.minor.update" + * (e.g., "1.5.08") with zero-padded two-digit update field. + */ +#define TDX_VERSION_FMT "%u.%u.%02u" + +/* * Used by the #VE exception handler to gather the #VE exception * info from the TDX module. This is a software only structure * and not part of the TDX module/VMM ABI. @@ -97,57 +107,16 @@ static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, #endif /* CONFIG_INTEL_TDX_GUEST && CONFIG_KVM_GUEST */ #ifdef CONFIG_INTEL_TDX_HOST -u64 __seamcall(u64 fn, struct tdx_module_args *args); -u64 __seamcall_ret(u64 fn, struct tdx_module_args *args); -u64 __seamcall_saved_ret(u64 fn, struct tdx_module_args *args); void tdx_init(void); +int tdx_cpu_enable(void); +const char *tdx_dump_mce_info(struct mce *m); +const struct tdx_sys_info *tdx_get_sysinfo(void); -#include <linux/preempt.h> -#include <asm/archrandom.h> -#include <asm/processor.h> - -typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args); - -static __always_inline u64 __seamcall_dirty_cache(sc_func_t func, u64 fn, - struct tdx_module_args *args) -{ - lockdep_assert_preemption_disabled(); - - /* - * SEAMCALLs are made to the TDX module and can generate dirty - * cachelines of TDX private memory. Mark cache state incoherent - * so that the cache can be flushed during kexec. 
- * - * This needs to be done before actually making the SEAMCALL, - * because kexec-ing CPU could send NMI to stop remote CPUs, - * in which case even disabling IRQ won't help here. - */ - this_cpu_write(cache_state_incoherent, true); - - return func(fn, args); -} - -static __always_inline u64 sc_retry(sc_func_t func, u64 fn, - struct tdx_module_args *args) +static inline bool tdx_supports_runtime_update(const struct tdx_sys_info *sysinfo) { - int retry = RDRAND_RETRY_LOOPS; - u64 ret; - - do { - preempt_disable(); - ret = __seamcall_dirty_cache(func, fn, args); - preempt_enable(); - } while (ret == TDX_RND_NO_ENTROPY && --retry); - - return ret; + return sysinfo->features.tdx_features0 & TDX_FEATURES0_TD_PRESERVING; } -#define seamcall(_fn, _args) sc_retry(__seamcall, (_fn), (_args)) -#define seamcall_ret(_fn, _args) sc_retry(__seamcall_ret, (_fn), (_args)) -#define seamcall_saved_ret(_fn, _args) sc_retry(__seamcall_saved_ret, (_fn), (_args)) -const char *tdx_dump_mce_info(struct mce *m); -const struct tdx_sys_info *tdx_get_sysinfo(void); - int tdx_guest_keyid_alloc(void); u32 tdx_get_nr_guest_keyids(void); void tdx_guest_keyid_free(unsigned int keyid); @@ -193,6 +162,8 @@ static inline int pg_level_to_tdx_sept_level(enum pg_level level) return level - 1; } +void tdx_sys_disable(void); + u64 tdh_vp_enter(struct tdx_vp *vp, struct tdx_module_args *args); u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page); u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2); @@ -224,13 +195,8 @@ static inline void tdx_init(void) { } static inline u32 tdx_get_nr_guest_keyids(void) { return 0; } static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; } +static inline void tdx_sys_disable(void) { } #endif /* CONFIG_INTEL_TDX_HOST */ -#ifdef CONFIG_KEXEC_CORE -void tdx_cpu_flush_cache_for_kexec(void); -#else 
-static inline void tdx_cpu_flush_cache_for_kexec(void) { } -#endif - #endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_TDX_H */ diff --git a/arch/x86/include/asm/tdx_global_metadata.h b/arch/x86/include/asm/tdx_global_metadata.h index 40689c8dc67eb..41150d546589c 100644 --- a/arch/x86/include/asm/tdx_global_metadata.h +++ b/arch/x86/include/asm/tdx_global_metadata.h @@ -40,6 +40,10 @@ struct tdx_sys_info_td_conf { u64 cpuid_config_values[128][2]; }; +struct tdx_sys_info_handoff { + u16 module_hv; +}; + struct tdx_sys_info { struct tdx_sys_info_version version; struct tdx_sys_info_features features; diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 37080382df548..49d8551d285d9 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -147,6 +147,7 @@ struct vmcs { #define VMX_BASIC_INOUT BIT_ULL(54) #define VMX_BASIC_TRUE_CTLS BIT_ULL(55) #define VMX_BASIC_NO_HW_ERROR_CODE_CC BIT_ULL(56) +#define VMX_BASIC_NO_SEAMRET_INVD_VMCS BIT_ULL(60) static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) { diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index cd796818d94d9..623d4474631a6 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -38,6 +38,7 @@ #include <linux/kdebug.h> #include <asm/cpu.h> #include <asm/reboot.h> +#include <asm/tdx.h> #include <asm/intel_pt.h> #include <asm/crash.h> #include <asm/cmdline.h> @@ -112,6 +113,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) crash_smp_send_stop(); + tdx_sys_disable(); x86_virt_emergency_disable_virtualization_cpu(); /* diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 0590d399d4f1f..c3f4a389992da 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -347,22 +347,6 @@ int machine_kexec_prepare(struct kimage *image) unsigned long reloc_end = (unsigned long)__relocate_kernel_end; int result; - /* - * Some early TDX-capable platforms have an erratum. 
A kernel - * partial write (a write transaction of less than cacheline - * lands at memory controller) to TDX private memory poisons that - * memory, and a subsequent read triggers a machine check. - * - * On those platforms the old kernel must reset TDX private - * memory before jumping to the new kernel otherwise the new - * kernel may see unexpected machine check. For simplicity - * just fail kexec/kdump on those platforms. - */ - if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE)) { - pr_info_once("Not allowed on platform with tdx_pw_mce bug\n"); - return -EOPNOTSUPP; - } - /* Setup the identity mapped 64bit page table */ result = init_pgtable(image, __pa(control_page)); if (result) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index cb50e23c39cab..67d3c5c8aca0c 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -440,16 +440,6 @@ void tdx_disable_virtualization_cpu(void) tdx_flush_vp(&arg); } local_irq_restore(flags); - - /* - * Flush cache now if kexec is possible: this is necessary to avoid - * having dirty private memory cachelines when the new kernel boots, - * but WBINVD is a relatively expensive operation and doing it during - * kexec can exacerbate races in native_stop_other_cpus(). Do it - * now, since this is a safe moment and there is going to be no more - * TDX activity on this CPU from this point on. 
- */ - tdx_cpu_flush_cache_for_kexec(); } #define TDX_SEAMCALL_RETRIES 10000 diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h index b5cd2ffb303e5..ac8323a68b163 100644 --- a/arch/x86/kvm/vmx/tdx.h +++ b/arch/x86/kvm/vmx/tdx.h @@ -3,7 +3,6 @@ #define __KVM_X86_VMX_TDX_H #include "tdx_arch.h" -#include "tdx_errno.h" #ifdef CONFIG_KVM_INTEL_TDX #include "common.h" diff --git a/arch/x86/virt/vmx/tdx/Makefile b/arch/x86/virt/vmx/tdx/Makefile index 90da47eb85eec..d1dbc5cc56978 100644 --- a/arch/x86/virt/vmx/tdx/Makefile +++ b/arch/x86/virt/vmx/tdx/Makefile @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += seamcall.o tdx.o +obj-y += seamcall.o seamldr.o tdx.o diff --git a/arch/x86/virt/vmx/tdx/seamcall_internal.h b/arch/x86/virt/vmx/tdx/seamcall_internal.h new file mode 100644 index 0000000000000..be5f446467dfa --- /dev/null +++ b/arch/x86/virt/vmx/tdx/seamcall_internal.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * SEAMCALL utilities for TDX host-side operations. + * + * Provides convenient wrappers around SEAMCALL assembly with retry logic, + * error reporting and cache coherency tracking. + * + * Copyright (C) 2021-2023 Intel Corporation + */ + +#ifndef _X86_VIRT_SEAMCALL_INTERNAL_H +#define _X86_VIRT_SEAMCALL_INTERNAL_H + +#include <linux/printk.h> +#include <linux/types.h> +#include <asm/archrandom.h> +#include <asm/processor.h> +#include <asm/tdx.h> + +u64 __seamcall(u64 fn, struct tdx_module_args *args); +u64 __seamcall_ret(u64 fn, struct tdx_module_args *args); +u64 __seamcall_saved_ret(u64 fn, struct tdx_module_args *args); + +typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args); + +static __always_inline u64 __seamcall_dirty_cache(sc_func_t func, u64 fn, + struct tdx_module_args *args) +{ + lockdep_assert_preemption_disabled(); + + /* + * SEAMCALLs are made to the TDX module and can generate dirty + * cachelines of TDX private memory. 
Mark cache state incoherent + * so that the cache can be flushed during kexec. + * + * This needs to be done before actually making the SEAMCALL, + * because kexec-ing CPU could send NMI to stop remote CPUs, + * in which case even disabling IRQ won't help here. + */ + this_cpu_write(cache_state_incoherent, true); + + return func(fn, args); +} + +static __always_inline u64 sc_retry(sc_func_t func, u64 fn, + struct tdx_module_args *args) +{ + int retry = RDRAND_RETRY_LOOPS; + u64 ret; + + do { + preempt_disable(); + ret = __seamcall_dirty_cache(func, fn, args); + preempt_enable(); + } while (ret == TDX_RND_NO_ENTROPY && --retry); + + return ret; +} + +#define seamcall(_fn, _args) sc_retry(__seamcall, (_fn), (_args)) +#define seamcall_ret(_fn, _args) sc_retry(__seamcall_ret, (_fn), (_args)) +#define seamcall_saved_ret(_fn, _args) sc_retry(__seamcall_saved_ret, (_fn), (_args)) + +typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args); + +static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args) +{ + pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err); +} + +static inline void seamcall_err_ret(u64 fn, u64 err, + struct tdx_module_args *args) +{ + seamcall_err(fn, err, args); + pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n", + args->rcx, args->rdx, args->r8); + pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n", + args->r9, args->r10, args->r11); +} + +static __always_inline int sc_retry_prerr(sc_func_t func, + sc_err_func_t err_func, + u64 fn, struct tdx_module_args *args) +{ + u64 sret = sc_retry(func, fn, args); + + if (sret == TDX_SUCCESS) + return 0; + + if (sret == TDX_SEAMCALL_VMFAILINVALID) + return -ENODEV; + + if (sret == TDX_SEAMCALL_GP) + return -EOPNOTSUPP; + + if (sret == TDX_SEAMCALL_UD) + return -EACCES; + + err_func(fn, sret, args); + return -EIO; +} + +#define seamcall_prerr(__fn, __args) \ + sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args)) + +#define seamcall_prerr_ret(__fn, __args) \ 
+ sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args)) + +#endif /* _X86_VIRT_SEAMCALL_INTERNAL_H */ diff --git a/arch/x86/virt/vmx/tdx/seamldr.c b/arch/x86/virt/vmx/tdx/seamldr.c new file mode 100644 index 0000000000000..b1137ca6150d4 --- /dev/null +++ b/arch/x86/virt/vmx/tdx/seamldr.c @@ -0,0 +1,368 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * P-SEAMLDR support for TDX module management features like runtime updates + * + * Copyright (C) 2025 Intel Corporation + */ +#define pr_fmt(fmt) "seamldr: " fmt + +#include <linux/bug.h> +#include <linux/mm.h> +#include <linux/nmi.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/stop_machine.h> + +#include <asm/cpufeature.h> +#include <asm/cpufeatures.h> +#include <asm/seamldr.h> + +#include "seamcall_internal.h" +#include "tdx.h" + +/* P-SEAMLDR SEAMCALL leaf function */ +#define P_SEAMLDR_INFO 0x8000000000000000 +#define P_SEAMLDR_INSTALL 0x8000000000000001 + +#define SEAMLDR_MAX_NR_MODULE_PAGES 496 +#define SEAMLDR_MAX_NR_SIG_PAGES 1 + +/* + * The seamldr_params "scenario" field specifies the operation mode: + * 0: Install TDX module from scratch (not used by kernel) + * 1: Update existing TDX module to a compatible version + */ +#define SEAMLDR_SCENARIO_UPDATE 1 + +/* + * This is the "SEAMLDR_PARAMS" data structure defined in the + * "SEAM Loader (SEAMLDR) Interface Specification". + * + * It is the in-memory ABI that the kernel passes to the P-SEAMLDR + * to update the TDX module. It breaks the TDX module image up in + * page-size pieces. + */ +struct seamldr_params { + u32 version; + u32 scenario; + u64 sigstruct_pages_pa_list[SEAMLDR_MAX_NR_SIG_PAGES]; + u8 reserved[104]; + u64 module_nr_pages; + u64 module_pages_pa_list[SEAMLDR_MAX_NR_MODULE_PAGES]; +} __packed; + +static_assert(sizeof(struct seamldr_params) == 4096); + +/* + * Serialize P-SEAMLDR calls since the hardware only allows a single CPU to + * interact with P-SEAMLDR simultaneously. 
Use raw version as the calls can + * be made with interrupts disabled, where plain spinlocks are prohibited in + * PREEMPT_RT kernels as they become sleeping locks. + */ +static DEFINE_RAW_SPINLOCK(seamldr_lock); + +static int seamldr_call(u64 fn, struct tdx_module_args *args) +{ + /* + * With this bug, P-SEAMLDR calls corrupt the VMCS + * pointer and must be avoided. This path should be + * unreachable since sysfs hides the ABIs. + */ + if (boot_cpu_has_bug(X86_BUG_SEAMRET_INVD_VMCS)) { + WARN_ON(1); + return -EINVAL; + } + + guard(raw_spinlock)(&seamldr_lock); + return seamcall_prerr(fn, args); +} + +int seamldr_get_info(struct seamldr_info *seamldr_info) +{ + struct tdx_module_args args = {}; + + /* + * Use slow_virt_to_phys() since @seamldr_info may be allocated on + * the stack. + */ + args.rcx = slow_virt_to_phys(seamldr_info); + return seamldr_call(P_SEAMLDR_INFO, &args); +} +EXPORT_SYMBOL_FOR_MODULES(seamldr_get_info, "tdx-host"); + +/* Call into P-SEAMLDR to install a TDX module update */ +static int seamldr_install(const struct seamldr_params *params) +{ + struct tdx_module_args args = {}; + + args.rcx = __pa(params); + return seamldr_call(P_SEAMLDR_INSTALL, &args); +} + +#define TDX_IMAGE_VERSION_2 0x200 + +/* First page of the on-disk module update image: */ +struct tdx_image_header { + u16 version; + u16 checksum; + u8 signature[8]; + u32 sigstruct_nr_pages; + u32 module_nr_pages; + u8 reserved[4076]; +} __packed; + +#define TDX_IMAGE_HEADER_SIZE sizeof(struct tdx_image_header) +static_assert(TDX_IMAGE_HEADER_SIZE == 4096); + +/* + * Intel TDX module update ABI structure. aka. "TDX module blob". + * This is the on-disk format that fw_upload lands in a kernel + * buffer. + * + * @payload contains sigstruct pages followed by module pages. + */ +struct tdx_image { + struct tdx_image_header header; + u8 payload[]; +}; + +/* + * Given a vmalloc() allocation, write all of the backing physical + * addresses to pa_list[]. 
Caller guarantees that the array is big + * enough. + */ +static void populate_pa_list(u64 *pa_list, const u8 *vmalloc_addr, u32 vmalloc_len_pages) +{ + int i; + + for (i = 0; i < vmalloc_len_pages; i++) { + unsigned long offset = i * PAGE_SIZE; + unsigned long pfn = vmalloc_to_pfn(&vmalloc_addr[offset]); + + pa_list[i] = pfn << PAGE_SHIFT; + } +} + +static void populate_seamldr_params(struct seamldr_params *params, + const u8 *sig, u32 sig_nr_pages, + const u8 *mod, u32 mod_nr_pages) +{ + params->version = 0; + params->scenario = SEAMLDR_SCENARIO_UPDATE; + params->module_nr_pages = mod_nr_pages; + + populate_pa_list(params->sigstruct_pages_pa_list, sig, sig_nr_pages); + populate_pa_list(params->module_pages_pa_list, mod, mod_nr_pages); +} + +/* + * @image points to a vmalloc()'d 'struct tdx_image'. Transform + * it into @params which is the P-SEAMLDR ABI format. + */ +static int init_seamldr_params(struct seamldr_params *params, + const struct tdx_image *image, + u32 image_len) +{ + const struct tdx_image_header *header = &image->header; + + u32 sigstruct_len = header->sigstruct_nr_pages * PAGE_SIZE; + u32 module_len = header->module_nr_pages * PAGE_SIZE; + + u8 *header_start = (u8 *)header; + u8 *header_end = header_start + TDX_IMAGE_HEADER_SIZE; + + u8 *sigstruct_start = header_end; + u8 *sigstruct_end = sigstruct_start + sigstruct_len; + + u8 *module_start = sigstruct_end; + + /* Check the calculated payload size against the image size. */ + if (TDX_IMAGE_HEADER_SIZE + sigstruct_len + module_len != image_len) + return -EINVAL; + + /* Reject unsupported tdx_image ABI versions. 
*/ + if (header->version != TDX_IMAGE_VERSION_2) + return -EINVAL; + + if (header->sigstruct_nr_pages > SEAMLDR_MAX_NR_SIG_PAGES || + header->module_nr_pages > SEAMLDR_MAX_NR_MODULE_PAGES) + return -EINVAL; + + if (memcmp(header->signature, "TDX-BLOB", sizeof(header->signature))) + return -EINVAL; + + if (memchr_inv(header->reserved, 0, sizeof(header->reserved))) + return -EINVAL; + + populate_seamldr_params(params, sigstruct_start, header->sigstruct_nr_pages, + module_start, header->module_nr_pages); + return 0; +} + +/* + * During a TDX module update, all CPUs start from MODULE_UPDATE_START and + * progress to MODULE_UPDATE_DONE. Each state is associated with certain + * work. For some states, just one CPU needs to perform the work, while + * other CPUs just wait during those states. + */ +enum module_update_state { + MODULE_UPDATE_START, + MODULE_UPDATE_SHUTDOWN, + MODULE_UPDATE_CPU_INSTALL, + MODULE_UPDATE_CPU_INIT, + MODULE_UPDATE_RUN_UPDATE, + MODULE_UPDATE_DONE, +}; + +static struct update_ctrl { + enum module_update_state state; + int num_ack; + int num_failed; + /* + * Protect update_ctrl. Raw spinlock as it will be acquired from + * interrupt-disabled contexts. + */ + raw_spinlock_t lock; +} update_ctrl; + +/* Called with ctrl->lock held or during initialization. */ +static void __set_target_state(struct update_ctrl *ctrl, + enum module_update_state newstate) +{ + /* Reset ack counter. */ + ctrl->num_ack = 0; + ctrl->state = newstate; +} + +/* Last one to ack a state moves to the next state. 
*/ +static void ack_state(struct update_ctrl *ctrl, int result) +{ + raw_spin_lock(&ctrl->lock); + + ctrl->num_failed += !!result; + ctrl->num_ack++; + if (ctrl->num_ack == num_online_cpus() && !ctrl->num_failed) + __set_target_state(ctrl, ctrl->state + 1); + + raw_spin_unlock(&ctrl->lock); +} + +static void init_state(struct update_ctrl *ctrl) +{ + raw_spin_lock_init(&ctrl->lock); + __set_target_state(ctrl, MODULE_UPDATE_START + 1); + ctrl->num_failed = 0; +} + +/* + * See multi_cpu_stop() from where this multi-cpu state-machine was + * adopted. + */ +static int do_seamldr_install_module(void *seamldr_params) +{ + enum module_update_state curstate = MODULE_UPDATE_START; + enum module_update_state newstate; + bool is_lead_cpu = false; + int ret = 0; + + /* + * Some steps must be run on exactly one CPU. Pick a "lead" CPU to + * execute those steps. Use CPU 0 because it is always online. + */ + if (smp_processor_id() == 0) + is_lead_cpu = true; + + do { + newstate = READ_ONCE(update_ctrl.state); + + if (curstate == newstate) { + cpu_relax(); + continue; + } + + curstate = newstate; + switch (curstate) { + case MODULE_UPDATE_SHUTDOWN: + if (is_lead_cpu) + ret = tdx_module_shutdown(); + break; + case MODULE_UPDATE_CPU_INSTALL: + ret = seamldr_install(seamldr_params); + break; + case MODULE_UPDATE_CPU_INIT: + ret = tdx_cpu_enable(); + break; + case MODULE_UPDATE_RUN_UPDATE: + if (is_lead_cpu) + ret = tdx_module_run_update(); + break; + default: + break; + } + + ack_state(&update_ctrl, ret); + } while (curstate != MODULE_UPDATE_DONE && + !READ_ONCE(update_ctrl.num_failed)); + + return ret; +} + +/** + * seamldr_install_module - Install a new TDX module. + * @data: Pointer to the TDX module image. + * @data_len: Size of the TDX module image. + * + * Returns 0 on success, negative error code on failure. 
+ */ +int seamldr_install_module(const u8 *data, u32 data_len) +{ + struct seamldr_params *params; + const struct tdx_image *image; + int ret; + + /* + * init_seamldr_params() reads the header early. + * Ensure there is enough data to do at least that. + */ + if (data_len < TDX_IMAGE_HEADER_SIZE) + return -EINVAL; + + image = (const struct tdx_image *)data; + + params = kzalloc_obj(*params); + if (!params) + return -ENOMEM; + + /* Populate 'params' from 'image'. */ + ret = init_seamldr_params(params, image, data_len); + if (ret) + goto out; + + /* Ensure a stable set of online CPUs for the update process. */ + cpus_read_lock(); + init_state(&update_ctrl); + ret = stop_machine_cpuslocked(do_seamldr_install_module, params, + cpu_online_mask); + cpus_read_unlock(); + +out: + kfree(params); + return ret; +} +EXPORT_SYMBOL_FOR_MODULES(seamldr_install_module, "tdx-host"); + +/* + * stop_machine() does not interrupt preemption-disabled regions. + * Simply disabling preempt prevents updates. + */ +void seamldr_lock_module_update(void) +{ + preempt_disable(); +} +EXPORT_SYMBOL_FOR_MODULES(seamldr_lock_module_update, "tdx-host"); + +void seamldr_unlock_module_update(void) +{ + preempt_enable(); +} +EXPORT_SYMBOL_FOR_MODULES(seamldr_unlock_module_update, "tdx-host"); diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index cb9b3210ab710..b15269b5941dc 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -37,12 +37,23 @@ #include <asm/msr.h> #include <asm/cpufeature.h> #include <asm/tdx.h> +#include <asm/shared/tdx_errno.h> #include <asm/cpu_device_id.h> #include <asm/processor.h> #include <asm/mce.h> #include <asm/virt.h> +#include <asm/vmx.h> + +#include "seamcall_internal.h" #include "tdx.h" +struct tdx_module_state { + bool initialized; + bool sysinit_done; + int sysinit_ret; +}; + +static struct tdx_module_state tdx_module_state; static u32 tdx_global_keyid __ro_after_init; static u32 tdx_guest_keyid_start __ro_after_init; 
static u32 tdx_nr_guest_keyids __ro_after_init; @@ -56,53 +67,9 @@ static struct tdmr_info_list tdx_tdmr_list; /* All TDX-usable memory regions. Protected by mem_hotplug_lock. */ static LIST_HEAD(tdx_memlist); -static struct tdx_sys_info tdx_sysinfo __ro_after_init; -static bool tdx_module_initialized __ro_after_init; - -typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args); - -static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args) -{ - pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err); -} - -static inline void seamcall_err_ret(u64 fn, u64 err, - struct tdx_module_args *args) -{ - seamcall_err(fn, err, args); - pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n", - args->rcx, args->rdx, args->r8); - pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n", - args->r9, args->r10, args->r11); -} +static struct tdx_sys_info tdx_sysinfo; -static __always_inline int sc_retry_prerr(sc_func_t func, - sc_err_func_t err_func, - u64 fn, struct tdx_module_args *args) -{ - u64 sret = sc_retry(func, fn, args); - - if (sret == TDX_SUCCESS) - return 0; - - if (sret == TDX_SEAMCALL_VMFAILINVALID) - return -ENODEV; - - if (sret == TDX_SEAMCALL_GP) - return -EOPNOTSUPP; - - if (sret == TDX_SEAMCALL_UD) - return -EACCES; - - err_func(fn, sret, args); - return -EIO; -} - -#define seamcall_prerr(__fn, __args) \ - sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args)) - -#define seamcall_prerr_ret(__fn, __args) \ - sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args)) +static DEFINE_RAW_SPINLOCK(sysinit_lock); /* * Do the module global initialization once and return its result. 
@@ -111,31 +78,34 @@ static __always_inline int sc_retry_prerr(sc_func_t func, static int try_init_module_global(void) { struct tdx_module_args args = {}; - static DEFINE_RAW_SPINLOCK(sysinit_lock); - static bool sysinit_done; - static int sysinit_ret; + int ret; raw_spin_lock(&sysinit_lock); - if (sysinit_done) + /* Return the "cached" return code. */ + if (tdx_module_state.sysinit_done) { + ret = tdx_module_state.sysinit_ret; goto out; + } /* RCX is module attributes and all bits are reserved */ args.rcx = 0; - sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args); + ret = seamcall_prerr(TDH_SYS_INIT, &args); /* * The first SEAMCALL also detects the TDX module, thus * it can fail due to the TDX module is not loaded. * Dump message to let the user know. */ - if (sysinit_ret == -ENODEV) + if (ret == -ENODEV) pr_err("module not loaded\n"); - sysinit_done = true; + /* Save the return code for later callers. */ + tdx_module_state.sysinit_done = true; + tdx_module_state.sysinit_ret = ret; out: raw_spin_unlock(&sysinit_lock); - return sysinit_ret; + return ret; } /** @@ -143,7 +113,7 @@ out: * (and TDX module global initialization SEAMCALL if not done) on local cpu to * make this cpu be ready to run any other SEAMCALLs. */ -static int tdx_cpu_enable(void) +int tdx_cpu_enable(void) { struct tdx_module_args args = {}; int ret; @@ -184,6 +154,17 @@ static int tdx_online_cpu(unsigned int cpu) return ret; } +static void tdx_cpu_flush_cache(void) +{ + lockdep_assert_preemption_disabled(); + + if (!this_cpu_read(cache_state_incoherent)) + return; + + wbinvd(); + this_cpu_write(cache_state_incoherent, false); +} + static int tdx_offline_cpu(unsigned int cpu) { int i; @@ -220,17 +201,34 @@ static int tdx_offline_cpu(unsigned int cpu) return -EBUSY; done: + /* + * Flush cache on the CPU going offline to ensure no dirty + * cachelines of TDX private memory remain. This may be + * redundant with WBINVD done elsewhere during CPU offline + * (e.g. 
hlt_play_dead()), but do it explicitly for safety. + */ + tdx_cpu_flush_cache(); x86_virt_put_ref(X86_FEATURE_VMX); return 0; } static void tdx_shutdown_cpu(void *ign) { + /* + * Flush cache in preparation for kexec - this is necessary to avoid + * having dirty private memory cachelines when the new kernel boots, + * but WBINVD is a relatively expensive operation and doing it during + * kexec can exacerbate races in native_stop_other_cpus(). Do it + * now, since this is a safe moment and there is going to be no more + * TDX activity on this CPU from this point on. + */ + tdx_cpu_flush_cache(); x86_virt_put_ref(X86_FEATURE_VMX); } static void tdx_shutdown(void *ign) { + tdx_sys_disable(); on_each_cpu(tdx_shutdown_cpu, NULL, 1); } @@ -330,7 +328,7 @@ err: return ret; } -static __init int read_sys_metadata_field(u64 field_id, u64 *data) +static int read_sys_metadata_field(u64 field_id, u64 *data) { struct tdx_module_args args = {}; int ret; @@ -1270,12 +1268,70 @@ static __init int tdx_enable(void) register_syscore(&tdx_syscore); - tdx_module_initialized = true; + tdx_module_state.initialized = true; pr_info("TDX-Module initialized\n"); return 0; } subsys_initcall(tdx_enable); +int tdx_module_shutdown(void) +{ + struct tdx_sys_info_handoff handoff = {}; + struct tdx_module_args args = {}; + int ret; + int cpu; + + ret = get_tdx_sys_info_handoff(&handoff); + /* + * Handoff information is required for proper + * shutdown. Refuse to shut down without it. + */ + if (ret) + return ret; + + /* + * Use the module's handoff version as it is the highest the + * module can produce and most likely supported by newer modules. + */ + args.rcx = handoff.module_hv; + + ret = seamcall_prerr(TDH_SYS_SHUTDOWN, &args); + if (ret) + return ret; + + /* + * Clear global and per-CPU initialization flags so the new module + * can be fully re-initialized after a successful update. + * + * No locks needed as no concurrent accesses can occur here. 
+ */ + memset(&tdx_module_state, 0, sizeof(tdx_module_state)); + for_each_possible_cpu(cpu) + per_cpu(tdx_lp_initialized, cpu) = false; + + return 0; +} + +int tdx_module_run_update(void) +{ + struct tdx_module_args args = {}; + int ret; + + ret = seamcall_prerr(TDH_SYS_UPDATE, &args); + if (ret) + return ret; + + ret = get_tdx_sys_info_version(&tdx_sysinfo.version); + /* + * Only fails if there is something unexpected + * and severely wrong with the module. + */ + WARN_ON_ONCE(ret); + + tdx_module_state.initialized = true; + return 0; +} + static bool is_pamt_page(unsigned long phys) { struct tdmr_info_list *tdmr_list = &tdx_tdmr_list; @@ -1453,6 +1509,8 @@ static struct notifier_block tdx_memory_nb = { static void __init check_tdx_erratum(void) { + u64 basic_msr; + /* * These CPUs have an erratum. A partial write from non-TD * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX @@ -1464,6 +1522,14 @@ static void __init check_tdx_erratum(void) case INTEL_EMERALDRAPIDS_X: setup_force_cpu_bug(X86_BUG_TDX_PW_MCE); } + + /* + * Some TDX-capable CPUs have an erratum where the current VMCS is + * cleared after calling into P-SEAMLDR. 
+ */ + rdmsrq(MSR_IA32_VMX_BASIC, basic_msr); + if (!(basic_msr & VMX_BASIC_NO_SEAMRET_INVD_VMCS)) + setup_force_cpu_bug(X86_BUG_SEAMRET_INVD_VMCS); } void __init tdx_init(void) @@ -1525,12 +1591,12 @@ void __init tdx_init(void) const struct tdx_sys_info *tdx_get_sysinfo(void) { - if (!tdx_module_initialized) + if (!tdx_module_state.initialized) return NULL; return (const struct tdx_sys_info *)&tdx_sysinfo; } -EXPORT_SYMBOL_FOR_KVM(tdx_get_sysinfo); +EXPORT_SYMBOL_FOR_MODULES(tdx_get_sysinfo, "kvm-intel,tdx-host"); u32 tdx_get_nr_guest_keyids(void) { @@ -1921,21 +1987,32 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page) } EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_hkid); -#ifdef CONFIG_KEXEC_CORE -void tdx_cpu_flush_cache_for_kexec(void) +void tdx_sys_disable(void) { - lockdep_assert_preemption_disabled(); + struct tdx_module_args args = {}; + u64 ret; - if (!this_cpu_read(cache_state_incoherent)) - return; + /* + * Don't loop forever. + * + * - TDX_INTERRUPTED_RESUMABLE guarantees forward progress between + * calls. + * + * - TDX_SYS_BUSY could be returned due to contention with other + * TDH.SYS.* SEAMCALLs, but will lock out *new* TDH.SYS.* SEAMCALLs, + * so that SYS.DISABLE can eventually make progress. + * + * This is a 'destructive' SEAMCALL, in that no other SEAMCALL can be + * run after this until a full reinitialization is done. + */ + do { + ret = seamcall(TDH_SYS_DISABLE, &args); + } while (ret == TDX_INTERRUPTED_RESUMABLE || ret == TDX_SYS_BUSY); /* - * Private memory cachelines need to be clean at the time of - * kexec. Write them back now, as the caller promises that - * there should be no more SEAMCALLs on this CPU. + * Print SEAMCALL failures, but not SW-defined error codes + * (SEAMCALL faulted with #GP/#UD, TDX not supported). 
*/ - wbinvd(); - this_cpu_write(cache_state_incoherent, false); + if (ret && (ret & TDX_SW_ERROR) != TDX_SW_ERROR) + pr_err("TDH.SYS.DISABLE failed: 0x%016llx\n", ret); } -EXPORT_SYMBOL_FOR_KVM(tdx_cpu_flush_cache_for_kexec); -#endif diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index dde219c823b41..bdfd0e1e337ac 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -46,6 +46,9 @@ #define TDH_PHYMEM_PAGE_WBINVD 41 #define TDH_VP_WR 43 #define TDH_SYS_CONFIG 45 +#define TDH_SYS_SHUTDOWN 52 +#define TDH_SYS_UPDATE 53 +#define TDH_SYS_DISABLE 69 /* * SEAMCALL leaf: @@ -84,9 +87,6 @@ struct tdmr_info { DECLARE_FLEX_ARRAY(struct tdmr_reserved_area, reserved_areas); } __packed __aligned(TDMR_INFO_ALIGNMENT); -/* Bit definitions of TDX_FEATURES0 metadata field */ -#define TDX_FEATURES0_NO_RBP_MOD BIT(18) - /* * Do not put any hardware-defined TDX structure representations below * this comment! @@ -110,4 +110,7 @@ struct tdmr_info_list { int max_tdmrs; /* How many 'tdmr_info's are allocated */ }; +int tdx_module_shutdown(void); +int tdx_module_run_update(void); + #endif diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c index c7db393a9cfb1..e49c300f23d43 100644 --- a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c +++ b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c @@ -7,7 +7,7 @@ * Include this file to other C file instead. 
*/ -static __init int get_tdx_sys_info_version(struct tdx_sys_info_version *sysinfo_version) +static int get_tdx_sys_info_version(struct tdx_sys_info_version *sysinfo_version) { int ret = 0; u64 val; @@ -100,13 +100,26 @@ static __init int get_tdx_sys_info_td_conf(struct tdx_sys_info_td_conf *sysinfo_ return ret; } +static int get_tdx_sys_info_handoff(struct tdx_sys_info_handoff *sysinfo_handoff) +{ + int ret; + u64 val; + + ret = read_sys_metadata_field(0x8900000100000000, &val); + if (ret) + return ret; + + sysinfo_handoff->module_hv = val; + return 0; +} + static __init int get_tdx_sys_info(struct tdx_sys_info *sysinfo) { int ret = 0; ret = ret ?: get_tdx_sys_info_version(&sysinfo->version); - pr_info("Module version: %u.%u.%02u\n", + pr_info("Module version: " TDX_VERSION_FMT "\n", sysinfo->version.major_version, sysinfo->version.minor_version, sysinfo->version.update_version); |
