diff options
| author | Ard Biesheuvel <ardb@kernel.org> | 2026-04-22 19:16:59 +0200 |
|---|---|---|
| committer | Eric Biggers <ebiggers@kernel.org> | 2026-05-28 13:14:21 -0700 |
| commit | 4d3c5cbfe2b55c12ad1d866ddf0928d13d775816 (patch) | |
| tree | e6aa3b81f1d2282ea60200b3c326602341d96ac4 /lib | |
| parent | a967b1f51c83b65372a93d652a888e3addd9c5a3 (diff) | |
| download | linux-next-history-4d3c5cbfe2b55c12ad1d866ddf0928d13d775816.tar.gz | |
xor/arm64: Use shared NEON intrinsics implementation from 32-bit ARM
Tweak the arm64 code so that the pure NEON intrinsics implementation of
XOR is shared between arm64 and ARM. While at it, rename the arm64
specific piece xor-eor3.c to reflect that only the version based on the
EOR3 instruction is kept there.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://patch.msgid.link/20260422171655.3437334-13-ardb+git@google.com
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/raid/xor/Makefile | 7 | ||||
| -rw-r--r-- | lib/raid/xor/arm64/xor-eor3.c | 146 | ||||
| -rw-r--r-- | lib/raid/xor/arm64/xor-neon.c | 312 | ||||
| -rw-r--r-- | lib/raid/xor/xor-neon.c | 4 |
4 files changed, 154 insertions, 315 deletions
diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile index d78400f2427ab..e8ecec3c09f9f 100644 --- a/lib/raid/xor/Makefile +++ b/lib/raid/xor/Makefile @@ -19,7 +19,8 @@ xor-$(CONFIG_ARM) += arm/xor.o ifeq ($(CONFIG_ARM),y) xor-$(CONFIG_KERNEL_MODE_NEON) += xor-neon.o arm/xor-neon-glue.o endif -xor-$(CONFIG_ARM64) += arm64/xor-neon.o arm64/xor-neon-glue.o +xor-$(CONFIG_ARM64) += xor-neon.o arm64/xor-eor3.o \ + arm64/xor-neon-glue.o xor-$(CONFIG_CPU_HAS_LSX) += loongarch/xor_simd.o xor-$(CONFIG_CPU_HAS_LSX) += loongarch/xor_simd_glue.o xor-$(CONFIG_ALTIVEC) += powerpc/xor_vmx.o powerpc/xor_vmx_glue.o @@ -34,8 +35,8 @@ obj-y += tests/ CFLAGS_xor-neon.o += $(CC_FLAGS_FPU) -I$(src)/$(SRCARCH) CFLAGS_REMOVE_xor-neon.o += $(CC_FLAGS_NO_FPU) -CFLAGS_arm64/xor-neon.o += $(CC_FLAGS_FPU) -CFLAGS_REMOVE_arm64/xor-neon.o += $(CC_FLAGS_NO_FPU) +CFLAGS_arm64/xor-eor3.o += $(CC_FLAGS_FPU) +CFLAGS_REMOVE_arm64/xor-eor3.o += $(CC_FLAGS_NO_FPU) CFLAGS_powerpc/xor_vmx.o += -mhard-float -maltivec \ $(call cc-option,-mabi=altivec) \ diff --git a/lib/raid/xor/arm64/xor-eor3.c b/lib/raid/xor/arm64/xor-eor3.c new file mode 100644 index 0000000000000..e44016c363f1c --- /dev/null +++ b/lib/raid/xor/arm64/xor-eor3.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/cache.h> +#include <asm/neon-intrinsics.h> +#include "xor_impl.h" +#include "xor_arch.h" +#include "xor-neon.h" + +extern void __xor_eor3_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2); + +static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) +{ + uint64x2_t res; + + asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n" + "eor3 %0.16b, %1.16b, %2.16b, %3.16b" + : "=w"(res) : "w"(p), "w"(q), "w"(r)); + return res; +} + +static void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) +{ + uint64_t *dp1 = (uint64_t *)p1; + uint64_t *dp2 = (uint64_t *)p2; + uint64_t *dp3 = (uint64_t *)p3; + + register uint64x2_t v0, v1, v2, v3; + long lines = bytes / (sizeof(uint64x2_t) * 4); + + do { + /* p1 ^= p2 ^ p3 */ + v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), + vld1q_u64(dp3 + 0)); + v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), + vld1q_u64(dp3 + 2)); + v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), + vld1q_u64(dp3 + 4)); + v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), + vld1q_u64(dp3 + 6)); + + /* store */ + vst1q_u64(dp1 + 0, v0); + vst1q_u64(dp1 + 2, v1); + vst1q_u64(dp1 + 4, v2); + vst1q_u64(dp1 + 6, v3); + + dp1 += 8; + dp2 += 8; + dp3 += 8; + } while (--lines > 0); +} + +static void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) +{ + uint64_t *dp1 = (uint64_t *)p1; + uint64_t *dp2 = (uint64_t *)p2; + uint64_t *dp3 = (uint64_t *)p3; + uint64_t *dp4 = (uint64_t *)p4; + + register uint64x2_t v0, v1, v2, v3; + long lines = bytes / (sizeof(uint64x2_t) * 4); + + do { + /* p1 ^= p2 ^ p3 */ + v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), + vld1q_u64(dp3 + 0)); + v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), + vld1q_u64(dp3 + 2)); + v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), + vld1q_u64(dp3 + 4)); + v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), + vld1q_u64(dp3 + 6)); + + /* p1 ^= p4 */ + v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); + v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); + v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); + v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); + + /* store */ + vst1q_u64(dp1 + 0, v0); + vst1q_u64(dp1 + 2, v1); + vst1q_u64(dp1 + 4, v2); + vst1q_u64(dp1 + 6, v3); + + dp1 += 8; + dp2 += 8; + dp3 += 8; + dp4 += 8; + } while (--lines > 0); +} + +static void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) +{ + uint64_t *dp1 = (uint64_t *)p1; + uint64_t *dp2 = (uint64_t *)p2; + uint64_t *dp3 = (uint64_t *)p3; + uint64_t *dp4 = (uint64_t *)p4; + uint64_t *dp5 = (uint64_t *)p5; + + register uint64x2_t v0, v1, v2, v3; + long lines = bytes / (sizeof(uint64x2_t) * 4); + + do { + /* p1 ^= p2 ^ p3 */ + v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), + vld1q_u64(dp3 + 0)); + v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), + vld1q_u64(dp3 + 2)); + v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), + vld1q_u64(dp3 + 4)); + v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), + vld1q_u64(dp3 + 6)); + + /* p1 ^= p4 ^ p5 */ + v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0)); + v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2)); + v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4)); + v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6)); + + /* store */ + vst1q_u64(dp1 + 0, v0); + vst1q_u64(dp1 + 2, v1); + vst1q_u64(dp1 + 4, v2); + vst1q_u64(dp1 + 6, v3); + + dp1 += 8; + dp2 += 8; + dp3 += 8; + dp4 += 8; + dp5 += 8; + } while (--lines > 0); +} + +__DO_XOR_BLOCKS(eor3_inner, __xor_eor3_2, __xor_eor3_3, __xor_eor3_4, + __xor_eor3_5); diff --git a/lib/raid/xor/arm64/xor-neon.c b/lib/raid/xor/arm64/xor-neon.c deleted file mode 100644 index 97ef3cb924968..0000000000000 --- a/lib/raid/xor/arm64/xor-neon.c +++ /dev/null @@ -1,312 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Authors: Jackie Liu <liuyun01@kylinos.cn> - * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd. - */ - -#include <linux/cache.h> -#include <asm/neon-intrinsics.h> -#include "xor_impl.h" -#include "xor_arch.h" -#include "xor-neon.h" - -static void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2) -{ - uint64_t *dp1 = (uint64_t *)p1; - uint64_t *dp2 = (uint64_t *)p2; - - register uint64x2_t v0, v1, v2, v3; - long lines = bytes / (sizeof(uint64x2_t) * 4); - - do { - /* p1 ^= p2 */ - v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); - v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); - v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); - v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); - - /* store */ - vst1q_u64(dp1 + 0, v0); - vst1q_u64(dp1 + 2, v1); - vst1q_u64(dp1 + 4, v2); - vst1q_u64(dp1 + 6, v3); - - dp1 += 8; - dp2 += 8; - } while (--lines > 0); -} - -static void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3) -{ - uint64_t *dp1 = (uint64_t *)p1; - uint64_t *dp2 = (uint64_t *)p2; - uint64_t *dp3 = (uint64_t *)p3; - - register uint64x2_t v0, v1, v2, v3; - long lines = bytes / (sizeof(uint64x2_t) * 4); - - do { - /* p1 ^= p2 */ - v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); - v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); - v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); - v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); - - /* p1 ^= p3 */ - v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); - v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); - v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); - v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); - - /* store */ - vst1q_u64(dp1 + 0, v0); - vst1q_u64(dp1 + 2, v1); - vst1q_u64(dp1 + 4, v2); - vst1q_u64(dp1 + 6, v3); - - dp1 += 8; - dp2 += 8; - dp3 += 8; - } while (--lines > 0); -} - -static void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4) -{ - uint64_t *dp1 = (uint64_t *)p1; - uint64_t *dp2 = (uint64_t *)p2; - uint64_t *dp3 = (uint64_t *)p3; - uint64_t *dp4 = (uint64_t *)p4; - - register uint64x2_t v0, v1, v2, v3; - long lines = bytes / (sizeof(uint64x2_t) * 4); - - do { - /* p1 ^= p2 */ - v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); - v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); - v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); - v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); - - /* p1 ^= p3 */ - v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); - v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); - v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); - v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); - - /* p1 ^= p4 */ - v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); - v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); - v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); - v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); - - /* store */ - vst1q_u64(dp1 + 0, v0); - vst1q_u64(dp1 + 2, v1); - vst1q_u64(dp1 + 4, v2); - vst1q_u64(dp1 + 6, v3); - - dp1 += 8; - dp2 += 8; - dp3 += 8; - dp4 += 8; - } while (--lines > 0); -} - -static void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4, - const unsigned long * __restrict p5) -{ - uint64_t *dp1 = (uint64_t *)p1; - uint64_t *dp2 = (uint64_t *)p2; - uint64_t *dp3 = (uint64_t *)p3; - uint64_t *dp4 = (uint64_t *)p4; - uint64_t *dp5 = (uint64_t *)p5; - - register uint64x2_t v0, v1, v2, v3; - long lines = bytes / (sizeof(uint64x2_t) * 4); - - do { - /* p1 ^= p2 */ - v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); - v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); - v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); - v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); - - /* p1 ^= p3 */ - v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); - v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); - v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); - v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); - - /* p1 ^= p4 */ - v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); - v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); - v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); - v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); - - /* p1 ^= p5 */ - v0 = veorq_u64(v0, vld1q_u64(dp5 + 0)); - v1 = veorq_u64(v1, vld1q_u64(dp5 + 2)); - v2 = veorq_u64(v2, vld1q_u64(dp5 + 4)); - v3 = veorq_u64(v3, vld1q_u64(dp5 + 6)); - - /* store */ - vst1q_u64(dp1 + 0, v0); - vst1q_u64(dp1 + 2, v1); - vst1q_u64(dp1 + 4, v2); - vst1q_u64(dp1 + 6, v3); - - dp1 += 8; - dp2 += 8; - dp3 += 8; - dp4 += 8; - dp5 += 8; - } while (--lines > 0); -} - -__DO_XOR_BLOCKS(neon_inner, __xor_neon_2, __xor_neon_3, __xor_neon_4, - __xor_neon_5); - -static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) -{ - uint64x2_t res; - - asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n" - "eor3 %0.16b, %1.16b, %2.16b, %3.16b" - : "=w"(res) : "w"(p), "w"(q), "w"(r)); - return res; -} - -static void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3) -{ - uint64_t *dp1 = (uint64_t *)p1; - uint64_t *dp2 = (uint64_t *)p2; - uint64_t *dp3 = (uint64_t *)p3; - - register uint64x2_t v0, v1, v2, v3; - long lines = bytes / (sizeof(uint64x2_t) * 4); - - do { - /* p1 ^= p2 ^ p3 */ - v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), - vld1q_u64(dp3 + 0)); - v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), - vld1q_u64(dp3 + 2)); - v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), - vld1q_u64(dp3 + 4)); - v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), - vld1q_u64(dp3 + 6)); - - /* store */ - vst1q_u64(dp1 + 0, v0); - vst1q_u64(dp1 + 2, v1); - vst1q_u64(dp1 + 4, v2); - vst1q_u64(dp1 + 6, v3); - - dp1 += 8; - dp2 += 8; - dp3 += 8; - } while (--lines > 0); -} - -static void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4) -{ - uint64_t *dp1 = (uint64_t *)p1; - uint64_t *dp2 = (uint64_t *)p2; - uint64_t *dp3 = (uint64_t *)p3; - uint64_t *dp4 = (uint64_t *)p4; - - register uint64x2_t v0, v1, v2, v3; - long lines = bytes / (sizeof(uint64x2_t) * 4); - - do { - /* p1 ^= p2 ^ p3 */ - v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), - vld1q_u64(dp3 + 0)); - v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), - vld1q_u64(dp3 + 2)); - v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), - vld1q_u64(dp3 + 4)); - v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), - vld1q_u64(dp3 + 6)); - - /* p1 ^= p4 */ - v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); - v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); - v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); - v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); - - /* store */ - vst1q_u64(dp1 + 0, v0); - vst1q_u64(dp1 + 2, v1); - vst1q_u64(dp1 + 4, v2); - vst1q_u64(dp1 + 6, v3); - - dp1 += 8; - dp2 += 8; - dp3 += 8; - dp4 += 8; - } while (--lines > 0); -} - -static void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4, - const unsigned long * __restrict p5) -{ - uint64_t *dp1 = (uint64_t *)p1; - uint64_t *dp2 = (uint64_t *)p2; - uint64_t *dp3 = (uint64_t *)p3; - uint64_t *dp4 = (uint64_t *)p4; - uint64_t *dp5 = (uint64_t *)p5; - - register uint64x2_t v0, v1, v2, v3; - long lines = bytes / (sizeof(uint64x2_t) * 4); - - do { - /* p1 ^= p2 ^ p3 */ - v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), - vld1q_u64(dp3 + 0)); - v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), - vld1q_u64(dp3 + 2)); - v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), - vld1q_u64(dp3 + 4)); - v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), - vld1q_u64(dp3 + 6)); - - /* p1 ^= p4 ^ p5 */ - v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0)); - v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2)); - v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4)); - v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6)); - - /* store */ - vst1q_u64(dp1 + 0, v0); - vst1q_u64(dp1 + 2, v1); - vst1q_u64(dp1 + 4, v2); - vst1q_u64(dp1 + 6, v3); - - dp1 += 8; - dp2 += 8; - dp3 += 8; - dp4 += 8; - dp5 += 8; - } while (--lines > 0); -} - -__DO_XOR_BLOCKS(eor3_inner, __xor_neon_2, __xor_eor3_3, __xor_eor3_4, - __xor_eor3_5); diff --git a/lib/raid/xor/xor-neon.c b/lib/raid/xor/xor-neon.c index a3e2b4af8d362..c7c3cf634e23a 100644 --- a/lib/raid/xor/xor-neon.c +++ b/lib/raid/xor/xor-neon.c @@ -173,3 +173,7 @@ static void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1, __DO_XOR_BLOCKS(neon_inner, __xor_neon_2, __xor_neon_3, __xor_neon_4, __xor_neon_5); + +#ifdef CONFIG_ARM64 +extern typeof(__xor_neon_2) __xor_eor3_2 __alias(__xor_neon_2); +#endif |
