diff options
| -rw-r--r-- | Documentation/arch/arm/kernel_mode_neon.rst | 4 | ||||
| -rw-r--r-- | arch/arm/include/asm/neon-intrinsics.h | 60 | ||||
| -rw-r--r-- | crypto/Makefile | 10 | ||||
| -rw-r--r-- | crypto/aegis128-neon-inner.c | 4 | ||||
| -rw-r--r-- | lib/crc/Kconfig | 1 | ||||
| -rw-r--r-- | lib/crc/Makefile | 9 | ||||
| -rw-r--r-- | lib/crc/arm/crc64-neon.h | 34 | ||||
| -rw-r--r-- | lib/crc/arm/crc64.h | 36 | ||||
| -rw-r--r-- | lib/crc/arm64/crc64-neon.h | 21 | ||||
| -rw-r--r-- | lib/crc/arm64/crc64.h | 4 | ||||
| -rw-r--r-- | lib/crc/crc64-neon.c (renamed from lib/crc/arm64/crc64-neon-inner.c) | 26 | ||||
| -rw-r--r-- | lib/raid/xor/Makefile | 13 | ||||
| -rw-r--r-- | lib/raid/xor/arm/xor-neon.c | 26 | ||||
| -rw-r--r-- | lib/raid/xor/arm/xor-neon.h | 7 | ||||
| -rw-r--r-- | lib/raid/xor/arm/xor_arch.h | 7 | ||||
| -rw-r--r-- | lib/raid/xor/arm64/xor-eor3.c | 146 | ||||
| -rw-r--r-- | lib/raid/xor/xor-8regs.c | 2 | ||||
| -rw-r--r-- | lib/raid/xor/xor-neon.c (renamed from lib/raid/xor/arm64/xor-neon.c) | 143 |
18 files changed, 338 insertions, 215 deletions
diff --git a/Documentation/arch/arm/kernel_mode_neon.rst b/Documentation/arch/arm/kernel_mode_neon.rst index 9bfb71a2a9b96..1efb6d35b7bd0 100644 --- a/Documentation/arch/arm/kernel_mode_neon.rst +++ b/Documentation/arch/arm/kernel_mode_neon.rst @@ -121,4 +121,6 @@ observe the following in addition to the rules above: * Compile the unit containing the NEON intrinsics with '-ffreestanding' so GCC uses its builtin version of <stdint.h> (this is a C99 header which the kernel does not supply); -* Include <arm_neon.h> last, or at least after <linux/types.h> +* Do not include <arm_neon.h> directly: instead, include <asm/neon-intrinsics.h>, + which tweaks some macro definitions so that system headers can be included + safely. diff --git a/arch/arm/include/asm/neon-intrinsics.h b/arch/arm/include/asm/neon-intrinsics.h new file mode 100644 index 0000000000000..8b80c05ce1d7d --- /dev/null +++ b/arch/arm/include/asm/neon-intrinsics.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ASM_NEON_INTRINSICS_H +#define __ASM_NEON_INTRINSICS_H + +#ifndef __ARM_NEON__ +#error You should compile this file with '-march=armv7-a -mfloat-abi=softfp -mfpu=neon' +#endif + +#include <asm-generic/int-ll64.h> + +/* + * The C99 types uintXX_t that are usually defined in 'stdint.h' are not as + * unambiguous on ARM as you would expect. For the types below, there is a + * difference on ARM between GCC built for bare metal ARM, GCC built for glibc + * and the kernel itself, which results in build errors if you try to build + * with -ffreestanding and include 'stdint.h' (such as when you include + * 'arm_neon.h' in order to use NEON intrinsics) + * + * As the typedefs for these types in 'stdint.h' are based on builtin defines + * supplied by GCC, we can tweak these to align with the kernel's idea of those + * types, so 'linux/types.h' and 'stdint.h' can be safely included from the + * same source file (provided that -ffreestanding is used). + * + * int32_t uint32_t intptr_t uintptr_t + * bare metal GCC long unsigned long int unsigned int + * glibc GCC int unsigned int int unsigned int + * kernel int unsigned int long unsigned long + */ + +#ifdef __INT32_TYPE__ +#undef __INT32_TYPE__ +#define __INT32_TYPE__ int +#endif + +#ifdef __UINT32_TYPE__ +#undef __UINT32_TYPE__ +#define __UINT32_TYPE__ unsigned int +#endif + +#ifdef __INTPTR_TYPE__ +#undef __INTPTR_TYPE__ +#define __INTPTR_TYPE__ long +#endif + +#ifdef __UINTPTR_TYPE__ +#undef __UINTPTR_TYPE__ +#define __UINTPTR_TYPE__ unsigned long +#endif + +/* + * genksyms chokes on the ARM NEON instrinsics system header, but we + * don't export anything it defines anyway, so just disregard when + * genksyms execute. + */ +#ifndef __GENKSYMS__ +#include <arm_neon.h> +#endif + +#endif /* __ASM_NEON_INTRINSICS_H */ diff --git a/crypto/Makefile b/crypto/Makefile index c73f4d51d0368..481ee417ff446 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -103,13 +103,14 @@ obj-$(CONFIG_CRYPTO_CHACHA20POLY1305) += chacha20poly1305.o obj-$(CONFIG_CRYPTO_AEGIS128) += aegis128.o aegis128-y := aegis128-core.o +CFLAGS_aegis128-neon-inner.o += $(CC_FLAGS_FPU) +CFLAGS_REMOVE_aegis128-neon-inner.o += $(CC_FLAGS_NO_FPU) ifeq ($(ARCH),arm) -CFLAGS_aegis128-neon-inner.o += -ffreestanding -march=armv8-a -mfloat-abi=softfp -CFLAGS_aegis128-neon-inner.o += -mfpu=crypto-neon-fp-armv8 +CFLAGS_aegis128-neon-inner.o += -march=armv8-a -mfpu=crypto-neon-fp-armv8 aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o endif ifeq ($(ARCH),arm64) -aegis128-cflags-y := -ffreestanding -mcpu=generic+crypto +aegis128-cflags-y := -mcpu=generic+crypto aegis128-cflags-$(CONFIG_CC_IS_GCC) += -ffixed-q16 -ffixed-q17 -ffixed-q18 \ -ffixed-q19 -ffixed-q20 -ffixed-q21 \ -ffixed-q22 -ffixed-q23 -ffixed-q24 \ @@ -117,11 +118,8 @@ aegis128-cflags-$(CONFIG_CC_IS_GCC) += -ffixed-q16 -ffixed-q17 -ffixed-q18 \ -ffixed-q28 -ffixed-q29 -ffixed-q30 \ -ffixed-q31 CFLAGS_aegis128-neon-inner.o += $(aegis128-cflags-y) -CFLAGS_REMOVE_aegis128-neon-inner.o += -mgeneral-regs-only aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o endif -# Enable <arm_neon.h> -CFLAGS_aegis128-neon-inner.o += -isystem $(shell $(CC) -print-file-name=include) obj-$(CONFIG_CRYPTO_PCRYPT) += pcrypt.o obj-$(CONFIG_CRYPTO_CRYPTD) += cryptd.o diff --git a/crypto/aegis128-neon-inner.c b/crypto/aegis128-neon-inner.c index b6a52a386b220..56b534eeb6807 100644 --- a/crypto/aegis128-neon-inner.c +++ b/crypto/aegis128-neon-inner.c @@ -3,13 +3,11 @@ * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org> */ -#ifdef CONFIG_ARM64 #include <asm/neon-intrinsics.h> +#ifdef CONFIG_ARM64 #define AES_ROUND "aese %0.16b, %1.16b \n\t aesmc %0.16b, %0.16b" #else -#include <arm_neon.h> - #define AES_ROUND "aese.8 %q0, %q1 \n\t aesmc.8 %q0, %q0" #endif diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig index f47bb4c706fb7..927fc6a6b2b9d 100644 --- a/lib/crc/Kconfig +++ b/lib/crc/Kconfig @@ -82,6 +82,7 @@ config CRC64 config CRC64_ARCH bool depends on CRC64 && CRC_OPTIMIZATIONS + default y if ARM && KERNEL_MODE_NEON && !CPU_BIG_ENDIAN default y if ARM64 default y if RISCV && RISCV_ISA_ZBC && 64BIT default y if X86_64 diff --git a/lib/crc/Makefile b/lib/crc/Makefile index ff213590e4e31..386e9c1752632 100644 --- a/lib/crc/Makefile +++ b/lib/crc/Makefile @@ -39,9 +39,12 @@ crc64-y := crc64-main.o ifeq ($(CONFIG_CRC64_ARCH),y) CFLAGS_crc64-main.o += -I$(src)/$(SRCARCH) -CFLAGS_REMOVE_arm64/crc64-neon-inner.o += $(CC_FLAGS_NO_FPU) -CFLAGS_arm64/crc64-neon-inner.o += $(CC_FLAGS_FPU) -march=armv8-a+crypto -crc64-$(CONFIG_ARM64) += arm64/crc64-neon-inner.o +crc64-cflags-$(CONFIG_ARM) += -march=armv8-a -mfpu=crypto-neon-fp-armv8 +crc64-cflags-$(CONFIG_ARM64) += -march=armv8-a+crypto +CFLAGS_REMOVE_crc64-neon.o += $(CC_FLAGS_NO_FPU) +CFLAGS_crc64-neon.o += $(CC_FLAGS_FPU) -I$(src)/$(SRCARCH) $(crc64-cflags-y) +crc64-$(CONFIG_ARM) += crc64-neon.o +crc64-$(CONFIG_ARM64) += crc64-neon.o crc64-$(CONFIG_RISCV) += riscv/crc64_lsb.o riscv/crc64_msb.o crc64-$(CONFIG_X86) += x86/crc64-pclmul.o diff --git a/lib/crc/arm/crc64-neon.h b/lib/crc/arm/crc64-neon.h new file mode 100644 index 0000000000000..645f553220ffb --- /dev/null +++ b/lib/crc/arm/crc64-neon.h @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0-only + +static inline uint64x2_t pmull64(uint64x2_t a, uint64x2_t b) +{ + uint64_t l = vgetq_lane_u64(a, 0); + uint64_t m = vgetq_lane_u64(b, 0); + uint64x2_t result; + + asm("vmull.p64 %q0, %P1, %P2" : "=w"(result) : "w"(l), "w"(m)); + + return result; +} + +static inline uint64x2_t pmull64_high(uint64x2_t a, uint64x2_t b) +{ + uint64_t l = vgetq_lane_u64(a, 1); + uint64_t m = vgetq_lane_u64(b, 1); + uint64x2_t result; + + asm("vmull.p64 %q0, %P1, %P2" : "=w"(result) : "w"(l), "w"(m)); + + return result; +} + +static inline uint64x2_t pmull64_hi_lo(uint64x2_t a, uint64x2_t b) +{ + uint64_t l = vgetq_lane_u64(a, 1); + uint64_t m = vgetq_lane_u64(b, 0); + uint64x2_t result; + + asm("vmull.p64 %q0, %P1, %P2" : "=w"(result) : "w"(l), "w"(m)); + + return result; +} diff --git a/lib/crc/arm/crc64.h b/lib/crc/arm/crc64.h new file mode 100644 index 0000000000000..de274288af615 --- /dev/null +++ b/lib/crc/arm/crc64.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * CRC64 using ARM PMULL instructions + */ + +#include <asm/simd.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull); + +u64 crc64_nvme_neon(u64 crc, const u8 *p, size_t len); + +#define crc64_be_arch crc64_be_generic + +static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len) +{ + if (len >= 128 && static_branch_likely(&have_pmull) && + likely(may_use_simd())) { + do { + size_t chunk = min_t(size_t, len & ~15, SZ_4K); + + scoped_ksimd() + crc = crc64_nvme_neon(crc, p, chunk); + + p += chunk; + len -= chunk; + } while (len >= 128); + } + return crc64_nvme_generic(crc, p, len); +} + +#define crc64_mod_init_arch crc64_mod_init_arch +static void crc64_mod_init_arch(void) +{ + if (elf_hwcap2 & HWCAP2_PMULL) + static_branch_enable(&have_pmull); +} diff --git a/lib/crc/arm64/crc64-neon.h b/lib/crc/arm64/crc64-neon.h new file mode 100644 index 0000000000000..fcd5b1e6f8124 --- /dev/null +++ b/lib/crc/arm64/crc64-neon.h @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-only + +static inline uint64x2_t pmull64(uint64x2_t a, uint64x2_t b) +{ + return vreinterpretq_u64_p128(vmull_p64(vgetq_lane_u64(a, 0), + vgetq_lane_u64(b, 0))); +} + +static inline uint64x2_t pmull64_high(uint64x2_t a, uint64x2_t b) +{ + poly64x2_t l = vreinterpretq_p64_u64(a); + poly64x2_t m = vreinterpretq_p64_u64(b); + + return vreinterpretq_u64_p128(vmull_high_p64(l, m)); +} + +static inline uint64x2_t pmull64_hi_lo(uint64x2_t a, uint64x2_t b) +{ + return vreinterpretq_u64_p128(vmull_p64(vgetq_lane_u64(a, 1), + vgetq_lane_u64(b, 0))); +} diff --git a/lib/crc/arm64/crc64.h b/lib/crc/arm64/crc64.h index 60151ec3035af..c7a69e1f3d8f7 100644 --- a/lib/crc/arm64/crc64.h +++ b/lib/crc/arm64/crc64.h @@ -8,7 +8,7 @@ #include <linux/minmax.h> #include <linux/sizes.h> -u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len); +u64 crc64_nvme_neon(u64 crc, const u8 *p, size_t len); #define crc64_be_arch crc64_be_generic @@ -19,7 +19,7 @@ static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len) size_t chunk = len & ~15; scoped_ksimd() - crc = crc64_nvme_arm64_c(crc, p, chunk); + crc = crc64_nvme_neon(crc, p, chunk); p += chunk; len &= 15; diff --git a/lib/crc/arm64/crc64-neon-inner.c b/lib/crc/crc64-neon.c index 28527e544ff63..4753fb94a4beb 100644 --- a/lib/crc/arm64/crc64-neon-inner.c +++ b/lib/crc/crc64-neon.c @@ -6,7 +6,9 @@ #include <linux/types.h> #include <asm/neon-intrinsics.h> -u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len); +#include "crc64-neon.h" + +u64 crc64_nvme_neon(u64 crc, const u8 *p, size_t len); /* x^191 mod G, x^127 mod G */ static const u64 fold_consts_val[2] = { 0xeadc41fd2ba3d420ULL, @@ -15,27 +17,7 @@ static const u64 fold_consts_val[2] = { 0xeadc41fd2ba3d420ULL, static const u64 bconsts_val[2] = { 0x27ecfa329aef9f77ULL, 0x34d926535897936aULL }; -static inline uint64x2_t pmull64(uint64x2_t a, uint64x2_t b) -{ - return vreinterpretq_u64_p128(vmull_p64(vgetq_lane_u64(a, 0), - vgetq_lane_u64(b, 0))); -} - -static inline uint64x2_t pmull64_high(uint64x2_t a, uint64x2_t b) -{ - poly64x2_t l = vreinterpretq_p64_u64(a); - poly64x2_t m = vreinterpretq_p64_u64(b); - - return vreinterpretq_u64_p128(vmull_high_p64(l, m)); -} - -static inline uint64x2_t pmull64_hi_lo(uint64x2_t a, uint64x2_t b) -{ - return vreinterpretq_u64_p128(vmull_p64(vgetq_lane_u64(a, 1), - vgetq_lane_u64(b, 0))); -} - -u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len) +u64 crc64_nvme_neon(u64 crc, const u8 *p, size_t len) { uint64x2_t fold_consts = vld1q_u64(fold_consts_val); uint64x2_t v0 = { crc, 0 }; diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile index 4d633dfd5b90c..e8ecec3c09f9f 100644 --- a/lib/raid/xor/Makefile +++ b/lib/raid/xor/Makefile @@ -17,9 +17,10 @@ endif xor-$(CONFIG_ALPHA) += alpha/xor.o xor-$(CONFIG_ARM) += arm/xor.o ifeq ($(CONFIG_ARM),y) -xor-$(CONFIG_KERNEL_MODE_NEON) += arm/xor-neon.o arm/xor-neon-glue.o +xor-$(CONFIG_KERNEL_MODE_NEON) += xor-neon.o arm/xor-neon-glue.o endif -xor-$(CONFIG_ARM64) += arm64/xor-neon.o arm64/xor-neon-glue.o +xor-$(CONFIG_ARM64) += xor-neon.o arm64/xor-eor3.o \ + arm64/xor-neon-glue.o xor-$(CONFIG_CPU_HAS_LSX) += loongarch/xor_simd.o xor-$(CONFIG_CPU_HAS_LSX) += loongarch/xor_simd_glue.o xor-$(CONFIG_ALTIVEC) += powerpc/xor_vmx.o powerpc/xor_vmx_glue.o @@ -31,11 +32,11 @@ xor-$(CONFIG_X86_32) += x86/xor-avx.o x86/xor-sse.o x86/xor-mmx.o xor-$(CONFIG_X86_64) += x86/xor-avx.o x86/xor-sse.o obj-y += tests/ -CFLAGS_arm/xor-neon.o += $(CC_FLAGS_FPU) -CFLAGS_REMOVE_arm/xor-neon.o += $(CC_FLAGS_NO_FPU) +CFLAGS_xor-neon.o += $(CC_FLAGS_FPU) -I$(src)/$(SRCARCH) +CFLAGS_REMOVE_xor-neon.o += $(CC_FLAGS_NO_FPU) -CFLAGS_arm64/xor-neon.o += $(CC_FLAGS_FPU) -CFLAGS_REMOVE_arm64/xor-neon.o += $(CC_FLAGS_NO_FPU) +CFLAGS_arm64/xor-eor3.o += $(CC_FLAGS_FPU) +CFLAGS_REMOVE_arm64/xor-eor3.o += $(CC_FLAGS_NO_FPU) CFLAGS_powerpc/xor_vmx.o += -mhard-float -maltivec \ $(call cc-option,-mabi=altivec) \ diff --git a/lib/raid/xor/arm/xor-neon.c b/lib/raid/xor/arm/xor-neon.c deleted file mode 100644 index 23147e3a79044..0000000000000 --- a/lib/raid/xor/arm/xor-neon.c +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> - */ - -#include "xor_impl.h" -#include "xor_arch.h" - -#ifndef __ARM_NEON__ -#error You should compile this file with '-march=armv7-a -mfloat-abi=softfp -mfpu=neon' -#endif - -/* - * Pull in the reference implementations while instructing GCC (through - * -ftree-vectorize) to attempt to exploit implicit parallelism and emit - * NEON instructions. Clang does this by default at O2 so no pragma is - * needed. - */ -#ifdef CONFIG_CC_IS_GCC -#pragma GCC optimize "tree-vectorize" -#endif - -#define NO_TEMPLATE -#include "../xor-8regs.c" - -__DO_XOR_BLOCKS(neon_inner, xor_8regs_2, xor_8regs_3, xor_8regs_4, xor_8regs_5); diff --git a/lib/raid/xor/arm/xor-neon.h b/lib/raid/xor/arm/xor-neon.h new file mode 100644 index 0000000000000..406e0356f05bb --- /dev/null +++ b/lib/raid/xor/arm/xor-neon.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +extern struct xor_block_template xor_block_arm4regs; +extern struct xor_block_template xor_block_neon; + +void xor_gen_neon_inner(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes); diff --git a/lib/raid/xor/arm/xor_arch.h b/lib/raid/xor/arm/xor_arch.h index 775ff835df656..f1ddb64fe62ab 100644 --- a/lib/raid/xor/arm/xor_arch.h +++ b/lib/raid/xor/arm/xor_arch.h @@ -3,12 +3,7 @@ * Copyright (C) 2001 Russell King */ #include <asm/neon.h> - -extern struct xor_block_template xor_block_arm4regs; -extern struct xor_block_template xor_block_neon; - -void xor_gen_neon_inner(void *dest, void **srcs, unsigned int src_cnt, - unsigned int bytes); +#include "xor-neon.h" static __always_inline void __init arch_xor_init(void) { diff --git a/lib/raid/xor/arm64/xor-eor3.c b/lib/raid/xor/arm64/xor-eor3.c new file mode 100644 index 0000000000000..e44016c363f1c --- /dev/null +++ b/lib/raid/xor/arm64/xor-eor3.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/cache.h> +#include <asm/neon-intrinsics.h> +#include "xor_impl.h" +#include "xor_arch.h" +#include "xor-neon.h" + +extern void __xor_eor3_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2); + +static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) +{ + uint64x2_t res; + + asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n" + "eor3 %0.16b, %1.16b, %2.16b, %3.16b" + : "=w"(res) : "w"(p), "w"(q), "w"(r)); + return res; +} + +static void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) +{ + uint64_t *dp1 = (uint64_t *)p1; + uint64_t *dp2 = (uint64_t *)p2; + uint64_t *dp3 = (uint64_t *)p3; + + register uint64x2_t v0, v1, v2, v3; + long lines = bytes / (sizeof(uint64x2_t) * 4); + + do { + /* p1 ^= p2 ^ p3 */ + v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), + vld1q_u64(dp3 + 0)); + v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), + vld1q_u64(dp3 + 2)); + v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), + vld1q_u64(dp3 + 4)); + v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), + vld1q_u64(dp3 + 6)); + + /* store */ + vst1q_u64(dp1 + 0, v0); + vst1q_u64(dp1 + 2, v1); + vst1q_u64(dp1 + 4, v2); + vst1q_u64(dp1 + 6, v3); + + dp1 += 8; + dp2 += 8; + dp3 += 8; + } while (--lines > 0); +} + +static void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) +{ + uint64_t *dp1 = (uint64_t *)p1; + uint64_t *dp2 = (uint64_t *)p2; + uint64_t *dp3 = (uint64_t *)p3; + uint64_t *dp4 = (uint64_t *)p4; + + register uint64x2_t v0, v1, v2, v3; + long lines = bytes / (sizeof(uint64x2_t) * 4); + + do { + /* p1 ^= p2 ^ p3 */ + v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), + vld1q_u64(dp3 + 0)); + v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), + vld1q_u64(dp3 + 2)); + v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), + vld1q_u64(dp3 + 4)); + v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), + vld1q_u64(dp3 + 6)); + + /* p1 ^= p4 */ + v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); + v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); + v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); + v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); + + /* store */ + vst1q_u64(dp1 + 0, v0); + vst1q_u64(dp1 + 2, v1); + vst1q_u64(dp1 + 4, v2); + vst1q_u64(dp1 + 6, v3); + + dp1 += 8; + dp2 += 8; + dp3 += 8; + dp4 += 8; + } while (--lines > 0); +} + +static void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) +{ + uint64_t *dp1 = (uint64_t *)p1; + uint64_t *dp2 = (uint64_t *)p2; + uint64_t *dp3 = (uint64_t *)p3; + uint64_t *dp4 = (uint64_t *)p4; + uint64_t *dp5 = (uint64_t *)p5; + + register uint64x2_t v0, v1, v2, v3; + long lines = bytes / (sizeof(uint64x2_t) * 4); + + do { + /* p1 ^= p2 ^ p3 */ + v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), + vld1q_u64(dp3 + 0)); + v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), + vld1q_u64(dp3 + 2)); + v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), + vld1q_u64(dp3 + 4)); + v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), + vld1q_u64(dp3 + 6)); + + /* p1 ^= p4 ^ p5 */ + v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0)); + v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2)); + v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4)); + v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6)); + + /* store */ + vst1q_u64(dp1 + 0, v0); + vst1q_u64(dp1 + 2, v1); + vst1q_u64(dp1 + 4, v2); + vst1q_u64(dp1 + 6, v3); + + dp1 += 8; + dp2 += 8; + dp3 += 8; + dp4 += 8; + dp5 += 8; + } while (--lines > 0); +} + +__DO_XOR_BLOCKS(eor3_inner, __xor_eor3_2, __xor_eor3_3, __xor_eor3_4, + __xor_eor3_5); diff --git a/lib/raid/xor/xor-8regs.c b/lib/raid/xor/xor-8regs.c index 1edaed8acffe6..46b3c8bdc27f3 100644 --- a/lib/raid/xor/xor-8regs.c +++ b/lib/raid/xor/xor-8regs.c @@ -93,11 +93,9 @@ xor_8regs_5(unsigned long bytes, unsigned long * __restrict p1, } while (--lines > 0); } -#ifndef NO_TEMPLATE DO_XOR_BLOCKS(8regs, xor_8regs_2, xor_8regs_3, xor_8regs_4, xor_8regs_5); struct xor_block_template xor_block_8regs = { .name = "8regs", .xor_gen = xor_gen_8regs, }; -#endif /* NO_TEMPLATE */ diff --git a/lib/raid/xor/arm64/xor-neon.c b/lib/raid/xor/xor-neon.c index 97ef3cb924968..c7c3cf634e23a 100644 --- a/lib/raid/xor/arm64/xor-neon.c +++ b/lib/raid/xor/xor-neon.c @@ -4,12 +4,11 @@ * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd. */ -#include <linux/cache.h> -#include <asm/neon-intrinsics.h> #include "xor_impl.h" -#include "xor_arch.h" #include "xor-neon.h" +#include <asm/neon-intrinsics.h> + static void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p2) { @@ -175,138 +174,6 @@ static void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1, __DO_XOR_BLOCKS(neon_inner, __xor_neon_2, __xor_neon_3, __xor_neon_4, __xor_neon_5); -static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) -{ - uint64x2_t res; - - asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n" - "eor3 %0.16b, %1.16b, %2.16b, %3.16b" - : "=w"(res) : "w"(p), "w"(q), "w"(r)); - return res; -} - -static void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3) -{ - uint64_t *dp1 = (uint64_t *)p1; - uint64_t *dp2 = (uint64_t *)p2; - uint64_t *dp3 = (uint64_t *)p3; - - register uint64x2_t v0, v1, v2, v3; - long lines = bytes / (sizeof(uint64x2_t) * 4); - - do { - /* p1 ^= p2 ^ p3 */ - v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), - vld1q_u64(dp3 + 0)); - v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), - vld1q_u64(dp3 + 2)); - v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), - vld1q_u64(dp3 + 4)); - v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), - vld1q_u64(dp3 + 6)); - - /* store */ - vst1q_u64(dp1 + 0, v0); - vst1q_u64(dp1 + 2, v1); - vst1q_u64(dp1 + 4, v2); - vst1q_u64(dp1 + 6, v3); - - dp1 += 8; - dp2 += 8; - dp3 += 8; - } while (--lines > 0); -} - -static void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4) -{ - uint64_t *dp1 = (uint64_t *)p1; - uint64_t *dp2 = (uint64_t *)p2; - uint64_t *dp3 = (uint64_t *)p3; - uint64_t *dp4 = (uint64_t *)p4; - - register uint64x2_t v0, v1, v2, v3; - long lines = bytes / (sizeof(uint64x2_t) * 4); - - do { - /* p1 ^= p2 ^ p3 */ - v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), - vld1q_u64(dp3 + 0)); - v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), - vld1q_u64(dp3 + 2)); - v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), - vld1q_u64(dp3 + 4)); - v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), - vld1q_u64(dp3 + 6)); - - /* p1 ^= p4 */ - v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); - v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); - v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); - v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); - - /* store */ - vst1q_u64(dp1 + 0, v0); - vst1q_u64(dp1 + 2, v1); - vst1q_u64(dp1 + 4, v2); - vst1q_u64(dp1 + 6, v3); - - dp1 += 8; - dp2 += 8; - dp3 += 8; - dp4 += 8; - } while (--lines > 0); -} - -static void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4, - const unsigned long * __restrict p5) -{ - uint64_t *dp1 = (uint64_t *)p1; - uint64_t *dp2 = (uint64_t *)p2; - uint64_t *dp3 = (uint64_t *)p3; - uint64_t *dp4 = (uint64_t *)p4; - uint64_t *dp5 = (uint64_t *)p5; - - register uint64x2_t v0, v1, v2, v3; - long lines = bytes / (sizeof(uint64x2_t) * 4); - - do { - /* p1 ^= p2 ^ p3 */ - v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), - vld1q_u64(dp3 + 0)); - v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), - vld1q_u64(dp3 + 2)); - v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), - vld1q_u64(dp3 + 4)); - v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), - vld1q_u64(dp3 + 6)); - - /* p1 ^= p4 ^ p5 */ - v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0)); - v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2)); - v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4)); - v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6)); - - /* store */ - vst1q_u64(dp1 + 0, v0); - vst1q_u64(dp1 + 2, v1); - vst1q_u64(dp1 + 4, v2); - vst1q_u64(dp1 + 6, v3); - - dp1 += 8; - dp2 += 8; - dp3 += 8; - dp4 += 8; - dp5 += 8; - } while (--lines > 0); -} - -__DO_XOR_BLOCKS(eor3_inner, __xor_neon_2, __xor_eor3_3, __xor_eor3_4, - __xor_eor3_5); +#ifdef CONFIG_ARM64 +extern typeof(__xor_neon_2) __xor_eor3_2 __alias(__xor_neon_2); +#endif |
