aboutsummaryrefslogtreecommitdiffstats
diff options
-rw-r--r--Documentation/arch/arm/kernel_mode_neon.rst4
-rw-r--r--arch/arm/include/asm/neon-intrinsics.h60
-rw-r--r--crypto/Makefile10
-rw-r--r--crypto/aegis128-neon-inner.c4
-rw-r--r--lib/crc/Kconfig1
-rw-r--r--lib/crc/Makefile9
-rw-r--r--lib/crc/arm/crc64-neon.h34
-rw-r--r--lib/crc/arm/crc64.h36
-rw-r--r--lib/crc/arm64/crc64-neon.h21
-rw-r--r--lib/crc/arm64/crc64.h4
-rw-r--r--lib/crc/crc64-neon.c (renamed from lib/crc/arm64/crc64-neon-inner.c)26
-rw-r--r--lib/raid/xor/Makefile13
-rw-r--r--lib/raid/xor/arm/xor-neon.c26
-rw-r--r--lib/raid/xor/arm/xor-neon.h7
-rw-r--r--lib/raid/xor/arm/xor_arch.h7
-rw-r--r--lib/raid/xor/arm64/xor-eor3.c146
-rw-r--r--lib/raid/xor/xor-8regs.c2
-rw-r--r--lib/raid/xor/xor-neon.c (renamed from lib/raid/xor/arm64/xor-neon.c)143
18 files changed, 338 insertions, 215 deletions
diff --git a/Documentation/arch/arm/kernel_mode_neon.rst b/Documentation/arch/arm/kernel_mode_neon.rst
index 9bfb71a2a9b96..1efb6d35b7bd0 100644
--- a/Documentation/arch/arm/kernel_mode_neon.rst
+++ b/Documentation/arch/arm/kernel_mode_neon.rst
@@ -121,4 +121,6 @@ observe the following in addition to the rules above:
* Compile the unit containing the NEON intrinsics with '-ffreestanding' so GCC
uses its builtin version of <stdint.h> (this is a C99 header which the kernel
does not supply);
-* Include <arm_neon.h> last, or at least after <linux/types.h>
+* Do not include <arm_neon.h> directly: instead, include <asm/neon-intrinsics.h>,
+ which tweaks some macro definitions so that system headers can be included
+ safely.
diff --git a/arch/arm/include/asm/neon-intrinsics.h b/arch/arm/include/asm/neon-intrinsics.h
new file mode 100644
index 0000000000000..8b80c05ce1d7d
--- /dev/null
+++ b/arch/arm/include/asm/neon-intrinsics.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __ASM_NEON_INTRINSICS_H
+#define __ASM_NEON_INTRINSICS_H
+
+#ifndef __ARM_NEON__
+#error You should compile this file with '-march=armv7-a -mfloat-abi=softfp -mfpu=neon'
+#endif
+
+#include <asm-generic/int-ll64.h>
+
+/*
+ * The C99 types uintXX_t that are usually defined in 'stdint.h' are not as
+ * unambiguous on ARM as you would expect. For the types below, there is a
+ * difference on ARM between GCC built for bare metal ARM, GCC built for glibc
+ * and the kernel itself, which results in build errors if you try to build
+ * with -ffreestanding and include 'stdint.h' (such as when you include
+ * 'arm_neon.h' in order to use NEON intrinsics)
+ *
+ * As the typedefs for these types in 'stdint.h' are based on builtin defines
+ * supplied by GCC, we can tweak these to align with the kernel's idea of those
+ * types, so 'linux/types.h' and 'stdint.h' can be safely included from the
+ * same source file (provided that -ffreestanding is used).
+ *
+ * int32_t uint32_t intptr_t uintptr_t
+ * bare metal GCC long unsigned long int unsigned int
+ * glibc GCC int unsigned int int unsigned int
+ * kernel int unsigned int long unsigned long
+ */
+
+#ifdef __INT32_TYPE__
+#undef __INT32_TYPE__
+#define __INT32_TYPE__ int
+#endif
+
+#ifdef __UINT32_TYPE__
+#undef __UINT32_TYPE__
+#define __UINT32_TYPE__ unsigned int
+#endif
+
+#ifdef __INTPTR_TYPE__
+#undef __INTPTR_TYPE__
+#define __INTPTR_TYPE__ long
+#endif
+
+#ifdef __UINTPTR_TYPE__
+#undef __UINTPTR_TYPE__
+#define __UINTPTR_TYPE__ unsigned long
+#endif
+
+/*
+ * genksyms chokes on the ARM NEON instrinsics system header, but we
+ * don't export anything it defines anyway, so just disregard when
+ * genksyms execute.
+ */
+#ifndef __GENKSYMS__
+#include <arm_neon.h>
+#endif
+
+#endif /* __ASM_NEON_INTRINSICS_H */
diff --git a/crypto/Makefile b/crypto/Makefile
index c73f4d51d0368..481ee417ff446 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -103,13 +103,14 @@ obj-$(CONFIG_CRYPTO_CHACHA20POLY1305) += chacha20poly1305.o
obj-$(CONFIG_CRYPTO_AEGIS128) += aegis128.o
aegis128-y := aegis128-core.o
+CFLAGS_aegis128-neon-inner.o += $(CC_FLAGS_FPU)
+CFLAGS_REMOVE_aegis128-neon-inner.o += $(CC_FLAGS_NO_FPU)
ifeq ($(ARCH),arm)
-CFLAGS_aegis128-neon-inner.o += -ffreestanding -march=armv8-a -mfloat-abi=softfp
-CFLAGS_aegis128-neon-inner.o += -mfpu=crypto-neon-fp-armv8
+CFLAGS_aegis128-neon-inner.o += -march=armv8-a -mfpu=crypto-neon-fp-armv8
aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o
endif
ifeq ($(ARCH),arm64)
-aegis128-cflags-y := -ffreestanding -mcpu=generic+crypto
+aegis128-cflags-y := -mcpu=generic+crypto
aegis128-cflags-$(CONFIG_CC_IS_GCC) += -ffixed-q16 -ffixed-q17 -ffixed-q18 \
-ffixed-q19 -ffixed-q20 -ffixed-q21 \
-ffixed-q22 -ffixed-q23 -ffixed-q24 \
@@ -117,11 +118,8 @@ aegis128-cflags-$(CONFIG_CC_IS_GCC) += -ffixed-q16 -ffixed-q17 -ffixed-q18 \
-ffixed-q28 -ffixed-q29 -ffixed-q30 \
-ffixed-q31
CFLAGS_aegis128-neon-inner.o += $(aegis128-cflags-y)
-CFLAGS_REMOVE_aegis128-neon-inner.o += -mgeneral-regs-only
aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o
endif
-# Enable <arm_neon.h>
-CFLAGS_aegis128-neon-inner.o += -isystem $(shell $(CC) -print-file-name=include)
obj-$(CONFIG_CRYPTO_PCRYPT) += pcrypt.o
obj-$(CONFIG_CRYPTO_CRYPTD) += cryptd.o
diff --git a/crypto/aegis128-neon-inner.c b/crypto/aegis128-neon-inner.c
index b6a52a386b220..56b534eeb6807 100644
--- a/crypto/aegis128-neon-inner.c
+++ b/crypto/aegis128-neon-inner.c
@@ -3,13 +3,11 @@
* Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
*/
-#ifdef CONFIG_ARM64
#include <asm/neon-intrinsics.h>
+#ifdef CONFIG_ARM64
#define AES_ROUND "aese %0.16b, %1.16b \n\t aesmc %0.16b, %0.16b"
#else
-#include <arm_neon.h>
-
#define AES_ROUND "aese.8 %q0, %q1 \n\t aesmc.8 %q0, %q0"
#endif
diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig
index f47bb4c706fb7..927fc6a6b2b9d 100644
--- a/lib/crc/Kconfig
+++ b/lib/crc/Kconfig
@@ -82,6 +82,7 @@ config CRC64
config CRC64_ARCH
bool
depends on CRC64 && CRC_OPTIMIZATIONS
+ default y if ARM && KERNEL_MODE_NEON && !CPU_BIG_ENDIAN
default y if ARM64
default y if RISCV && RISCV_ISA_ZBC && 64BIT
default y if X86_64
diff --git a/lib/crc/Makefile b/lib/crc/Makefile
index ff213590e4e31..386e9c1752632 100644
--- a/lib/crc/Makefile
+++ b/lib/crc/Makefile
@@ -39,9 +39,12 @@ crc64-y := crc64-main.o
ifeq ($(CONFIG_CRC64_ARCH),y)
CFLAGS_crc64-main.o += -I$(src)/$(SRCARCH)
-CFLAGS_REMOVE_arm64/crc64-neon-inner.o += $(CC_FLAGS_NO_FPU)
-CFLAGS_arm64/crc64-neon-inner.o += $(CC_FLAGS_FPU) -march=armv8-a+crypto
-crc64-$(CONFIG_ARM64) += arm64/crc64-neon-inner.o
+crc64-cflags-$(CONFIG_ARM) += -march=armv8-a -mfpu=crypto-neon-fp-armv8
+crc64-cflags-$(CONFIG_ARM64) += -march=armv8-a+crypto
+CFLAGS_REMOVE_crc64-neon.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_crc64-neon.o += $(CC_FLAGS_FPU) -I$(src)/$(SRCARCH) $(crc64-cflags-y)
+crc64-$(CONFIG_ARM) += crc64-neon.o
+crc64-$(CONFIG_ARM64) += crc64-neon.o
crc64-$(CONFIG_RISCV) += riscv/crc64_lsb.o riscv/crc64_msb.o
crc64-$(CONFIG_X86) += x86/crc64-pclmul.o
diff --git a/lib/crc/arm/crc64-neon.h b/lib/crc/arm/crc64-neon.h
new file mode 100644
index 0000000000000..645f553220ffb
--- /dev/null
+++ b/lib/crc/arm/crc64-neon.h
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+static inline uint64x2_t pmull64(uint64x2_t a, uint64x2_t b)
+{
+ uint64_t l = vgetq_lane_u64(a, 0);
+ uint64_t m = vgetq_lane_u64(b, 0);
+ uint64x2_t result;
+
+ asm("vmull.p64 %q0, %P1, %P2" : "=w"(result) : "w"(l), "w"(m));
+
+ return result;
+}
+
+static inline uint64x2_t pmull64_high(uint64x2_t a, uint64x2_t b)
+{
+ uint64_t l = vgetq_lane_u64(a, 1);
+ uint64_t m = vgetq_lane_u64(b, 1);
+ uint64x2_t result;
+
+ asm("vmull.p64 %q0, %P1, %P2" : "=w"(result) : "w"(l), "w"(m));
+
+ return result;
+}
+
+static inline uint64x2_t pmull64_hi_lo(uint64x2_t a, uint64x2_t b)
+{
+ uint64_t l = vgetq_lane_u64(a, 1);
+ uint64_t m = vgetq_lane_u64(b, 0);
+ uint64x2_t result;
+
+ asm("vmull.p64 %q0, %P1, %P2" : "=w"(result) : "w"(l), "w"(m));
+
+ return result;
+}
diff --git a/lib/crc/arm/crc64.h b/lib/crc/arm/crc64.h
new file mode 100644
index 0000000000000..de274288af615
--- /dev/null
+++ b/lib/crc/arm/crc64.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * CRC64 using ARM PMULL instructions
+ */
+
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
+
+u64 crc64_nvme_neon(u64 crc, const u8 *p, size_t len);
+
+#define crc64_be_arch crc64_be_generic
+
+static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
+{
+ if (len >= 128 && static_branch_likely(&have_pmull) &&
+ likely(may_use_simd())) {
+ do {
+ size_t chunk = min_t(size_t, len & ~15, SZ_4K);
+
+ scoped_ksimd()
+ crc = crc64_nvme_neon(crc, p, chunk);
+
+ p += chunk;
+ len -= chunk;
+ } while (len >= 128);
+ }
+ return crc64_nvme_generic(crc, p, len);
+}
+
+#define crc64_mod_init_arch crc64_mod_init_arch
+static void crc64_mod_init_arch(void)
+{
+ if (elf_hwcap2 & HWCAP2_PMULL)
+ static_branch_enable(&have_pmull);
+}
diff --git a/lib/crc/arm64/crc64-neon.h b/lib/crc/arm64/crc64-neon.h
new file mode 100644
index 0000000000000..fcd5b1e6f8124
--- /dev/null
+++ b/lib/crc/arm64/crc64-neon.h
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+static inline uint64x2_t pmull64(uint64x2_t a, uint64x2_t b)
+{
+ return vreinterpretq_u64_p128(vmull_p64(vgetq_lane_u64(a, 0),
+ vgetq_lane_u64(b, 0)));
+}
+
+static inline uint64x2_t pmull64_high(uint64x2_t a, uint64x2_t b)
+{
+ poly64x2_t l = vreinterpretq_p64_u64(a);
+ poly64x2_t m = vreinterpretq_p64_u64(b);
+
+ return vreinterpretq_u64_p128(vmull_high_p64(l, m));
+}
+
+static inline uint64x2_t pmull64_hi_lo(uint64x2_t a, uint64x2_t b)
+{
+ return vreinterpretq_u64_p128(vmull_p64(vgetq_lane_u64(a, 1),
+ vgetq_lane_u64(b, 0)));
+}
diff --git a/lib/crc/arm64/crc64.h b/lib/crc/arm64/crc64.h
index 60151ec3035af..c7a69e1f3d8f7 100644
--- a/lib/crc/arm64/crc64.h
+++ b/lib/crc/arm64/crc64.h
@@ -8,7 +8,7 @@
#include <linux/minmax.h>
#include <linux/sizes.h>
-u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len);
+u64 crc64_nvme_neon(u64 crc, const u8 *p, size_t len);
#define crc64_be_arch crc64_be_generic
@@ -19,7 +19,7 @@ static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
size_t chunk = len & ~15;
scoped_ksimd()
- crc = crc64_nvme_arm64_c(crc, p, chunk);
+ crc = crc64_nvme_neon(crc, p, chunk);
p += chunk;
len &= 15;
diff --git a/lib/crc/arm64/crc64-neon-inner.c b/lib/crc/crc64-neon.c
index 28527e544ff63..4753fb94a4beb 100644
--- a/lib/crc/arm64/crc64-neon-inner.c
+++ b/lib/crc/crc64-neon.c
@@ -6,7 +6,9 @@
#include <linux/types.h>
#include <asm/neon-intrinsics.h>
-u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len);
+#include "crc64-neon.h"
+
+u64 crc64_nvme_neon(u64 crc, const u8 *p, size_t len);
/* x^191 mod G, x^127 mod G */
static const u64 fold_consts_val[2] = { 0xeadc41fd2ba3d420ULL,
@@ -15,27 +17,7 @@ static const u64 fold_consts_val[2] = { 0xeadc41fd2ba3d420ULL,
static const u64 bconsts_val[2] = { 0x27ecfa329aef9f77ULL,
0x34d926535897936aULL };
-static inline uint64x2_t pmull64(uint64x2_t a, uint64x2_t b)
-{
- return vreinterpretq_u64_p128(vmull_p64(vgetq_lane_u64(a, 0),
- vgetq_lane_u64(b, 0)));
-}
-
-static inline uint64x2_t pmull64_high(uint64x2_t a, uint64x2_t b)
-{
- poly64x2_t l = vreinterpretq_p64_u64(a);
- poly64x2_t m = vreinterpretq_p64_u64(b);
-
- return vreinterpretq_u64_p128(vmull_high_p64(l, m));
-}
-
-static inline uint64x2_t pmull64_hi_lo(uint64x2_t a, uint64x2_t b)
-{
- return vreinterpretq_u64_p128(vmull_p64(vgetq_lane_u64(a, 1),
- vgetq_lane_u64(b, 0)));
-}
-
-u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len)
+u64 crc64_nvme_neon(u64 crc, const u8 *p, size_t len)
{
uint64x2_t fold_consts = vld1q_u64(fold_consts_val);
uint64x2_t v0 = { crc, 0 };
diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
index 4d633dfd5b90c..e8ecec3c09f9f 100644
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -17,9 +17,10 @@ endif
xor-$(CONFIG_ALPHA) += alpha/xor.o
xor-$(CONFIG_ARM) += arm/xor.o
ifeq ($(CONFIG_ARM),y)
-xor-$(CONFIG_KERNEL_MODE_NEON) += arm/xor-neon.o arm/xor-neon-glue.o
+xor-$(CONFIG_KERNEL_MODE_NEON) += xor-neon.o arm/xor-neon-glue.o
endif
-xor-$(CONFIG_ARM64) += arm64/xor-neon.o arm64/xor-neon-glue.o
+xor-$(CONFIG_ARM64) += xor-neon.o arm64/xor-eor3.o \
+ arm64/xor-neon-glue.o
xor-$(CONFIG_CPU_HAS_LSX) += loongarch/xor_simd.o
xor-$(CONFIG_CPU_HAS_LSX) += loongarch/xor_simd_glue.o
xor-$(CONFIG_ALTIVEC) += powerpc/xor_vmx.o powerpc/xor_vmx_glue.o
@@ -31,11 +32,11 @@ xor-$(CONFIG_X86_32) += x86/xor-avx.o x86/xor-sse.o x86/xor-mmx.o
xor-$(CONFIG_X86_64) += x86/xor-avx.o x86/xor-sse.o
obj-y += tests/
-CFLAGS_arm/xor-neon.o += $(CC_FLAGS_FPU)
-CFLAGS_REMOVE_arm/xor-neon.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_xor-neon.o += $(CC_FLAGS_FPU) -I$(src)/$(SRCARCH)
+CFLAGS_REMOVE_xor-neon.o += $(CC_FLAGS_NO_FPU)
-CFLAGS_arm64/xor-neon.o += $(CC_FLAGS_FPU)
-CFLAGS_REMOVE_arm64/xor-neon.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_arm64/xor-eor3.o += $(CC_FLAGS_FPU)
+CFLAGS_REMOVE_arm64/xor-eor3.o += $(CC_FLAGS_NO_FPU)
CFLAGS_powerpc/xor_vmx.o += -mhard-float -maltivec \
$(call cc-option,-mabi=altivec) \
diff --git a/lib/raid/xor/arm/xor-neon.c b/lib/raid/xor/arm/xor-neon.c
deleted file mode 100644
index 23147e3a79044..0000000000000
--- a/lib/raid/xor/arm/xor-neon.c
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include "xor_impl.h"
-#include "xor_arch.h"
-
-#ifndef __ARM_NEON__
-#error You should compile this file with '-march=armv7-a -mfloat-abi=softfp -mfpu=neon'
-#endif
-
-/*
- * Pull in the reference implementations while instructing GCC (through
- * -ftree-vectorize) to attempt to exploit implicit parallelism and emit
- * NEON instructions. Clang does this by default at O2 so no pragma is
- * needed.
- */
-#ifdef CONFIG_CC_IS_GCC
-#pragma GCC optimize "tree-vectorize"
-#endif
-
-#define NO_TEMPLATE
-#include "../xor-8regs.c"
-
-__DO_XOR_BLOCKS(neon_inner, xor_8regs_2, xor_8regs_3, xor_8regs_4, xor_8regs_5);
diff --git a/lib/raid/xor/arm/xor-neon.h b/lib/raid/xor/arm/xor-neon.h
new file mode 100644
index 0000000000000..406e0356f05bb
--- /dev/null
+++ b/lib/raid/xor/arm/xor-neon.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+extern struct xor_block_template xor_block_arm4regs;
+extern struct xor_block_template xor_block_neon;
+
+void xor_gen_neon_inner(void *dest, void **srcs, unsigned int src_cnt,
+ unsigned int bytes);
diff --git a/lib/raid/xor/arm/xor_arch.h b/lib/raid/xor/arm/xor_arch.h
index 775ff835df656..f1ddb64fe62ab 100644
--- a/lib/raid/xor/arm/xor_arch.h
+++ b/lib/raid/xor/arm/xor_arch.h
@@ -3,12 +3,7 @@
* Copyright (C) 2001 Russell King
*/
#include <asm/neon.h>
-
-extern struct xor_block_template xor_block_arm4regs;
-extern struct xor_block_template xor_block_neon;
-
-void xor_gen_neon_inner(void *dest, void **srcs, unsigned int src_cnt,
- unsigned int bytes);
+#include "xor-neon.h"
static __always_inline void __init arch_xor_init(void)
{
diff --git a/lib/raid/xor/arm64/xor-eor3.c b/lib/raid/xor/arm64/xor-eor3.c
new file mode 100644
index 0000000000000..e44016c363f1c
--- /dev/null
+++ b/lib/raid/xor/arm64/xor-eor3.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/cache.h>
+#include <asm/neon-intrinsics.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
+#include "xor-neon.h"
+
+extern void __xor_eor3_2(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2);
+
+static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
+{
+ uint64x2_t res;
+
+ asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
+ "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
+ : "=w"(res) : "w"(p), "w"(q), "w"(r));
+ return res;
+}
+
+static void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3)
+{
+ uint64_t *dp1 = (uint64_t *)p1;
+ uint64_t *dp2 = (uint64_t *)p2;
+ uint64_t *dp3 = (uint64_t *)p3;
+
+ register uint64x2_t v0, v1, v2, v3;
+ long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+ do {
+ /* p1 ^= p2 ^ p3 */
+ v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+ vld1q_u64(dp3 + 0));
+ v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+ vld1q_u64(dp3 + 2));
+ v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+ vld1q_u64(dp3 + 4));
+ v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+ vld1q_u64(dp3 + 6));
+
+ /* store */
+ vst1q_u64(dp1 + 0, v0);
+ vst1q_u64(dp1 + 2, v1);
+ vst1q_u64(dp1 + 4, v2);
+ vst1q_u64(dp1 + 6, v3);
+
+ dp1 += 8;
+ dp2 += 8;
+ dp3 += 8;
+ } while (--lines > 0);
+}
+
+static void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4)
+{
+ uint64_t *dp1 = (uint64_t *)p1;
+ uint64_t *dp2 = (uint64_t *)p2;
+ uint64_t *dp3 = (uint64_t *)p3;
+ uint64_t *dp4 = (uint64_t *)p4;
+
+ register uint64x2_t v0, v1, v2, v3;
+ long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+ do {
+ /* p1 ^= p2 ^ p3 */
+ v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+ vld1q_u64(dp3 + 0));
+ v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+ vld1q_u64(dp3 + 2));
+ v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+ vld1q_u64(dp3 + 4));
+ v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+ vld1q_u64(dp3 + 6));
+
+ /* p1 ^= p4 */
+ v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
+ v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
+ v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
+ v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
+
+ /* store */
+ vst1q_u64(dp1 + 0, v0);
+ vst1q_u64(dp1 + 2, v1);
+ vst1q_u64(dp1 + 4, v2);
+ vst1q_u64(dp1 + 6, v3);
+
+ dp1 += 8;
+ dp2 += 8;
+ dp3 += 8;
+ dp4 += 8;
+ } while (--lines > 0);
+}
+
+static void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4,
+ const unsigned long * __restrict p5)
+{
+ uint64_t *dp1 = (uint64_t *)p1;
+ uint64_t *dp2 = (uint64_t *)p2;
+ uint64_t *dp3 = (uint64_t *)p3;
+ uint64_t *dp4 = (uint64_t *)p4;
+ uint64_t *dp5 = (uint64_t *)p5;
+
+ register uint64x2_t v0, v1, v2, v3;
+ long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+ do {
+ /* p1 ^= p2 ^ p3 */
+ v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+ vld1q_u64(dp3 + 0));
+ v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+ vld1q_u64(dp3 + 2));
+ v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+ vld1q_u64(dp3 + 4));
+ v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+ vld1q_u64(dp3 + 6));
+
+ /* p1 ^= p4 ^ p5 */
+ v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
+ v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
+ v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
+ v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));
+
+ /* store */
+ vst1q_u64(dp1 + 0, v0);
+ vst1q_u64(dp1 + 2, v1);
+ vst1q_u64(dp1 + 4, v2);
+ vst1q_u64(dp1 + 6, v3);
+
+ dp1 += 8;
+ dp2 += 8;
+ dp3 += 8;
+ dp4 += 8;
+ dp5 += 8;
+ } while (--lines > 0);
+}
+
+__DO_XOR_BLOCKS(eor3_inner, __xor_eor3_2, __xor_eor3_3, __xor_eor3_4,
+ __xor_eor3_5);
diff --git a/lib/raid/xor/xor-8regs.c b/lib/raid/xor/xor-8regs.c
index 1edaed8acffe6..46b3c8bdc27f3 100644
--- a/lib/raid/xor/xor-8regs.c
+++ b/lib/raid/xor/xor-8regs.c
@@ -93,11 +93,9 @@ xor_8regs_5(unsigned long bytes, unsigned long * __restrict p1,
} while (--lines > 0);
}
-#ifndef NO_TEMPLATE
DO_XOR_BLOCKS(8regs, xor_8regs_2, xor_8regs_3, xor_8regs_4, xor_8regs_5);
struct xor_block_template xor_block_8regs = {
.name = "8regs",
.xor_gen = xor_gen_8regs,
};
-#endif /* NO_TEMPLATE */
diff --git a/lib/raid/xor/arm64/xor-neon.c b/lib/raid/xor/xor-neon.c
index 97ef3cb924968..c7c3cf634e23a 100644
--- a/lib/raid/xor/arm64/xor-neon.c
+++ b/lib/raid/xor/xor-neon.c
@@ -4,12 +4,11 @@
* Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
*/
-#include <linux/cache.h>
-#include <asm/neon-intrinsics.h>
#include "xor_impl.h"
-#include "xor_arch.h"
#include "xor-neon.h"
+#include <asm/neon-intrinsics.h>
+
static void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
@@ -175,138 +174,6 @@ static void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
__DO_XOR_BLOCKS(neon_inner, __xor_neon_2, __xor_neon_3, __xor_neon_4,
__xor_neon_5);
-static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
-{
- uint64x2_t res;
-
- asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
- "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
- : "=w"(res) : "w"(p), "w"(q), "w"(r));
- return res;
-}
-
-static void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
- const unsigned long * __restrict p2,
- const unsigned long * __restrict p3)
-{
- uint64_t *dp1 = (uint64_t *)p1;
- uint64_t *dp2 = (uint64_t *)p2;
- uint64_t *dp3 = (uint64_t *)p3;
-
- register uint64x2_t v0, v1, v2, v3;
- long lines = bytes / (sizeof(uint64x2_t) * 4);
-
- do {
- /* p1 ^= p2 ^ p3 */
- v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
- vld1q_u64(dp3 + 0));
- v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
- vld1q_u64(dp3 + 2));
- v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
- vld1q_u64(dp3 + 4));
- v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
- vld1q_u64(dp3 + 6));
-
- /* store */
- vst1q_u64(dp1 + 0, v0);
- vst1q_u64(dp1 + 2, v1);
- vst1q_u64(dp1 + 4, v2);
- vst1q_u64(dp1 + 6, v3);
-
- dp1 += 8;
- dp2 += 8;
- dp3 += 8;
- } while (--lines > 0);
-}
-
-static void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
- const unsigned long * __restrict p2,
- const unsigned long * __restrict p3,
- const unsigned long * __restrict p4)
-{
- uint64_t *dp1 = (uint64_t *)p1;
- uint64_t *dp2 = (uint64_t *)p2;
- uint64_t *dp3 = (uint64_t *)p3;
- uint64_t *dp4 = (uint64_t *)p4;
-
- register uint64x2_t v0, v1, v2, v3;
- long lines = bytes / (sizeof(uint64x2_t) * 4);
-
- do {
- /* p1 ^= p2 ^ p3 */
- v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
- vld1q_u64(dp3 + 0));
- v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
- vld1q_u64(dp3 + 2));
- v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
- vld1q_u64(dp3 + 4));
- v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
- vld1q_u64(dp3 + 6));
-
- /* p1 ^= p4 */
- v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
- v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
- v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
- v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
-
- /* store */
- vst1q_u64(dp1 + 0, v0);
- vst1q_u64(dp1 + 2, v1);
- vst1q_u64(dp1 + 4, v2);
- vst1q_u64(dp1 + 6, v3);
-
- dp1 += 8;
- dp2 += 8;
- dp3 += 8;
- dp4 += 8;
- } while (--lines > 0);
-}
-
-static void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
- const unsigned long * __restrict p2,
- const unsigned long * __restrict p3,
- const unsigned long * __restrict p4,
- const unsigned long * __restrict p5)
-{
- uint64_t *dp1 = (uint64_t *)p1;
- uint64_t *dp2 = (uint64_t *)p2;
- uint64_t *dp3 = (uint64_t *)p3;
- uint64_t *dp4 = (uint64_t *)p4;
- uint64_t *dp5 = (uint64_t *)p5;
-
- register uint64x2_t v0, v1, v2, v3;
- long lines = bytes / (sizeof(uint64x2_t) * 4);
-
- do {
- /* p1 ^= p2 ^ p3 */
- v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
- vld1q_u64(dp3 + 0));
- v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
- vld1q_u64(dp3 + 2));
- v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
- vld1q_u64(dp3 + 4));
- v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
- vld1q_u64(dp3 + 6));
-
- /* p1 ^= p4 ^ p5 */
- v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
- v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
- v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
- v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));
-
- /* store */
- vst1q_u64(dp1 + 0, v0);
- vst1q_u64(dp1 + 2, v1);
- vst1q_u64(dp1 + 4, v2);
- vst1q_u64(dp1 + 6, v3);
-
- dp1 += 8;
- dp2 += 8;
- dp3 += 8;
- dp4 += 8;
- dp5 += 8;
- } while (--lines > 0);
-}
-
-__DO_XOR_BLOCKS(eor3_inner, __xor_neon_2, __xor_eor3_3, __xor_eor3_4,
- __xor_eor3_5);
+#ifdef CONFIG_ARM64
+extern typeof(__xor_neon_2) __xor_eor3_2 __alias(__xor_neon_2);
+#endif