lib/crypto: arm64: Move arch/arm64/lib/crypto/ into lib/crypto/

Move the contents of arch/arm64/lib/crypto/ into lib/crypto/arm64/. The new code organization makes a lot more sense for how this code actually works and is developed. In particular, it makes it possible to build each algorithm as a single module, with better inlining and dead code elimination. For a more detailed explanation, see the patchset which did this for the CRC library code: https://lore.kernel.org/r/20250607200454.73587-1-ebiggers@kernel.org/. Also see the patchset which did this for SHA-512: https://lore.kernel.org/linux-crypto/20250616014019.415791-1-ebiggers@kernel.org/ This is just a preparatory commit, which does the move to get the files into their new location but keeps them building the same way as before. Later commits will make the actual improvements to the way the arch-optimized code is integrated for each algorithm. Add a gitignore entry for the removed directory arch/arm64/lib/crypto/ so that people don't accidentally commit leftover generated files. Acked-by: Ard Biesheuvel <ardb@kernel.org> Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com> Reviewed-by: Sohil Mehta <sohil.mehta@intel.com> Link: https://lore.kernel.org/r/20250619191908.134235-3-ebiggers@kernel.org Signed-off-by: Eric Biggers <ebiggers@kernel.org>
author: Eric Biggers <ebiggers@kernel.org> 2025-06-19 12:19:01 -0700
committer: Eric Biggers <ebiggers@kernel.org> 2025-06-30 09:26:20 -0700
commit: 61f86c70cf419c826ee3b9c175d1e355844d8b45 (patch)
tree: 7c969eb081049f6a1b1ed583aca62db77e039211 /lib/crypto
parent: 4a32e5dc1dcfa8f1ed6ca33328e71dcb6196c09e (diff)
download: linux-61f86c70cf419c826ee3b9c175d1e355844d8b45.tar.gz
linux-61f86c70cf419c826ee3b9c175d1e355844d8b45.tar.bz2
linux-61f86c70cf419c826ee3b9c175d1e355844d8b45.zip
12 files changed, 2960 insertions, 2 deletions
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index e14bef8e87af..fdeb91bf0032 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -193,7 +193,7 @@ if ARM
 source "lib/crypto/arm/Kconfig"
 endif
 if ARM64
-source "arch/arm64/lib/crypto/Kconfig"
+source "lib/crypto/arm64/Kconfig"
 endif
 if MIPS
 source "arch/mips/lib/crypto/Kconfig"
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 5f2b81f82a85..19e9880c5d5f 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -87,7 +87,7 @@ endif
 
 ifeq ($(CONFIG_ARM64),y)
 libsha512-y += arm64/sha512-core.o
-$(obj)/arm64/sha512-core.S: $(src)/../../arch/arm64/lib/crypto/sha2-armv8.pl
+$(obj)/arm64/sha512-core.S: $(src)/arm64/sha2-armv8.pl
 	$(call cmd,perlasm_with_args)
 clean-files += arm64/sha512-core.S
 libsha512-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha512-ce-core.o
@@ -108,3 +108,4 @@ obj-$(CONFIG_CRYPTO_LIB_SM3)			+= libsm3.o
 libsm3-y					:= sm3.o
 
 obj-$(CONFIG_ARM) += arm/
+obj-$(CONFIG_ARM64) += arm64/
diff --git a/lib/crypto/arm64/.gitignore b/lib/crypto/arm64/.gitignore
index 670a4d97b568..f6c4e8ef80da 100644
--- a/lib/crypto/arm64/.gitignore
+++ b/lib/crypto/arm64/.gitignore
@@ -1,2 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
+poly1305-core.S
+sha256-core.S
 sha512-core.S
diff --git a/lib/crypto/arm64/Kconfig b/lib/crypto/arm64/Kconfig
new file mode 100644
index 000000000000..129a7685cb4c
--- /dev/null
+++ b/lib/crypto/arm64/Kconfig
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config CRYPTO_CHACHA20_NEON
+	tristate
+	depends on KERNEL_MODE_NEON
+	default CRYPTO_LIB_CHACHA
+	select CRYPTO_LIB_CHACHA_GENERIC
+	select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
+config CRYPTO_POLY1305_NEON
+	tristate
+	depends on KERNEL_MODE_NEON
+	default CRYPTO_LIB_POLY1305
+	select CRYPTO_ARCH_HAVE_LIB_POLY1305
+
+config CRYPTO_SHA256_ARM64
+	tristate
+	default CRYPTO_LIB_SHA256
+	select CRYPTO_ARCH_HAVE_LIB_SHA256
+	select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
diff --git a/lib/crypto/arm64/Makefile b/lib/crypto/arm64/Makefile
new file mode 100644
index 000000000000..946c09903711
--- /dev/null
+++ b/lib/crypto/arm64/Makefile
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
+
+obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
+poly1305-neon-y := poly1305-core.o poly1305-glue.o
+AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_block_init_arch
+AFLAGS_poly1305-core.o += -Dpoly1305_emit=poly1305_emit_arch
+
+obj-$(CONFIG_CRYPTO_SHA256_ARM64) += sha256-arm64.o
+sha256-arm64-y := sha256.o sha256-core.o
+sha256-arm64-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o
+
+quiet_cmd_perlasm = PERLASM $@
+      cmd_perlasm = $(PERL) $(<) void $(@)
+
+$(obj)/%-core.S: $(src)/%-armv8.pl
+	$(call cmd,perlasm)
+
+$(obj)/sha256-core.S: $(src)/sha2-armv8.pl
+	$(call cmd,perlasm)
+
+clean-files += poly1305-core.S sha256-core.S
diff --git a/lib/crypto/arm64/chacha-neon-core.S b/lib/crypto/arm64/chacha-neon-core.S
new file mode 100644
index 000000000000..80079586ecc7
--- /dev/null
+++ b/lib/crypto/arm64/chacha-neon-core.S
@@ -0,0 +1,805 @@
+/*
+ * ChaCha/HChaCha NEON helper functions
+ *
+ * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Originally based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/cache.h>
+
+	.text
+	.align		6
+
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers v0-v3.  It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * The round count is given in w3.
+ *
+ * Clobbers: w3, x10, v4, v12
+ */
+SYM_FUNC_START_LOCAL(chacha_permute)
+
+	adr_l		x10, ROT8
+	ld1		{v12.4s}, [x10]
+
+.Ldoubleround:
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	rev32		v3.8h, v3.8h
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #12
+	sri		v1.4s, v4.4s, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	tbl		v3.16b, {v3.16b}, v12.16b
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #7
+	sri		v1.4s, v4.4s, #25
+
+	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	ext		v1.16b, v1.16b, v1.16b, #4
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	ext		v2.16b, v2.16b, v2.16b, #8
+	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	ext		v3.16b, v3.16b, v3.16b, #12
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	rev32		v3.8h, v3.8h
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #12
+	sri		v1.4s, v4.4s, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	tbl		v3.16b, {v3.16b}, v12.16b
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #7
+	sri		v1.4s, v4.4s, #25
+
+	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	ext		v1.16b, v1.16b, v1.16b, #12
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	ext		v2.16b, v2.16b, v2.16b, #8
+	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	ext		v3.16b, v3.16b, v3.16b, #4
+
+	subs		w3, w3, #2
+	b.ne		.Ldoubleround
+
+	ret
+SYM_FUNC_END(chacha_permute)
+
+SYM_FUNC_START(chacha_block_xor_neon)
+	// x0: Input state matrix, s
+	// x1: 1 data block output, o
+	// x2: 1 data block input, i
+	// w3: nrounds
+
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
+
+	// x0..3 = s0..3
+	ld1		{v0.4s-v3.4s}, [x0]
+	ld1		{v8.4s-v11.4s}, [x0]
+
+	bl		chacha_permute
+
+	ld1		{v4.16b-v7.16b}, [x2]
+
+	// o0 = i0 ^ (x0 + s0)
+	add		v0.4s, v0.4s, v8.4s
+	eor		v0.16b, v0.16b, v4.16b
+
+	// o1 = i1 ^ (x1 + s1)
+	add		v1.4s, v1.4s, v9.4s
+	eor		v1.16b, v1.16b, v5.16b
+
+	// o2 = i2 ^ (x2 + s2)
+	add		v2.4s, v2.4s, v10.4s
+	eor		v2.16b, v2.16b, v6.16b
+
+	// o3 = i3 ^ (x3 + s3)
+	add		v3.4s, v3.4s, v11.4s
+	eor		v3.16b, v3.16b, v7.16b
+
+	st1		{v0.16b-v3.16b}, [x1]
+
+	ldp		x29, x30, [sp], #16
+	ret
+SYM_FUNC_END(chacha_block_xor_neon)
+
+SYM_FUNC_START(hchacha_block_neon)
+	// x0: Input state matrix, s
+	// x1: output (8 32-bit words)
+	// w2: nrounds
+
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
+
+	ld1		{v0.4s-v3.4s}, [x0]
+
+	mov		w3, w2
+	bl		chacha_permute
+
+	st1		{v0.4s}, [x1], #16
+	st1		{v3.4s}, [x1]
+
+	ldp		x29, x30, [sp], #16
+	ret
+SYM_FUNC_END(hchacha_block_neon)
+
+	a0		.req	w12
+	a1		.req	w13
+	a2		.req	w14
+	a3		.req	w15
+	a4		.req	w16
+	a5		.req	w17
+	a6		.req	w19
+	a7		.req	w20
+	a8		.req	w21
+	a9		.req	w22
+	a10		.req	w23
+	a11		.req	w24
+	a12		.req	w25
+	a13		.req	w26
+	a14		.req	w27
+	a15		.req	w28
+
+	.align		6
+SYM_FUNC_START(chacha_4block_xor_neon)
+	frame_push	10
+
+	// x0: Input state matrix, s
+	// x1: 4 data blocks output, o
+	// x2: 4 data blocks input, i
+	// w3: nrounds
+	// x4: byte count
+
+	adr_l		x10, .Lpermute
+	and		x5, x4, #63
+	add		x10, x10, x5
+
+	//
+	// This function encrypts four consecutive ChaCha blocks by loading
+	// the state matrix in NEON registers four times. The algorithm performs
+	// each operation on the corresponding word of each state matrix, hence
+	// requires no word shuffling. For final XORing step we transpose the
+	// matrix by interleaving 32- and then 64-bit words, which allows us to
+	// do XOR in NEON registers.
+	//
+	// At the same time, a fifth block is encrypted in parallel using
+	// scalar registers
+	//
+	adr_l		x9, CTRINC		// ... and ROT8
+	ld1		{v30.4s-v31.4s}, [x9]
+
+	// x0..15[0-3] = s0..3[0..3]
+	add		x8, x0, #16
+	ld4r		{ v0.4s- v3.4s}, [x0]
+	ld4r		{ v4.4s- v7.4s}, [x8], #16
+	ld4r		{ v8.4s-v11.4s}, [x8], #16
+	ld4r		{v12.4s-v15.4s}, [x8]
+
+	mov		a0, v0.s[0]
+	mov		a1, v1.s[0]
+	mov		a2, v2.s[0]
+	mov		a3, v3.s[0]
+	mov		a4, v4.s[0]
+	mov		a5, v5.s[0]
+	mov		a6, v6.s[0]
+	mov		a7, v7.s[0]
+	mov		a8, v8.s[0]
+	mov		a9, v9.s[0]
+	mov		a10, v10.s[0]
+	mov		a11, v11.s[0]
+	mov		a12, v12.s[0]
+	mov		a13, v13.s[0]
+	mov		a14, v14.s[0]
+	mov		a15, v15.s[0]
+
+	// x12 += counter values 1-4
+	add		v12.4s, v12.4s, v30.4s
+
+.Ldoubleround4:
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	add		v0.4s, v0.4s, v4.4s
+	  add		a0, a0, a4
+	add		v1.4s, v1.4s, v5.4s
+	  add		a1, a1, a5
+	add		v2.4s, v2.4s, v6.4s
+	  add		a2, a2, a6
+	add		v3.4s, v3.4s, v7.4s
+	  add		a3, a3, a7
+
+	eor		v12.16b, v12.16b, v0.16b
+	  eor		a12, a12, a0
+	eor		v13.16b, v13.16b, v1.16b
+	  eor		a13, a13, a1
+	eor		v14.16b, v14.16b, v2.16b
+	  eor		a14, a14, a2
+	eor		v15.16b, v15.16b, v3.16b
+	  eor		a15, a15, a3
+
+	rev32		v12.8h, v12.8h
+	  ror		a12, a12, #16
+	rev32		v13.8h, v13.8h
+	  ror		a13, a13, #16
+	rev32		v14.8h, v14.8h
+	  ror		a14, a14, #16
+	rev32		v15.8h, v15.8h
+	  ror		a15, a15, #16
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	add		v8.4s, v8.4s, v12.4s
+	  add		a8, a8, a12
+	add		v9.4s, v9.4s, v13.4s
+	  add		a9, a9, a13
+	add		v10.4s, v10.4s, v14.4s
+	  add		a10, a10, a14
+	add		v11.4s, v11.4s, v15.4s
+	  add		a11, a11, a15
+
+	eor		v16.16b, v4.16b, v8.16b
+	  eor		a4, a4, a8
+	eor		v17.16b, v5.16b, v9.16b
+	  eor		a5, a5, a9
+	eor		v18.16b, v6.16b, v10.16b
+	  eor		a6, a6, a10
+	eor		v19.16b, v7.16b, v11.16b
+	  eor		a7, a7, a11
+
+	shl		v4.4s, v16.4s, #12
+	shl		v5.4s, v17.4s, #12
+	shl		v6.4s, v18.4s, #12
+	shl		v7.4s, v19.4s, #12
+
+	sri		v4.4s, v16.4s, #20
+	  ror		a4, a4, #20
+	sri		v5.4s, v17.4s, #20
+	  ror		a5, a5, #20
+	sri		v6.4s, v18.4s, #20
+	  ror		a6, a6, #20
+	sri		v7.4s, v19.4s, #20
+	  ror		a7, a7, #20
+
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	add		v0.4s, v0.4s, v4.4s
+	  add		a0, a0, a4
+	add		v1.4s, v1.4s, v5.4s
+	  add		a1, a1, a5
+	add		v2.4s, v2.4s, v6.4s
+	  add		a2, a2, a6
+	add		v3.4s, v3.4s, v7.4s
+	  add		a3, a3, a7
+
+	eor		v12.16b, v12.16b, v0.16b
+	  eor		a12, a12, a0
+	eor		v13.16b, v13.16b, v1.16b
+	  eor		a13, a13, a1
+	eor		v14.16b, v14.16b, v2.16b
+	  eor		a14, a14, a2
+	eor		v15.16b, v15.16b, v3.16b
+	  eor		a15, a15, a3
+
+	tbl		v12.16b, {v12.16b}, v31.16b
+	  ror		a12, a12, #24
+	tbl		v13.16b, {v13.16b}, v31.16b
+	  ror		a13, a13, #24
+	tbl		v14.16b, {v14.16b}, v31.16b
+	  ror		a14, a14, #24
+	tbl		v15.16b, {v15.16b}, v31.16b
+	  ror		a15, a15, #24
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	add		v8.4s, v8.4s, v12.4s
+	  add		a8, a8, a12
+	add		v9.4s, v9.4s, v13.4s
+	  add		a9, a9, a13
+	add		v10.4s, v10.4s, v14.4s
+	  add		a10, a10, a14
+	add		v11.4s, v11.4s, v15.4s
+	  add		a11, a11, a15
+
+	eor		v16.16b, v4.16b, v8.16b
+	  eor		a4, a4, a8
+	eor		v17.16b, v5.16b, v9.16b
+	  eor		a5, a5, a9
+	eor		v18.16b, v6.16b, v10.16b
+	  eor		a6, a6, a10
+	eor		v19.16b, v7.16b, v11.16b
+	  eor		a7, a7, a11
+
+	shl		v4.4s, v16.4s, #7
+	shl		v5.4s, v17.4s, #7
+	shl		v6.4s, v18.4s, #7
+	shl		v7.4s, v19.4s, #7
+
+	sri		v4.4s, v16.4s, #25
+	  ror		a4, a4, #25
+	sri		v5.4s, v17.4s, #25
+	  ror		a5, a5, #25
+	sri		v6.4s, v18.4s, #25
+	 ror		a6, a6, #25
+	sri		v7.4s, v19.4s, #25
+	  ror		a7, a7, #25
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	add		v0.4s, v0.4s, v5.4s
+	  add		a0, a0, a5
+	add		v1.4s, v1.4s, v6.4s
+	  add		a1, a1, a6
+	add		v2.4s, v2.4s, v7.4s
+	  add		a2, a2, a7
+	add		v3.4s, v3.4s, v4.4s
+	  add		a3, a3, a4
+
+	eor		v15.16b, v15.16b, v0.16b
+	  eor		a15, a15, a0
+	eor		v12.16b, v12.16b, v1.16b
+	  eor		a12, a12, a1
+	eor		v13.16b, v13.16b, v2.16b
+	  eor		a13, a13, a2
+	eor		v14.16b, v14.16b, v3.16b
+	  eor		a14, a14, a3
+
+	rev32		v15.8h, v15.8h
+	  ror		a15, a15, #16
+	rev32		v12.8h, v12.8h
+	  ror		a12, a12, #16
+	rev32		v13.8h, v13.8h
+	  ror		a13, a13, #16
+	rev32		v14.8h, v14.8h
+	  ror		a14, a14, #16
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	add		v10.4s, v10.4s, v15.4s
+	  add		a10, a10, a15
+	add		v11.4s, v11.4s, v12.4s
+	  add		a11, a11, a12
+	add		v8.4s, v8.4s, v13.4s
+	  add		a8, a8, a13
+	add		v9.4s, v9.4s, v14.4s
+	  add		a9, a9, a14
+
+	eor		v16.16b, v5.16b, v10.16b
+	  eor		a5, a5, a10
+	eor		v17.16b, v6.16b, v11.16b
+	  eor		a6, a6, a11
+	eor		v18.16b, v7.16b, v8.16b
+	  eor		a7, a7, a8
+	eor		v19.16b, v4.16b, v9.16b
+	  eor		a4, a4, a9
+
+	shl		v5.4s, v16.4s, #12
+	shl		v6.4s, v17.4s, #12
+	shl		v7.4s, v18.4s, #12
+	shl		v4.4s, v19.4s, #12
+
+	sri		v5.4s, v16.4s, #20
+	  ror		a5, a5, #20
+	sri		v6.4s, v17.4s, #20
+	  ror		a6, a6, #20
+	sri		v7.4s, v18.4s, #20
+	  ror		a7, a7, #20
+	sri		v4.4s, v19.4s, #20
+	  ror		a4, a4, #20
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	add		v0.4s, v0.4s, v5.4s
+	  add		a0, a0, a5
+	add		v1.4s, v1.4s, v6.4s
+	  add		a1, a1, a6
+	add		v2.4s, v2.4s, v7.4s
+	  add		a2, a2, a7
+	add		v3.4s, v3.4s, v4.4s
+	  add		a3, a3, a4
+
+	eor		v15.16b, v15.16b, v0.16b
+	  eor		a15, a15, a0
+	eor		v12.16b, v12.16b, v1.16b
+	  eor		a12, a12, a1
+	eor		v13.16b, v13.16b, v2.16b
+	  eor		a13, a13, a2
+	eor		v14.16b, v14.16b, v3.16b
+	  eor		a14, a14, a3
+
+	tbl		v15.16b, {v15.16b}, v31.16b
+	  ror		a15, a15, #24
+	tbl		v12.16b, {v12.16b}, v31.16b
+	  ror		a12, a12, #24
+	tbl		v13.16b, {v13.16b}, v31.16b
+	  ror		a13, a13, #24
+	tbl		v14.16b, {v14.16b}, v31.16b
+	  ror		a14, a14, #24
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	add		v10.4s, v10.4s, v15.4s
+	  add		a10, a10, a15
+	add		v11.4s, v11.4s, v12.4s
+	  add		a11, a11, a12
+	add		v8.4s, v8.4s, v13.4s
+	  add		a8, a8, a13
+	add		v9.4s, v9.4s, v14.4s
+	  add		a9, a9, a14
+
+	eor		v16.16b, v5.16b, v10.16b
+	  eor		a5, a5, a10
+	eor		v17.16b, v6.16b, v11.16b
+	  eor		a6, a6, a11
+	eor		v18.16b, v7.16b, v8.16b
+	  eor		a7, a7, a8
+	eor		v19.16b, v4.16b, v9.16b
+	  eor		a4, a4, a9
+
+	shl		v5.4s, v16.4s, #7
+	shl		v6.4s, v17.4s, #7
+	shl		v7.4s, v18.4s, #7
+	shl		v4.4s, v19.4s, #7
+
+	sri		v5.4s, v16.4s, #25
+	  ror		a5, a5, #25
+	sri		v6.4s, v17.4s, #25
+	  ror		a6, a6, #25
+	sri		v7.4s, v18.4s, #25
+	  ror		a7, a7, #25
+	sri		v4.4s, v19.4s, #25
+	  ror		a4, a4, #25
+
+	subs		w3, w3, #2
+	b.ne		.Ldoubleround4
+
+	ld4r		{v16.4s-v19.4s}, [x0], #16
+	ld4r		{v20.4s-v23.4s}, [x0], #16
+
+	// x12 += counter values 0-3
+	add		v12.4s, v12.4s, v30.4s
+
+	// x0[0-3] += s0[0]
+	// x1[0-3] += s0[1]
+	// x2[0-3] += s0[2]
+	// x3[0-3] += s0[3]
+	add		v0.4s, v0.4s, v16.4s
+	  mov		w6, v16.s[0]
+	  mov		w7, v17.s[0]
+	add		v1.4s, v1.4s, v17.4s
+	  mov		w8, v18.s[0]
+	  mov		w9, v19.s[0]
+	add		v2.4s, v2.4s, v18.4s
+	  add		a0, a0, w6
+	  add		a1, a1, w7
+	add		v3.4s, v3.4s, v19.4s
+	  add		a2, a2, w8
+	  add		a3, a3, w9
+CPU_BE(	  rev		a0, a0		)
+CPU_BE(	  rev		a1, a1		)
+CPU_BE(	  rev		a2, a2		)
+CPU_BE(	  rev		a3, a3		)
+
+	ld4r		{v24.4s-v27.4s}, [x0], #16
+	ld4r		{v28.4s-v31.4s}, [x0]
+
+	// x4[0-3] += s1[0]
+	// x5[0-3] += s1[1]
+	// x6[0-3] += s1[2]
+	// x7[0-3] += s1[3]
+	add		v4.4s, v4.4s, v20.4s
+	  mov		w6, v20.s[0]
+	  mov		w7, v21.s[0]
+	add		v5.4s, v5.4s, v21.4s
+	  mov		w8, v22.s[0]
+	  mov		w9, v23.s[0]
+	add		v6.4s, v6.4s, v22.4s
+	  add		a4, a4, w6
+	  add		a5, a5, w7
+	add		v7.4s, v7.4s, v23.4s
+	  add		a6, a6, w8
+	  add		a7, a7, w9
+CPU_BE(	  rev		a4, a4		)
+CPU_BE(	  rev		a5, a5		)
+CPU_BE(	  rev		a6, a6		)
+CPU_BE(	  rev		a7, a7		)
+
+	// x8[0-3] += s2[0]
+	// x9[0-3] += s2[1]
+	// x10[0-3] += s2[2]
+	// x11[0-3] += s2[3]
+	add		v8.4s, v8.4s, v24.4s
+	  mov		w6, v24.s[0]
+	  mov		w7, v25.s[0]
+	add		v9.4s, v9.4s, v25.4s
+	  mov		w8, v26.s[0]
+	  mov		w9, v27.s[0]
+	add		v10.4s, v10.4s, v26.4s
+	  add		a8, a8, w6
+	  add		a9, a9, w7
+	add		v11.4s, v11.4s, v27.4s
+	  add		a10, a10, w8
+	  add		a11, a11, w9
+CPU_BE(	  rev		a8, a8		)
+CPU_BE(	  rev		a9, a9		)
+CPU_BE(	  rev		a10, a10	)
+CPU_BE(	  rev		a11, a11	)
+
+	// x12[0-3] += s3[0]
+	// x13[0-3] += s3[1]
+	// x14[0-3] += s3[2]
+	// x15[0-3] += s3[3]
+	add		v12.4s, v12.4s, v28.4s
+	  mov		w6, v28.s[0]
+	  mov		w7, v29.s[0]
+	add		v13.4s, v13.4s, v29.4s
+	  mov		w8, v30.s[0]
+	  mov		w9, v31.s[0]
+	add		v14.4s, v14.4s, v30.4s
+	  add		a12, a12, w6
+	  add		a13, a13, w7
+	add		v15.4s, v15.4s, v31.4s
+	  add		a14, a14, w8
+	  add		a15, a15, w9
+CPU_BE(	  rev		a12, a12	)
+CPU_BE(	  rev		a13, a13	)
+CPU_BE(	  rev		a14, a14	)
+CPU_BE(	  rev		a15, a15	)
+
+	// interleave 32-bit words in state n, n+1
+	  ldp		w6, w7, [x2], #64
+	zip1		v16.4s, v0.4s, v1.4s
+	  ldp		w8, w9, [x2, #-56]
+	  eor		a0, a0, w6
+	zip2		v17.4s, v0.4s, v1.4s
+	  eor		a1, a1, w7
+	zip1		v18.4s, v2.4s, v3.4s
+	  eor		a2, a2, w8
+	zip2		v19.4s, v2.4s, v3.4s
+	  eor		a3, a3, w9
+	  ldp		w6, w7, [x2, #-48]
+	zip1		v20.4s, v4.4s, v5.4s
+	  ldp		w8, w9, [x2, #-40]
+	  eor		a4, a4, w6
+	zip2		v21.4s, v4.4s, v5.4s
+	  eor		a5, a5, w7
+	zip1		v22.4s, v6.4s, v7.4s
+	  eor		a6, a6, w8
+	zip2		v23.4s, v6.4s, v7.4s
+	  eor		a7, a7, w9
+	  ldp		w6, w7, [x2, #-32]
+	zip1		v24.4s, v8.4s, v9.4s
+	  ldp		w8, w9, [x2, #-24]
+	  eor		a8, a8, w6
+	zip2		v25.4s, v8.4s, v9.4s
+	  eor		a9, a9, w7
+	zip1		v26.4s, v10.4s, v11.4s
+	  eor		a10, a10, w8
+	zip2		v27.4s, v10.4s, v11.4s
+	  eor		a11, a11, w9
+	  ldp		w6, w7, [x2, #-16]
+	zip1		v28.4s, v12.4s, v13.4s
+	  ldp		w8, w9, [x2, #-8]
+	  eor		a12, a12, w6
+	zip2		v29.4s, v12.4s, v13.4s
+	  eor		a13, a13, w7
+	zip1		v30.4s, v14.4s, v15.4s
+	  eor		a14, a14, w8
+	zip2		v31.4s, v14.4s, v15.4s
+	  eor		a15, a15, w9
+
+	add		x3, x2, x4
+	sub		x3, x3, #128		// start of last block
+
+	subs		x5, x4, #128
+	csel		x2, x2, x3, ge
+
+	// interleave 64-bit words in state n, n+2
+	zip1		v0.2d, v16.2d, v18.2d
+	zip2		v4.2d, v16.2d, v18.2d
+	  stp		a0, a1, [x1], #64
+	zip1		v8.2d, v17.2d, v19.2d
+	zip2		v12.2d, v17.2d, v19.2d
+	  stp		a2, a3, [x1, #-56]
+
+	subs		x6, x4, #192
+	ld1		{v16.16b-v19.16b}, [x2], #64
+	csel		x2, x2, x3, ge
+
+	zip1		v1.2d, v20.2d, v22.2d
+	zip2		v5.2d, v20.2d, v22.2d
+	  stp		a4, a5, [x1, #-48]
+	zip1		v9.2d, v21.2d, v23.2d
+	zip2		v13.2d, v21.2d, v23.2d
+	  stp		a6, a7, [x1, #-40]
+
+	subs		x7, x4, #256
+	ld1		{v20.16b-v23.16b}, [x2], #64
+	csel		x2, x2, x3, ge
+
+	zip1		v2.2d, v24.2d, v26.2d
+	zip2		v6.2d, v24.2d, v26.2d
+	  stp		a8, a9, [x1, #-32]
+	zip1		v10.2d, v25.2d, v27.2d
+	zip2		v14.2d, v25.2d, v27.2d
+	  stp		a10, a11, [x1, #-24]
+
+	subs		x8, x4, #320
+	ld1		{v24.16b-v27.16b}, [x2], #64
+	csel		x2, x2, x3, ge
+
+	zip1		v3.2d, v28.2d, v30.2d
+	zip2		v7.2d, v28.2d, v30.2d
+	  stp		a12, a13, [x1, #-16]
+	zip1		v11.2d, v29.2d, v31.2d
+	zip2		v15.2d, v29.2d, v31.2d
+	  stp		a14, a15, [x1, #-8]
+
+	tbnz		x5, #63, .Lt128
+	ld1		{v28.16b-v31.16b}, [x2]
+
+	// xor with corresponding input, write to output
+	eor		v16.16b, v16.16b, v0.16b
+	eor		v17.16b, v17.16b, v1.16b
+	eor		v18.16b, v18.16b, v2.16b
+	eor		v19.16b, v19.16b, v3.16b
+
+	tbnz		x6, #63, .Lt192
+
+	eor		v20.16b, v20.16b, v4.16b
+	eor		v21.16b, v21.16b, v5.16b
+	eor		v22.16b, v22.16b, v6.16b
+	eor		v23.16b, v23.16b, v7.16b
+
+	st1		{v16.16b-v19.16b}, [x1], #64
+	tbnz		x7, #63, .Lt256
+
+	eor		v24.16b, v24.16b, v8.16b
+	eor		v25.16b, v25.16b, v9.16b
+	eor		v26.16b, v26.16b, v10.16b
+	eor		v27.16b, v27.16b, v11.16b
+
+	st1		{v20.16b-v23.16b}, [x1], #64
+	tbnz		x8, #63, .Lt320
+
+	eor		v28.16b, v28.16b, v12.16b
+	eor		v29.16b, v29.16b, v13.16b
+	eor		v30.16b, v30.16b, v14.16b
+	eor		v31.16b, v31.16b, v15.16b
+
+	st1		{v24.16b-v27.16b}, [x1], #64
+	st1		{v28.16b-v31.16b}, [x1]
+
+.Lout:	frame_pop
+	ret
+
+	// fewer than 192 bytes of in/output
+.Lt192:	cbz		x5, 1f				// exactly 128 bytes?
+	ld1		{v28.16b-v31.16b}, [x10]
+	add		x5, x5, x1
+	tbl		v28.16b, {v4.16b-v7.16b}, v28.16b
+	tbl		v29.16b, {v4.16b-v7.16b}, v29.16b
+	tbl		v30.16b, {v4.16b-v7.16b}, v30.16b
+	tbl		v31.16b, {v4.16b-v7.16b}, v31.16b
+
+0:	eor		v20.16b, v20.16b, v28.16b
+	eor		v21.16b, v21.16b, v29.16b
+	eor		v22.16b, v22.16b, v30.16b
+	eor		v23.16b, v23.16b, v31.16b
+	st1		{v20.16b-v23.16b}, [x5]		// overlapping stores
+1:	st1		{v16.16b-v19.16b}, [x1]
+	b		.Lout
+
+	// fewer than 128 bytes of in/output
+.Lt128:	ld1		{v28.16b-v31.16b}, [x10]
+	add		x5, x5, x1
+	sub		x1, x1, #64
+	tbl		v28.16b, {v0.16b-v3.16b}, v28.16b
+	tbl		v29.16b, {v0.16b-v3.16b}, v29.16b
+	tbl		v30.16b, {v0.16b-v3.16b}, v30.16b
+	tbl		v31.16b, {v0.16b-v3.16b}, v31.16b
+	ld1		{v16.16b-v19.16b}, [x1]		// reload first output block
+	b		0b
+
+	// fewer than 256 bytes of in/output
+.Lt256:	cbz		x6, 2f				// exactly 192 bytes?
+	ld1		{v4.16b-v7.16b}, [x10]
+	add		x6, x6, x1
+	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
+	tbl		v1.16b, {v8.16b-v11.16b}, v5.16b
+	tbl		v2.16b, {v8.16b-v11.16b}, v6.16b
+	tbl		v3.16b, {v8.16b-v11.16b}, v7.16b
+
+	eor		v28.16b, v28.16b, v0.16b
+	eor		v29.16b, v29.16b, v1.16b
+	eor		v30.16b, v30.16b, v2.16b
+	eor		v31.16b, v31.16b, v3.16b
+	st1		{v28.16b-v31.16b}, [x6]		// overlapping stores
+2:	st1		{v20.16b-v23.16b}, [x1]
+	b		.Lout
+
+	// fewer than 320 bytes of in/output
+.Lt320:	cbz		x7, 3f				// exactly 256 bytes?
+	ld1		{v4.16b-v7.16b}, [x10]
+	add		x7, x7, x1
+	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
+	tbl		v1.16b, {v12.16b-v15.16b}, v5.16b
+	tbl		v2.16b, {v12.16b-v15.16b}, v6.16b
+	tbl		v3.16b, {v12.16b-v15.16b}, v7.16b
+
+	eor		v28.16b, v28.16b, v0.16b
+	eor		v29.16b, v29.16b, v1.16b
+	eor		v30.16b, v30.16b, v2.16b
+	eor		v31.16b, v31.16b, v3.16b
+	st1		{v28.16b-v31.16b}, [x7]		// overlapping stores
+3:	st1		{v24.16b-v27.16b}, [x1]
+	b		.Lout
+SYM_FUNC_END(chacha_4block_xor_neon)
+
+	.section	".rodata", "a", %progbits
+	.align		L1_CACHE_SHIFT
+.Lpermute:
+	.set		.Li, 0
+	.rept		128
+	.byte		(.Li - 64)
+	.set		.Li, .Li + 1
+	.endr
+
+CTRINC:	.word		1, 2, 3, 4
+ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
diff --git a/lib/crypto/arm64/chacha-neon-glue.c b/lib/crypto/arm64/chacha-neon-glue.c
new file mode 100644
index 000000000000..d0188f974ca5
--- /dev/null
+++ b/lib/crypto/arm64/chacha-neon-glue.c
@@ -0,0 +1,119 @@
+/*
+ * ChaCha and HChaCha functions (ARM64 optimized)
+ *
+ * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/chacha.h>
+#include <crypto/internal/simd.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
+				      u8 *dst, const u8 *src, int nrounds);
+asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
+				       u8 *dst, const u8 *src,
+				       int nrounds, int bytes);
+asmlinkage void hchacha_block_neon(const struct chacha_state *state,
+				   u32 out[HCHACHA_OUT_WORDS], int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
+			  int bytes, int nrounds)
+{
+	while (bytes > 0) {
+		int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
+
+		if (l <= CHACHA_BLOCK_SIZE) {
+			u8 buf[CHACHA_BLOCK_SIZE];
+
+			memcpy(buf, src, l);
+			chacha_block_xor_neon(state, buf, buf, nrounds);
+			memcpy(dst, buf, l);
+			state->x[12] += 1;
+			break;
+		}
+		chacha_4block_xor_neon(state, dst, src, nrounds, l);
+		bytes -= l;
+		src += l;
+		dst += l;
+		state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
+	}
+}
+
+void hchacha_block_arch(const struct chacha_state *state,
+			u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+	if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) {
+		hchacha_block_generic(state, out, nrounds);
+	} else {
+		kernel_neon_begin();
+		hchacha_block_neon(state, out, nrounds);
+		kernel_neon_end();
+	}
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
+		       unsigned int bytes, int nrounds)
+{
+	if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
+	    !crypto_simd_usable())
+		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+	do {
+		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+		kernel_neon_begin();
+		chacha_doneon(state, dst, src, todo, nrounds);
+		kernel_neon_end();
+
+		bytes -= todo;
+		src += todo;
+		dst += todo;
+	} while (bytes);
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+bool chacha_is_arch_optimized(void)
+{
+	return static_key_enabled(&have_neon);
+}
+EXPORT_SYMBOL(chacha_is_arch_optimized);
+
+static int __init chacha_simd_mod_init(void)
+{
+	if (cpu_have_named_feature(ASIMD))
+		static_branch_enable(&have_neon);
+	return 0;
+}
+subsys_initcall(chacha_simd_mod_init);
+
+static void __exit chacha_simd_mod_exit(void)
+{
+}
+module_exit(chacha_simd_mod_exit);
+
+MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM64 optimized)");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
diff --git a/lib/crypto/arm64/poly1305-armv8.pl b/lib/crypto/arm64/poly1305-armv8.pl
new file mode 100644
index 000000000000..22c9069c0650
--- /dev/null
+++ b/lib/crypto/arm64/poly1305-armv8.pl
@@ -0,0 +1,917 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
author	Eric Biggers <ebiggers@kernel.org>	2025-06-19 12:19:01 -0700
committer	Eric Biggers <ebiggers@kernel.org>	2025-06-30 09:26:20 -0700
commit	61f86c70cf419c826ee3b9c175d1e355844d8b45 (patch)
tree	7c969eb081049f6a1b1ed583aca62db77e039211 /lib/crypto
parent	4a32e5dc1dcfa8f1ed6ca33328e71dcb6196c09e (diff)
download	linux-61f86c70cf419c826ee3b9c175d1e355844d8b45.tar.gz linux-61f86c70cf419c826ee3b9c175d1e355844d8b45.tar.bz2 linux-61f86c70cf419c826ee3b9c175d1e355844d8b45.zip