summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-03-21 16:02:36 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-03-21 16:02:36 -0700
commit93e220a62da36f766b3188e76e234607e41488f9 (patch)
treed56d5609e4b290baa9b46a48b123ab9c4f23f073 /arch
parent5628b8de1228436d47491c662dc521bc138a3d43 (diff)
parent0e03b8fd29363f2df44e2a7a176d486de550757a (diff)
downloadlinux-93e220a62da36f766b3188e76e234607e41488f9.tar.gz
linux-93e220a62da36f766b3188e76e234607e41488f9.tar.bz2
linux-93e220a62da36f766b3188e76e234607e41488f9.zip
Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto updates from Herbert Xu: "API: - hwrng core now credits for low-quality RNG devices. Algorithms: - Optimisations for neon aes on arm/arm64. - Add accelerated crc32_be on arm64. - Add ffdheXYZ(dh) templates. - Disallow hmac keys < 112 bits in FIPS mode. - Add AVX assembly implementation for sm3 on x86. Drivers: - Add missing local_bh_disable calls for crypto_engine callback. - Ensure BH is disabled in crypto_engine callback path. - Fix zero length DMA mappings in ccree. - Add synchronization between mailbox accesses in octeontx2. - Add Xilinx SHA3 driver. - Add support for the TDES IP available on sama7g5 SoC in atmel" * 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (137 commits) crypto: xilinx - Turn SHA into a tristate and allow COMPILE_TEST MAINTAINERS: update HPRE/SEC2/TRNG driver maintainers list crypto: dh - Remove the unused function dh_safe_prime_dh_alg() hwrng: nomadik - Change clk_disable to clk_disable_unprepare crypto: arm64 - cleanup comments crypto: qat - fix initialization of pfvf rts_map_msg structures crypto: qat - fix initialization of pfvf cap_msg structures crypto: qat - remove unneeded assignment crypto: qat - disable registration of algorithms crypto: hisilicon/qm - fix memset during queues clearing crypto: xilinx: prevent probing on non-xilinx hardware crypto: marvell/octeontx - Use swap() instead of open coding it crypto: ccree - Fix use after free in cc_cipher_exit() crypto: ccp - ccp_dmaengine_unregister release dma channels crypto: octeontx2 - fix missing unlock hwrng: cavium - fix NULL but dereferenced coccicheck error crypto: cavium/nitrox - don't cast parameter in bit operations crypto: vmx - add missing dependencies MAINTAINERS: Add maintainer for Xilinx ZynqMP SHA3 driver crypto: xilinx - Add Xilinx SHA3 driver ...
Diffstat (limited to 'arch')
-rw-r--r--arch/alpha/include/asm/xor.h53
-rw-r--r--arch/arm/crypto/aes-neonbs-core.S105
-rw-r--r--arch/arm/crypto/aes-neonbs-glue.c35
-rw-r--r--arch/arm/include/asm/xor.h42
-rw-r--r--arch/arm/lib/xor-neon.c12
-rw-r--r--arch/arm64/crypto/Kconfig2
-rw-r--r--arch/arm64/crypto/aes-glue.c22
-rw-r--r--arch/arm64/crypto/aes-modes.S18
-rw-r--r--arch/arm64/crypto/aes-neonbs-core.S264
-rw-r--r--arch/arm64/crypto/aes-neonbs-glue.c97
-rw-r--r--arch/arm64/crypto/sha3-ce-glue.c2
-rw-r--r--arch/arm64/crypto/sha512-armv8.pl2
-rw-r--r--arch/arm64/crypto/sha512-ce-glue.c2
-rw-r--r--arch/arm64/crypto/sm3-ce-glue.c28
-rw-r--r--arch/arm64/include/asm/xor.h21
-rw-r--r--arch/arm64/lib/crc32.S87
-rw-r--r--arch/arm64/lib/xor-neon.c46
-rw-r--r--arch/ia64/include/asm/xor.h21
-rw-r--r--arch/powerpc/include/asm/xor_altivec.h25
-rw-r--r--arch/powerpc/lib/xor_vmx.c28
-rw-r--r--arch/powerpc/lib/xor_vmx.h27
-rw-r--r--arch/powerpc/lib/xor_vmx_glue.c32
-rw-r--r--arch/s390/lib/xor.c21
-rw-r--r--arch/sparc/include/asm/xor_32.h21
-rw-r--r--arch/sparc/include/asm/xor_64.h42
-rw-r--r--arch/x86/crypto/Makefile3
-rw-r--r--arch/x86/crypto/aes_ctrby8_avx-x86_64.S63
-rw-r--r--arch/x86/crypto/blowfish_glue.c12
-rw-r--r--arch/x86/crypto/des3_ede_glue.c8
-rw-r--r--arch/x86/crypto/sm3-avx-asm_64.S517
-rw-r--r--arch/x86/crypto/sm3_avx_glue.c134
-rw-r--r--arch/x86/include/asm/xor.h42
-rw-r--r--arch/x86/include/asm/xor_32.h42
-rw-r--r--arch/x86/include/asm/xor_avx.h21
34 files changed, 1284 insertions, 613 deletions
diff --git a/arch/alpha/include/asm/xor.h b/arch/alpha/include/asm/xor.h
index 5aeb4fb3cb7c..e0de0c233ab9 100644
--- a/arch/alpha/include/asm/xor.h
+++ b/arch/alpha/include/asm/xor.h
@@ -5,24 +5,43 @@
* Optimized RAID-5 checksumming functions for alpha EV5 and EV6
*/
-extern void xor_alpha_2(unsigned long, unsigned long *, unsigned long *);
-extern void xor_alpha_3(unsigned long, unsigned long *, unsigned long *,
- unsigned long *);
-extern void xor_alpha_4(unsigned long, unsigned long *, unsigned long *,
- unsigned long *, unsigned long *);
-extern void xor_alpha_5(unsigned long, unsigned long *, unsigned long *,
- unsigned long *, unsigned long *, unsigned long *);
+extern void
+xor_alpha_2(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2);
+extern void
+xor_alpha_3(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3);
+extern void
+xor_alpha_4(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4);
+extern void
+xor_alpha_5(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4,
+ const unsigned long * __restrict p5);
-extern void xor_alpha_prefetch_2(unsigned long, unsigned long *,
- unsigned long *);
-extern void xor_alpha_prefetch_3(unsigned long, unsigned long *,
- unsigned long *, unsigned long *);
-extern void xor_alpha_prefetch_4(unsigned long, unsigned long *,
- unsigned long *, unsigned long *,
- unsigned long *);
-extern void xor_alpha_prefetch_5(unsigned long, unsigned long *,
- unsigned long *, unsigned long *,
- unsigned long *, unsigned long *);
+extern void
+xor_alpha_prefetch_2(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2);
+extern void
+xor_alpha_prefetch_3(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3);
+extern void
+xor_alpha_prefetch_4(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4);
+extern void
+xor_alpha_prefetch_5(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4,
+ const unsigned long * __restrict p5);
asm(" \n\
.text \n\
diff --git a/arch/arm/crypto/aes-neonbs-core.S b/arch/arm/crypto/aes-neonbs-core.S
index 7d0cc7f226a5..7b61032f29fa 100644
--- a/arch/arm/crypto/aes-neonbs-core.S
+++ b/arch/arm/crypto/aes-neonbs-core.S
@@ -758,29 +758,24 @@ ENTRY(aesbs_cbc_decrypt)
ENDPROC(aesbs_cbc_decrypt)
.macro next_ctr, q
- vmov.32 \q\()h[1], r10
+ vmov \q\()h, r9, r10
adds r10, r10, #1
- vmov.32 \q\()h[0], r9
adcs r9, r9, #0
- vmov.32 \q\()l[1], r8
+ vmov \q\()l, r7, r8
adcs r8, r8, #0
- vmov.32 \q\()l[0], r7
adc r7, r7, #0
vrev32.8 \q, \q
.endm
/*
* aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
- * int rounds, int blocks, u8 ctr[], u8 final[])
+ * int rounds, int bytes, u8 ctr[])
*/
ENTRY(aesbs_ctr_encrypt)
mov ip, sp
push {r4-r10, lr}
- ldm ip, {r5-r7} // load args 4-6
- teq r7, #0
- addne r5, r5, #1 // one extra block if final != 0
-
+ ldm ip, {r5, r6} // load args 4-5
vld1.8 {q0}, [r6] // load counter
vrev32.8 q1, q0
vmov r9, r10, d3
@@ -792,20 +787,19 @@ ENTRY(aesbs_ctr_encrypt)
adc r7, r7, #0
99: vmov q1, q0
+ sub lr, r5, #1
vmov q2, q0
+ adr ip, 0f
vmov q3, q0
+ and lr, lr, #112
vmov q4, q0
+ cmp r5, #112
vmov q5, q0
+ sub ip, ip, lr, lsl #1
vmov q6, q0
+ add ip, ip, lr, lsr #2
vmov q7, q0
-
- adr ip, 0f
- sub lr, r5, #1
- and lr, lr, #7
- cmp r5, #8
- sub ip, ip, lr, lsl #5
- sub ip, ip, lr, lsl #2
- movlt pc, ip // computed goto if blocks < 8
+ movle pc, ip // computed goto if bytes < 112
next_ctr q1
next_ctr q2
@@ -820,12 +814,14 @@ ENTRY(aesbs_ctr_encrypt)
bl aesbs_encrypt8
adr ip, 1f
- and lr, r5, #7
- cmp r5, #8
- movgt r4, #0
- ldrle r4, [sp, #40] // load final in the last round
- sub ip, ip, lr, lsl #2
- movlt pc, ip // computed goto if blocks < 8
+ sub lr, r5, #1
+ cmp r5, #128
+ bic lr, lr, #15
+ ands r4, r5, #15 // preserves C flag
+ teqcs r5, r5 // set Z flag if not last iteration
+ sub ip, ip, lr, lsr #2
+ rsb r4, r4, #16
+ movcc pc, ip // computed goto if bytes < 128
vld1.8 {q8}, [r1]!
vld1.8 {q9}, [r1]!
@@ -834,46 +830,70 @@ ENTRY(aesbs_ctr_encrypt)
vld1.8 {q12}, [r1]!
vld1.8 {q13}, [r1]!
vld1.8 {q14}, [r1]!
- teq r4, #0 // skip last block if 'final'
-1: bne 2f
+1: subne r1, r1, r4
vld1.8 {q15}, [r1]!
-2: adr ip, 3f
- cmp r5, #8
- sub ip, ip, lr, lsl #3
- movlt pc, ip // computed goto if blocks < 8
+ add ip, ip, #2f - 1b
veor q0, q0, q8
- vst1.8 {q0}, [r0]!
veor q1, q1, q9
- vst1.8 {q1}, [r0]!
veor q4, q4, q10
- vst1.8 {q4}, [r0]!
veor q6, q6, q11
- vst1.8 {q6}, [r0]!
veor q3, q3, q12
- vst1.8 {q3}, [r0]!
veor q7, q7, q13
- vst1.8 {q7}, [r0]!
veor q2, q2, q14
+ bne 3f
+ veor q5, q5, q15
+
+ movcc pc, ip // computed goto if bytes < 128
+
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q1}, [r0]!
+ vst1.8 {q4}, [r0]!
+ vst1.8 {q6}, [r0]!
+ vst1.8 {q3}, [r0]!
+ vst1.8 {q7}, [r0]!
vst1.8 {q2}, [r0]!
- teq r4, #0 // skip last block if 'final'
- W(bne) 5f
-3: veor q5, q5, q15
+2: subne r0, r0, r4
vst1.8 {q5}, [r0]!
-4: next_ctr q0
+ next_ctr q0
- subs r5, r5, #8
+ subs r5, r5, #128
bgt 99b
vst1.8 {q0}, [r6]
pop {r4-r10, pc}
-5: vst1.8 {q5}, [r4]
- b 4b
+3: adr lr, .Lpermute_table + 16
+ cmp r5, #16 // Z flag remains cleared
+ sub lr, lr, r4
+ vld1.8 {q8-q9}, [lr]
+ vtbl.8 d16, {q5}, d16
+ vtbl.8 d17, {q5}, d17
+ veor q5, q8, q15
+ bcc 4f // have to reload prev if R5 < 16
+ vtbx.8 d10, {q2}, d18
+ vtbx.8 d11, {q2}, d19
+ mov pc, ip // branch back to VST sequence
+
+4: sub r0, r0, r4
+ vshr.s8 q9, q9, #7 // create mask for VBIF
+ vld1.8 {q8}, [r0] // reload
+ vbif q5, q8, q9
+ vst1.8 {q5}, [r0]
+ pop {r4-r10, pc}
ENDPROC(aesbs_ctr_encrypt)
+ .align 6
+.Lpermute_table:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
.macro next_tweak, out, in, const, tmp
vshr.s64 \tmp, \in, #63
vand \tmp, \tmp, \const
@@ -888,6 +908,7 @@ ENDPROC(aesbs_ctr_encrypt)
* aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 iv[], int reorder_last_tweak)
*/
+ .align 6
__xts_prepare8:
vld1.8 {q14}, [r7] // load iv
vmov.i32 d30, #0x87 // compose tweak mask vector
diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c
index 5c6cd3c63cbc..f00f042ef357 100644
--- a/arch/arm/crypto/aes-neonbs-glue.c
+++ b/arch/arm/crypto/aes-neonbs-glue.c
@@ -37,7 +37,7 @@ asmlinkage void aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 iv[]);
asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
- int rounds, int blocks, u8 ctr[], u8 final[]);
+ int rounds, int blocks, u8 ctr[]);
asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 iv[], int);
@@ -243,32 +243,25 @@ static int ctr_encrypt(struct skcipher_request *req)
err = skcipher_walk_virt(&walk, req, false);
while (walk.nbytes > 0) {
- unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
- u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL;
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ int bytes = walk.nbytes;
- if (walk.nbytes < walk.total) {
- blocks = round_down(blocks,
- walk.stride / AES_BLOCK_SIZE);
- final = NULL;
- }
+ if (unlikely(bytes < AES_BLOCK_SIZE))
+ src = dst = memcpy(buf + sizeof(buf) - bytes,
+ src, bytes);
+ else if (walk.nbytes < walk.total)
+ bytes &= ~(8 * AES_BLOCK_SIZE - 1);
kernel_neon_begin();
- aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
- ctx->rk, ctx->rounds, blocks, walk.iv, final);
+ aesbs_ctr_encrypt(dst, src, ctx->rk, ctx->rounds, bytes, walk.iv);
kernel_neon_end();
- if (final) {
- u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
- u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
+ if (unlikely(bytes < AES_BLOCK_SIZE))
+ memcpy(walk.dst.virt.addr,
+ buf + sizeof(buf) - bytes, bytes);
- crypto_xor_cpy(dst, src, final,
- walk.total % AES_BLOCK_SIZE);
-
- err = skcipher_walk_done(&walk, 0);
- break;
- }
- err = skcipher_walk_done(&walk,
- walk.nbytes - blocks * AES_BLOCK_SIZE);
+ err = skcipher_walk_done(&walk, walk.nbytes - bytes);
}
return err;
diff --git a/arch/arm/include/asm/xor.h b/arch/arm/include/asm/xor.h
index aefddec79286..669cad5194d3 100644
--- a/arch/arm/include/asm/xor.h
+++ b/arch/arm/include/asm/xor.h
@@ -44,7 +44,8 @@
: "0" (dst), "r" (a1), "r" (a2), "r" (a3), "r" (a4))
static void
-xor_arm4regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+xor_arm4regs_2(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2)
{
unsigned int lines = bytes / sizeof(unsigned long) / 4;
register unsigned int a1 __asm__("r4");
@@ -64,8 +65,9 @@ xor_arm4regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
-xor_arm4regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3)
+xor_arm4regs_3(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3)
{
unsigned int lines = bytes / sizeof(unsigned long) / 4;
register unsigned int a1 __asm__("r4");
@@ -86,8 +88,10 @@ xor_arm4regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
-xor_arm4regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4)
+xor_arm4regs_4(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4)
{
unsigned int lines = bytes / sizeof(unsigned long) / 2;
register unsigned int a1 __asm__("r8");
@@ -105,8 +109,11 @@ xor_arm4regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
-xor_arm4regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4, unsigned long *p5)
+xor_arm4regs_5(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4,
+ const unsigned long * __restrict p5)
{
unsigned int lines = bytes / sizeof(unsigned long) / 2;
register unsigned int a1 __asm__("r8");
@@ -146,7 +153,8 @@ static struct xor_block_template xor_block_arm4regs = {
extern struct xor_block_template const xor_block_neon_inner;
static void
-xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2)
{
if (in_interrupt()) {
xor_arm4regs_2(bytes, p1, p2);
@@ -158,8 +166,9 @@ xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
-xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3)
+xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3)
{
if (in_interrupt()) {
xor_arm4regs_3(bytes, p1, p2, p3);
@@ -171,8 +180,10 @@ xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
-xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4)
+xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4)
{
if (in_interrupt()) {
xor_arm4regs_4(bytes, p1, p2, p3, p4);
@@ -184,8 +195,11 @@ xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
-xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4, unsigned long *p5)
+xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4,
+ const unsigned long * __restrict p5)
{
if (in_interrupt()) {
xor_arm4regs_5(bytes, p1, p2, p3, p4, p5);
diff --git a/arch/arm/lib/xor-neon.c b/arch/arm/lib/xor-neon.c
index b99dd8e1c93f..522510baed49 100644
--- a/arch/arm/lib/xor-neon.c
+++ b/arch/arm/lib/xor-neon.c
@@ -17,17 +17,11 @@ MODULE_LICENSE("GPL");
/*
* Pull in the reference implementations while instructing GCC (through
* -ftree-vectorize) to attempt to exploit implicit parallelism and emit
- * NEON instructions.
+ * NEON instructions. Clang does this by default at O2 so no pragma is
+ * needed.
*/
-#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#ifdef CONFIG_CC_IS_GCC
#pragma GCC optimize "tree-vectorize"
-#else
-/*
- * While older versions of GCC do not generate incorrect code, they fail to
- * recognize the parallel nature of these functions, and emit plain ARM code,
- * which is known to be slower than the optimized ARM code in asm-arm/xor.h.
- */
-#warning This code requires at least version 4.6 of GCC
#endif
#pragma GCC diagnostic ignored "-Wunused-variable"
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index addfa413650b..2a965aa0188d 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -45,7 +45,7 @@ config CRYPTO_SM3_ARM64_CE
tristate "SM3 digest algorithm (ARMv8.2 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
- select CRYPTO_SM3
+ select CRYPTO_LIB_SM3
config CRYPTO_SM4_ARM64_CE
tristate "SM4 symmetric cipher (ARMv8.2 Crypto Extensions)"
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
index 30b7cc6a7079..561dd2332571 100644
--- a/arch/arm64/crypto/aes-glue.c
+++ b/arch/arm64/crypto/aes-glue.c
@@ -24,7 +24,6 @@
#ifdef USE_V8_CRYPTO_EXTENSIONS
#define MODE "ce"
#define PRIO 300
-#define STRIDE 5
#define aes_expandkey ce_aes_expandkey
#define aes_ecb_encrypt ce_aes_ecb_encrypt
#define aes_ecb_decrypt ce_aes_ecb_decrypt
@@ -42,7 +41,6 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
#else
#define MODE "neon"
#define PRIO 200
-#define STRIDE 4
#define aes_ecb_encrypt neon_aes_ecb_encrypt
#define aes_ecb_decrypt neon_aes_ecb_decrypt
#define aes_cbc_encrypt neon_aes_cbc_encrypt
@@ -89,7 +87,7 @@ asmlinkage void aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int bytes, u8 const iv[]);
asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
- int rounds, int bytes, u8 ctr[], u8 finalbuf[]);
+ int rounds, int bytes, u8 ctr[]);
asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[],
int rounds, int bytes, u32 const rk2[], u8 iv[],
@@ -458,26 +456,21 @@ static int __maybe_unused ctr_encrypt(struct skcipher_request *req)
unsigned int nbytes = walk.nbytes;
u8 *dst = walk.dst.virt.addr;
u8 buf[AES_BLOCK_SIZE];
- unsigned int tail;
if (unlikely(nbytes < AES_BLOCK_SIZE))
- src = memcpy(buf, src, nbytes);
+ src = dst = memcpy(buf + sizeof(buf) - nbytes,
+ src, nbytes);
else if (nbytes < walk.total)
nbytes &= ~(AES_BLOCK_SIZE - 1);
kernel_neon_begin();
aes_ctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes,
- walk.iv, buf);
+ walk.iv);
kernel_neon_end();
- tail = nbytes % (STRIDE * AES_BLOCK_SIZE);
- if (tail > 0 && tail < AES_BLOCK_SIZE)
- /*
- * The final partial block could not be returned using
- * an overlapping store, so it was passed via buf[]
- * instead.
- */
- memcpy(dst + nbytes - tail, buf, tail);
+ if (unlikely(nbytes < AES_BLOCK_SIZE))
+ memcpy(walk.dst.virt.addr,
+ buf + sizeof(buf) - nbytes, nbytes);
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
@@ -983,6 +976,7 @@ module_cpu_feature_match(AES, aes_init);
module_init(aes_init);
EXPORT_SYMBOL(neon_aes_ecb_encrypt);
EXPORT_SYMBOL(neon_aes_cbc_encrypt);
+EXPORT_SYMBOL(neon_aes_ctr_encrypt);
EXPORT_SYMBOL(neon_aes_xts_encrypt);
EXPORT_SYMBOL(neon_aes_xts_decrypt);
#endif
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index ff01f0167ba2..dc35eb0245c5 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -321,7 +321,7 @@ AES_FUNC_END(aes_cbc_cts_decrypt)
/*
* aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
- * int bytes, u8 ctr[], u8 finalbuf[])
+ * int bytes, u8 ctr[])
*/
AES_FUNC_START(aes_ctr_encrypt)
@@ -414,8 +414,8 @@ ST5( st1 {v4.16b}, [x0], #16 )
.Lctrtail:
/* XOR up to MAX_STRIDE * 16 - 1 bytes of in/output with v0 ... v3/v4 */
mov x16, #16
- ands x13, x4, #0xf
- csel x13, x13, x16, ne
+ ands x6, x4, #0xf
+ csel x13, x6, x16, ne
ST5( cmp w4, #64 - (MAX_STRIDE << 4) )
ST5( csel x14, x16, xzr, gt )
@@ -424,10 +424,10 @@ ST5( csel x14, x16, xzr, gt )
cmp w4, #32 - (MAX_STRIDE << 4)
csel x16, x16, xzr, gt
cmp w4, #16 - (MAX_STRIDE << 4)
- ble .Lctrtail1x
adr_l x12, .Lcts_permute_table
add x12, x12, x13
+ ble .Lctrtail1x
ST5( ld1 {v5.16b}, [x1], x14 )
ld1 {v6.16b}, [x1], x15
@@ -462,11 +462,19 @@ ST5( st1 {v5.16b}, [x0], x14 )
b .Lctrout
.Lctrtail1x:
- csel x0, x0, x6, eq // use finalbuf if less than a full block
+ sub x7, x6, #16
+ csel x6, x6, x7, eq
+ add x1, x1, x6
+ add x0, x0, x6
ld1 {v5.16b}, [x1]
+ ld1 {v6.16b}, [x0]
ST5( mov v3.16b, v4.16b )
encrypt_block v3, w3, x2, x8, w7
+ ld1 {v10.16b-v11.16b}, [x12]
+ tbl v3.16b, {v3.16b}, v10.16b
+ sshr v11.16b, v11.16b, #7
eor v5.16b, v5.16b, v3.16b
+ bif v5.16b, v6.16b, v11.16b
st1 {v5.16b}, [x0]
b .Lctrout
AES_FUNC_END(aes_ctr_encrypt)
diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S
index a3405b8c344b..d427f4556b6e 100644
--- a/arch/arm64/crypto/aes-neonbs-core.S
+++ b/arch/arm64/crypto/aes-neonbs-core.S
@@ -735,119 +735,67 @@ SYM_FUNC_END(aesbs_cbc_decrypt)
* int blocks, u8 iv[])
*/
SYM_FUNC_START_LOCAL(__xts_crypt8)
- mov x6, #1
- lsl x6, x6, x23
- subs w23, w23, #8
- csel x23, x23, xzr, pl
- csel x6, x6, xzr, mi
+ movi v18.2s, #0x1
+ movi v19.2s, #0x87
+ uzp1 v18.4s, v18.4s, v19.4s
+
+ ld1 {v0.16b-v3.16b}, [x1], #64
+ ld1 {v4.16b-v7.16b}, [x1], #64
+
+ next_tweak v26, v25, v18, v19
+ next_tweak v27, v26, v18, v19
+ next_tweak v28, v27, v18, v19
+ next_tweak v29, v28, v18, v19
+ next_tweak v30, v29, v18, v19
+ next_tweak v31, v30, v18, v19
+ next_tweak v16, v31, v18, v19
+ next_tweak v17, v16, v18, v19
- ld1 {v0.16b}, [x20], #16
- next_tweak v26, v25, v30, v31
eor v0.16b, v0.16b, v25.16b
- tbnz x6, #1, 0f
-
- ld1 {v1.16b}, [x20], #16
- next_tweak v27, v26, v30, v31
eor v1.16b, v1.16b, v26.16b
- tbnz x6, #2, 0f
-
- ld1 {v2.16b}, [x20], #16
- next_tweak v28, v27, v30, v31
eor v2.16b, v2.16b, v27.16b
- tbnz x6, #3, 0f
-
- ld1 {v3.16b}, [x20], #16
- next_tweak v29, v28, v30, v31
eor v3.16b, v3.16b, v28.16b
- tbnz x6, #4, 0f
-
- ld1 {v4.16b}, [x20], #16
- str q29, [sp, #.Lframe_local_offset]
eor v4.16b, v4.16b, v29.16b
- next_tweak v29, v29, v30, v31
- tbnz x6, #5, 0f
-
- ld1 {v5.16b}, [x20], #16
- str q29, [sp, #.Lframe_local_offset + 16]
- eor v5.16b, v5.16b, v29.16b
- next_tweak v29, v29, v30, v31
- tbnz x6, #6, 0f
-
- ld1 {v6.16b}, [x20], #16
- str q29, [sp, #.Lframe_local_offset + 32]
- eor v6.16b, v6.16b, v29.16b
- next_tweak v29, v29, v30, v31
- tbnz x6, #7, 0f
+ eor v5.16b, v5.16b, v30.16b
+ eor v6.16b, v6.16b, v31.16b
+ eor v7.16b, v7.16b, v16.16b
- ld1 {v7.16b}, [x20], #16
- str q29, [sp, #.Lframe_local_offset + 48]
- eor v7.16b, v7.16b, v29.16b
- next_tweak v29, v29, v30, v31
+ stp q16, q17, [sp, #16]
-0: mov bskey, x21
- mov rounds, x22
+ mov bskey, x2
+ mov rounds, x3
br x16
SYM_FUNC_END(__xts_crypt8)
.macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
- frame_push 6, 64
-
- mov x19, x0
- mov x20, x1
- mov x21, x2
- mov x22, x3
- mov x23, x4
- mov x24, x5
+ stp x29, x30, [sp, #-48]!
+ mov x29, sp
- movi v30.2s, #0x1
- movi v25.2s, #0x87
- uzp1 v30.4s, v30.4s, v25.4s
- ld1 {v25.16b}, [x24]
+ ld1 {v25.16b}, [x5]
-99: adr x16, \do8
+0: adr x16, \do8
bl __xts_crypt8
- ldp q16, q17, [sp, #.Lframe_local_offset]
- ldp q18, q19, [sp, #.Lframe_local_offset + 32]
+ eor v16.16b, \o0\().16b, v25.16b
+ eor v17.16b, \o1\().16b, v26.16b
+ eor v18.16b, \o2\().16b, v27.16b
+ eor v19.16b, \o3\().16b, v28.16b
- eor \o0\().16b, \o0\().16b, v25.16b
- eor \o1\().16b, \o1\().16b, v26.16b
- eor \o2\().16b, \o2\().16b, v27.16b
- eor \o3\().16b, \o3\().16b, v28.16b
+ ldp q24, q25, [sp, #16]
- st1 {\o0\().16b}, [x19], #16
- mov v25.16b, v26.16b
- tbnz x6, #1, 1f
- st1 {\o1\().16b}, [x19], #16
- mov v25.16b, v27.16b
- tbnz x6, #2, 1f
- st1 {\o2\().16b}, [x19], #16
- mov v25.16b, v28.16b
- tbnz x6, #3, 1f
- st1 {\o3\().16b}, [x19], #16
- mov v25.16b, v29.16b
- tbnz x6, #4, 1f
+ eor v20.16b, \o4\().16b, v29.16b
+ eor v21.16b, \o5\().16b, v30.16b
+ eor v22.16b, \o6\().16b, v31.16b
+ eor v23.16b, \o7\().16b, v24.16b
- eor \o4\().16b, \o4\().16b, v16.16b
- eor \o5\().16b, \o5\().16b, v17.16b
- eor \o6\().16b, \o6\().16b, v18.16b
- eor \o7\().16b, \o7\().16b, v19.16b
+ st1 {v16.16b-v19.16b}, [x0], #64
+ st1 {v20.16b-v23.16b}, [x0], #64
- st1 {\o4\().16b}, [x19], #16
- tbnz x6, #5, 1f
- st1 {\o5\().16b}, [x19], #16
- tbnz x6, #6, 1f
- st1 {\o6\().16b}, [x19], #16
- tbnz x6, #7, 1f
- st1 {\o7\().16b}, [x19], #16
+ subs x4, x4, #8
+ b.gt 0b
- cbz x23, 1f
- st1 {v25.16b}, [x24]
-
- b 99b
-
-1: st1 {v25.16b}, [x24]
- frame_pop
+ st1 {v25.16b}, [x5]
+ ldp x29, x30, [sp], #48
ret
.endm
@@ -869,133 +817,51 @@ SYM_FUNC_END(aesbs_xts_decrypt)
/*
* aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
- * int rounds, int blocks, u8 iv[], u8 final[])
+ * int rounds, int blocks, u8 iv[])
*/
SYM_FUNC_START(aesbs_ctr_encrypt)
- frame_push 8
-
- mov x19, x0
- mov x20, x1
- mov x21, x2
- mov x22, x3
- mov x23, x4