diff options
| author | Danny Tsen <dtsen@linux.ibm.com> | 2024-09-23 09:30:38 -0400 |
|---|---|---|
| committer | Herbert Xu <herbert@gondor.apana.org.au> | 2024-10-05 13:22:05 +0800 |
| commit | 7aa747edcb266490f93651dd749c69b7eb8541d9 (patch) | |
| tree | fa2b64d265de53b445579a1938409c7c77ef9255 /arch/powerpc | |
| parent | fb10c7a84661471cdcc8998d63703211b873c126 (diff) | |
| download | linux-7aa747edcb266490f93651dd749c69b7eb8541d9.tar.gz linux-7aa747edcb266490f93651dd749c69b7eb8541d9.tar.bz2 linux-7aa747edcb266490f93651dd749c69b7eb8541d9.zip | |
crypto: powerpc/p10-aes-gcm - Re-write AES/GCM stitched implementation
Rewrite the AES/GCM assembly code for a smaller footprint and a
small performance gain. Partial blocks are now handled differently:
a partial block is processed with the AES state and re-assembled into
a complete block, and then a full-block hash is computed.
Add gcm_update() to update the hash value of the last partial block and
generate the final digest.
Fixes: fd0e9b3e2ee6 ("crypto: p10-aes-gcm - An accelerated AES/GCM stitched implementation")
Signed-off-by: Danny Tsen <dtsen@linux.ibm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/powerpc')
| -rw-r--r-- | arch/powerpc/crypto/aes-gcm-p10.S | 2421 |
1 files changed, 1068 insertions, 1353 deletions
diff --git a/arch/powerpc/crypto/aes-gcm-p10.S b/arch/powerpc/crypto/aes-gcm-p10.S index a51f4b265308..89f50eef3512 100644 --- a/arch/powerpc/crypto/aes-gcm-p10.S +++ b/arch/powerpc/crypto/aes-gcm-p10.S @@ -1,42 +1,42 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ - # - # Accelerated AES-GCM stitched implementation for ppc64le. - # - # Copyright 2022- IBM Inc. All rights reserved - # - #=================================================================================== - # Written by Danny Tsen <dtsen@linux.ibm.com> - # - # GHASH is based on the Karatsuba multiplication method. - # - # Xi xor X1 - # - # X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H = - # (X1.h * H4.h + xX.l * H4.l + X1 * H4) + - # (X2.h * H3.h + X2.l * H3.l + X2 * H3) + - # (X3.h * H2.h + X3.l * H2.l + X3 * H2) + - # (X4.h * H.h + X4.l * H.l + X4 * H) - # - # Xi = v0 - # H Poly = v2 - # Hash keys = v3 - v14 - # ( H.l, H, H.h) - # ( H^2.l, H^2, H^2.h) - # ( H^3.l, H^3, H^3.h) - # ( H^4.l, H^4, H^4.h) - # - # v30 is IV - # v31 - counter 1 - # - # AES used, - # vs0 - vs14 for round keys - # v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted) - # - # This implementation uses stitched AES-GCM approach to improve overall performance. - # AES is implemented with 8x blocks and GHASH is using 2 4x blocks. - # - # =================================================================================== - # +# +# Accelerated AES-GCM stitched implementation for ppc64le. +# +# Copyright 2024- IBM Inc. +# +#=================================================================================== +# Written by Danny Tsen <dtsen@us.ibm.com> +# +# GHASH is based on the Karatsuba multiplication method. 
+# +# Xi xor X1 +# +# X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H = +# (X1.h * H4.h + xX.l * H4.l + X1 * H4) + +# (X2.h * H3.h + X2.l * H3.l + X2 * H3) + +# (X3.h * H2.h + X3.l * H2.l + X3 * H2) + +# (X4.h * H.h + X4.l * H.l + X4 * H) +# +# Xi = v0 +# H Poly = v2 +# Hash keys = v3 - v14 +# ( H.l, H, H.h) +# ( H^2.l, H^2, H^2.h) +# ( H^3.l, H^3, H^3.h) +# ( H^4.l, H^4, H^4.h) +# +# v30 is IV +# v31 - counter 1 +# +# AES used, +# vs0 - round key 0 +# v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted) +# +# This implementation uses stitched AES-GCM approach to improve overall performance. +# AES is implemented with 8x blocks and GHASH is using 2 4x blocks. +# +# =================================================================================== +# #include <asm/ppc_asm.h> #include <linux/linkage.h> @@ -44,483 +44,224 @@ .machine "any" .text - # 4x loops - # v15 - v18 - input states - # vs1 - vs9 - round keys - # -.macro Loop_aes_middle4x - xxlor 19+32, 1, 1 - xxlor 20+32, 2, 2 - xxlor 21+32, 3, 3 - xxlor 22+32, 4, 4 - - vcipher 15, 15, 19 - vcipher 16, 16, 19 - vcipher 17, 17, 19 - vcipher 18, 18, 19 - - vcipher 15, 15, 20 - vcipher 16, 16, 20 - vcipher 17, 17, 20 - vcipher 18, 18, 20 - - vcipher 15, 15, 21 - vcipher 16, 16, 21 - vcipher 17, 17, 21 - vcipher 18, 18, 21 - - vcipher 15, 15, 22 - vcipher 16, 16, 22 - vcipher 17, 17, 22 - vcipher 18, 18, 22 - - xxlor 19+32, 5, 5 - xxlor 20+32, 6, 6 - xxlor 21+32, 7, 7 - xxlor 22+32, 8, 8 - - vcipher 15, 15, 19 - vcipher 16, 16, 19 - vcipher 17, 17, 19 - vcipher 18, 18, 19 - - vcipher 15, 15, 20 - vcipher 16, 16, 20 - vcipher 17, 17, 20 - vcipher 18, 18, 20 - - vcipher 15, 15, 21 - vcipher 16, 16, 21 - vcipher 17, 17, 21 - vcipher 18, 18, 21 - - vcipher 15, 15, 22 - vcipher 16, 16, 22 - vcipher 17, 17, 22 - vcipher 18, 18, 22 - - xxlor 23+32, 9, 9 - vcipher 15, 15, 23 - vcipher 16, 16, 23 - vcipher 17, 17, 23 - vcipher 18, 18, 23 +.macro SAVE_GPR GPR OFFSET FRAME + std \GPR,\OFFSET(\FRAME) .endm - # 8x loops - # 
v15 - v22 - input states - # vs1 - vs9 - round keys - # -.macro Loop_aes_middle8x - xxlor 23+32, 1, 1 - xxlor 24+32, 2, 2 - xxlor 25+32, 3, 3 - xxlor 26+32, 4, 4 - - vcipher 15, 15, 23 - vcipher 16, 16, 23 - vcipher 17, 17, 23 - vcipher 18, 18, 23 - vcipher 19, 19, 23 - vcipher 20, 20, 23 - vcipher 21, 21, 23 - vcipher 22, 22, 23 - - vcipher 15, 15, 24 - vcipher 16, 16, 24 - vcipher 17, 17, 24 - vcipher 18, 18, 24 - vcipher 19, 19, 24 - vcipher 20, 20, 24 - vcipher 21, 21, 24 - vcipher 22, 22, 24 - - vcipher 15, 15, 25 - vcipher 16, 16, 25 - vcipher 17, 17, 25 - vcipher 18, 18, 25 - vcipher 19, 19, 25 - vcipher 20, 20, 25 - vcipher 21, 21, 25 - vcipher 22, 22, 25 - - vcipher 15, 15, 26 - vcipher 16, 16, 26 - vcipher 17, 17, 26 - vcipher 18, 18, 26 - vcipher 19, 19, 26 - vcipher 20, 20, 26 - vcipher 21, 21, 26 - vcipher 22, 22, 26 - - xxlor 23+32, 5, 5 - xxlor 24+32, 6, 6 - xxlor 25+32, 7, 7 - xxlor 26+32, 8, 8 - - vcipher 15, 15, 23 - vcipher 16, 16, 23 - vcipher 17, 17, 23 - vcipher 18, 18, 23 - vcipher 19, 19, 23 - vcipher 20, 20, 23 - vcipher 21, 21, 23 - vcipher 22, 22, 23 - - vcipher 15, 15, 24 - vcipher 16, 16, 24 - vcipher 17, 17, 24 - vcipher 18, 18, 24 - vcipher 19, 19, 24 - vcipher 20, 20, 24 - vcipher 21, 21, 24 - vcipher 22, 22, 24 - - vcipher 15, 15, 25 - vcipher 16, 16, 25 - vcipher 17, 17, 25 - vcipher 18, 18, 25 - vcipher 19, 19, 25 - vcipher 20, 20, 25 - vcipher 21, 21, 25 - vcipher 22, 22, 25 - - vcipher 15, 15, 26 - vcipher 16, 16, 26 - vcipher 17, 17, 26 - vcipher 18, 18, 26 - vcipher 19, 19, 26 - vcipher 20, 20, 26 - vcipher 21, 21, 26 - vcipher 22, 22, 26 - - xxlor 23+32, 9, 9 - vcipher 15, 15, 23 - vcipher 16, 16, 23 - vcipher 17, 17, 23 - vcipher 18, 18, 23 - vcipher 19, 19, 23 - vcipher 20, 20, 23 - vcipher 21, 21, 23 - vcipher 22, 22, 23 +.macro SAVE_VRS VRS OFFSET FRAME + stxv \VRS+32, \OFFSET(\FRAME) .endm -.macro Loop_aes_middle_1x - xxlor 19+32, 1, 1 - xxlor 20+32, 2, 2 - xxlor 21+32, 3, 3 - xxlor 22+32, 4, 4 - - vcipher 15, 15, 19 - 
vcipher 15, 15, 20 - vcipher 15, 15, 21 - vcipher 15, 15, 22 - - xxlor 19+32, 5, 5 - xxlor 20+32, 6, 6 - xxlor 21+32, 7, 7 - xxlor 22+32, 8, 8 - - vcipher 15, 15, 19 - vcipher 15, 15, 20 - vcipher 15, 15, 21 - vcipher 15, 15, 22 - - xxlor 19+32, 9, 9 - vcipher 15, 15, 19 +.macro RESTORE_GPR GPR OFFSET FRAME + ld \GPR,\OFFSET(\FRAME) .endm - # - # Compute 4x hash values based on Karatsuba method. - # -.macro ppc_aes_gcm_ghash - vxor 15, 15, 0 - - vpmsumd 23, 12, 15 # H4.L * X.L - vpmsumd 24, 9, 16 - vpmsumd 25, 6, 17 - vpmsumd 26, 3, 18 - - vxor 23, 23, 24 - vxor 23, 23, 25 - vxor 23, 23, 26 # L - - vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L - vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L - vpmsumd 26, 7, 17 - vpmsumd 27, 4, 18 - - vxor 24, 24, 25 - vxor 24, 24, 26 - vxor 24, 24, 27 # M - - # sum hash and reduction with H Poly - vpmsumd 28, 23, 2 # reduction - - vxor 29, 29, 29 - vsldoi 26, 24, 29, 8 # mL - vsldoi 29, 29, 24, 8 # mH - vxor 23, 23, 26 # mL + L - - vsldoi 23, 23, 23, 8 # swap - vxor 23, 23, 28 - - vpmsumd 24, 14, 15 # H4.H * X.H - vpmsumd 25, 11, 16 - vpmsumd 26, 8, 17 - vpmsumd 27, 5, 18 - - vxor 24, 24, 25 - vxor 24, 24, 26 - vxor 24, 24, 27 - - vxor 24, 24, 29 - - # sum hash and reduction with H Poly - vsldoi 27, 23, 23, 8 # swap - vpmsumd 23, 23, 2 - vxor 27, 27, 24 - vxor 23, 23, 27 - - xxlor 32, 23+32, 23+32 # update hash - +.macro RESTORE_VRS VRS OFFSET FRAME + lxv \VRS+32, \OFFSET(\FRAME) .endm - # - # Combine two 4x ghash - # v15 - v22 - input blocks - # -.macro ppc_aes_gcm_ghash2_4x - # first 4x hash - vxor 15, 15, 0 # Xi + X - - vpmsumd 23, 12, 15 # H4.L * X.L - vpmsumd 24, 9, 16 - vpmsumd 25, 6, 17 - vpmsumd 26, 3, 18 - - vxor 23, 23, 24 - vxor 23, 23, 25 - vxor 23, 23, 26 # L - - vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L - vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L - vpmsumd 26, 7, 17 - vpmsumd 27, 4, 18 - - vxor 24, 24, 25 - vxor 24, 24, 26 - - # sum hash and reduction with H Poly - vpmsumd 28, 23, 2 # reduction - - vxor 29, 
29, 29 - - vxor 24, 24, 27 # M - vsldoi 26, 24, 29, 8 # mL - vsldoi 29, 29, 24, 8 # mH - vxor 23, 23, 26 # mL + L - - vsldoi 23, 23, 23, 8 # swap - vxor 23, 23, 28 +.macro SAVE_REGS + mflr 0 + std 0, 16(1) + stdu 1,-512(1) + + SAVE_GPR 14, 112, 1 + SAVE_GPR 15, 120, 1 + SAVE_GPR 16, 128, 1 + SAVE_GPR 17, 136, 1 + SAVE_GPR 18, 144, 1 + SAVE_GPR 19, 152, 1 + SAVE_GPR 20, 160, 1 + SAVE_GPR 21, 168, 1 + SAVE_GPR 22, 176, 1 + SAVE_GPR 23, 184, 1 + SAVE_GPR 24, 192, 1 + + addi 9, 1, 256 + SAVE_VRS 20, 0, 9 + SAVE_VRS 21, 16, 9 + SAVE_VRS 22, 32, 9 + SAVE_VRS 23, 48, 9 + SAVE_VRS 24, 64, 9 + SAVE_VRS 25, 80, 9 + SAVE_VRS 26, 96, 9 + SAVE_VRS 27, 112, 9 + SAVE_VRS 28, 128, 9 + SAVE_VRS 29, 144, 9 + SAVE_VRS 30, 160, 9 + SAVE_VRS 31, 176, 9 +.endm # SAVE_REGS - vpmsumd 24, 14, 15 # H4.H * X.H - vpmsumd 25, 11, 16 - vpmsumd 26, 8, 17 - vpmsumd 27, 5, 18 +.macro RESTORE_REGS + addi 9, 1, 256 + RESTORE_VRS 20, 0, 9 + RESTORE_VRS 21, 16, 9 + RESTORE_VRS 22, 32, 9 + RESTORE_VRS 23, 48, 9 + RESTORE_VRS 24, 64, 9 + RESTORE_VRS 25, 80, 9 + RESTORE_VRS 26, 96, 9 + RESTORE_VRS 27, 112, 9 + RESTORE_VRS 28, 128, 9 + RESTORE_VRS 29, 144, 9 + RESTORE_VRS 30, 160, 9 + RESTORE_VRS 31, 176, 9 + + RESTORE_GPR 14, 112, 1 + RESTORE_GPR 15, 120, 1 + RESTORE_GPR 16, 128, 1 + RESTORE_GPR 17, 136, 1 + RESTORE_GPR 18, 144, 1 + RESTORE_GPR 19, 152, 1 + RESTORE_GPR 20, 160, 1 + RESTORE_GPR 21, 168, 1 + RESTORE_GPR 22, 176, 1 + RESTORE_GPR 23, 184, 1 + RESTORE_GPR 24, 192, 1 + + addi 1, 1, 512 + ld 0, 16(1) + mtlr 0 +.endm # RESTORE_REGS + +# 4x loops +.macro AES_CIPHER_4x _VCIPHER ST r + \_VCIPHER \ST, \ST, \r + \_VCIPHER \ST+1, \ST+1, \r + \_VCIPHER \ST+2, \ST+2, \r + \_VCIPHER \ST+3, \ST+3, \r +.endm - vxor 24, 24, 25 - vxor 24, 24, 26 - vxor 24, 24, 27 # H +# 8x loops +.macro AES_CIPHER_8x _VCIPHER ST r + \_VCIPHER \ST, \ST, \r + \_VCIPHER \ST+1, \ST+1, \r + \_VCIPHER \ST+2, \ST+2, \r + \_VCIPHER \ST+3, \ST+3, \r + \_VCIPHER \ST+4, \ST+4, \r + \_VCIPHER \ST+5, \ST+5, \r + \_VCIPHER \ST+6, \ST+6, 
\r + \_VCIPHER \ST+7, \ST+7, \r +.endm - vxor 24, 24, 29 # H + mH +.macro LOOP_8AES_STATE + xxlor 32+23, 1, 1 + xxlor 32+24, 2, 2 + xxlor 32+25, 3, 3 + xxlor 32+26, 4, 4 + AES_CIPHER_8x vcipher, 15, 23 + AES_CIPHER_8x vcipher, 15, 24 + AES_CIPHER_8x vcipher, 15, 25 + AES_CIPHER_8x vcipher, 15, 26 + xxlor 32+23, 5, 5 + xxlor 32+24, 6, 6 + xxlor 32+25, 7, 7 + xxlor 32+26, 8, 8 + AES_CIPHER_8x vcipher, 15, 23 + AES_CIPHER_8x vcipher, 15, 24 + AES_CIPHER_8x vcipher, 15, 25 + AES_CIPHER_8x vcipher, 15, 26 +.endm - # sum hash and reduction with H Poly - vsldoi 27, 23, 23, 8 # swap - vpmsumd 23, 23, 2 - vxor 27, 27, 24 - vxor 27, 23, 27 # 1st Xi - - # 2nd 4x hash - vpmsumd 24, 9, 20 - vpmsumd 25, 6, 21 - vpmsumd 26, 3, 22 - vxor 19, 19, 27 # Xi + X - vpmsumd 23, 12, 19 # H4.L * X.L - - vxor 23, 23, 24 - vxor 23, 23, 25 - vxor 23, 23, 26 # L - - vpmsumd 24, 13, 19 # H4.L * X.H + H4.H * X.L - vpmsumd 25, 10, 20 # H3.L * X1.H + H3.H * X1.L - vpmsumd 26, 7, 21 - vpmsumd 27, 4, 22 - - vxor 24, 24, 25 - vxor 24, 24, 26 +# +# PPC_GHASH4x(H, S1, S2, S3, S4): Compute 4x hash values based on Karatsuba method. 
+# H: returning digest +# S#: states +# +# S1 should xor with the previous digest +# +# Xi = v0 +# H Poly = v2 +# Hash keys = v3 - v14 +# Scratch: v23 - v29 +# +.macro PPC_GHASH4x H S1 S2 S3 S4 + + vpmsumd 23, 12, \S1 # H4.L * X.L + vpmsumd 24, 9, \S2 + vpmsumd 25, 6, \S3 + vpmsumd 26, 3, \S4 + + vpmsumd 27, 13, \S1 # H4.L * X.H + H4.H * X.L + vpmsumd 28, 10, \S2 # H3.L * X1.H + H3.H * X1.L + + vxor 23, 23, 24 + vxor 23, 23, 25 + vxor 23, 23, 26 # L + + vxor 24, 27, 28 + vpmsumd 25, 7, \S3 + vpmsumd 26, 4, \S4 + + vxor 24, 24, 25 + vxor 24, 24, 26 # M # sum hash and reduction with H Poly - vpmsumd 28, 23, 2 # reduction - - vxor 29, 29, 29 + vpmsumd 28, 23, 2 # reduction - vxor 24, 24, 27 # M - vsldoi 26, 24, 29, 8 # mL - vsldoi 29, 29, 24, 8 # mH - vxor 23, 23, 26 # mL + L + vxor 1, 1, 1 + vsldoi 25, 24, 1, 8 # mL + vsldoi 1, 1, 24, 8 # mH + vxor 23, 23, 25 # mL + L - vsldoi 23, 23, 23, 8 # swap - vxor 23, 23, 28 + # This performs swap and xor like, + # vsldoi 23, 23, 23, 8 # swap + # vxor 23, 23, 28 + xxlor 32+25, 10, 10 + vpermxor 23, 23, 28, 25 - vpmsumd 24, 14, 19 # H4.H * X.H - vpmsumd 25, 11, 20 - vpmsumd 26, 8, 21 - vpmsumd 27, 5, 22 + vpmsumd 26, 14, \S1 # H4.H * X.H + vpmsumd 27, 11, \S2 + vpmsumd 28, 8, \S3 + vpmsumd 29, 5, \S4 - vxor 24, 24, 25 - vxor 24, 24, 26 - vxor 24, 24, 27 # H + vxor 24, 26, 27 + vxor 24, 24, 28 + vxor 24, 24, 29 - vxor 24, 24, 29 # H + mH + vxor 24, 24, 1 # sum hash and reduction with H Poly - vsldoi 27, 23, 23, 8 # swap - vpmsumd 23, 23, 2 - vxor 27, 27, 24 - vxor 23, 23, 27 - - xxlor 32, 23+32, 23+32 # update hash - + vsldoi 25, 23, 23, 8 # swap + vpmsumd 23, 23, 2 + vxor 27, 25, 24 + vxor \H, 23, 27 .endm - # - # Compute update single hash - # -.macro ppc_update_hash_1x - vxor 28, 28, 0 - - vxor 19, 19, 19 +# +# Compute update single ghash +# scratch: v1, v22..v27 +# +.macro PPC_GHASH1x H S1 - vpmsumd 22, 3, 28 # L - vpmsumd 23, 4, 28 # M - vpmsumd 24, 5, 28 # H + vxor 1, 1, 1 - vpmsumd 27, 22, 2 # reduction + vpmsumd 22, 3, 
\S1 # L + vpmsumd 23, 4, \S1 # M + vpmsumd 24, 5, \S1 # H - vsldoi 25, 23, 19, 8 # mL - vsldoi 26, 19, 23, 8 # mH - vxor 22, 22, 25 # LL + LL - vxor 24, 24, 26 # HH + HH + vpmsumd 27, 22, 2 # reduction - vsldoi 22, 22, 22, 8 # swap - vxor 22, 22, 27 + vsldoi 25, 23, 1, 8 # mL + vsldoi 26, 1, 23, 8 # mH + vxor 22, 22, 25 # LL + LL + vxor 24, 24, 26 # HH + HH - vsldoi 20, 22, 22, 8 # swap - vpmsumd 22, 22, 2 # reduction - vxor 20, 20, 24 - vxor 22, 22, 20 + xxlor 32+25, 10, 10 + vpermxor 22, 22, 27, 25 - vmr 0, 22 # update hash - -.endm - -.macro SAVE_REGS - stdu 1,-640(1) - mflr 0 - - std 14,112(1) - std 15,120(1) - std 16,128(1) - std 17,136(1) - std 18,144(1) - std 19,152(1) - std 20,160(1) - std 21,168(1) - li 9, 256 - stvx 20, 9, 1 - addi 9, 9, 16 - stvx 21, 9, 1 - addi 9, 9, 16 - stvx 22, 9, 1 - addi 9, 9, 16 - stvx 23, 9, 1 - addi 9, 9, 16 - stvx 24, 9, 1 - addi 9, 9, 16 - stvx 25, 9, 1 - addi 9, 9, 16 - stvx 26, 9, 1 - addi 9, 9, 16 - stvx 27, 9, 1 - addi 9, 9, 16 - stvx 28, 9, 1 - addi 9, 9, 16 - stvx 29, 9, 1 - addi 9, 9, 16 - stvx 30, 9, 1 - addi 9, 9, 16 - stvx 31, 9, 1 - stxv 14, 464(1) - stxv 15, 480(1) - stxv 16, 496(1) - stxv 17, 512(1) - stxv 18, 528(1) - stxv 19, 544(1) - stxv 20, 560(1) - stxv 21, 576(1) - stxv 22, 592(1) - std 0, 656(1) -.endm - -.macro RESTORE_REGS - lxv 14, 464(1) - lxv 15, 480(1) - lxv 16, 496(1) - lxv 17, 512(1) - lxv 18, 528(1) - lxv 19, 544(1) - lxv 20, 560(1) - lxv 21, 576(1) - lxv 22, 592(1) - li 9, 256 - lvx 20, 9, 1 - addi 9, 9, 16 - lvx 21, 9, 1 - addi 9, 9, 16 - lvx 22, 9, 1 - addi 9, 9, 16 - lvx 23, 9, 1 - addi 9, 9, 16 - lvx 24, 9, 1 - addi 9, 9, 16 - lvx 25, 9, 1 - addi 9, 9, 16 - lvx 26, 9, 1 - addi 9, 9, 16 - lvx 27, 9, 1 - addi 9, 9, 16 - lvx 28, 9, 1 - addi 9, 9, 16 - lvx 29, 9, 1 - addi 9, 9, 16 - lvx 30, 9, 1 - addi 9, 9, 16 - lvx 31, 9, 1 - - ld 0, 656(1) - ld 14,112(1) - ld 15,120(1) - ld 16,128(1) - ld 17,136(1) - ld 18,144(1) - ld 19,152(1) - ld 20,160(1) - ld 21,168(1) - - mtlr 0 - addi 1, 1, 640 + vsldoi 
23, 22, 22, 8 # swap + vpmsumd 22, 22, 2 # reduction + vxor 23, 23, 24 + vxor \H, 22, 23 .endm +# +# LOAD_HASH_TABLE +# Xi = v0 +# H Poly = v2 +# Hash keys = v3 - v14 +# .macro LOAD_HASH_TABLE # Load Xi lxvb16x 32, 0, 8 # load Xi @@ -557,657 +298,434 @@ lxvd2x 14+32, 10, 8 # H^4h .endm - # - # aes_p10_gcm_encrypt (const void *inp, void *out, size_t len, - # const char *rk, unsigned char iv[16], void *Xip); - # - # r3 - inp - # r4 - out - # r5 - len - # r6 - AES round keys - # r7 - iv and other data - # r8 - Xi, HPoli, hash keys - # - # rounds is at offset 240 in rk - # Xi is at 0 in gcm_table (Xip). - # -_GLOBAL(aes_p10_gcm_encrypt) -.align 5 - - SAVE_REGS - - LOAD_HASH_TABLE - - # initialize ICB: GHASH( IV ), IV - r7 - lxvb16x 30+32, 0, 7 # load IV - v30 - - mr 12, 5 # length - li 11, 0 # block index - - # counter 1 - vxor 31, 31, 31 - vspltisb 22, 1 - vsldoi 31, 31, 22,1 # counter 1 - - # load round key to VSR - lxv 0, 0(6) - lxv 1, 0x10(6) - lxv 2, 0x20(6) - lxv 3, 0x30(6) - lxv 4, 0x40(6) - lxv 5, 0x50(6) - lxv 6, 0x60(6) - lxv 7, 0x70(6) - lxv 8, 0x80(6) - lxv 9, 0x90(6) - lxv 10, 0xa0(6) - - # load rounds - 10 (128), 12 (192), 14 (256) - lwz 9,240(6) - - # - # vxor state, state, w # addroundkey - xxlor 32+29, 0, 0 - vxor 15, 30, 29 # IV + round key - add round key 0 - - cmpdi 9, 10 - beq Loop_aes_gcm_8x - - # load 2 more round keys (v11, v12) - lxv 11, 0xb0(6) - lxv 12, 0xc0(6) - - cmpdi 9, 12 - beq Loop_aes_gcm_8x - - # load 2 more round keys (v11, v12, v13, v14) - lxv 13, 0xd0(6) - lxv 14, 0xe0(6) - cmpdi 9, 14 - beq Loop_aes_gcm_8x - - b aes_gcm_out - -.align 5 -Loop_aes_gcm_8x: - mr 14, 3 - mr 9, 4 - - # - # check partial block - # -Continue_partial_check: - ld 15, 56(7) - cmpdi 15, 0 - beq Continue - bgt Final_block - cmpdi 15, 16 - blt Final_block - -Continue: - # n blcoks - li 10, 128 - divdu 10, 12, 10 # n 128 bytes-blocks - cmpdi 10, 0 - beq Loop_last_block - - vaddudm 30, 30, 31 # IV + counter - vxor 16, 30, 29 - vaddudm 30, 30, 31 - vxor 17, 30, 29 
- vaddudm 30, 30, 31 - vxor 18, 30, 29 - vaddudm 30, 30, 31 - vxor 19, 30, 29 - vaddudm 30, 30, 31 - vxor 20, 30, 29 - vaddudm 30, 30, 31 - vxor 21, 30, 29 - vaddudm 30, 30, 31 - vxor 22, 30, 29 - - mtctr 10 - - li 15, 16 - li 16, 32 - li 17, 48 - li 18, 64 - li 19, 80 - li 20, 96 - li 21, 112 - - lwz 10, 240(6) - -Loop_8x_block: - - lxvb16x 15, 0, 14 # load block - lxvb16x 16, 15, 14 # load block - lxvb16x 17, 16, 14 # load block - lxvb16x 18, 17, 14 # load block - lxvb16x 19, 18, 14 # load block - lxvb16x 20, 19, 14 # load block - lxvb16x 21, 20, 14 # load block - lxvb16x 22, 21, 14 # load block - addi 14, 14, 128 - - Loop_aes_middle8x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_next_ghash - - # 192 bits - xxlor 24+32, 11, 11 - - vcipher 15, 15, 23 - vcipher 16, 16, 23 - vcipher 17, 17, 23 - vcipher 18, 18, 23 - vcipher 19, 19, 23 - vcipher 20, 20, 23 - vcipher 21, 21, 23 - vcipher 22, 22, 23 - - vcipher 15, 15, 24 - vcipher 16, 16, 24 - vcipher 17, 17, 24 - vcipher 18, 18, 24 - vcipher 19, 19, 24 - vcipher 20, 20, 24 - vcipher 21, 21, 24 - vcipher 22, 22, 24 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_next_ghash - - # 256 bits - xxlor 24+32, 13, 13 - - vcipher 15, 15, 23 - vcipher 16, 16, 23 - vcipher 17, 17, 23 - vcipher 18, 18, 23 - vcipher 19, 19, 23 - vcipher 20, 20, 23 - vcipher 21, 21, 23 - vcipher 22, 22, 23 - - vcipher 15, 15, 24 - vcipher 16, 16, 24 - vcipher 17, 17, 24 - vcipher 18, 18, 24 - vcipher 19, 19, 24 - vcipher 20, 20, 24 - vcipher 21, 21, 24 - vcipher 22, 22, 24 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_next_ghash - b aes_gcm_out - -Do_next_ghash: - - # - # last round - vcipherlast 15, 15, 23 - vcipherlast 16, 16, 23 - - xxlxor 47, 47, 15 - stxvb16x 47, 0, 9 # store output - xxlxor 48, 48, 16 - stxvb16x 48, 15, 9 # store output - - vcipherlast 17, 17, 23 - vcipherlast 18, 18, 23 - - xxlxor 49, 49, 17 - stxvb16x 49, 16, 9 # store output - xxlxor 50, 50, 18 - stxvb16x 50, 17, 9 # store output - - vcipherlast 19, 19, 23 - 
vcipherlast 20, 20, 23 - - xxlxor 51, 51, 19 - stxvb16x 51, 18, 9 # store output - xxlxor 52, 52, 20 - stxvb16x 52, 19, 9 # store output - - vcipherlast 21, 21, 23 - vcipherlast 22, 22, 23 - - xxlxor 53, 53, 21 - stxvb16x 53, 20, 9 # store output - xxlxor 54, 54, 22 - stxvb16x 54, 21, 9 # store output - - addi 9, 9, 128 - - # ghash here - ppc_aes_gcm_ghash2_4x - - xxlor 27+32, 0, 0 - vaddudm 30, 30, 31 # IV + counter - vmr 29, 30 - vxor 15, 30, 27 # add round key - vaddudm 30, 30, 31 - vxor 16, 30, 27 - vaddudm 30, 30, 31 - vxor 17, 30, 27 - vaddudm 30, 30, 31 - vxor 18, 30, 27 - vaddudm 30, 30, 31 - vxor 19, 30, 27 - vaddudm 30, 30, 31 - vxor 20, 30, 27 - vaddudm 30, 30, 31 - vxor 21, 30, 27 - vaddudm 30, 30, 31 - vxor 22, 30, 27 - - addi 12, 12, -128 - addi 11, 11, 128 - - bdnz Loop_8x_block - - vmr 30, 29 - stxvb16x 30+32, 0, 7 # update IV - -Loop_last_block: - cmpdi 12, 0 - beq aes_gcm_out - - # loop last few blocks +################################################################################ +# Compute AES and ghash one block at a time. +# r23: AES rounds +# v30: current IV +# vs0: roundkey 0 +# +################################################################################ +SYM_FUNC_START_LOCAL(aes_gcm_crypt_1x) + + cmpdi 5, 16 + bge __More_1x + blr +__More_1x: li 10, 16 - divdu 10, 12, 10 - - mtctr 10 - - lwz 10, 240(6) - - cmpdi 12, 16 - blt Final_block - -Next_rem_block: - lxvb16x 15, 0, 14 # load block - - Loop_aes_middle_1x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_next_1x - - # 192 bits - xxlor 24+32, 11, 11 - - vcipher 15, 15, 23 - vcipher 15, 15, 24 - - xxlor 23+32, 12, 12 + divdu 12, 5, 10 + + xxlxor 32+15, 32+30, 0 + + # Pre-load 8 AES rounds to scratch vectors. 
+ xxlor 32+16, 1, 1 + xxlor 32+17, 2, 2 + xxlor 32+18, 3, 3 + xxlor 32+19, 4, 4 + xxlor 32+20, 5, 5 + xxlor 32+21, 6, 6 + xxlor 32+28, 7, 7 + xxlor 32+29, 8, 8 + lwz 23, 240(6) # n rounds + addi 22, 23, -9 # remaing AES rounds - cmpdi 10, 12 - beq Do_next_1x - - # 256 bits - xxlor 24+32, 13, 13 - - vcipher 15, 15, 23 - vcipher 15, 15, 24 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_next_1x - -Do_next_1x: - vcipherlast 15, 15, 23 - - xxlxor 47, 47, 15 - stxvb16x 47, 0, 9 # store output - addi 14, 14, 16 - addi 9, 9, 16 - - vmr 28, 15 - ppc_update_hash_1x - - addi 12, 12, -16 - addi 11, 11, 16 - xxlor 19+32, 0, 0 - vaddudm 30, 30, 31 # IV + counter - vxor 15, 30, 19 # add round key - - bdnz Next_rem_block - - li 15, 0 - std 15, 56(7) # clear partial? - stxvb16x 30+32, 0, 7 # update IV cmpdi 12, 0 - beq aes_gcm_out - -Final_block: - lwz 10, 240(6) - Loop_aes_middle_1x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_final_1x - - # 192 bits - xxlor 24+32, 11, 11 - - vcipher 15, 15, 23 - vcipher 15, 15, 24 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_final_1x - - # 256 bits - xxlor 24+32, 13, 13 - - vcipher 15, 15, 23 - vcipher 15, 15, 24 + bgt __Loop_1x + blr - xxlor 23+32, 14, 14 +__Loop_1x: + mtctr 22 + addi 10, 6, 144 + vcipher 15, 15, 16 + vcipher 15, 15, 17 + vcipher 15, 15, 18 + vcipher 15, 15, 19 + vcipher 15, 15, 20 + vcipher 15, 15, 21 + vcipher 15, 15, 28 + vcipher 15, 15, 29 - cmpdi 10, 14 - beq Do_final_1x +__Loop_aes_1state: + lxv 32+1, 0(10) + vcipher 15, 15, 1 + addi 10, 10, 16 + bdnz __Loop_aes_1state + lxv 32+1, 0(10) # last round key + lxvb16x 11, 0, 14 # load input block + vcipherlast 15, 15, 1 + + xxlxor 32+15, 32+15, 11 + stxvb16x 32+15, 0, 9 # store output + addi 14, 14, 16 + addi 9, 9, 16 -Do_final_1x: - vcipherlast 15, 15, 23 + cmpdi 24, 0 # decrypt? + bne __Encrypt_1x + xxlor 15+32, 11, 11 +__Encrypt_1x: + vxor 15, 15, 0 + PPC_GHASH1x 0, 15 - # check partial block - li 21, 0 # encrypt - ld 15, 56(7) # partial? 
- cmpdi 15, 0 - beq Normal_block - bl Do_partial_block + addi 5, 5, -16 + addi 11, 11, 16 + vadduwm 30, 30, 31 # IV + counter + xxlxor 32+15, 32+30, 0 + addi 12, 12, -1 cmpdi 12, 0 - ble aes_gcm_out + bgt __Loop_1x - b Continue_partial_check - -Normal_block: - lxvb16x 15, 0, 14 # load last block - xxlxor 47, 47, 15 - - # create partial block mask - li 15, 16 - sub 15, 15, 12 # index to the mask - - vspltisb 16, -1 # first 16 bytes - 0xffff...ff - vspltisb 17, 0 # second 16 bytes - 0x0000...00 - li 10, 192 - stvx 16, 10, 1 + stxvb16x 32+30, 0, 7 # update IV + stxvb16x 32+0, 0, 8 # update Xi + blr +SYM_FUNC_END(aes_gcm_crypt_1x) + +################################################################################ +# Process a normal partial block when we come here. +# Compute partial mask, Load and store partial block to stack. +# Update partial_len and pblock. +# pblock is (encrypted ^ AES state) for encrypt +# and (input ^ AES state) for decrypt. +# +################################################################################ +SYM_FUNC_START_LOCAL(__Process_partial) + + # create partial mask + vspltisb 16, -1 + li 12, 16 + sub 12, 12, 5 + sldi 12, 12, 3 + mtvsrdd 32+17, 0, 12 + vslo 16, 16, 17 # partial block mask + + lxvb16x 11, 0, 14 # load partial block + xxland 11, 11, 32+16 + + # AES crypt partial + xxlxor 32+15, 32+30, 0 + lwz 23, 240(6) # n rounds + addi 22, 23, -1 # loop - 1 + mtctr 22 + addi 10, 6, 16 + +__Loop_aes_pstate: + lxv 32+1, 0(10) + vcipher 15, 15, 1 addi 10, 10, 16 - stvx 17, 10, 1 - - addi 10, 1, 192 - lxvb16x 16, 15, 10 # load partial block mask - xxland 47, 47, 16 - - vmr 28, 15 - ppc_update_hash_1x + bdnz __Loop_aes_pstate + lxv 32+1, 0(10) # last round key + vcipherlast 15, 15, 1 - # * should store only the remaining bytes. - bl Write_partial_block - - stxvb16x 30+32, 0, 7 # update IV - std 12, 56(7) # update partial? 
- li 16, 16 + xxlxor 32+15, 32+15, 11 + vand 15, 15, 16 - stxvb16x 32, 0, 8 # write out Xi - stxvb16x 32, 16, 8 # write out Xi - b aes_gcm_out - - # - # Compute data mask - # -.macro GEN_MASK _mask _start _end - vspltisb 16, -1 # first 16 bytes - 0xffff...ff - vspltisb 17, 0 # second 16 bytes - 0x0000...00 - li 10, 192 - stxvb16x 17+32, 10, 1 - add 10, 10, \_start - stxvb16x 16+32, 10, 1 - add 10, 10, \_end - stxvb16x 17+32, 10, 1 - - addi 10, 1, 192 - lxvb16x \_mask, 0, 10 # load partial block mask -.endm + # AES crypt output v15 + # Write partial + li 10, 224 + stxvb16x 15+32, 10, 1 # write v15 to stack + addi 10, 1, 223 + addi 12, 9, -1 + mtctr 5 # partial block len +__Write_partial: + lbzu 22, 1(10) + stbu 22, 1(12) + bdnz __Write_partial + + cmpdi 24, 0 # decrypt? + bne __Encrypt_partial + xxlor 32+15, 11, 11 # decrypt using the input block +__Encrypt_partial: + #vxor 15, 15, 0 # ^ previous hash + #PPC_GHASH1x 0, 15 + + add 14, 14, 5 + add 9, 9, 5 + std 5, 56(7) # update partial + sub 11, 11, 5 + li 5, 0 # done last byte - # - # Handle multiple partial blocks for encrypt and decrypt - # operations. - # -SYM_FUNC_START_LOCAL(Do_partial_block) - add 17, 15, 5 - cmpdi 17, 16 - bgt Big_block - GEN_MASK 18, 15, 5 - b _Partial -SYM_FUNC_END(Do_partial_block) -Big_block: + # + # Don't increase IV since this is the last partial. + # It should get updated in gcm_update if no more data blocks. + #vadduwm 30, 30, 31 # increase IV + stxvb16x 32+30, 0, 7 # update IV + li 10, 64 + stxvb16x 32+0, 0, 8 # Update X1 + stxvb16x 32+15, 10, 7 # Update pblock + blr +SYM_FUNC_END(__Process_partial) + +#### |
