Diffstat (limited to 'arch')
 arch/arm/boot/dts/stm32746g-eval.dts         |   4
 arch/arm/boot/dts/stm32f746.dtsi             |   7
 arch/arm/configs/stm32_defconfig             |   2
 arch/arm/crypto/Kconfig                      |   2
 arch/arm/crypto/aes-neonbs-glue.c            |  60
 arch/arm64/boot/dts/amlogic/meson-gx.dtsi    |   2
 arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi  |   5
 arch/metag/kernel/stacktrace.c               |   2
 arch/powerpc/crypto/Makefile                 |   3
 arch/powerpc/crypto/crc-vpmsum_test.c        | 137
 arch/powerpc/crypto/crc32-vpmsum_core.S      | 755
 arch/powerpc/crypto/crc32c-vpmsum_asm.S      | 715
 arch/powerpc/crypto/crct10dif-vpmsum_asm.S   | 850
 arch/powerpc/crypto/crct10dif-vpmsum_glue.c  | 128
 arch/x86/crypto/aes_ctrby8_avx-x86_64.S      |   7
 arch/x86/crypto/camellia_glue.c              |   4
 arch/x86/crypto/glue_helper.c                |   3
 arch/x86/crypto/serpent_sse2_glue.c          |   4
 arch/x86/crypto/twofish_glue_3way.c          |   4
 arch/x86/include/asm/crypto/glue_helper.h    |  10
20 files changed, 1952 insertions, 752 deletions
diff --git a/arch/arm/boot/dts/stm32746g-eval.dts b/arch/arm/boot/dts/stm32746g-eval.dts
index aa03fac1ec55..0dc18a0f0940 100644
--- a/arch/arm/boot/dts/stm32746g-eval.dts
+++ b/arch/arm/boot/dts/stm32746g-eval.dts
@@ -89,6 +89,10 @@
 	clock-frequency = <25000000>;
 };
 
+&crc {
+	status = "okay";
+};
+
 &usart1 {
 	pinctrl-0 = <&usart1_pins_a>;
 	pinctrl-names = "default";
diff --git a/arch/arm/boot/dts/stm32f746.dtsi b/arch/arm/boot/dts/stm32f746.dtsi
index f321ffe87144..755fb923c07b 100644
--- a/arch/arm/boot/dts/stm32f746.dtsi
+++ b/arch/arm/boot/dts/stm32f746.dtsi
@@ -289,6 +289,13 @@
 			};
 		};
 
+		crc: crc@40023000 {
+			compatible = "st,stm32f7-crc";
+			reg = <0x40023000 0x400>;
+			clocks = <&rcc 0 12>;
+			status = "disabled";
+		};
+
 		rcc: rcc@40023800 {
 			#clock-cells = <2>;
 			compatible = "st,stm32f42xx-rcc", "st,stm32-rcc";
diff --git a/arch/arm/configs/stm32_defconfig b/arch/arm/configs/stm32_defconfig
index a9d8e3c9b487..03437f8f9ad1 100644
--- a/arch/arm/configs/stm32_defconfig
+++ b/arch/arm/configs/stm32_defconfig
@@ -75,5 +75,7 @@ CONFIG_MAGIC_SYSRQ=y
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_DEBUG_BUGVERBOSE is not set
 # CONFIG_FTRACE is not set
+CONFIG_CRYPTO=y
+CONFIG_CRYPTO_DEV_STM32=y
 CONFIG_CRC_ITU_T=y
 CONFIG_CRC7=y
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index a8fce93137fb..b9adedcc5b2e 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -73,7 +73,7 @@ config CRYPTO_AES_ARM_BS
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_SIMD
-	select CRYPTO_AES_ARM
+	select CRYPTO_AES
 	help
 	  Use a faster and more secure NEON based implementation of AES in CBC,
 	  CTR and XTS modes
diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c
index 2920b96dbd36..c76377961444 100644
--- a/arch/arm/crypto/aes-neonbs-glue.c
+++ b/arch/arm/crypto/aes-neonbs-glue.c
@@ -42,9 +42,6 @@ asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
 asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[],
 				  int rounds, int blocks, u8 iv[]);
 
-asmlinkage void __aes_arm_encrypt(const u32 rk[], int rounds, const u8 in[],
-				  u8 out[]);
-
 struct aesbs_ctx {
 	int	rounds;
 	u8	rk[13 * (8 * AES_BLOCK_SIZE) + 32] __aligned(AES_BLOCK_SIZE);
@@ -52,12 +49,12 @@ struct aesbs_ctx {
 
 struct aesbs_cbc_ctx {
 	struct aesbs_ctx	key;
-	u32			enc[AES_MAX_KEYLENGTH_U32];
+	struct crypto_cipher	*enc_tfm;
 };
 
 struct aesbs_xts_ctx {
 	struct aesbs_ctx	key;
-	u32			twkey[AES_MAX_KEYLENGTH_U32];
+	struct crypto_cipher	*tweak_tfm;
 };
 
 static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
@@ -132,20 +129,18 @@ static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
 
 	ctx->key.rounds = 6 + key_len / 4;
 
-	memcpy(ctx->enc, rk.key_enc, sizeof(ctx->enc));
-
 	kernel_neon_begin();
 	aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds);
 	kernel_neon_end();
 
-	return 0;
+	return crypto_cipher_setkey(ctx->enc_tfm, in_key, key_len);
 }
 
 static void cbc_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
 {
 	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
 
-	__aes_arm_encrypt(ctx->enc, ctx->key.rounds, src, dst);
+	crypto_cipher_encrypt_one(ctx->enc_tfm, dst, src);
 }
 
 static int cbc_encrypt(struct skcipher_request *req)
@@ -181,6 +176,23 @@ static int cbc_decrypt(struct skcipher_request *req)
 	return err;
 }
 
+static int cbc_init(struct crypto_tfm *tfm)
+{
+	struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->enc_tfm = crypto_alloc_cipher("aes", 0, 0);
+	if (IS_ERR(ctx->enc_tfm))
+		return PTR_ERR(ctx->enc_tfm);
+	return 0;
+}
+
+static void cbc_exit(struct crypto_tfm *tfm)
+{
+	struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	crypto_free_cipher(ctx->enc_tfm);
+}
+
 static int ctr_encrypt(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
@@ -228,7 +240,6 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
 			  unsigned int key_len)
 {
 	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct crypto_aes_ctx rk;
 	int err;
 
 	err = xts_verify_key(tfm, in_key, key_len);
@@ -236,15 +247,30 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
 		return err;
 
 	key_len /= 2;
-	err = crypto_aes_expand_key(&rk, in_key + key_len, key_len);
+	err = crypto_cipher_setkey(ctx->tweak_tfm, in_key + key_len, key_len);
 	if (err)
 		return err;
 
-	memcpy(ctx->twkey, rk.key_enc, sizeof(ctx->twkey));
-
 	return aesbs_setkey(tfm, in_key, key_len);
 }
 
+static int xts_init(struct crypto_tfm *tfm)
+{
+	struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->tweak_tfm = crypto_alloc_cipher("aes", 0, 0);
+	if (IS_ERR(ctx->tweak_tfm))
+		return PTR_ERR(ctx->tweak_tfm);
+	return 0;
+}
+
+static void xts_exit(struct crypto_tfm *tfm)
+{
+	struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	crypto_free_cipher(ctx->tweak_tfm);
+}
+
 static int __xts_crypt(struct skcipher_request *req,
 		       void (*fn)(u8 out[], u8 const in[], u8 const rk[],
 				  int rounds, int blocks, u8 iv[]))
@@ -256,7 +282,7 @@ static int __xts_crypt(struct skcipher_request *req,
 
 	err = skcipher_walk_virt(&walk, req, true);
 
-	__aes_arm_encrypt(ctx->twkey, ctx->key.rounds, walk.iv, walk.iv);
+	crypto_cipher_encrypt_one(ctx->tweak_tfm, walk.iv, walk.iv);
 
 	kernel_neon_begin();
 	while (walk.nbytes >= AES_BLOCK_SIZE) {
@@ -309,6 +335,8 @@ static struct skcipher_alg aes_algs[] = { {
 	.base.cra_ctxsize	= sizeof(struct aesbs_cbc_ctx),
 	.base.cra_module	= THIS_MODULE,
 	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+	.base.cra_init		= cbc_init,
+	.base.cra_exit		= cbc_exit,
 
 	.min_keysize		= AES_MIN_KEY_SIZE,
 	.max_keysize		= AES_MAX_KEY_SIZE,
@@ -342,6 +370,8 @@ static struct skcipher_alg aes_algs[] = { {
 	.base.cra_ctxsize	= sizeof(struct aesbs_xts_ctx),
 	.base.cra_module	= THIS_MODULE,
 	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+	.base.cra_init		= xts_init,
+	.base.cra_exit		= xts_exit,
 
 	.min_keysize		= 2 * AES_MIN_KEY_SIZE,
 	.max_keysize		= 2 * AES_MAX_KEY_SIZE,
@@ -402,5 +432,5 @@ unregister_simds:
 	return err;
 }
 
-module_init(aes_init);
+late_initcall(aes_init);
 module_exit(aes_exit);
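For reference, the single-block cipher interface the glue code switches to above (crypto_alloc_cipher(), crypto_cipher_setkey(), crypto_cipher_encrypt_one(), crypto_free_cipher()) can be exercised on its own. The sketch below is illustrative only, not part of the patch, and the helper name demo_encrypt_one_block is made up:

	#include <linux/crypto.h>
	#include <linux/err.h>
	#include <crypto/aes.h>

	/* Hypothetical helper: allocate the generic "aes" single-block cipher
	 * (as cbc_init()/xts_init() above do once per tfm), key it, encrypt
	 * one 16-byte block, and free it again. */
	static int demo_encrypt_one_block(const u8 key[AES_KEYSIZE_128],
					  const u8 in[AES_BLOCK_SIZE],
					  u8 out[AES_BLOCK_SIZE])
	{
		struct crypto_cipher *tfm;
		int err;

		tfm = crypto_alloc_cipher("aes", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		err = crypto_cipher_setkey(tfm, key, AES_KEYSIZE_128);
		if (!err)
			crypto_cipher_encrypt_one(tfm, out, in);

		crypto_free_cipher(tfm);
		return err;
	}

Resolving the fallback through the crypto API at runtime, rather than calling __aes_arm_encrypt() directly, is what lets the Kconfig dependency above relax from CRYPTO_AES_ARM to CRYPTO_AES.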
diff --git a/arch/arm64/boot/dts/amlogic/meson-gx.dtsi b/arch/arm64/boot/dts/amlogic/meson-gx.dtsi
index 5d995f7724af..620495a43363 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gx.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-gx.dtsi
@@ -380,7 +380,7 @@
 			#size-cells = <2>;
 			ranges = <0x0 0x0 0x0 0xc8834000 0x0 0x2000>;
 
-			rng {
+			hwrng: rng {
 				compatible = "amlogic,meson-rng";
 				reg = <0x0 0x0 0x0 0x4>;
 			};
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi
index 04b3324bc132..a375cb21cc8b 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi
@@ -524,3 +524,8 @@
 &vpu {
 	compatible = "amlogic,meson-gxbb-vpu", "amlogic,meson-gx-vpu";
 };
+
+&hwrng {
+	clocks = <&clkc CLKID_RNG0>;
+	clock-names = "core";
+};
diff --git a/arch/metag/kernel/stacktrace.c b/arch/metag/kernel/stacktrace.c
index 91ffc4b75c33..09d67b7f51ca 100644
--- a/arch/metag/kernel/stacktrace.c
+++ b/arch/metag/kernel/stacktrace.c
@@ -31,8 +31,6 @@ static void tbi_boing_init(void)
 }
 #endif
 
-#define ALIGN_DOWN(addr, size) ((addr)&(~((size)-1)))
-
 /*
  * Unwind the current stack frame and store the new register values in the
  * structure passed as argument. Unwinding is equivalent to a function return,
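The ALIGN_DOWN() macro deleted above rounds an address down to a power-of-two boundary by clearing its low bits. A standalone illustration (ordinary C, not kernel code):

	#include <assert.h>

	#define ALIGN_DOWN(addr, size) ((addr)&(~((size)-1)))

	int main(void)
	{
		/* size must be a power of two for the bit mask to work */
		assert(ALIGN_DOWN(0x1237u, 0x10u) == 0x1230u); /* clears low 4 bits */
		assert(ALIGN_DOWN(0x1230u, 0x10u) == 0x1230u); /* already aligned */
		return 0;
	}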
diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index 87f40454bad3..67eca3af9fc7 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -10,6 +10,8 @@ obj-$(CONFIG_CRYPTO_SHA1_PPC) += sha1-powerpc.o
 obj-$(CONFIG_CRYPTO_SHA1_PPC_SPE) += sha1-ppc-spe.o
 obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o
 obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o
+obj-$(CONFIG_CRYPTO_CRCT10DIF_VPMSUM) += crct10dif-vpmsum.o
+obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o
 
 aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
 md5-ppc-y := md5-asm.o md5-glue.o
@@ -17,3 +19,4 @@ sha1-powerpc-y := sha1-powerpc-asm.o sha1.o
 sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o
 sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o
 crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o
+crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o crct10dif-vpmsum_glue.o
diff --git a/arch/powerpc/crypto/crc-vpmsum_test.c b/arch/powerpc/crypto/crc-vpmsum_test.c
new file mode 100644
index 000000000000..0153a9c6f4af
--- /dev/null
+++ b/arch/powerpc/crypto/crc-vpmsum_test.c
@@ -0,0 +1,137 @@
+/*
+ * CRC vpmsum tester
+ * Copyright 2017 Daniel Axtens, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/crc-t10dif.h>
+#include <linux/crc32.h>
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
+#include <asm/switch_to.h>
+
+static unsigned long iterations = 10000;
+
+#define MAX_CRC_LENGTH 65535
+
+
+static int __init crc_test_init(void)
+{
+	u16 crc16 = 0, verify16 = 0;
+	u32 crc32 = 0, verify32 = 0;
+	__le32 verify32le = 0;
+	unsigned char *data;
+	unsigned long i;
+	int ret;
+
+	struct crypto_shash *crct10dif_tfm;
+	struct crypto_shash *crc32c_tfm;
+
+	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+		return -ENODEV;
+
+	data = kmalloc(MAX_CRC_LENGTH, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	crct10dif_tfm = crypto_alloc_shash("crct10dif", 0, 0);
+
+	if (IS_ERR(crct10dif_tfm)) {
+		pr_err("Error allocating crc-t10dif\n");
+		goto free_buf;
+	}
+
+	crc32c_tfm = crypto_alloc_shash("crc32c", 0, 0);
+
+	if (IS_ERR(crc32c_tfm)) {
+		pr_err("Error allocating crc32c\n");
+		goto free_16;
+	}
+
+	do {
+		SHASH_DESC_ON_STACK(crct10dif_shash, crct10dif_tfm);
+		SHASH_DESC_ON_STACK(crc32c_shash, crc32c_tfm);
+
+		crct10dif_shash->tfm = crct10dif_tfm;
+		ret = crypto_shash_init(crct10dif_shash);
+
+		if (ret) {
+			pr_err("Error initing crc-t10dif\n");
+			goto free_32;
+		}
+
+
+		crc32c_shash->tfm = crc32c_tfm;
+		ret = crypto_shash_init(crc32c_shash);
+
+		if (ret) {
+			pr_err("Error initing crc32c\n");
+			goto free_32;
+		}
+
+		pr_info("crc-vpmsum_test begins, %lu iterations\n", iterations);
+		for (i=0; i<iterations; i++) {
+			size_t len, offset;
+
+			get_random_bytes(data, MAX_CRC_LENGTH);
+			get_random_bytes(&len, sizeof(len));
+			get_random_bytes(&offset, sizeof(offset));
+
+			len %= MAX_CRC_LENGTH;
+			offset &= 15;
+			if (len <= offset)
+				continue;
+			len -= offset;
+
+			crypto_shash_update(crct10dif_shash, data+offset, len);
+			crypto_shash_final(crct10dif_shash, (u8 *)(&crc16));
+			verify16 = crc_t10dif_generic(verify16, data+offset, len);
+
+
+			if (crc16 != verify16) {
+				pr_err("FAILURE in CRC16: got 0x%04x expected 0x%04x (len %lu)\n",
+				       crc16, verify16, len);
+				break;
+			}
+
+			crypto_shash_update(crc32c_shash, data+offset, len);
+			crypto_shash_final(crc32c_shash, (u8 *)(&crc32));
+			verify32 = le32_to_cpu(verify32le);
+			verify32le = ~cpu_to_le32(__crc32c_le(~verify32, data+offset, len));
+			if (crc32 != (u32)verify32le) {
+				pr_err("FAILURE in CRC32: got 0x%08x expected 0x%08x (len %lu)\n",
+				       crc32, verify32, len);
+				break;
+			}
+		}
+		pr_info("crc-vpmsum_test done, completed %lu iterations\n", i);
+	} while (0);
+
+free_32:
+	crypto_free_shash(crc32c_tfm);
+
+free_16:
+	crypto_free_shash(crct10dif_tfm);
+
+free_buf:
+	kfree(data);
+
+	return 0;
+}
+
+static void __exit crc_test_exit(void) {}
+
+module_init(crc_test_init);
+module_exit(crc_test_exit);
+module_param(iterations, long, 0400);
+
+MODULE_AUTHOR("Daniel Axtens <dja@axtens.net>");
+MODULE_DESCRIPTION("Vector polynomial multiply-sum CRC tester");
+MODULE_LICENSE("GPL");
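The tester above drives both CRC transforms through the kernel's shash interface. Condensed to a single transform, the call sequence it uses (crypto_alloc_shash(), SHASH_DESC_ON_STACK, crypto_shash_init/update/final) looks like the following sketch; crc32c_once() is a hypothetical helper, not part of the patch:

	#include <crypto/hash.h>
	#include <linux/err.h>

	/* Compute one CRC32C over buf via whichever "crc32c" implementation
	 * the crypto API resolved (e.g. crc32c-vpmsum on POWER8). */
	static int crc32c_once(const u8 *buf, unsigned int len, u32 *out)
	{
		struct crypto_shash *tfm;
		int err;

		tfm = crypto_alloc_shash("crc32c", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		{	/* scope for the on-stack descriptor, sized from tfm */
			SHASH_DESC_ON_STACK(desc, tfm);

			desc->tfm = tfm;
			err = crypto_shash_init(desc);
			if (!err)
				err = crypto_shash_update(desc, buf, len);
			if (!err)
				err = crypto_shash_final(desc, (u8 *)out);
		}

		crypto_free_shash(tfm);
		return err;
	}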
diff --git a/arch/powerpc/crypto/crc32-vpmsum_core.S b/arch/powerpc/crypto/crc32-vpmsum_core.S
new file mode 100644
index 000000000000..aadb59c96a27
--- /dev/null
+++ b/arch/powerpc/crypto/crc32-vpmsum_core.S
@@ -0,0 +1,755 @@
+/*
+ * Core of the accelerated CRC algorithm.
+ * In your file, define the constants and CRC_FUNCTION_NAME
+ * Then include this file.
+ *
+ * Calculate the checksum of data that is 16 byte aligned and a multiple of
+ * 16 bytes.
+ *
+ * The first step is to reduce it to 1024 bits. We do this in 8 parallel
+ * chunks in order to mask the latency of the vpmsum instructions. If we
+ * have more than 32 kB of data to checksum we repeat this step multiple
+ * times, passing in the previous 1024 bits.
+ *
+ * The next step is to reduce the 1024 bits to 64 bits. This step adds
+ * 32 bits of 0s to the end - this matches what a CRC does. We just
+ * calculate constants that land the data in this 32 bits.
+ *
+ * We then use fixed point Barrett reduction to compute a mod n over GF(2)
+ * for n = CRC using POWER8 instructions. We use x = 32.
+ *
+ * http://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+*/
+
+#include <asm/ppc_asm.h>
+#include <asm/ppc-opcode.h>
+
+#define MAX_SIZE	32768
+
+	.text
+
+#if defined(__BIG_ENDIAN__) && defined(REFLECT)
+#define BYTESWAP_DATA
+#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
+#define BYTESWAP_DATA
+#else
+#undef BYTESWAP_DATA
+#endif
+
+#define off16		r25
+#define off32		r26
+#define off48		r27
+#define off64		r28
+#define off80		r29
+#define off96		r30
+#define off112		r31
+
+#define const1		v24
+#define const2		v25
+
+#define byteswap	v26
+#define mask_32bit	v27
+#define mask_64bit	v28
+#define zeroes		v29
+
+#ifdef BYTESWAP_DATA
+#define VPERM(A, B, C, D) vperm	A, B, C, D
+#else
+#define VPERM(A, B, C, D)
+#endif
+
+/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
+FUNC_START(CRC_FUNCTION_NAME)
+	std	r31,-8(r1)
+	std	r30,-16(r1)
+	std	r29,-24(r1)
+	std	r28,-32(r1)
+	std	r27,-40(r1)
+	std	r26,-48(r1)
+	std	r25,-56(r1)
+
+	li	off16,16
+	li	off32,32
+	li	off48,48
+	li	off64,64
+	li	off80,80
+	li	off96,96
+	li	off112,112
+	li	r0,0
+
+	/* Enough room for saving 10 non volatile VMX registers */
+	subi	r6,r1,56+10*16
+	subi	r7,r1,56+2*16
+
+	stvx	v20,0,r6
+	stvx	v21,off16,r6
+	stvx	v22,off32,r6
+	stvx	v23,off48,r6
+	stvx	v24,off64,r6
+	stvx	v25,off80,r6
+	stvx	v26,off96,r6
+	stvx	v27,off112,r6
+	stvx	v28,0,r7
+	stvx	v29,off16,r7
+
+	mr	r10,r3
+
+	vxor	zeroes,zeroes,zeroes
+	vspltisw v0,-1
+
+	vsldoi	mask_32bit,zeroes,v0,4
+	vsldoi	mask_64bit,zeroes,v0,8
+
+	/* Get the initial value into v8 */
+	vxor	v8,v8,v8
+	MTVRD(v8, R3)
+#ifdef REFLECT
+	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
+#else
+	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
+#endif
+
+#ifdef BYTESWAP_DATA
+	addis	r3,r2,.byteswap_constant@toc@ha
+	addi	r3,r3,.byteswap_constant@toc@l
+
+	lvx	byteswap,0,r3
+	addi	r3,r3,16
+#endif
+
+	cmpdi	r5,256
+	blt	.Lshort
+
+	rldicr	r6,r5,0,56
+
+	/* Checksum in blocks of MAX_SIZE */
+1:	lis	r7,MAX_SIZE@h
+	ori	r7,r7,MAX_SIZE@l
+	mr	r9,r7
+	cmpd	r6,r7
+	bgt	2f
+	mr	r7,r6
+2:	subf	r6,r7,r6
+
+	/* our main loop does 128 bytes at a time */
+	srdi	r7,r7,7
+
+	/*
+	 * Work out the offset into the constants table to start at. Each
+	 * constant is 16 bytes, and it is used against 128 bytes of input
+	 * data - 128 / 16 = 8
+	 */
+	sldi	r8,r7,4
+	srdi	r9,r9,3
+	subf	r8,r8,r9
+
+	/* We reduce our final 128 bytes in a separate step */
+	addi	r7,r7,-1
+	mtctr	r7
+
+	addis	r3,r2,.constants@toc@ha
+	addi	r3,r3,.constants@toc@l
+
+	/* Find the start of our constants */
+	add	r3,r3,r8
+
+	/* zero v0-v7 which will contain our checksums */
+	vxor	v0,v0,v0
+	vxor	v1,v1,v1
+	vxor	v2,v2,v2
+	vxor	v3,v3,v3
+	vxor	v4,v4,v4
+	vxor	v5,v5,v5
+	vxor	v6,v6,v6
+	vxor	v7,v7,v7
+
+	lvx	const1,0,r3
+
+	/*
+	 * If we are looping back to consume more data we use the values
+	 * already in v16-v23.
+	 */
+	cmpdi	r0,1
+	beq	2f
+
+	/* First warm up pass */
+	lvx	v16,0,r4
+	lvx	v17,off16,r4
+	VPERM(v16,v16,v16,byteswap)
+	VPERM(v17,v17,v17,byteswap)
+	lvx	v18,off32,r4
+	lvx	v19,off48,r4
+	VPERM(v18,v18,v18,byteswap)
+	VPERM(v19,v19,v19,byteswap)
+	lvx	v20,off64,r4
+	lvx	v21,off80,r4
+	VPERM(v20,v20,v20,byteswap)
+	VPERM(v21,v21,v21,byteswap)
+	lvx	v22,off96,r4
+	lvx	v23,off112,r4
+	VPERM(v22,v22,v22,byteswap)
+	VPERM(v23,v23,v23,byteswap)
+	addi	r4,r4,8*16
+
+	/* xor in initial value */
+	vxor	v16,v16,v8
+
+2:	bdz	.Lfirst_warm_up_done
+
+	addi	r3,r3,16
+	lvx	const2,0,r3
+
+	/* Second warm up pass */
+	VPMSUMD(v8,v16,const1)
+	lvx	v16,0,r4
+	VPERM(v16,v16,v16,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v9,v17,const1)
+	lvx	v17,off16,r4
+	VPERM(v17,v17,v17,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v10,v18,const1)
+	lvx	v18,off32,r4
+	VPERM(v18,v18,v18,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v11,v19,const1)
+	lvx	v19,off48,r4
+	VPERM(v19,v19,v19,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v12,v20,const1)
+	lvx	v20,off64,r4
+	VPERM(v20,v20,v20,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v13,v21,const1)
+	lvx	v21,off80,r4
+	VPERM(v21,v21,v21,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v14,v22,const1)
+	lvx	v22,off96,r4
+	VPERM(v22,v22,v22,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v15,v23,const1)
+	lvx	v23,off112,r4
+	VPERM(v23,v23,v23,byteswap)
+
+	addi	r4,r4,8*16
+
+	bdz	.Lfirst_cool_down
+
+	/*
+	 * main loop. We modulo schedule it such that it takes three iterations
+	 * to complete - first iteration load, second iteration vpmsum, third
+	 * iteration xor.
+	 */
+	.balign	16
+4:	lvx	const1,0,r3
+	addi	r3,r3,16
+	ori	r2,r2,0
+
+	vxor	v0,v0,v8
+	VPMSUMD(v8,v16,const2)
+	lvx	v16,0,r4
+	VPERM(v16,v16,v16,byteswap)
+	ori	r2,r2,0
+
+	vxor	v1,v1,v9
+	VPMSUMD(v9,v17,const2)
+	lvx	v17,off16,r4
+	VPERM(v17,v17,v17,byteswap)
+	ori	r2,r2,0
+
+	vxor	v2,v2,v10
+	VPMSUMD(v10,v18,const2)
+	lvx	v18,off32,r4
+	VPERM(v18,v18,v18,byteswap)
+	ori	r2,r2,0
+
+	vxor	v3,v3,v11
+	VPMSUMD(v11,v19,const2)
+	lvx	v19,off48,r4
+	VPERM(v19,v19,v19,byteswap)
+	lvx	const2,0,r3
+	ori	r2,r2,0
+
+	vxor	v4,v4,v12
+	VPMSUMD(v12,v20,const1)
+	lvx	v20,off64,r4
+	VPERM(v20,v20,v20,byteswap)
+	ori	r2,r2,0
+
+	vxor	v5,v5,v13
+	VPMSUMD(v13,v21,const1)
+	lvx	v21,off80,r4
+	VPERM(v21,v21,v21,byteswap)
+	ori	r2,r2,0
+
+	vxor	v6,v6,v14
+	VPMSUMD(v14,v22,const1)
+	lvx	v22,off96,r4
+	VPERM(v22,v22,v22,byteswap)
+	ori	r2,r2,0
+
+	vxor	v7,v7,v15
+	VPMSUMD(v15,v23,const1)
+	lvx	v23,off112,r4
+	VPERM(v23,v23,v23,byteswap)
+
+	addi	r4,r4,8*16
+
+	bdnz	4b
+
+.Lfirst_cool_down:
+	/* First cool down pass */
+	lvx	const1,0,r3
+	addi	r3,r3,16
+
+	vxor	v0,v0,v8
+	VPMSUMD(v8,v16,const1)
+	ori	r2,r2,0
+
+	vxor	v1,v1,v9
+	VPMSUMD(v9,v17,const1)
+	ori	r2,r2,0
+
+	vxor	v2,v2,v10
+	VPMSUMD(v10,v18,const1)
+	ori	r2,r2,0
+
+	vxor	v3,v3,v11
+	VPMSUMD(v11,v19,const1)
+	ori	r2,r2,0
+
+	vxor	v4,v4,v12
+	VPMSUMD(v12,v20,const1)
+	ori	r2,r2,0
+
+	vxor	v5,v5,v13
+	VPMSUMD(v13,v21,const1)
+	ori	r2,r2,0
+
+	vxor	v6,v6,v14
+	VPMSUMD(v14,v22,const1)
+	ori	r2,r2,0
+
+	vxor	v7,v7,v15
+	VPMSUMD(v15,v23,const1)
+	ori	r2,r2,0
+
+.Lsecond_cool_down:
+	/* Second cool down pass */
+	vxor	v0,v0,v8
+	vxor	v1,v1,v9
+	vxor	v2,v2,v10
+	vxor	v3,v3,v11
+	vxor	v4,v4,v12
+	vxor	v5,v5,v13
+	vxor	v6,v6,v14
+	vxor	v7,v7,v15
+
+#ifdef REFLECT
+	/*
+	 * vpmsumd produces a 96 bit result in the least significant bits
+	 * of the register. Since we are bit reflected we have to shift it
+	 * left 32 bits so it occupies the least significant bits in the
+	 * bit reflected domain.
+	 */
+	vsldoi	v0,v0,zeroes,4
+	vsldoi	v1,v1,zeroes,4
+	vsldoi	v2,v2,zeroes,4
+	vsldoi	v3,v3,zeroes,4
+	vsldoi	v4,v4,zeroes,4
+	vsldoi	v5,v5,zeroes,4
+	vsldoi	v6,v6,zeroes,4
+	vsldoi	v7,v7,zeroes,4
+#endif
+
+	/* xor with last 1024 bits */
+	lvx	v8,0,r4
+	lvx	v9,off16,r4
+	VPERM(v8,v8,v8,byteswap)
+	VPERM(v9,v9,v9,byteswap)
+	lvx	v10,off32,r4
+	lvx	v11,off48,r4
+	VPERM(v10,v10,v10,byteswap)
+	VPERM(v11,v11,v11,byteswap)
+	lvx	v12,off64,r4
+	lvx	v13,off80,r4
+	VPERM(v12,v12,v12,byteswap)
+	VPERM(v13,v13,v13,byteswap)
+	lvx	v14,off96,r4
+	lvx	v15,off112,r4
+	VPERM(v14,v14,v14,byteswap)
+	VPERM(v15,v15,v15,byteswap)
+
+	addi	r4,r4,8*16
+
+	vxor	v16,v0,v8
+	vxor	v17,v1,v9
+	vxor	v18,v2,v10
+	vxor	v19,v3,v11
+	vxor	v20,v4,v12
+	vxor	v21,v5,v13
+	vxor	v22,v6,v14
+	vxor	v23,v7,v15
+
+	li	r0,1
+	cmpdi	r6,0
+	addi	r6,r6,128
+	bne	1b
+
+	/* Work out how many bytes we have left */
+	andi.	r5,r5,127
+
+	/* Calculate where in the constant table we need to start */
+	subfic	r6,r5,128
+	add	r3,r3,r6
+
+	/* How many 16 byte chunks are in the tail */
+	srdi	r7,r5,4
+	mtctr	r7
+
+	/*
+	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
+	 * 32 bits to include the trailing 32 bits of zeros
+	 */
+	lvx	v0,0,r3
+	lvx	v1,off16,r3
+	lvx	v2,off32,r3
+	lvx	v3,off48,r3
+	lvx	v4,off64,r3
+	lvx	v5,off80,r3
+	lvx	v6,off96,r3
+	lvx	v7,off112,r3
+	addi	r3,r3,8*16
+
+	VPMSUMW(v0,v16,v0)
+	VPMSUMW(v1,v17,v1)
+	VPMSUMW(v2,v18,v2)
+	VPMSUMW(v3,v19,v3)
+	VPMSUMW(v4,v20,v4)
+	VPMSUMW(v5,v21,v5)
+	VPMSUMW(v6,v22,v6)
+	VPMSUMW(v7,v23,v7)
+
+	/* Now reduce the tail (0 - 112 bytes) */
+	cmpdi	r7,0
+	beq	1f
+
+	lvx	v16,0,r4
+	lvx	v17,0,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off16,r4
+	lvx	v17,off16,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off32,r4
+	lvx	v17,off32,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off48,r4
+	lvx	v17,off48,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off64,r4
+	lvx	v17,off64,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off80,r4
+	lvx	v17,off80,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off96,r4
+	lvx	v17,off96,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+
+	/* Now xor all the parallel chunks together */
+1:	vxor	v0,v0,v1
+	vxor	v2,v2,v3
+	vxor	v4,v4,v5
+	vxor	v6,v6,v7
+
+	vxor	v0,v0,v2
+	vxor	v4,v4,v6
+
+	vxor	v0,v0,v4
+
+.Lbarrett_reduction:
+	/* Barrett constants */
+	addis	r3,r2,.barrett_constants@toc@ha
+	addi	r3,r3,.barrett_constants@toc@l
+
+	lvx	const1,0,r3
+	lvx	const2,off16,r3
+
+	vsldoi	v1,v0,v0,8
+	vxor	v0,v0,v1	/* xor two 64 bit results together */
+
+#ifdef REFLECT
+	/* shift left one bit */
+	vspltisb v1,1
+	vsl	v0,v0,v1
+#endif
+
+	vand	v0,v0,mask_64bit
+#ifndef REFLECT
+	/*
+	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
+	 * the multiple of our polynomial that we need to subtract. By
+	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
+	 * result back down 2x bits, we round down to the nearest multiple.
+	 */
+	VPMSUMD(v1,v0,const1)	/* ma */
+	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
+	VPMSUMD(v1,v1,const2)	/* qn */
+	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */
+
+	/*
+	 * Get the result into r3. We need to shift it left 8 bytes:
+	 * V0 [ 0 1 2 X ]
+	 * V0 [ 0 X 2 3 ]
+	 */
+	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
+#else
+	/*
+	 * The reflected version of Barrett reduction. Instead of bit
+	 * reflecting our data (which is expensive to do), we bit reflect our
+	 * constants and our algorithm, which means the intermediate data in
+	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
+	 * the algorithm because we don't carry in mod 2 arithmetic.
+	 */
+	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
+	VPMSUMD(v1,v1,const1)	/* ma */
+	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
+	VPMSUMD(v1,v1,const2)	/* qn */
+	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */
+
+	/*
+	 * Since we are bit reflected, the result (ie the low 32 bits) is in
+	 * the high 32 bits. We just need to shift it left 4 bytes
+	 * V0 [ 0 1 X 3 ]
+	 * V0 [ 0 X 2 3 ]
+	 */
+	vsldoi	v0,v0,zeroes,4	/* shift result into top 64 bits of */
+#endif
+
+	/* Get it into r3 */
+	MFVRD(R3, v0)
+
+.Lout:
+	subi	r6,r1,56+10*16
+	subi	r7,r1,56+2*16
+
+	lvx	v20,0,r6
+	lvx	v21,off16,r6
+	lvx	v22,off32,r6
+	lvx	v23,off48,r6
+	lvx	v24,off64,r6
+	lvx	v25,off80,r6
+	lvx	v26,off96,r6
+	lvx	v27,off112,r6
+	lvx	v28,0,r7
+	lvx	v29,off16,r7
+
+	ld	r31,-8(r1)
+	ld	r30,-16(r1)
+	ld	r29,-24(r1)
+	ld	r28,-32(r1)
+	ld	r27,-40(r1)
+	ld	r26,-48(r1)
+	ld	r25,-56(r1)
+
+	blr
+
+.Lfirst_warm_up_done:
+	lvx	const1,0,r3
+	addi	r3,r3,16
+
+	VPMSUMD(v8,v16,const1)
+	VPMSUMD(v9,v17,const1)
+	VPMSUMD(v10,v18,const1)
+	VPMSUMD(v11,v19,const1)
+	VPMSUMD(v12,v20,const1)
+	VPMSUMD(v13,v21,const1)
+	VPMSUMD(v14,v22,const1)
+	VPMSUMD(v15,v23,const1)
+
+	b	.Lsecond_cool_down
+
+.Lshort:
+	cmpdi	r5,0
+	beq	.Lzero
+
+	addis	r3,r2,.short_constants@toc@ha
+	addi	r3,r3,.short_constants@toc@l
+
+	/* Calculate where in the constant table we need to start */
+	subfic	r6,r5,256
+	add	r3,r3,r6
+
+	/* How many 16 byte chunks? */
+	srdi	r7,r5,4
+	mtctr	r7
+
+	vxor	v19,v19,v19
+	vxor	v20,v20,v20
+
+	lvx	v0,0,r4
+	lvx	v16,0,r3
+	VPERM(v0,v0,v16,byteswap)
+	vxor	v0,v0,v8	/* xor in initial value */
+	VPMSUMW(v0,v0,v16)
+	bdz	.Lv0
+
+	lvx	v1,off16,r4
+	lvx	v17,off16,r3
+	VPERM(v1,v1,v17,byteswap)
+	VPMSUMW(v1,v1,v17)
+	bdz	.Lv1
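The header comment of crc32-vpmsum_core.S above describes the closing step: a fixed-point Barrett reduction computing a mod n over GF(2) with x = 32. As a standalone illustration of the non-reflected variant (ordinary C, not kernel code; the test input is arbitrary and the helper names are made up), the sketch below reduces a 64-bit polynomial modulo the CRC-32 polynomial once with a single Barrett step and once by schoolbook division, and the two results agree:

	#include <stdint.h>
	#include <stdio.h>

	/* Carry-less multiply of two 64-bit polynomials over GF(2); the
	 * 128-bit product is returned as *hi (bits 64-127) and *lo (0-63). */
	static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
	{
		uint64_t h = 0, l = 0;
		int i;

		for (i = 0; i < 64; i++) {
			if ((b >> i) & 1) {
				l ^= a << i;
				if (i)
					h ^= a >> (64 - i);
			}
		}
		*hi = h;
		*lo = l;
	}

	/* Reference answer: schoolbook remainder of a 64-bit polynomial
	 * modulo a degree-32 polynomial p33 (33 bits, x^32 term included). */
	static uint32_t gf2_mod(uint64_t a, uint64_t p33)
	{
		int i;

		for (i = 63; i >= 32; i--)
			if ((a >> i) & 1)
				a ^= p33 << (i - 32);
		return (uint32_t)a;
	}

	/* mu = floor(x^64 / p): long division. The first step cancels the
	 * x^64 term, which is exactly the bit that overflows p33 << 32. */
	static uint64_t gf2_div_x64(uint64_t p33)
	{
		uint64_t q = 1ULL << 32;
		uint64_t r = p33 << 32;
		int i;

		for (i = 31; i >= 0; i--) {
			if ((r >> (i + 32)) & 1) {
				q |= 1ULL << i;
				r ^= p33 << i;
			}
		}
		return q;	/* 33-bit quotient */
	}

	/* One Barrett step, as the assembly comments describe it:
	 * q = floor(a*mu / x^64), then a - q*p leaves a mod p in the low
	 * 32 bits. Over GF(2) there are no carries, so this is exact. */
	static uint32_t barrett32(uint64_t a, uint64_t p33, uint64_t mu)
	{
		uint64_t hi, lo;

		clmul64(a, mu, &hi, &lo);	/* "ma" */
		clmul64(hi, p33, &hi, &lo);	/* "qn": hi of ma is q */
		return (uint32_t)(a ^ lo);	/* a - qn; xor in GF(2) */
	}

	int main(void)
	{
		const uint64_t p = 0x104C11DB7ULL;	   /* CRC-32 polynomial */
		const uint64_t mu = gf2_div_x64(p);
		const uint64_t a = 0x1D0F2B4A6C8E9135ULL; /* arbitrary input */

		printf("mu      = 0x%09llx\n", (unsigned long long)mu);
		printf("barrett = 0x%08x\n", barrett32(a, p, mu));
		printf("naive   = 0x%08x\n", gf2_mod(a, p)); /* same value */
		return 0;
	}

The reflected variant used by crc32c-vpmsum works the same way on bit-reversed constants; as the file's own comments note, reflecting the constants and the algorithm is cheaper than bit-reflecting the data, which is why the assembly carries separate REFLECT paths.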
