diff options
Diffstat (limited to 'arch/powerpc')
54 files changed, 3458 insertions, 293 deletions
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7709b62e6843..21edd664689e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -157,6 +157,7 @@ config PPC select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if PPC_RADIX_MMU select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX @@ -174,6 +175,7 @@ config PPC select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WANT_IRQS_OFF_ACTIVATE_MM select ARCH_WANT_LD_ORPHAN_WARN + select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP if PPC_RADIX_MMU select ARCH_WANTS_MODULES_DATA_IN_VMALLOC if PPC_BOOK3S_32 || PPC_8xx select ARCH_WEAK_RELEASE_ACQUIRE select BINFMT_ELF @@ -193,6 +195,7 @@ config PPC select GENERIC_CPU_VULNERABILITIES if PPC_BARRIER_NOSPEC select GENERIC_EARLY_IOREMAP select GENERIC_GETTIMEOFDAY + select GENERIC_IOREMAP select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL select GENERIC_PCI_IOMAP if PCI diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index ad1872518992..f25024afdda5 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -111,4 +111,30 @@ config CRYPTO_AES_GCM_P10 Support for cryptographic acceleration instructions on Power10 or later CPU. This module supports stitched acceleration for AES/GCM. +config CRYPTO_CHACHA20_P10 + tristate "Ciphers: ChaCha20, XChacha20, XChacha12 (P10 or later)" + depends on PPC64 && CPU_LITTLE_ENDIAN + select CRYPTO_SKCIPHER + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + help + Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12 + stream cipher algorithms + + Architecture: PowerPC64 + - Power10 or later + - Little-endian + +config CRYPTO_POLY1305_P10 + tristate "Hash functions: Poly1305 (P10 or later)" + depends on PPC64 && CPU_LITTLE_ENDIAN + select CRYPTO_HASH + select CRYPTO_LIB_POLY1305_GENERIC + help + Poly1305 authenticator algorithm (RFC7539) + + Architecture: PowerPC64 + - Power10 or later + - Little-endian + endmenu diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile index 7b4f516abec1..ebdac1b9eb9a 100644 --- a/arch/powerpc/crypto/Makefile +++ b/arch/powerpc/crypto/Makefile @@ -14,6 +14,8 @@ obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o obj-$(CONFIG_CRYPTO_CRCT10DIF_VPMSUM) += crct10dif-vpmsum.o obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o +obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o +obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o md5-ppc-y := md5-asm.o md5-glue.o @@ -23,6 +25,8 @@ sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o crct10dif-vpmsum_glue.o aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o +chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o +poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o quiet_cmd_perl = PERL $@ cmd_perl = $(PERL) $< $(if $(CONFIG_CPU_LITTLE_ENDIAN), linux-ppc64le, linux-ppc64) > $@ diff --git a/arch/powerpc/crypto/chacha-p10-glue.c b/arch/powerpc/crypto/chacha-p10-glue.c new file mode 100644 index 000000000000..74fb86b0d209 --- /dev/null +++ b/arch/powerpc/crypto/chacha-p10-glue.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * PowerPC P10 (ppc64le) accelerated ChaCha and XChaCha stream ciphers, + * including ChaCha20 (RFC7539) + * + * Copyright 2023- IBM Corp. All rights reserved. + */ + +#include <crypto/algapi.h> +#include <crypto/internal/chacha.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/skcipher.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/cpufeature.h> +#include <linux/sizes.h> +#include <asm/simd.h> +#include <asm/switch_to.h> + +asmlinkage void chacha_p10le_8x(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10); + +static void vsx_begin(void) +{ + preempt_disable(); + enable_kernel_vsx(); +} + +static void vsx_end(void) +{ + disable_kernel_vsx(); + preempt_enable(); +} + +static void chacha_p10_do_8x(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + unsigned int l = bytes & ~0x0FF; + + if (l > 0) { + chacha_p10le_8x(state, dst, src, l, nrounds); + bytes -= l; + src += l; + dst += l; + state[12] += l / CHACHA_BLOCK_SIZE; + } + + if (bytes > 0) + chacha_crypt_generic(state, dst, src, bytes, nrounds); +} + +void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) +{ + hchacha_block_generic(state, stream, nrounds); +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) +{ + chacha_init_generic(state, key, iv); +} +EXPORT_SYMBOL(chacha_init_arch); + +void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, + int nrounds) +{ + if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE || + !crypto_simd_usable()) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + vsx_begin(); + chacha_p10_do_8x(state, dst, src, todo, nrounds); + vsx_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +static int chacha_p10_stream_xor(struct skcipher_request *req, + const struct chacha_ctx *ctx, const u8 *iv) +{ + struct skcipher_walk walk; + u32 state[16]; + int err; + + err = skcipher_walk_virt(&walk, req, false); + if (err) + return err; + + chacha_init_generic(state, ctx->key, iv); + + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = rounddown(nbytes, walk.stride); + + if (!crypto_simd_usable()) { + chacha_crypt_generic(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, + ctx->nrounds); + } else { + vsx_begin(); + chacha_p10_do_8x(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, ctx->nrounds); + vsx_end(); + } + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + if (err) + break; + } + + return err; +} + +static int chacha_p10(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + + return chacha_p10_stream_xor(req, ctx, req->iv); +} + +static int xchacha_p10(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx subctx; + u32 state[16]; + u8 real_iv[16]; + + chacha_init_generic(state, ctx->key, req->iv); + hchacha_block_arch(state, subctx.key, ctx->nrounds); + subctx.nrounds = ctx->nrounds; + + memcpy(&real_iv[0], req->iv + 24, 8); + memcpy(&real_iv[8], req->iv + 16, 8); + return chacha_p10_stream_xor(req, &subctx, real_iv); +} + +static struct skcipher_alg algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-p10", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = chacha20_setkey, + .encrypt = chacha_p10, + .decrypt = chacha_p10, + }, { + .base.cra_name = "xchacha20", + .base.cra_driver_name = "xchacha20-p10", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = chacha20_setkey, + .encrypt = xchacha_p10, + .decrypt = xchacha_p10, + }, { + .base.cra_name = "xchacha12", + .base.cra_driver_name = "xchacha12-p10", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = chacha12_setkey, + .encrypt = xchacha_p10, + .decrypt = xchacha_p10, + } +}; + +static int __init chacha_p10_init(void) +{ + static_branch_enable(&have_p10); + + return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); +} + +static void __exit chacha_p10_exit(void) +{ + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); +} + +module_cpu_feature_match(PPC_MODULE_FEATURE_P10, chacha_p10_init); +module_exit(chacha_p10_exit); + +MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (P10 accelerated)"); +MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("chacha20"); +MODULE_ALIAS_CRYPTO("chacha20-p10"); +MODULE_ALIAS_CRYPTO("xchacha20"); +MODULE_ALIAS_CRYPTO("xchacha20-p10"); +MODULE_ALIAS_CRYPTO("xchacha12"); +MODULE_ALIAS_CRYPTO("xchacha12-p10"); diff --git a/arch/powerpc/crypto/chacha-p10le-8x.S b/arch/powerpc/crypto/chacha-p10le-8x.S new file mode 100644 index 000000000000..17bedb66b822 --- /dev/null +++ b/arch/powerpc/crypto/chacha-p10le-8x.S @@ -0,0 +1,842 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +# +# Accelerated chacha20 implementation for ppc64le. +# +# Copyright 2023- IBM Corp. All rights reserved +# +#=================================================================================== +# Written by Danny Tsen <dtsen@us.ibm.com> +# +# chacha_p10le_8x(u32 *state, byte *dst, const byte *src, +# size_t len, int nrounds); +# +# do rounds, 8 quarter rounds +# 1. a += b; d ^= a; d <<<= 16; +# 2. c += d; b ^= c; b <<<= 12; +# 3. a += b; d ^= a; d <<<= 8; +# 4. c += d; b ^= c; b <<<= 7 +# +# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16 +# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12 +# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8 +# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7 +# +# 4 blocks (a b c d) +# +# a0 b0 c0 d0 +# a1 b1 c1 d1 +# ... +# a4 b4 c4 d4 +# ... +# a8 b8 c8 d8 +# ... +# a12 b12 c12 d12 +# a13 ... +# a14 ... +# a15 b15 c15 d15 +# +# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) +# Diagnal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) +# + +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/asm-compat.h> +#include <linux/linkage.h> + +.machine "any" +.text + +.macro SAVE_GPR GPR OFFSET FRAME + std \GPR,\OFFSET(\FRAME) +.endm + +.macro SAVE_VRS VRS OFFSET FRAME + li 16, \OFFSET + stvx \VRS, 16, \FRAME +.endm + +.macro SAVE_VSX VSX OFFSET FRAME + li 16, \OFFSET + stxvx \VSX, 16, \FRAME +.endm + +.macro RESTORE_GPR GPR OFFSET FRAME + ld \GPR,\OFFSET(\FRAME) +.endm + +.macro RESTORE_VRS VRS OFFSET FRAME + li 16, \OFFSET + lvx \VRS, 16, \FRAME +.endm + +.macro RESTORE_VSX VSX OFFSET FRAME + li 16, \OFFSET + lxvx \VSX, 16, \FRAME +.endm + +.macro SAVE_REGS + mflr 0 + std 0, 16(1) + stdu 1,-752(1) + + SAVE_GPR 14, 112, 1 + SAVE_GPR 15, 120, 1 + SAVE_GPR 16, 128, 1 + SAVE_GPR 17, 136, 1 + SAVE_GPR 18, 144, 1 + SAVE_GPR 19, 152, 1 + SAVE_GPR 20, 160, 1 + SAVE_GPR 21, 168, 1 + SAVE_GPR 22, 176, 1 + SAVE_GPR 23, 184, 1 + SAVE_GPR 24, 192, 1 + SAVE_GPR 25, 200, 1 + SAVE_GPR 26, 208, 1 + SAVE_GPR 27, 216, 1 + SAVE_GPR 28, 224, 1 + SAVE_GPR 29, 232, 1 + SAVE_GPR 30, 240, 1 + SAVE_GPR 31, 248, 1 + + addi 9, 1, 256 + SAVE_VRS 20, 0, 9 + SAVE_VRS 21, 16, 9 + SAVE_VRS 22, 32, 9 + SAVE_VRS 23, 48, 9 + SAVE_VRS 24, 64, 9 + SAVE_VRS 25, 80, 9 + SAVE_VRS 26, 96, 9 + SAVE_VRS 27, 112, 9 + SAVE_VRS 28, 128, 9 + SAVE_VRS 29, 144, 9 + SAVE_VRS 30, 160, 9 + SAVE_VRS 31, 176, 9 + + SAVE_VSX 14, 192, 9 + SAVE_VSX 15, 208, 9 + SAVE_VSX 16, 224, 9 + SAVE_VSX 17, 240, 9 + SAVE_VSX 18, 256, 9 + SAVE_VSX 19, 272, 9 + SAVE_VSX 20, 288, 9 + SAVE_VSX 21, 304, 9 + SAVE_VSX 22, 320, 9 + SAVE_VSX 23, 336, 9 + SAVE_VSX 24, 352, 9 + SAVE_VSX 25, 368, 9 + SAVE_VSX 26, 384, 9 + SAVE_VSX 27, 400, 9 + SAVE_VSX 28, 416, 9 + SAVE_VSX 29, 432, 9 + SAVE_VSX 30, 448, 9 + SAVE_VSX 31, 464, 9 +.endm # SAVE_REGS + +.macro RESTORE_REGS + addi 9, 1, 256 + RESTORE_VRS 20, 0, 9 + RESTORE_VRS 21, 16, 9 + RESTORE_VRS 22, 32, 9 + RESTORE_VRS 23, 48, 9 + RESTORE_VRS 24, 64, 9 + RESTORE_VRS 25, 80, 9 + RESTORE_VRS 26, 96, 9 + RESTORE_VRS 27, 112, 9 + RESTORE_VRS 28, 128, 9 + RESTORE_VRS 29, 144, 9 + RESTORE_VRS 30, 160, 9 + RESTORE_VRS 31, 176, 9 + + RESTORE_VSX 14, 192, 9 + RESTORE_VSX 15, 208, 9 + RESTORE_VSX 16, 224, 9 + RESTORE_VSX 17, 240, 9 + RESTORE_VSX 18, 256, 9 + RESTORE_VSX 19, 272, 9 + RESTORE_VSX 20, 288, 9 + RESTORE_VSX 21, 304, 9 + RESTORE_VSX 22, 320, 9 + RESTORE_VSX 23, 336, 9 + RESTORE_VSX 24, 352, 9 + RESTORE_VSX 25, 368, 9 + RESTORE_VSX 26, 384, 9 + RESTORE_VSX 27, 400, 9 + RESTORE_VSX 28, 416, 9 + RESTORE_VSX 29, 432, 9 + RESTORE_VSX 30, 448, 9 + RESTORE_VSX 31, 464, 9 + + RESTORE_GPR 14, 112, 1 + RESTORE_GPR 15, 120, 1 + RESTORE_GPR 16, 128, 1 + RESTORE_GPR 17, 136, 1 + RESTORE_GPR 18, 144, 1 + RESTORE_GPR 19, 152, 1 + RESTORE_GPR 20, 160, 1 + RESTORE_GPR 21, 168, 1 + RESTORE_GPR 22, 176, 1 + RESTORE_GPR 23, 184, 1 + RESTORE_GPR 24, 192, 1 + RESTORE_GPR 25, 200, 1 + RESTORE_GPR 26, 208, 1 + RESTORE_GPR 27, 216, 1 + RESTORE_GPR 28, 224, 1 + RESTORE_GPR 29, 232, 1 + RESTORE_GPR 30, 240, 1 + RESTORE_GPR 31, 248, 1 + + addi 1, 1, 752 + ld 0, 16(1) + mtlr 0 +.endm # RESTORE_REGS + +.macro QT_loop_8x + # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) + xxlor 0, 32+25, 32+25 + xxlor 32+25, 20, 20 + vadduwm 0, 0, 4 + vadduwm 1, 1, 5 + vadduwm 2, 2, 6 + vadduwm 3, 3, 7 + vadduwm 16, 16, 20 + vadduwm 17, 17, 21 + vadduwm 18, 18, 22 + vadduwm 19, 19, 23 + + vpermxor 12, 12, 0, 25 + vpermxor 13, 13, 1, 25 + vpermxor 14, 14, 2, 25 + vpermxor 15, 15, 3, 25 + vpermxor 28, 28, 16, 25 + vpermxor 29, 29, 17, 25 + vpermxor 30, 30, 18, 25 + vpermxor 31, 31, 19, 25 + xxlor 32+25, 0, 0 + vadduwm 8, 8, 12 + vadduwm 9, 9, 13 + vadduwm 10, 10, 14 + vadduwm 11, 11, 15 + vadduwm 24, 24, 28 + vadduwm 25, 25, 29 + vadduwm 26, 26, 30 + vadduwm 27, 27, 31 + vxor 4, 4, 8 + vxor 5, 5, 9 + vxor 6, 6, 10 + vxor 7, 7, 11 + vxor 20, 20, 24 + vxor 21, 21, 25 + vxor 22, 22, 26 + vxor 23, 23, 27 + + xxlor 0, 32+25, 32+25 + xxlor 32+25, 21, 21 + vrlw 4, 4, 25 # + vrlw 5, 5, 25 + vrlw 6, 6, 25 + vrlw 7, 7, 25 + vrlw 20, 20, 25 # + vrlw 21, 21, 25 + vrlw 22, 22, 25 + vrlw 23, 23, 25 + xxlor 32+25, 0, 0 + vadduwm 0, 0, 4 + vadduwm 1, 1, 5 + vadduwm 2, 2, 6 + vadduwm 3, 3, 7 + vadduwm 16, 16, 20 + vadduwm 17, 17, 21 + vadduwm 18, 18, 22 + vadduwm 19, 19, 23 + + xxlor 0, 32+25, 32+25 + xxlor 32+25, 22, 22 + vpermxor 12, 12, 0, 25 + vpermxor 13, 13, 1, 25 + vpermxor 14, 14, 2, 25 + vpermxor 15, 15, 3, 25 + vpermxor 28, 28, 16, 25 + vpermxor 29, 29, 17, 25 + vpermxor 30, 30, 18, 25 + vpermxor 31, 31, 19, 25 + xxlor 32+25, 0, 0 + vadduwm 8, 8, 12 + vadduwm 9, 9, 13 + vadduwm 10, 10, 14 + vadduwm 11, 11, 15 + vadduwm 24, 24, 28 + vadduwm 25, 25, 29 + vadduwm 26, 26, 30 + vadduwm 27, 27, 31 + xxlor 0, 32+28, 32+28 + xxlor 32+28, 23, 23 + vxor 4, 4, 8 + vxor 5, 5, 9 + vxor 6, 6, 10 + vxor 7, 7, 11 + vxor 20, 20, 24 + vxor 21, 21, 25 + vxor 22, 22, 26 + vxor 23, 23, 27 + vrlw 4, 4, 28 # + vrlw 5, 5, 28 + vrlw 6, 6, 28 + vrlw 7, 7, 28 + vrlw 20, 20, 28 # + vrlw 21, 21, 28 + vrlw 22, 22, 28 + vrlw 23, 23, 28 + xxlor 32+28, 0, 0 + + # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) + xxlor 0, 32+25, 32+25 + xxlor 32+25, 20, 20 + vadduwm 0, 0, 5 + vadduwm 1, 1, 6 + vadduwm 2, 2, 7 + vadduwm 3, 3, 4 + vadduwm 16, 16, 21 + vadduwm 17, 17, 22 + vadduwm 18, 18, 23 + vadduwm 19, 19, 20 + + vpermxor 15, 15, 0, 25 + vpermxor 12, 12, 1, 25 + vpermxor 13, 13, 2, 25 + vpermxor 14, 14, 3, 25 + vpermxor 31, 31, 16, 25 + vpermxor 28, 28, 17, 25 + vpermxor 29, 29, 18, 25 + vpermxor 30, 30, 19, 25 + + xxlor 32+25, 0, 0 + vadduwm 10, 10, 15 + vadduwm 11, 11, 12 + vadduwm 8, 8, 13 + vadduwm 9, 9, 14 + vadduwm 26, 26, 31 + vadduwm 27, 27, 28 + vadduwm 24, 24, 29 + vadduwm 25, 25, 30 + vxor 5, 5, 10 + vxor 6, 6, 11 + vxor 7, 7, 8 + vxor 4, 4, 9 + vxor 21, 21, 26 + vxor 22, 22, 27 + vxor 23, 23, 24 + vxor 20, 20, 25 + + xxlor 0, 32+25, 32+25 + xxlor 32+25, 21, 21 + vrlw 5, 5, 25 + vrlw 6, 6, 25 + vrlw 7, 7, 25 + vrlw 4, 4, 25 + vrlw 21, 21, 25 + vrlw 22, 22, 25 + vrlw 23, 23, 25 + vrlw 20, 20, 25 + xxlor 32+25, 0, 0 + + vadduwm 0, 0, 5 + vadduwm 1, 1, 6 + vadduwm 2, 2, 7 + vadduwm 3, 3, 4 + vadduwm 16, 16, 21 + vadduwm 17, 17, 22 + vadduwm 18, 18, 23 + vadduwm 19, 19, 20 + + xxlor 0, 32+25, 32+25 + xxlor 32+25, 22, 22 + vpermxor 15, 15, 0, 25 + vpermxor 12, 12, 1, 25 + vpermxor 13, 13, 2, 25 + vpermxor 14, 14, 3, 25 + vpermxor 31, 31, 16, 25 + vpermxor 28, 28, 17, 25 + vp |
