/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* ARIA Cipher 64-way parallel algorithm (AVX512)
*
* Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
*
*/
#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/asm-offsets.h>
#include <linux/cfi_types.h>
/* register macros */
#define CTX %rdi
#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
( (((a0) & 1) << 0) | \
(((a1) & 1) << 1) | \
(((a2) & 1) << 2) | \
(((a3) & 1) << 3) | \
(((a4) & 1) << 4) | \
(((a5) & 1) << 5) | \
(((a6) & 1) << 6) | \
(((a7) & 1) << 7) )
#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
( ((l7) << (0 * 8)) | \
((l6) << (1 * 8)) | \
((l5) << (2 * 8)) | \
((l4) << (3 * 8)) | \
((l3) << (4 * 8)) | \
((l2) << (5 * 8)) | \
((l1) << (6 * 8)) | \
((l0) << (7 * 8)) )
#define add_le128(out, in, lo_counter, hi_counter1) \
vpaddq lo_counter, in, out; \
vpcmpuq $1, lo_counter, out, %k1; \
kaddb %k1, %k1, %k1; \
vpaddq hi_counter1, out, out{%k1};
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
vpandq x, mask4bit, tmp0; \
vpandqn x, mask4bit, x; \
vpsrld $4, x, x; \
\
vpshufb tmp0, lo_t, tmp0; \
vpshufb x, hi_t, x; \
vpxorq tmp0, x, x;
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
vpunpckhdq x1, x0, t2; \
vpunpckldq x1, x0, x0; \
\
vpunpckldq x3, x2, t1; \
vpunpckhdq x3, x2, x2; \
\
vpunpckhqdq t1, x0, x1; \
vpunpcklqdq t1, x0, x0; \
\
vpunpckhqdq x2, t2, x3; \
vpunpcklqdq x2, t2, x2;
#define byteslice_16x16b(a0, b0, c0, d0, \
a1, b1, c1, d1, \
a2, b2, c2, d2, \
a3, b3, c3, d3, \
st0, st1) \
vmovdqu64 d2, st0; \
vmovdqu64 d3, st1; \
transpose_4x4(a0, a1, a2, a3, d2, d3); \
transpose_4x4(b0, b1, b2, b3, d2, d3); \
vmovdqu64 st0, d2; \
vmovdqu64 st1, d3; \
\
vmovdqu64 a0, st0; \
vmovdqu64 a1, st1; \
transpose_4x4(c0, c1, c2, c3, a0, a1); \
transpose_4x4(d0, d1, d2, d3, a0, a1); \
\
vbroadcasti64x2 .Lshufb_16x16b(%rip), a0; \
vmovdqu64 st1, a1; \
vpshufb a0, a2, a2; \
vpshufb a0, a3, a3; \
vpshufb a0, b0, b0; \
vpshufb a0, b1, b1; \
vpshufb a0, b2, b2; \
vpshufb a0, b3, b3; \
vpshufb a0, a1, a1; \
vpshufb a0, c0, c0; \
vpshufb a0, c1, c1; \
vpshufb a0, c2, c2; \
vpshufb a0, c3, c3; \
vpshufb a0, d0, d0; \
vpshufb a0, d1, d1; \
vpshufb a0, d2, d2; \
vpshufb a0, d3, d3; \
vmovdqu64 d3, st1; \
vmovdqu64 st0, d3; \
vpshufb a0, d3, a0; \
vmovdqu64 d2, st0; \
\
transpose_4x4(a0, b0, c0, d0, d2, d3); \
transpose_4x4(a1, b1, c1, d1, d2, d3); \
vmovdqu64 st0, d2; \
vmovdqu64 st1, d3; \
\
vmovdqu64 b0, st0; \
vmovdqu64 b1, st1; \
transpose_4x4(a2, b2, c2, d2, b0, b1); \
transpose_4x4(a3, b3, c3, d3, b0, b1); \
vmovdqu64 st0, b0; \
vmovdqu64 st1, b1; \
/* does not adjust output bytes inside vectors */
#define debyteslice_16x16b(a0, b0, c0, d0, \
a1, b1, c1, d1, \
a2, b2, c2, d2, \
a3, b3, c3, d3, \
st0, st1) \
vmovdqu64 d2, st0; \
vmovdqu64 d3, st1; \
transpose_4x4(a0, a1, a2, a3, d2, d3); \
transpose_4x4(b0, b1, b2, b3, d2, d3); \
vmovdqu64 st0, d2; \
vmovdqu64 st1, d3; \
\
vmovdqu64 a0, st0; \
vmovdqu64 a1, st1; \
transpose_4x4(c0, c1, c2, c3, a0, a1); \
transpose_4x4(d0, d1, d2, d3, a0, a1); \
\
vbroadcasti64x2 .Lshufb_16x16b(%rip), a0; \
vmovdqu64 st1, a1; \
vpshufb a0, a2, a2; \
vpshufb a0, a3, a3; \
vpshufb