/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* x86_64/AVX2/AES-NI assembler implementation of Camellia
*
* Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*/
#include <linux/linkage.h>
#include <asm/frame.h>
/* Size in bytes of the key table; also the offset of the field that follows it. */
#define CAMELLIA_TABLE_BYTE_LEN 272
/*
 * struct camellia_ctx:
 * key_table sits at offset 0 and key_length directly after the
 * CAMELLIA_TABLE_BYTE_LEN-byte table.
 * NOTE(review): presumably applied as byte displacements off CTX; the uses
 * are outside this part of the file - confirm against the C struct layout.
 */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN
/*
 * register macros
 * CTX aliases %rdi and RIO aliases %r8.
 * NOTE(review): %rdi is the first integer argument register in the SysV
 * AMD64 ABI, so CTX is presumably the context pointer passed by the caller;
 * the role of RIO is not visible here - confirm at the use sites.
 */
#define CTX %rdi
#define RIO %r8
/**********************************************************************
helper macros
**********************************************************************/
/*
 * filter_8bit: transform every byte of a vector with two 4-bit table
 * lookups that are XORed together.
 *
 * IN:
 *  v:        vector register holding the input bytes
 *  tbl_lo:   vpshufb table indexed by the low nibble of each byte
 *  tbl_hi:   vpshufb table indexed by the high nibble of each byte
 *  nib_mask: vector selecting the low nibble of each byte (roundsm32 passes
 *            a broadcast of .L0f0f0f0f)
 *  scratch:  temporary vector register, clobbered
 * OUT:
 *  v:        tbl_lo[low nibble] ^ tbl_hi[high nibble], per byte
 *
 * The expansion ends with ';', so it can be followed by another statement
 * on the same logical line.
 */
#define filter_8bit(v, tbl_lo, tbl_hi, nib_mask, scratch) \
	/* scratch = low nibbles, v = high nibbles (still in the upper 4 bits) */ \
	vpand v, nib_mask, scratch; \
	vpandn v, nib_mask, v; \
	/* dword shift acts per byte here: the low nibbles were just cleared */ \
	vpsrld $4, v, v; \
	/* look both halves up and combine them */ \
	vpshufb scratch, tbl_lo, scratch; \
	vpshufb v, tbl_hi, v; \
	vpxor scratch, v, v;
/*
 * Map each ymmN_x token to xmmN, the 128-bit register with the same number.
 * roundsm32 pastes "_x" onto a register macro argument (e.g. t0##_x,
 * x0##_x) to get the xmm operand needed by vextracti128, vinserti128 and
 * vaesenclast.
 */
#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15
/**********************************************************************
32-way camellia
**********************************************************************/
/*
* IN:
* x0..x7: byte-sliced AB state
* mem_cd: register pointer storing CD state
* key: index for key material
* OUT:
* x0..x7: new byte-sliced CD state
*/
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
t7, mem_cd, key) \
/* \
* S-function with AES subbytes \
*/ \
vbroadcasti128 .Linv_shift_row(%rip), t4; \
vpbroadcastd .L0f0f0f0f(%rip), t7; \
vbroadcasti128 .Lpre_tf_lo_s1(%rip), t5; \
vbroadcasti128 .Lpre_tf_hi_s1(%rip), t6; \
vbroadcasti128 .Lpre_tf_lo_s4(%rip), t2; \
vbroadcasti128 .Lpre_tf_hi_s4(%rip), t3; \
\
/* AES inverse shift rows */ \
vpshufb t4, x0, x0; \
vpshufb t4, x7, x7; \
vpshufb t4, x3, x3; \
vpshufb t4, x6, x6; \
vpshufb t4, x2, x2; \
vpshufb t4, x5, x5; \
vpshufb t4, x1, x1; \
vpshufb t4, x4, x4; \
\
/* prefilter sboxes 1, 2 and 3 */ \
/* prefilter sbox 4 */ \
filter_8bit(x0, t5, t6, t7, t4); \
filter_8bit(x7, t5, t6, t7, t4); \
vextracti128 $1, x0, t0##_x; \
vextracti128 $1, x7, t1##_x; \
filter_8bit(x3, t2, t3, t7, t4); \
filter_8bit(x6, t2, t3, t7, t4); \
vextracti128 $1, x3, t3##_x; \
vextracti128 $1, x6, t2##_x; \
filter_8bit(x2, t5, t6, t7, t4); \
filter_8bit(x5, t5, t6, t7, t4); \
filter_8bit(x1, t5, t6, t7, t4); \
filter_8bit(x4, t5, t6, t7, t4); \
\
vpxor t4##_x, t4##_x, t4##_x; \
\
/* AES subbytes + AES shift rows */ \
vextracti128 $1, x2, t6##_x; \
vextracti128 $1, x5, t5##_x; \
vaesenclast t4##_x, x0##_x, x0##_x; \
vaesenclast t4##_x, t0##_x, t0##_x; \
vinserti128 $1, t0##_x, x0, x0; \
vaesenclast t4##_x, x7##_x, x7##_x; \
vaesenclast t4##_x, t1##_x, t1##_x; \
vinserti128 $1, t1##_x, x7, x7; \
vaesenclast t4##_x, x3##_x, x3##_x; \
vaesenclast t4##_x, t3##_x, t3##_x; \
vinserti128 $1, t3##_x, x3, x3; \
vaesenclast t4##_x, x6##_x, x6##_x; \
vaesenclast t4##_x, t2##_x, t2##_x; \
vinserti128 $1, t2##_x, x6, x6; \
vextracti128 $1, x1, t3##_x; \
vextracti128 $1, x4, t2##_x; \
vbroadcasti128 .Lpost_tf_lo_s1(%rip), t0; \
vbroadcasti128 .Lpost_tf_hi_s1(%rip), t1; \
vaesenclast t4##_x, x2##_x, x2##_x; \
vaesenclast t4##_x, t6##_x, t6##_x; \
vinserti128 $1, t6##_x, x2, x2; \
vaesenclast t4##_x, x5##_x, x5##_x; \
vaesenclast t4##_x, t5##_x, t5##_x; \
vinserti128 $1, t5##_x, x5, x5; \
vaesenclast t4##_x, x1##_x, x1##_x; \
vaesenclast t4##_x, t3##_x, t3##_x; \
vinserti128 $1, t3##_x, x1, x1; \
vaesenclast t4##_x, x4##_x, x4##_x; \
vaesenclast t4##_x, t2##_x, t2##_x; \
vinserti128 $1, t2##_x, x4, x4; \
\
/* postfilter sboxes 1 and 4 */ \
vbroadcasti128 .Lpost_tf_lo_s3(%rip), t2; \
vbroadcasti128 .Lpost_tf_hi_s3(%rip), t3; \
filter_8bit(x0, t0, t1, t7, t6); \
filter_8bit(x7, t0, t1, t7, t6); \
filter_8bit(x3, t0, t1, t7, t6); \
filter_8bit(x6, t0, t1, t7, t6); \
\
/* postfilter sbox 3 */ \
vbroadcasti128 .Lpost_tf_lo_s2(%rip), t4; \
vbroadcasti128 .Lpost_tf_hi_s2(%rip), t5; \
filter_8bit(x2, t2, t3, t7, t6); \
filter_8bit(x5, t2, t3, t7, t6); \
\
vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
\
/* postfilter sbox 2 */ \
filter_8bit(x1, t4, t5, t7, t2); \
filter_8bit(x4, t4, t5, t7, t2); \
vpxor t7, t7, t7; \
\
vpsrldq $1, t0, t1; \
vpsrldq $2, t0, t2; \
vpshufb t7, t1, t1; \
vpsrldq $3, t0, t3; \
\
/* P-function */ \
vpxor x5, x0, x0; \
vpxor x6, x1, x1; \
vpxor x7, x2, x2; \
vpxor x4, x3