/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated poly1305 implementation for ppc64le.
#
# Copyright 2023- IBM Corp. All rights reserved
#
#===================================================================================
# Written by Danny Tsen <dtsen@us.ibm.com>
#
# Poly1305 - this version mainly uses vector/VSX/scalar instructions
# - 26-bit limbs
# - handles multiple 64-byte blocks
#
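# In the 26-bit limb representation, the 130-bit accumulator h is carried
# as five limbs (limb names here are descriptive only):
#
#   h = h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104,  each h_i nominally 26 bits
#
# and one 64-byte chunk of input is four 16-byte Poly1305 blocks handled
# together.
#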
# Block size 16 bytes
# key = (r, s)
# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF
# p = 2^130 - 5
# a += m
# a = (a * r) % p
# a += s
#
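# As an illustrative reference only (C-like pseudocode; the names acc, r,
# s and m_i are not the labels used below), each 16-byte block is absorbed
# as:
#
#   acc += m_i | (1 << 128)    /* message block plus the padding bit */
#   acc  = (acc * r) % p       /* p = 2^130 - 5 */
#
# and after the last block the tag is (acc + s) mod 2^128.
#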
# Improve performance by breaking the polynomial down into a sum of products:
# h4 = m1 * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
#
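# Unrolling four iterations of the per-block update gives (all mod p):
#
#   h4 = (((m1 * r + m2) * r + m3) * r + m4) * r
#      =  m1 * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
#
# so four blocks are folded into the accumulator with a single round of
# multiplications by the precomputed powers r^4, r^3, r^2 and r.
#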
# 07/22/21 - this revision is based on the above sum of products.  Set up r^4, r^3, r^2, r and s3, s2, s1, s0
# in 9 vectors for the multiplications.
#
# setup r^4, r^3, r^2, r vectors
# vs [r^1, r^3, r^2, r^4]
# vs0 = [r0,.....]
# vs1 = [r1,.....]
# vs2 = [r2,.....]
# vs3 = [r3,.....]
# vs4 = [r4,.....]
# vs5 = [r1*5,...]
# vs6 = [r2*5,...]
# vs7 = [r3*5,...]
# vs8 = [r4*5,...]
#
# Each word in a vector holds one "r/s" member of the [a * r/s] products.
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0, r4*5, r3*5, r2*5;
# r2, r1, r0, r4*5, r3*5;
# r3, r2, r1, r0, r4*5;
# r4, r3, r2, r1, r0 ;
#
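# The rows above are the radix-2^26 schoolbook product of the accumulator
# limbs with r0..r4, with the modular reduction folded in: partial products
# landing at 2^130 and above wrap around multiplied by 5, because
# 2^130 == 5 (mod p).  A rough scalar sketch of one such multiply
# (h0..h4 and d0..d4 are illustrative names only):
#
#   d0 = h0*r0 + h1*r4*5 + h2*r3*5 + h3*r2*5 + h4*r1*5
#   d1 = h0*r1 + h1*r0   + h2*r4*5 + h3*r3*5 + h4*r2*5
#   d2 = h0*r2 + h1*r1   + h2*r0   + h3*r4*5 + h4*r3*5
#   d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*r4*5
#   d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
#
# followed by carry propagation to bring d0..d4 back to 26-bit limbs.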
#
# poly1305_p10le_4blocks(uint8_t *k, uint32_t mlen, uint8_t *m)
# k = 32 bytes key
# r3 = k (r, s)
# r4 = mlen
# r5 = m
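#
# Seen from C, a declaration matching the register usage above would look
# roughly like the following (illustrative only; the prototype used by the
# calling glue code may differ):
#
#   void poly1305_p10le_4blocks(uint8_t *k, uint32_t mlen, uint8_t *m);
#
# with the arguments arriving per the ELFv2 ABI in r3, r4 and r5.
#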
#
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/asm-compat.h>
#include <linux/linkage.h>
.machine "any"
.text
.macro SAVE_GPR GPR OFFSET FRAME
std \GPR,\OFFSET(\FRAME)
.endm
.macro SAVE_VRS VRS OFFSET FRAME
li 16, \OFFSET
stvx \VRS, 16, \FRAME
.endm
.macro SAVE_VSX VSX OFFSET FRAME
li 16, \OFFSET
stxvx \VSX, 16, \FRAME
.endm
.macro RESTORE_GPR GPR OFFSET FRAME
ld \GPR,\OFFSET(\FRAME)
.endm
.macro RESTORE_VRS VRS OFFSET FRAME
li 16, \OFFSET
lvx \VRS, 16, \FRAME
.endm
.macro RESTORE_VSX VSX OFFSET FRAME
li 16, \OFFSET
lxvx \VSX, 16, \FRAME
.endm
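# SAVE_REGS allocates a 752-byte stack frame, saves the link register in
# its standard slot and spills the non-volatile GPRs r14-r31, VRs v20-v31
# and VSX registers vs14-vs31 used below (r9 and r16 serve as scratch for
# the vector stores); RESTORE_REGS reverses the process.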
.macro SAVE_REGS
mflr 0
std 0, 16(1)
stdu 1,-752(1)
SAVE_GPR 14, 112, 1
SAVE_GPR 15, 120, 1
SAVE_GPR 16, 128, 1
SAVE_GPR 17, 136, 1
SAVE_GPR 18, 144, 1
SAVE_GPR 19, 152, 1
SAVE_GPR 20, 160, 1
SAVE_GPR 21, 168, 1
SAVE_GPR 22, 176, 1
SAVE_GPR 23, 184, 1
SAVE_GPR 24, 192, 1
SAVE_GPR 25, 200, 1
SAVE_GPR 26, 208, 1
SAVE_GPR 27, 216, 1
SAVE_GPR 28, 224, 1
SAVE_GPR 29, 232, 1
SAVE_GPR 30, 240, 1
SAVE_GPR 31, 248, 1
addi 9, 1, 256
SAVE_VRS 20, 0, 9
SAVE_VRS 21, 16, 9
SAVE_VRS 22, 32, 9
SAVE_VRS 23, 48, 9
SAVE_VRS 24, 64, 9
SAVE_VRS 25, 80, 9
SAVE_VRS 26, 96, 9
SAVE_VRS 27, 112, 9
SAVE_VRS 28, 128, 9
SAVE_VRS 29, 144, 9
SAVE_VRS 30, 160, 9
SAVE_VRS 31, 176, 9
SAVE_VSX 14, 192, 9
SAVE_VSX 15, 208, 9
SAVE_VSX 16, 224, 9
SAVE_VSX 17, 240, 9
SAVE_VSX 18, 256, 9
SAVE_VSX 19, 272, 9
SAVE_VSX 20, 288, 9
SAVE_VSX 21, 304, 9
SAVE_VSX 22, 320, 9
SAVE_VSX 23, 336, 9
SAVE_VSX 24, 352, 9
SAVE_VSX 25, 368, 9
SAVE_VSX 26, 384, 9
SAVE_VSX 27, 400, 9
SAVE_VSX 28, 416, 9
SAVE_VSX 29, 432, 9
SAVE_VSX 30, 448, 9
SAVE_VSX 31, 464, 9
.endm # SAVE_REGS
.macro RESTORE_REGS
addi 9, 1, 256
RESTORE_VRS 20, 0, 9
RESTORE_VRS 21, 16, 9
RESTORE_VRS 22, 32, 9
RESTORE_VRS 23, 48, 9
RESTORE_VRS 24, 64, 9
RESTORE_VRS 25, 80, 9
RESTORE_VRS 26, 96, 9
RESTORE_VRS 27, 112, 9
RESTORE_VRS 28, 128, 9
RESTORE_VRS 29, 144, 9
RESTORE_VRS 30, 160, 9
RESTORE_VRS 31, 176, 9
RESTORE_VSX 14, 192, 9
RESTORE_VSX 15, 208, 9
RESTORE_VSX 16, 224, 9
RESTORE_VSX 17, 240, 9
RESTORE_VSX 18, 256, 9
RESTORE_VSX 19, 272, 9
RESTORE_VSX 20, 288, 9
RESTORE_VSX 21, 304, 9
RESTORE_VSX 22, 320, 9
RESTORE_VSX 23, 336, 9
RESTORE_VSX 24, 352, 9
RESTORE_VSX 25, 368, 9
RESTORE_VSX 26, 384, 9
RESTORE_VSX 27, 400, 9
RESTORE_VSX 28, 416, 9
RESTORE_VSX 29, 432, 9
RESTORE_VSX 30, 448, 9
RESTORE_VSX 31, 464, 9
RESTORE_GPR 14, 112, 1