author    Shahab Vahedi <shahab@synopsys.com>    2024-04-30 16:56:04 +0200
committer Alexei Starovoitov <ast@kernel.org>    2024-05-12 16:51:36 -0700
commit    f122668ddcce450c2585f0be4bf4478d6fd6176b
tree      728a4617241183e817241430b59315e81c41afc5
parent    fcd1ed89a0439c45e1336bd9649485c44b7597c7
ARC: Add eBPF JIT support
This adds eBPF JIT support for the 32-bit ARCv2 processors. The
implementation is qualified by running the BPF tests on a Synopsys HSDK
board with "ARC HS38 v2.1c at 500 MHz" as the 4-core CPU.

The test_bpf.ko reports 2-10 fold improvements in execution time of its
tests. For instance:

test_bpf: #33 tcpdump port 22 jited:0 704 1766 2104 PASS
test_bpf: #33 tcpdump port 22 jited:1 120 224 260 PASS

test_bpf: #141 ALU_DIV_X: 4294967295 / 4294967295 = 1 jited:0 238 PASS
test_bpf: #141 ALU_DIV_X: 4294967295 / 4294967295 = 1 jited:1 23 PASS

test_bpf: #776 JMP32_JGE_K: all ... magnitudes jited:0 2034681 PASS
test_bpf: #776 JMP32_JGE_K: all ... magnitudes jited:1 1020022 PASS

Deployment and structure
------------------------
The related code is added under "arch/arc/net":

- bpf_jit.h       -- The interface that a back-end translator must provide
- bpf_jit_core.c  -- Knows how to handle the input eBPF byte stream
- bpf_jit_arcv2.c -- The back-end code that knows the translation logic

The bpf_int_jit_compile() at the end of bpf_jit_core.c is the entrance
to the whole process. Normally, the translation is done in one pass,
namely the "normal pass". In case some relocations are not known during
this pass, some data (arc_jit_data) is allocated for the next pass to
come. This possible next (and last) pass is called the "extra pass".

1. Normal pass            # The necessary pass
 1a. Dry run              # Get the whole JIT length, epilogue offset, etc.
 1b. Emit phase           # Allocate memory and start emitting instructions
2. Extra pass             # Only needed if there are relocations to be fixed
 2a. Patch relocations

Support status
--------------
The JIT compiler supports BPF instructions up to "cpu=v4". However, it
does not yet provide support for:

- Tail calls
- Atomic operations
- 64-bit division/remainder
- BPF_PROBE_MEM* (exception table)

The result of the "test_bpf" test suite on an HSDK board is:

hsdk-lnx# insmod test_bpf.ko test_suite=test_bpf

test_bpf: Summary: 863 PASSED, 186 FAILED, [851/851 JIT'ed]

All the failing test cases are due to the ones that were not JIT'ed.
Categorically, they can be represented as:

  .-----------.------------.-------------.
  | test type |  opcodes   | # of cases  |
  |-----------+------------+-------------|
  | atomic    | 0xC3, 0xDB |     149     |
  | div64     | 0x37, 0x3F |      22     |
  | mod64     | 0x97, 0x9F |      15     |
  `-----------^------------+-------------|
              |    (total) |     186     |
              `------------^-------------'

Setup: build config
-------------------
The following configs must be set to have a working JIT test:

  CONFIG_BPF_JIT=y
  CONFIG_BPF_JIT_ALWAYS_ON=y
  CONFIG_TEST_BPF=m

The following options are not necessary for the tests module, but are
good to have:

  CONFIG_DEBUG_INFO=y       # prerequisite for below
  CONFIG_DEBUG_INFO_BTF=y   # so bpftool can generate vmlinux.h

  CONFIG_FTRACE=y           #
  CONFIG_BPF_SYSCALL=y      # all these options lead to
  CONFIG_KPROBE_EVENTS=y    # having CONFIG_BPF_EVENTS=y
  CONFIG_PERF_EVENTS=y      #

Some BPF programs provide data through /sys/kernel/debug:

  CONFIG_DEBUG_FS=y
  arc# mount -t debugfs debugfs /sys/kernel/debug

Setup: elfutils
---------------
The libdw.{so,a} library that is used by pahole for processing the
final binary must come from elfutils 0.189 or newer. The support for
ARCv2 [1] has been added since that version.

[1] https://sourceware.org/git/?p=elfutils.git;a=commit;h=de3d46b3e7

Setup: pahole
-------------
The line below in linux/scripts/Makefile.btf must be commented out:

  pahole-flags-$(call test-ge, $(pahole-ver), 121) += --btf_gen_floats

Or else, the build will fail:

$ make V=1
...
  BTF     .btf.vmlinux.bin.o
pahole -J --btf_gen_floats \
       -j --lang_exclude=rust \
       --skip_encoding_btf_inconsistent_proto \
       --btf_gen_optimized .tmp_vmlinux.btf
Complex, interval and imaginary float types are not supported
Encountered error while encoding BTF.
...
  BTFIDS  vmlinux
./tools/bpf/resolve_btfids/resolve_btfids vmlinux
libbpf: failed to find '.BTF' ELF section in vmlinux
FAILED: load BTF from vmlinux: No data available

This is due to the fact that the ARC toolchains generate "complex float"
DIE entries in libgcc and at the moment, pahole can't handle such
entries.

Running the tests
-----------------
host$ scp /bld/linux/lib/test_bpf.ko arc:
arc # sysctl net.core.bpf_jit_enable=1
arc # insmod test_bpf.ko test_suite=test_bpf
...
test_bpf: #1048 Staggered jumps: JMP32_JSLE_X jited:1 697811 PASS
test_bpf: Summary: 863 PASSED, 186 FAILED, [851/851 JIT'ed]

Acknowledgments
---------------
- Claudiu Zissulescu for his unwavering support
- Yuriy Kolerov for testing and troubleshooting
- Vladimir Isaev for the pahole workaround
- Sergey Matyukevich for paving the road by adding the interpreter support

Signed-off-by: Shahab Vahedi <shahab@synopsys.com>
Link: https://lore.kernel.org/r/20240430145604.38592-1-list+bpf@vahedi.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
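As a rough sketch of the two-pass flow described under "Deployment and
structure" (not the actual code), the entry point could look like the
snippet below. The jit_context layout and the ctx_is_extra_pass(),
jit_prepare(), jit_compile() and jit_patch_relocations() names are
assumptions made for this illustration; the real implementation lives in
arch/arc/net/bpf_jit_core.c of this patch.

  /* Illustrative sketch only -- names and layout are assumptions. */
  #include <linux/bpf.h>
  #include <linux/filter.h>
  #include <linux/types.h>

  struct jit_context {
          struct bpf_prog *prog;
          u8  *buf;               /* NULL during the dry run (1a)          */
          u32  len;               /* total JIT length found by the dry run */
          bool need_extra_pass;   /* some relocations are still unknown    */
  };

  bool ctx_is_extra_pass(struct jit_context *ctx);         /* hypothetical */
  int  jit_prepare(struct jit_context *ctx);               /* hypothetical */
  int  jit_compile(struct jit_context *ctx);               /* hypothetical */
  int  jit_patch_relocations(struct jit_context *ctx);     /* hypothetical */

  struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
  {
          struct jit_context ctx = { .prog = prog };

          /* 2./2a. Extra pass: a later invocation only patches relocations. */
          if (ctx_is_extra_pass(&ctx)) {
                  jit_patch_relocations(&ctx);
                  return prog;
          }

          /* 1a. Dry run: compute the whole JIT length, epilogue offset, etc. */
          if (jit_prepare(&ctx))
                  return prog;    /* JIT failed; return the original program */

          /* 1b. Emit phase: allocate ctx.len bytes and emit the instructions. */
          if (jit_compile(&ctx))
                  return prog;

          /* If need_extra_pass is set, arc_jit_data is kept for pass 2. */
          return prog;
  }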
-rw-r--r--   Documentation/admin-guide/sysctl/net.rst        1
-rw-r--r--   Documentation/networking/filter.rst             4
-rw-r--r--   MAINTAINERS                                      6
-rw-r--r--   arch/arc/Kbuild                                  1
-rw-r--r--   arch/arc/Kconfig                                 1
-rw-r--r--   arch/arc/net/Makefile                            6
-rw-r--r--   arch/arc/net/bpf_jit.h                         164
-rw-r--r--   arch/arc/net/bpf_jit_arcv2.c                  3005
-rw-r--r--   arch/arc/net/bpf_jit_core.c                   1425
9 files changed, 4611 insertions, 2 deletions
diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst
index 7250c0542828..7b0c4291c686 100644
--- a/Documentation/admin-guide/sysctl/net.rst
+++ b/Documentation/admin-guide/sysctl/net.rst
@@ -72,6 +72,7 @@ two flavors of JITs, the newer eBPF JIT currently supported on:
- riscv64
- riscv32
- loongarch64
+ - arc
And the older cBPF JIT supported on the following archs:
diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst
index 7d8c5380492f..8eb9a5d40f31 100644
--- a/Documentation/networking/filter.rst
+++ b/Documentation/networking/filter.rst
@@ -513,7 +513,7 @@ JIT compiler
------------
The Linux kernel has a built-in BPF JIT compiler for x86_64, SPARC,
-PowerPC, ARM, ARM64, MIPS, RISC-V and s390 and can be enabled through
+PowerPC, ARM, ARM64, MIPS, RISC-V, s390, and ARC and can be enabled through
CONFIG_BPF_JIT. The JIT compiler is transparently invoked for each
attached filter from user space or for internal kernel users if it has
been previously enabled by root::
@@ -650,7 +650,7 @@ before a conversion to the new layout is being done behind the scenes!
Currently, the classic BPF format is being used for JITing on most
32-bit architectures, whereas x86-64, aarch64, s390x, powerpc64,
-sparc64, arm32, riscv64, riscv32, loongarch64 perform JIT compilation
+sparc64, arm32, riscv64, riscv32, loongarch64, arc perform JIT compilation
from eBPF instruction set.
Testing
diff --git a/MAINTAINERS b/MAINTAINERS
index 943921d642ad..b6a946d24f00 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3712,6 +3712,12 @@ S: Maintained
F: Documentation/devicetree/bindings/iio/imu/bosch,bmi323.yaml
F: drivers/iio/imu/bmi323/
+BPF JIT for ARC
+M: Shahab Vahedi <shahab@synopsys.com>
+L: bpf@vger.kernel.org
+S: Maintained
+F: arch/arc/net/
+
BPF JIT for ARM
M: Russell King <linux@armlinux.org.uk>
M: Puranjay Mohan <puranjay12@gmail.com>
diff --git a/arch/arc/Kbuild b/arch/arc/Kbuild
index b94102fff68b..20ea7dd482d4 100644
--- a/arch/arc/Kbuild
+++ b/arch/arc/Kbuild
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
obj-y += kernel/
obj-y += mm/
+obj-y += net/
# for cleaning
subdir- += boot
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 99d2845f3feb..551fc65a9898 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -52,6 +52,7 @@ config ARC
select PCI_SYSCALL if PCI
select HAVE_ARCH_JUMP_LABEL if ISA_ARCV2 && !CPU_ENDIAN_BE32
select TRACE_IRQFLAGS_SUPPORT
+ select HAVE_EBPF_JIT if ISA_ARCV2
config LOCKDEP_SUPPORT
def_bool y
diff --git a/arch/arc/net/Makefile b/arch/arc/net/Makefile
new file mode 100644
index 000000000000..ea5790952e9a
--- /dev/null
+++ b/arch/arc/net/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+ifeq ($(CONFIG_ISA_ARCV2),y)
+ obj-$(CONFIG_BPF_JIT) += bpf_jit_core.o
+ obj-$(CONFIG_BPF_JIT) += bpf_jit_arcv2.o
+endif
diff --git a/arch/arc/net/bpf_jit.h b/arch/arc/net/bpf_jit.h
new file mode 100644
index 000000000000..ec44873c42d1
--- /dev/null
+++ b/arch/arc/net/bpf_jit.h
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * The interface that a back-end should provide to bpf_jit_core.c.
+ *
+ * Copyright (c) 2024 Synopsys Inc.
+ * Author: Shahab Vahedi <shahab@synopsys.com>
+ */
+
+#ifndef _ARC_BPF_JIT_H
+#define _ARC_BPF_JIT_H
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+/* Print debug info and assert. */
+//#define ARC_BPF_JIT_DEBUG
+
+/* Determine the address type of the target. */
+#ifdef CONFIG_ISA_ARCV2
+#define ARC_ADDR u32
+#endif
+
+/*
+ * For the translation of some BPF instructions, a temporary register
+ * might be needed for some interim data.
+ */
+#define JIT_REG_TMP MAX_BPF_JIT_REG
+
+/*
+ * Buffer access: If buffer "b" is not NULL, advance by "n" bytes.
+ *
+ * This macro must be used in any place that potentially requires a
+ * "buf + len". This way, we make sure that the "buf" argument for
+ * the underlying "arc_*(buf, ...)" ends up as NULL instead of something
+ * like "0+4" or "0+8", etc. Those "arc_*()" functions check their "buf"
+ * value to decide if instructions should be emitted or not.
+ */
+#define BUF(b, n) (((b) != NULL) ? ((b) + (n)) : (b))
+
+/************** Functions that the back-end must provide **************/
+/* Extension for 32-bit operations. */
+inline u8 zext(u8 *buf, u8 rd);
+/***** Moves *****/
+u8 mov_r32(u8 *buf, u8 rd, u8 rs, u8 sign_ext);
+u8 mov_r32_i32(u8 *buf, u8 reg, s32 imm);
+u8 mov_r64(u8 *buf, u8 rd, u8 rs, u8 sign_ext);
+u8 mov_r64_i32(u8 *buf, u8 reg, s32 imm);
+u8 mov_r64_i64(u8 *buf, u8 reg, u32 lo, u32 hi);
+/***** Loads and stores *****/
+u8 load_r(u8 *buf, u8 rd, u8 rs, s16 off, u8 size, bool sign_ext);
+u8 store_r(u8 *buf, u8 rd, u8 rs, s16 off, u8 size);
+u8 store_i(u8 *buf, s32 imm, u8 rd, s16 off, u8 size);
+/***** Addition *****/
+u8 add_r32(u8 *buf, u8 rd, u8 rs);
+u8 add_r32_i32(u8 *buf, u8 rd, s32 imm);
+u8 add_r64(u8 *buf, u8 rd, u8 rs);
+u8 add_r64_i32(u8 *buf, u8 rd, s32 imm);
+/***** Subtraction *****/
+u8 sub_r32(u8 *buf, u8 rd, u8 rs);
+u8 sub_r32_i32(u8 *buf, u8 rd, s32 imm);
+u8 sub_r64(u8 *buf, u8 rd, u8 rs);
+u8 sub_r64_i32(u8 *buf, u8 rd, s32 imm);
+/***** Multiplication *****/
+u8 mul_r32(u8 *buf, u8 rd, u8 rs);
+u8 mul_r32_i32(u8 *buf, u8 rd, s32 imm);
+u8 mul_r64(u8 *buf, u8 rd, u8 rs);
+u8 mul_r64_i32(u8 *buf, u8 rd, s32 imm);
+/***** Division *****/
+u8 div_r32(u8 *buf, u8 rd, u8 rs, bool sign_ext);
+u8 div_r32_i32(u8 *buf, u8 rd, s32 imm, bool sign_ext);
+/***** Remainder *****/
+u8 mod_r32(u8 *buf, u8 rd, u8 rs, bool sign_ext);
+u8 mod_r32_i32(u8 *buf, u8 rd, s32 imm, bool sign_ext);
+/***** Bitwise AND *****/
+u8 and_r32(u8 *buf, u8 rd, u8 rs);
+u8 and_r32_i32(u8 *buf, u8 rd, s32 imm);
+u8 and_r64(u8 *buf, u8 rd, u8 rs);
+u8 and_r64_i32(u8 *buf, u8 rd, s32 imm);
+/***** Bitwise OR *****/
+u8 or_r32(u8 *buf, u8 rd, u8 rs);
+u8 or_r32_i32(u8 *buf, u8 rd, s32 imm);
+u8 or_r64(u8 *buf, u8 rd, u8 rs);
+u8 or_r64_i32(u8 *buf, u8 rd, s32 imm);
+/***** Bitwise XOR *****/
+u8 xor_r32(u8 *buf, u8 rd, u8 rs);
+u8 xor_r32_i32(u8 *buf, u8 rd, s32 imm);
+u8 xor_r64(u8 *buf, u8 rd, u8 rs);
+u8 xor_r64_i32(u8 *buf, u8 rd, s32 imm);
+/***** Bitwise Negate *****/
+u8 neg_r32(u8 *buf, u8 r);
+u8 neg_r64(u8 *buf, u8 r);
+/***** Bitwise left shift *****/
+u8 lsh_r32(u8 *buf, u8 rd, u8 rs);
+u8 lsh_r32_i32(u8 *buf, u8 rd, u8 imm);
+u8 lsh_r64(u8 *buf, u8 rd, u8 rs);
+u8 lsh_r64_i32(u8 *buf, u8 rd, s32 imm);
+/***** Bitwise right shift (logical) *****/
+u8 rsh_r32(u8 *buf, u8 rd, u8 rs);
+u8 rsh_r32_i32(u8 *buf, u8 rd, u8 imm);
+u8 rsh_r64(u8 *buf, u8 rd, u8 rs);
+u8 rsh_r64_i32(u8 *buf, u8 rd, s32 imm);
+/***** Bitwise right shift (arithmetic) *****/
+u8 arsh_r32(u8 *buf, u8 rd, u8 rs);
+u8 arsh_r32_i32(u8 *buf, u8 rd, u8 imm);
+u8 arsh_r64(u8 *buf, u8 rd, u8 rs);
+u8 arsh_r64_i32(u8 *buf, u8 rd, s32 imm);
+/***** Frame related *****/
+u32 mask_for_used_regs(u8 bpf_reg, bool is_call);
+u8 arc_prologue(u8 *buf, u32 usage, u16 frame_size);
+u8 arc_epilogue(u8 *buf, u32 usage, u16 frame_size);
+/***** Jumps *****/
+/*
+ * Different sorts of conditions (ARC enum as opposed to BPF_*).
+ *
+ * Do not change the order of enums here. ARC_CC_SLE+1 is used
+ * to determine the number of JCCs.
+ */
+enum ARC_CC {
+ ARC_CC_UGT = 0, /* unsigned > */
+ ARC_CC_UGE, /* unsigned >= */
+ ARC_CC_ULT, /* unsigned < */
+ ARC_CC_ULE, /* unsigned <= */
+ ARC_CC_SGT, /* signed > */
+ ARC_CC_SGE, /* signed >= */
+ ARC_CC_SLT, /* signed < */
+ ARC_CC_SLE, /* signed <= */
+ ARC_CC_AL, /* always */
+ ARC_CC_EQ, /* == */
+ ARC_CC_NE, /* != */
+ ARC_CC_SET, /* test */
+ ARC_CC_LAST
+};
+
+/*
+ * A few notes:
+ *
+ * - check_jmp_*() are prerequisites before calling the gen_jmp_*().
+ * They return "true" if the jump is possible and "false" otherwise.
+ *
+ * - The notion of "*_off" is to emphasize that these parameters are
+ * merely offsets in the JIT stream and not absolute addresses. One
+ * can look at them as addresses if the JIT code would start from
+ * address 0x0000_0000. Nonetheless, since the buffer address for the
+ * JIT is on a word-aligned address, this works and actually makes
+ * things simpler (offsets are in the range of u32 which is more than
+ * enough).
+ */
+bool check_jmp_32(u32 curr_off, u32 targ_off, u8 cond);
+bool check_jmp_64(u32 curr_off, u32 targ_off, u8 cond);
+u8 gen_jmp_32(u8 *buf, u8 rd, u8 rs, u8 cond, u32 c_off, u32 t_off);
+u8 gen_jmp_64(u8 *buf, u8 rd, u8 rs, u8 cond, u32 c_off, u32 t_off);
+/***** Miscellaneous *****/
+u8 gen_func_call(u8 *buf, ARC_ADDR func_addr, bool external_func);
+u8 arc_to_bpf_return(u8 *buf);
+/*
+ * - Perform byte swaps on "rd" based on the "size".
+ * - If "force" is set, do it unconditionally. Otherwise, consider the
+ * desired "endian"ness and the host endianness.
+ * - For data "size"s up to 32 bits, perform a zero-extension if asked
+ * by the "do_zext" boolean.
+ */
+u8 gen_swap(u8 *buf, u8 rd, u8 size, u8 endian, bool force, bool do_zext);
+
+#endif /* _ARC_BPF_JIT_H */
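To illustrate the dry-run convention behind BUF() and the u8-returning
emitters declared in bpf_jit.h above, here is a minimal, self-contained
sketch. The emit_u32_sketch() and emit_pair_sketch() helpers are
hypothetical names invented for this illustration (they are not part of
this patch); they only mimic the documented behaviour that a NULL "buf"
suppresses emission while the byte count is still returned, so the same
routine serves both the dry run and the emit phase.

  /* Hypothetical sketch -- not code from this patch. */
  #include <linux/types.h>

  /* Same helper as in bpf_jit.h, repeated to keep the sketch standalone. */
  #define BUF(b, n) (((b) != NULL) ? ((b) + (n)) : (b))

  /* Write one 32-bit instruction word, unless this is a dry run (buf == NULL). */
  static void emit_u32_sketch(u8 *buf, u32 insn)
  {
          if (buf)
                  *(u32 *)buf = insn;
  }

  /* Emit two instructions; the returned length is valid even on a dry run. */
  static u8 emit_pair_sketch(u8 *buf, u32 insn1, u32 insn2)
  {
          u8 len = 0;

          emit_u32_sketch(BUF(buf, len), insn1);
          len += 4;
          emit_u32_sketch(BUF(buf, len), insn2);
          len += 4;

          return len;
  }

Called once with buf == NULL, such a routine sizes the program for the
dry run (1a); called again with the allocated buffer, it emits the same
bytes during the emit phase (1b).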
diff --git a/arch/arc/net/bpf_jit_arcv2.c b/arch/arc/net/bpf_jit_arcv2.c
new file mode 100644
index 000000000000..31bfb6e9ce00
--- /dev/null
+++ b/arch/arc/net/bpf_jit_arcv2.c
@@ -0,0 +1,3005 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * The ARCv2 backend of Just-In-Time compiler for eBPF bytecode.
+ *
+ * Copyright (c) 2024 Synopsys Inc.
+ * Author: Shahab Vahedi <shahab@synopsys.com>
+ */
+#include <linux/bug.h>
+#include "bpf_jit.h"
+
+/* ARC core registers. */
+enum {
+ ARC_R_0, ARC_R_1, ARC_R_2, ARC_R_3, ARC_R_4, ARC_R_5,
+ ARC_R_6, ARC_R_7, ARC_R_8, ARC_R_9, ARC_R_10, ARC_R_11,
+ ARC_R_12, ARC_R_13, ARC_R_14, ARC_R_15, ARC_R_16, ARC_R_17,
+ ARC_R_18, ARC_R_19, ARC_R_20, ARC_R_21, ARC_R_22, ARC_R_23,
+ ARC_R_24, ARC_R_25, ARC_R_26, ARC_R_FP, ARC_R_SP, ARC_R_ILINK,
+ ARC_R_30, ARC_R_BLINK,
+ /*
+ * Having ARC_R_IMM encoded as source register means there is an
+ * immediate that must be interpreted from the next 4 bytes. If
+ * encoded as the destination register though, it implies that the
+ * output of the operation is not assigned to any register. The
+ * latter is helpful if we only care about updating the CPU status
+ * flags.
+ */
+ ARC_R_IMM = 62
+};
+
+/*
+ * Remarks about the rationale behind the chosen mapping:
+ *
+ * - BPF_REG_{1,2,3,4} are the argument registers and must be mapped to
+ * argument registers in ARCv2 ABI: r0-r7. The r7 register is the last
+ * argument register in the ABI. Therefore BPF_REG_5, as the fifth
+ * argument, must be pushed onto the stack. This is a must for calling
+ * in-kernel functions.
+ *
+ * - In ARCv2 ABI, the return value is in r0 for 32-bit results and (r1,r0)
+ * for 64-bit results. However, because they're already used for BPF_REG_1,
+ * the next available scratch registers, r8 and r9, are the best candidates
+ * for BPF_REG_0. After a "call" to an (in-kernel) function, the result
+ * is "mov"ed to these registers. At a BPF_EXIT, their value is "mov"ed to
+ * (r1,r0).
+ * It is worth mentioning that scratch registers are the best choice for
+ * BPF_REG_0, because it is very popular in BPF instruction encoding.
+ *
+ * - JIT_REG_TMP is an artifact needed to translate some BPF instructions.
+ * Its life span is one single BPF instruction. Since during the
+ * analyze_reg_usage(), it is not known if temporary registers are used,
+ * it is mapped to ARC's scratch registers: r10 and r11. Therefore, they
+ * don't matter in the analysis phase and don't need saving. This temporary
+ * register is added as yet another index in the bpf2arc array, so it will
+ * unfold like the rest of registers during the code generation process.
+ *
+ * - Mapping of callee-saved BPF registers, BPF_REG_{6,7,8,9}, starts from
+ * (r15,r14) register pair. The (r13,r12) is not a good choice, because
+ * in ARCv2 ABI, r12 is not a callee-saved register and this can cause
+ * problem when calling an in-kernel function. Theoretically, the mapping
+ * could start from (r14,r13), but it is not a conventional ARCv2 register
+ * pair. To have a future-proof design, I opted for this arrangement.
+ * If/when we decide to add ARCv2 instructions that do use register pairs,
+ * the mapping, hopefully, doesn't need to be revisited.
+ */
+const u8 bpf2arc[][2] = {
+ /* Return value from in-kernel function, and exit value from eBPF */
+ [BPF_REG_0] = {ARC_R_8, ARC_R_9},
+ /* Arguments from eBPF program to in-kernel function */
+ [BPF_REG_1] = {ARC_R_0, ARC_R_1},
+ [BPF_REG_2] = {ARC_R_2, ARC_R_3},
+ [BPF_REG_3] = {ARC_R_4, ARC_R_5},
+ [BPF_REG_4] = {ARC_R_6, ARC_R_7},
+ /* Remaining arguments, to be passed on the stack per 32-bit ABI */
+ [BPF_REG_5] = {ARC_R_22, ARC_R_23},
+ /* Callee-saved registers that in-kernel function will preserve */
+ [BPF_REG_6] = {ARC_R_14, ARC_R_15},
+ [BPF_REG_7] = {ARC_R_16, ARC_R_17},
+ [BPF_REG_8] = {ARC_R_18, ARC_R_19},
+ [BPF_REG_9] = {ARC_R_20, ARC_R_21},
+ /* Read-only frame pointer to access the eBPF stack. 32-bit only. */
+ [BPF_REG_FP] = {ARC_R_FP, },
+ /* Register for blinding constants */
+ [BPF_REG_AX] = {ARC_R_24, ARC_R_25},
+ /* Temporary registers for internal use */
+ [JIT_REG_TMP] = {ARC_R_10, ARC_R_11}
+};
+
+#define ARC_CALLEE_SAVED_REG_FIRST ARC_R_13
+#define ARC_CALLEE_SAVED_REG_LAST ARC_R_25
+
+#define REG_LO(r) (bpf2arc[(r)][0])
+#define REG_HI(r) (bpf2arc[(r)][1])
+
+/*
+ * To comply with ARCv2 ABI, BPF's arg5 must be put on stack. After which,
+ * the stack needs to be restored by ARG5_SIZE.
+ */
+#define ARG5_SIZE 8
+
+/* Instruction lengths in bytes. */
+enum {
+ INSN_len_normal = 4, /* Normal instructions length. */
+ INSN_len_imm = 4 /* Length of an extra 32-bit immediate. */
+};
+
+/* ZZ defines the size of the operation in the encodings where it is used. */
+enum {
+ ZZ_1_byte = 1,
+ ZZ_2_byte = 2,
+ ZZ_4_byte = 0,
+ ZZ_8_byte = 3
+};
+
+/*
+ * AA is mostly about address write back mode. It determines if the
+ * address in question should be updated before usage or after:
+ * addr += offset; data = *addr;
+ * data = *addr; addr += offset;
+ *
+ * In "scaling" mode, the effective address will become the sum
+ * of "address" + "index"*"size". The "size" is specified by the
+ * "ZZ" field. There is no write back when AA is set for scaling:
+ * data = *(addr + offset<<zz)
+ */
+enum {
+ AA_none = 0,
+ AA_pre = 1, /* in assembly known as "a/aw". */
+ AA_post = 2, /* in assembly known as "ab". */
+ AA_scale = 3 /* in assembly known as "as". */
+};
+
+/* X flag determines the mode of extension. */
+enum {
+ X_zero = 0,
+ X_sign = 1
+};
+
+/* Condition codes. */
+enum {
+ CC_always = 0, /* condition is true all the time */
+ CC_equal = 1, /* if status32.z flag is set */
+ CC_unequal = 2, /* if status32.z flag is clear */
+ CC_positive = 3, /* if status32.n flag is clear */
+ CC_negative = 4, /* if status32.n flag is set */
+ CC_less_u = 5, /* less than (unsigned) */
+ CC_less_eq_u = 14, /* less than or equal (unsigned) */
+ CC_great_eq_u = 6, /* greater than or equal (unsigned) */
+ CC_great_u = 13, /* greater than (unsigned) */
+ CC_less_s = 11, /* less than (signed) */
+ CC_less_eq_s = 12, /* less than or equal (signed) */
+ CC_great_eq_s = 10, /* greater than or equal (signed) */
+ CC_great_s = 9 /* greater than (signed) */
+};
+
+#define IN_U6_RANGE(x) ((x) <= (0x40 - 1) && (x) >= 0)
+#define IN_S9_RANGE(x) ((x) <= (0x100 - 1) && (x) >= -0x100)
+#define IN_S12_RANGE(x) ((x) <= (0x800 - 1) && (x) >= -0x800)
+#define IN_S21_RANGE(x) ((x) <= (0x100000 - 1) && (x) >= -0x100000)
+#define IN_S25_RANGE(x) ((x) <= (0x1000000 - 1) && (x) >= -0x1000000)
+
+/* Operands in most of the encodings. */
+#define OP_A(x) ((x) & 0x03f)
+#define OP_B(x) ((((x) & 0x07) << 24) | (((x) & 0x38) << 9))
+#define OP_C(x) (((x) & 0x03f) << 6)
+#define OP_IMM (OP_C(ARC_R_IMM))
+#define COND(x) (OP_A((x) & 31))
+#define FLAG(x) (((x) & 1) << 15)
+
+/*
+ * The 4-byte encoding of "mov b,c":
+ *
+ * 0010_0bbb 0000_1010 0BBB_cccc cc00_0000
+ *
+ * b: BBBbbb destination register
+ * c: cccccc source register
+ */
+#define OPC_MOV 0x200a0000
+
+/*
+ * The 4-byte encoding of "mov b,s12" (used for moving small immediates):
+ *
+ * 0010_0bbb 1000_1010 0BBB_ssss ssSS_SSSS
+ *
+ * b: BBBbbb destination register
+ * s: SSSSSSssssss source immediate (signed)
+ */
+#define OPC_MOVI 0x208a0000
+#define MOVI_S12(x) ((((x) & 0xfc0) >> 6) | (((x) & 0x3f) << 6))
+
+/*
+ * The 4-byte encoding of "mov[.qq] b,u6", used for conditional
+ * moving of even smaller immediates:
+ *
+ * 0010_0bbb 1100_1010 0BBB_cccc cciq_qqqq
+ *
+ * qq: qqqqq condition code
+ * i: If set, c is considered a 6-bit immediate, else a reg.
+ *
+ * b: BBBbbb destination register
+ * c: cccccc source
+ */
+#define OPC_MOV_CC 0x20ca0000
+#define MOV_CC_I BIT(5)
+#define OPC_MOVU_CC (OPC_MOV_CC | MOV_CC_I)
+
+/*
+ * The 4-byte encoding of "sexb b,c" (8-bit sign extension):
+ *
+ * 0010_0bbb 0010_1111 0BBB_cccc cc00_0101
+ *
+ * b: BBBbbb destination register
+ * c: cccccc source register
+ */
+#define OPC_SEXB 0x202f0005
+
+/*
+ * The 4-byte encoding of "sexh b,c" (16-bit sign extension):
+ *
+ * 0010_0bbb 0010_1111 0BBB_cccc cc00_0110
+ *
+ * b: BBBbbb destination register
+ * c: cccccc source register
+ */
+#define OPC_SEXH 0x202f0006
+
+/*
+ * The 4-byte encoding of "ld[zz][.x][.aa] c,[b,s9]":
+ *
+ * 0001_0bbb ssss_ssss SBBB_0aaz zxcc_cccc
+ *
+ * zz: size mode
+ * aa: address write back mode
+ * x: extension mode
+ *
+ * s9: S_ssss_ssss 9-bit signed number
+ * b: BBBbbb source reg for address
+ * c: cccccc destination register
+ */
+#define OPC_LOAD 0x10000000
+#define LOAD_X(x) ((x) << 6)
+#define LOAD_ZZ(x) ((x) << 7)
+#define LOAD_AA(x) ((x) << 9)
+#define LOAD_S9(x) ((((x) & 0x0ff) << 16) | (((x) & 0x100) << 7))
+#define LOAD_C(x) ((x) & 0x03f)
+/* Unsigned and signed loads. */
+#define OPC_LDU (OPC_LOAD | LOAD_X(X_zero))
+#define OPC_LDS (OPC_LOAD | LOAD_X(X_sign))
+/* 32-bit load. */
+#define OPC_LD32 (OPC_LDU | LOAD_ZZ(ZZ_4_byte))
+/* "pop reg" is merely a "ld.ab reg,[sp,4]". */
+#define OPC_POP \
+ (OPC_LD32 | LOAD_AA(AA_post) | LOAD_S9(4) | OP_B(ARC_R_SP))
+
+/*
+ * The 4-byte encoding of "st[zz][.aa] c,[b,s9]":
+ *
+ * 0001_1bbb ssss_ssss SBBB_cccc cc0a_azz0
+ *
+ * zz: zz size mode
+ * aa: aa address write back mode
+ *
+ * s9: S_ssss_ssss 9-bit signed number
+ * b: BBBbbb source reg for address
+ * c: cccccc source reg to be stored
+ */
+#define OPC_STORE 0x18000000
+#define STORE_ZZ(x) ((x) << 1)
+#define STORE_AA(x) ((x) << 3)
+#define STORE_S9(x) ((((x) & 0x0ff) << 16) | (((x) & 0x100) << 7))
+/* 32-bit store. */
+#define OPC_ST32 (OPC_STORE | STORE_ZZ(ZZ_4_byte))
+/* "push reg" is merely a "st.aw reg,[sp,-4]". */
+#define OPC_PUSH \
+ (OPC_ST32 | STORE_AA(AA_pre) | STORE_S9(-4) | OP_B(ARC_R_SP))
+
+/*
+ * The 4-byte encoding of "add a,b,c":
+ *
+ * 0010_0bbb 0i00_0000 fBBB_cccc ccaa_aaaa
+ *
+ * f: indicates if flags (carry, etc.) should be updated
+ * i: If set, c is considered a 6-bit immediate, else a reg.
+ *
+ * a: aaaaaa result
+ * b: BBBbbb the 1st input operand
+ * c: cccccc the 2nd input operand
+ */
+#define OPC_ADD 0x20000000
+/* Addition with updating the pertinent flags in "status32" register. */
+#define OPC_ADDF (OPC_ADD | FLAG(1))
+#define ADDI BIT(22)
+#define ADDI_U6(x) OP_C(x)
+#define OPC_ADDI (OPC_ADD | ADDI)
+#define OPC_ADDIF (OPC_ADDI | FLAG(1))
+#define OPC_ADD_I (OPC_ADD | OP_IMM)
+
+/*
+ * The 4-byte encoding of "adc a,b,c" (addition with carry):
+ *
+ * 0010_0bbb 0i00_0001 0BBB_cccc ccaa_aaaa
+ *
+ * i: if set, c is considered a 6-bit immediate, else a reg.
+ *
+ * a: aaaaaa result
+ * b: BBBbbb the 1st input operand
+ * c: cccccc the 2nd input operand
+ */
+#define OPC_ADC 0x20010000
+#define ADCI BIT(22)
+#define ADCI_U6(x) OP_C(x)
+#define OPC_ADCI (OPC_ADC | ADCI)
+
+/*
+ * The 4-byte encoding of "sub a,b,c":
+ *
+ * 0010_0bbb 0i00_0010 fBBB_cccc ccaa_aaaa
+ *
+ * f: indicates if flags (carry, etc.) should be updated
+ * i: if set, c is considered a 6-bit immediate, else a reg.
+ *
+ * a: aaaaaa result
+ * b: BBBbbb the 1st input operand
+ * c: cccccc the 2nd input operand
+ */
+#define OPC_SUB 0x20020000
+/* Subtraction with updating the pertinent flags in "status32" register. */
+#define OPC_SUBF (OPC_SUB | FLAG(1))
+#define SUBI BIT(22)
+#define SUBI_U6(x) OP_C(x)
+#define OPC_SUBI (OPC_SUB | SUBI)
+#define OPC_SUB_I (OPC_SUB | OP_IMM)
+
+/*
+ * The 4-byte encoding of "sbc a,b,c" (subtraction with carry):
+ *
+ * 0010_0bbb 0000_0011 fBBB_cccc ccaa_aaaa
+ *
+ * f: indicates if flags (carry, etc.) should be updated
+ *
+ * a: aaaaaa result
+ * b: BBBbbb the 1st input operand
+ * c: cccccc the 2nd input operand
+ */
+#define OPC_SBC 0x20030000
+
+/*
+ * The 4-byte encoding of "cmp[.qq] b,c":
+ *
+ * 0010_0bbb 1100_1100 1BBB_cccc cc0q_qqqq
+ *
+ * qq: qqqqq condition code
+ *
+ * b: BBBbbb the 1st operand
+ * c: cccccc the 2nd operand
+ */
+#define OPC_CMP 0x20cc8000
+
+/*
+ * The 4-byte encoding of "neg a,b":
+ *
+ * 0010_0bbb 0100_1110 0BBB_0000 00aa_aaaa
+ *
+ * a: aaaaaa result
+ * b: BBBbbb input
+ */
+#define OPC_NEG 0x204e0000
+
+/*
+ * The 4-byte encoding of "mpy a,b,c".
+ * mpy is the signed 32-bit multiplication with the lower 32-bit
+ * of the product as the result.
+ *
+ * 0010_0bbb 0001_1010 0BBB_cccc ccaa_aaaa
+ *
+ * a: aaaaaa result
+ * b: BBBbbb the 1st input operand
+ * c: cccccc the 2nd input operand
+ */
+#define OPC_MPY 0x201a0000
+#define OPC_MPYI (OPC_MPY | OP_IMM)
+
+/*
+ * The 4-byte encoding of "mpydu a,b,c".
+ * mpydu is the unsigned 32-bit multiplication with the lower 32-bit of
+ * the product in register "a" and the higher 32-bit in register "a+1".
+ *
+ * 0010_1bbb 0001_1001 0BBB_cccc ccaa_aaaa
+ *
+ * a: aaaaaa 64-bit result in registers (R_a+1,R_a)
+ * b: BBBbbb the 1st input operand
+ * c: cccccc the 2nd input operand
+ */
+#define OPC_MPYDU 0x28190000
+#define OPC_MPYDUI (OPC_MPYDU | OP_IMM)
+
+/*
+ * The 4-byte encoding of "divu a,b,c" (unsigned division):
+ *
+ * 0010_1bbb 0000_0101 0BBB_cccc ccaa_aaaa
+ *
+ * a: aaaaaa result (quotient)
+ * b: BBBbbb the 1st input operand
+ * c: cccccc the 2nd input operand (divisor)
+ */
+#define OPC_DIVU 0x28050000
+#define OPC_DIVUI (OPC_DIVU | OP_IMM)
+
+/*
+ * The 4-byte encoding of "div a,b,c" (signed division):
+ *
+ * 0010_1bbb 0000_0100 0BBB_cccc ccaa_aaaa
+ *
+ * a: aaaaaa result (quotient)
+ * b: BBBbbb the 1st input operand
+ * c: cccccc the 2nd input operand (divisor)
+ */
+#define OPC_DIVS 0x28040000
+#define OPC_DIVSI (OPC_DIVS | OP_IMM)
+
+/*
+ * The 4-byte encoding of "remu a,b,c" (unsigned remainder):
+ *
+ * 0010_1bbb 0000_1001 0BBB_cccc ccaa_aaaa
+ *
+ * a: aaaaaa result (remainder)
+ * b: BBBbbb the 1st input operand
+ * c: cccccc the 2nd input operand (divisor)
+ */
+#define OPC_REMU 0x28090000
+#define OPC_REMUI (OPC_REMU | OP_IMM)
+
+/*
+ * The 4-byte encoding of "rem a,b,c" (signed remainder):
+ *
+ * 0010_1bbb 0000_1000 0BBB_cccc ccaa_aaaa
+ *
+ * a: aaaaaa result (remainder)
+ * b: BBBbbb the 1st input operand
+ * c: cccccc the 2nd input operand (divisor)
+ */
+#define OPC_REMS 0x28080000
+#define OPC_REMSI (OPC_REMS | OP_IMM)
+
+/*
+ * The 4-byte encoding of "and a,b,c":
+ *
+ * 0010_0bbb 0000_0100 fBBB_cccc ccaa_aaaa
+ *
+ * f: indicates if zero and negative flags should be updated
+ *
+ * a: aaaaaa result
+ * b: BBBbbb the 1st input operand
+ * c: cccccc the 2nd input operand
+ */
+#define OPC_AND 0x20040000
+#define OPC_ANDI (OPC_AND | OP_IMM)
+
+/*
+ * The 4-byte encoding of "tst[.qq] b,c".
+ * Checks if the two input operands have any bit set at the same
+ * position.
+ *
+ * 0010_0bbb 1100_1011 1BBB_cccc cc0q_qqqq
+ *
+ * qq: qqqqq condition code
+ *
+ * b: BBBbbb the 1st input operand
+ * c: cccccc the 2nd input operand
+ */
+#define OPC_TST 0x20cb8000
+
+/*
+ * The 4-byte encoding of "or a,b,c":
+ *
+ * 0010_0bbb 0000_0101 0BBB_cccc ccaa_aaaa
+ *
+ * a: aaaaaa result
+ * b: BBBbbb the 1st input operand
+ * c: cccccc the 2nd input operand
+ */
+#define OPC_OR 0x20050000
+#define OPC_ORI (OPC_OR | OP_IMM)
+
+/*
+ * The 4-byte encoding of "xor a,b,c":
+ *
+ * 0010_0bbb 0000_0111 0BBB_cccc ccaa_aaaa
+ *
+ * a: aaaaaa result
+ * b: BBBbbb the 1st input operand
+ * c: cccccc the 2nd input operand
+ */
+#define OPC_XOR 0x20070000
+#define OPC_XORI (OPC_XOR | OP_IMM)
+
+/*
+ * The 4-byte encoding of "not b,c":
+ *
+ * 0010_0bbb 0010_1111 0BBB_cccc cc00_1010
+ *
+ * b: BBBbbb result
+ * c: cccccc input
+ */
+#define OPC_NOT 0x202f000a
+
+/*
+ * The 4-byte encoding of "btst b,u6":
+ *
+ * 0010_0bbb 0101_0001 1BBB_uuuu uu00_0000
+ *
+ * b: BBBbbb input number to check
+ * u6: uuuuuu 6-bit unsigned number specifying bit position to check
+ */
+#define OPC_BTSTU6 0x20518000
+#define BTST_U6(x) (OP_C((x) & 63))
+
+/*
+ * The 4-byte encoding of "asl[.qq] b,b,c" (arithmetic shift left):
+ *
+ * 0010_1bbb 0i00_0000 0BBB_cccc ccaa_aaaa
+ *
+ * i: if set, c is considered a 5-bit immediate, else a reg.
+ *
+ * b: BBBbbb result and the first operand (number to be shifted)
+ * c: cccccc amount to be shifted
+ */
+#define OPC_ASL 0x28000000
+#define ASL_I BIT(22)
+#define ASLI_U6(x) OP_C((x) & 31)
+#define OPC_ASLI (OPC_ASL | ASL_I)
+
+/*
+ * The 4-byte encoding of "asr a,b,c" (arithmetic shift right):
+ *
+ * 0010_1bbb 0i00_0010 0BBB_cccc ccaa_aaaa
+ *
+ * i: if set, c is considered a 6-bit immediate, else a reg.
+ *
+ * a: aaaaaa result
+ * b: BBBbbb first input: number to be shifted
+ * c: cccccc second input: amount to be shifted
+ */
+#define OPC_ASR 0x28020000
+#define ASR_I ASL_I
+#define ASRI_U6(x) ASLI_U6(x)
+#define OPC_ASRI (OPC_ASR | ASR_I)
+
+/*
+ * The 4-byte encoding of "lsr a,b,c" (logical shift right):
+ *
+ * 0010_1bbb 0i00_0001 0BBB_cccc ccaa_aaaa
+ *
+ * i: if set, c is considered a 6-bit immediate, else a reg.
+ *
+ * a: aaaaaa result
+ * b: BBBbbb first input: number to be shifted
+ * c: cccccc second input: amount to be shifted
+ */
+#define OPC_LSR 0x28010000
+#define LSR_I ASL_I
+#define LSRI_U6(x) ASLI_U6(x)
+#define OPC_LSRI (OPC_LSR | LSR_I)
+
+/*
+ * The 4-byte encoding of "swape b,c":
+ *
+ * 0010_1bbb 0010_1111 0bbb_cccc cc00_1001
+ *
+ * b: BBBbbb destination register
+ * c: cccccc source register
+ */
+#define OPC_SWAPE 0x282f0009
+
+/*
+ * Encoding for jump to an address in register:
+ * j reg_c
+ *
+ * 0010_0000 1110_0000 0000_cccc cc00_0000
+ *
+ * c: cccccc register holding the destination address
+ */
+#define OPC_JMP 0x20e00000
+/* Jump to "branch-and-link" register, which effectively is a "return". */
+#define OPC_J_BLINK (OPC_JMP | OP_C(ARC_R_BLINK))
+
+/*
+ * Encoding for jump-and-link to an address in register:
+ * jl reg_c
+ *
+ * 0010_0000 0010_0010 0000_cccc cc00_0000
+ *
+ * c: cccccc register holding the destination address
+ */
+#define OPC_JL 0x20220000
+
+/*
+ * Encoding for (conditional) branch to an offset from the current location
+ * that is word aligned: (PC & 0xffff_fffc) + s21
+ * B[qq] s21
+ *
+ * 0000_0sss ssss_sss0 SSSS_SSSS SS0q_qqqq
+ *
+ * qq: qqqqq condition code
+ * s21: SSSS SSSS_SSss ssss_ssss The displacement (21-bit signed)
+ *
+ * The displacement is supposed to be 16-bit (2-byte) aligned. Therefore,
+ * it should be a multiple of 2. Hence, there is an implied '0' bit at its
+ * LSB: S_SSSS SSSS_Ssss ssss_sss0
+ */
+#define OPC_BCC 0x00000000
+#define BCC_S21(d) ((((d) & 0x7fe) << 16) | (((d) & 0x1ff800) >> 5))
+
+/*
+ * Encoding for unconditional branch to an offset from the current location
+ * that is word aligned: (PC & 0xffff_fffc) + s25
+ * B s25
+ *
+ * 0000_0sss ssss_sss1 SSSS_SSSS SS00_TTTT
+ *
+ * s25: TTTT SSSS SSSS_SSss ssss_ssss The displacement (25-bit signed)
+ *
+ * The displacement is supposed to be 16-bit (2-byte) aligned. Therefore,
+ * it should be a multiple of 2. Hence, there is an implied '0' bit at its
+ * LSB: T TTTS_SSSS SSSS_Ssss ssss_sss0
+ */
+#define OPC_B 0x00010000
+#define B_S25(d) ((((d) & 0x1e00000) >> 21) | BCC_S21(d))
+
+static inline void emit_2_bytes(u8 *buf, u16 bytes)
+{
+ *((u16 *)buf) = bytes;
+}
+
+static inline void emit_4_bytes(u8 *buf, u32 bytes)
+{
+ emit_2_bytes(buf, bytes >> 16);
+ emit_2_bytes(buf + 2, bytes & 0xffff);
+}
+
+static inline u8 bpf_to_arc_size(u8 size)
+{
+ switch (size) {
+ case BPF_B:
+ return ZZ_1_byte;
+ case BPF_H:
+ return ZZ_2_byte;
+ case BPF_W: