From 71189fa9b092ef125ee741eccb2f5fa916798afd Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 May 2017 13:31:27 -0700 Subject: bpf: free up BPF_JMP | BPF_CALL | BPF_X opcode free up BPF_JMP | BPF_CALL | BPF_X opcode to be used by actual indirect call by register and use kernel internal opcode to mark call instruction into bpf_tail_call() helper. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/filter.h') diff --git a/include/linux/filter.h b/include/linux/filter.h index 62d948f80730..a20ba40fcb73 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -57,6 +57,9 @@ struct bpf_prog_aux; #define BPF_REG_AX MAX_BPF_REG #define MAX_BPF_JIT_REG (MAX_BPF_REG + 1) +/* unused opcode to mark special call to bpf_tail_call() helper */ +#define BPF_TAIL_CALL 0xf0 + /* As per nm, we expose JITed images as text (code) section for * kallsyms. That way, tools like perf can find it to match * addresses. -- cgit v1.2.3 From 783d28dd11f68fb25d1f2e0de7c42336394ef128 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:51 -0700 Subject: bpf: Add jited_len to struct bpf_prog Add jited_len to struct bpf_prog. It will be useful for the struct bpf_prog_info which will be added in the later patch. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/filter.h') diff --git a/include/linux/filter.h b/include/linux/filter.h index a20ba40fcb73..1e2dddf21f3b 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -432,6 +432,7 @@ struct bpf_prog { kmemcheck_bitfield_end(meta); enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ + u32 jited_len; /* Size of jited insns in bytes */ u8 tag[BPF_TAG_SIZE]; struct bpf_prog_aux *aux; /* Auxiliary fields */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ -- cgit v1.2.3 From 1e270976908686ec25fb91b8a34145be54137976 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:52 -0700 Subject: bpf: Add BPF_OBJ_GET_INFO_BY_FD A single BPF_OBJ_GET_INFO_BY_FD cmd is used to obtain the info for both bpf_prog and bpf_map. The kernel can figure out the fd is associated with a bpf_prog or bpf_map. The suggested struct bpf_prog_info and struct bpf_map_info are not meant to be a complete list and it is not the goal of this patch. New fields can be added in the future patch. The focus of this patch is to create the interface, BPF_OBJ_GET_INFO_BY_FD cmd for exposing the bpf_prog's and bpf_map's info. The obj's info, which will be extended (and get bigger) over time, is separated from the bpf_attr to avoid bloating the bpf_attr. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/filter.h') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1e2dddf21f3b..1fa26dc562ce 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -69,8 +69,6 @@ struct bpf_prog_aux; /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 -#define BPF_TAG_SIZE 8 - /* Helper macros for filter block array initializers. */ /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ -- cgit v1.2.3 From 40304b2a1567fecc321f640ee4239556dd0f3ee0 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 30 Jun 2017 20:02:40 -0700 Subject: bpf: BPF support for sock_ops Created a new BPF program type, BPF_PROG_TYPE_SOCK_OPS, and a corresponding struct that allows BPF programs of this type to access some of the socket's fields (such as IP addresses, ports, etc.). It uses the existing bpf cgroups infrastructure so the programs can be attached per cgroup with full inheritance support. The program will be called at appropriate times to set relevant connections parameters such as buffer sizes, SYN and SYN-ACK RTOs, etc., based on connection information such as IP addresses, port numbers, etc. Alghough there are already 3 mechanisms to set parameters (sysctls, route metrics and setsockopts), this new mechanism provides some distinct advantages. Unlike sysctls, it can set parameters per connection. In contrast to route metrics, it can also use port numbers and information provided by a user level program. In addition, it could set parameters probabilistically for evaluation purposes (i.e. do something different on 10% of the flows and compare results with the other 90% of the flows). Also, in cases where IPv6 addresses contain geographic information, the rules to make changes based on the distance (or RTT) between the hosts are much easier than route metric rules and can be global. Finally, unlike setsockopt, it oes not require application changes and it can be updated easily at any time. Although the bpf cgroup framework already contains a sock related program type (BPF_PROG_TYPE_CGROUP_SOCK), I created the new type (BPF_PROG_TYPE_SOCK_OPS) beccause the existing type expects to be called only once during the connections's lifetime. In contrast, the new program type will be called multiple times from different places in the network stack code. For example, before sending SYN and SYN-ACKs to set an appropriate timeout, when the connection is established to set congestion control, etc. As a result it has "op" field to specify the type of operation requested. The purpose of this new program type is to simplify setting connection parameters, such as buffer sizes, TCP's SYN RTO, etc. For example, it is easy to use facebook's internal IPv6 addresses to determine if both hosts of a connection are in the same datacenter. Therefore, it is easy to write a BPF program to choose a small SYN RTO value when both hosts are in the same datacenter. This patch only contains the framework to support the new BPF program type, following patches add the functionality to set various connection parameters. This patch defines a new BPF program type: BPF_PROG_TYPE_SOCKET_OPS and a new bpf syscall command to load a new program of this type: BPF_PROG_LOAD_SOCKET_OPS. Two new corresponding structs (one for the kernel one for the user/BPF program): /* kernel version */ struct bpf_sock_ops_kern { struct sock *sk; __u32 op; union { __u32 reply; __u32 replylong[4]; }; }; /* user version * Some fields are in network byte order reflecting the sock struct * Use the bpf_ntohl helper macro in samples/bpf/bpf_endian.h to * convert them to host byte order. */ struct bpf_sock_ops { __u32 op; union { __u32 reply; __u32 replylong[4]; }; __u32 family; __u32 remote_ip4; /* In network byte order */ __u32 local_ip4; /* In network byte order */ __u32 remote_ip6[4]; /* In network byte order */ __u32 local_ip6[4]; /* In network byte order */ __u32 remote_port; /* In network byte order */ __u32 local_port; /* In host byte horder */ }; Currently there are two types of ops. The first type expects the BPF program to return a value which is then used by the caller (or a negative value to indicate the operation is not supported). The second type expects state changes to be done by the BPF program, for example through a setsockopt BPF helper function, and they ignore the return value. The reply fields of the bpf_sockt_ops struct are there in case a bpf program needs to return a value larger than an integer. Signed-off-by: Lawrence Brakmo Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux/filter.h') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1fa26dc562ce..738f8b14f025 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -898,4 +898,13 @@ static inline int bpf_tell_extensions(void) return SKF_AD_MAX; } +struct bpf_sock_ops_kern { + struct sock *sk; + u32 op; + union { + u32 reply; + u32 replylong[4]; + }; +}; + #endif /* __LINUX_FILTER_H__ */ -- cgit v1.2.3 From f96da09473b52c09125cc9bf7d7d4576ae8229e0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 2 Jul 2017 02:13:27 +0200 Subject: bpf: simplify narrower ctx access This work tries to make the semantics and code around the narrower ctx access a bit easier to follow. Right now everything is done inside the .is_valid_access(). Offset matching is done differently for read/write types, meaning writes don't support narrower access and thus matching only on offsetof(struct foo, bar) is enough whereas for read case that supports narrower access we must check for offsetof(struct foo, bar) + offsetof(struct foo, bar) + sizeof() - 1 for each of the cases. For read cases of individual members that don't support narrower access (like packet pointers or skb->cb[] case which has its own narrow access logic), we check as usual only offsetof(struct foo, bar) like in write case. Then, for the case where narrower access is allowed, we also need to set the aux info for the access. Meaning, ctx_field_size and converted_op_size have to be set. First is the original field size e.g. sizeof() as in above example from the user facing ctx, and latter one is the target size after actual rewrite happened, thus for the kernel facing ctx. Also here we need the range match and we need to keep track changing convert_ctx_access() and converted_op_size from is_valid_access() as both are not at the same location. We can simplify the code a bit: check_ctx_access() becomes simpler in that we only store ctx_field_size as a meta data and later in convert_ctx_accesses() we fetch the target_size right from the location where we do convert. Should the verifier be misconfigured we do reject for BPF_WRITE cases or target_size that are not provided. For the subsystems, we always work on ranges in is_valid_access() and add small helpers for ranges and narrow access, convert_ctx_accesses() sets target_size for the relevant instruction. Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Cc: Yonghong Song Signed-off-by: David S. Miller --- include/linux/filter.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'include/linux/filter.h') diff --git a/include/linux/filter.h b/include/linux/filter.h index 738f8b14f025..f1fc9baa3509 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -337,6 +337,22 @@ struct bpf_prog_aux; bpf_size; \ }) +#define bpf_size_to_bytes(bpf_size) \ +({ \ + int bytes = -EINVAL; \ + \ + if (bpf_size == BPF_B) \ + bytes = sizeof(u8); \ + else if (bpf_size == BPF_H) \ + bytes = sizeof(u16); \ + else if (bpf_size == BPF_W) \ + bytes = sizeof(u32); \ + else if (bpf_size == BPF_DW) \ + bytes = sizeof(u64); \ + \ + bytes; \ +}) + #define BPF_SIZEOF(type) \ ({ \ const int __size = bytes_to_bpf_size(sizeof(type)); \ @@ -351,6 +367,13 @@ struct bpf_prog_aux; __size; \ }) +#define BPF_LDST_BYTES(insn) \ + ({ \ + const int __size = bpf_size_to_bytes(BPF_SIZE(insn->code)); \ + WARN_ON(__size < 0); \ + __size; \ + }) + #define __BPF_MAP_0(m, v, ...) v #define __BPF_MAP_1(m, v, t, a, ...) m(t, a) #define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__) @@ -401,6 +424,18 @@ struct bpf_prog_aux; #define BPF_CALL_4(name, ...) BPF_CALL_x(4, name, __VA_ARGS__) #define BPF_CALL_5(name, ...) BPF_CALL_x(5, name, __VA_ARGS__) +#define bpf_ctx_range(TYPE, MEMBER) \ + offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 +#define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2) \ + offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1 + +#define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE) \ + ({ \ + BUILD_BUG_ON(FIELD_SIZEOF(TYPE, MEMBER) != (SIZE)); \ + *(PTR_SIZE) = (SIZE); \ + offsetof(TYPE, MEMBER); \ + }) + #ifdef CONFIG_COMPAT /* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { @@ -564,6 +599,18 @@ static inline bool bpf_prog_was_classic(const struct bpf_prog *prog) return prog->type == BPF_PROG_TYPE_UNSPEC; } +static inline bool +bpf_ctx_narrow_access_ok(u32 off, u32 size, const u32 size_default) +{ + bool off_ok; +#ifdef __LITTLE_ENDIAN + off_ok = (off & (size_default - 1)) == 0; +#else + off_ok = (off & (size_default - 1)) + size == size_default; +#endif + return off_ok && size <= size_default && (size & (size - 1)) == 0; +} + #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) #ifdef CONFIG_ARCH_HAS_SET_MEMORY -- cgit v1.2.3 From 820a0b24b261c650cb07ea0f60aea9191f658f25 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 6 Jul 2017 15:36:01 -0700 Subject: include/linux/filter.h: use linux/set_memory.h This header always exists, so doesn't require an ifdef around its inclusion. When CONFIG_ARCH_HAS_SET_MEMORY=y it includes the asm header, otherwise it provides empty versions of the set_memory_xx() routines. Link: http://lkml.kernel.org/r/1498717781-29151-4-git-send-email-mpe@ellerman.id.au Signed-off-by: Michael Ellerman Acked-by: Daniel Borkmann Acked-by: Kees Cook Acked-by: Laura Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/filter.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux/filter.h') diff --git a/include/linux/filter.h b/include/linux/filter.h index f1fc9baa3509..bfef1e5734f8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -16,13 +16,10 @@ #include #include #include +#include #include -#ifdef CONFIG_ARCH_HAS_SET_MEMORY -#include -#endif - #include #include -- cgit v1.2.3