From a4393861a351f66fef1102e775743c86a276afce Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 17 Feb 2020 12:15:28 +0000 Subject: bpf, sk_msg: Let ULP restore sk_proto and write_space callback We don't need a fallback for when the socket is not using ULP. tcp_update_ulp handles this case exactly the same as we do in sk_psock_restore_proto. Get rid of the duplicated code. Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200217121530.754315-2-jakub@cloudflare.com --- include/linux/skmsg.h | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 14d61bba0b79..8605947d6c08 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -361,16 +361,7 @@ static inline void sk_psock_restore_proto(struct sock *sk, sk->sk_prot->unhash = psock->saved_unhash; if (psock->sk_proto) { - struct inet_connection_sock *icsk = inet_csk(sk); - bool has_ulp = !!icsk->icsk_ulp_data; - - if (has_ulp) { - tcp_update_ulp(sk, psock->sk_proto, - psock->saved_write_space); - } else { - sk->sk_prot = psock->sk_proto; - sk->sk_write_space = psock->saved_write_space; - } + tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); psock->sk_proto = NULL; } else { sk->sk_write_space = psock->saved_write_space; -- cgit v1.2.3 From a178b4585865a4c756c41bc5376f63416b7d9271 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 17 Feb 2020 12:15:29 +0000 Subject: bpf, sk_msg: Don't clear saved sock proto on restore There is no need to clear psock->sk_proto when restoring socket protocol callbacks in sk->sk_prot. The psock is about to get detached from the sock and eventually destroyed. At worst we will restore the protocol callbacks and the write callback twice. This makes reasoning about psock state easier. Once psock is initialized, we can count on psock->sk_proto always being set. Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200217121530.754315-3-jakub@cloudflare.com --- include/linux/skmsg.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 8605947d6c08..d90ef61712a1 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -359,13 +359,7 @@ static inline void sk_psock_restore_proto(struct sock *sk, struct sk_psock *psock) { sk->sk_prot->unhash = psock->saved_unhash; - - if (psock->sk_proto) { - tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); - psock->sk_proto = NULL; - } else { - sk->sk_write_space = psock->saved_write_space; - } + tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); } static inline void sk_psock_set_state(struct sk_psock *psock, -- cgit v1.2.3 From fff7b64355eac6e29b50229ad1512315bc04b44e Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 17 Feb 2020 19:04:31 -0800 Subject: bpf: Add bpf_read_branch_records() helper Branch records are a CPU feature that can be configured to record certain branches that are taken during code execution. This data is particularly interesting for profile guided optimizations. perf has had branch record support for a while but the data collection can be a bit coarse grained. We (Facebook) have seen in experiments that associating metadata with branch records can improve results (after postprocessing). We generally use bpf_probe_read_*() to get metadata out of userspace. That's why bpf support for branch records is useful. Aside from this particular use case, having branch data available to bpf progs can be useful to get stack traces out of userspace applications that omit frame pointers. Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200218030432.4600-2-dxu@dxuuu.xyz --- include/uapi/linux/bpf.h | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f1d74a2bd234..a7e59756853f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2892,6 +2892,25 @@ union bpf_attr { * Obtain the 64bit jiffies * Return * The 64 bit jiffies + * + * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) + * Description + * For an eBPF program attached to a perf event, retrieve the + * branch records (struct perf_branch_entry) associated to *ctx* + * and store it in the buffer pointed by *buf* up to size + * *size* bytes. + * Return + * On success, number of bytes written to *buf*. On error, a + * negative value. + * + * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to + * instead return the number of bytes required to store all the + * branch entries. If this flag is set, *buf* may be NULL. + * + * **-EINVAL** if arguments invalid or **size** not a multiple + * of sizeof(struct perf_branch_entry). + * + * **-ENOENT** if architecture does not support branch records. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3012,7 +3031,8 @@ union bpf_attr { FN(probe_read_kernel_str), \ FN(tcp_send_ack), \ FN(send_signal_thread), \ - FN(jiffies64), + FN(jiffies64), \ + FN(read_branch_records), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3091,6 +3111,9 @@ enum bpf_func_id { /* BPF_FUNC_sk_storage_get flags */ #define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0) +/* BPF_FUNC_read_branch_records flags. */ +#define BPF_F_GET_BRANCH_RECORDS_SIZE (1ULL << 0) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, -- cgit v1.2.3 From b8e202d1d1d0f182f01062804efb523ea9a9008c Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Tue, 18 Feb 2020 17:10:13 +0000 Subject: net, sk_msg: Annotate lockless access to sk_prot on clone sk_msg and ULP frameworks override protocol callbacks pointer in sk->sk_prot, while tcp accesses it locklessly when cloning the listening socket, that is with neither sk_lock nor sk_callback_lock held. Once we enable use of listening sockets with sockmap (and hence sk_msg), there will be shared access to sk->sk_prot if socket is getting cloned while being inserted/deleted to/from the sockmap from another CPU: Read side: tcp_v4_rcv sk = __inet_lookup_skb(...) tcp_check_req(sk) inet_csk(sk)->icsk_af_ops->syn_recv_sock tcp_v4_syn_recv_sock tcp_create_openreq_child inet_csk_clone_lock sk_clone_lock READ_ONCE(sk->sk_prot) Write side: sock_map_ops->map_update_elem sock_map_update_elem sock_map_update_common sock_map_link_no_progs tcp_bpf_init tcp_bpf_update_sk_prot sk_psock_update_proto WRITE_ONCE(sk->sk_prot, ops) sock_map_ops->map_delete_elem sock_map_delete_elem __sock_map_delete sock_map_unref sk_psock_put sk_psock_drop sk_psock_restore_proto tcp_update_ulp WRITE_ONCE(sk->sk_prot, proto) Mark the shared access with READ_ONCE/WRITE_ONCE annotations. Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200218171023.844439-2-jakub@cloudflare.com --- include/linux/skmsg.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index d90ef61712a1..112765bd146d 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -352,7 +352,8 @@ static inline void sk_psock_update_proto(struct sock *sk, psock->saved_write_space = sk->sk_write_space; psock->sk_proto = sk->sk_prot; - sk->sk_prot = ops; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, ops); } static inline void sk_psock_restore_proto(struct sock *sk, -- cgit v1.2.3 From f1ff5ce2cd5ef3335f19c0f6576582c87045b04f Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Tue, 18 Feb 2020 17:10:14 +0000 Subject: net, sk_msg: Clear sk_user_data pointer on clone if tagged sk_user_data can hold a pointer to an object that is not intended to be shared between the parent socket and the child that gets a pointer copy on clone. This is the case when sk_user_data points at reference-counted object, like struct sk_psock. One way to resolve it is to tag the pointer with a no-copy flag by repurposing its lowest bit. Based on the bit-flag value we clear the child sk_user_data pointer after cloning the parent socket. The no-copy flag is stored in the pointer itself as opposed to externally, say in socket flags, to guarantee that the pointer and the flag are copied from parent to child socket in an atomic fashion. Parent socket state is subject to change while copying, we don't hold any locks at that time. This approach relies on an assumption that sk_user_data holds a pointer to an object aligned at least 2 bytes. A manual audit of existing users of rcu_dereference_sk_user_data helper confirms our assumption. Also, an RCU-protected sk_user_data is not likely to hold a pointer to a char value or a pathological case of "struct { char c; }". To be safe, warn when the flag-bit is set when setting sk_user_data to catch any future misuses. It is worth considering why clearing sk_user_data unconditionally is not an option. There exist users, DRBD, NVMe, and Xen drivers being among them, that rely on the pointer being copied when cloning the listening socket. Potentially we could distinguish these users by checking if the listening socket has been created in kernel-space via sock_create_kern, and hence has sk_kern_sock flag set. However, this is not the case for NVMe and Xen drivers, which create sockets without marking them as belonging to the kernel. Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200218171023.844439-3-jakub@cloudflare.com --- include/net/sock.h | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 02162b0378f7..9f37fdfd15d4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -502,10 +502,43 @@ enum sk_pacing { SK_PACING_FQ = 2, }; +/* Pointer stored in sk_user_data might not be suitable for copying + * when cloning the socket. For instance, it can point to a reference + * counted object. sk_user_data bottom bit is set if pointer must not + * be copied. + */ +#define SK_USER_DATA_NOCOPY 1UL +#define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY) + +/** + * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied + * @sk: socket + */ +static inline bool sk_user_data_is_nocopy(const struct sock *sk) +{ + return ((uintptr_t)sk->sk_user_data & SK_USER_DATA_NOCOPY); +} + #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data))) -#define rcu_dereference_sk_user_data(sk) rcu_dereference(__sk_user_data((sk))) -#define rcu_assign_sk_user_data(sk, ptr) rcu_assign_pointer(__sk_user_data((sk)), ptr) +#define rcu_dereference_sk_user_data(sk) \ +({ \ + void *__tmp = rcu_dereference(__sk_user_data((sk))); \ + (void *)((uintptr_t)__tmp & SK_USER_DATA_PTRMASK); \ +}) +#define rcu_assign_sk_user_data(sk, ptr) \ +({ \ + uintptr_t __tmp = (uintptr_t)(ptr); \ + WARN_ON_ONCE(__tmp & ~SK_USER_DATA_PTRMASK); \ + rcu_assign_pointer(__sk_user_data((sk)), __tmp); \ +}) +#define rcu_assign_sk_user_data_nocopy(sk, ptr) \ +({ \ + uintptr_t __tmp = (uintptr_t)(ptr); \ + WARN_ON_ONCE(__tmp & ~SK_USER_DATA_PTRMASK); \ + rcu_assign_pointer(__sk_user_data((sk)), \ + __tmp | SK_USER_DATA_NOCOPY); \ +}) /* * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK -- cgit v1.2.3 From e80251555f0befd1271e74b080bccf0ff0348bfc Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Tue, 18 Feb 2020 17:10:15 +0000 Subject: tcp_bpf: Don't let child socket inherit parent protocol ops on copy Prepare for cloning listening sockets that have their protocol callbacks overridden by sk_msg. Child sockets must not inherit parent callbacks that access state stored in sk_user_data owned by the parent. Restore the child socket protocol callbacks before it gets hashed and any of the callbacks can get invoked. Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200218171023.844439-4-jakub@cloudflare.com --- include/net/tcp.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index a5ea27df3c2b..07f947cc80e6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2203,6 +2203,13 @@ int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); +#ifdef CONFIG_NET_SOCK_MSG +void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); +#else +static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) +{ +} +#endif /* Call BPF_SOCK_OPS program that returns an int. If the return value * is < 0, then the BPF op failed (for example if the loaded BPF -- cgit v1.2.3 From 035ff358f2d9e2f5e1639ba4defe4dc40ac642dd Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Tue, 18 Feb 2020 17:10:21 +0000 Subject: net: Generate reuseport group ID on group creation Commit 736b46027eb4 ("net: Add ID (if needed) to sock_reuseport and expose reuseport_lock") has introduced lazy generation of reuseport group IDs that survive group resize. By comparing the identifier we check if BPF reuseport program is not trying to select a socket from a BPF map that belongs to a different reuseport group than the one the packet is for. Because SOCKARRAY used to be the only BPF map type that can be used with reuseport BPF, it was possible to delay the generation of reuseport group ID until a socket from the group was inserted into BPF map for the first time. Now that SOCK{MAP,HASH} can be used with reuseport BPF we have two options, either generate the reuseport ID on map update, like SOCKARRAY does, or allocate an ID from the start when reuseport group gets created. This patch takes the latter approach to keep sockmap free of calls into reuseport code. This streamlines the reuseport_id access as its lifetime now matches the longevity of reuseport object. The cost of this simplification, however, is that we allocate reuseport IDs for all SO_REUSEPORT users. Even those that don't use SOCKARRAY in their setups. With the way identifiers are currently generated, we can have at most S32_MAX reuseport groups, which hopefully is sufficient. If we ever get close to the limit, we can switch an u64 counter like sk_cookie. Another change is that we now always call into SOCKARRAY logic to unlink the socket from the map when unhashing or closing the socket. Previously we did it only when at least one socket from the group was in a BPF map. It is worth noting that this doesn't conflict with sockmap tear-down in case a socket is in a SOCK{MAP,HASH} and belongs to a reuseport group. sockmap tear-down happens first: prot->unhash `- tcp_bpf_unhash |- tcp_bpf_remove | `- while (sk_psock_link_pop(psock)) | `- sk_psock_unlink | `- sock_map_delete_from_link | `- __sock_map_delete | `- sock_map_unref | `- sk_psock_put | `- sk_psock_drop | `- rcu_assign_sk_user_data(sk, NULL) `- inet_unhash `- reuseport_detach_sock `- bpf_sk_reuseport_detach `- WRITE_ONCE(sk->sk_user_data, NULL) Suggested-by: Martin Lau Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200218171023.844439-10-jakub@cloudflare.com --- include/net/sock_reuseport.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 43f4a818d88f..3ecaa15d1850 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -55,6 +55,4 @@ static inline bool reuseport_has_conns(struct sock *sk, bool set) return ret; } -int reuseport_get_id(struct sock_reuseport *reuse); - #endif /* _SOCK_REUSEPORT_H */ -- cgit v1.2.3