diff options
Diffstat (limited to 'kernel/bpf/sockmap.c')
| -rw-r--r-- | kernel/bpf/sockmap.c | 2629 |
1 files changed, 0 insertions, 2629 deletions
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c deleted file mode 100644 index d37a1a0a6e1e..000000000000 --- a/kernel/bpf/sockmap.c +++ /dev/null @@ -1,2629 +0,0 @@ -/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ - -/* A BPF sock_map is used to store sock objects. This is primarly used - * for doing socket redirect with BPF helper routines. - * - * A sock map may have BPF programs attached to it, currently a program - * used to parse packets and a program to provide a verdict and redirect - * decision on the packet are supported. Any programs attached to a sock - * map are inherited by sock objects when they are added to the map. If - * no BPF programs are attached the sock object may only be used for sock - * redirect. - * - * A sock object may be in multiple maps, but can only inherit a single - * parse or verdict program. If adding a sock object to a map would result - * in having multiple parsing programs the update will return an EBUSY error. - * - * For reference this program is similar to devmap used in XDP context - * reviewing these together may be useful. For an example please review - * ./samples/bpf/sockmap/. - */ -#include <linux/bpf.h> -#include <net/sock.h> -#include <linux/filter.h> -#include <linux/errno.h> -#include <linux/file.h> -#include <linux/kernel.h> -#include <linux/net.h> -#include <linux/skbuff.h> -#include <linux/workqueue.h> -#include <linux/list.h> -#include <linux/mm.h> -#include <net/strparser.h> -#include <net/tcp.h> -#include <linux/ptr_ring.h> -#include <net/inet_common.h> -#include <linux/sched/signal.h> - -#define SOCK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) - -struct bpf_sock_progs { - struct bpf_prog *bpf_tx_msg; - struct bpf_prog *bpf_parse; - struct bpf_prog *bpf_verdict; -}; - -struct bpf_stab { - struct bpf_map map; - struct sock **sock_map; - struct bpf_sock_progs progs; - raw_spinlock_t lock; -}; - -struct bucket { - struct hlist_head head; - raw_spinlock_t lock; -}; - -struct bpf_htab { - struct bpf_map map; - struct bucket *buckets; - atomic_t count; - u32 n_buckets; - u32 elem_size; - struct bpf_sock_progs progs; - struct rcu_head rcu; -}; - -struct htab_elem { - struct rcu_head rcu; - struct hlist_node hash_node; - u32 hash; - struct sock *sk; - char key[0]; -}; - -enum smap_psock_state { - SMAP_TX_RUNNING, -}; - -struct smap_psock_map_entry { - struct list_head list; - struct bpf_map *map; - struct sock **entry; - struct htab_elem __rcu *hash_link; -}; - -struct smap_psock { - struct rcu_head rcu; - refcount_t refcnt; - - /* datapath variables */ - struct sk_buff_head rxqueue; - bool strp_enabled; - - /* datapath error path cache across tx work invocations */ - int save_rem; - int save_off; - struct sk_buff *save_skb; - - /* datapath variables for tx_msg ULP */ - struct sock *sk_redir; - int apply_bytes; - int cork_bytes; - int sg_size; - int eval; - struct sk_msg_buff *cork; - struct list_head ingress; - - struct strparser strp; - struct bpf_prog *bpf_tx_msg; - struct bpf_prog *bpf_parse; - struct bpf_prog *bpf_verdict; - struct list_head maps; - spinlock_t maps_lock; - - /* Back reference used when sock callback trigger sockmap operations */ - struct sock *sock; - unsigned long state; - - struct work_struct tx_work; - struct work_struct gc_work; - - struct proto *sk_proto; - void (*save_unhash)(struct sock *sk); - void (*save_close)(struct sock *sk, long timeout); - void (*save_data_ready)(struct sock *sk); - void (*save_write_space)(struct sock *sk); -}; - -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len); -static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); -static int bpf_tcp_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags); -static void bpf_tcp_unhash(struct sock *sk); -static void bpf_tcp_close(struct sock *sk, long timeout); - -static inline struct smap_psock *smap_psock_sk(const struct sock *sk) -{ - return rcu_dereference_sk_user_data(sk); -} - -static bool bpf_tcp_stream_read(const struct sock *sk) -{ - struct smap_psock *psock; - bool empty = true; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out; - empty = list_empty(&psock->ingress); -out: - rcu_read_unlock(); - return !empty; -} - -enum { - SOCKMAP_IPV4, - SOCKMAP_IPV6, - SOCKMAP_NUM_PROTS, -}; - -enum { - SOCKMAP_BASE, - SOCKMAP_TX, - SOCKMAP_NUM_CONFIGS, -}; - -static struct proto *saved_tcpv6_prot __read_mostly; -static DEFINE_SPINLOCK(tcpv6_prot_lock); -static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS]; -static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], - struct proto *base) -{ - prot[SOCKMAP_BASE] = *base; - prot[SOCKMAP_BASE].unhash = bpf_tcp_unhash; - prot[SOCKMAP_BASE].close = bpf_tcp_close; - prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg; - prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read; - - prot[SOCKMAP_TX] = prot[SOCKMAP_BASE]; - prot[SOCKMAP_TX].sendmsg = bpf_tcp_sendmsg; - prot[SOCKMAP_TX].sendpage = bpf_tcp_sendpage; -} - -static void update_sk_prot(struct sock *sk, struct smap_psock *psock) -{ - int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4; - int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE; - - sk->sk_prot = &bpf_tcp_prots[family][conf]; -} - -static int bpf_tcp_init(struct sock *sk) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - return -EINVAL; - } - - if (unlikely(psock->sk_proto)) { - rcu_read_unlock(); - return -EBUSY; - } - - psock->save_unhash = sk->sk_prot->unhash; - psock->save_close = sk->sk_prot->close; - psock->sk_proto = sk->sk_prot; - - /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */ - if (sk->sk_family == AF_INET6 && - unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { - spin_lock_bh(&tcpv6_prot_lock); - if (likely(sk->sk_prot != saved_tcpv6_prot)) { - build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot); - smp_store_release(&saved_tcpv6_prot, sk->sk_prot); - } - spin_unlock_bh(&tcpv6_prot_lock); - } - update_sk_prot(sk, psock); - rcu_read_unlock(); - return 0; -} - -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge); - -static void bpf_tcp_release(struct sock *sk) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out; - - if (psock->cork) { - free_start_sg(psock->sock, psock->cork, true); - kfree(psock->cork); - psock->cork = NULL; - } - - if (psock->sk_proto) { - sk->sk_prot = psock->sk_proto; - psock->sk_proto = NULL; - } -out: - rcu_read_unlock(); -} - -static struct htab_elem *lookup_elem_raw(struct hlist_head *head, - u32 hash, void *key, u32 key_size) -{ - struct htab_elem *l; - - hlist_for_each_entry_rcu(l, head, hash_node) { - if (l->hash == hash && !memcmp(&l->key, key, key_size)) - return l; - } - - return NULL; -} - -static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) -{ - return &htab->buckets[hash & (htab->n_buckets - 1)]; -} - -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) -{ - return &__select_bucket(htab, hash)->head; -} - -static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) -{ - atomic_dec(&htab->count); - kfree_rcu(l, rcu); -} - -static struct smap_psock_map_entry *psock_map_pop(struct sock *sk, - struct smap_psock *psock) -{ - struct smap_psock_map_entry *e; - - spin_lock_bh(&psock->maps_lock); - e = list_first_entry_or_null(&psock->maps, - struct smap_psock_map_entry, - list); - if (e) - list_del(&e->list); - spin_unlock_bh(&psock->maps_lock); - return e; -} - -static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock) -{ - struct smap_psock_map_entry *e; - struct sk_msg_buff *md, *mtmp; - struct sock *osk; - - if (psock->cork) { - free_start_sg(psock->sock, psock->cork, true); - kfree(psock->cork); - psock->cork = NULL; - } - - list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { - list_del(&md->list); - free_start_sg(psock->sock, md, true); - kfree(md); - } - - e = psock_map_pop(sk, psock); - while (e) { - if (e->entry) { - struct bpf_stab *stab = container_of(e->map, struct bpf_stab, map); - - raw_spin_lock_bh(&stab->lock); - osk = *e->entry; - if (osk == sk) { - *e->entry = NULL; - smap_release_sock(psock, sk); - } - raw_spin_unlock_bh(&stab->lock); - } else { - struct htab_elem *link = rcu_dereference(e->hash_link); - struct bpf_htab *htab = container_of(e->map, struct bpf_htab, map); - struct hlist_head *head; - struct htab_elem *l; - struct bucket *b; - - b = __select_bucket(htab, link->hash); - head = &b->head; - raw_spin_lock_bh(&b->lock); - l = lookup_elem_raw(head, - link->hash, link->key, - htab->map.key_size); - /* If another thread deleted this object skip deletion. - * The refcnt on psock may or may not be zero. - */ - if (l && l == link) { - hlist_del_rcu(&link->hash_node); - smap_release_sock(psock, link->sk); - free_htab_elem(htab, link); - } - raw_spin_unlock_bh(&b->lock); - } - kfree(e); - e = psock_map_pop(sk, psock); - } -} - -static void bpf_tcp_unhash(struct sock *sk) -{ - void (*unhash_fun)(struct sock *sk); - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - if (sk->sk_prot->unhash) - sk->sk_prot->unhash(sk); - return; - } - unhash_fun = psock->save_unhash; - bpf_tcp_remove(sk, psock); - rcu_read_unlock(); - unhash_fun(sk); -} - -static void bpf_tcp_close(struct sock *sk, long timeout) -{ - void (*close_fun)(struct sock *sk, long timeout); - struct smap_psock *psock; - - lock_sock(sk); - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - return sk->sk_prot->close(sk, timeout); - } - close_fun = psock->save_close; - bpf_tcp_remove(sk, psock); - rcu_read_unlock(); - release_sock(sk); - close_fun(sk, timeout); -} - -enum __sk_action { - __SK_DROP = 0, - __SK_PASS, - __SK_REDIRECT, - __SK_NONE, -}; - -static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { - .name = "bpf_tcp", - .uid = TCP_ULP_BPF, - .user_visible = false, - .owner = NULL, - .init = bpf_tcp_init, - .release = bpf_tcp_release, -}; - -static int memcopy_from_iter(struct sock *sk, - struct sk_msg_buff *md, - struct iov_iter *from, int bytes) -{ - struct scatterlist *sg = md->sg_data; - int i = md->sg_curr, rc = -ENOSPC; - - do { - int copy; - char *to; - - if (md->sg_copybreak >= sg[i].length) { - md->sg_copybreak = 0; - - if (++i == MAX_SKB_FRAGS) - i = 0; - - if (i == md->sg_end) - break; - } - - copy = sg[i].length - md->sg_copybreak; - to = sg_virt(&sg[i]) + md->sg_copybreak; - md->sg_copybreak += copy; - - if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) - rc = copy_from_iter_nocache(to, copy, from); - else - rc = copy_from_iter(to, copy, from); - - if (rc != copy) { - rc = -EFAULT; - goto out; - } - - bytes -= copy; - if (!bytes) - break; - - md->sg_copybreak = 0; - if (++i == MAX_SKB_FRAGS) - i = 0; - } while (i != md->sg_end); -out: - md->sg_curr = i; - return rc; -} - -static int bpf_tcp_push(struct sock *sk, int apply_bytes, - struct sk_msg_buff *md, - int flags, bool uncharge) -{ - bool apply = apply_bytes; - struct scatterlist *sg; - int offset, ret = 0; - struct page *p; - size_t size; - - while (1) { - sg = md->sg_data + md->sg_start; - size = (apply && apply_bytes < sg->length) ? - apply_bytes : sg->length; - offset = sg->offset; - - tcp_rate_check_app_limited(sk); - p = sg_page(sg); -retry: - ret = do_tcp_sendpages(sk, p, offset, size, flags); - if (ret != size) { - if (ret > 0) { - if (apply) - apply_bytes -= ret; - - sg->offset += ret; - sg->length -= ret; - size -= ret; - offset += ret; - if (uncharge) - sk_mem_uncharge(sk, ret); - goto retry; - } - - return ret; - } - - if (apply) - apply_bytes -= ret; - sg->offset += ret; - sg->length -= ret; - if (uncharge) - sk_mem_uncharge(sk, ret); - - if (!sg->length) { - put_page(p); - md->sg_start++; - if (md->sg_start == MAX_SKB_FRAGS) - md->sg_start = 0; - sg_init_table(sg, 1); - - if (md->sg_start == md->sg_end) - break; - } - - if (apply && !apply_bytes) - break; - } - return 0; -} - -static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md) -{ - struct scatterlist *sg = md->sg_data + md->sg_start; - - if (md->sg_copy[md->sg_start]) { - md->data = md->data_end = 0; - } else { - md->data = sg_virt(sg); - md->data_end = md->data + sg->length; - } -} - -static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) -{ - struct scatterlist *sg = md->sg_data; - int i = md->sg_start; - - do { - int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length; - - sk_mem_uncharge(sk, uncharge); - bytes -= uncharge; - if (!bytes) - break; - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - } while (i != md->sg_end); -} - -static void free_bytes_sg(struct sock *sk, int bytes, - struct sk_msg_buff *md, bool charge) -{ - struct scatterlist *sg = md->sg_data; - int i = md->sg_start, free; - - while (bytes && sg[i].length) { - free = sg[i].length; - if (bytes < free) { - sg[i].length -= bytes; - sg[i].offset += bytes; - if (charge) - sk_mem_uncharge(sk, bytes); - break; - } - - if (charge) - sk_mem_uncharge(sk, sg[i].length); - put_page(sg_page(&sg[i])); - bytes -= sg[i].length; - sg[i].length = 0; - sg[i].page_link = 0; - sg[i].offset = 0; - i++; - - if (i == MAX_SKB_FRAGS) - i = 0; - } - md->sg_start = i; -} - -static int free_sg(struct sock *sk, int start, - struct sk_msg_buff *md, bool charge) -{ - struct scatterlist *sg = md->sg_data; - int i = start, free = 0; - - while (sg[i].length) { - free += sg[i].length; - if (charge) - sk_mem_uncharge(sk, sg[i].length); - if (!md->skb) - put_page(sg_page(&sg[i])); - sg[i].length = 0; - sg[i].page_link = 0; - sg[i].offset = 0; - i++; - - if (i == MAX_SKB_FRAGS) - i = 0; - } - consume_skb(md->skb); - - return free; -} - -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge) -{ - int free = free_sg(sk, md->sg_start, md, charge); - - md->sg_start = md->sg_end; - return free; -} - -static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) -{ - return free_sg(sk, md->sg_curr, md, true); -} - -static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) -{ - return ((_rc == SK_PASS) ? - (md->sk_redir ? __SK_REDIRECT : __SK_PASS) : - __SK_DROP); -} - -static unsigned int smap_do_tx_msg(struct sock *sk, - struct smap_psock *psock, - struct sk_msg_buff *md) -{ - struct bpf_prog *prog; - unsigned int rc, _rc; - - preempt_disable(); - rcu_read_lock(); - - /* If the policy was removed mid-send then default to 'accept' */ - prog = READ_ONCE(psock->bpf_tx_msg); - if (unlikely(!prog)) { - _rc = SK_PASS; - goto verdict; - } - - bpf_compute_data_pointers_sg(md); - md->sk = sk; - rc = (*prog->bpf_func)(md, prog->insnsi); - psock->apply_bytes = md->apply_bytes; - - /* Moving return codes from UAPI namespace into internal namespace */ - _rc = bpf_map_msg_verdict(rc, md); - - /* The psock has a refcount on the sock but not on the map and because - * we need to drop rcu read lock here its possible the map could be - * removed between here and when we need it to execute the sock - * redirect. So do the map lookup now for future use. - */ - if (_rc == __SK_REDIRECT) { - if (psock->sk_redir) - sock_put(psock->sk_redir); - psock->sk_redir = do_msg_redirect_map(md); - if (!psock->sk_redir) { - _rc = __SK_DROP; - goto verdict; - } - sock_hold(psock->sk_redir); - } -verdict: - rcu_read_unlock(); - preempt_enable(); - - return _rc; -} - -static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, - struct smap_psock *psock, - struct sk_msg_buff *md, int flags) -{ - bool apply = apply_bytes; - size_t size, copied = 0; - struct sk_msg_buff *r; - int err = 0, i; - - r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL); - if (unlikely(!r)) - return -ENOMEM; - - lock_sock(sk); - r->sg_start = md->sg_start; - i = md->sg_start; - - do { - size = (apply && apply_bytes < md->sg_data[i].length) ? - apply_bytes : md->sg_data[i].length; - - if (!sk_wmem_schedule(sk, size)) { - if (!copied) - err = -ENOMEM; - break; - } - - sk_mem_charge(sk, size); - r->sg_data[i] = md->sg_data[i]; - r->sg_data[i].length = size; - md->sg_data[i].length -= size; - md->sg_data[i].offset += size; - copied += size; - - if (md->sg_data[i].length) { - get_page(sg_page(&r->sg_data[i])); - r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 0 : i + 1; - } else { - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - r->sg_end = i; - } - - if (apply) { - apply_bytes -= size; - if (!apply_bytes) - break; - } - } while (i != md->sg_end); - - md->sg_start = i; - - if (!err) { - list_add_tail(&r->list, &psock->ingress); - sk->sk_data_ready(sk); - } else { - free_start_sg(sk, r, true); - kfree(r); - } - - release_sock(sk); - return err; -} - -static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, - struct sk_msg_buff *md, - int flags) -{ - bool ingress = !!(md->flags & BPF_F_INGRESS); - struct smap_psock *psock; - int err = 0; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out_rcu; - - if (!refcount_inc_not_zero(&psock->refcnt)) - goto out_rcu; - - rcu_read_unlock(); - - if (ingress) { - err = bpf_tcp_ingress(sk, send, psock, md, flags); - } else { - lock_sock(sk); - err = bpf_tcp_push(sk, send, md, flags, false); - release_sock(sk); - } - smap_release_sock(psock, sk); - return err; -out_rcu: - rcu_read_unlock(); - return 0; -} - -static inline void bpf_md_init(struct smap_psock *psock) -{ - if (!psock->apply_bytes) { - psock->eval = __SK_NONE; - if (psock->sk_redir) { - sock_put(psock->sk_redir); - psock->sk_redir = NULL; - } - } -} - -static void apply_bytes_dec(struct smap_psock *psock, int i) -{ - if (psock->apply_bytes) { - if (psock->apply_bytes < i) - psock->apply_bytes = 0; - else - psock->apply_bytes -= i; - } -} - -static int bpf_exec_tx_verdict(struct smap_psock *psock, - struct sk_msg_buff *m, - struct sock *sk, - int *copied, int flags) -{ - bool cork = false, enospc = (m->sg_start == m->sg_end); - struct sock *redir; - int err = 0; - int send; - -more_data: - if (psock->eval == __SK_NONE) - psock->eval = smap_do_tx_msg(sk, psock, m); - - if (m->cork_bytes && - m->cork_bytes > psock->sg_size && !enospc) { - psock->cork_bytes = m->cork_bytes - psock->sg_size; - if (!psock->cork) { - psock->cork = kcalloc(1, - sizeof(struct sk_msg_buff), - GFP_ATOMIC | __GFP_NOWARN); - - if (!psock->cork) { - err = -ENOMEM; - goto out_err; - } - } - memcpy(psock->cork, m, sizeof(*m)); - goto out_err; - } - - send = psock->sg_size; - if (psock->apply_bytes && psock->apply_bytes < send) - send = psock->apply_bytes; - - switch (psock->eval) { - case __SK_PASS: - err = bpf_tcp_push(sk, send, m, flags, true); - if (unlikely(err)) { - *copied -= free_start_sg(sk, m, true); - break; - } - - apply_bytes_dec(psock, send); - psock->sg_size -= send; - break; - case __SK_REDIRECT: - redir = psock->sk_redir; - apply_bytes_dec(psock, send); - - if (psock->cork) { - cork = true; - psock->cork = NULL; - } - - return_mem_sg(sk, send, m); - release_sock(sk); - - err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags); - lock_sock(sk); - - if (unlikely(err < 0)) { - int free = free_start_sg(sk, m, false); - - psock->sg_size = 0; - if (!cork) - *copied -= free; - } else { - psock->sg_size -= send; - } - - if (cork) { - free_start_sg(sk, m, true); - psock->sg_size = 0; - kfree(m); - m = NULL; - err = 0; - } - break; - case __SK_DROP: - default: - free_bytes_sg(sk, send, m, true); - apply_bytes_dec(psock, send); - *copied -= send; - psock->sg_size -= send; - err = -EACCES; - break; - } - - if (likely(!err)) { - bpf_md_init(psock); - if (m && - m->sg_data[m->sg_start].page_link && - m->sg_data[m->sg_start].length) - goto more_data; - } - -out_err: - return err; -} - -static int bpf_wait_data(struct sock *sk, - struct smap_psock *psk, int flags, - long timeo, int *err) -{ - int rc; - - DEFINE_WAIT_FUNC(wait, woken_wake_function); - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - rc = sk_wait_event(sk, &timeo, - !list_empty(&psk->ingress) || - !skb_queue_empty(&sk->sk_receive_queue), - &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - - return rc; -} - -static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len) -{ - struct iov_iter *iter = &msg->msg_iter; - struct smap_psock *psock; - int copied = 0; - - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); - if (!skb_queue_empty(&sk->sk_receive_queue)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out; - - if (unlikely(!refcount_inc_not_zero(&psock->refcnt))) - goto out; - rcu_read_unlock(); - - lock_sock(sk); -bytes_ready: - while (copied != len) { - struct scatterlist *sg; - struct sk_msg_buff *md; - int i; - - md = list_first_entry_or_null(&psock->ingress, - struct sk_msg_buff, list); - if (unlikely(!md)) - break; - i = md->sg_start; - do { - struct page *page; - int n, copy; - - sg = &md->sg_data[i]; - copy = sg->length; - page = sg_page(sg); - - if (copied + copy > len) - copy = len - copied; - - n = copy_page_to_iter(page, sg->offset, copy, iter); - if (n != copy) { - md->sg_start = i; - release_sock(sk); - smap_release_sock(psock, sk); - return -EFAULT; - } - - copied += copy; - sg->offset += copy; - sg->length -= copy; - sk_mem_uncharge(sk, copy); - - if (!sg->length) { - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - if (!md->skb) - put_page(page); - } - if (copied == len) - break; - } while (i != md->sg_end); - md->sg_start = i; - - if (!sg->length && md->sg_start == md->sg_end) { - list_del(&md->list); - consume_skb(md->skb); - kfree(md); - } - } - - if (!copied) { - long timeo; - int data; - int err = 0; - - timeo = sock_rcvtimeo(sk, nonblock); - data = bpf_wait_data(sk, psock, flags, timeo, &err); - - if (data) { - if (!skb_queue_empty(&sk->sk_receive_queue)) { - release_sock(sk); - smap_release_sock(psock, sk); - copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - return copied; - } - goto bytes_ready; - } - - if (err) - copied = err; - } - - release_sock(sk); - smap_release_sock(psock, sk); - return copied; -out: - rcu_read_unlock(); - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); -} - - -static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) -{ - int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; - struct sk_msg_buff md = {0}; - unsigned int sg_copy = 0; - struct smap_psock *psock; - int copied = 0, err = 0; - struct scatterlist *sg; - long timeo; - - /* Its possible a sock event or user removed the psock _but_ the ops - * have not been reprogrammed yet so we get here. In this case fallback - * to tcp_sendmsg. Note this only works because we _only_ ever allow - * a single ULP there is no hierarchy here. - */ - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - return tcp_sendmsg(sk, msg, size); - } - - /* Increment the psock refcnt to ensure its not released while sending a - * message. Required because sk lookup and bpf programs are used in - * separate rcu critical sections. Its OK if we lose the map entry - * but we can't lose the sock reference. - */ - if (!refcount_inc_not_zero(&psock->refcnt)) { - rcu_read_unlock(); - return tcp_sendmsg(sk, msg, size); - } - - sg = md.sg_data; - sg_init_marker(sg, MAX_SKB_FRAGS); - rcu_read_unlock(); - - lock_sock(sk); - timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); - - while (msg_data_left(msg)) { - struct sk_msg_buff *m = NULL; - bool enospc = false; - int copy; - - if (sk->sk_err) { - err = -sk->sk_err; - goto out_err; - } - - copy = msg_data_left(msg); - if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; - - m = psock->cork_bytes ? psock->cork : &md; - m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end; - err = sk_alloc_sg(sk, copy, m->sg_data, - m->sg_start, &m->sg_end, &sg_copy, - m->sg_end - 1); - if (err) { - if (err != -ENOSPC) - goto wait_for_memory; - enospc = true; - copy = sg_copy; - } - - err = memcopy_from_iter(sk, m, &msg->msg_iter, copy); - if (err < 0) { - free_curr_sg(sk, m); - goto out_err; - } - - psock->sg_size += copy; - copied += copy; - sg_copy = 0; - - /* When bytes are being corked skip running BPF program and - * applying verdict unless there is no more buffer space. In - * the ENOSPC case simply run BPF prorgram with currently - * accumulated data. We don't have much choice at this point - * we could try extending the page frags or chaining complex - * frags but even in these cases _eventually_ we will hit an - * OOM scenario. More complex recovery schemes may be - * implemented in the future, but BPF programs must handle - * the case where apply_cork requests are not honored. The - * canonical method to verify this is to check data length. - */ - if (psock->cork_bytes) { - if (copy > psock->cork_bytes) - psock->cork_bytes = 0; - else - psock->cork_bytes -= copy; - - if (psock->cork_bytes && !enospc) - goto out_cork; - - /* All cork bytes accounted for re-run filter */ - psock->eval = __SK_NONE; - psock->cork_bytes = 0; - } - - err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); - if (unlikely(err < 0)) - goto out_err; - continue; -wait_for_sndbuf: - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -wait_for_memory: - err = sk_stream_wait_memory(sk, &timeo); - if (err) { - if (m && m != psock->cork) - free_start_sg(sk, m, true); - goto out_err; - } - } -out_err: - if (err < 0) - err = sk_stream_error(sk, msg->msg_flags, err); -out_cork: - release_sock(sk); - smap_release_sock(psock, sk); - return copied ? copied : err; -} - -static int bpf_tcp_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags) -{ - struct sk_msg_buff md = {0}, *m = NULL; - int err = 0, copied = 0; - struct smap_psock *psock; - struct scatterlist *sg; - bool enospc = false; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto accept; - - if (!refcount_inc_not_zero(&psock->refcnt)) - goto accept; - rcu_read_unlock(); - - lock_sock(sk); - - if (psock->cork_bytes) { - m = psock->cork; - sg = &m->sg_data[m->sg_end]; - } else { - m = &md; - sg = m->sg_data; - sg_init_marker(sg, MAX_SKB_FRAGS); - } - - /* Catch case where ring is full and sendpage is stalled. */ - if (unlikely(m->sg_end == m->sg_start && - m->sg_data[m->sg_end].length)) - goto out_err; - - psock->sg_size += size; - sg_set_page(sg, page, size, offset); - get_page(page); - m->sg_copy[m->sg_end] = true; - sk_mem_charge(sk, size); - m->sg_end++; - copied = size; - - if (m->sg_end == MAX_SKB_FRAGS) - m->sg_end = 0; - - if (m->sg_end == m->sg_start) - enospc = true; - - if (psock->cork_bytes) { - if (size > psock->cork_bytes) - psock->cork_bytes = 0; - else - psock->cork_bytes -= size; - - if (psock->cork_bytes && !enospc) - goto out_err; - - /* All cork bytes accounted for re-run filter */ - psock->eval = __SK_NONE; - psock->cork_bytes = 0; - } - - err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); -out_err: - release_sock(sk); - smap_release_sock(psock, sk); - return copied ? copied : err; -accept: - rcu_read_unlock(); - return tcp_sendpage(sk, page, offset, size, flags); -} - -static void bpf_tcp_msg_add(struct smap_psock *psock, - struct sock *sk, - struct bpf_prog *tx_msg) -{ - struct bpf_prog *orig_tx_msg; - - orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg); - if (orig_tx_msg) - bpf_prog_put(orig_tx_msg); -} - -static int bpf_tcp_ulp_register(void) -{ - build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot); - /* Once BPF TX ULP is registered it is never unregistered. It - * will be in the ULP list for the lifetime of the system. Doing - * duplicate registers is not a problem. - */ - return tcp_register_ulp(&bpf_tcp_ulp_ops); -} - -static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) -{ - struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); - int rc; - - if (unlikely(!prog)) - return __SK_DROP; - - skb_orphan(skb); - /* We need to ensure that BPF metadata for maps is also cleared - * when we orphan the skb so that we don't have the possibility - * t |
