Diffstat (limited to 'net/core')
32 files changed, 1440 insertions, 201 deletions
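The bulk of the additions land in the new net/core/devmem.c and devmem.h shown below: a dma-buf is attached to the netdev's parent device, its scatterlist is carved into gen_pool chunks, each chunk carries a dmabuf_genpool_chunk_owner recording its base DMA address and an array of net_iov slots, and net_devmem_get_dma_addr() recovers a slot's DMA address as base_dma_addr plus the slot index shifted left by PAGE_SHIFT. As a quick orientation, here is a minimal standalone sketch of that per-chunk offset arithmetic in plain userspace C; the struct, field names and values are illustrative stand-ins, not the kernel definitions.

/* Standalone model of the chunk-owner address math used in
 * net/core/devmem.c (net_devmem_get_dma_addr / net_devmem_alloc_dmabuf).
 * All types and values here are illustrative, not kernel definitions.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

struct chunk_owner {
	uint64_t base_dma_addr;   /* DMA address where this chunk starts */
	size_t   num_niovs;       /* number of PAGE_SIZE slots in the chunk */
};

/* Map an allocated DMA address back to its slot index, as the
 * allocator does with "offset / PAGE_SIZE". */
static size_t dma_to_index(const struct chunk_owner *owner, uint64_t dma_addr)
{
	return (dma_addr - owner->base_dma_addr) / PAGE_SIZE;
}

/* Map a slot index back to its DMA address, mirroring
 * base_dma_addr + (idx << PAGE_SHIFT). */
static uint64_t index_to_dma(const struct chunk_owner *owner, size_t idx)
{
	return owner->base_dma_addr + ((uint64_t)idx << PAGE_SHIFT);
}

int main(void)
{
	struct chunk_owner owner = { .base_dma_addr = 0x80000000ULL,
				     .num_niovs = 4 };
	uint64_t dma = index_to_dma(&owner, 3);

	printf("slot 3 -> dma 0x%llx -> slot %zu\n",
	       (unsigned long long)dma, dma_to_index(&owner, dma));
	return 0;
}

The kernel allocator derives the same index with offset / PAGE_SIZE after gen_pool_alloc_owner() hands back a DMA address from the chunk.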
diff --git a/net/core/Makefile b/net/core/Makefile index 62be9aef2528..c3ebbaf9c81e 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o obj-y += hotdata.o +obj-y += netdev_rx_queue.o obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o @@ -43,3 +44,4 @@ obj-$(CONFIG_BPF_SYSCALL) += sock_map.o obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o obj-$(CONFIG_OF) += of_net.o obj-$(CONFIG_NET_TEST) += net_test.o +obj-$(CONFIG_NET_DEVMEM) += devmem.o diff --git a/net/core/datagram.c b/net/core/datagram.c index a40f733b37d7..f0693707aece 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -407,6 +407,9 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset, return 0; } + if (!skb_frags_readable(skb)) + goto short_copy; + /* Copy paged appendix. Hmm... why does this look so complicated? */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; @@ -623,6 +626,9 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb, { int frag = skb_shinfo(skb)->nr_frags; + if (!skb_frags_readable(skb)) + return -EFAULT; + while (length && iov_iter_count(from)) { struct page *head, *last_head = NULL; struct page *pages[MAX_SKB_FRAGS]; diff --git a/net/core/dev.c b/net/core/dev.c index f66e61407883..1e740faf9e78 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -158,8 +158,10 @@ #include <net/page_pool/types.h> #include <net/page_pool/helpers.h> #include <net/rps.h> +#include <linux/phy_link_topology.h> #include "dev.h" +#include "devmem.h" #include "net-sysfs.h" static DEFINE_SPINLOCK(ptype_lock); @@ -3310,6 +3312,10 @@ int skb_checksum_help(struct sk_buff *skb) return -EINVAL; } + if (!skb_frags_readable(skb)) { + return -EFAULT; + } + /* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong. 
*/ @@ -3386,6 +3392,7 @@ int skb_crc32c_csum_help(struct sk_buff *skb) out: return ret; } +EXPORT_SYMBOL(skb_crc32c_csum_help); __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { @@ -3431,8 +3438,9 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) if (!(dev->features & NETIF_F_HIGHDMA)) { for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = skb_frag_page(frag); - if (PageHighMem(skb_frag_page(frag))) + if (page && PageHighMem(page)) return 1; } } @@ -3705,7 +3713,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d next = skb->next; skb_mark_not_on_list(skb); - /* in case skb wont be segmented, point to itself */ + /* in case skb won't be segmented, point to itself */ skb->prev = skb; skb = validate_xmit_skb(skb, dev, again); @@ -4245,13 +4253,6 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, } EXPORT_SYMBOL(dev_pick_tx_zero); -u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev) -{ - return (u16)raw_smp_processor_id() % dev->real_num_tx_queues; -} -EXPORT_SYMBOL(dev_pick_tx_cpu_id); - u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { @@ -5725,10 +5726,9 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo struct packet_type *pt_curr = NULL; /* Current (common) orig_dev of sublist */ struct net_device *od_curr = NULL; - struct list_head sublist; struct sk_buff *skb, *next; + LIST_HEAD(sublist); - INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; @@ -5866,9 +5866,8 @@ static int netif_receive_skb_internal(struct sk_buff *skb) void netif_receive_skb_list_internal(struct list_head *head) { struct sk_buff *skb, *next; - struct list_head sublist; + LIST_HEAD(sublist); - INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); @@ -9272,7 +9271,7 @@ EXPORT_SYMBOL(netdev_port_same_parent_id); */ int dev_change_proto_down(struct net_device *dev, bool proto_down) { - if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) + if (!dev->change_proto_down) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; @@ -9369,6 +9368,20 @@ u8 dev_xdp_prog_count(struct net_device *dev) } EXPORT_SYMBOL_GPL(dev_xdp_prog_count); +int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) +{ + if (!dev->netdev_ops->ndo_bpf) + return -EOPNOTSUPP; + + if (dev_get_min_mp_channel_count(dev)) { + NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider"); + return -EBUSY; + } + + return dev->netdev_ops->ndo_bpf(dev, bpf); +} +EXPORT_SYMBOL_GPL(dev_xdp_propagate); + u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_prog *prog = dev_xdp_prog(dev, mode); @@ -9397,6 +9410,11 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, struct netdev_bpf xdp; int err; + if (dev_get_min_mp_channel_count(dev)) { + NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider"); + return -EBUSY; + } + memset(&xdp, 0, sizeof(xdp)); xdp.command = mode == XDP_MODE_HW ? 
XDP_SETUP_PROG_HW : XDP_SETUP_PROG; xdp.extack = extack; @@ -9821,6 +9839,20 @@ err_out: return err; } +u32 dev_get_min_mp_channel_count(const struct net_device *dev) +{ + int i; + + ASSERT_RTNL(); + + for (i = dev->real_num_rx_queues - 1; i >= 0; i--) + if (dev->_rx[i].mp_params.mp_priv) + /* The channel count is the idx plus 1. */ + return i + 1; + + return 0; +} + /** * dev_index_reserve() - allocate an ifindex in a namespace * @net: the applicable net namespace @@ -10321,6 +10353,17 @@ static void netdev_do_free_pcpu_stats(struct net_device *dev) } } +static void netdev_free_phy_link_topology(struct net_device *dev) +{ + struct phy_link_topology *topo = dev->link_topo; + + if (IS_ENABLED(CONFIG_PHYLIB) && topo) { + xa_destroy(&topo->phys); + kfree(topo); + dev->link_topo = NULL; + } +} + /** * register_netdevice() - register a network device * @dev: device to register @@ -10868,7 +10911,7 @@ noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset) return; } - field = (__force unsigned long __percpu *)((__force void *)p + offset); + field = (unsigned long __percpu *)((void __percpu *)p + offset); this_cpu_inc(*field); } EXPORT_SYMBOL_GPL(netdev_core_stats_inc); @@ -11099,6 +11142,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, #ifdef CONFIG_NET_SCHED hash_init(dev->qdisc_hash); #endif + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); @@ -11120,7 +11164,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (!dev->ethtool) goto free_all; - strcpy(dev->name, name); + strscpy(dev->name, name); dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) @@ -11191,6 +11235,8 @@ void free_netdev(struct net_device *dev) free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; + netdev_free_phy_link_topology(dev); + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED || dev->reg_state == NETREG_DUMMY) { @@ -11343,6 +11389,7 @@ void unregister_netdevice_many_notify(struct list_head *head, dev_tcx_uninstall(dev); dev_xdp_uninstall(dev); bpf_dev_bound_netdev_unregister(dev); + dev_dmabuf_uninstall(dev); netdev_offload_xstats_disable_all(dev); @@ -11407,7 +11454,7 @@ void unregister_netdevice_many_notify(struct list_head *head, * @head: list of devices * * Note: As most callers use a stack allocated list_head, - * we force a list_del() to make sure stack wont be corrupted later. + * we force a list_del() to make sure stack won't be corrupted later. */ void unregister_netdevice_many(struct list_head *head) { @@ -11462,10 +11509,10 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, /* Don't allow namespace local devices to be moved. */ err = -EINVAL; - if (dev->features & NETIF_F_NETNS_LOCAL) + if (dev->netns_local) goto out; - /* Ensure the device has been registrered */ + /* Ensure the device has been registered */ if (dev->reg_state != NETREG_REGISTERED) goto out; @@ -11844,7 +11891,7 @@ static void __net_exit default_device_exit_net(struct net *net) char fb_name[IFNAMSIZ]; /* Ignore unmoveable devices (i.e. 
loopback) */ - if (dev->features & NETIF_F_NETNS_LOCAL) + if (dev->netns_local) continue; /* Leave virtual devices for the generic cleanup */ @@ -11905,7 +11952,7 @@ static struct pernet_operations __net_initdata default_device_ops = { static void __init net_dev_struct_check(void) { /* TX read-mostly hotpath */ - CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx); diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index baa63dee2829..166e404f7c03 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -262,7 +262,7 @@ static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, } /* This function only works where there is a strict 1-1 relationship - * between source and destionation of they synch. If you ever need to + * between source and destination of they synch. If you ever need to * sync addresses to more then 1 destination, you need to use * __hw_addr_sync_multiple(). */ @@ -299,8 +299,8 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, EXPORT_SYMBOL(__hw_addr_unsync); /** - * __hw_addr_sync_dev - Synchonize device's multicast list - * @list: address list to syncronize + * __hw_addr_sync_dev - Synchronize device's multicast list + * @list: address list to synchronize * @dev: device to sync * @sync: function to call if address should be added * @unsync: function to call if address should be removed diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 8592c052c0f4..473c437b6b53 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -317,8 +317,7 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) * should take precedence in front of hardware timestamping provided by the * netdev. If the netdev driver needs to perform specific actions even for PHY * timestamping to work properly (a switch port must trap the timestamped - * frames and not forward them), it must set IFF_SEE_ALL_HWTSTAMP_REQUESTS in - * dev->priv_flags. + * frames and not forward them), it must set dev->see_all_hwtstamp_requests. */ int dev_set_hwtstamp_phylib(struct net_device *dev, struct kernel_hwtstamp_config *cfg, @@ -332,13 +331,13 @@ int dev_set_hwtstamp_phylib(struct net_device *dev, cfg->source = phy_ts ? 
HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV; - if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { + if (phy_ts && dev->see_all_hwtstamp_requests) { err = ops->ndo_hwtstamp_get(dev, &old_cfg); if (err) return err; } - if (!phy_ts || (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { + if (!phy_ts || dev->see_all_hwtstamp_requests) { err = ops->ndo_hwtstamp_set(dev, cfg, extack); if (err) { if (extack->_msg) @@ -347,7 +346,7 @@ int dev_set_hwtstamp_phylib(struct net_device *dev, } } - if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) + if (phy_ts && dev->see_all_hwtstamp_requests) changed = kernel_hwtstamp_config_changed(&old_cfg, cfg); if (phy_ts) { diff --git a/net/core/devmem.c b/net/core/devmem.c new file mode 100644 index 000000000000..11b91c12ee11 --- /dev/null +++ b/net/core/devmem.c @@ -0,0 +1,389 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Devmem TCP + * + * Authors: Mina Almasry <almasrymina@google.com> + * Willem de Bruijn <willemdebruijn.kernel@gmail.com> + * Kaiyuan Zhang <kaiyuanz@google.com + */ + +#include <linux/dma-buf.h> +#include <linux/genalloc.h> +#include <linux/mm.h> +#include <linux/netdevice.h> +#include <linux/types.h> +#include <net/netdev_queues.h> +#include <net/netdev_rx_queue.h> +#include <net/page_pool/helpers.h> +#include <trace/events/page_pool.h> + +#include "devmem.h" +#include "mp_dmabuf_devmem.h" +#include "page_pool_priv.h" + +/* Device memory support */ + +/* Protected by rtnl_lock() */ +static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1); + +static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, + struct gen_pool_chunk *chunk, + void *not_used) +{ + struct dmabuf_genpool_chunk_owner *owner = chunk->owner; + + kvfree(owner->niovs); + kfree(owner); +} + +static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) +{ + struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov); + + return owner->base_dma_addr + + ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); +} + +void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) +{ + size_t size, avail; + + gen_pool_for_each_chunk(binding->chunk_pool, + net_devmem_dmabuf_free_chunk_owner, NULL); + + size = gen_pool_size(binding->chunk_pool); + avail = gen_pool_avail(binding->chunk_pool); + + if (!WARN(size != avail, "can't destroy genpool. 
size=%zu, avail=%zu", + size, avail)) + gen_pool_destroy(binding->chunk_pool); + + dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, + DMA_FROM_DEVICE); + dma_buf_detach(binding->dmabuf, binding->attachment); + dma_buf_put(binding->dmabuf); + xa_destroy(&binding->bound_rxqs); + kfree(binding); +} + +struct net_iov * +net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) +{ + struct dmabuf_genpool_chunk_owner *owner; + unsigned long dma_addr; + struct net_iov *niov; + ssize_t offset; + ssize_t index; + + dma_addr = gen_pool_alloc_owner(binding->chunk_pool, PAGE_SIZE, + (void **)&owner); + if (!dma_addr) + return NULL; + + offset = dma_addr - owner->base_dma_addr; + index = offset / PAGE_SIZE; + niov = &owner->niovs[index]; + + niov->pp_magic = 0; + niov->pp = NULL; + atomic_long_set(&niov->pp_ref_count, 0); + + return niov; +} + +void net_devmem_free_dmabuf(struct net_iov *niov) +{ + struct net_devmem_dmabuf_binding *binding = net_iov_binding(niov); + unsigned long dma_addr = net_devmem_get_dma_addr(niov); + + if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr, + PAGE_SIZE))) + return; + + gen_pool_free(binding->chunk_pool, dma_addr, PAGE_SIZE); +} + +void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) +{ + struct netdev_rx_queue *rxq; + unsigned long xa_idx; + unsigned int rxq_idx; + + if (binding->list.next) + list_del(&binding->list); + + xa_for_each(&binding->bound_rxqs, xa_idx, rxq) { + WARN_ON(rxq->mp_params.mp_priv != binding); + + rxq->mp_params.mp_priv = NULL; + + rxq_idx = get_netdev_rx_queue_index(rxq); + + WARN_ON(netdev_rx_queue_restart(binding->dev, rxq_idx)); + } + + xa_erase(&net_devmem_dmabuf_bindings, binding->id); + + net_devmem_dmabuf_binding_put(binding); +} + +int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, + struct net_devmem_dmabuf_binding *binding, + struct netlink_ext_ack *extack) +{ + struct netdev_rx_queue *rxq; + u32 xa_idx; + int err; + + if (rxq_idx >= dev->real_num_rx_queues) { + NL_SET_ERR_MSG(extack, "rx queue index out of range"); + return -ERANGE; + } + + rxq = __netif_get_rx_queue(dev, rxq_idx); + if (rxq->mp_params.mp_priv) { + NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); + return -EEXIST; + } + +#ifdef CONFIG_XDP_SOCKETS + if (rxq->pool) { + NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP"); + return -EBUSY; + } +#endif + + err = xa_alloc(&binding->bound_rxqs, &xa_idx, rxq, xa_limit_32b, + GFP_KERNEL); + if (err) + return err; + + rxq->mp_params.mp_priv = binding; + + err = netdev_rx_queue_restart(dev, rxq_idx); + if (err) + goto err_xa_erase; + + return 0; + +err_xa_erase: + rxq->mp_params.mp_priv = NULL; + xa_erase(&binding->bound_rxqs, xa_idx); + + return err; +} + +struct net_devmem_dmabuf_binding * +net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, + struct netlink_ext_ack *extack) +{ + struct net_devmem_dmabuf_binding *binding; + static u32 id_alloc_next; + struct scatterlist *sg; + struct dma_buf *dmabuf; + unsigned int sg_idx, i; + unsigned long virtual; + int err; + + dmabuf = dma_buf_get(dmabuf_fd); + if (IS_ERR(dmabuf)) + return ERR_CAST(dmabuf); + + binding = kzalloc_node(sizeof(*binding), GFP_KERNEL, + dev_to_node(&dev->dev)); + if (!binding) { + err = -ENOMEM; + goto err_put_dmabuf; + } + + binding->dev = dev; + + err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id, + binding, xa_limit_32b, &id_alloc_next, + GFP_KERNEL); + if (err < 0) + goto err_free_binding; + + 
xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC); + + refcount_set(&binding->ref, 1); + + binding->dmabuf = dmabuf; + + binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent); + if (IS_ERR(binding->attachment)) { + err = PTR_ERR(binding->attachment); + NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device"); + goto err_free_id; + } + + binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment, + DMA_FROM_DEVICE); + if (IS_ERR(binding->sgt)) { + err = PTR_ERR(binding->sgt); + NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment"); + goto err_detach; + } + + /* For simplicity we expect to make PAGE_SIZE allocations, but the + * binding can be much more flexible than that. We may be able to + * allocate MTU sized chunks here. Leave that for future work... + */ + binding->chunk_pool = + gen_pool_create(PAGE_SHIFT, dev_to_node(&dev->dev)); + if (!binding->chunk_pool) { + err = -ENOMEM; + goto err_unmap; + } + + virtual = 0; + for_each_sgtable_dma_sg(binding->sgt, sg, sg_idx) { + dma_addr_t dma_addr = sg_dma_address(sg); + struct dmabuf_genpool_chunk_owner *owner; + size_t len = sg_dma_len(sg); + struct net_iov *niov; + + owner = kzalloc_node(sizeof(*owner), GFP_KERNEL, + dev_to_node(&dev->dev)); + if (!owner) { + err = -ENOMEM; + goto err_free_chunks; + } + + owner->base_virtual = virtual; + owner->base_dma_addr = dma_addr; + owner->num_niovs = len / PAGE_SIZE; + owner->binding = binding; + + err = gen_pool_add_owner(binding->chunk_pool, dma_addr, + dma_addr, len, dev_to_node(&dev->dev), + owner); + if (err) { + kfree(owner); + err = -EINVAL; + goto err_free_chunks; + } + + owner->niovs = kvmalloc_array(owner->num_niovs, + sizeof(*owner->niovs), + GFP_KERNEL); + if (!owner->niovs) { + err = -ENOMEM; + goto err_free_chunks; + } + + for (i = 0; i < owner->num_niovs; i++) { + niov = &owner->niovs[i]; + niov->owner = owner; + page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), + net_devmem_get_dma_addr(niov)); + } + + virtual += len; + } + + return binding; + +err_free_chunks: + gen_pool_for_each_chunk(binding->chunk_pool, + net_devmem_dmabuf_free_chunk_owner, NULL); + gen_pool_destroy(binding->chunk_pool); +err_unmap: + dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, + DMA_FROM_DEVICE); +err_detach: + dma_buf_detach(dmabuf, binding->attachment); +err_free_id: + xa_erase(&net_devmem_dmabuf_bindings, binding->id); +err_free_binding: + kfree(binding); +err_put_dmabuf: + dma_buf_put(dmabuf); + return ERR_PTR(err); +} + +void dev_dmabuf_uninstall(struct net_device *dev) +{ + struct net_devmem_dmabuf_binding *binding; + struct netdev_rx_queue *rxq; + unsigned long xa_idx; + unsigned int i; + + for (i = 0; i < dev->real_num_rx_queues; i++) { + binding = dev->_rx[i].mp_params.mp_priv; + if (!binding) + continue; + + xa_for_each(&binding->bound_rxqs, xa_idx, rxq) + if (rxq == &dev->_rx[i]) { + xa_erase(&binding->bound_rxqs, xa_idx); + break; + } + } +} + +/*** "Dmabuf devmem memory provider" ***/ + +int mp_dmabuf_devmem_init(struct page_pool *pool) +{ + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; + + if (!binding) + return -EINVAL; + + if (!pool->dma_map) + return -EOPNOTSUPP; + + if (pool->dma_sync) + return -EOPNOTSUPP; + + if (pool->p.order != 0) + return -E2BIG; + + net_devmem_dmabuf_binding_get(binding); + return 0; +} + +netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp) +{ + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; + struct net_iov *niov; + netmem_ref netmem; + + niov = 
net_devmem_alloc_dmabuf(binding); + if (!niov) + return 0; + + netmem = net_iov_to_netmem(niov); + + page_pool_set_pp_info(pool, netmem); + + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); + return netmem; +} + +void mp_dmabuf_devmem_destroy(struct page_pool *pool) +{ + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; + + net_devmem_dmabuf_binding_put(binding); +} + +bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) +{ + long refcount = atomic_long_read(netmem_get_pp_ref_count_ref(netmem)); + + if (WARN_ON_ONCE(!netmem_is_net_iov(netmem))) + return false; + + if (WARN_ON_ONCE(refcount != 1)) + return false; + + page_pool_clear_pp_info(netmem); + + net_devmem_free_dmabuf(netmem_to_net_iov(netmem)); + + /* We don't want the page pool put_page()ing our net_iovs. */ + return false; +} diff --git a/net/core/devmem.h b/net/core/devmem.h new file mode 100644 index 000000000000..76099ef9c482 --- /dev/null +++ b/net/core/devmem.h @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Device memory TCP support + * + * Authors: Mina Almasry <almasrymina@google.com> + * Willem de Bruijn <willemb@google.com> + * Kaiyuan Zhang <kaiyuanz@google.com> + * + */ +#ifndef _NET_DEVMEM_H +#define _NET_DEVMEM_H + +struct netlink_ext_ack; + +struct net_devmem_dmabuf_binding { + struct dma_buf *dmabuf; + struct dma_buf_attachment *attachment; + struct sg_table *sgt; + struct net_device *dev; + struct gen_pool *chunk_pool; + + /* The user holds a ref (via the netlink API) for as long as they want + * the binding to remain alive. Each page pool using this binding holds + * a ref to keep the binding alive. Each allocated net_iov holds a + * ref. + * + * The binding undos itself and unmaps the underlying dmabuf once all + * those refs are dropped and the binding is no longer desired or in + * use. + */ + refcount_t ref; + + /* The list of bindings currently active. Used for netlink to notify us + * of the user dropping the bind. + */ + struct list_head list; + + /* rxq's this binding is active on. */ + struct xarray bound_rxqs; + + /* ID of this binding. Globally unique to all bindings currently + * active. + */ + u32 id; +}; + +#if defined(CONFIG_NET_DEVMEM) +/* Owner of the dma-buf chunks inserted into the gen pool. Each scatterlist + * entry from the dmabuf is inserted into the genpool as a chunk, and needs + * this owner struct to keep track of some metadata necessary to create + * allocations from this chunk. + */ +struct dmabuf_genpool_chunk_owner { + /* Offset into the dma-buf where this chunk starts. */ + unsigned long base_virtual; + + /* dma_addr of the start of the chunk. */ + dma_addr_t base_dma_addr; + + /* Array of net_iovs for this chunk. 
*/ + struct net_iov *niovs; + size_t num_niovs; + struct net_devmem_dmabuf_binding *binding; +}; + +void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding); +struct net_devmem_dmabuf_binding * +net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, + struct netlink_ext_ack *extack); +void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); +int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, + struct net_devmem_dmabuf_binding *binding, + struct netlink_ext_ack *extack); +void dev_dmabuf_uninstall(struct net_device *dev); + +static inline struct dmabuf_genpool_chunk_owner * +net_iov_owner(const struct net_iov *niov) +{ + return niov->owner; +} + +static inline unsigned int net_iov_idx(const struct net_iov *niov)
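Tying the devmem.h structures above back to the provider hooks earlier in the diff: mp_dmabuf_devmem_init() only accepts a page pool that was created with DMA mapping enabled (dma_map set), does not request DMA syncing, and uses order-0 allocations, and it takes a reference on the binding that mp_dmabuf_devmem_destroy() drops when the pool goes away. Below is a standalone sketch of that validation-and-pin step, using simplified stand-in types rather than the kernel's page_pool and binding structures.

/* Standalone model of the checks in mp_dmabuf_devmem_init() above.
 * The structs and the plain-int refcount are stand-ins, not kernel types. */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct binding_model { int ref; };

struct pool_model {
	struct binding_model *mp_priv; /* dmabuf binding attached to the queue */
	bool dma_map;                  /* pool was created with DMA mapping */
	bool dma_sync;                 /* pool asks for CPU<->device syncing */
	unsigned int order;            /* allocation order requested */
};

static int mp_init_model(struct pool_model *pool)
{
	if (!pool->mp_priv)
		return -EINVAL;      /* no dmabuf binding to serve from */
	if (!pool->dma_map)
		return -EOPNOTSUPP;  /* provider needs the pool to carry DMA addrs */
	if (pool->dma_sync)
		return -EOPNOTSUPP;  /* device memory cannot be CPU-synced */
	if (pool->order != 0)
		return -E2BIG;       /* only PAGE_SIZE allocations are served */

	pool->mp_priv->ref++;        /* pool keeps the binding alive until destroy */
	return 0;
}

int main(void)
{
	struct binding_model b = { .ref = 1 };
	struct pool_model ok  = { &b, true, false, 0 };
	struct pool_model bad = { &b, true, true, 0 };
	int r_ok = mp_init_model(&ok);
	int r_bad = mp_init_model(&bad);

	printf("ok pool: %d, sync pool: %d, binding ref: %d\n",
	       r_ok, r_bad, b.ref);
	return 0;
}

Note that mp_dmabuf_devmem_release_page() in the patch deliberately returns false after freeing the net_iov, so the page pool never tries to recycle device memory as ordinary pages.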