// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
*
* AF_XDP sockets allows a channel between XDP programs and userspace
* applications.
* Copyright(c) 2018 Intel Corporation.
*
* Author(s): Björn Töpel <bjorn.topel@intel.com>
* Magnus Karlsson <magnus.karlsson@intel.com>
*/
#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock_drv.h>
#include <net/busy_poll.h>
#include <net/xdp.h>
#include "xsk_queue.h"
#include "xdp_umem.h"
#include "xsk.h"
#define TX_BATCH_SIZE 32
static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
return;
pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
pool->cached_need_wakeup |= XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
{
struct xdp_sock *xs;
if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
return;
rcu_read_lock();
list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
}
rcu_read_unlock();
pool->cached_need_wakeup |= XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
{
if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
return;
pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
{
struct xdp_sock *xs;
if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
return;
rcu_read_lock();
list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
}
rcu_read_unlock();
pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
{
return pool->uses_need_wakeup;
}
EXPORT_SYMBOL(xsk_uses_need_wakeup);
struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
u16 queue_id)
{
if (queue_id < dev->real_num_rx_queues)
return dev->_rx[queue_id].pool;
if (queue_id < dev->real_num_tx_queues)
return dev->_tx[queue_id].pool;
return NULL;
}
EXPORT_SYMBOL(xsk_get_pool_from_qid);
void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
{
if (queue_id < dev->num_rx_queues)
dev->_rx[queue_id].pool = NULL;
if (queue_id < dev->num_tx_queues)
dev->_tx[queue_id].pool = NULL;
}
/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
* not know if the device has more tx queues than rx, or the opposite.
* This might also change during run time.
*/
int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
u16 queue_id)
{
if (queue_id >= max_t(unsigned int,
dev->real_num_rx_queues,
dev->real_num_tx_queues))
return -EINVAL;
if (queue_id < dev->real_num_rx_queues)
dev->_rx[queue_id].pool = pool;
if (queue_id < dev->real_num_tx_queues)
dev->_tx[queue_id].pool = pool;
return 0;
}
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
u64 addr;
int err;
addr = xp_get_handle(xskb);
err = xskq_prod_reserve_desc(xs->rx, addr, len);
if (err) {
xs->rx_queue_full++;
return err;
}
xp_release(xskb);
return 0;
}
static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
{
void *from_buf, *to_buf;
u32 metalen;
if (unlikely(xdp_data_meta_unsupported(from))) {
from_buf = from->data;
to_buf = to->data;
metalen = 0;
} else {
from_buf = from->data_meta;
metalen = from->data - from->data_meta;
to_buf = to->data - metalen;
}
memcpy(to_buf, from_buf, len + metalen);
}
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
struct xdp_buff *xsk_xdp;
int err;
u32 len;
len = xdp->data_end - xdp->data;
if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
xs->rx_dropped++;
return -ENOSPC;
}
xsk_xdp = xsk_buff_alloc(xs->pool);
if (!xsk_xdp) {
xs->rx_dropped++;
return -ENOMEM;
}
xsk_copy_xdp(xsk_xdp, xdp, len);
err = __xsk_rcv_zc(xs, xsk_xdp, len);
if (err) {
xsk_buff_free(xsk_xdp);
return err;
}
return 0;
}
static bool xsk_tx_writeable(struct xdp_sock *xs)
{
if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
return false;
return true;
}
static bool xsk_is_bound(struct xdp_sock *xs)
{
if (READ_ONCE(xs->state) == XSK_BOUND) {
/* Matches smp_wmb() in bind(). */
smp_rmb();
return true;
}
return false;
}
static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp)
{
if (!xsk_is_bound(xs))
return -ENXIO;
if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
return -EINVAL;
sk_mark_napi_id_once_xdp(&xs->sk, xdp);
return 0;
}
static void xsk_flush(struct xdp_sock *xs)
{
xskq_prod_submit(xs->rx);
__xskq_cons_release(xs->pool->fq);
sock_def_readable(&xs->sk);
}
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
int err;
spin_lock_bh(&xs->rx_lock);
err = xsk_rcv_check(xs, xdp);
if (!err) {
err = __xsk_rcv(xs, xdp);
xsk_flush(xs);
}
spin_unlock_bh(&xs->rx_lock);
return err;
}
static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
int err;
u32 len;
err = xsk_rcv_check(xs, xdp);
if (err)
return err;
if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
len = xdp->data_end - xdp->data;
return __xsk_rcv_zc(xs, xdp, len);
}
err = __xsk_rcv(xs, xdp);
if (!err)
xdp_return_buff(xdp);
return err;
}
int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
int err;
err = xsk_rcv(xs, xdp);
if (err)
return err;
if (!xs->flush_node.prev)
list_add(&xs->flush_node, flush_list);
return 0;
}
void __xsk_map_flush(void)
{
struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
struct xdp_sock *xs, *tmp;
list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
xsk_flush(xs);
__list_del_clearprev(&xs->flush_node);
}
}
void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
{
xskq_prod_submit_n(pool->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_tx_completed);
void xsk_tx_release(struct xsk_buff_pool *pool)
{
struct xdp_sock *xs;
rcu_read_lock();
list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
__xskq_cons_release(xs->tx);
if (xsk_tx_writeable(xs))
xs->sk.sk_write_space(&xs->sk);
}
rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_tx_release);
bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
{
struct xdp_sock *xs;
rcu_read_lock();
list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
xs->tx->queue_empty_descs++;
continue;
}
/* This is the backpressure mechanism for the Tx path.
* Reserve space in the completion queue and only proceed
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
*/
if (xskq_prod_reserve_addr(pool->cq, desc->addr))
goto out;
xskq_cons_release(xs->tx);
rcu_read_unlock();
return true;
}
out:
rcu_read_unlock();
return false;
}
EXPORT_SYMBOL(xsk_tx_peek_desc);
static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
{
struct xdp_desc *descs = pool->tx_descs;
u32 nb_pkts = 0;
|