From a2c09ac0fb6756d7085c359b6c020ef8b4205e0f Mon Sep 17 00:00:00 2001 From: Inju Song Date: Tue, 27 Mar 2018 23:14:40 +0900 Subject: netfilter: ipvs: Keep latest weight of destination The hashing table in scheduler such as source hash or maglev hash should ignore the changed weight to 0 and allow changing the weight from/to non-0 values. So, struct ip_vs_dest needs to keep weight with latest non-0 weight. Signed-off-by: Inju Song Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index eb0bec043c96..0ac795b41ab8 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -668,6 +668,7 @@ struct ip_vs_dest { volatile unsigned int flags; /* dest status flags */ atomic_t conn_flags; /* flags to copy to conn */ atomic_t weight; /* server weight */ + atomic_t last_weight; /* server latest weight */ refcount_t refcnt; /* reference counter */ struct ip_vs_stats stats; /* statistics */ -- cgit v1.2.3 From d1361840f8c519eaee9a78ffe09e4f0a1b586846 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2018 10:33:35 -0700 Subject: tcp: fix SO_RCVLOWAT and RCVBUF autotuning Applications might use SO_RCVLOWAT on TCP socket hoping to receive one [E]POLLIN event only when a given amount of bytes are ready in socket receive queue. Problem is that receive autotuning is not aware of this constraint, meaning sk_rcvbuf might be too small to allow all bytes to be stored. Add a new (struct proto_ops)->set_rcvlowat method so that a protocol can override the default setsockopt(SO_RCVLOWAT) behavior. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/net.h | 1 + include/net/tcp.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index 2248a052061d..6554d3ba4396 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -197,6 +197,7 @@ struct proto_ops { int offset, size_t size, int flags); int (*sendmsg_locked)(struct sock *sk, struct msghdr *msg, size_t size); + int (*set_rcvlowat)(struct sock *sk, int val); }; #define DECLARE_SOCKADDR(type, dst, src) \ diff --git a/include/net/tcp.h b/include/net/tcp.h index 9c9b3768b350..b2318242cad8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -402,6 +402,7 @@ void tcp_set_keepalive(struct sock *sk, int val); void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); +int tcp_set_rcvlowat(struct sock *sk, int val); void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); -- cgit v1.2.3 From 03f45c883c6f391ed4fff8292415b35bd1107519 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2018 10:33:37 -0700 Subject: tcp: avoid extra wakeups for SO_RCVLOWAT users SO_RCVLOWAT is properly handled in tcp_poll(), so that POLLIN is only generated when enough bytes are available in receive queue, after David change (commit c7004482e8dc "tcp: Respect SO_RCVLOWAT in tcp_poll().") But TCP still calls sk->sk_data_ready() for each chunk added in receive queue, meaning thread is awaken, and goes back to sleep shortly after. Tested: tcp_mmap test program, receiving 32768 MB of data with SO_RCVLOWAT set to 512KB -> Should get ~2 wakeups (c-switches) per MB, regardless of how many (tiny or big) packets were received. 
High speed (mostly full size GRO packets) received 32768 MB (100 % mmap'ed) in 8.03112 s, 34.2266 Gbit, cpu usage user:0.037 sys:1.404, 43.9758 usec per MB, 65497 c-switches received 32768 MB (99.9954 % mmap'ed) in 7.98453 s, 34.4263 Gbit, cpu usage user:0.03 sys:1.422, 44.3115 usec per MB, 65485 c-switches Low speed (sender is ratelimited and sends 1-MSS at a time, so GRO is not helping) received 22474.5 MB (100 % mmap'ed) in 6015.35 s, 0.0313414 Gbit, cpu usage user:0.05 sys:1.586, 72.7952 usec per MB, 44950 c-switches Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index b2318242cad8..0ee85c47c185 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -403,6 +403,7 @@ void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); +void tcp_data_ready(struct sock *sk); void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); -- cgit v1.2.3 From 93ab6cc69162775201587cc9da00d5016dc890e2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2018 10:33:38 -0700 Subject: tcp: implement mmap() for zero copy receive Some networks can make sure TCP payload can exactly fit 4KB pages, with well chosen MSS/MTU and architectures. Implement mmap() system call so that applications can avoid copying data without complex splice() games. Note that a successful mmap( X bytes) on TCP socket is consuming bytes, as if recvmsg() has been done. (tp->copied += X) Only PROT_READ mappings are accepted, as skb page frags are fundamentally shared and read only. If tcp_mmap() finds data that is not a full page, or a patch of urgent data, -EINVAL is returned, no bytes are consumed. 
Application must fall back to recvmsg() to read the problematic sequence. mmap() won't block, regardless of socket being in blocking or non-blocking mode. If not enough bytes are in receive queue, mmap() would return -EAGAIN, or -EIO if socket is in a state where no other bytes can be added into receive queue. An application might use SO_RCVLOWAT, poll() and/or ioctl(FIONREAD) to efficiently use mmap(). On the sender side, MSG_EOR might help to clearly separate unaligned headers and 4K-aligned chunks if necessary. Tested: mlx4 (cx-3) 40Gbit NIC, with the tcp_mmap program provided in the following patch. MTU set to 4168 (4096 TCP payload, 40 bytes IPv6 header, 32 bytes TCP header) Without mmap() (tcp_mmap -s) received 32768 MB (0 % mmap'ed) in 8.13342 s, 33.7961 Gbit, cpu usage user:0.034 sys:3.778, 116.333 usec per MB, 63062 c-switches received 32768 MB (0 % mmap'ed) in 8.14501 s, 33.748 Gbit, cpu usage user:0.029 sys:3.997, 122.864 usec per MB, 61903 c-switches received 32768 MB (0 % mmap'ed) in 8.11723 s, 33.8635 Gbit, cpu usage user:0.048 sys:3.964, 122.437 usec per MB, 62983 c-switches received 32768 MB (0 % mmap'ed) in 8.39189 s, 32.7552 Gbit, cpu usage user:0.038 sys:4.181, 128.754 usec per MB, 55834 c-switches With mmap() on receiver (tcp_mmap -s -z) received 32768 MB (100 % mmap'ed) in 8.03083 s, 34.2278 Gbit, cpu usage user:0.024 sys:1.466, 45.4712 usec per MB, 65479 c-switches received 32768 MB (100 % mmap'ed) in 7.98805 s, 34.4111 Gbit, cpu usage user:0.026 sys:1.401, 43.5486 usec per MB, 65447 c-switches received 32768 MB (100 % mmap'ed) in 7.98377 s, 34.4296 Gbit, cpu usage user:0.028 sys:1.452, 45.166 usec per MB, 65496 c-switches received 32768 MB (99.9969 % mmap'ed) in 8.01838 s, 34.281 Gbit, cpu usage user:0.02 sys:1.446, 44.7388 usec per MB, 65505 c-switches Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- include/net/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0ee85c47c185..833154e3df17 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -404,6 +404,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); void tcp_data_ready(struct sock *sk); +int tcp_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma); void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); -- cgit v1.2.3 From a5724fc3834643a975bd0db71f001ca65f4a8382 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 16 Apr 2018 21:37:13 +0200 Subject: PCI: Add two more values for PCIe Max_Read_Request_Size This patch adds missing values for the max read request size. E.g. network driver r8169 uses a value of 4K. Signed-off-by: Heiner Kallweit Acked-by: Bjorn Helgaas Signed-off-by: David S. 
Miller --- include/uapi/linux/pci_regs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 103ba797a8f3..83ade9b5cf95 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -506,6 +506,8 @@ #define PCI_EXP_DEVCTL_READRQ_256B 0x1000 /* 256 Bytes */ #define PCI_EXP_DEVCTL_READRQ_512B 0x2000 /* 512 Bytes */ #define PCI_EXP_DEVCTL_READRQ_1024B 0x3000 /* 1024 Bytes */ +#define PCI_EXP_DEVCTL_READRQ_2048B 0x4000 /* 2048 Bytes */ +#define PCI_EXP_DEVCTL_READRQ_4096B 0x5000 /* 4096 Bytes */ #define PCI_EXP_DEVCTL_BCR_FLR 0x8000 /* Bridge Configuration Retry / FLR */ #define PCI_EXP_DEVSTA 10 /* Device Status */ #define PCI_EXP_DEVSTA_CED 0x0001 /* Correctable Error Detected */ -- cgit v1.2.3 From ef53e9e14714de2ce26eaae0244c07c426064d69 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 16 Apr 2018 15:07:13 -0700 Subject: net: Remove unused tcp_set_state tracepoint This tracepoint was replaced by inet_sock_set_state in 563e0bb and not used anywhere in the kernel anymore. Remove it. Signed-off-by: Andrey Ignatov Signed-off-by: David S. 
Miller --- include/trace/events/tcp.h | 47 ---------------------------------------------- 1 file changed, 47 deletions(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 878b2be7ce77..3dd68029d77a 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -166,53 +166,6 @@ DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock, TP_ARGS(sk) ); -TRACE_EVENT(tcp_set_state, - - TP_PROTO(const struct sock *sk, const int oldstate, const int newstate), - - TP_ARGS(sk, oldstate, newstate), - - TP_STRUCT__entry( - __field(const void *, skaddr) - __field(int, oldstate) - __field(int, newstate) - __field(__u16, sport) - __field(__u16, dport) - __array(__u8, saddr, 4) - __array(__u8, daddr, 4) - __array(__u8, saddr_v6, 16) - __array(__u8, daddr_v6, 16) - ), - - TP_fast_assign( - struct inet_sock *inet = inet_sk(sk); - __be32 *p32; - - __entry->skaddr = sk; - __entry->oldstate = oldstate; - __entry->newstate = newstate; - - __entry->sport = ntohs(inet->inet_sport); - __entry->dport = ntohs(inet->inet_dport); - - p32 = (__be32 *) __entry->saddr; - *p32 = inet->inet_saddr; - - p32 = (__be32 *) __entry->daddr; - *p32 = inet->inet_daddr; - - TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, - sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); - ), - - TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", - __entry->sport, __entry->dport, - __entry->saddr, __entry->daddr, - __entry->saddr_v6, __entry->daddr_v6, - show_tcp_state_name(__entry->oldstate), - show_tcp_state_name(__entry->newstate)) -); - TRACE_EVENT(tcp_retransmit_synack, TP_PROTO(const struct sock *sk, const struct request_sock *req), -- cgit v1.2.3 From 5ab073ffd326480a6185d096e9703f62ef92b86c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:26 +0200 Subject: xdp: introduce xdp_return_frame API and use in cpumap Introduce an xdp_return_frame API, and convert over cpumap as the 
first user, given it have queued XDP frame structure to leverage. V3: Cleanup and remove C99 style comments, pointed out by Alex Duyck. V6: Remove comment that id will be added later (Req by Alex Duyck) V8: Rename enum mem_type to xdp_mem_type (found by kbuild test robot) Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/xdp.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index b2362ddfa694..e4207699c410 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -33,16 +33,43 @@ * also mandatory during RX-ring setup. */ +enum xdp_mem_type { + MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ + MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ + MEM_TYPE_MAX, +}; + +struct xdp_mem_info { + u32 type; /* enum xdp_mem_type, but known size type */ +}; + struct xdp_rxq_info { struct net_device *dev; u32 queue_index; u32 reg_state; + struct xdp_mem_info mem; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ + +static inline +void xdp_return_frame(void *data, struct xdp_mem_info *mem) +{ + if (mem->type == MEM_TYPE_PAGE_SHARED) + page_frag_free(data); + + if (mem->type == MEM_TYPE_PAGE_ORDER0) { + struct page *page = virt_to_page(data); /* Assumes order0 page*/ + + put_page(page); + } +} + int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); +int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, + enum xdp_mem_type type, void *allocator); #endif /* __LINUX_NET_XDP_H__ */ -- cgit v1.2.3 From 106ca27f2922e8de820d1bd3d79b1cbdf2d78eea Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:37 +0200 Subject: xdp: move struct xdp_buff from filter.h to xdp.h This is done to prepare for 
the next patch, and it is also nice to move this XDP related struct out of filter.h. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/filter.h | 24 +----------------------- include/net/xdp.h | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index fc4e8f91b03d..4da8b2308174 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -30,6 +30,7 @@ struct sock; struct seccomp_data; struct bpf_prog_aux; struct xdp_rxq_info; +struct xdp_buff; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -500,14 +501,6 @@ struct bpf_skb_data_end { void *data_end; }; -struct xdp_buff { - void *data; - void *data_end; - void *data_meta; - void *data_hard_start; - struct xdp_rxq_info *rxq; -}; - struct sk_msg_buff { void *data; void *data_end; @@ -772,21 +765,6 @@ int xdp_do_redirect(struct net_device *dev, struct bpf_prog *prog); void xdp_do_flush_map(void); -/* Drivers not supporting XDP metadata can use this helper, which - * rejects any room expansion for metadata as a result. 
- */ -static __always_inline void -xdp_set_data_meta_invalid(struct xdp_buff *xdp) -{ - xdp->data_meta = xdp->data + 1; -} - -static __always_inline bool -xdp_data_meta_unsupported(const struct xdp_buff *xdp) -{ - return unlikely(xdp->data_meta > xdp->data); -} - void bpf_warn_invalid_xdp_action(u32 act); struct sock *do_sk_redirect_map(struct sk_buff *skb); diff --git a/include/net/xdp.h b/include/net/xdp.h index e4207699c410..15f8ade008b5 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -50,6 +50,13 @@ struct xdp_rxq_info { struct xdp_mem_info mem; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ +struct xdp_buff { + void *data; + void *data_end; + void *data_meta; + void *data_hard_start; + struct xdp_rxq_info *rxq; +}; static inline void xdp_return_frame(void *data, struct xdp_mem_info *mem) @@ -72,4 +79,19 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator); +/* Drivers not supporting XDP metadata can use this helper, which + * rejects any room expansion for metadata as a result. + */ +static __always_inline void +xdp_set_data_meta_invalid(struct xdp_buff *xdp) +{ + xdp->data_meta = xdp->data + 1; +} + +static __always_inline bool +xdp_data_meta_unsupported(const struct xdp_buff *xdp) +{ + return unlikely(xdp->data_meta > xdp->data); +} + #endif /* __LINUX_NET_XDP_H__ */ -- cgit v1.2.3 From c0048cff8abb69c956ce1277d17a3f7a14e41522 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:42 +0200 Subject: xdp: introduce a new xdp_frame type This is needed to convert drivers tuntap and virtio_net. This is a generalization of what is done inside cpumap, which will be converted later. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. 
Miller --- include/net/xdp.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 15f8ade008b5..756c42811e78 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -58,6 +58,46 @@ struct xdp_buff { struct xdp_rxq_info *rxq; }; +struct xdp_frame { + void *data; + u16 len; + u16 headroom; + u16 metasize; + /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, + * while mem info is valid on remote CPU. + */ + struct xdp_mem_info mem; +}; + +/* Convert xdp_buff to xdp_frame */ +static inline +struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) +{ + struct xdp_frame *xdp_frame; + int metasize; + int headroom; + + /* Assure headroom is available for storing info */ + headroom = xdp->data - xdp->data_hard_start; + metasize = xdp->data - xdp->data_meta; + metasize = metasize > 0 ? metasize : 0; + if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) + return NULL; + + /* Store info in top of packet */ + xdp_frame = xdp->data_hard_start; + + xdp_frame->data = xdp->data; + xdp_frame->len = xdp->data_end - xdp->data; + xdp_frame->headroom = headroom - sizeof(*xdp_frame); + xdp_frame->metasize = metasize; + + /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ + xdp_frame->mem = xdp->rxq->mem; + + return xdp_frame; +} + static inline void xdp_return_frame(void *data, struct xdp_mem_info *mem) { -- cgit v1.2.3 From 1ffcbc8537d0bc32aaca7000cb9c904ec4b6300f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:47 +0200 Subject: tun: convert to use generic xdp_frame and xdp_return_frame API The tuntap driver invented its own driver-specific way of queuing XDP packets, by storing the xdp_buff information in the top of the XDP frame data. Convert it over to use the more generic xdp_frame structure.
The main problem with the in-driver method is that the xdp_rxq_info pointer cannot be trusted/used when dequeueing the frame. V3: Remove check based on feedback from Jason Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/if_tun.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index fd00170b494f..3d2996dc7d85 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -22,7 +22,7 @@ #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) struct socket *tun_get_socket(struct file *); struct ptr_ring *tun_get_tx_ring(struct file *file); -bool tun_is_xdp_buff(void *ptr); +bool tun_is_xdp_frame(void *ptr); void *tun_xdp_to_ptr(void *ptr); void *tun_ptr_to_xdp(void *ptr); void tun_ptr_free(void *ptr); @@ -39,7 +39,7 @@ static inline struct ptr_ring *tun_get_tx_ring(struct file *f) { return ERR_PTR(-EINVAL); } -static inline bool tun_is_xdp_buff(void *ptr) +static inline bool tun_is_xdp_frame(void *ptr) { return false; } -- cgit v1.2.3 From 70280ed91cb8acb43e8fd7a8094840846c172ac5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:57 +0200 Subject: bpf: cpumap convert to use generic xdp_frame The generic xdp_frame format was inspired by cpumap's own internal xdp_pkt format. It is now time to convert it over to the generic xdp_frame format. The cpumap needs one extra field dev_rx. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/xdp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 756c42811e78..ea3773f94f65 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -67,6 +67,7 @@ struct xdp_frame { * while mem info is valid on remote CPU.
*/ struct xdp_mem_info mem; + struct net_device *dev_rx; /* used by cpumap */ }; /* Convert xdp_buff to xdp_frame */ -- cgit v1.2.3 From 8d5d88527587516bd58ff0f3810f07c38e65e2be Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:12 +0200 Subject: xdp: rhashtable with allocator ID to pointer mapping Use the IDA infrastructure for getting a cyclic increasing ID number, that is used for keeping track of each registered allocator per RX-queue xdp_rxq_info. Instead of using the IDR infrastructure, which uses a radix tree, use a dynamic rhashtable, for creating ID to pointer lookup table, because this is faster. The problem that is being solved here is that the xdp_rxq_info pointer (stored in xdp_buff) cannot be used directly, as the guaranteed lifetime is too short. The info is needed on a (potentially) remote CPU during DMA-TX completion time. In an xdp_frame the xdp_mem_info is stored, when it got converted from an xdp_buff, which is sufficient for the simple page refcnt based recycle schemes. For more advanced allocators there is a need to store a pointer to the registered allocator. Thus, there is a need to guard the lifetime or validity of the allocator pointer, which is done through this rhashtable ID map to pointer. The removal and validity of the allocator and helper struct xdp_mem_allocator is guarded by RCU. The allocator will be created by the driver, and registered with xdp_rxq_info_reg_mem_model(). It is up for debate who is responsible for freeing the allocator pointer or invoking the allocator destructor function. In any case, this must happen via RCU freeing. Use the IDA infrastructure for getting a cyclic increasing ID number, that is used for keeping track of each registered allocator per RX-queue xdp_rxq_info.
V4: Per req of Jason Wang - Use xdp_rxq_info_reg_mem_model() in all drivers implementing XDP_REDIRECT, even-though it's not strictly necessary when allocator==NULL for type MEM_TYPE_PAGE_SHARED (given it's zero). V6: Per req of Alex Duyck - Introduce rhashtable_lookup() call in later patch V8: Address sparse should be static warnings (from kbuild test robot) Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/xdp.h | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index ea3773f94f65..5f67c62540aa 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -41,6 +41,7 @@ enum xdp_mem_type { struct xdp_mem_info { u32 type; /* enum xdp_mem_type, but known size type */ + u32 id; }; struct xdp_rxq_info { @@ -99,18 +100,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) return xdp_frame; } -static inline -void xdp_return_frame(void *data, struct xdp_mem_info *mem) -{ - if (mem->type == MEM_TYPE_PAGE_SHARED) - page_frag_free(data); - - if (mem->type == MEM_TYPE_PAGE_ORDER0) { - struct page *page = virt_to_page(data); /* Assumes order0 page*/ - - put_page(page); - } -} +void xdp_return_frame(void *data, struct xdp_mem_info *mem); int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); -- cgit v1.2.3 From ff7d6b27f894f1469dc51ccb828b7363ccd9799f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:17 +0200 Subject: page_pool: refurbish version of page_pool code Need a fast page recycle mechanism for ndo_xdp_xmit API for returning pages on DMA-TX completion time, which have good cross CPU performance, given DMA-TX completion time can happen on a remote CPU. Refurbish my page_pool code, that was presented[1] at MM-summit 2016. Adapted page_pool code to not depend the page allocator and integration into struct page. 
The DMA mapping feature is kept, even-though it will not be activated/used in this patchset. [1] http://people.netfilter.org/hawk/presentations/MM-summit2016/generic_page_pool_mm_summit2016.pdf V2: Adjustments requested by Tariq - Changed page_pool_create return codes, don't return NULL, only ERR_PTR, as this simplifies err handling in drivers. V4: many small improvements and cleanups - Add DOC comment section, that can be used by kernel-doc - Improve fallback mode, to work better with refcnt based recycling e.g. remove a WARN as pointed out by Tariq e.g. quicker fallback if ptr_ring is empty. V5: Fixed SPDX license as pointed out by Alexei V6: Adjustments requested by Eric Dumazet - Adjust ____cacheline_aligned_in_smp usage/placement - Move rcu_head in struct page_pool - Free pages quicker on destroy, minimize resources delayed an RCU period - Remove code for forward/backward compat ABI interface V8: Issues found by kbuild test robot - Address sparse should be static warnings - Only compile+link when a driver use/select page_pool, mlx5 selects CONFIG_PAGE_POOL, although its first used in two patches Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/page_pool.h | 129 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 include/net/page_pool.h (limited to 'include') diff --git a/include/net/page_pool.h b/include/net/page_pool.h new file mode 100644 index 000000000000..1fe77db59518 --- /dev/null +++ b/include/net/page_pool.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * page_pool.h + * Author: Jesper Dangaard Brouer + * Copyright (C) 2016 Red Hat, Inc. + */ + +/** + * DOC: page_pool allocator + * + * This page_pool allocator is optimized for the XDP mode that + * uses one-frame-per-page, but have fallbacks that act like the + * regular page allocator APIs. + * + * Basic use involve replacing alloc_pages() calls with the + * page_pool_alloc_pages() call. 
Drivers should likely use + * page_pool_dev_alloc_pages() replacing dev_alloc_pages(). + * + * If page_pool handles DMA mapping (use page->private), then API user + * is responsible for invoking page_pool_put_page() once. In-case of + * elevated refcnt, the DMA state is released, assuming other users of + * the page will eventually call put_page(). + * + * If no DMA mapping is done, then it can act as shim-layer that + * fall-through to alloc_page. As no state is kept on the page, the + * regular put_page() call is sufficient. + */ +#ifndef _NET_PAGE_POOL_H +#define _NET_PAGE_POOL_H + +#include <linux/mm.h> /* Needed by ptr_ring */ +#include <linux/ptr_ring.h> +#include <linux/dma-direction.h> + +#define PP_FLAG_DMA_MAP 1 /* Should page_pool do the DMA map/unmap */ +#define PP_FLAG_ALL PP_FLAG_DMA_MAP + +/* + * Fast allocation side cache array/stack + * + * The cache size and refill watermark is related to the network + * use-case. The NAPI budget is 64 packets. After a NAPI poll the RX + * ring is usually refilled and the max consumed elements will be 64, + * thus a natural max size of objects needed in the cache. + * + * Keeping room for more objects, is due to XDP_DROP use-case. As + * XDP_DROP allows the opportunity to recycle objects directly into + * this array, as it shares the same softirq/NAPI protection. If + * cache is already full (or partly full) then the XDP_DROP recycles + * would have to take a slower code path.
+ */ +#define PP_ALLOC_CACHE_SIZE 128 +#define PP_ALLOC_CACHE_REFILL 64 +struct pp_alloc_cache { + u32 count; + void *cache[PP_ALLOC_CACHE_SIZE]; +}; + +struct page_pool_params { + unsigned int flags; + unsigned int order; + unsigned int pool_size; + int nid; /* Numa node id to allocate from pages from */ + struct device *dev; /* device, for DMA pre-mapping purposes */ + enum dma_data_direction dma_dir; /* DMA mapping direction */ +}; + +struct page_pool { + struct rcu_head rcu; + struct page_pool_params p; + + /* + * Data structure for allocation side + * + * Drivers allocation side usually already perform some kind + * of resource protection. Piggyback on this protection, and + * require driver to protect allocation side. + * + * For NIC drivers this means, allocate a page_pool per + * RX-queue. As the RX-queue is already protected by + * Softirq/BH scheduling and napi_schedule. NAPI schedule + * guarantee that a single napi_struct will only be scheduled + * on a single CPU (see napi_schedule). + */ + struct pp_alloc_cache alloc ____cacheline_aligned_in_smp; + + /* Data structure for storing recycled pages. + * + * Returning/freeing pages is more complicated synchronization + * wise, because free's can happen on remote CPUs, with no + * association with allocation resource. + * + * Use ptr_ring, as it separates consumer and producer + * effeciently, it a way that doesn't bounce cache-lines. + * + * TODO: Implement bulk return pages into this structure. 
+ */ + struct ptr_ring ring; +}; + +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); + +static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc_pages(pool, gfp); +} + +struct page_pool *page_pool_create(const struct page_pool_params *params); + +void page_pool_destroy(struct page_pool *pool); + +/* Never call this directly, use helpers below */ +void __page_pool_put_page(struct page_pool *pool, + struct page *page, bool allow_direct); + +static inline void page_pool_put_page(struct page_pool *pool, struct page *page) +{ + __page_pool_put_page(pool, page, false); +} +/* Very limited use-cases allow recycle direct */ +static inline void page_pool_recycle_direct(struct page_pool *pool, + struct page *page) +{ + __page_pool_put_page(pool, page, true); +} + +#endif /* _NET_PAGE_POOL_H */ -- cgit v1.2.3 From 57d0a1c1ac9e6a836bbab4698ba2a2e03f64bf1b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:22 +0200 Subject: xdp: allow page_pool as an allocator type in xdp_return_frame New allocator type MEM_TYPE_PAGE_POOL for page_pool usage. The registered allocator page_pool pointer is not available directly from xdp_rxq_info, but it could be (if needed). For now, the driver should keep separate track of the page_pool pointer, which it should use for RX-ring page allocation. As suggested by Saeed, to maintain a symmetric API it is the driver's responsibility to allocate/create and free/destroy the page_pool. Thus, after the driver has called xdp_rxq_info_unreg(), it is the driver's responsibility to free the page_pool, but with an RCU free call. This is done easily via the page_pool helper page_pool_destroy() (which avoids touching any driver code during the RCU callback, which could happen after the driver has been unloaded).
V8: address issues found by kbuild test robot - Address sparse should be static warnings - Allow xdp.o to be compiled without page_pool.o V9: Remove inline from .c file, compiler knows best Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/page_pool.h | 14 ++++++++++++++ include/net/xdp.h | 3 +++ 2 files changed, 17 insertions(+) (limited to 'include') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 1fe77db59518..c79087153148 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -117,7 +117,12 @@ void __page_pool_put_page(struct page_pool *pool, static inline void page_pool_put_page(struct page_pool *pool, struct page *page) { + /* When page_pool isn't compiled-in, net/core/xdp.c doesn't + * allow registering MEM_TYPE_PAGE_POOL, but shield linker. + */ +#ifdef CONFIG_PAGE_POOL __page_pool_put_page(pool, page, false); +#endif } /* Very limited use-cases allow recycle direct */ static inline void page_pool_recycle_direct(struct page_pool *pool, @@ -126,4 +131,13 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, __page_pool_put_page(pool, page, true); } +static inline bool is_page_pool_compiled_in(void) +{ +#ifdef CONFIG_PAGE_POOL + return true; +#else + return false; +#endif +} + #endif /* _NET_PAGE_POOL_H */ diff --git a/include/net/xdp.h b/include/net/xdp.h index 5f67c62540aa..d0ee437753dc 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -36,6 +36,7 @@ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ + MEM_TYPE_PAGE_POOL, MEM_TYPE_MAX, }; @@ -44,6 +45,8 @@ struct xdp_mem_info { u32 id; }; +struct page_pool; + struct xdp_rxq_info { struct net_device *dev; u32 queue_index; -- cgit v1.2.3 From 039930945a72d9af5ff04ae9b9e60658a52e0770 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:32 +0200 Subject: xdp: transition into using xdp_frame for return API 
Changing the xdp_return_frame() API to take a struct xdp_frame as its argument seems like a natural choice. But there are some subtle performance details here that need extra care, making this a deliberate choice. De-referencing the xdp_frame on a remote CPU during DMA-TX completion results in the cache-line changing to the "Shared" state. Later, when the page is reused for RX, this xdp_frame cache-line is written, which changes the state to "Modified". This situation already happens (naturally) for virtio_net, tun and cpumap, as the xdp_frame pointer is the queued object. In tun and cpumap, the ptr_ring is used for efficiently transferring cache-lines (with pointers) between CPUs. Thus, the only option is to de-reference the xdp_frame. It is only the ixgbe driver that had an optimization in which it could avoid de-referencing the xdp_frame. The driver already has a TX-ring queue, which (in case of remote DMA-TX completion) has to be transferred between CPUs anyhow. In this data area, we stored a struct xdp_mem_info and a data pointer, which allowed us to avoid de-referencing the xdp_frame. To compensate for this, a prefetchw is used to tell the cache coherency protocol about our access pattern. My benchmarks show that this prefetchw is enough to compensate the ixgbe driver. V7: Adjust for commit d9314c474d4f ("i40e: add support for XDP_REDIRECT") V8: Adjust for commit bd658dda4237 ("net/mlx5e: Separate dma base address and offset in dma_sync call") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. 
Miller --- include/net/xdp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index d0ee437753dc..137ad5f9f40f 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -103,7 +103,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) return xdp_frame; } -void xdp_return_frame(void *data, struct xdp_mem_info *mem); +void xdp_return_frame(struct xdp_frame *xdpf); int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); -- cgit v1.2.3 From 44fa2dbd475996ddc8f3a0e6113dee983e0ee3aa Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:37 +0200 Subject: xdp: transition into using xdp_frame for ndo_xdp_xmit Changing API ndo_xdp_xmit to take a struct xdp_frame instead of struct xdp_buff. This brings xdp_return_frame and ndo_xdp_xmit in sync. This builds towards changing the API further to become a bulk API, because xdp_buff is not a queue-able object while xdp_frame is. V4: Adjust for commit 59655a5b6c83 ("tuntap: XDP_TX can use native XDP") V7: Adjust for commit d9314c474d4f ("i40e: add support for XDP_REDIRECT") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cf44503ea81a..14e0777ffcfb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1165,7 +1165,7 @@ struct dev_ifalias { * This function is used to set or query state related to XDP on the * netdevice and manage BPF offload. See definition of * enum bpf_netdev_command for details. - * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); + * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp); * This function is used to submit a XDP packet for transmit on a * netdevice. 
* void (*ndo_xdp_flush)(struct net_device *dev); @@ -1356,7 +1356,7 @@ struct net_device_ops { int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); int (*ndo_xdp_xmit)(struct net_device *dev, - struct xdp_buff *xdp); + struct xdp_frame *xdp); void (*ndo_xdp_flush)(struct net_device *dev); }; -- cgit v1.2.3 From 032234d8231909ac049f22ea3b408487e1c103eb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 10:00:39 -0700 Subject: net/ipv6: Make __inet6_bind static BPF core gets access to __inet6_bind via ipv6_bpf_stub_impl, so it is not invoked directly outside of af_inet6.c. Make it static and move inet6_bind after to avoid forward declaration. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 836f31af1369..68b167d98879 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1044,8 +1044,6 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info); void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); int inet6_release(struct socket *sock); -int __inet6_bind(struct sock *sock, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); -- cgit v1.2.3 From bdb7cc643fc9db8d6ed9a2b9e524e27ac5882029 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Mon, 16 Apr 2018 13:42:16 -0400 Subject: ipv6: Count interface receive statistics on the ingress netdev The statistics such as InHdrErrors should be counted on the ingress netdev rather than on the dev from the dst, which is the egress. Signed-off-by: Stephen Suryaputra Signed-off-by: David S. 
Miller --- include/net/addrconf.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 378d601258be..8312cc25a3af 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -307,6 +307,20 @@ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev) return rcu_dereference_rtnl(dev->ip6_ptr); } +/** + * __in6_dev_get_safely - get inet6_dev pointer from netdevice + * @dev: network device + * + * This is a safer version of __in6_dev_get + */ +static inline struct inet6_dev *__in6_dev_get_safely(const struct net_device *dev) +{ + if (likely(dev)) + return rcu_dereference_rtnl(dev->ip6_ptr); + else + return NULL; +} + /** * in6_dev_get - get inet6_dev pointer from netdevice * @dev: network device -- cgit v1.2.3 From 72f6d71e491e6ce269b564865b21fab0a4402dd3 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 17 Apr 2018 14:11:28 +0800 Subject: vxlan: add ttl inherit support Like tos inherit, ttl inherit should also means inherit the inner protocol's ttl values, which actually not implemented in vxlan yet. But we could not treat ttl == 0 as "use the inner TTL", because that would be used also when the "ttl" option is not specified and that would be a behavior change, and breaking real use cases. So add a different attribute IFLA_VXLAN_TTL_INHERIT when "ttl inherit" is specified with ip cmd. Reported-by: Jianlin Shi Suggested-by: Jiri Benc Signed-off-by: Hangbin Liu Signed-off-by: David S. 
Miller --- include/net/ip_tunnels.h | 11 +++++++++++ include/net/vxlan.h | 1 + include/uapi/linux/if_link.h | 1 + 3 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 540a4b4417bf..751646adc769 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -379,6 +379,17 @@ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, return 0; } +static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph, + const struct sk_buff *skb) +{ + if (skb->protocol == htons(ETH_P_IP)) + return iph->ttl; + else if (skb->protocol == htons(ETH_P_IPV6)) + return ((const struct ipv6hdr *)iph)->hop_limit; + else + return 0; +} + /* Propogate ECN bits out */ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, const struct sk_buff *skb) diff --git a/include/net/vxlan.h b/include/net/vxlan.h index ad73d8b3fcc2..b99a02ae3934 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -262,6 +262,7 @@ struct vxlan_dev { #define VXLAN_F_COLLECT_METADATA 0x2000 #define VXLAN_F_GPE 0x4000 #define VXLAN_F_IPV6_LINKLOCAL 0x8000 +#define VXLAN_F_TTL_INHERIT 0x10000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 68699f654118..b85266420bfb 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -516,6 +516,7 @@ enum { IFLA_VXLAN_COLLECT_METADATA, IFLA_VXLAN_LABEL, IFLA_VXLAN_GPE, + IFLA_VXLAN_TTL_INHERIT, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) -- cgit v1.2.3 From a919525ad832d2bb1388b2303832a2307b30aeff Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:07 -0700 Subject: net: Move fib_convert_metrics to metrics file Move logic of fib_convert_metrics into ip_metrics_convert. 
This allows the code that converts netlink attributes into metrics struct to be re-used in a later patch by IPv6. This is mostly a code move with the following changes to variable names: - fi->fib_net becomes net - fc_mx and fc_mx_len are passed as inputs pulled from fib_config - metrics array is passed as an input from fi->fib_metrics->metrics Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index ecffd843e7b8..dc4a2d6e58a5 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -396,6 +396,9 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk, return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); } +int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, + u32 *metrics); + u32 ip_idents_reserve(u32 hash, int segs); void __ip_select_ident(struct net *net, struct iphdr *iph, int segs); -- cgit v1.2.3 From 7aef6859ee91ea867a3dff9ba47bca9b2de382f6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:10 -0700 Subject: net/ipv6: Pass net to fib6_update_sernum Pass net namespace to fib6_update_sernum. It can not be marked const as fib6_new_sernum will change ipv6.fib6_sernum. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 5e86fd9dc857..f0aaf1c8f1a8 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -408,7 +408,7 @@ void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb); -void fib6_update_sernum(struct rt6_info *rt); +void fib6_update_sernum(struct net *net, struct rt6_info *rt); void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt); #ifdef CONFIG_IPV6_MULTIPLE_TABLES -- cgit v1.2.3 From afb1d4b59311a8252f67c214b37ec69d8100cb55 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:11 -0700 Subject: net/ipv6: Pass net namespace to route functions Pass network namespace reference into route add, delete and get functions. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 08b132381984..1130a1144dfd 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -101,8 +101,8 @@ void ip6_route_cleanup(void); int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack); -int ip6_ins_rt(struct rt6_info *); -int ip6_del_rt(struct rt6_info *); +int ip6_ins_rt(struct net *net, struct rt6_info *rt); +int ip6_del_rt(struct net *net, struct rt6_info *rt); void rt6_flush_exceptions(struct rt6_info *rt); int rt6_remove_exception_rt(struct rt6_info *rt); @@ -137,7 +137,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); -struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, +struct rt6_info 
*addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, @@ -147,9 +147,11 @@ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, * support functions for ND * */ -struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, +struct rt6_info *rt6_get_dflt_router(struct net *net, + const struct in6_addr *addr, struct net_device *dev); -struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, +struct rt6_info *rt6_add_dflt_router(struct net *net, + const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref); void rt6_purge_dflt_routers(struct net *net); -- cgit v1.2.3 From e8478e80e5a74f4ce47b043735f0066588fb64c7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:13 -0700 Subject: net/ipv6: Save route type in rt6_info The RTN_ type for IPv6 FIB entries is currently embedded in rt6i_flags and dst.error. Since dst is going to be removed, it can no longer be relied on for FIB dumps so save the route type as fib6_type. fc_type is set in current users based on the algorithm in rt6_fill_node: - rt6i_flags contains RTF_LOCAL: fc_type = RTN_LOCAL - rt6i_flags contains RTF_ANYCAST: fc_type = RTN_ANYCAST - else fc_type = RTN_UNICAST Similarly, fib6_type is set in the rt6_info templates based on the RTF_REJECT section of rt6_fill_node converting dst.error to RTN type. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index f0aaf1c8f1a8..0165820bbafb 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -174,6 +174,7 @@ struct rt6_info { int rt6i_nh_weight; unsigned short rt6i_nfheader_len; u8 rt6i_protocol; + u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, unused:6; -- cgit v1.2.3 From 5e670d844b2a4e47d1b9b9aceb14dd3c12a6d4bf Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:14 -0700 Subject: net/ipv6: Move nexthop data to fib6_nh Introduce fib6_nh structure and move nexthop related data from rt6_info and rt6_info.dst to fib6_nh. References to dev, gateway or lwtstate from a FIB lookup perspective are converted to use fib6_nh; datapath references to dst version are left as is. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 16 ++++++++++++---- include/net/ip6_route.h | 6 +++--- 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 0165820bbafb..f0a88370ba95 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -127,6 +127,16 @@ struct rt6_exception { #define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT) #define FIB6_MAX_DEPTH 5 +struct fib6_nh { + struct in6_addr nh_gw; + struct net_device *nh_dev; + struct lwtunnel_state *nh_lwtstate; + + unsigned int nh_flags; + atomic_t nh_upper_bound; + int nh_weight; +}; + struct rt6_info { struct dst_entry dst; struct rt6_info __rcu *rt6_next; @@ -149,12 +159,9 @@ struct rt6_info { */ struct list_head rt6i_siblings; unsigned int rt6i_nsiblings; - atomic_t rt6i_nh_upper_bound; atomic_t rt6i_ref; - unsigned int rt6i_nh_flags; - /* These are in a separate cache line. 
*/ struct rt6key rt6i_dst ____cacheline_aligned_in_smp; u32 rt6i_flags; @@ -171,13 +178,14 @@ struct rt6_info { u32 rt6i_metric; u32 rt6i_pmtu; /* more non-fragment space at head required */ - int rt6i_nh_weight; unsigned short rt6i_nfheader_len; u8 rt6i_protocol; u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, unused:6; + + struct fib6_nh fib6_nh; }; #define for_each_fib6_node_rt_rcu(fn) \ diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 1130a1144dfd..655e13017a45 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -273,10 +273,10 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b) { - return a->dst.dev == b->dst.dev && + return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev && a->rt6i_idev == b->rt6i_idev && - ipv6_addr_equal(&a->rt6i_gateway, &b->rt6i_gateway) && - !lwtunnel_cmp_encap(a->dst.lwtstate, b->dst.lwtstate); + ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) && + !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } #endif -- cgit v1.2.3 From d4ead6b34b67fd711639324b6465a050bcb197d4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:16 -0700 Subject: net/ipv6: move metrics from dst to rt6_info Similar to IPv4, add fib metrics to the fib struct, which at the moment is rt6_info. Will be moved to fib6_info in a later patch. Copy metrics into dst by reference using refcount. To make the transition: - add dst_metrics to rt6_info. Default to dst_default_metrics if no metrics are passed during route add. 
No need for a separate pmtu entry; it can reference the MTU slot in fib6_metrics - ip6_convert_metrics allocates memory in the FIB entry and uses ip_metrics_convert to copy from netlink attribute to metrics entry - the convert metrics call is done in ip6_route_info_create simplifying the route add path + fib6_commit_metrics and fib6_copy_metrics and the temporary mx6_config are no longer needed - add fib6_metric_set helper to change the value of a metric in the fib entry since dst_metric_set can no longer be used - cow_metrics for IPv6 can drop to dst_cow_metrics_generic - rt6_dst_from_metrics_check is no longer needed - rt6_fill_node needs the FIB entry and dst as separate arguments to keep compatibility with existing output. Current dst address is renamed to dest. (to be consistent with IPv4 rt6_fill_node really should be split into 2 functions similar to fib_dump_info and rt_fill_info) - rt6_fill_node no longer needs the temporary metrics variable Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index f0a88370ba95..1f8dc9d12abb 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -94,11 +94,6 @@ struct fib6_gc_args { #define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) #endif -struct mx6_config { - const u32 *mx; - DECLARE_BITMAP(mx_valid, RTAX_MAX); -}; - /* * routing information * @@ -176,7 +171,6 @@ struct rt6_info { struct rt6_exception_bucket __rcu *rt6i_exception_bucket; u32 rt6i_metric; - u32 rt6i_pmtu; /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; u8 rt6i_protocol; @@ -185,6 +179,8 @@ struct rt6_info { should_flush:1, unused:6; + struct dst_metrics *fib6_metrics; +#define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] struct fib6_nh fib6_nh; }; @@ -390,8 +386,7 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), void *arg); int fib6_add(struct fib6_node *root, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc, - struct netlink_ext_ack *extack); + struct nl_info *info, struct netlink_ext_ack *extack); int fib6_del(struct rt6_info *rt, struct nl_info *info); void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, @@ -420,6 +415,12 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb); void fib6_update_sernum(struct net *net, struct rt6_info *rt); void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt); +void fib6_metric_set(struct rt6_info *f6i, int metric, u32 val); +static inline bool fib6_metric_locked(struct rt6_info *f6i, int metric) +{ + return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); +} + #ifdef CONFIG_IPV6_MULTIPLE_TABLES int fib6_rules_init(void); void fib6_rules_cleanup(void); -- cgit v1.2.3 From 14895687d36805f051bb54014c32e48e5937f7e1 Mon Sep 17 00:00:00 2001 From: David Ahern 
Date: Tue, 17 Apr 2018 17:33:17 -0700 Subject: net/ipv6: move expires into rt6_info Add expires to rt6_info for FIB entries, and add fib6 helpers to manage it. Data path use of dst.expires remains. The transition is fairly straightforward: when working with fib entries, rt->dst.expires is just rt->expires, rt6_clean_expires is replaced with fib6_clean_expires, rt6_set_expires becomes fib6_set_expires, and rt6_check_expired becomes fib6_check_expired, where the fib6 versions are added by this patch. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 1f8dc9d12abb..c73b985734f5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -179,6 +179,7 @@ struct rt6_info { should_flush:1, unused:6; + unsigned long expires; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] struct fib6_nh fib6_nh; @@ -197,6 +198,26 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) return ((struct rt6_info *)dst)->rt6i_idev; } +static inline void fib6_clean_expires(struct rt6_info *f6i) +{ + f6i->rt6i_flags &= ~RTF_EXPIRES; + f6i->expires = 0; +} + +static inline void fib6_set_expires(struct rt6_info *f6i, + unsigned long expires) +{ + f6i->expires = expires; + f6i->rt6i_flags |= RTF_EXPIRES; +} + +static inline bool fib6_check_expired(const struct rt6_info *f6i) +{ + if (f6i->rt6i_flags & RTF_EXPIRES) + return time_after(jiffies, f6i->expires); + return false; +} + static inline void rt6_clean_expires(struct rt6_info *rt) { rt->rt6i_flags &= ~RTF_EXPIRES; @@ -211,11 +232,9 @@ static inline void rt6_set_expires(struct rt6_info *rt, unsigned long expires) static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) { - struct rt6_info *rt; + if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from) + rt0->dst.expires = 
rt0->from->expires; - for (rt = rt0; rt && !(rt->rt6i_flags & RTF_EXPIRES); rt = rt->from); - if (rt && rt != rt0) - rt0->dst.expires = rt->dst.expires; dst_set_expires(&rt0->dst, timeout); rt0->rt6i_flags |= RTF_EXPIRES; } -- cgit v1.2.3 From 421842edeaf62c4e180b687f5a4efca8c19c49ad Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:18 -0700 Subject: net/ipv6: Add fib6_null_entry ip6_null_entry will stay a dst based return for lookups that fail to match an entry. Add a new fib6_null_entry which constitutes the root node and leafs for fibs. Replace existing references to ip6_null_entry with the new fib6_null_entry when dealing with FIBs. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index c29f09cfc9d7..74e4e1e449d5 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -60,7 +60,8 @@ struct netns_ipv6 { #endif struct xt_table *ip6table_nat; #endif - struct rt6_info *ip6_null_entry; + struct rt6_info *fib6_null_entry; + struct rt6_info *ip6_null_entry; struct rt6_statistics *rt6_stats; struct timer_list ip6_fib_timer; struct hlist_head *fib_table_hash; -- cgit v1.2.3 From 3b6761d18bc11f2af2a6fc494e9026d39593f22c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:20 -0700 Subject: net/ipv6: Move dst flags to booleans in fib entries Continuing to wean FIB paths off of dst_entry, use a bool to hold requests for certain dst settings. Add a helper to convert the flags to DST flags when a FIB entry is converted to a dst_entry. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index c73b985734f5..159f651dee55 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -177,7 +177,10 @@ struct rt6_info { u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, - unused:6; + dst_nocount:1, + dst_nopolicy:1, + dst_host:1, + unused:3; unsigned long expires; struct dst_metrics *fib6_metrics; -- cgit v1.2.3 From f8a1b43b709d8ef33a8de2f8f35095b4a4413713 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:21 -0700 Subject: net/ipv6: Create a neigh_lookup for FIB entries The router discovery code has a FIB entry and wants to validate the gateway has a neighbor entry. Refactor the existing dst_neigh_lookup for IPv6 and create a new function that takes the gateway and device and returns a neighbor entry. Use the new function in ndisc_router_discovery to validate the gateway. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 655e13017a45..cb6fb7e16a28 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -279,4 +279,7 @@ static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b) !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } +struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, + struct net_device *dev, struct sk_buff *skb, + const void *daddr); #endif -- cgit v1.2.3 From acb54e3cba404c20f07733f3222c0418a7724a5b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:22 -0700 Subject: net/ipv6: Add gfp_flags to route add functions Most FIB entries can be added using memory allocated with GFP_KERNEL. Add gfp_flags to ip6_route_add and addrconf_dst_alloc. 
Code paths that can be reached from the packet path (e.g., ndisc and autoconfig) or atomic notifiers use GFP_ATOMIC; paths from user context (adding addresses and routes) use GFP_KERNEL. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index cb6fb7e16a28..ff70266e30d7 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -100,7 +100,8 @@ void ip6_route_cleanup(void); int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); -int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack); +int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack); int ip6_ins_rt(struct net *net, struct rt6_info *rt); int ip6_del_rt(struct net *net, struct rt6_info *rt); @@ -138,7 +139,8 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, - const struct in6_addr *addr, bool anycast); + const struct in6_addr *addr, bool anycast, + gfp_t gfp_flags); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags); -- cgit v1.2.3 From 23fb93a4d3f118a900790066d03368a296dce0d6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:23 -0700 Subject: net/ipv6: Cleanup exception and cache route handling IPv6 FIB will only contain FIB entries with exception routes added to the FIB entry. Once this transformation is complete, FIB lookups will return a fib6_info with the lookup functions still returning a dst based rt6_info. The current code uses rt6_info for both paths and overloads the rt6_info variable usually called 'rt'. This patch introduces a new 'f6i' variable name for the result of the FIB lookup and keeps 'rt' as the dst based return variable. 
'f6i' becomes a fib6_info in a later patch which is why it is introduced as f6i now; avoids the additional churn in the later patch. In addition, remove RTF_CACHE and dst checks from fib6 add and delete since they can not happen now and will never happen after the data type flip. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index ff70266e30d7..686cdc7f356a 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -106,7 +106,6 @@ int ip6_ins_rt(struct net *net, struct rt6_info *rt); int ip6_del_rt(struct net *net, struct rt6_info *rt); void rt6_flush_exceptions(struct rt6_info *rt); -in