From 8142b227ef43119e19acf6122a9eea1a82492645 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Mon, 31 Mar 2014 18:14:18 +0400 Subject: netfilter: nf_conntrack: flush net_gre->keymap_list only from gre helper nf_ct_gre_keymap_flush() removes a nf_ct_gre_keymap object from net_gre->keymap_list and frees the object. But it doesn't clean a reference on this object from ct_pptp_info->keymap[dir]. Then nf_ct_gre_keymap_destroy() may release the same object again. So nf_ct_gre_keymap_flush() can be called only when we are sure that when nf_ct_gre_keymap_destroy will not be called. nf_ct_gre_keymap is created by nf_ct_gre_keymap_add() and the right way to destroy it is to call nf_ct_gre_keymap_destroy(). This patch marks nf_ct_gre_keymap_flush() as static, so this patch can break compilation of third party modules, which use nf_ct_gre_keymap_flush. I'm not sure this is the right way to deprecate this function. [ 226.540793] general protection fault: 0000 [#1] SMP [ 226.541750] Modules linked in: nf_nat_pptp nf_nat_proto_gre nf_conntrack_pptp nf_conntrack_proto_gre ip_gre ip_tunnel gre ppp_deflate bsd_comp ppp_async crc_ccitt ppp_generic slhc xt_nat iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack veth tun bridge stp llc ppdev microcode joydev pcspkr serio_raw virtio_console virtio_balloon floppy parport_pc parport pvpanic i2c_piix4 virtio_net drm_kms_helper ttm ata_generic virtio_pci virtio_ring virtio drm i2c_core pata_acpi [last unloaded: ip_tunnel] [ 226.541776] CPU: 0 PID: 49 Comm: kworker/u4:2 Not tainted 3.14.0-rc8+ #101 [ 226.541776] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 226.541776] Workqueue: netns cleanup_net [ 226.541776] task: ffff8800371e0000 ti: ffff88003730c000 task.ti: ffff88003730c000 [ 226.541776] RIP: 0010:[] [] __list_del_entry+0x29/0xd0 [ 226.541776] RSP: 0018:ffff88003730dbd0 EFLAGS: 00010a83 [ 226.541776] RAX: 6b6b6b6b6b6b6b6b RBX: ffff8800374e6c40 RCX: dead000000200200 [ 226.541776] RDX: 6b6b6b6b6b6b6b6b RSI: ffff8800371e07d0 RDI: ffff8800374e6c40 [ 226.541776] RBP: ffff88003730dbd0 R08: 0000000000000000 R09: 0000000000000000 [ 226.541776] R10: 0000000000000001 R11: ffff88003730d92e R12: 0000000000000002 [ 226.541776] R13: ffff88007a4c42d0 R14: ffff88007aef0000 R15: ffff880036cf0018 [ 226.541776] FS: 0000000000000000(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000 [ 226.541776] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 226.541776] CR2: 00007f07f643f7d0 CR3: 0000000036fd2000 CR4: 00000000000006f0 [ 226.541776] Stack: [ 226.541776] ffff88003730dbe8 ffffffff81389c5d ffff8800374ffbe4 ffff88003730dc28 [ 226.541776] ffffffffa0162a43 ffffffffa01627c5 ffff88007a4c42d0 ffff88007aef0000 [ 226.541776] ffffffffa01651c0 ffff88007a4c45e0 ffff88007aef0000 ffff88003730dc40 [ 226.541776] Call Trace: [ 226.541776] [] list_del+0xd/0x30 [ 226.541776] [] nf_ct_gre_keymap_destroy+0x283/0x2d0 [nf_conntrack_proto_gre] [ 226.541776] [] ? nf_ct_gre_keymap_destroy+0x5/0x2d0 [nf_conntrack_proto_gre] [ 226.541776] [] gre_destroy+0x27/0x70 [nf_conntrack_proto_gre] [ 226.541776] [] destroy_conntrack+0x83/0x200 [nf_conntrack] [ 226.541776] [] ? destroy_conntrack+0x27/0x200 [nf_conntrack] [ 226.541776] [] ? nf_conntrack_hash_check_insert+0x2e0/0x2e0 [nf_conntrack] [ 226.541776] [] nf_conntrack_destroy+0x72/0x180 [ 226.541776] [] ? nf_conntrack_destroy+0x5/0x180 [ 226.541776] [] ? kill_l3proto+0x20/0x20 [nf_conntrack] [ 226.541776] [] nf_ct_iterate_cleanup+0x14e/0x170 [nf_conntrack] [ 226.541776] [] nf_ct_l4proto_pernet_unregister+0x5b/0x90 [nf_conntrack] [ 226.541776] [] proto_gre_net_exit+0x19/0x30 [nf_conntrack_proto_gre] [ 226.541776] [] ops_exit_list.isra.1+0x39/0x60 [ 226.541776] [] cleanup_net+0x100/0x1d0 [ 226.541776] [] process_one_work+0x1ea/0x4f0 [ 226.541776] [] ? process_one_work+0x188/0x4f0 [ 226.541776] [] worker_thread+0x11b/0x3a0 [ 226.541776] [] ? process_one_work+0x4f0/0x4f0 [ 226.541776] [] kthread+0xed/0x110 [ 226.541776] [] ? _raw_spin_unlock_irq+0x2c/0x40 [ 226.541776] [] ? kthread_create_on_node+0x200/0x200 [ 226.541776] [] ret_from_fork+0x7c/0xb0 [ 226.541776] [] ? kthread_create_on_node+0x200/0x200 [ 226.541776] Code: 00 00 55 48 8b 17 48 b9 00 01 10 00 00 00 ad de 48 8b 47 08 48 89 e5 48 39 ca 74 29 48 b9 00 02 20 00 00 00 ad de 48 39 c8 74 7a <4c> 8b 00 4c 39 c7 75 53 4c 8b 42 08 4c 39 c7 75 2b 48 89 42 08 [ 226.541776] RIP [] __list_del_entry+0x29/0xd0 [ 226.541776] RSP [ 226.612193] ---[ end trace 985ae23ddfcc357c ]--- Cc: Pablo Neira Ayuso Cc: Patrick McHardy Cc: Jozsef Kadlecsik Cc: "David S. Miller" Signed-off-by: Andrey Vagin Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_proto_gre.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/netfilter/nf_conntrack_proto_gre.h b/include/linux/netfilter/nf_conntrack_proto_gre.h index ec2ffaf418c8..df78dc2b5524 100644 --- a/include/linux/netfilter/nf_conntrack_proto_gre.h +++ b/include/linux/netfilter/nf_conntrack_proto_gre.h @@ -87,7 +87,6 @@ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, /* delete keymap entries */ void nf_ct_gre_keymap_destroy(struct nf_conn *ct); -void nf_ct_gre_keymap_flush(struct net *net); void nf_nat_need_gre(void); #endif /* __KERNEL__ */ -- cgit v1.2.3 From b855d416dc17061ebb271ea7ef1201d100531770 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 12 Apr 2014 13:17:57 +0200 Subject: netfilter: nf_tables: fix nft_cmp_fast failure on big endian for size < 4 nft_cmp_fast is used for equality comparisions of size <= 4. For comparisions of size < 4 byte a mask is calculated that is applied to both the data from userspace (during initialization) and the register value (during runtime). Both values are stored using (in effect) memcpy to a memory area that is then interpreted as u32 by nft_cmp_fast. This works fine on little endian since smaller types have the same base address, however on big endian this is not true and the smaller types are interpreted as a big number with trailing zero bytes. The mask therefore must not include the lower bytes, but the higher bytes on big endian. Add a helper function that does a cpu_to_le32 to switch the bytes on big endian. Since we're dealing with a mask of just consequitive bits, this works out fine. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index cf2b7ae2b9d8..a75fc8e27cd6 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -13,6 +13,16 @@ struct nft_cmp_fast_expr { u8 len; }; +/* Calculate the mask for the nft_cmp_fast expression. On big endian the + * mask needs to include the *upper* bytes when interpreting that data as + * something smaller than the full u32, therefore a cpu_to_le32 is done. + */ +static inline u32 nft_cmp_fast_mask(unsigned int len) +{ + return cpu_to_le32(~0U >> (FIELD_SIZEOF(struct nft_cmp_fast_expr, + data) * BITS_PER_BYTE - len)); +} + extern const struct nft_expr_ops nft_cmp_fast_ops; int nft_cmp_module_init(void); -- cgit v1.2.3 From 30f78d8ebf7f514801e71b88a10c948275168518 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Apr 2014 21:23:36 -0700 Subject: ipv6: Limit mtu to 65575 bytes Francois reported that setting big mtu on loopback device could prevent tcp sessions making progress. We do not support (yet ?) IPv6 Jumbograms and cook corrupted packets. We must limit the IPv6 MTU to (65535 + 40) bytes in theory. Tested: ifconfig lo mtu 70000 netperf -H ::1 Before patch : Throughput : 0.05 Mbits After patch : Throughput : 35484 Mbits Reported-by: Francois WELLENREITER Signed-off-by: Eric Dumazet Acked-by: YOSHIFUJI Hideaki Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/ip6_route.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 3c3bb184eb8f..6c4f5eac98e7 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -32,6 +32,11 @@ struct route_info { #define RT6_LOOKUP_F_SRCPREF_PUBLIC 0x00000010 #define RT6_LOOKUP_F_SRCPREF_COA 0x00000020 +/* We do not (yet ?) support IPv6 jumbograms (RFC 2675) + * Unlike IPv4, hdr->seg_len doesn't include the IPv6 header + */ +#define IP6_MAX_MTU (0xFFFF + sizeof(struct ipv6hdr)) + /* * rt6_srcprefs2flags() and rt6_flags2srcprefs() translate * between IPV6_ADDR_PREFERENCES socket option values -- cgit v1.2.3 From 8c482cdc358ef931ee02262e0a4ef0f29946aa0c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 14 Apr 2014 21:20:12 +0200 Subject: net: filter: seccomp: fix wrong decoding of BPF_S_ANC_SECCOMP_LD_W While reviewing seccomp code, we found that BPF_S_ANC_SECCOMP_LD_W has been wrongly decoded by commit a8fc927780 ("sk-filter: Add ability to get socket filter program (v2)") into the opcode BPF_LD|BPF_B|BPF_ABS although it should have been decoded as BPF_LD|BPF_W|BPF_ABS. In practice, this should not have much side-effect though, as such conversion is/was being done through prctl(2) PR_SET_SECCOMP. Reverse operation PR_GET_SECCOMP will only return the current seccomp mode, but not the filter itself. Since the transition to the new BPF infrastructure, it's also not used anymore, so we can simply remove this as it's unreachable. Fixes: a8fc927780 ("sk-filter: Add ability to get socket filter program (v2)") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Cc: Pavel Emelyanov Signed-off-by: David S. Miller --- include/linux/filter.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 262dcbb75ffe..024fd03e5d18 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -220,7 +220,6 @@ enum { BPF_S_ANC_RXHASH, BPF_S_ANC_CPU, BPF_S_ANC_ALU_XOR_X, - BPF_S_ANC_SECCOMP_LD_W, BPF_S_ANC_VLAN_TAG, BPF_S_ANC_VLAN_TAG_PRESENT, BPF_S_ANC_PAY_OFFSET, -- cgit v1.2.3 From 362d52040c71f6e8d8158be48c812d7729cb8df1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 14 Apr 2014 21:45:17 +0200 Subject: Revert "net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer" This reverts commit ef2820a735f7 ("net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer") as it introduced a serious performance regression on SCTP over IPv4 and IPv6, though a not as dramatic on the latter. Measurements are on 10Gbit/s with ixgbe NICs. Current state: [root@Lab200slot2 ~]# iperf3 --sctp -4 -c 192.168.241.3 -V -l 1452 -t 60 iperf version 3.0.1 (10 January 2014) Linux Lab200slot2 3.14.0 #1 SMP Thu Apr 3 23:18:29 EDT 2014 x86_64 Time: Fri, 11 Apr 2014 17:56:21 GMT Connecting to host 192.168.241.3, port 5201 Cookie: Lab200slot2.1397238981.812898.548918 [ 4] local 192.168.241.2 port 38616 connected to 192.168.241.3 port 5201 Starting Test: protocol: SCTP, 1 streams, 1452 byte blocks, omitting 0 seconds, 60 second test [ ID] Interval Transfer Bandwidth [ 4] 0.00-1.09 sec 20.8 MBytes 161 Mbits/sec [ 4] 1.09-2.13 sec 10.8 MBytes 86.8 Mbits/sec [ 4] 2.13-3.15 sec 3.57 MBytes 29.5 Mbits/sec [ 4] 3.15-4.16 sec 4.33 MBytes 35.7 Mbits/sec [ 4] 4.16-6.21 sec 10.4 MBytes 42.7 Mbits/sec [ 4] 6.21-6.21 sec 0.00 Bytes 0.00 bits/sec [ 4] 6.21-7.35 sec 34.6 MBytes 253 Mbits/sec [ 4] 7.35-11.45 sec 22.0 MBytes 45.0 Mbits/sec [ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec [ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec [ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec [ 4] 11.45-12.51 sec 16.0 MBytes 126 Mbits/sec [ 4] 12.51-13.59 sec 20.3 MBytes 158 Mbits/sec [ 4] 13.59-14.65 sec 13.4 MBytes 107 Mbits/sec [ 4] 14.65-16.79 sec 33.3 MBytes 130 Mbits/sec [ 4] 16.79-16.79 sec 0.00 Bytes 0.00 bits/sec [ 4] 16.79-17.82 sec 5.94 MBytes 48.7 Mbits/sec (etc) [root@Lab200slot2 ~]# iperf3 --sctp -6 -c 2001:db8:0:f101::1 -V -l 1400 -t 60 iperf version 3.0.1 (10 January 2014) Linux Lab200slot2 3.14.0 #1 SMP Thu Apr 3 23:18:29 EDT 2014 x86_64 Time: Fri, 11 Apr 2014 19:08:41 GMT Connecting to host 2001:db8:0:f101::1, port 5201 Cookie: Lab200slot2.1397243321.714295.2b3f7c [ 4] local 2001:db8:0:f101::2 port 55804 connected to 2001:db8:0:f101::1 port 5201 Starting Test: protocol: SCTP, 1 streams, 1400 byte blocks, omitting 0 seconds, 60 second test [ ID] Interval Transfer Bandwidth [ 4] 0.00-1.00 sec 169 MBytes 1.42 Gbits/sec [ 4] 1.00-2.00 sec 201 MBytes 1.69 Gbits/sec [ 4] 2.00-3.00 sec 188 MBytes 1.58 Gbits/sec [ 4] 3.00-4.00 sec 174 MBytes 1.46 Gbits/sec [ 4] 4.00-5.00 sec 165 MBytes 1.39 Gbits/sec [ 4] 5.00-6.00 sec 199 MBytes 1.67 Gbits/sec [ 4] 6.00-7.00 sec 163 MBytes 1.36 Gbits/sec [ 4] 7.00-8.00 sec 174 MBytes 1.46 Gbits/sec [ 4] 8.00-9.00 sec 193 MBytes 1.62 Gbits/sec [ 4] 9.00-10.00 sec 196 MBytes 1.65 Gbits/sec [ 4] 10.00-11.00 sec 157 MBytes 1.31 Gbits/sec [ 4] 11.00-12.00 sec 175 MBytes 1.47 Gbits/sec [ 4] 12.00-13.00 sec 192 MBytes 1.61 Gbits/sec [ 4] 13.00-14.00 sec 199 MBytes 1.67 Gbits/sec (etc) After patch: [root@Lab200slot2 ~]# iperf3 --sctp -4 -c 192.168.240.3 -V -l 1452 -t 60 iperf version 3.0.1 (10 January 2014) Linux Lab200slot2 3.14.0+ #1 SMP Mon Apr 14 12:06:40 EDT 2014 x86_64 Time: Mon, 14 Apr 2014 16:40:48 GMT Connecting to host 192.168.240.3, port 5201 Cookie: Lab200slot2.1397493648.413274.65e131 [ 4] local 192.168.240.2 port 50548 connected to 192.168.240.3 port 5201 Starting Test: protocol: SCTP, 1 streams, 1452 byte blocks, omitting 0 seconds, 60 second test [ ID] Interval Transfer Bandwidth [ 4] 0.00-1.00 sec 240 MBytes 2.02 Gbits/sec [ 4] 1.00-2.00 sec 239 MBytes 2.01 Gbits/sec [ 4] 2.00-3.00 sec 240 MBytes 2.01 Gbits/sec [ 4] 3.00-4.00 sec 239 MBytes 2.00 Gbits/sec [ 4] 4.00-5.00 sec 245 MBytes 2.05 Gbits/sec [ 4] 5.00-6.00 sec 240 MBytes 2.01 Gbits/sec [ 4] 6.00-7.00 sec 240 MBytes 2.02 Gbits/sec [ 4] 7.00-8.00 sec 239 MBytes 2.01 Gbits/sec With the reverted patch applied, the SCTP/IPv4 performance is back to normal on latest upstream for IPv4 and IPv6 and has same throughput as 3.4.2 test kernel, steady and interval reports are smooth again. Fixes: ef2820a735f7 ("net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer") Reported-by: Peter Butler Reported-by: Dongsheng Song Reported-by: Fengguang Wu Tested-by: Peter Butler Signed-off-by: Daniel Borkmann Cc: Matija Glavinic Pecotic Cc: Alexander Sverdlin Cc: Vlad Yasevich Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 6ee76c804893..d992ca3145fe 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1653,6 +1653,17 @@ struct sctp_association { /* This is the last advertised value of rwnd over a SACK chunk. */ __u32 a_rwnd; + /* Number of bytes by which the rwnd has slopped. The rwnd is allowed + * to slop over a maximum of the association's frag_point. + */ + __u32 rwnd_over; + + /* Keeps treack of rwnd pressure. This happens when we have + * a window, but not recevie buffer (i.e small packets). This one + * is releases slowly (1 PMTU at a time ). + */ + __u32 rwnd_press; + /* This is the sndbuf size in use for the association. * This corresponds to the sndbuf size for the association, * as specified in the sk->sndbuf. @@ -1881,7 +1892,8 @@ void sctp_assoc_update(struct sctp_association *old, __u32 sctp_association_get_next_tsn(struct sctp_association *); void sctp_assoc_sync_pmtu(struct sock *, struct sctp_association *); -void sctp_assoc_rwnd_update(struct sctp_association *, bool); +void sctp_assoc_rwnd_increase(struct sctp_association *, unsigned int); +void sctp_assoc_rwnd_decrease(struct sctp_association *, unsigned int); void sctp_assoc_set_primary(struct sctp_association *, struct sctp_transport *); void sctp_assoc_del_nonprimary_peers(struct sctp_association *, -- cgit v1.2.3 From b0270e91014dabfceaf37f5b40ad51bbf21a1302 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Apr 2014 12:58:34 -0400 Subject: ipv4: add a sock pointer to ip_queue_xmit() ip_queue_xmit() assumes the skb it has to transmit is attached to an inet socket. Commit 31c70d5956fc ("l2tp: keep original skb ownership") changed l2tp to not change skb ownership and thus broke this assumption. One fix is to add a new 'struct sock *sk' parameter to ip_queue_xmit(), so that we do not assume skb->sk points to the socket used by l2tp tunnel. Fixes: 31c70d5956fc ("l2tp: keep original skb ownership") Reported-by: Zhan Jianyu Tested-by: Zhan Jianyu Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet6_connection_sock.h | 2 +- include/net/inet_connection_sock.h | 2 +- include/net/ip.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h index f981ba7adeed..74af137304be 100644 --- a/include/net/inet6_connection_sock.h +++ b/include/net/inet6_connection_sock.h @@ -40,7 +40,7 @@ void inet6_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr); -int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl); +int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu); #endif /* _INET6_CONNECTION_SOCK_H */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c55aeed41ace..7a4313887568 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -36,7 +36,7 @@ struct tcp_congestion_ops; * (i.e. things that depend on the address family) */ struct inet_connection_sock_af_ops { - int (*queue_xmit)(struct sk_buff *skb, struct flowi *fl); + int (*queue_xmit)(struct sock *sk, struct sk_buff *skb, struct flowi *fl); void (*send_check)(struct sock *sk, struct sk_buff *skb); int (*rebuild_header)(struct sock *sk); void (*sk_rx_dst_set)(struct sock *sk, const struct sk_buff *skb); diff --git a/include/net/ip.h b/include/net/ip.h index 25064c28e059..77e73d293e09 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -111,7 +111,7 @@ int ip_do_nat(struct sk_buff *skb); void ip_send_check(struct iphdr *ip); int __ip_local_out(struct sk_buff *skb); int ip_local_out(struct sk_buff *skb); -int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl); +int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); void ip_init(void); int ip_append_data(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, -- cgit v1.2.3 From aad88724c9d54acb1a9737cb6069d8470fa85f74 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Apr 2014 13:47:15 -0400 Subject: ipv4: add a sock pointer to dst->output() path. In the dst->output() path for ipv4, the code assumes the skb it has to transmit is attached to an inet socket, specifically via ip_mc_output() : The sk_mc_loop() test triggers a WARN_ON() when the provider of the packet is an AF_PACKET socket. The dst->output() method gets an additional 'struct sock *sk' parameter. This needs a cascade of changes so that this parameter can be propagated from vxlan to final consumer. Fixes: 8f646c922d55 ("vxlan: keep original skb ownership") Reported-by: lucien xin Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dst.h | 14 +++++++++++--- include/net/ip.h | 11 ++++++++--- include/net/ip_tunnels.h | 2 +- include/net/ipv6.h | 2 +- include/net/xfrm.h | 6 +++--- 5 files changed, 24 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 46ed958e0c6e..71c60f42be48 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -45,7 +45,7 @@ struct dst_entry { void *__pad1; #endif int (*input)(struct sk_buff *); - int (*output)(struct sk_buff *); + int (*output)(struct sock *sk, struct sk_buff *skb); unsigned short flags; #define DST_HOST 0x0001 @@ -367,7 +367,11 @@ static inline struct dst_entry *skb_dst_pop(struct sk_buff *skb) return child; } -int dst_discard(struct sk_buff *skb); +int dst_discard_sk(struct sock *sk, struct sk_buff *skb); +static inline int dst_discard(struct sk_buff *skb) +{ + return dst_discard_sk(skb->sk, skb); +} void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags); void __dst_free(struct dst_entry *dst); @@ -449,9 +453,13 @@ static inline void dst_set_expires(struct dst_entry *dst, int timeout) } /* Output packet to network from transport. */ +static inline int dst_output_sk(struct sock *sk, struct sk_buff *skb) +{ + return skb_dst(skb)->output(sk, skb); +} static inline int dst_output(struct sk_buff *skb) { - return skb_dst(skb)->output(skb); + return dst_output_sk(skb->sk, skb); } /* Input packet from network to transport. */ diff --git a/include/net/ip.h b/include/net/ip.h index 77e73d293e09..3ec2b0fb9d83 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -104,13 +104,18 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); int ip_local_deliver(struct sk_buff *skb); int ip_mr_input(struct sk_buff *skb); -int ip_output(struct sk_buff *skb); -int ip_mc_output(struct sk_buff *skb); +int ip_output(struct sock *sk, struct sk_buff *skb); +int ip_mc_output(struct sock *sk, struct sk_buff *skb); int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)); int ip_do_nat(struct sk_buff *skb); void ip_send_check(struct iphdr *ip); int __ip_local_out(struct sk_buff *skb); -int ip_local_out(struct sk_buff *skb); +int ip_local_out_sk(struct sock *sk, struct sk_buff *skb); +static inline int ip_local_out(struct sk_buff *skb) +{ + return ip_local_out_sk(skb->sk, skb); +} + int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); void ip_init(void); int ip_append_data(struct sock *sk, struct flowi4 *fl4, diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index e77c10405d51..a4daf9eb8562 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -153,7 +153,7 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, } int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto); -int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, +int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, __be16 df, bool xnet); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 4f541f11ce63..d640925bc454 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -731,7 +731,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, * skb processing functions */ -int ip6_output(struct sk_buff *skb); +int ip6_output(struct sock *sk, struct sk_buff *skb); int ip6_forward(struct sk_buff *skb); int ip6_input(struct sk_buff *skb); int ip6_mc_input(struct sk_buff *skb); diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 32682ae47b3f..116e9c7e19cb 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -333,7 +333,7 @@ struct xfrm_state_afinfo { const xfrm_address_t *saddr); int (*tmpl_sort)(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n); int (*state_sort)(struct xfrm_state **dst, struct xfrm_state **src, int n); - int (*output)(struct sk_buff *skb); + int (*output)(struct sock *sk, struct sk_buff *skb); int (*output_finish)(struct sk_buff *skb); int (*extract_input)(struct xfrm_state *x, struct sk_buff *skb); @@ -1540,7 +1540,7 @@ static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb); int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb); -int xfrm4_output(struct sk_buff *skb); +int xfrm4_output(struct sock *sk, struct sk_buff *skb); int xfrm4_output_finish(struct sk_buff *skb); int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err); int xfrm4_protocol_register(struct xfrm4_protocol *handler, unsigned char protocol); @@ -1565,7 +1565,7 @@ __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr); __be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr); int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb); int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb); -int xfrm6_output(struct sk_buff *skb); +int xfrm6_output(struct sock *sk, struct sk_buff *skb); int xfrm6_output_finish(struct sk_buff *skb); int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, u8 **prevhdr); -- cgit v1.2.3