/*
* vrf.c: device driver to encapsulate a VRF space
*
* Copyright (c) 2015 Cumulus Networks. All rights reserved.
* Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com>
* Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com>
*
* Based on dummy, team and ipvlan drivers
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ip.h>
#include <linux/init.h>
#include <linux/moduleparam.h>
#include <linux/netfilter.h>
#include <linux/rtnetlink.h>
#include <net/rtnetlink.h>
#include <linux/u64_stats_sync.h>
#include <linux/hashtable.h>
#include <linux/inetdevice.h>
#include <net/arp.h>
#include <net/ip.h>
#include <net/ip_fib.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/route.h>
#include <net/addrconf.h>
#include <net/l3mdev.h>
#include <net/fib_rules.h>
/* Driver identification strings.
* NOTE(review): the code that consumes these is outside this chunk.
*/
#define DRV_NAME "vrf"
#define DRV_VERSION "1.0"
#define FIB_RULE_PREF 1000 /* default preference for FIB rules */
/* NOTE(review): presumably gates whether the driver installs its FIB
* rules; the reader of this flag is not visible here - confirm.
*/
static bool add_fib_rules = true;
/* Private state of a VRF device; the transmit path obtains it with
* netdev_priv() on the VRF net_device. The route pointers carry the
* __rcu annotation; rt6_local is read with rcu_dereference() under
* rcu_read_lock() in the IPv6 transmit path.
*/
struct net_vrf {
/* NOTE(review): presumably the cached IPv4 dst of the VRF; users are
* outside this chunk - confirm.
*/
struct rtable __rcu *rth;
/* NOTE(review): presumably the IPv4 counterpart of rt6_local - confirm */
struct rtable __rcu *rth_local;
/* NOTE(review): presumably the cached IPv6 dst of the VRF - confirm */
struct rt6_info __rcu *rt6;
/* cached local dst used to short circuit locally originated IPv6
* traffic for a local address back into the Rx path
*/
struct rt6_info __rcu *rt6_local;
/* NOTE(review): presumably the FIB table id bound to this VRF - confirm */
u32 tb_id;
};
/* Per-CPU traffic counters of a VRF device (reached through dev->dstats).
* vrf_rx_stats() updates the Rx packet/byte counters inside a
* u64_stats_update_begin()/end() section on syncp, and vrf_get_stats64()
* reads the counters inside the matching fetch/retry loop.
*/
struct pcpu_dstats {
u64 tx_pkts; /* packets transmitted */
u64 tx_bytes; /* bytes transmitted */
u64 tx_drps; /* packets dropped on transmit */
u64 rx_pkts; /* packets received */
u64 rx_bytes; /* bytes received */
u64 rx_drps; /* bumped with this_cpu_inc() when netif_rx() rejects a looped skb */
struct u64_stats_sync syncp; /* guards 64-bit counter updates against readers */
};
/* Account one received packet of @len bytes against the current CPU's
 * counters of @dev. Both counters are updated inside a single u64_stats
 * update section so readers observe them together.
 */
static void vrf_rx_stats(struct net_device *dev, int len)
{
	struct pcpu_dstats *cpu_stats = this_cpu_ptr(dev->dstats);

	u64_stats_update_begin(&cpu_stats->syncp);
	cpu_stats->rx_bytes += len;
	cpu_stats->rx_pkts += 1;
	u64_stats_update_end(&cpu_stats->syncp);
}
/* Transmit failure path: bump the tx_errors counter of @vrf_dev and
 * release @skb. The skb must not be touched by the caller afterwards.
 */
static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb)
{
	vrf_dev->stats.tx_errors += 1;

	kfree_skb(skb);
}
/* Fill @stats with the device totals by summing the per-CPU counters.
 *
 * @dev:   VRF device whose per-CPU dstats are read
 * @stats: output; each counter is added to the value already stored, so
 *         the caller is expected to pass the structure in initialised
 *
 * Every CPU's counters are sampled inside a u64_stats fetch/retry loop and
 * the sample is retried if an update on that CPU raced with the read.
 */
static void vrf_get_stats64(struct net_device *dev,
			    struct rtnl_link_stats64 *stats)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		const struct pcpu_dstats *dstats;
		u64 tbytes, tpkts, tdrops, rbytes, rpkts, rdrops;
		unsigned int start;

		dstats = per_cpu_ptr(dev->dstats, cpu);
		do {
			start = u64_stats_fetch_begin_irq(&dstats->syncp);
			tbytes = dstats->tx_bytes;
			tpkts = dstats->tx_pkts;
			tdrops = dstats->tx_drps;
			rbytes = dstats->rx_bytes;
			rpkts = dstats->rx_pkts;
			rdrops = dstats->rx_drps;
		} while (u64_stats_fetch_retry_irq(&dstats->syncp, start));

		stats->tx_bytes += tbytes;
		stats->tx_packets += tpkts;
		stats->tx_dropped += tdrops;
		stats->rx_bytes += rbytes;
		stats->rx_packets += rpkts;
		/* rx_drps is bumped when a locally looped skb is rejected by
		 * netif_rx(); without this line those drops were never
		 * reported.
		 */
		stats->rx_dropped += rdrops;
	}
}
/* by default VRF devices do not have a qdisc and are expected
* to be created with only a single queue.
*/
static bool qdisc_tx_is_default(const struct net_device *dev)
{
struct netdev_queue *txq;
struct Qdisc *qdisc;
if (dev->num_tx_queues > 1)
return false;
txq = netdev_get_tx_queue(dev, 0);
qdisc = rcu_access_pointer(txq->qdisc);
return !qdisc->enqueue;
}
/* Loop a locally originated packet for a local address back into the
 * receive path of @dev, in the manner of the loopback device.
 *
 * @skb: packet to re-inject; handed to netif_rx()
 * @dev: VRF device the packet is received on
 * @dst: dst entry attached to the skb before re-injection
 *
 * Always returns NETDEV_TX_OK; a failed re-injection is only reflected in
 * the per-CPU rx_drps counter.
 */
static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev,
			  struct dst_entry *dst)
{
	/* sample the length before the skb is modified and handed off */
	int pkt_len = skb->len;

	skb_orphan(skb);

	skb_dst_set(skb, dst);
	skb_dst_force(skb);

	/* PACKET_LOOPBACK keeps the skb from hitting packet taps a second
	 * time: it was already seen on Tx and would be seen again on Rx
	 */
	skb->pkt_type = PACKET_LOOPBACK;
	skb->protocol = eth_type_trans(skb, dev);

	if (unlikely(netif_rx(skb) != NET_RX_SUCCESS)) {
		this_cpu_inc(dev->dstats->rx_drps);
		return NETDEV_TX_OK;
	}

	vrf_rx_stats(dev, pkt_len);
	return NETDEV_TX_OK;
}
#if IS_ENABLED(CONFIG_IPV6)
/* Run the IPv6 NF_INET_LOCAL_OUT netfilter hook for @skb and, when the
 * hook returns 1, pass the packet on to dst_output(). Any other hook
 * result is returned to the caller unchanged.
 */
static int vrf_ip6_local_out(struct net *net, struct sock *sk,
			     struct sk_buff *skb)
{
	int verdict;

	verdict = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net,
			  sk, skb, NULL, skb_dst(skb)->dev, dst_output);
	if (unlikely(verdict != 1))
		return verdict;

	return dst_output(net, sk, skb);
}
/* Transmit an IPv6 packet that was sent through the VRF device.
*
* A route lookup is done with the VRF device as output interface. If the
* result points at the loopback device or back at the VRF device, the
* packet is for a local address and is re-injected into the Rx path with
* the cached local dst. Otherwise the looked-up dst is attached to the skb
* and the packet goes through the LOCAL_OUT hook and dst_output().
*
* @skb: packet to send; consumed on every path
* @dev: VRF device the packet was queued on
*
* Returns NET_XMIT_DROP when the lookup fails or no usable local dst
* exists, the vrf_local_xmit() result on the local path, NET_XMIT_SUCCESS
* when the output path accepted the packet, and the output path's return
* value when it reported an error.
*/
static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
struct net_device *dev)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
struct net *net = dev_net(skb->dev);
struct flowi6 fl6 = {
/* needed to match OIF rule */
.flowi6_oif = dev->ifindex,
.flowi6_iif = LOOPBACK_IFINDEX,
.daddr = iph->daddr,
.saddr = iph->saddr,
.flowlabel = ip6_flowinfo(iph),
.flowi6_mark = skb->mark,
.flowi6_proto = iph->nexthdr,
.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF,
};
int ret = NET_XMIT_DROP;
struct dst_entry *dst;
struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst;
dst = ip6_route_output(net, NULL, &fl6);
/* NOTE(review): if ip6_route_output() hands back a held reference for
* the null entry as well, this path does not release it - confirm.
*/
if (dst == dst_null)
goto err;
/* the skb's previous dst is no longer wanted; a new one is set below */
skb_dst_drop(skb);
/* if dst.dev is loopback or the VRF device again this is locally
* originated traffic destined to a local address. Short circuit
* to Rx path using our local dst
*/
if (dst->dev == net->loopback_dev || dst->dev == dev) {
struct net_vrf *vrf = netdev_priv(dev);
struct rt6_info *rt6_local;
/* release looked up dst and use cached local dst */
dst_release(dst);
rcu_read_lock();
rt6_local = rcu_dereference(vrf->rt6_local);
if (unlikely(!rt6_local)) {
rcu_read_unlock();
goto err;
}
/* Ordering issue: cached local dst is created on newlink
* before the IPv6 initialization. Using the local dst
* requires rt6i_idev to be set so make sure it is.
*/
if (unlikely(!rt6_local->rt6i_idev)) {
rt6_local->rt6i_idev = in6_dev_get(dev);
if (!rt6_local->rt6i_idev) {
rcu_read_unlock();
goto err;
}
}
/* take our own reference before leaving the RCU read section */
dst = &rt6_local->dst;
dst_hold(dst);
rcu_read_unlock();
return vrf_local_xmit(skb, dev, &rt6_local->dst);
}
skb_dst_set(skb, dst);
/* strip the ethernet header added for pass through VRF device */
__skb_pull(skb, skb_network_offset(skb));
ret = vrf_ip6_local_out(net, skb->sk, skb);
if (unlikely(net_xmit_eval(ret)))
dev->stats.tx_errors++;
else
ret = NET_XMIT_SUCCESS;
return ret;
err:
/* counts a Tx error on the VRF device and frees the skb */
vrf_tx_error(dev, skb);
return NET_XMIT_DROP;
}
#else
/* Variant used when IPv6 support is not enabled: every IPv6 packet is
 * counted as a Tx error on @dev, freed, and reported as dropped.
 */
static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
					   struct net_device *dev)
{
	netdev_tx_t verdict = NET_XMIT_DROP;

	vrf_tx_error(dev, skb);

	return verdict;
}
#endif
/* Modelled on ip_local_out, which cannot be used directly because the dst
 * has been switched to point at the VRF. Runs the IPv4 NF_INET_LOCAL_OUT
 * netfilter hook for @skb and, when the hook returns 1, passes the packet
 * on to dst_output(). Any other hook result is returned unchanged.
 */
static int vrf_ip_local_out(struct net *net, struct sock *sk,
			    struct sk_buff *skb)
{
	int verdict;

	verdict = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk,
			  skb, NULL, skb_dst(skb)->dev, dst_output);
	if (unlikely(verdict != 1))
		return verdict;

	return dst_output(net, sk, skb);
}
static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb,
struct net_device *vrf_dev)
{
struct iphdr *ip4h = ip_hdr(skb);
int ret = NET_XMIT_DROP;
struct flowi4 fl4 = {
/* needed to match OIF rule */
.flowi4_oif = vrf_dev->ifindex,
.flowi4_iif = LOOPBACK_IFINDEX,
.flowi4_tos = RT_TOS(ip4h->tos),
.flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_SKIP_NH_OIF,
.flowi4_proto = ip4h->protocol,
.daddr = ip4h->daddr,
.saddr = ip4h->saddr,
};
struct net *net = dev_net(vrf_dev);
struct rtable *rt;
rt = ip_route_output_flow(net, &fl4, NULL);
if (IS_ERR(rt))
goto err;
skb_dst_drop(skb);
/* if dst.dev is loopback or the VRF device again this is locally
* originated traffic destined to a local address. Short circuit
* to Rx path using our local dst
*/
if (rt->dst.dev == net->loopback_dev || rt-&
|