/*
* VXLAN: Virtual eXtensible Local Area Network
*
* Copyright (c) 2012 Vyatta Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* TODO
* - use IANA UDP port number (when defined)
* - IPv6 (not in RFC)
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/skbuff.h>
#include <linux/rculist.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/udp.h>
#include <linux/igmp.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/hash.h>
#include <net/arp.h>
#include <net/ndisc.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/rtnetlink.h>
#include <net/route.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#define VXLAN_VERSION "0.1"
#define VNI_HASH_BITS 10
#define VNI_HASH_SIZE (1<<VNI_HASH_BITS)
#define FDB_HASH_BITS 8
#define FDB_HASH_SIZE (1<<FDB_HASH_BITS)
#define FDB_AGE_DEFAULT 300 /* 5 min */
#define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */
#define VXLAN_N_VID (1u << 24)
#define VXLAN_VID_MASK (VXLAN_N_VID - 1)
/* IP header + UDP + VXLAN + Ethernet header */
#define VXLAN_HEADROOM (20 + 8 + 8 + 14)
#define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags required value. */
/* VXLAN protocol header */
struct vxlanhdr {
__be32 vx_flags;
__be32 vx_vni;
};
/* UDP port for VXLAN traffic. */
static unsigned int vxlan_port __read_mostly = 8472;
module_param_named(udp_port, vxlan_port, uint, 0444);
MODULE_PARM_DESC(udp_port, "Destination UDP port");
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
/* per-net private data for this module */
static unsigned int vxlan_net_id;
struct vxlan_net {
struct socket *sock; /* UDP encap socket */
struct hlist_head vni_list[VNI_HASH_SIZE];
};
/* Forwarding table entry */
struct vxlan_fdb {
struct hlist_node hlist; /* linked list of entries */
struct rcu_head rcu;
unsigned long updated; /* jiffies */
unsigned long used;
__be32 remote_ip;
u16 state; /* see ndm_state */
u8 eth_addr[ETH_ALEN];
};
/* Per-cpu network traffic stats */
struct vxlan_stats {
u64 rx_packets;
u64 rx_bytes;
u64 tx_packets;
u64 tx_bytes;
struct u64_stats_sync syncp;
};
/* Pseudo network device */
struct vxlan_dev {
struct hlist_node hlist;
struct net_device *dev;
struct vxlan_stats __percpu *stats;
__u32 vni; /* virtual network id */
__be32 gaddr; /* multicast group */
__be32 saddr; /* source address */
unsigned int link; /* link to multicast over */
__u16 port_min; /* source port range */
__u16 port_max;
__u8 tos; /* TOS override */
__u8 ttl;
u32 flags; /* VXLAN_F_* below */
unsigned long age_interval;
struct timer_list age_timer;
spinlock_t hash_lock;
unsigned int addrcnt;
unsigned int addrmax;
struct hlist_head fdb_head[FDB_HASH_SIZE];
};
#define VXLAN_F_LEARN 0x01
#define VXLAN_F_PROXY 0x02
#define VXLAN_F_RSC 0x04
#define VXLAN_F_L2MISS 0x08
#define VXLAN_F_L3MISS 0x10
/* salt for hash table */
static u32 vxlan_salt __read_mostly;
static inline struct hlist_head *vni_head(struct net *net, u32 id)
{
struct vxlan_net *vn = net_generic(net, vxlan_net_id);
return &vn->vni_list[hash_32(id, VNI_HASH_BITS)];
}
/* Look up VNI in a per net namespace table */
static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id)
{
struct vxlan_dev *vxlan;
struct hlist_node *node;
hlist_for_each_entry_rcu(vxlan, node, vni_head(net, id), hlist) {
if (vxlan->vni == id)
return vxlan;
}
return NULL;
}
/* Fill in neighbour message in skbuff. */
static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
const struct vxlan_fdb *fdb,
u32 portid, u32 seq, int type, unsigned int flags)
{
unsigned long now = jiffies;
struct nda_cacheinfo ci;
struct nlmsghdr *nlh;
struct ndmsg *ndm;
bool send_ip, send_eth;
nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
if (nlh == NULL)
return -EMSGSIZE;
ndm = nlmsg_data(nlh);
memset(ndm, 0, sizeof(*ndm));
send_eth = send_ip = true;
if (type == RTM_GETNEIGH) {
ndm->ndm_family = AF_INET;
send_ip = fdb->remote_ip != 0;
send_eth = !is_zero_ether_addr(fdb->eth_addr);
} else
ndm->ndm_family = AF_BRIDGE;
ndm->ndm_state = fdb->state;
ndm->ndm_ifindex = vxlan->dev->ifindex;
ndm->ndm_flags = NTF_SELF;
ndm->ndm_type = NDA_DST;
if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
goto nla_put_failure;
if (send_ip && nla_put_be32(skb, NDA_DST, fdb->remote_ip))
goto nla_put_failure;
ci.ndm_used = jiffies_to_clock_t(now - fdb->used);
ci.ndm_confirmed = 0;
ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated);
ci.ndm_refcnt = 0;
if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
goto nla_put_failure;
return nlmsg_end(skb, nlh);
nla_put_failure:
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
static inline size_t vxlan_nlmsg_size(void)
{
return NLMSG_ALIGN(sizeof(struct ndmsg))
+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
+ nla_total_size(sizeof(__be32)) /* NDA_DST */
+ nla_total_size(sizeof(struct nda_cacheinfo));
}
static void vxlan_fdb_notify(struct vxlan_dev *vxlan,
const struct vxlan_fdb *fdb, int type)
{
struct net *net = dev_net(vxlan->dev);
struct sk_buff *skb;
int err = -ENOBUFS;
skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
if (skb == NULL)
goto errout;
err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0);
if (err < 0) {
/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
kfree_skb(skb);
goto errout;
}
rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}
static void vxlan_ip_miss(struct net_device *dev, __be32 ipa)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
struct vxlan_fdb f;
memset(&f, 0, sizeof f);
f.state = NUD_STALE;
f.remote_ip = ipa; /* goes to NDA_DST */
vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH);
}
static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
{
struct vxlan_fdb f;
memset(&f, 0, sizeof f);
f.state = NUD_STALE;
memcpy(f.eth_addr, eth_addr, ETH_ALEN);
vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH);
}
/* Hash Ethernet address */
static u32 eth_hash(const unsigned char *addr)
{
u64 value = get_unaligned((u64 *)addr);
/* only want 6 bytes */
#ifdef __BIG_ENDIAN
value >>= 16;
#else
value <<= 16;
#endif
return hash_64(value, FDB_HASH_BITS);
}
/* Hash chain to use given mac address */
static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
const u8 *mac)
{
return &vxlan->fdb_head[eth_hash(mac)];
}
/* Look up Ethernet
|