// SPDX-License-Identifier: GPL-2.0-or-later
/*
* NET3: Implementation of the ICMP protocol layer.
*
* Alan Cox, <alan@lxorguk.ukuu.org.uk>
*
* Some of the function names and the icmp unreach table for this
* module were derived from [icmp.c 1.0.11 06/02/93] by
* Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting.
* Other than that this module is a complete rewrite.
*
* Fixes:
* Clemens Fruhwirth : introduce global icmp rate limiting
* with icmp type masking ability instead
* of broken per type icmp timeouts.
* Mike Shaver : RFC1122 checks.
* Alan Cox : Multicast ping reply as self.
* Alan Cox : Fix atomicity lockup in ip_build_xmit
* call.
* Alan Cox : Added 216,128 byte paths to the MTU
* code.
* Martin Mares : RFC1812 checks.
* Martin Mares : Can be configured to follow redirects
* if acting as a router _without_ a
* routing protocol (RFC 1812).
* Martin Mares : Echo requests may be configured to
* be ignored (RFC 1812).
* Martin Mares : Limitation of ICMP error message
* transmit rate (RFC 1812).
* Martin Mares : TOS and Precedence set correctly
* (RFC 1812).
* Martin Mares : Now copying as much data from the
* original packet as we can without
* exceeding 576 bytes (RFC 1812).
* Willy Konynenberg : Transparent proxying support.
* Keith Owens : RFC1191 correction for 4.2BSD based
* path MTU bug.
* Thomas Quinot : ICMP Dest Unreach codes up to 15 are
* valid (RFC 1812).
* Andi Kleen : Check all packet lengths properly
* and moved all kfree_skb() up to
* icmp_rcv.
* Andi Kleen : Move the rate limit bookkeeping
* into the dest entry and use a token
* bucket filter (thanks to ANK). Make
* the rates sysctl configurable.
* Yu Tianli : Fixed two ugly bugs in icmp_send
* - IP option length was accounted wrongly
* - ICMP header length was not accounted
* at all.
* Tristan Greaves : Added sysctl option to ignore bogus
* broadcast responses from broken routers.
*
* To Fix:
*
* - Should use skb_pull() instead of all the manual checking.
* This would also greatly simplify some upper layer error handlers. --AK
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/types.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/string.h>
#include <linux/netfilter_ipv4.h>
#include <linux/slab.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/protocol.h>
#include <net/icmp.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <net/ping.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <net/inet_common.h>
#include <net/ip_fib.h>
#include <net/l3mdev.h>
/*
 * Build xmit assembly blocks
 *
 * Bundles the pieces of one ICMP message under construction.
 * NOTE(review): the users of this struct are outside this view, so the
 * per-field notes below are inferred from names and types only - confirm.
 */
struct icmp_bxm {
/* presumably the packet this message is built from / in reply to - confirm */
struct sk_buff *skb;
/* presumably a byte offset and length selecting payload within skb - confirm */
int offset;
int data_len;
/* ICMP header immediately followed by three big-endian 32-bit words */
struct {
struct icmphdr icmph;
/* presumably the time fields of a timestamp message - confirm */
__be32 times[3];
} data;
/* presumably the length of the header part held in data - confirm */
int head_len;
/* IP options storage; presumably the options used for the reply - confirm */
struct ip_options_data replyopts;
};
/*
 * Errno translation table for ICMP destination-unreachable messages.
 *
 * Entries are positional: each one is labelled with the unreach code it
 * stands for, and the order of the entries must not change.  .errno is
 * the error reported for that code and .fatal marks whether the entry is
 * classed as fatal (1) or transient (0).
 *
 * RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED
 * MUST be considered 'transient errs'.
 */
const struct icmp_err icmp_err_convert[] = {
	{ .errno = ENETUNREACH,  .fatal = 0 },	/* ICMP_NET_UNREACH */
	{ .errno = EHOSTUNREACH, .fatal = 0 },	/* ICMP_HOST_UNREACH */
	{ .errno = ENOPROTOOPT,  .fatal = 1 },	/* ICMP_PROT_UNREACH */
	{ .errno = ECONNREFUSED, .fatal = 1 },	/* ICMP_PORT_UNREACH */
	{ .errno = EMSGSIZE,     .fatal = 0 },	/* ICMP_FRAG_NEEDED */
	{ .errno = EOPNOTSUPP,   .fatal = 0 },	/* ICMP_SR_FAILED */
	{ .errno = ENETUNREACH,  .fatal = 1 },	/* ICMP_NET_UNKNOWN */
	{ .errno = EHOSTDOWN,    .fatal = 1 },	/* ICMP_HOST_UNKNOWN */
	{ .errno = ENONET,       .fatal = 1 },	/* ICMP_HOST_ISOLATED */
	{ .errno = ENETUNREACH,  .fatal = 1 },	/* ICMP_NET_ANO */
	{ .errno = EHOSTUNREACH, .fatal = 1 },	/* ICMP_HOST_ANO */
	{ .errno = ENETUNREACH,  .fatal = 0 },	/* ICMP_NET_UNR_TOS */
	{ .errno = EHOSTUNREACH, .fatal = 0 },	/* ICMP_HOST_UNR_TOS */
	{ .errno = EHOSTUNREACH, .fatal = 1 },	/* ICMP_PKT_FILTERED */
	{ .errno = EHOSTUNREACH, .fatal = 1 },	/* ICMP_PREC_VIOLATION */
	{ .errno = EHOSTUNREACH, .fatal = 1 },	/* ICMP_PREC_CUTOFF */
};
EXPORT_SYMBOL(icmp_err_convert);
/*
 * ICMP control array. This specifies what to do with each ICMP.
 *
 * One struct icmp_control describes how a single ICMP type is handled.
 */
struct icmp_control {
/* Handler for this ICMP type; takes the skb and returns an skb drop reason */
enum skb_drop_reason (*handler)(struct sk_buff *skb);
short error; /* This ICMP is classed as an error message */
};
/*
 * Forward declaration of the per-type control table, NR_ICMP_TYPES+1
 * entries long.
 * NOTE(review): the initialised definition is not visible in this part of
 * the file - presumably it follows the handlers further down; confirm.
 */
static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
/*
 * Per-CPU socket pointer; icmp_xmit_lock() reads the current CPU's entry.
 * NOTE(review): where the sockets are created is not visible here.
 */
static DEFINE_PER_CPU(struct sock *, ipv4_icmp_sk);
/*
 * Grab this CPU's ICMP socket and attach it to @net.
 *
 * Called with BH disabled.
 *
 * Returns the socket with sk_lock.slock held, or NULL when the lock could
 * not be taken without waiting.
 */
static inline struct sock *icmp_xmit_lock(struct net *net)
{
	struct sock *icmp_sk = this_cpu_read(ipv4_icmp_sk);

	/* The lock may already be held: this can happen if the output
	 * path signals a dst_link_failure() for an outgoing ICMP packet.
	 * Never spin in that case, just report failure.
	 */
	if (unlikely(!spin_trylock(&icmp_sk->sk_lock.slock)))
		return NULL;

	sock_net_set(icmp_sk, net);
	return icmp_sk;
}
/*
 * Release a socket obtained from icmp_xmit_lock().
 *
 * Points the socket's netns back at init_net and then drops
 * sk_lock.slock; the netns is reset while the lock is still held.
 */
static inline void icmp_xmit_unlock(struct sock *sk)
{
sock_net_set(sk, &init_net);
spin_unlock(&sk->sk_lock.slock);
}
/*
 * Tunables read by icmp_global_allow(): the refill rate in messages per
 * second (default 1000) and the ceiling on banked credit (default 50).
 */
int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
int sysctl_icmp_msgs_burst __read_mostly = 50;
/*
 * State of the global token bucket used by icmp_global_allow().
 * credit and stamp are updated under lock, but are also peeked at without
 * it (via READ_ONCE()) on that function's fast path.
 */
static struct {
/* serialises bucket updates */
spinlock_t lock;
/* tokens currently available */
u32 credit;
/* jiffies (truncated to u32) recorded at the last refill */
u32 stamp;
} icmp_global = {
.lock = __SPIN_LOCK_UNLOCKED(icmp_global.lock),
};
/**
* icmp_global_allow - Are we allowed to send one more ICMP message ?
*
* Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec.
* Returns false if we reached the limit and can not send another packet.
* Note: called with BH disabled
*/
bool icmp_global_allow(void)
{
u32 credit, delta, incr = 0, now = (u32)jiffies;
bool rc = false;
/* Check if token bucket is empty and cannot be refilled
* without taking the spinlock. The READ_ONCE() are paired
* with the following WRITE_ONCE() in this same function.
*/
if (!READ_ONCE(icmp_global.credit)) {
delta = min_t(u32, now - READ_ONCE(icmp_global.stamp), HZ);
if (delta < HZ / 50)
return false;
}
spin_lock(&icmp_global.lock);
delta = min_t(u32, now - icmp_global.stamp, HZ);
if (delta >= HZ / 50) {
incr = READ_ONCE(sysctl_icmp_msgs_per_sec) * delta / HZ;
if (incr)
WRITE_ONCE(icmp_global.stamp, now);
}
credit = min_t(u32, icmp_global.credit + incr,
READ_ONCE(sysctl_icmp_msgs_burst));
if (credit) {
/* We want to use a credit of one in average, but need to randomize
* it for security reasons.
*/
credit = max_t(int, credit - get_random_u32_below(3), 0);
rc = true;
}
WRITE_ONCE(icmp_global.credit, credit);
spin_unlock(&icmp_global.lock);
return rc;
}
EXPORT_SYMBOL(icmp_global_allow);
/*
 * Decide whether an ICMP type/code is exempt from rate limiting.
 *
 * Returns true (exempt) for types above NR_ICMP_TYPES, for
 * DEST_UNREACH/FRAG_NEEDED, and for any type whose bit is clear in the
 * netns sysctl_icmp_ratemask; returns false when the type's bit is set.
 */
static bool icmpv4_mask_allow(struct net *net, int type, int code)
{
	/* Out-of-range types are never limited, and PMTU discovery must
	 * not be limited either.  The range test comes first so the shift
	 * below only ever sees a type within the table.
	 */
	if (type > NR_ICMP_TYPES ||
	    (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
		return true;

	/* Limit only if the icmp type is enabled in the ratemask. */
	return !((1 << type) & READ_ONCE(net->ipv4.sysctl_icmp_ratemask));
}
/*
 * Apply the global ICMP rate limit to a message of the given type/code.
 *
 * Exempt type/codes (see icmpv4_mask_allow()) pass without consulting the
 * global bucket; everything else must be granted by icmp_global_allow().
 * A refusal is counted in ICMP_MIB_RATELIMITGLOBAL.
 *
 * Returns true if the message may be sent.
 */
static bool icmpv4_global_allow(struct net *net, int type, int code)
{
	/* Short-circuit keeps exempt messages from touching the bucket. */
	bool allowed = icmpv4_mask_allow(net, type, code) ||
		       icmp_global_allow();

	if (!allowed)
		__ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL);

	return allowed;
}
/*
* Send an ICMP frame.
*/
static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
struct flowi4 *fl4, int type, int code)
{
struct dst_entry *dst = &rt->dst;
struct inet_peer *peer;
bool rc = true;
int vif;
if (icmpv4_mask_allow(net, type, code))
goto out;
/* No rate limit on loopback */
if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
goto out;
vif = l3mdev_master_ifindex(dst->dev);
peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1
|