diff options
Diffstat (limited to 'net/sched')
-rw-r--r-- | net/sched/act_ct.c | 50 | ||||
-rw-r--r-- | net/sched/cls_api.c | 26 | ||||
-rw-r--r-- | net/sched/cls_route.c | 37 | ||||
-rw-r--r-- | net/sched/cls_u32.c | 2 | ||||
-rw-r--r-- | net/sched/em_meta.c | 2 | ||||
-rw-r--r-- | net/sched/sch_fq.c | 391 | ||||
-rw-r--r-- | net/sched/sch_fq_pie.c | 2 | ||||
-rw-r--r-- | net/sched/sch_frag.c | 4 | ||||
-rw-r--r-- | net/sched/sch_generic.c | 9 | ||||
-rw-r--r-- | net/sched/sch_hfsc.c | 18 | ||||
-rw-r--r-- | net/sched/sch_netem.c | 2 | ||||
-rw-r--r-- | net/sched/sch_qfq.c | 4 | ||||
-rw-r--r-- | net/sched/sch_taprio.c | 2 |
13 files changed, 402 insertions, 147 deletions
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 7c652d14528b..9583645e86c2 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -278,7 +278,16 @@ err_nat: return err; } +static bool tcf_ct_flow_is_outdated(const struct flow_offload *flow) +{ + return test_bit(IPS_SEEN_REPLY_BIT, &flow->ct->status) && + test_bit(IPS_HW_OFFLOAD_BIT, &flow->ct->status) && + !test_bit(NF_FLOW_HW_PENDING, &flow->flags) && + !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags); +} + static struct nf_flowtable_type flowtable_ct = { + .gc = tcf_ct_flow_is_outdated, .action = tcf_ct_flow_table_fill_actions, .owner = THIS_MODULE, }; @@ -690,7 +699,6 @@ static struct tc_action_ops act_ct_ops; struct tc_ct_action_net { struct tc_action_net tn; /* Must be first */ - bool labels; }; /* Determine whether skb->_nfct is equal to the result of conntrack lookup. */ @@ -829,8 +837,13 @@ static void tcf_ct_params_free(struct tcf_ct_params *params) } if (params->ct_ft) tcf_ct_flow_table_put(params->ct_ft); - if (params->tmpl) + if (params->tmpl) { + if (params->put_labels) + nf_connlabels_put(nf_ct_net(params->tmpl)); + nf_ct_put(params->tmpl); + } + kfree(params); } @@ -1154,9 +1167,9 @@ static int tcf_ct_fill_params(struct net *net, struct nlattr **tb, struct netlink_ext_ack *extack) { - struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id); struct nf_conntrack_zone zone; int err, family, proto, len; + bool put_labels = false; struct nf_conn *tmpl; char *name; @@ -1186,15 +1199,20 @@ static int tcf_ct_fill_params(struct net *net, } if (tb[TCA_CT_LABELS]) { + unsigned int n_bits = sizeof_field(struct tcf_ct_params, labels) * 8; + if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) { NL_SET_ERR_MSG_MOD(extack, "Conntrack labels isn't enabled."); return -EOPNOTSUPP; } - if (!tn->labels) { + if (nf_connlabels_get(net, n_bits - 1)) { NL_SET_ERR_MSG_MOD(extack, "Failed to set connlabel length"); return -EOPNOTSUPP; + } else { + put_labels = true; } + tcf_ct_set_key_val(tb, p->labels, TCA_CT_LABELS, p->labels_mask, TCA_CT_LABELS_MASK, @@ -1238,10 +1256,15 @@ static int tcf_ct_fill_params(struct net *net, } } + p->put_labels = put_labels; + if (p->ct_action & TCA_CT_ACT_COMMIT) __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); return 0; err: + if (put_labels) + nf_connlabels_put(net); + nf_ct_put(p->tmpl); p->tmpl = NULL; return err; @@ -1542,32 +1565,13 @@ static struct tc_action_ops act_ct_ops = { static __net_init int ct_init_net(struct net *net) { - unsigned int n_bits = sizeof_field(struct tcf_ct_params, labels) * 8; struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id); - if (nf_connlabels_get(net, n_bits - 1)) { - tn->labels = false; - pr_err("act_ct: Failed to set connlabels length"); - } else { - tn->labels = true; - } - return tc_action_net_init(net, &tn->tn, &act_ct_ops); } static void __net_exit ct_exit_net(struct list_head *net_list) { - struct net *net; - - rtnl_lock(); - list_for_each_entry(net, net_list, exit_list) { - struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id); - - if (tn->labels) - nf_connlabels_put(net); - } - rtnl_unlock(); - tc_action_net_exit(net_list, act_ct_ops.net_id); } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index a193cc7b3241..1daeb2182b70 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1681,12 +1681,16 @@ reclassify: * time we got here with a cookie from hardware. */ if (unlikely(n->tp != tp || n->tp->chain != n->chain || - !tp->ops->get_exts)) + !tp->ops->get_exts)) { + tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR); return TC_ACT_SHOT; + } exts = tp->ops->get_exts(tp, n->handle); - if (unlikely(!exts || n->exts != exts)) + if (unlikely(!exts || n->exts != exts)) { + tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR); return TC_ACT_SHOT; + } n = NULL; err = tcf_exts_exec_ex(skb, exts, act_index, res); @@ -1712,8 +1716,10 @@ reclassify: return err; } - if (unlikely(n)) + if (unlikely(n)) { + tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR); return TC_ACT_SHOT; + } return TC_ACT_UNSPEC; /* signal: continue lookup */ #ifdef CONFIG_NET_CLS_ACT @@ -1723,6 +1729,7 @@ reset: tp->chain->block->index, tp->prio & 0xffff, ntohs(tp->protocol)); + tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR); return TC_ACT_SHOT; } @@ -1759,8 +1766,10 @@ int tcf_classify(struct sk_buff *skb, if (ext->act_miss) { n = tcf_exts_miss_cookie_lookup(ext->act_miss_cookie, &act_index); - if (!n) + if (!n) { + tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR); return TC_ACT_SHOT; + } chain = n->chain_index; } else { @@ -1768,8 +1777,10 @@ int tcf_classify(struct sk_buff *skb, } fchain = tcf_chain_lookup_rcu(block, chain); - if (!fchain) + if (!fchain) { + tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR); return TC_ACT_SHOT; + } /* Consume, so cloned/redirect skbs won't inherit ext */ skb_ext_del(skb, TC_SKB_EXT); @@ -1788,8 +1799,11 @@ int tcf_classify(struct sk_buff *skb, struct tc_skb_cb *cb = tc_skb_cb(skb); ext = tc_skb_ext_alloc(skb); - if (WARN_ON_ONCE(!ext)) + if (WARN_ON_ONCE(!ext)) { + tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR); return TC_ACT_SHOT; + } + ext->chain = last_executed_chain; ext->mru = cb->mru; ext->post_ct = cb->post_ct; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 1e20bbd687f1..1424bfeaca73 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -375,9 +375,9 @@ out: static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = { [TCA_ROUTE4_CLASSID] = { .type = NLA_U32 }, - [TCA_ROUTE4_TO] = { .type = NLA_U32 }, - [TCA_ROUTE4_FROM] = { .type = NLA_U32 }, - [TCA_ROUTE4_IIF] = { .type = NLA_U32 }, + [TCA_ROUTE4_TO] = NLA_POLICY_MAX(NLA_U32, 0xFF), + [TCA_ROUTE4_FROM] = NLA_POLICY_MAX(NLA_U32, 0xFF), + [TCA_ROUTE4_IIF] = NLA_POLICY_MAX(NLA_U32, 0x7FFF), }; static int route4_set_parms(struct net *net, struct tcf_proto *tp, @@ -397,33 +397,37 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, return err; if (tb[TCA_ROUTE4_TO]) { - if (new && handle & 0x8000) + if (new && handle & 0x8000) { + NL_SET_ERR_MSG(extack, "Invalid handle"); return -EINVAL; + } to = nla_get_u32(tb[TCA_ROUTE4_TO]); - if (to > 0xFF) - return -EINVAL; nhandle = to; } + if (tb[TCA_ROUTE4_FROM] && tb[TCA_ROUTE4_IIF]) { + NL_SET_ERR_MSG_ATTR(extack, tb[TCA_ROUTE4_FROM], + "'from' and 'fromif' are mutually exclusive"); + return -EINVAL; + } + if (tb[TCA_ROUTE4_FROM]) { - if (tb[TCA_ROUTE4_IIF]) - return -EINVAL; id = nla_get_u32(tb[TCA_ROUTE4_FROM]); - if (id > 0xFF) - return -EINVAL; nhandle |= id << 16; } else if (tb[TCA_ROUTE4_IIF]) { id = nla_get_u32(tb[TCA_ROUTE4_IIF]); - if (id > 0x7FFF) - return -EINVAL; nhandle |= (id | 0x8000) << 16; } else nhandle |= 0xFFFF << 16; if (handle && new) { nhandle |= handle & 0x7F00; - if (nhandle != handle) + if (nhandle != handle) { + NL_SET_ERR_MSG_FMT(extack, + "Handle mismatch constructed: %x (expected: %x)", + handle, nhandle); return -EINVAL; + } } if (!nhandle) { @@ -478,7 +482,6 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, struct route4_filter __rcu **fp; struct route4_filter *fold, *f1, *pfp, *f = NULL; struct route4_bucket *b; - struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_ROUTE4_MAX + 1]; unsigned int h, th; int err; @@ -489,10 +492,12 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; } - if (opt == NULL) + if (NL_REQ_ATTR_CHECK(extack, NULL, tca, TCA_OPTIONS)) { + NL_SET_ERR_MSG_MOD(extack, "Missing options"); return -EINVAL; + } - err = nla_parse_nested_deprecated(tb, TCA_ROUTE4_MAX, opt, + err = nla_parse_nested_deprecated(tb, TCA_ROUTE4_MAX, tca[TCA_OPTIONS], route4_policy, NULL); if (err < 0) return err; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index da4c179a4d41..6663e971a13e 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -366,7 +366,7 @@ static int u32_init(struct tcf_proto *tp) idr_init(&root_ht->handle_idr); if (tp_c == NULL) { - tp_c = kzalloc(struct_size(tp_c, hlist->ht, 1), GFP_KERNEL); + tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL); if (tp_c == NULL) { kfree(root_ht); return -ENOBUFS; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index da34fd4c9269..09d8afd04a2a 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -546,7 +546,7 @@ META_COLLECTOR(int_sk_prio) *err = -1; return; } - dst->value = sk->sk_priority; + dst->value = READ_ONCE(sk->sk_priority); } META_COLLECTOR(int_sk_rcvlowat) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index f59a2cb2c803..0fd18c344ab5 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -2,7 +2,7 @@ /* * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing) * - * Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com> + * Copyright (C) 2013-2023 Eric Dumazet <edumazet@google.com> * * Meant to be mostly used for locally generated traffic : * Fast classification depends on skb->sk being set before reaching us. @@ -51,7 +51,8 @@ #include <net/tcp.h> struct fq_skb_cb { - u64 time_to_send; + u64 time_to_send; + u8 band; }; static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb) @@ -73,37 +74,41 @@ struct fq_flow { struct sk_buff *tail; /* last skb in the list */ unsigned long age; /* (jiffies | 1UL) when flow was emptied, for gc */ }; - struct rb_node fq_node; /* anchor in fq_root[] trees */ + union { + struct rb_node fq_node; /* anchor in fq_root[] trees */ + /* Following field is only used for q->internal, + * because q->internal is not hashed in fq_root[] + */ + u64 stat_fastpath_packets; + }; struct sock *sk; u32 socket_hash; /* sk_hash */ int qlen; /* number of packets in flow queue */ -/* Second cache line, used in fq_dequeue() */ +/* Second cache line */ int credit; - /* 32bit hole on 64bit arches */ - + int band; struct fq_flow *next; /* next pointer in RR lists */ struct rb_node rate_node; /* anchor in q->delayed tree */ u64 time_next_packet; -} ____cacheline_aligned_in_smp; +}; struct fq_flow_head { struct fq_flow *first; struct fq_flow *last; }; -struct fq_sched_data { +struct fq_perband_flows { struct fq_flow_head new_flows; - struct fq_flow_head old_flows; + int credit; + int quantum; /* based on band nr : 576KB, 192KB, 64KB */ +}; - struct rb_root delayed; /* for rate limited flows */ - u64 time_next_delayed_flow; - u64 ktime_cache; /* copy of last ktime_get_ns() */ - unsigned long unthrottle_latency_ns; +struct fq_sched_data { +/* Read mostly cache line */ - struct fq_flow internal; /* for non classified or high prio packets */ u32 quantum; u32 initial_quantum; u32 flow_refill_delay; @@ -117,24 +122,46 @@ struct fq_sched_data { u8 rate_enable; u8 fq_trees_log; u8 horizon_drop; + u8 prio2band[(TC_PRIO_MAX + 1) >> 2]; + u32 timer_slack; /* hrtimer slack in ns */ + +/* Read/Write fields. */ + + unsigned int band_nr; /* band being serviced in fq_dequeue() */ + + struct fq_perband_flows band_flows[FQ_BANDS]; + + struct fq_flow internal; /* fastpath queue. */ + struct rb_root delayed; /* for rate limited flows */ + u64 time_next_delayed_flow; + unsigned long unthrottle_latency_ns; + + u32 band_pkt_count[FQ_BANDS]; u32 flows; - u32 inactive_flows; + u32 inactive_flows; /* Flows with no packet to send. */ u32 throttled_flows; - u64 stat_gc_flows; - u64 stat_internal_packets; u64 stat_throttled; + struct qdisc_watchdog watchdog; + u64 stat_gc_flows; + +/* Seldom used fields. */ + + u64 stat_band_drops[FQ_BANDS]; u64 stat_ce_mark; u64 stat_horizon_drops; u64 stat_horizon_caps; u64 stat_flows_plimit; u64 stat_pkts_too_long; u64 stat_allocation_errors; - - u32 timer_slack; /* hrtimer slack in ns */ - struct qdisc_watchdog watchdog; }; +/* return the i-th 2-bit value ("crumb") */ +static u8 fq_prio2band(const u8 *prio2band, unsigned int prio) +{ + return (prio2band[prio / 4] >> (2 * (prio & 0x3))) & 0x3; +} + /* * f->tail and f->age share the same location. * We can use the low order bit to differentiate if this location points @@ -159,8 +186,19 @@ static bool fq_flow_is_throttled(const struct fq_flow *f) return f->next == &throttled; } -static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow) +enum new_flow { + NEW_FLOW, + OLD_FLOW +}; + +static void fq_flow_add_tail(struct fq_sched_data *q, struct fq_flow *flow, + enum new_flow list_sel) { + struct fq_perband_flows *pband = &q->band_flows[flow->band]; + struct fq_flow_head *head = (list_sel == NEW_FLOW) ? + &pband->new_flows : + &pband->old_flows; + if (head->first) head->last->next = flow; else @@ -173,7 +211,7 @@ static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f) { rb_erase(&f->rate_node, &q->delayed); q->throttled_flows--; - fq_flow_add_tail(&q->old_flows, f); + fq_flow_add_tail(q, f, OLD_FLOW); } static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f) @@ -258,17 +296,61 @@ static void fq_gc(struct fq_sched_data *q, kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree); } -static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) +/* Fast path can be used if : + * 1) Packet tstamp is in the past. + * 2) FQ qlen == 0 OR + * (no flow is currently eligible for transmit, + * AND fast path queue has less than 8 packets) + * 3) No SO_MAX_PACING_RATE on the socket (if any). + * 4) No @maxrate attribute on this qdisc, + * + * FQ can not use generic TCQ_F_CAN_BYPASS infrastructure. + */ +static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb, + u64 now) { + const struct fq_sched_data *q = qdisc_priv(sch); + const struct sock *sk; + + if (fq_skb_cb(skb)->time_to_send > now) + return false; + + if (sch->q.qlen != 0) { + /* Even if some packets are stored in this qdisc, + * we can still enable fast path if all of them are + * scheduled in the future (ie no flows are eligible) + * or in the fast path queue. + */ + if (q->flows != q->inactive_flows + q->throttled_flows) + return false; + + /* Do not allow fast path queue to explode, we want Fair Queue mode + * under pressure. + */ + if (q->internal.qlen >= 8) + return false; + } + + sk = skb->sk; + if (sk && sk_fullsock(sk) && !sk_is_tcp(sk) && + sk->sk_max_pacing_rate != ~0UL) + return false; + + if (q->flow_max_rate != ~0UL) + return false; + + return true; +} + +static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb, + u64 now) +{ + struct fq_sched_data *q = qdisc_priv(sch); struct rb_node **p, *parent; struct sock *sk = skb->sk; struct rb_root *root; struct fq_flow *f; - /* warning: no starvation prevention... */ - if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) - return &q->internal; - /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket * or a listener (SYNCOOKIE mode) * 1) request sockets are not full blown, @@ -299,11 +381,18 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) sk = (struct sock *)((hash << 1) | 1UL); } + if (fq_fastpath_check(sch, skb, now)) { + q->internal.stat_fastpath_packets++; + if (skb->sk == sk && q->rate_enable && + READ_ONCE(sk->sk_pacing_status) != SK_PACING_FQ) + smp_store_release(&sk->sk_pacing_status, + SK_PACING_FQ); + return &q->internal; + } + root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)]; - if (q->flows >= (2U << q->fq_trees_log) && - q->inactive_flows > q->flows/2) - fq_gc(q, root, sk); + fq_gc(q, root, sk); p = &root->rb_node; parent = NULL; @@ -396,7 +485,6 @@ static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow, { fq_erase_head(sch, flow, skb); skb_mark_not_on_list(skb); - flow->qlen--; qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; } @@ -434,9 +522,9 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) } static bool fq_packet_beyond_horizon(const struct sk_buff *skb, - const struct fq_sched_data *q) + const struct fq_sched_data *q, u64 now) { - return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon)); + return unlikely((s64)skb->tstamp > (s64)(now + q->horizon)); } static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, @@ -444,53 +532,57 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, { struct fq_sched_data *q = qdisc_priv(sch); struct fq_flow *f; + u64 now; + u8 band; - if (unlikely(sch->q.qlen >= sch->limit)) + band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX); + if (unlikely(q->band_pkt_count[band] >= sch->limit)) { + q->stat_band_drops[band]++; return qdisc_drop(skb, sch, to_free); + } + now = ktime_get_ns(); if (!skb->tstamp) { - fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns(); + fq_skb_cb(skb)->time_to_send = now; } else { - /* Check if packet timestamp is too far in the future. - * Try first if our cached value, to avoid ktime_get_ns() - * cost in most cases. - */ - if (fq_packet_beyond_horizon(skb, q)) { - /* Refresh our cache and check another time */ - q->ktime_cache = ktime_get_ns(); - if (fq_packet_beyond_horizon(skb, q)) { - if (q->horizon_drop) { + /* Check if packet timestamp is too far in the future. */ + if (fq_packet_beyond_horizon(skb, q, now)) { + if (q->horizon_drop) { q->stat_horizon_drops++; return qdisc_drop(skb, sch, to_free); - } - q->stat_horizon_caps++; - skb->tstamp = q->ktime_cache + q->horizon; } + q->stat_horizon_caps++; + skb->tstamp = now + q->horizon; } fq_skb_cb(skb)->time_to_send = skb->tstamp; } - f = fq_classify(skb, q); - if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { - q->stat_flows_plimit++; - return qdisc_drop(skb, sch, to_free); - } + f = fq_classify(sch, skb, now); - f->qlen++; - qdisc_qstats_backlog_inc(sch, skb); - if (fq_flow_is_detached(f)) { - fq_flow_add_tail(&q->new_flows, f); - if (time_after(jiffies, f->age + q->flow_refill_delay)) - f->credit = max_t(u32, f->credit, q->quantum); - q->inactive_flows--; + if (f != &q->internal) { + if (unlikely(f->qlen >= q->flow_plimit)) { + q->stat_flows_plimit++; + return qdisc_drop(skb, sch, to_free); + } + + if (fq_flow_is_detached(f)) { + fq_flow_add_tail(q, f, NEW_FLOW); + if (time_after(jiffies, f->age + q->flow_refill_delay)) + f->credit = max_t(u32, f->credit, q->quantum); + } + + f->band = band; + q->band_pkt_count[band]++; + fq_skb_cb(skb)->band = band; + if (f->qlen == 0) + q->inactive_flows--; } + f->qlen++; /* Note: this overwrites f->age */ flow_queue_add(f, skb); - if (unlikely(f == &q->internal)) { - q->stat_internal_packets++; - } + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; @@ -523,13 +615,26 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now) } } +static struct fq_flow_head *fq_pband_head_select(struct fq_perband_flows *pband) +{ + if (pband->credit <= 0) + return NULL; + + if (pband->new_flows.first) + return &pband->new_flows; + + return pband->old_flows.first ? &pband->old_flows : NULL; +} + static struct sk_buff *fq_dequeue(struct Qdisc *sch) { struct fq_sched_data *q = qdisc_priv(sch); + struct fq_perband_flows *pband; struct fq_flow_head *head; struct sk_buff *skb; struct fq_flow *f; unsigned long rate; + int retry; u32 plen; u64 now; @@ -538,30 +643,38 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) skb = fq_peek(&q->internal); if (unlikely(skb)) { + q->internal.qlen--; fq_dequeue_skb(sch, &q->internal, skb); goto out; } - q->ktime_cache = now = ktime_get_ns(); + now = ktime_get_ns(); fq_check_throttled(q, now); + retry = 0; + pband = &q->band_flows[q->band_nr]; begin: - head = &q->new_flows; - if (!head->first) { - head = &q->old_flows; - if (!head->first) { - if (q->time_next_delayed_flow != ~0ULL) - qdisc_watchdog_schedule_range_ns(&q->watchdog, + head = fq_pband_head_select(pband); + if (!head) { + while (++retry <= FQ_BANDS) { + if (++q->band_nr == FQ_BANDS) + q->band_nr = 0; + pband = &q->band_flows[q->band_nr]; + pband->credit = min(pband->credit + pband->quantum, + pband->quantum); + goto begin; + } + if (q->time_next_delayed_flow != ~0ULL) + qdisc_watchdog_schedule_range_ns(&q->watchdog, q->time_next_delayed_flow, q->timer_slack); - return NULL; - } + return NULL; } f = head->first; - + retry = 0; if (f->credit <= 0) { f->credit += q->quantum; head->first = f->next; - fq_flow_add_tail(&q->old_flows, f); + fq_flow_add_tail(q, f, OLD_FLOW); goto begin; } @@ -581,20 +694,23 @@ begin: INET_ECN_set_ce(skb); q->stat_ce_mark++; } + if (--f->qlen == 0) + q->inactive_flows++; + q->band_pkt_count[fq_skb_cb(skb)->band]--; fq_dequeue_skb(sch, f, skb); } else { head->first = f->next; /* force a pass through old_flows to prevent starvation */ - if ((head == &q->new_flows) && q->old_flows.first) { - fq_flow_add_tail(&q->old_flows, f); + if (head == &pband->new_flows) { + fq_flow_add_tail(q, f, OLD_FLOW); } else { fq_flow_set_detached(f); - q->inactive_flows++; } goto begin; } plen = qdisc_pkt_len(skb); f->credit -= plen; + pband->credit -= plen; if (!q->rate_enable) goto out; @@ -607,7 +723,7 @@ begin: */ if (!skb->tstamp) { if (skb->sk) - rate = min(skb->sk->sk_pacing_rate, rate); + rate = min(READ_ONCE(skb->sk->sk_pacing_rate), rate); if (rate <= q->low_rate_threshold) { f->credit = 0; @@ -686,8 +802,10 @@ static void fq_reset(struct Qdisc *sch) kmem_cache_free(fq_flow_cachep, f); } } - q->new_flows.first = NULL; - q->old_flows.first = NULL; + for (idx = 0; idx < FQ_BANDS; idx++) { + q->band_flows[idx].new_flows.first = NULL; + q->band_flows[idx].old_flows.first = NULL; + } q->delayed = RB_ROOT; q->flows = 0; q->inactive_flows = 0; @@ -779,7 +897,7 @@ static int fq_resize(struct Qdisc *sch, u32 log) return 0; } -static struct netlink_range_validation iq_range = { +static const struct netlink_range_validation iq_range = { .max = INT_MAX, }; @@ -801,8 +919,77 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 }, [TCA_FQ_HORIZON] = { .type = NLA_U32 }, [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 }, + [TCA_FQ_PRIOMAP] = { + .type = NLA_BINARY, + .len = sizeof(struct tc_prio_qopt), + }, + [TCA_FQ_WEIGHTS] = { + .type = NLA_BINARY, + .len = FQ_BANDS * sizeof(s32), + }, }; +/* compress a u8 array with all elems <= 3 to an array of 2-bit fields */ +static void fq_prio2band_compress_crumb(const u8 *in, u8 *out) +{ + const int num_elems = TC_PRIO_MAX + 1; + int i; + + memset(out, 0, num_elems / 4); + for (i = 0; i < num_elems; i++) + out[i / 4] |= in[i] << (2 * (i & 0x3)); +} + +static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out) +{ + const int num_elems = TC_PRIO_MAX + 1; + int i; + + for (i = 0; i < num_elems; i++) + out[i] = fq_prio2band(in, i); +} + +static int fq_load_weights(struct fq_sched_data *q, + const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + s32 *weights = nla_data(attr); + int i; + + for (i = 0; i < FQ_BANDS; i++) { + if (weights[i] < FQ_MIN_WEIGHT) { + NL_SET_ERR_MSG_FMT_MOD(extack, "Weight %d less that minimum allowed %d", + weights[i], FQ_MIN_WEIGHT); + return -EINVAL; + } + } + for (i = 0; i < FQ_BANDS; i++) + q->band_flows[i].quantum = weights[i]; + return 0; +} + +static int fq_load_priomap(struct fq_sched_data *q, + const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const struct tc_prio_qopt *map = nla_data(attr); + int i; + + if (map->bands != FQ_BANDS) { + NL_SET_ERR_MSG_MOD(extack, "FQ only supports 3 bands"); + return -EINVAL; + } + for (i = 0; i < TC_PRIO_MAX + 1; i++) { + if (map->priomap[i] >= FQ_BANDS) { + NL_SET_ERR_MSG_FMT_MOD(extack, "FQ priomap field %d maps to a too high band %d", + i, map->priomap[i]); + return -EINVAL; + } + } + fq_prio2band_compress_crumb(map->priomap, q->prio2band); + return 0; +} + static int fq_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -877,6 +1064,12 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, q->flow_refill_delay = usecs_to_jiffies(usecs_delay); } + if (!err && tb[TCA_FQ_PRIOMAP]) + err = fq_load_priomap(q, tb[TCA_FQ_PRIOMAP], extack); + + if (!err && tb[TCA_FQ_WEIGHTS]) + err = fq_load_weights(q, tb[TCA_FQ_WEIGHTS], extack); + if (tb[TCA_FQ_ORPHAN_MASK]) q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]); @@ -928,7 +1121,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct fq_sched_data *q = qdisc_priv(sch); - int err; + int i, err; sch->limit = 10000; q->flow_plimit = 100; @@ -938,8 +1131,13 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, q->flow_max_rate = ~0UL; q->time_next_delayed_flow = ~0ULL; q->rate_enable = 1; - q->new_flows.first = NULL; - q->old_flows.first = NULL; + for (i = 0; i < FQ_BANDS; i++) { + q->band_flows[i].new_flows.first = NULL; + q->band_flows[i].old_flows.first = NULL; + } + q->band_flows[0].quantum = 9 << 16; + q->band_flows[1].quantum = 3 << 16; + q->band_flows[2].quantum = 1 << 16; q->delayed = RB_ROOT; q->fq_root = NULL; q->fq_trees_log = ilog2(1024); @@ -954,6 +1152,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, /* Default ce_threshold of 4294 seconds */ q->ce_threshold = (u64)NSEC_PER_USEC * ~0U; + fq_prio2band_compress_crumb(sch_default_prio2band, q->prio2band); qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC); if (opt) @@ -968,8 +1167,12 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct fq_sched_data *q = qdisc_priv(sch); u64 ce_threshold = q->ce_threshold; + struct tc_prio_qopt prio = { + .bands = FQ_BANDS, + }; u64 horizon = q->horizon; struct nlattr *opts; + s32 weights[3]; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) @@ -999,6 +1202,16 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop)) goto nla_put_failure; + fq_prio2band_decompress_crumb(q->prio2band, prio.priomap); + if (nla_put(skb, TCA_FQ_PRIOMAP, sizeof(prio), &prio)) + goto nla_put_failure; + + weights[0] = q->band_flows[0].quantum; + weights[1] = q->band_flows[1].quantum; + weights[2] = q->band_flows[2].quantum; + if (nla_put(skb, TCA_FQ_WEIGHTS, sizeof(weights), &weights)) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: @@ -1009,11 +1222,15 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct fq_sched_data *q = qdisc_priv(sch); struct tc_fq_qd_stats st; + int i; + + st.pad = 0; sch_tree_lock(sch); st.gc_flows = q->stat_gc_flows; - st.highprio_packets = q->stat_internal_packets; + st.highprio_packets = 0; + st.fastpath_packets = q->internal.stat_fastpath_packets; st.tcp_retrans = 0; st.throttled = q->stat_throttled; st.flows_plimit = q->stat_flows_plimit; @@ -1029,6 +1246,10 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.ce_mark = q->stat_ce_mark; st.horizon_drops = q->stat_horizon_drops; st.horizon_caps = q->stat_horizon_caps; + for (i = 0; i < FQ_BANDS; i++) { + st.band_drops[i] = q->stat_band_drops[i]; + st.band_pkt_count[i] = q->band_pkt_count[i]; + } sch_tree_unlock(sch); return gnet_stats_copy_app(d, &st, sizeof(st)); @@ -1056,7 +1277,7 @@ |