/*
* net/sched/sch_qfq.c Quick Fair Queueing Plus Scheduler.
*
* Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
* Copyright (c) 2012 Paolo Valente.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/pkt_sched.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
/* Quick Fair Queueing Plus
========================
Sources:
[1] Paolo Valente,
"Reducing the Execution Time of Fair-Queueing Schedulers."
http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf
Sources for QFQ:
[2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient
Packet Scheduling with Tight Bandwidth Distribution Guarantees."
See also:
http://retis.sssup.it/~fabio/linux/qfq/
*/
/*
QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES
classes. Each aggregate is timestamped with a virtual start time S
and a virtual finish time F, and scheduled according to its
timestamps. S and F are computed as a function of a system virtual
time function V. The classes within each aggregate are instead
scheduled with DRR.
To speed up operations, QFQ+ also divides aggregates into a limited
number of groups. Which group a class belongs to depends on the
ratio between the maximum packet length for the class and the weight
of the class. Groups have their own S and F. In the end, QFQ+
schedules groups, then aggregates within groups, then classes within
aggregates. See [1] and [2] for a full description.
Virtual time computations.
S, F and V are all computed in fixed point arithmetic with
FRAC_BITS fractional bits.
QFQ_MAX_INDEX is the maximum index allowed for a group. We need
one bit per index.
QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
The layout of the bits is as below:
   [ MTU_SHIFT ][      FRAC_BITS    ]
   [ MAX_INDEX    ][ MIN_SLOT_SHIFT ]
                  ^.__grp->index = 0
                  *.__grp->slot_shift
where MIN_SLOT_SHIFT is derived by difference from the others.
The max group index corresponds to Lmax/w_min, where
Lmax=1<<MTU_SHIFT, w_min = 1 .
From this, and knowing how many groups (MAX_INDEX) we want,
we can derive the shift corresponding to each group.
Because we often need to compute
F = S + len/w_i and V = V + len/wsum
instead of storing w_i, we store the value
inv_w = (1<<FRAC_BITS)/w_i
so we can do F = S + len * inv_w * wsum.
We use W_TOT in the formulas so we can easily move between
static and adaptive weight sum.
The per-scheduler-instance data contain all the data structures
for the scheduler: bitmaps and bucket lists.
*/
/*
* Maximum number of consecutive slots occupied by backlogged classes
* inside a group. It is also the length of the slots[] array in
* struct qfq_group.
*/
#define QFQ_MAX_SLOTS 32
/*
* Shifts used for aggregate<->group mapping. We allow class weights that are
* in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the
* group with the smallest index that can support the L_i / r_i configured
* for the classes in the aggregate.
*
* grp->index is the index of the group; and grp->slot_shift
* is the shift for the corresponding (scaled) sigma_i.
*/
/* Maximum index allowed for a group; struct qfq_sched holds QFQ_MAX_INDEX + 1 groups. */
#define QFQ_MAX_INDEX 24
/* Maximum power of two supported as a class weight. */
#define QFQ_MAX_WSHIFT 10
#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */
/* NOTE(review): presumably the upper bound on the sum of all class weights - confirm. */
#define QFQ_MAX_WSUM (64*QFQ_MAX_WEIGHT)
#define FRAC_BITS 30 /* fixed point arithmetic */
/* The value 1.0 in fixed point, i.e. 1 shifted left by FRAC_BITS. */
#define ONE_FP (1UL << FRAC_BITS)
#define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */
#define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */
#define QFQ_MAX_AGG_CLASSES 8 /* max num classes per aggregate allowed */
/*
* Possible group states. These values are used as indexes for the bitmaps
* array of struct qfq_queue.
*
* QFQ_MAX_STATE is not a state itself: it is the number of states and is
* used as the length of the bitmaps[] array in struct qfq_sched.
*
* NOTE(review): ER/IR/EB/IB presumably stand for Eligible/Ineligible
* combined with Ready/Blocked, as in the QFQ paper [2] - confirm.
*/
enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
/*
* Forward declarations: both types are referenced through pointers
* (qfq_class.agg, qfq_aggregate.grp) before they are defined.
*/
struct qfq_group;
struct qfq_aggregate;
/*
* Per-class state. A class belongs to one parent aggregate (agg), is linked
* on an active-classes list through alist, and carries its own DRR deficit
* counter.
*/
struct qfq_class {
/* Embedded entry; qfq_find_class() maps it back to the class with container_of(). */
struct Qdisc_class_common common;
/* NOTE(review): presumably the number of filters bound to this class - confirm. */
unsigned int filter_cnt;
/* Per-class statistics and rate estimator. */
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
struct net_rate_estimator __rcu *rate_est;
/* NOTE(review): presumably the child qdisc holding this class's packets - confirm. */
struct Qdisc *qdisc;
struct list_head alist; /* Link for active-classes list. */
struct qfq_aggregate *agg; /* Parent aggregate. */
int deficit; /* DRR deficit counter. */
};
/*
* An aggregate: a set of classes that share the same weight (class_weight)
* and the same maximum packet size (lmax). The aggregate carries the virtual
* start and finish timestamps S and F, a service budget, and the DRR queue
* of its active classes.
*/
struct qfq_aggregate {
struct hlist_node next; /* Link for the slot list. */
u64 S, F; /* flow timestamps (exact) */
/* group we belong to. In principle we would need the index,
* which is log_2(lmax/weight), but we never reference it
* directly, only the group.
*/
struct qfq_group *grp;
/* these are copied from the flowset. */
u32 class_weight; /* Weight of each class in this aggregate. */
/* Max pkt size for the classes in this aggregate, DRR quantum. */
int lmax;
u32 inv_w; /* ONE_FP/(sum of weights of classes in aggr.). */
u32 budgetmax; /* Max budget for this aggregate. */
u32 initial_budget, budget; /* Initial and current budget. */
int num_classes; /* Number of classes in this aggr. */
struct list_head active; /* DRR queue of active classes. */
struct hlist_node nonfull_next; /* See nonfull_aggs in qfq_sched. */
};
/*
* A group of aggregates. Each group has its own (approximate) timestamps
* S and F and keeps its active aggregates in an array of QFQ_MAX_SLOTS
* slot lists.
*/
struct qfq_group {
u64 S, F; /* group timestamps (approx). */
unsigned int slot_shift; /* Slot shift. */
unsigned int index; /* Group index. */
unsigned int front; /* Index of the front slot. */
/* NOTE(review): presumably one bit per entry of slots[] - confirm. */
unsigned long full_slots; /* non-empty slots */
/* Array of RR lists of active aggregates. */
struct hlist_head slots[QFQ_MAX_SLOTS];
};
/*
* Per-scheduler-instance data: the class hash, the system virtual time,
* the group bitmaps (one per enum qfq_state value) and the groups
* themselves.
*/
struct qfq_sched {
/* NOTE(review): presumably the classifier chain and its tcf block - confirm. */
struct tcf_proto __rcu *filter_list;
struct tcf_block *block;
/* Class hash; qfq_find_class() searches it by classid. */
struct Qdisc_class_hash clhash;
u64 oldV, V; /* Precise virtual times. */
struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */
u32 wsum; /* weight sum */
u32 iwsum; /* inverse weight sum */
unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */
struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
u32 min_slot_shift; /* Index of the group-0 bit in the bitmaps. */
u32 max_agg_classes; /* Max number of classes per aggr. */
struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */
};
/*
* Possible reasons why the timestamps of an aggregate are updated:
* enqueue: the aggregate switches from idle to active and must be
* scheduled for service
* requeue: the aggregate finishes its budget, so it stops being served and
* must be rescheduled for service
*/
enum update_reason {enqueue, requeue};
/*
* Look up the class identified by classid in the class hash of the
* scheduler attached to sch.
*
* Returns the enclosing struct qfq_class of the hash entry that was found,
* or NULL when the hash holds no entry for classid.
*/
static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
{
	struct qfq_sched *sched = qdisc_priv(sch);
	struct Qdisc_class_common *hit;

	hit = qdisc_class_find(&sched->clhash, classid);

	/* The hash stores the embedded 'common' member; step back to the class. */
	return hit ? container_of(hit, struct qfq_class, common) : NULL;
}
/*
* Attribute policy for the QFQ class options: both TCA_QFQ_WEIGHT and
* TCA_QFQ_LMAX are declared as u32 attributes. The table has one entry
* per attribute type up to and including TCA_QFQ_MAX.
*/
static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
[TCA_QFQ_WEIGHT] = { .type = NLA_U32 },
[TCA_QFQ_LMAX] = { .type = NLA_U32 },
};
/*
* Calculate a flow index, given its weight and maximum packet length.
* index = log_2(maxlen/weight) but we need to apply the scaling.
* This is used only once at flow creation.
*/
static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift)
{
u64 slot_size = (u64)maxlen * inv_w;
unsigned long size_map;
int index = 0;
size_map = slot_size >> min_slot_shift;
if (!size_map)
goto out;
index = __fls(size_map) + 1; /* basic
|