/*
* Interface for controlling IO bandwidth on a request queue
*
* Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include <linux/blk-cgroup.h>
#include "blk.h"
/* Max dispatch from a group in 1 round */
static int throtl_grp_quantum = 8;
/* Total max dispatch from all groups in one round */
static int throtl_quantum = 32;
/* Throttling is performed over 100ms slice and after that slice is renewed */
static unsigned long throtl_slice = HZ/10; /* 100 ms */
static struct blkcg_policy blkcg_policy_throtl;
/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
/*
* To implement hierarchical throttling, throtl_grps form a tree and bios
* are dispatched upwards level by level until they reach the top and get
* issued. When dispatching bios from the children and local group at each
* level, if the bios are dispatched into a single bio_list, there's a risk
* of a local or child group which can queue many bios at once filling up
* the list starving others.
*
* To avoid such starvation, dispatched bios are queued separately
* according to where they came from. When they are again dispatched to
* the parent, they're popped in round-robin order so that no single source
* hogs the dispatch window.
*
* throtl_qnode is used to keep the queued bios separated by their sources.
* Bios are queued to throtl_qnode which in turn is queued to
* throtl_service_queue and then dispatched in round-robin order.
*
* It's also used to track the reference counts on blkg's. A qnode always
* belongs to a throtl_grp and gets queued on itself or the parent, so
* incrementing the reference of the associated throtl_grp when a qnode is
* queued and decrementing when dequeued is enough to keep the whole blkg
* tree pinned while bios are in flight.
*/
struct throtl_qnode {
struct list_head node; /* service_queue->queued[] */
struct bio_list bios; /* queued bios */
struct throtl_grp *tg; /* tg this qnode belongs to */
};
struct throtl_service_queue {
struct throtl_service_queue *parent_sq; /* the parent service_queue */
/*
* Bios queued directly to this service_queue or dispatched from
* children throtl_grp's.
*/
struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
unsigned int nr_queued[2]; /* number of queued bios */
/*
* RB tree of active children throtl_grp's, which are sorted by
* their ->disptime.
*/
struct rb_root pending_tree; /* RB tree of active tgs */
struct rb_node *first_pending; /* first node in the tree */
unsigned int nr_pending; /* # queued in the tree */
unsigned long first_pending_disptime; /* disptime of the first tg */
struct timer_list pending_timer; /* fires on first_pending_disptime */
};
enum tg_state_flags {
THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
};
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
struct throtl_grp {
/* must be the first member */
struct blkg_policy_data pd;
/* active throtl group service_queue member */
struct rb_node rb_node;
/* throtl_data this group belongs to */
struct throtl_data *td;
/* this group's service queue */
struct throtl_service_queue service_queue;
/*
* qnode_on_self is used when bios are directly queued to this
* throtl_grp so that local bios compete fairly with bios
* dispatched from children. qnode_on_parent is used when bios are
* dispatched from this throtl_grp into its parent and will compete
* with the sibling qnode_on_parents and the parent's
* qnode_on_self.
*/
struct throtl_qnode qnode_on_self[2];
struct throtl_qnode qnode_on_parent[2];
/*
* Dispatch time in jiffies. This is the estimated time when group
* will unthrottle and is ready to dispatch more bio. It is used as
* key to sort active groups in service tree.
*/
unsigned long disptime;
unsigned int flags;
/* are there any throtl rules between this group and td? */
bool has_rules[2];
/* bytes per second rate limits */
uint64_t bps[2];
/* IOPS limits */
unsigned int iops[2];
/* Number of bytes disptached in current slice */
uint64_t bytes_disp[2];
/* Number of bio's dispatched in current slice */
unsigned int io_disp[2];
/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
};
struct throtl_data
{
/* service tree for active throtl groups */
struct throtl_service_queue service_queue;
struct request_queue *queue;
/* Total Number of queued bios on READ and WRITE lists */
unsigned int nr_queued[2];
/* Work for dispatching throttled bios */
struct work_struct dispatch_work;
};
static void throtl_pending_timer_fn(unsigned long arg);
static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{
return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
}
static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
{
return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
}
static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
{
return pd_to_blkg(&tg->pd);
}
/**
* sq_to_tg - return the throl_grp the specified service queue belongs to
* @sq: the throtl_service_queue of interest
*
* Return the throtl_grp @sq belongs to. If @sq is the top-level one
* embedded in throtl_data, %NULL is returned.
*/
static struct throtl_grp *sq_to_tg(struct
|