diff options
author | Joe Thornber <ejt@redhat.com> | 2016-02-10 10:18:10 +0000 |
---|---|---|
committer | Mike Snitzer <snitzer@redhat.com> | 2016-03-10 17:12:08 -0500 |
commit | 9ed84698fdda63de93c68150c4f63673cc3d7b54 (patch) | |
tree | 3722d66694317bf32b99444fd31a52ec7e5825ed | |
parent | e233d800a9648f2c0802aa23250d9c8af57bab43 (diff) | |
download | linux-9ed84698fdda63de93c68150c4f63673cc3d7b54.tar.gz linux-9ed84698fdda63de93c68150c4f63673cc3d7b54.tar.bz2 linux-9ed84698fdda63de93c68150c4f63673cc3d7b54.zip |
dm cache: make the 'mq' policy an alias for 'smq'
smq seems to be performing better than the old mq policy in all
situations, as well as using a quarter of the memory.
Make 'mq' an alias for 'smq' when choosing a cache policy. The tunables
that were present for the old mq are faked, and have no effect. mq
should be considered deprecated now.
Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
-rw-r--r-- | Documentation/device-mapper/cache-policies.txt | 39 | ||||
-rw-r--r-- | drivers/md/Kconfig | 10 | ||||
-rw-r--r-- | drivers/md/Makefile | 2 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy-mq.c | 1473 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy-smq.c | 92 |
5 files changed, 87 insertions, 1529 deletions
diff --git a/Documentation/device-mapper/cache-policies.txt b/Documentation/device-mapper/cache-policies.txt index d9246a32e673..e5062ad18717 100644 --- a/Documentation/device-mapper/cache-policies.txt +++ b/Documentation/device-mapper/cache-policies.txt @@ -28,51 +28,16 @@ Overview of supplied cache replacement policies multiqueue (mq) --------------- -This policy has been deprecated in favor of the smq policy (see below). +This policy is now an alias for smq (see below). -The multiqueue policy has three sets of 16 queues: one set for entries -waiting for the cache and another two for those in the cache (a set for -clean entries and a set for dirty entries). +The following tunables are accepted, but have no effect: -Cache entries in the queues are aged based on logical time. Entry into -the cache is based on variable thresholds and queue selection is based -on hit count on entry. The policy aims to take different cache miss -costs into account and to adjust to varying load patterns automatically. - -Message and constructor argument pairs are: 'sequential_threshold <#nr_sequential_ios>' 'random_threshold <#nr_random_ios>' 'read_promote_adjustment <value>' 'write_promote_adjustment <value>' 'discard_promote_adjustment <value>' -The sequential threshold indicates the number of contiguous I/Os -required before a stream is treated as sequential. Once a stream is -considered sequential it will bypass the cache. The random threshold -is the number of intervening non-contiguous I/Os that must be seen -before the stream is treated as random again. - -The sequential and random thresholds default to 512 and 4 respectively. - -Large, sequential I/Os are probably better left on the origin device -since spindles tend to have good sequential I/O bandwidth. The -io_tracker counts contiguous I/Os to try to spot when the I/O is in one -of these sequential modes. But there are use-cases for wanting to -promote sequential blocks to the cache (e.g. fast application startup). -If sequential threshold is set to 0 the sequential I/O detection is -disabled and sequential I/O will no longer implicitly bypass the cache. -Setting the random threshold to 0 does _not_ disable the random I/O -stream detection. - -Internally the mq policy determines a promotion threshold. If the hit -count of a block not in the cache goes above this threshold it gets -promoted to the cache. The read, write and discard promote adjustment -tunables allow you to tweak the promotion threshold by adding a small -value based on the io type. They default to 4, 8 and 1 respectively. -If you're trying to quickly warm a new cache device you may wish to -reduce these to encourage promotion. Remember to switch them back to -their defaults after the cache fills though. - Stochastic multiqueue (smq) --------------------------- diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 0a2e7273db9e..2ae40913bbc6 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -304,16 +304,6 @@ config DM_CACHE algorithms used to select which blocks are promoted, demoted, cleaned etc. It supports writeback and writethrough modes. -config DM_CACHE_MQ - tristate "MQ Cache Policy (EXPERIMENTAL)" - depends on DM_CACHE - default y - ---help--- - A cache policy that uses a multiqueue ordered by recent hit - count to select which blocks should be promoted and demoted. - This is meant to be a general purpose policy. It prioritises - reads over writes. - config DM_CACHE_SMQ tristate "Stochastic MQ Cache Policy (EXPERIMENTAL)" depends on DM_CACHE diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 62a65764e8e0..52ba8dd82821 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -12,7 +12,6 @@ dm-log-userspace-y \ += dm-log-userspace-base.o dm-log-userspace-transfer.o dm-thin-pool-y += dm-thin.o dm-thin-metadata.o dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o -dm-cache-mq-y += dm-cache-policy-mq.o dm-cache-smq-y += dm-cache-policy-smq.o dm-cache-cleaner-y += dm-cache-policy-cleaner.o dm-era-y += dm-era-target.o @@ -55,7 +54,6 @@ obj-$(CONFIG_DM_RAID) += dm-raid.o obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o obj-$(CONFIG_DM_VERITY) += dm-verity.o obj-$(CONFIG_DM_CACHE) += dm-cache.o -obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o obj-$(CONFIG_DM_ERA) += dm-era.o diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c deleted file mode 100644 index ddb26980cd66..000000000000 --- a/drivers/md/dm-cache-policy-mq.c +++ /dev/null @@ -1,1473 +0,0 @@ -/* - * Copyright (C) 2012 Red Hat. All rights reserved. - * - * This file is released under the GPL. - */ - -#include "dm-cache-policy.h" -#include "dm.h" - -#include <linux/hash.h> -#include <linux/jiffies.h> -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> - -#define DM_MSG_PREFIX "cache-policy-mq" - -static struct kmem_cache *mq_entry_cache; - -/*----------------------------------------------------------------*/ - -static unsigned next_power(unsigned n, unsigned min) -{ - return roundup_pow_of_two(max(n, min)); -} - -/*----------------------------------------------------------------*/ - -/* - * Large, sequential ios are probably better left on the origin device since - * spindles tend to have good bandwidth. - * - * The io_tracker tries to spot when the io is in one of these sequential - * modes. - * - * Two thresholds to switch between random and sequential io mode are defaulting - * as follows and can be adjusted via the constructor and message interfaces. - */ -#define RANDOM_THRESHOLD_DEFAULT 4 -#define SEQUENTIAL_THRESHOLD_DEFAULT 512 - -enum io_pattern { - PATTERN_SEQUENTIAL, - PATTERN_RANDOM -}; - -struct io_tracker { - enum io_pattern pattern; - - unsigned nr_seq_samples; - unsigned nr_rand_samples; - unsigned thresholds[2]; - - dm_oblock_t last_end_oblock; -}; - -static void iot_init(struct io_tracker *t, - int sequential_threshold, int random_threshold) -{ - t->pattern = PATTERN_RANDOM; - t->nr_seq_samples = 0; - t->nr_rand_samples = 0; - t->last_end_oblock = 0; - t->thresholds[PATTERN_RANDOM] = random_threshold; - t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold; -} - -static enum io_pattern iot_pattern(struct io_tracker *t) -{ - return t->pattern; -} - -static void iot_update_stats(struct io_tracker *t, struct bio *bio) -{ - if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1) - t->nr_seq_samples++; - else { - /* - * Just one non-sequential IO is enough to reset the - * counters. - */ - if (t->nr_seq_samples) { - t->nr_seq_samples = 0; - t->nr_rand_samples = 0; - } - - t->nr_rand_samples++; - } - - t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1); -} - -static void iot_check_for_pattern_switch(struct io_tracker *t) -{ - switch (t->pattern) { - case PATTERN_SEQUENTIAL: - if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) { - t->pattern = PATTERN_RANDOM; - t->nr_seq_samples = t->nr_rand_samples = 0; - } - break; - - case PATTERN_RANDOM: - if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) { - t->pattern = PATTERN_SEQUENTIAL; - t->nr_seq_samples = t->nr_rand_samples = 0; - } - break; - } -} - -static void iot_examine_bio(struct io_tracker *t, struct bio *bio) -{ - iot_update_stats(t, bio); - iot_check_for_pattern_switch(t); -} - -/*----------------------------------------------------------------*/ - - -/* - * This queue is divided up into different levels. Allowing us to push - * entries to the back of any of the levels. Think of it as a partially - * sorted queue. - */ -#define NR_QUEUE_LEVELS 16u -#define NR_SENTINELS NR_QUEUE_LEVELS * 3 - -#define WRITEBACK_PERIOD HZ - -struct queue { - unsigned nr_elts; - bool current_writeback_sentinels; - unsigned long next_writeback; - struct list_head qs[NR_QUEUE_LEVELS]; - struct list_head sentinels[NR_SENTINELS]; -}; - -static void queue_init(struct queue *q) -{ - unsigned i; - - q->nr_elts = 0; - q->current_writeback_sentinels = false; - q->next_writeback = 0; - for (i = 0; i < NR_QUEUE_LEVELS; i++) { - INIT_LIST_HEAD(q->qs + i); - INIT_LIST_HEAD(q->sentinels + i); - INIT_LIST_HEAD(q->sentinels + NR_QUEUE_LEVELS + i); - INIT_LIST_HEAD(q->sentinels + (2 * NR_QUEUE_LEVELS) + i); - } -} - -static unsigned queue_size(struct queue *q) -{ - return q->nr_elts; -} - -static bool queue_empty(struct queue *q) -{ - return q->nr_elts == 0; -} - -/* - * Insert an entry to the back of the given level. - */ -static void queue_push(struct queue *q, unsigned level, struct list_head *elt) -{ - q->nr_elts++; - list_add_tail(elt, q->qs + level); -} - -static void queue_remove(struct queue *q, struct list_head *elt) -{ - q->nr_elts--; - list_del(elt); -} - -static bool is_sentinel(struct queue *q, struct list_head *h) -{ - return (h >= q->sentinels) && (h < (q->sentinels + NR_SENTINELS)); -} - -/* - * Gives us the oldest entry of the lowest popoulated level. If the first - * level is emptied then we shift down one level. - */ -static struct list_head *queue_peek(struct queue *q) -{ - unsigned level; - struct list_head *h; - - for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_for_each(h, q->qs + level) - if (!is_sentinel(q, h)) - return h; - - return NULL; -} - -static struct list_head *queue_pop(struct queue *q) -{ - struct list_head *r = queue_peek(q); - - if (r) { - q->nr_elts--; - list_del(r); - } - - return r; -} - -/* - * Pops an entry from a level that is not past a sentinel. - */ -static struct list_head *queue_pop_old(struct queue *q) -{ - unsigned level; - struct list_head *h; - - for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_for_each(h, q->qs + level) { - if (is_sentinel(q, h)) - break; - - q->nr_elts--; - list_del(h); - return h; - } - - return NULL; -} - -static struct list_head *list_pop(struct list_head *lh) -{ - struct list_head *r = lh->next; - - BUG_ON(!r); - list_del_init(r); - - return r; -} - -static struct list_head *writeback_sentinel(struct queue *q, unsigned level) -{ - if (q->current_writeback_sentinels) - return q->sentinels + NR_QUEUE_LEVELS + level; - else - return q->sentinels + 2 * NR_QUEUE_LEVELS + level; -} - -static void queue_update_writeback_sentinels(struct queue *q) -{ - unsigned i; - struct list_head *h; - - if (time_after(jiffies, q->next_writeback)) { - for (i = 0; i < NR_QUEUE_LEVELS; i++) { - h = writeback_sentinel(q, i); - list_del(h); - list_add_tail(h, q->qs + i); - } - - q->next_writeback = jiffies + WRITEBACK_PERIOD; - q->current_writeback_sentinels = !q->current_writeback_sentinels; - } -} - -/* - * Sometimes we want to iterate through entries that have been pushed since - * a certain event. We use sentinel entries on the queues to delimit these - * 'tick' events. - */ -static void queue_tick(struct queue *q) -{ - unsigned i; - - for (i = 0; i < NR_QUEUE_LEVELS; i++) { - list_del(q->sentinels + i); - list_add_tail(q->sentinels + i, q->qs + i); - } -} - -typedef void (*iter_fn)(struct list_head *, void *); -static void queue_iterate_tick(struct queue *q, iter_fn fn, void *context) -{ - unsigned i; - struct list_head *h; - - for (i = 0; i < NR_QUEUE_LEVELS; i++) { - list_for_each_prev(h, q->qs + i) { - if (is_sentinel(q, h)) - break; - - fn(h, context); - } - } -} - -/*----------------------------------------------------------------*/ - -/* - * Describes a cache entry. Used in both the cache and the pre_cache. - */ -struct entry { - struct hlist_node hlist; - struct list_head list; - dm_oblock_t oblock; - - /* - * FIXME: pack these better - */ - bool dirty:1; - unsigned hit_count; -}; - -/* - * Rather than storing the cblock in an entry, we allocate all entries in - * an array, and infer the cblock from the entry position. - * - * Free entries are linked together into a list. - */ -struct entry_pool { - struct entry *entries, *entries_end; - struct list_head free; - unsigned nr_allocated; -}; - -static int epool_init(struct entry_pool *ep, unsigned nr_entries) -{ - unsigned i; - - ep->entries = vzalloc(sizeof(struct entry) * nr_entries); - if (!ep->entries) - return -ENOMEM; - - ep->entries_end = ep->entries + nr_entries; - - INIT_LIST_HEAD(&ep->free); - for (i = 0; i < nr_entries; i++) - list_add(&ep->entries[i].list, &ep->free); - - ep->nr_allocated = 0; - - return 0; -} - -static void epool_exit(struct entry_pool *ep) -{ - vfree(ep->entries); -} - -static struct entry *alloc_entry(struct entry_pool *ep) -{ - struct entry *e; - - if (list_empty(&ep->free)) - return NULL; - - e = list_entry(list_pop(&ep->free), struct entry, list); - INIT_LIST_HEAD(&e->list); - INIT_HLIST_NODE(&e->hlist); - ep->nr_allocated++; - - return e; -} - -/* - * This assumes the cblock hasn't already been allocated. - */ -static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock) -{ - struct entry *e = ep->entries + from_cblock(cblock); - - list_del_init(&e->list); - INIT_HLIST_NODE(&e->hlist); - ep->nr_allocated++; - - return e; -} - -static void free_entry(struct entry_pool *ep, struct entry *e) -{ - BUG_ON(!ep->nr_allocated); - ep->nr_allocated--; - INIT_HLIST_NODE(&e->hlist); - list_add(&e->list, &ep->free); -} - -/* - * Returns NULL if the entry is free. - */ -static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock) -{ - struct entry *e = ep->entries + from_cblock(cblock); - return !hlist_unhashed(&e->hlist) ? e : NULL; -} - -static bool epool_empty(struct entry_pool *ep) -{ - return list_empty(&ep->free); -} - -static bool in_pool(struct entry_pool *ep, struct entry *e) -{ - return e >= ep->entries && e < ep->entries_end; -} - -static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e) -{ - return to_cblock(e - ep->entries); -} - -/*----------------------------------------------------------------*/ - -struct mq_policy { - struct dm_cache_policy policy; - - /* protects everything */ - struct mutex lock; - dm_cblock_t cache_size; - struct io_tracker tracker; - - /* - * Entries come from two pools, one of pre-cache entries, and one - * for the cache proper. - */ - struct entry_pool pre_cache_pool; - struct entry_pool cache_pool; - - /* - * We maintain three queues of entries. The cache proper, - * consisting of a clean and dirty queue, contains the currently - * active mappings. Whereas the pre_cache tracks blocks that - * are being hit frequently and potential candidates for promotion - * to the cache. - */ - struct queue pre_cache; - struct queue cache_clean; - struct queue cache_dirty; - - /* - * Keeps track of time, incremented by the core. We use this to - * avoid attributing multiple hits within the same tick. - * - * Access to tick_protected should be done with the spin lock held. - * It's copied to tick at the start of the map function (within the - * mutex). - */ - spinlock_t tick_lock; - unsigned tick_protected; - unsigned tick; - - /* - * A count of the number of times the map function has been called - * and found an entry in the pre_cache or cache. Currently used to - * calculate the generation. - */ - unsigned hit_count; - - /* - * A generation is a longish period that is used to trigger some - * book keeping effects. eg, decrementing hit counts on entries. - * This is needed to allow the cache to evolve as io patterns - * change. - */ - unsigned generation; - unsigned generation_period; /* in lookups (will probably change) */ - - unsigned discard_promote_adjustment; - unsigned read_promote_adjustment; - unsigned write_promote_adjustment; - - /* - * The hash table allows us to quickly find an entry by origin - * block. Both pre_cache and cache entries are in here. - */ - unsigned nr_buckets; - dm_block_t hash_bits; - struct hlist_head *table; -}; - -#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1 -#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4 -#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8 -#define DISCOURAGE_DEMOTING_DIRTY_THRESHOLD 128 - -/*----------------------------------------------------------------*/ - -/* - * Simple hash table implementation. Should replace with the standard hash - * table that's making its way upstream. - */ -static void hash_insert(struct mq_policy *mq, struct entry *e) -{ - unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits); - - hlist_add_head(&e->hlist, mq->table + h); -} - -static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock) -{ - unsigned h = hash_64(from_oblock(oblock), mq->hash_bits); - struct hlist_head *bucket = mq->table + h; - struct entry *e; - - hlist_for_each_entry(e, bucket, hlist) - if (e->oblock == oblock) { - hlist_del(&e->hlist); - hlist_add_head(&e->hlist, bucket); - return e; - } - - return NULL; -} - -static void hash_remove(struct entry *e) -{ - hlist_del(&e->hlist); -} - -/*----------------------------------------------------------------*/ - -static bool any_free_cblocks(struct mq_policy *mq) -{ - return !epool_empty(&mq->cache_pool); -} - -static bool any_clean_cblocks(struct mq_policy *mq) -{ - return !queue_empty(&mq->cache_clean); -} - -/*----------------------------------------------------------------*/ - -/* - * Now we get to the meat of the policy. This section deals with deciding - * when to to add entries to the pre_cache and cache, and move between - * them. - */ - -/* - * The queue level is based on the log2 of the hit count. - */ -static unsigned queue_level(struct entry *e) -{ - return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u); -} - -static bool in_cache(struct mq_policy *mq, struct entry *e) -{ - return in_pool(&mq->cache_pool, e); -} - -/* - * Inserts the entry into the pre_cache or the cache. Ensures the cache - * block is marked as allocated if necc. Inserts into the hash table. - * Sets the tick which records when the entry was last moved about. - */ -static void push(struct mq_policy *mq, struct entry *e) -{ - hash_insert(mq, e); - - if (in_cache(mq, e)) - queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean, - queue_level(e), &e->list); - else - queue_push(&mq->pre_cache, queue_level(e), &e->list); -} - -/* - * Removes an entry from pre_cache or cache. Removes from the hash table. - */ -static void del(struct mq_policy *mq, struct entry *e) -{ - if (in_cache(mq, e)) - queue_remove(e->dirty ? &mq->cache_dirty : &mq->cache_clean, &e->list); - else - queue_remove(&mq->pre_cache, &e->list); - - hash_remove(e); -} - -/* - * Like del, except it removes the first entry in the queue (ie. the least - * recently used). - */ -static struct entry *pop(struct mq_policy *mq, struct queue *q) -{ - struct entry *e; - struct list_head *h = queue_pop(q); - - if (!h) - return NULL; - - e = container_of(h, struct entry, list); - hash_remove(e); - - return e; -} - -static struct entry *pop_old(struct mq_policy *mq, struct queue *q) -{ - struct entry *e; - struct list_head *h = queue_pop_old(q); - - if (!h) - return NULL; - - e = container_of(h, struct entry, list); - hash_remove(e); - - return e; -} - -static struct entry *peek(struct queue *q) -{ - struct list_head *h = queue_peek(q); - return h ? container_of(h, struct entry, list) : NULL; -} - -/* - * The promotion threshold is adjusted every generation. As are the counts - * of the entries. - * - * At the moment the threshold is taken by averaging the hit counts of some - * of the entries in the cache (the first 20 entries across all levels in - * ascending order, giving preference to the clean entries at each level). - * - * We can be much cleverer than this though. For example, each promotion - * could bump up the threshold helping to prevent churn. Much more to do - * here. - */ - -#define MAX_TO_AVERAGE 20 - -static void check_generation(struct mq_policy *mq) -{ - unsigned total = 0, nr = 0, count = 0, level; - struct list_head *head; - struct entry *e; - - if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) { - mq->hit_count = 0; - mq->generation++; - - for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) { - head = mq->cache_clean.qs + level; - list_for_each_entry(e, head, list) { - nr++; - total += e->hit_count; - - if (++count >= MAX_TO_AVERAGE) - break; - } - - head = mq->cache_dirty.qs + level; - list_for_each_entry(e, head, list) { - nr++; - total += e->hit_count; - - if (++count >= MAX_TO_AVERAGE) - break; - } - } - } -} - -/* - * Whenever we use an entry we bump up it's hit counter, and push it to the - * back to it's current level. - */ -static void requeue(struct mq_policy *mq, struct entry *e) -{ - check_generation(mq); - del(mq, e); - push(mq, e); -} - -/* - * Demote the least recently used entry from the cache to the pre_cache. - * Returns the new cache entry to use, and the old origin block it was - * mapped to. - * - * We drop the hit count on the demoted entry back to 1 to stop it bouncing - * straight back into the cache if it's subsequently hit. There are - * various options here, and more experimentation would be good: - * - * - just forget about the demoted entry completely (ie. don't insert it - into the pre_cache). - * - divide the hit count rather that setting to some hard coded value. - * - set the hit count to a hard coded value other than 1, eg, is it better - * if it goes in at level 2? - */ -static int demote_cblock(struct mq_policy *mq, - struct policy_locker *locker, dm_oblock_t *oblock) -{ - struct entry *demoted = peek(&mq->cache_clean); - - if (!demoted) - /* - * We could get a block from mq->cache_dirty, but that - * would add extra latency to the triggering bio as it - * waits for the writeback. Better to not promote this - * time and hope there's a clean block next time this block - * is hit. - */ - return -ENOSPC; - - if (locker->fn(locker, demoted->oblock)) - /* - * We couldn't lock the demoted block. - */ - return -EBUSY; - - del(mq, demoted); - *oblock = demoted->oblock; - free_entry(&mq->cache_pool, demoted); - - /* - * We used to put the demoted block into the pre-cache, but I think - * it's simpler to just let it work it's way up from zero again. - * Stops blocks flickering in and out of the cache. - */ - - return 0; -} - -/* - * Entries in the pre_cache whose hit count passes the promotion - * threshold move to the cache proper. Working out the correct - * value for the promotion_threshold is crucial to this policy. - */ -static unsigned promote_threshold(struct mq_policy *mq) -{ - struct entry *e; - - if (any_free_cblocks(mq)) - return 0; - - e = peek(&mq->cache_clean); - if (e) - return e->hit_count; - - e = peek(&mq->cache_dirty); - if (e) - return e->hit_count + DISCOURAGE_DEMOTING_DIRTY_THRESHOLD; - - /* This should never happen */ - return 0; -} - -/* - * We modify the basic promotion_threshold depending on the specific io. - * - * If the origin block has been discarded then there's no cost to copy it - * to the cache. - * - * We bias towards reads, since they can be demoted at no cost if they - * haven't been dirtied. - */ -static unsigned adjusted_promote_threshold(struct mq_policy *mq, - bool discarded_oblock, int data_dir) -{ - if (data_dir == READ) - return promote_threshold(mq) + mq->read_promote_adjustment; - - if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) { - /* - * We don't need to do any copying at all, so give this a - * very low threshold. - */ - return mq->discard_promote_adjustment; - } - - return promote_threshold(mq) + mq->write_promote_adjustment; -} - -static bool should_promote(struct mq_policy *mq, struct entry *e, - bool discarded_oblock, int data_dir) -{ - return e->hit_count >= - adjusted_promote_threshold(mq, discarded_oblock, data_dir); -} - -static int cache_entry_found(struct mq_policy *mq, - struct entry *e, - struct policy_result *result) -{ - requeue(mq, e); - - if (in_cache(mq, e)) { - result->op = POLICY_HIT; - result->cblock = infer_cblock(&mq->cache_pool, e); - } - - return 0; -} - -/* - * Moves an entry from the pre_cache to the cache. The main work is - * finding which cache block to use. - */ -static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e, - struct policy_locker *locker, - struct policy_result *result) -{ - int r; - struct entry *new_e; - - /* Ensure there's a free cblock in the cache */ - if (epool_empty(&mq->cache_pool)) { - result->op = POLICY_REPLACE; - r = demote_cblock(mq, locker, &result->old_oblock); - if (r) { - result->op = POLICY_MISS; - return 0; - } - - } else - result->op = POLICY_NEW; - - new_e = alloc_entry(&mq->cache_pool); - BUG_ON(!new_e); - - new_e->oblock = e->oblock; - new_e->dirty = false; - new_e->hit_count = e->hit_count; - - del(mq, e); - free_entry(&mq->pre_cache_pool, e); - push(mq, new_e); - - result->cblock = infer_cblock(&mq->cache_pool, new_e); - - return 0; -} - -static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e, - bool can_migrate, bool discarded_oblock, - int data_dir, struct policy_locker *locker, - struct policy_result *result) -{ - int r = 0; - - if (!should_promote(mq, e, discarded_oblock, data_dir)) { - requeue(mq, e); - result->op = POLICY_MISS; - - } else if (!can_migrate) - r = -EWOULDBLOCK; - - else { - requeue(mq, e); - r = pre_cache_to_cache(mq, e, locker, result); - } - - return r; -} - -static void insert_in_pre_cache(struct mq_policy *mq, - dm_oblock_t oblock) -{ - struct entry *e = alloc_entry(&mq->pre_cache_pool); - - if (!e) - /* - * There's no spare entry structure, so we grab the least - * used one from the pre_cache. - */ - e = pop(mq, &mq->pre_cache); - - if (unlikely(!e)) { - DMWARN("couldn't pop from pre cache"); - return; - } - - e->dirty = false; - e->oblock = oblock; - e->hit_count = 1; - push(mq, e); -} - -static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, - struct policy_locker *locker, - struct policy_result *result) -{ - int r; - struct entry *e; - - if (epool_empty(&mq->cache_pool)) { - result->op = POLICY_REPLACE; - r = demote_cblock(mq, locker, &result->old_oblock); - if (unlikely(r)) { - result->op = POLICY_MISS; - insert_in_pre_cache(mq, oblock); - return; - } - - /* - * This will always succeed, since we've just demoted. - */ - e = alloc_entry(&mq->cache_pool); - BUG_ON(!e); - - } else { - e = alloc_entry(&mq->cache_pool); - result->op = POLICY_NEW; - } - - e->oblock = oblock; - e->dirty = false; - e->hit_count = 1; - push(mq, e); - - result->cblock = infer_cblock(&mq->cache_pool, e); -} - -static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock, - bool can_migrate, bool discarded_oblock, - int data_dir, struct policy_locker *locker, - struct policy_result *result) -{ - if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) { - if (can_migrate) - insert_in_cache(mq, oblock, locker, result); - else - return -EWOULDBLOCK; - } else { - insert_in_pre_cache(mq, oblock); - result->op = POLICY_MISS; - } - - return 0; -} - -/* - * Looks the oblock up in the hash table, then decides whether to put in - * pre_cache, or cache etc. - */ -static int map(struct mq_policy *mq, dm_oblock_t oblock, - bool can_migrate, bool discarded_oblock, - int data_dir, struct policy_locker *locker, - struct policy_result *result) -{ - int r = 0; - struct entry *e = hash_lookup(mq, oblock); - - if (e && in_cache(mq, e)) - r = cache_entry_found(mq, e, result); - - else if (mq->tracker.thresholds[PATTERN_SEQUENTIAL] && - iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) - result->op = POLICY_MISS; - - else if (e) - r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock, - data_dir, locker, result); - - else - r = no_entry_found(mq, oblock, can_migrate, discarded_oblock, - data_dir, locker, result); - - if (r == -EWOULDBLOCK) - result->op = POLICY_MISS; - - return r; -} - -/*----------------------------------------------------------------*/ - -/* - * Public interface, via the policy struct. See dm-cache-policy.h for a - * description of these. - */ - -static struct mq_policy *to_mq_policy(struct dm_cache_policy *p) -{ - return container_of(p, struct mq_policy, policy); -} - -static void mq_destroy(struct dm_cache_policy *p) -{ - struct mq_policy *mq = to_mq_policy(p); - - vfree(mq->table); - epool_exit(&mq->cache_pool); - epool_exit(&mq->pre_cache_pool); - kfree(mq); -} - -static void update_pre_cache_hits(struct list_head *h, void *context) -{ - struct entry *e = container_of(h, struct entry, list); - e->hit_count++; -} - -static void update_cache_hits(struct list_head *h, void *context) -{ - struct mq_policy *mq = context; - struct entry *e = container_of(h, struct entry, list); - e->hit_count++; - mq->hit_count++; -} - -static void copy_tick(struct mq_policy *mq) -{ - unsigned long flags, tick; - - spin_lock_irqsave(&mq->tick_lock, flags); - tick = mq->tick_protected; - if (tick != mq->tick) { - queue_iterate_tick(&mq->pre_cache, update_pre_cache_hits, mq); - queue_iterate_tick(&mq->cache_dirty, update_cache_hits, mq); - queue_iterate_tick(&mq->cache_clean, update_cache_hits, mq); - mq->tick = tick; - } - - queue_tick(&mq->pre_cache); - queue_tick(&mq->cache_dirty); - queue_tick(&mq->cache_clean); - queue_update_writeback_sentinels(&mq->cache_dirty); - spin_unlock_irqrestore(&mq->tick_lock, flags); -} - -static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock, - bool can_block, bool ca |