summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig43
-rw-r--r--drivers/md/Makefile2
-rw-r--r--drivers/md/bitmap.c189
-rw-r--r--drivers/md/bitmap.h10
-rw-r--r--drivers/md/dm-cache-policy-mq.c251
-rw-r--r--drivers/md/dm-crypt.c25
-rw-r--r--drivers/md/dm-delay.c2
-rw-r--r--drivers/md/dm-log-userspace-base.c91
-rw-r--r--drivers/md/dm-log-userspace-transfer.c5
-rw-r--r--drivers/md/dm-log-writes.c825
-rw-r--r--drivers/md/dm-mpath.c6
-rw-r--r--drivers/md/dm-sysfs.c43
-rw-r--r--drivers/md/dm-table.c71
-rw-r--r--drivers/md/dm-verity.c147
-rw-r--r--drivers/md/dm.c556
-rw-r--r--drivers/md/dm.h10
-rw-r--r--drivers/md/md-cluster.c965
-rw-r--r--drivers/md/md-cluster.h29
-rw-r--r--drivers/md/md.c382
-rw-r--r--drivers/md/md.h26
-rw-r--r--drivers/md/raid0.c48
-rw-r--r--drivers/md/raid1.c29
-rw-r--r--drivers/md/raid10.c8
-rw-r--r--drivers/md/raid5.c826
-rw-r--r--drivers/md/raid5.h59
25 files changed, 4045 insertions, 603 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 63e05e32b462..edcf4ab66e00 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -175,6 +175,22 @@ config MD_FAULTY
In unsure, say N.
+
+config MD_CLUSTER
+ tristate "Cluster Support for MD (EXPERIMENTAL)"
+ depends on BLK_DEV_MD
+ depends on DLM
+ default n
+ ---help---
+ Clustering support for MD devices. This enables locking and
+ synchronization across multiple systems on the cluster, so all
+ nodes in the cluster can access the MD devices simultaneously.
+
+ This brings the redundancy (and uptime) of RAID levels across the
+ nodes of the cluster.
+
+ If unsure, say N.
+
source "drivers/md/bcache/Kconfig"
config BLK_DEV_DM_BUILTIN
@@ -196,6 +212,17 @@ config BLK_DEV_DM
If unsure, say N.
+config DM_MQ_DEFAULT
+ bool "request-based DM: use blk-mq I/O path by default"
+ depends on BLK_DEV_DM
+ ---help---
+ This option enables the blk-mq based I/O path for request-based
+ DM devices by default. With the option the dm_mod.use_blk_mq
+ module/boot option defaults to Y, without it to N, but it can
+ still be overriden either way.
+
+ If unsure say N.
+
config DM_DEBUG
bool "Device mapper debugging support"
depends on BLK_DEV_DM
@@ -432,4 +459,20 @@ config DM_SWITCH
If unsure, say N.
+config DM_LOG_WRITES
+ tristate "Log writes target support"
+ depends on BLK_DEV_DM
+ ---help---
+ This device-mapper target takes two devices, one device to use
+ normally, one to log all write operations done to the first device.
+ This is for use by file system developers wishing to verify that
+ their fs is writing a consitent file system at all times by allowing
+ them to replay the log in a variety of ways and to check the
+ contents.
+
+ To compile this code as a module, choose M here: the module will
+ be called dm-log-writes.
+
+ If unsure, say N.
+
endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index a2da532b1c2b..dba4db5985fb 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_MD_RAID10) += raid10.o
obj-$(CONFIG_MD_RAID456) += raid456.o
obj-$(CONFIG_MD_MULTIPATH) += multipath.o
obj-$(CONFIG_MD_FAULTY) += faulty.o
+obj-$(CONFIG_MD_CLUSTER) += md-cluster.o
obj-$(CONFIG_BCACHE) += bcache/
obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
@@ -55,6 +56,7 @@ obj-$(CONFIG_DM_CACHE) += dm-cache.o
obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o
obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o
obj-$(CONFIG_DM_ERA) += dm-era.o
+obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 3a5767968ba0..2bc56e2a3526 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -205,6 +205,10 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
struct block_device *bdev;
struct mddev *mddev = bitmap->mddev;
struct bitmap_storage *store = &bitmap->storage;
+ int node_offset = 0;
+
+ if (mddev_is_clustered(bitmap->mddev))
+ node_offset = bitmap->cluster_slot * store->file_pages;
while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
int size = PAGE_SIZE;
@@ -433,6 +437,7 @@ void bitmap_update_sb(struct bitmap *bitmap)
/* This might have been changed by a reshape */
sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize);
+ sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes);
sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
bitmap_info.space);
kunmap_atomic(sb);
@@ -544,6 +549,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
bitmap_super_t *sb;
unsigned long chunksize, daemon_sleep, write_behind;
unsigned long long events;
+ int nodes = 0;
unsigned long sectors_reserved = 0;
int err = -EINVAL;
struct page *sb_page;
@@ -562,6 +568,22 @@ static int bitmap_read_sb(struct bitmap *bitmap)
return -ENOMEM;
bitmap->storage.sb_page = sb_page;
+re_read:
+ /* If cluster_slot is set, the cluster is setup */
+ if (bitmap->cluster_slot >= 0) {
+ sector_t bm_blocks = bitmap->mddev->resync_max_sectors;
+
+ sector_div(bm_blocks,
+ bitmap->mddev->bitmap_info.chunksize >> 9);
+ /* bits to bytes */
+ bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t);
+ /* to 4k blocks */
+ bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
+ bitmap->mddev->bitmap_info.offset += bitmap->cluster_slot * (bm_blocks << 3);
+ pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
+ bitmap->cluster_slot, (unsigned long long)bitmap->mddev->bitmap_info.offset);
+ }
+
if (bitmap->storage.file) {
loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;
@@ -577,12 +599,15 @@ static int bitmap_read_sb(struct bitmap *bitmap)
if (err)
return err;
+ err = -EINVAL;
sb = kmap_atomic(sb_page);
chunksize = le32_to_cpu(sb->chunksize);
daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
write_behind = le32_to_cpu(sb->write_behind);
sectors_reserved = le32_to_cpu(sb->sectors_reserved);
+ nodes = le32_to_cpu(sb->nodes);
+ strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64);
/* verify that the bitmap-specific fields are valid */
if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
@@ -619,7 +644,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
goto out;
}
events = le64_to_cpu(sb->events);
- if (events < bitmap->mddev->events) {
+ if (!nodes && (events < bitmap->mddev->events)) {
printk(KERN_INFO
"%s: bitmap file is out of date (%llu < %llu) "
"-- forcing full recovery\n",
@@ -634,20 +659,40 @@ static int bitmap_read_sb(struct bitmap *bitmap)
if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
set_bit(BITMAP_HOSTENDIAN, &bitmap->flags);
bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
+ strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64);
err = 0;
+
out:
kunmap_atomic(sb);
+ /* Assiging chunksize is required for "re_read" */
+ bitmap->mddev->bitmap_info.chunksize = chunksize;
+ if (nodes && (bitmap->cluster_slot < 0)) {
+ err = md_setup_cluster(bitmap->mddev, nodes);
+ if (err) {
+ pr_err("%s: Could not setup cluster service (%d)\n",
+ bmname(bitmap), err);
+ goto out_no_sb;
+ }
+ bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev);
+ goto re_read;
+ }
+
+
out_no_sb:
if (test_bit(BITMAP_STALE, &bitmap->flags))
bitmap->events_cleared = bitmap->mddev->events;
bitmap->mddev->bitmap_info.chunksize = chunksize;
bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
bitmap->mddev->bitmap_info.max_write_behind = write_behind;
+ bitmap->mddev->bitmap_info.nodes = nodes;
if (bitmap->mddev->bitmap_info.space == 0 ||
bitmap->mddev->bitmap_info.space > sectors_reserved)
bitmap->mddev->bitmap_info.space = sectors_reserved;
- if (err)
+ if (err) {
bitmap_print_sb(bitmap);
+ if (bitmap->cluster_slot < 0)
+ md_cluster_stop(bitmap->mddev);
+ }
return err;
}
@@ -692,9 +737,10 @@ static inline struct page *filemap_get_page(struct bitmap_storage *store,
}
static int bitmap_storage_alloc(struct bitmap_storage *store,
- unsigned long chunks, int with_super)
+ unsigned long chunks, int with_super,
+ int slot_number)
{
- int pnum;
+ int pnum, offset = 0;
unsigned long num_pages;
unsigned long bytes;
@@ -703,6 +749,7 @@ static int bitmap_storage_alloc(struct bitmap_storage *store,
bytes += sizeof(bitmap_super_t);
num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
+ offset = slot_number * (num_pages - 1);
store->filemap = kmalloc(sizeof(struct page *)
* num_pages, GFP_KERNEL);
@@ -713,20 +760,22 @@ static int bitmap_storage_alloc(struct bitmap_storage *store,
store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
if (store->sb_page == NULL)
return -ENOMEM;
- store->sb_page->index = 0;
}
+
pnum = 0;
if (store->sb_page) {
store->filemap[0] = store->sb_page;
pnum = 1;
+ store->sb_page->index = offset;
}
+
for ( ; pnum < num_pages; pnum++) {
store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO);
if (!store->filemap[pnum]) {
store->file_pages = pnum;
return -ENOMEM;
}
- store->filemap[pnum]->index = pnum;
+ store->filemap[pnum]->index = pnum + offset;
}
store->file_pages = pnum;
@@ -885,6 +934,28 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
}
}
+static int bitmap_file_test_bit(struct bitmap *bitmap, sector_t block)
+{
+ unsigned long bit;
+ struct page *page;
+ void *paddr;
+ unsigned long chunk = block >> bitmap->counts.chunkshift;
+ int set = 0;
+
+ page = filemap_get_page(&bitmap->storage, chunk);
+ if (!page)
+ return -EINVAL;
+ bit = file_page_offset(&bitmap->storage, chunk);
+ paddr = kmap_atomic(page);
+ if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
+ set = test_bit(bit, paddr);
+ else
+ set = test_bit_le(bit, paddr);
+ kunmap_atomic(paddr);
+ return set;
+}
+
+
/* this gets called when the md device is ready to unplug its underlying
* (slave) device queues -- before we let any writes go down, we need to
* sync the dirty pages of the bitmap file to disk */
@@ -935,7 +1006,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
*/
static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
{
- unsigned long i, chunks, index, oldindex, bit;
+ unsigned long i, chunks, index, oldindex, bit, node_offset = 0;
struct page *page = NULL;
unsigned long bit_cnt = 0;
struct file *file;
@@ -981,6 +1052,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
if (!bitmap->mddev->bitmap_info.external)
offset = sizeof(bitmap_super_t);
+ if (mddev_is_clustered(bitmap->mddev))
+ node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE));
+
for (i = 0; i < chunks; i++) {
int b;
index = file_page_index(&bitmap->storage, i);
@@ -1001,7 +1075,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
bitmap->mddev,
bitmap->mddev->bitmap_info.offset,
page,
- index, count);
+ index + node_offset, count);
if (ret)
goto err;
@@ -1207,7 +1281,6 @@ void bitmap_daemon_work(struct mddev *mddev)
j < bitmap->storage.file_pages
&& !test_bit(BITMAP_STALE, &bitmap->flags);
j++) {
-
if (test_page_attr(bitmap, j,
BITMAP_PAGE_DIRTY))
/* bitmap_unplug will handle the rest */
@@ -1530,11 +1603,13 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
return;
}
if (!*bmc) {
- *bmc = 2 | (needed ? NEEDED_MASK : 0);
+ *bmc = 2;
bitmap_count_page(&bitmap->counts, offset, 1);
bitmap_set_pending(&bitmap->counts, offset);
bitmap->allclean = 0;
}
+ if (needed)
+ *bmc |= NEEDED_MASK;
spin_unlock_irq(&bitmap->counts.lock);
}
@@ -1591,6 +1666,10 @@ static void bitmap_free(struct bitmap *bitmap)
if (!bitmap) /* there was no bitmap */
return;
+ if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info &&
+ bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev))
+ md_cluster_stop(bitmap->mddev);
+
/* Shouldn't be needed - but just in case.... */
wait_event(bitmap->write_wait,
atomic_read(&bitmap->pending_writes) == 0);
@@ -1636,7 +1715,7 @@ void bitmap_destroy(struct mddev *mddev)
* initialize the bitmap structure
* if this returns an error, bitmap_destroy must be called to do clean up
*/
-int bitmap_create(struct mddev *mddev)
+struct bitmap *bitmap_create(struct mddev *mddev, int slot)
{
struct bitmap *bitmap;
sector_t blocks = mddev->resync_max_sectors;
@@ -1650,7 +1729,7 @@ int bitmap_create(struct mddev *mddev)
bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
if (!bitmap)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
spin_lock_init(&bitmap->counts.lock);
atomic_set(&bitmap->pending_writes, 0);
@@ -1659,6 +1738,7 @@ int bitmap_create(struct mddev *mddev)
init_waitqueue_head(&bitmap->behind_wait);
bitmap->mddev = mddev;
+ bitmap->cluster_slot = slot;
if (mddev->kobj.sd)
bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap");
@@ -1706,12 +1786,14 @@ int bitmap_create(struct mddev *mddev)
printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
bitmap->counts.pages, bmname(bitmap));
- mddev->bitmap = bitmap;
- return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
+ err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
+ if (err)
+ goto error;
+ return bitmap;
error:
bitmap_free(bitmap);
- return err;
+ return ERR_PTR(err);
}
int bitmap_load(struct mddev *mddev)
@@ -1765,6 +1847,60 @@ out:
}
EXPORT_SYMBOL_GPL(bitmap_load);
+/* Loads the bitmap associated with slot and copies the resync information
+ * to our bitmap
+ */
+int bitmap_copy_from_slot(struct mddev *mddev, int slot,
+ sector_t *low, sector_t *high, bool clear_bits)
+{
+ int rv = 0, i, j;
+ sector_t block, lo = 0, hi = 0;
+ struct bitmap_counts *counts;
+ struct bitmap *bitmap = bitmap_create(mddev, slot);
+
+ if (IS_ERR(bitmap))
+ return PTR_ERR(bitmap);
+
+ rv = bitmap_read_sb(bitmap);
+ if (rv)
+ goto err;
+
+ rv = bitmap_init_from_disk(bitmap, 0);
+ if (rv)
+ goto err;
+
+ counts = &bitmap->counts;
+ for (j = 0; j < counts->chunks; j++) {
+ block = (sector_t)j << counts->chunkshift;
+ if (bitmap_file_test_bit(bitmap, block)) {
+ if (!lo)
+ lo = block;
+ hi = block;
+ bitmap_file_clear_bit(bitmap, block);
+ bitmap_set_memory_bits(mddev->bitmap, block, 1);
+ bitmap_file_set_bit(mddev->bitmap, block);
+ }
+ }
+
+ if (clear_bits) {
+ bitmap_update_sb(bitmap);
+ /* Setting this for the ev_page should be enough.
+ * And we do not require both write_all and PAGE_DIRT either
+ */
+ for (i = 0; i < bitmap->storage.file_pages; i++)
+ set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
+ bitmap_write_all(bitmap);
+ bitmap_unplug(bitmap);
+ }
+ *low = lo;
+ *high = hi;
+err:
+ bitmap_free(bitmap);
+ return rv;
+}
+EXPORT_SYMBOL_GPL(bitmap_copy_from_slot);
+
+
void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
{
unsigned long chunk_kb;
@@ -1849,7 +1985,8 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
memset(&store, 0, sizeof(store));
if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file)
ret = bitmap_storage_alloc(&store, chunks,
- !bitmap->mddev->bitmap_info.external);
+ !bitmap->mddev->bitmap_info.external,
+ bitmap->cluster_slot);
if (ret)
goto err;
@@ -2021,13 +2158,18 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
return -EINVAL;
mddev->bitmap_info.offset = offset;
if (mddev->pers) {
+ struct bitmap *bitmap;
mddev->pers->quiesce(mddev, 1);
- rv = bitmap_create(mddev);
- if (!rv)
+ bitmap = bitmap_create(mddev, -1);
+ if (IS_ERR(bitmap))
+ rv = PTR_ERR(bitmap);
+ else {
+ mddev->bitmap = bitmap;
rv = bitmap_load(mddev);
- if (rv) {
- bitmap_destroy(mddev);
- mddev->bitmap_info.offset = 0;
+ if (rv) {
+ bitmap_destroy(mddev);
+ mddev->bitmap_info.offset = 0;
+ }
}
mddev->pers->quiesce(mddev, 0);
if (rv)
@@ -2186,6 +2328,8 @@ __ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store);
static ssize_t metadata_show(struct mddev *mddev, char *page)
{
+ if (mddev_is_clustered(mddev))
+ return sprintf(page, "clustered\n");
return sprintf(page, "%s\n", (mddev->bitmap_info.external
? "external" : "internal"));
}
@@ -2198,7 +2342,8 @@ static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len)
return -EBUSY;
if (strncmp(buf, "external", 8) == 0)
mddev->bitmap_info.external = 1;
- else if (strncmp(buf, "internal", 8) == 0)
+ else if ((strncmp(buf, "internal", 8) == 0) ||
+ (strncmp(buf, "clustered", 9) == 0))
mddev->bitmap_info.external = 0;
else
return -EINVAL;
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 30210b9c4ef9..f1f4dd01090d 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -130,8 +130,9 @@ typedef struct bitmap_super_s {
__le32 write_behind; /* 60 number of outstanding write-behind writes */
__le32 sectors_reserved; /* 64 number of 512-byte sectors that are
* reserved for the bitmap. */
-
- __u8 pad[256 - 68]; /* set to zero */
+ __le32 nodes; /* 68 the maximum number of nodes in cluster. */
+ __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */
+ __u8 pad[256 - 136]; /* set to zero */
} bitmap_super_t;
/* notes:
@@ -226,12 +227,13 @@ struct bitmap {
wait_queue_head_t behind_wait;
struct kernfs_node *sysfs_can_clear;
+ int cluster_slot; /* Slot offset for clustered env */
};
/* the bitmap API */
/* these are used only by md/bitmap */
-int bitmap_create(struct mddev *mddev);
+struct bitmap *bitmap_create(struct mddev *mddev, int slot);
int bitmap_load(struct mddev *mddev);
void bitmap_flush(struct mddev *mddev);
void bitmap_destroy(struct mddev *mddev);
@@ -260,6 +262,8 @@ void bitmap_daemon_work(struct mddev *mddev);
int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
int chunksize, int init);
+int bitmap_copy_from_slot(struct mddev *mddev, int slot,
+ sector_t *lo, sector_t *hi, bool clear_bits);
#endif
#endif
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 13f547a4eeb6..3ddd1162334d 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -8,6 +8,7 @@
#include "dm.h"
#include <linux/hash.h>
+#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
@@ -124,32 +125,41 @@ static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
* sorted queue.
*/
#define NR_QUEUE_LEVELS 16u
+#define NR_SENTINELS NR_QUEUE_LEVELS * 3
+
+#define WRITEBACK_PERIOD HZ
struct queue {
+ unsigned nr_elts;
+ bool current_writeback_sentinels;
+ unsigned long next_writeback;
struct list_head qs[NR_QUEUE_LEVELS];
+ struct list_head sentinels[NR_SENTINELS];
};
static void queue_init(struct queue *q)
{
unsigned i;
- for (i = 0; i < NR_QUEUE_LEVELS; i++)
+ q->nr_elts = 0;
+ q->current_writeback_sentinels = false;
+ q->next_writeback = 0;
+ for (i = 0; i < NR_QUEUE_LEVELS; i++) {
INIT_LIST_HEAD(q->qs + i);
+ INIT_LIST_HEAD(q->sentinels + i);
+ INIT_LIST_HEAD(q->sentinels + NR_QUEUE_LEVELS + i);
+ INIT_LIST_HEAD(q->sentinels + (2 * NR_QUEUE_LEVELS) + i);
+ }
}
-/*
- * Checks to see if the queue is empty.
- * FIXME: reduce cpu usage.
- */
-static bool queue_empty(struct queue *q)
+static unsigned queue_size(struct queue *q)
{
- unsigned i;
-
- for (i = 0; i < NR_QUEUE_LEVELS; i++)
- if (!list_empty(q->qs + i))
- return false;
+ return q->nr_elts;
+}
- return true;
+static bool queue_empty(struct queue *q)
+{
+ return q->nr_elts == 0;
}
/*
@@ -157,24 +167,19 @@ static bool queue_empty(struct queue *q)
*/
static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
{
+ q->nr_elts++;
list_add_tail(elt, q->qs + level);
}
-static void queue_remove(struct list_head *elt)
+static void queue_remove(struct queue *q, struct list_head *elt)
{
+ q->nr_elts--;
list_del(elt);
}
-/*
- * Shifts all regions down one level. This has no effect on the order of
- * the queue.
- */
-static void queue_shift_down(struct queue *q)
+static bool is_sentinel(struct queue *q, struct list_head *h)
{
- unsigned level;
-
- for (level = 1; level < NR_QUEUE_LEVELS; level++)
- list_splice_init(q->qs + level, q->qs + level - 1);
+ return (h >= q->sentinels) && (h < (q->sentinels + NR_SENTINELS));
}
/*
@@ -184,10 +189,12 @@ static void queue_shift_down(struct queue *q)
static struct list_head *queue_peek(struct queue *q)
{
unsigned level;
+ struct list_head *h;
for (level = 0; level < NR_QUEUE_LEVELS; level++)
- if (!list_empty(q->qs + level))
- return q->qs[level].next;
+ list_for_each(h, q->qs + level)
+ if (!is_sentinel(q, h))
+ return h;
return NULL;
}
@@ -197,16 +204,34 @@ static struct list_head *queue_pop(struct queue *q)
struct list_head *r = queue_peek(q);
if (r) {
+ q->nr_elts--;
list_del(r);
-
- /* have we just emptied the bottom level? */
- if (list_empty(q->qs))
- queue_shift_down(q);
}
return r;
}
+/*
+ * Pops an entry from a level that is not past a sentinel.
+ */
+static struct list_head *queue_pop_old(struct queue *q)
+{
+ unsigned level;
+ struct list_head *h;
+
+ for (level = 0; level < NR_QUEUE_LEVELS; level++)
+ list_for_each(h, q->qs + level) {
+ if (is_sentinel(q, h))
+ break;
+
+ q->nr_elts--;
+ list_del(h);
+ return h;
+ }
+
+ return NULL;
+}
+
static struct list_head *list_pop(struct list_head *lh)
{
struct list_head *r = lh->next;
@@ -217,6 +242,62 @@ static struct list_head *list_pop(struct list_head *lh)
return r;
}
+static struct list_head *writeback_sentinel(struct queue *q, unsigned level)
+{
+ if (q->current_writeback_sentinels)
+ return q->sentinels + NR_QUEUE_LEVELS + level;
+ else
+ return q->sentinels + 2 * NR_QUEUE_LEVELS + level;
+}
+
+static void queue_update_writeback_sentinels(struct queue *q)
+{
+ unsigned i;
+ struct list_head *h;
+
+ if (time_after(jiffies, q->next_writeback)) {
+ for (i = 0; i < NR_QUEUE_LEVELS; i++) {
+ h = writeback_sentinel(q, i);
+ list_del(h);
+ list_add_tail(h, q->qs + i);
+ }
+
+ q->next_writeback = jiffies + WRITEBACK_PERIOD;
+ q->current_writeback_sentinels = !q->current_writeback_sentinels;
+ }
+}
+
+/*
+ * Sometimes we want to iterate through entries that have been pushed since
+ * a certain event. We use sentinel entries on the queues to delimit these
+ * 'tick' events.
+ */
+static void queue_tick(struct queue *q)
+{
+ unsigned i;
+
+ for (i = 0; i < NR_QUEUE_LEVELS; i++) {
+ list_del(q->sentinels + i);
+ list_add_tail(q->sentinels + i, q->qs + i);
+ }
+}
+
+typedef void (*iter_fn)(struct list_head *, void *);
+static void queue_iterate_tick(struct queue *q, iter_fn fn, void *context)
+{
+ unsigned i;
+ struct list_head *h;
+
+ for (i = 0; i < NR_QUEUE_LEVELS; i++) {
+ list_for_each_prev(h, q->qs + i) {
+ if (is_sentinel(q, h))
+ break;
+
+ fn(h, context);
+ }
+ }
+}
+
/*----------------------------------------------------------------*/
/*
@@ -232,8 +313,6 @@ struct entry {
*/
bool dirty:1;
unsigned hit_count;
- unsigned generation;
- unsigned tick;
};
/*
@@ -481,7 +560,6 @@ static bool in_cache(struct mq_policy *mq, struct entry *e)
*/
static void push(struct mq_policy *mq, struct entry *e)
{
- e->tick = mq->tick;
hash_insert(mq, e);
if (in_cache(mq, e))
@@ -496,7 +574,11 @@ static void push(struct mq_policy *mq, struct entry *e)
*/
static void del(struct mq_policy *mq, struct entry *e)
{
- queue_remove(&e->list);
+ if (in_cache(mq, e))
+ queue_remove(e->dirty ? &mq->cache_dirty : &mq->cache_clean, &e->list);
+ else
+ queue_remove(&mq->pre_cache, &e->list);
+
hash_remove(e);
}
@@ -518,18 +600,24 @@ static struct entry *pop(struct mq_policy *mq, struct queue *q)
return e;
}
-static struct entry *peek(struct queue *q)
+static struct entry *pop_old(struct mq_policy *mq, struct queue *q)
{
- struct list_head *h = queue_peek(q);
- return h ? container_of(h, struct entry, list) : NULL;
+ struct entry *e;
+ struct list_head *h = queue_pop_old(q);
+
+ if (!h)
+ return NULL;
+
+ e = container_of(h, struct entry, list);
+ hash_remove(e);
+
+ return e;
}
-/*
- * Has this entry already been updated?
- */
-static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
+static struct entry *peek(struct queue *q)
{
- return mq->tick == e->tick;
+ struct list_head *h = queue_peek(q);
+ return h ? container_of(h, struct entry, list) : NULL;
}
/*
@@ -583,20 +671,9 @@ static void check_generation(struct mq_policy *mq)
* Whenever we use an entry we bump up it's hit counter, and push it to the
* back to it's current level.
*/
-static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
+static void requeue(struct mq_policy *mq, struct entry *e)
{
- if (updated_this_tick(mq, e))
- return;
-
- e->hit_count++;
- mq->hit_count++;
check_generation(mq);
-
- /* generation adjustment, to stop the counts increasing forever. */
- /* FIXME: divide? */
- /* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */
- e->generation = mq->generation;
-
del(mq, e);
push(mq, e);
}
@@ -703,7 +780,7 @@ static int cache_entry_found(struct mq_policy *mq,
struct entry *e,
struct policy_result *result)
{
- requeue_and_update_tick(mq, e);
+ requeue(mq, e);
if (in_cache(mq, e)) {
result->op = POLICY_HIT;
@@ -740,8 +817,6 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
new_e->oblock = e->oblock;
new_e->dirty = false;
new_e->hit_count = e->hit_count;
- new_e->generation = e->generation;
- new_e->tick = e->tick;
del(mq, e);
free_entry(&mq->pre_cache_pool, e);
@@ -757,18 +832,16 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
int data_dir, struct policy_result *result)
{
int r = 0;
- bool updated = updated_this_tick(mq, e);
- if ((!discarded_oblock && updated) ||
- !should_promote(mq, e, discarded_oblock, data_dir)) {
- requeue_and_update_tick(mq, e);
+ if (!should_promote(mq, e, discarded_oblock, data_dir)) {
+ requeue(mq, e);
result->op = POLICY_MISS;
} else if (!can_migrate)
r = -EWOULDBLOCK;
else {
- requeue_and_update_tick(mq, e);
+ requeue(mq, e);
r = pre_cache_to_cache(mq, e, result);
}
@@ -795,7 +868,6 @@ static void insert_in_pre_cache(struct mq_policy *mq,
e->dirty = false;
e->oblock = oblock;
e->hit_count = 1;
- e->generation = mq->generation;
push(mq, e);
}
@@ -828,7 +900,6 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
e->oblock = oblock;
e->dirty = false;
e->hit_count = 1;
- e->generation = mq->generation;
push(mq, e);
result->cblock = infer_cblock(&mq->cache_pool, e);
@@ -905,12 +976,37 @@ static void mq_destroy(struct dm_cache_policy *p)
kfree(mq);
}
+static void update_pre_cache_hits(struct list_head *h, void *context)
+{
+ struct entry *e = container_of(h, struct entry, list);
+ e->hit_count++;
+}
+
+static void update_cache_hits(struct list_head *h, void *context)
+{
+ struct mq_policy *mq = context;
+ struct entry *e = container_of(h, struct entry, list);
+ e->hit_count++;
+ mq->hit_count++;
+}
+
static void copy_tick(struct mq_policy *mq)
{
- unsigned long flags;
+ unsigned long flags, tick;
spin_lock_irqsave(&mq->tick_lock, flags);
- mq->tick = mq->tick_protected;
+ tick = mq->tick_protected;
+ if (tick != mq->tick) {
+ queue_iterate_tick(&mq->pre_cache, update_pre_cache_hits, mq);
+ queue_iterate_tick(&mq->cache_dirty, update_cache_hits, mq);
+ queue_iterate_tick(&mq->cache_clean, update_cache_hits, mq);
+ mq->tick = tick;
+ }
+
+ queue_tick(&mq->pre_cache);
+ queue_tick(&mq->cache_dirty);
+ queue_tick(&mq->cache_clean);<