diff options
Diffstat (limited to 'drivers/md')
| -rw-r--r-- | drivers/md/Kconfig | 43 | ||||
| -rw-r--r-- | drivers/md/Makefile | 2 | ||||
| -rw-r--r-- | drivers/md/bitmap.c | 189 | ||||
| -rw-r--r-- | drivers/md/bitmap.h | 10 | ||||
| -rw-r--r-- | drivers/md/dm-cache-policy-mq.c | 251 | ||||
| -rw-r--r-- | drivers/md/dm-crypt.c | 25 | ||||
| -rw-r--r-- | drivers/md/dm-delay.c | 2 | ||||
| -rw-r--r-- | drivers/md/dm-log-userspace-base.c | 91 | ||||
| -rw-r--r-- | drivers/md/dm-log-userspace-transfer.c | 5 | ||||
| -rw-r--r-- | drivers/md/dm-log-writes.c | 825 | ||||
| -rw-r--r-- | drivers/md/dm-mpath.c | 6 | ||||
| -rw-r--r-- | drivers/md/dm-sysfs.c | 43 | ||||
| -rw-r--r-- | drivers/md/dm-table.c | 71 | ||||
| -rw-r--r-- | drivers/md/dm-verity.c | 147 | ||||
| -rw-r--r-- | drivers/md/dm.c | 556 | ||||
| -rw-r--r-- | drivers/md/dm.h | 10 | ||||
| -rw-r--r-- | drivers/md/md-cluster.c | 965 | ||||
| -rw-r--r-- | drivers/md/md-cluster.h | 29 | ||||
| -rw-r--r-- | drivers/md/md.c | 382 | ||||
| -rw-r--r-- | drivers/md/md.h | 26 | ||||
| -rw-r--r-- | drivers/md/raid0.c | 48 | ||||
| -rw-r--r-- | drivers/md/raid1.c | 29 | ||||
| -rw-r--r-- | drivers/md/raid10.c | 8 | ||||
| -rw-r--r-- | drivers/md/raid5.c | 826 | ||||
| -rw-r--r-- | drivers/md/raid5.h | 59 |
25 files changed, 4045 insertions, 603 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 63e05e32b462..edcf4ab66e00 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -175,6 +175,22 @@ config MD_FAULTY In unsure, say N. + +config MD_CLUSTER + tristate "Cluster Support for MD (EXPERIMENTAL)" + depends on BLK_DEV_MD + depends on DLM + default n + ---help--- + Clustering support for MD devices. This enables locking and + synchronization across multiple systems on the cluster, so all + nodes in the cluster can access the MD devices simultaneously. + + This brings the redundancy (and uptime) of RAID levels across the + nodes of the cluster. + + If unsure, say N. + source "drivers/md/bcache/Kconfig" config BLK_DEV_DM_BUILTIN @@ -196,6 +212,17 @@ config BLK_DEV_DM If unsure, say N. +config DM_MQ_DEFAULT + bool "request-based DM: use blk-mq I/O path by default" + depends on BLK_DEV_DM + ---help--- + This option enables the blk-mq based I/O path for request-based + DM devices by default. With the option the dm_mod.use_blk_mq + module/boot option defaults to Y, without it to N, but it can + still be overriden either way. + + If unsure say N. + config DM_DEBUG bool "Device mapper debugging support" depends on BLK_DEV_DM @@ -432,4 +459,20 @@ config DM_SWITCH If unsure, say N. +config DM_LOG_WRITES + tristate "Log writes target support" + depends on BLK_DEV_DM + ---help--- + This device-mapper target takes two devices, one device to use + normally, one to log all write operations done to the first device. + This is for use by file system developers wishing to verify that + their fs is writing a consitent file system at all times by allowing + them to replay the log in a variety of ways and to check the + contents. + + To compile this code as a module, choose M here: the module will + be called dm-log-writes. + + If unsure, say N. + endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index a2da532b1c2b..dba4db5985fb 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_MD_RAID10) += raid10.o obj-$(CONFIG_MD_RAID456) += raid456.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_MD_FAULTY) += faulty.o +obj-$(CONFIG_MD_CLUSTER) += md-cluster.o obj-$(CONFIG_BCACHE) += bcache/ obj-$(CONFIG_BLK_DEV_MD) += md-mod.o obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o @@ -55,6 +56,7 @@ obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o obj-$(CONFIG_DM_ERA) += dm-era.o +obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 3a5767968ba0..2bc56e2a3526 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -205,6 +205,10 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) struct block_device *bdev; struct mddev *mddev = bitmap->mddev; struct bitmap_storage *store = &bitmap->storage; + int node_offset = 0; + + if (mddev_is_clustered(bitmap->mddev)) + node_offset = bitmap->cluster_slot * store->file_pages; while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { int size = PAGE_SIZE; @@ -433,6 +437,7 @@ void bitmap_update_sb(struct bitmap *bitmap) /* This might have been changed by a reshape */ sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize); + sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes); sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> bitmap_info.space); kunmap_atomic(sb); @@ -544,6 +549,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) bitmap_super_t *sb; unsigned long chunksize, daemon_sleep, write_behind; unsigned long long events; + int nodes = 0; unsigned long sectors_reserved = 0; int err = -EINVAL; struct page *sb_page; @@ -562,6 +568,22 @@ static int bitmap_read_sb(struct bitmap *bitmap) return -ENOMEM; bitmap->storage.sb_page = sb_page; +re_read: + /* If cluster_slot is set, the cluster is setup */ + if (bitmap->cluster_slot >= 0) { + sector_t bm_blocks = bitmap->mddev->resync_max_sectors; + + sector_div(bm_blocks, + bitmap->mddev->bitmap_info.chunksize >> 9); + /* bits to bytes */ + bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t); + /* to 4k blocks */ + bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); + bitmap->mddev->bitmap_info.offset += bitmap->cluster_slot * (bm_blocks << 3); + pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, + bitmap->cluster_slot, (unsigned long long)bitmap->mddev->bitmap_info.offset); + } + if (bitmap->storage.file) { loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; @@ -577,12 +599,15 @@ static int bitmap_read_sb(struct bitmap *bitmap) if (err) return err; + err = -EINVAL; sb = kmap_atomic(sb_page); chunksize = le32_to_cpu(sb->chunksize); daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; write_behind = le32_to_cpu(sb->write_behind); sectors_reserved = le32_to_cpu(sb->sectors_reserved); + nodes = le32_to_cpu(sb->nodes); + strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64); /* verify that the bitmap-specific fields are valid */ if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) @@ -619,7 +644,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) goto out; } events = le64_to_cpu(sb->events); - if (events < bitmap->mddev->events) { + if (!nodes && (events < bitmap->mddev->events)) { printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) " "-- forcing full recovery\n", @@ -634,20 +659,40 @@ static int bitmap_read_sb(struct bitmap *bitmap) if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) set_bit(BITMAP_HOSTENDIAN, &bitmap->flags); bitmap->events_cleared = le64_to_cpu(sb->events_cleared); + strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64); err = 0; + out: kunmap_atomic(sb); + /* Assiging chunksize is required for "re_read" */ + bitmap->mddev->bitmap_info.chunksize = chunksize; + if (nodes && (bitmap->cluster_slot < 0)) { + err = md_setup_cluster(bitmap->mddev, nodes); + if (err) { + pr_err("%s: Could not setup cluster service (%d)\n", + bmname(bitmap), err); + goto out_no_sb; + } + bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev); + goto re_read; + } + + out_no_sb: if (test_bit(BITMAP_STALE, &bitmap->flags)) bitmap->events_cleared = bitmap->mddev->events; bitmap->mddev->bitmap_info.chunksize = chunksize; bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; bitmap->mddev->bitmap_info.max_write_behind = write_behind; + bitmap->mddev->bitmap_info.nodes = nodes; if (bitmap->mddev->bitmap_info.space == 0 || bitmap->mddev->bitmap_info.space > sectors_reserved) bitmap->mddev->bitmap_info.space = sectors_reserved; - if (err) + if (err) { bitmap_print_sb(bitmap); + if (bitmap->cluster_slot < 0) + md_cluster_stop(bitmap->mddev); + } return err; } @@ -692,9 +737,10 @@ static inline struct page *filemap_get_page(struct bitmap_storage *store, } static int bitmap_storage_alloc(struct bitmap_storage *store, - unsigned long chunks, int with_super) + unsigned long chunks, int with_super, + int slot_number) { - int pnum; + int pnum, offset = 0; unsigned long num_pages; unsigned long bytes; @@ -703,6 +749,7 @@ static int bitmap_storage_alloc(struct bitmap_storage *store, bytes += sizeof(bitmap_super_t); num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); + offset = slot_number * (num_pages - 1); store->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); @@ -713,20 +760,22 @@ static int bitmap_storage_alloc(struct bitmap_storage *store, store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO); if (store->sb_page == NULL) return -ENOMEM; - store->sb_page->index = 0; } + pnum = 0; if (store->sb_page) { store->filemap[0] = store->sb_page; pnum = 1; + store->sb_page->index = offset; } + for ( ; pnum < num_pages; pnum++) { store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO); if (!store->filemap[pnum]) { store->file_pages = pnum; return -ENOMEM; } - store->filemap[pnum]->index = pnum; + store->filemap[pnum]->index = pnum + offset; } store->file_pages = pnum; @@ -885,6 +934,28 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) } } +static int bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) +{ + unsigned long bit; + struct page *page; + void *paddr; + unsigned long chunk = block >> bitmap->counts.chunkshift; + int set = 0; + + page = filemap_get_page(&bitmap->storage, chunk); + if (!page) + return -EINVAL; + bit = file_page_offset(&bitmap->storage, chunk); + paddr = kmap_atomic(page); + if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) + set = test_bit(bit, paddr); + else + set = test_bit_le(bit, paddr); + kunmap_atomic(paddr); + return set; +} + + /* this gets called when the md device is ready to unplug its underlying * (slave) device queues -- before we let any writes go down, we need to * sync the dirty pages of the bitmap file to disk */ @@ -935,7 +1006,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n */ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) { - unsigned long i, chunks, index, oldindex, bit; + unsigned long i, chunks, index, oldindex, bit, node_offset = 0; struct page *page = NULL; unsigned long bit_cnt = 0; struct file *file; @@ -981,6 +1052,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) if (!bitmap->mddev->bitmap_info.external) offset = sizeof(bitmap_super_t); + if (mddev_is_clustered(bitmap->mddev)) + node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE)); + for (i = 0; i < chunks; i++) { int b; index = file_page_index(&bitmap->storage, i); @@ -1001,7 +1075,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) bitmap->mddev, bitmap->mddev->bitmap_info.offset, page, - index, count); + index + node_offset, count); if (ret) goto err; @@ -1207,7 +1281,6 @@ void bitmap_daemon_work(struct mddev *mddev) j < bitmap->storage.file_pages && !test_bit(BITMAP_STALE, &bitmap->flags); j++) { - if (test_page_attr(bitmap, j, BITMAP_PAGE_DIRTY)) /* bitmap_unplug will handle the rest */ @@ -1530,11 +1603,13 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n return; } if (!*bmc) { - *bmc = 2 | (needed ? NEEDED_MASK : 0); + *bmc = 2; bitmap_count_page(&bitmap->counts, offset, 1); bitmap_set_pending(&bitmap->counts, offset); bitmap->allclean = 0; } + if (needed) + *bmc |= NEEDED_MASK; spin_unlock_irq(&bitmap->counts.lock); } @@ -1591,6 +1666,10 @@ static void bitmap_free(struct bitmap *bitmap) if (!bitmap) /* there was no bitmap */ return; + if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info && + bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev)) + md_cluster_stop(bitmap->mddev); + /* Shouldn't be needed - but just in case.... */ wait_event(bitmap->write_wait, atomic_read(&bitmap->pending_writes) == 0); @@ -1636,7 +1715,7 @@ void bitmap_destroy(struct mddev *mddev) * initialize the bitmap structure * if this returns an error, bitmap_destroy must be called to do clean up */ -int bitmap_create(struct mddev *mddev) +struct bitmap *bitmap_create(struct mddev *mddev, int slot) { struct bitmap *bitmap; sector_t blocks = mddev->resync_max_sectors; @@ -1650,7 +1729,7 @@ int bitmap_create(struct mddev *mddev) bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); if (!bitmap) - return -ENOMEM; + return ERR_PTR(-ENOMEM); spin_lock_init(&bitmap->counts.lock); atomic_set(&bitmap->pending_writes, 0); @@ -1659,6 +1738,7 @@ int bitmap_create(struct mddev *mddev) init_waitqueue_head(&bitmap->behind_wait); bitmap->mddev = mddev; + bitmap->cluster_slot = slot; if (mddev->kobj.sd) bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap"); @@ -1706,12 +1786,14 @@ int bitmap_create(struct mddev *mddev) printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", bitmap->counts.pages, bmname(bitmap)); - mddev->bitmap = bitmap; - return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; + err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; + if (err) + goto error; + return bitmap; error: bitmap_free(bitmap); - return err; + return ERR_PTR(err); } int bitmap_load(struct mddev *mddev) @@ -1765,6 +1847,60 @@ out: } EXPORT_SYMBOL_GPL(bitmap_load); +/* Loads the bitmap associated with slot and copies the resync information + * to our bitmap + */ +int bitmap_copy_from_slot(struct mddev *mddev, int slot, + sector_t *low, sector_t *high, bool clear_bits) +{ + int rv = 0, i, j; + sector_t block, lo = 0, hi = 0; + struct bitmap_counts *counts; + struct bitmap *bitmap = bitmap_create(mddev, slot); + + if (IS_ERR(bitmap)) + return PTR_ERR(bitmap); + + rv = bitmap_read_sb(bitmap); + if (rv) + goto err; + + rv = bitmap_init_from_disk(bitmap, 0); + if (rv) + goto err; + + counts = &bitmap->counts; + for (j = 0; j < counts->chunks; j++) { + block = (sector_t)j << counts->chunkshift; + if (bitmap_file_test_bit(bitmap, block)) { + if (!lo) + lo = block; + hi = block; + bitmap_file_clear_bit(bitmap, block); + bitmap_set_memory_bits(mddev->bitmap, block, 1); + bitmap_file_set_bit(mddev->bitmap, block); + } + } + + if (clear_bits) { + bitmap_update_sb(bitmap); + /* Setting this for the ev_page should be enough. + * And we do not require both write_all and PAGE_DIRT either + */ + for (i = 0; i < bitmap->storage.file_pages; i++) + set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); + bitmap_write_all(bitmap); + bitmap_unplug(bitmap); + } + *low = lo; + *high = hi; +err: + bitmap_free(bitmap); + return rv; +} +EXPORT_SYMBOL_GPL(bitmap_copy_from_slot); + + void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) { unsigned long chunk_kb; @@ -1849,7 +1985,8 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, memset(&store, 0, sizeof(store)); if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file) ret = bitmap_storage_alloc(&store, chunks, - !bitmap->mddev->bitmap_info.external); + !bitmap->mddev->bitmap_info.external, + bitmap->cluster_slot); if (ret) goto err; @@ -2021,13 +2158,18 @@ location_store(struct mddev *mddev, const char *buf, size_t len) return -EINVAL; mddev->bitmap_info.offset = offset; if (mddev->pers) { + struct bitmap *bitmap; mddev->pers->quiesce(mddev, 1); - rv = bitmap_create(mddev); - if (!rv) + bitmap = bitmap_create(mddev, -1); + if (IS_ERR(bitmap)) + rv = PTR_ERR(bitmap); + else { + mddev->bitmap = bitmap; rv = bitmap_load(mddev); - if (rv) { - bitmap_destroy(mddev); - mddev->bitmap_info.offset = 0; + if (rv) { + bitmap_destroy(mddev); + mddev->bitmap_info.offset = 0; + } } mddev->pers->quiesce(mddev, 0); if (rv) @@ -2186,6 +2328,8 @@ __ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store); static ssize_t metadata_show(struct mddev *mddev, char *page) { + if (mddev_is_clustered(mddev)) + return sprintf(page, "clustered\n"); return sprintf(page, "%s\n", (mddev->bitmap_info.external ? "external" : "internal")); } @@ -2198,7 +2342,8 @@ static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len) return -EBUSY; if (strncmp(buf, "external", 8) == 0) mddev->bitmap_info.external = 1; - else if (strncmp(buf, "internal", 8) == 0) + else if ((strncmp(buf, "internal", 8) == 0) || + (strncmp(buf, "clustered", 9) == 0)) mddev->bitmap_info.external = 0; else return -EINVAL; diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index 30210b9c4ef9..f1f4dd01090d 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -130,8 +130,9 @@ typedef struct bitmap_super_s { __le32 write_behind; /* 60 number of outstanding write-behind writes */ __le32 sectors_reserved; /* 64 number of 512-byte sectors that are * reserved for the bitmap. */ - - __u8 pad[256 - 68]; /* set to zero */ + __le32 nodes; /* 68 the maximum number of nodes in cluster. */ + __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */ + __u8 pad[256 - 136]; /* set to zero */ } bitmap_super_t; /* notes: @@ -226,12 +227,13 @@ struct bitmap { wait_queue_head_t behind_wait; struct kernfs_node *sysfs_can_clear; + int cluster_slot; /* Slot offset for clustered env */ }; /* the bitmap API */ /* these are used only by md/bitmap */ -int bitmap_create(struct mddev *mddev); +struct bitmap *bitmap_create(struct mddev *mddev, int slot); int bitmap_load(struct mddev *mddev); void bitmap_flush(struct mddev *mddev); void bitmap_destroy(struct mddev *mddev); @@ -260,6 +262,8 @@ void bitmap_daemon_work(struct mddev *mddev); int bitmap_resize(struct bitmap *bitmap, sector_t blocks, int chunksize, int init); +int bitmap_copy_from_slot(struct mddev *mddev, int slot, + sector_t *lo, sector_t *hi, bool clear_bits); #endif #endif diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 13f547a4eeb6..3ddd1162334d 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -8,6 +8,7 @@ #include "dm.h" #include <linux/hash.h> +#include <linux/jiffies.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> @@ -124,32 +125,41 @@ static void iot_examine_bio(struct io_tracker *t, struct bio *bio) * sorted queue. */ #define NR_QUEUE_LEVELS 16u +#define NR_SENTINELS NR_QUEUE_LEVELS * 3 + +#define WRITEBACK_PERIOD HZ struct queue { + unsigned nr_elts; + bool current_writeback_sentinels; + unsigned long next_writeback; struct list_head qs[NR_QUEUE_LEVELS]; + struct list_head sentinels[NR_SENTINELS]; }; static void queue_init(struct queue *q) { unsigned i; - for (i = 0; i < NR_QUEUE_LEVELS; i++) + q->nr_elts = 0; + q->current_writeback_sentinels = false; + q->next_writeback = 0; + for (i = 0; i < NR_QUEUE_LEVELS; i++) { INIT_LIST_HEAD(q->qs + i); + INIT_LIST_HEAD(q->sentinels + i); + INIT_LIST_HEAD(q->sentinels + NR_QUEUE_LEVELS + i); + INIT_LIST_HEAD(q->sentinels + (2 * NR_QUEUE_LEVELS) + i); + } } -/* - * Checks to see if the queue is empty. - * FIXME: reduce cpu usage. - */ -static bool queue_empty(struct queue *q) +static unsigned queue_size(struct queue *q) { - unsigned i; - - for (i = 0; i < NR_QUEUE_LEVELS; i++) - if (!list_empty(q->qs + i)) - return false; + return q->nr_elts; +} - return true; +static bool queue_empty(struct queue *q) +{ + return q->nr_elts == 0; } /* @@ -157,24 +167,19 @@ static bool queue_empty(struct queue *q) */ static void queue_push(struct queue *q, unsigned level, struct list_head *elt) { + q->nr_elts++; list_add_tail(elt, q->qs + level); } -static void queue_remove(struct list_head *elt) +static void queue_remove(struct queue *q, struct list_head *elt) { + q->nr_elts--; list_del(elt); } -/* - * Shifts all regions down one level. This has no effect on the order of - * the queue. - */ -static void queue_shift_down(struct queue *q) +static bool is_sentinel(struct queue *q, struct list_head *h) { - unsigned level; - - for (level = 1; level < NR_QUEUE_LEVELS; level++) - list_splice_init(q->qs + level, q->qs + level - 1); + return (h >= q->sentinels) && (h < (q->sentinels + NR_SENTINELS)); } /* @@ -184,10 +189,12 @@ static void queue_shift_down(struct queue *q) static struct list_head *queue_peek(struct queue *q) { unsigned level; + struct list_head *h; for (level = 0; level < NR_QUEUE_LEVELS; level++) - if (!list_empty(q->qs + level)) - return q->qs[level].next; + list_for_each(h, q->qs + level) + if (!is_sentinel(q, h)) + return h; return NULL; } @@ -197,16 +204,34 @@ static struct list_head *queue_pop(struct queue *q) struct list_head *r = queue_peek(q); if (r) { + q->nr_elts--; list_del(r); - - /* have we just emptied the bottom level? */ - if (list_empty(q->qs)) - queue_shift_down(q); } return r; } +/* + * Pops an entry from a level that is not past a sentinel. + */ +static struct list_head *queue_pop_old(struct queue *q) +{ + unsigned level; + struct list_head *h; + + for (level = 0; level < NR_QUEUE_LEVELS; level++) + list_for_each(h, q->qs + level) { + if (is_sentinel(q, h)) + break; + + q->nr_elts--; + list_del(h); + return h; + } + + return NULL; +} + static struct list_head *list_pop(struct list_head *lh) { struct list_head *r = lh->next; @@ -217,6 +242,62 @@ static struct list_head *list_pop(struct list_head *lh) return r; } +static struct list_head *writeback_sentinel(struct queue *q, unsigned level) +{ + if (q->current_writeback_sentinels) + return q->sentinels + NR_QUEUE_LEVELS + level; + else + return q->sentinels + 2 * NR_QUEUE_LEVELS + level; +} + +static void queue_update_writeback_sentinels(struct queue *q) +{ + unsigned i; + struct list_head *h; + + if (time_after(jiffies, q->next_writeback)) { + for (i = 0; i < NR_QUEUE_LEVELS; i++) { + h = writeback_sentinel(q, i); + list_del(h); + list_add_tail(h, q->qs + i); + } + + q->next_writeback = jiffies + WRITEBACK_PERIOD; + q->current_writeback_sentinels = !q->current_writeback_sentinels; + } +} + +/* + * Sometimes we want to iterate through entries that have been pushed since + * a certain event. We use sentinel entries on the queues to delimit these + * 'tick' events. + */ +static void queue_tick(struct queue *q) +{ + unsigned i; + + for (i = 0; i < NR_QUEUE_LEVELS; i++) { + list_del(q->sentinels + i); + list_add_tail(q->sentinels + i, q->qs + i); + } +} + +typedef void (*iter_fn)(struct list_head *, void *); +static void queue_iterate_tick(struct queue *q, iter_fn fn, void *context) +{ + unsigned i; + struct list_head *h; + + for (i = 0; i < NR_QUEUE_LEVELS; i++) { + list_for_each_prev(h, q->qs + i) { + if (is_sentinel(q, h)) + break; + + fn(h, context); + } + } +} + /*----------------------------------------------------------------*/ /* @@ -232,8 +313,6 @@ struct entry { */ bool dirty:1; unsigned hit_count; - unsigned generation; - unsigned tick; }; /* @@ -481,7 +560,6 @@ static bool in_cache(struct mq_policy *mq, struct entry *e) */ static void push(struct mq_policy *mq, struct entry *e) { - e->tick = mq->tick; hash_insert(mq, e); if (in_cache(mq, e)) @@ -496,7 +574,11 @@ static void push(struct mq_policy *mq, struct entry *e) */ static void del(struct mq_policy *mq, struct entry *e) { - queue_remove(&e->list); + if (in_cache(mq, e)) + queue_remove(e->dirty ? &mq->cache_dirty : &mq->cache_clean, &e->list); + else + queue_remove(&mq->pre_cache, &e->list); + hash_remove(e); } @@ -518,18 +600,24 @@ static struct entry *pop(struct mq_policy *mq, struct queue *q) return e; } -static struct entry *peek(struct queue *q) +static struct entry *pop_old(struct mq_policy *mq, struct queue *q) { - struct list_head *h = queue_peek(q); - return h ? container_of(h, struct entry, list) : NULL; + struct entry *e; + struct list_head *h = queue_pop_old(q); + + if (!h) + return NULL; + + e = container_of(h, struct entry, list); + hash_remove(e); + + return e; } -/* - * Has this entry already been updated? - */ -static bool updated_this_tick(struct mq_policy *mq, struct entry *e) +static struct entry *peek(struct queue *q) { - return mq->tick == e->tick; + struct list_head *h = queue_peek(q); + return h ? container_of(h, struct entry, list) : NULL; } /* @@ -583,20 +671,9 @@ static void check_generation(struct mq_policy *mq) * Whenever we use an entry we bump up it's hit counter, and push it to the * back to it's current level. */ -static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e) +static void requeue(struct mq_policy *mq, struct entry *e) { - if (updated_this_tick(mq, e)) - return; - - e->hit_count++; - mq->hit_count++; check_generation(mq); - - /* generation adjustment, to stop the counts increasing forever. */ - /* FIXME: divide? */ - /* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */ - e->generation = mq->generation; - del(mq, e); push(mq, e); } @@ -703,7 +780,7 @@ static int cache_entry_found(struct mq_policy *mq, struct entry *e, struct policy_result *result) { - requeue_and_update_tick(mq, e); + requeue(mq, e); if (in_cache(mq, e)) { result->op = POLICY_HIT; @@ -740,8 +817,6 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e, new_e->oblock = e->oblock; new_e->dirty = false; new_e->hit_count = e->hit_count; - new_e->generation = e->generation; - new_e->tick = e->tick; del(mq, e); free_entry(&mq->pre_cache_pool, e); @@ -757,18 +832,16 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e, int data_dir, struct policy_result *result) { int r = 0; - bool updated = updated_this_tick(mq, e); - if ((!discarded_oblock && updated) || - !should_promote(mq, e, discarded_oblock, data_dir)) { - requeue_and_update_tick(mq, e); + if (!should_promote(mq, e, discarded_oblock, data_dir)) { + requeue(mq, e); result->op = POLICY_MISS; } else if (!can_migrate) r = -EWOULDBLOCK; else { - requeue_and_update_tick(mq, e); + requeue(mq, e); r = pre_cache_to_cache(mq, e, result); } @@ -795,7 +868,6 @@ static void insert_in_pre_cache(struct mq_policy *mq, e->dirty = false; e->oblock = oblock; e->hit_count = 1; - e->generation = mq->generation; push(mq, e); } @@ -828,7 +900,6 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, e->oblock = oblock; e->dirty = false; e->hit_count = 1; - e->generation = mq->generation; push(mq, e); result->cblock = infer_cblock(&mq->cache_pool, e); @@ -905,12 +976,37 @@ static void mq_destroy(struct dm_cache_policy *p) kfree(mq); } +static void update_pre_cache_hits(struct list_head *h, void *context) +{ + struct entry *e = container_of(h, struct entry, list); + e->hit_count++; +} + +static void update_cache_hits(struct list_head *h, void *context) +{ + struct mq_policy *mq = context; + struct entry *e = container_of(h, struct entry, list); + e->hit_count++; + mq->hit_count++; +} + static void copy_tick(struct mq_policy *mq) { - unsigned long flags; + unsigned long flags, tick; spin_lock_irqsave(&mq->tick_lock, flags); - mq->tick = mq->tick_protected; + tick = mq->tick_protected; + if (tick != mq->tick) { + queue_iterate_tick(&mq->pre_cache, update_pre_cache_hits, mq); + queue_iterate_tick(&mq->cache_dirty, update_cache_hits, mq); + queue_iterate_tick(&mq->cache_clean, update_cache_hits, mq); + mq->tick = tick; + } + + queue_tick(&mq->pre_cache); + queue_tick(&mq->cache_dirty); + queue_tick(&mq->cache_clean);< |
