From 4e420c452b11edf9d510c8180ac66f529e5b6206 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Mon, 6 Oct 2014 13:48:51 +0100
Subject: dm bufio: switch from a huge hash table to an rbtree

Converting over to using an rbtree eliminates a fixed 8MB allocation
from vmalloc space for the hash table.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-bufio.c | 97 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 54 insertions(+), 43 deletions(-)

diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 0be200b6dbf2..dcaa1d9dfbe4 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -14,6 +14,7 @@
 #include <linux/vmalloc.h>
 #include <linux/shrinker.h>
 #include <linux/module.h>
+#include <linux/rbtree.h>
 
 #define DM_MSG_PREFIX "bufio"
 
@@ -47,14 +48,6 @@
  */
 #define DM_BUFIO_INLINE_VECS		16
 
-/*
- * Buffer hash
- */
-#define DM_BUFIO_HASH_BITS	20
-#define DM_BUFIO_HASH(block) \
-	((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
-	 ((1 << DM_BUFIO_HASH_BITS) - 1))
-
 /*
  * Don't try to use kmem_cache_alloc for blocks larger than this.
  * For explanation, see alloc_buffer_data below.
@@ -106,7 +99,7 @@ struct dm_bufio_client {
 
 	unsigned minimum_buffers;
 
-	struct hlist_head *cache_hash;
+	struct rb_root buffer_tree;
 	wait_queue_head_t free_buffer_wait;
 
 	int async_write_error;
@@ -135,7 +128,7 @@ enum data_mode {
 };
 
 struct dm_buffer {
-	struct hlist_node hash_list;
+	struct rb_node node;
 	struct list_head lru_list;
 	sector_t block;
 	void *data;
@@ -253,6 +246,53 @@ static LIST_HEAD(dm_bufio_all_clients);
  */
 static DEFINE_MUTEX(dm_bufio_clients_lock);
 
+/*----------------------------------------------------------------
+ * A red/black tree acts as an index for all the buffers.
+ *--------------------------------------------------------------*/
+static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
+{
+	struct rb_node *n = c->buffer_tree.rb_node;
+	struct dm_buffer *b;
+
+	while (n) {
+		b = container_of(n, struct dm_buffer, node);
+
+		if (b->block == block)
+			return b;
+
+		n = (b->block < block) ? n->rb_left : n->rb_right;
+	}
+
+	return NULL;
+}
+
+static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
+{
+	struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
+	struct dm_buffer *found;
+
+	while (*new) {
+		found = container_of(*new, struct dm_buffer, node);
+
+		if (found->block == b->block) {
+			BUG_ON(found != b);
+			return;
+		}
+
+		parent = *new;
+		new = (found->block < b->block) ?
+			&((*new)->rb_left) : &((*new)->rb_right);
+	}
+
+	rb_link_node(&b->node, parent, new);
+	rb_insert_color(&b->node, &c->buffer_tree);
+}
+
+static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
+{
+	rb_erase(&b->node, &c->buffer_tree);
+}
+
 /*----------------------------------------------------------------*/
 
 static void adjust_total_allocated(enum data_mode data_mode, long diff)
@@ -434,7 +474,7 @@ static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
 	b->block = block;
 	b->list_mode = dirty;
 	list_add(&b->lru_list, &c->lru[dirty]);
-	hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
+	__insert(b->c, b);
 	b->last_accessed = jiffies;
 }
 
@@ -448,7 +488,7 @@ static void __unlink_buffer(struct dm_buffer *b)
 	BUG_ON(!c->n_buffers[b->list_mode]);
 
 	c->n_buffers[b->list_mode]--;
-	hlist_del(&b->hash_list);
+	__remove(b->c, b);
 	list_del(&b->lru_list);
 }
 
@@ -888,23 +928,6 @@ static void __check_watermark(struct dm_bufio_client *c,
 		__write_dirty_buffers_async(c, 1, write_list);
 }
 
-/*
- * Find a buffer in the hash.
- */
-static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
-{
-	struct dm_buffer *b;
-
-	hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)],
-			     hash_list) {
-		dm_bufio_cond_resched();
-		if (b->block == block)
-			return b;
-	}
-
-	return NULL;
-}
-
 /*----------------------------------------------------------------
  * Getting a buffer
  *--------------------------------------------------------------*/
@@ -1534,11 +1557,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
 		r = -ENOMEM;
 		goto bad_client;
 	}
-	c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
-	if (!c->cache_hash) {
-		r = -ENOMEM;
-		goto bad_hash;
-	}
+	c->buffer_tree = RB_ROOT;
 
 	c->bdev = bdev;
 	c->block_size = block_size;
@@ -1557,9 +1576,6 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
 		c->n_buffers[i] = 0;
 	}
 
-	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
-		INIT_HLIST_HEAD(&c->cache_hash[i]);
-
 	mutex_init(&c->lock);
 	INIT_LIST_HEAD(&c->reserved_buffers);
 	c->need_reserved_buffers = reserved_buffers;
@@ -1633,8 +1649,6 @@ bad_cache:
 	}
 	dm_io_client_destroy(c->dm_io);
 bad_dm_io:
-	vfree(c->cache_hash);
-bad_hash:
 	kfree(c);
 bad_client:
 	return ERR_PTR(r);
@@ -1661,9 +1675,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
 
 	mutex_unlock(&dm_bufio_clients_lock);
 
-	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
-		BUG_ON(!hlist_empty(&c->cache_hash[i]));
-
+	BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
 	BUG_ON(c->need_reserved_buffers);
 
 	while (!list_empty(&c->reserved_buffers)) {
@@ -1681,7 +1693,6 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
 		BUG_ON(c->n_buffers[i]);
 
 	dm_io_client_destroy(c->dm_io);
-	vfree(c->cache_hash);
 	kfree(c);
 }
 EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
-- 
cgit v1.2.3


From 33096a7822de63bc7dbdd090870b656a0304fa35 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Thu, 9 Oct 2014 11:10:25 +0100
Subject: dm bufio: evict buffers that are past the max age but retain some
 buffers

These changes help keep metadata backed by dm-bufio in-core longer which
fixes reports of metadata churn in the face of heavy random IO workloads.

Before, bufio evicted all buffers older than DM_BUFIO_DEFAULT_AGE_SECS.
Having a device (e.g. dm-thinp or dm-cache) lose all metadata just
because associated buffers had been idle for some time is unfriendly.

Now, the user may configure the number of bytes that bufio retains
using the 'retain_bytes' module parameter.  The default is 256K.

Also, the DM_BUFIO_WORK_TIMER_SECS and DM_BUFIO_DEFAULT_AGE_SECS
defaults were quite low so increase them (to 30 and 300 respectively).

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-bufio.c | 109 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 75 insertions(+), 34 deletions(-)

diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index dcaa1d9dfbe4..99579c81ae0a 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -35,12 +35,17 @@
 /*
  * Check buffer ages in this interval (seconds)
  */
-#define DM_BUFIO_WORK_TIMER_SECS	10
+#define DM_BUFIO_WORK_TIMER_SECS	30
 
 /*
  * Free buffers when they are older than this (seconds)
  */
-#define DM_BUFIO_DEFAULT_AGE_SECS	60
+#define DM_BUFIO_DEFAULT_AGE_SECS	300
+
+/*
+ * The number of bytes of cached data to keep around.
+ */
+#define DM_BUFIO_DEFAULT_RETAIN_BYTES   (256 * 1024)
 
 /*
  * The number of bvec entries that are embedded directly in the buffer.
@@ -216,6 +221,7 @@ static DEFINE_SPINLOCK(param_spinlock);
  * Buffers are freed after this timeout
  */
 static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
+static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
 
 static unsigned long dm_bufio_peak_allocated;
 static unsigned long dm_bufio_allocated_kmem_cache;
@@ -1457,45 +1463,52 @@ static void drop_buffers(struct dm_bufio_client *c)
 }
 
 /*
- * Test if the buffer is unused and too old, and commit it.
+ * We may not be able to evict this buffer if IO is pending or the client
+ * is still using it.  Caller is expected to know the buffer is too old.
+ *
  * And if GFP_NOFS is used, we must not do any I/O because we hold
  * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
  * rerouted to different bufio client.
  */
-static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
-				unsigned long max_jiffies)
+static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
 {
-	if (jiffies - b->last_accessed < max_jiffies)
-		return 0;
-
 	if (!(gfp & __GFP_FS)) {
 		if (test_bit(B_READING, &b->state) ||
 		    test_bit(B_WRITING, &b->state) ||
 		    test_bit(B_DIRTY, &b->state))
-			return 0;
+			return false;
 	}
 
 	if (b->hold_count)
-		return 0;
+		return false;
 
 	__make_buffer_clean(b);
 	__unlink_buffer(b);
 	__free_buffer_wake(b);
 
-	return 1;
+	return true;
 }
 
-static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
-		   gfp_t gfp_mask)
+static unsigned get_retain_buffers(struct dm_bufio_client *c)
+{
+        unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
+        return retain_bytes / c->block_size;
+}
+
+static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
+			    gfp_t gfp_mask)
 {
 	int l;
 	struct dm_buffer *b, *tmp;
-	long freed = 0;
+	unsigned long freed = 0;
+	unsigned long count = nr_to_scan;
+	unsigned retain_target = get_retain_buffers(c);
 
 	for (l = 0; l < LIST_SIZE; l++) {
 		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
-			freed += __cleanup_old_buffer(b, gfp_mask, 0);
-			if (!--nr_to_scan)
+			if (__try_evict_buffer(b, gfp_mask))
+				freed++;
+			if (!--nr_to_scan || ((count - freed) <= retain_target))
 				return freed;
 			dm_bufio_cond_resched();
 		}
@@ -1697,31 +1710,56 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
 }
 EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
 
-static void cleanup_old_buffers(void)
+static unsigned get_max_age_hz(void)
 {
-	unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age);
-	struct dm_bufio_client *c;
+	unsigned max_age = ACCESS_ONCE(dm_bufio_max_age);
 
-	if (max_age > ULONG_MAX / HZ)
-		max_age = ULONG_MAX / HZ;
+	if (max_age > UINT_MAX / HZ)
+		max_age = UINT_MAX / HZ;
 
-	mutex_lock(&dm_bufio_clients_lock);
-	list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
-		if (!dm_bufio_trylock(c))
-			continue;
+	return max_age * HZ;
+}
 
-		while (!list_empty(&c->lru[LIST_CLEAN])) {
-			struct dm_buffer *b;
-			b = list_entry(c->lru[LIST_CLEAN].prev,
-				       struct dm_buffer, lru_list);
-			if (!__cleanup_old_buffer(b, 0, max_age * HZ))
-				break;
-			dm_bufio_cond_resched();
-		}
+static bool older_than(struct dm_buffer *b, unsigned long age_hz)
+{
+	return (jiffies - b->last_accessed) >= age_hz;
+}
+
+static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
+{
+	struct dm_buffer *b, *tmp;
+	unsigned retain_target = get_retain_buffers(c);
+	unsigned count;
+
+	dm_bufio_lock(c);
+
+	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
+	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
+		if (count <= retain_target)
+			break;
+
+		if (!older_than(b, age_hz))
+			break;
+
+		if (__try_evict_buffer(b, 0))
+			count--;
 
-		dm_bufio_unlock(c);
 		dm_bufio_cond_resched();
 	}
+
+	dm_bufio_unlock(c);
+}
+
+static void cleanup_old_buffers(void)
+{
+	unsigned long max_age_hz = get_max_age_hz();
+	struct dm_bufio_client *c;
+
+	mutex_lock(&dm_bufio_clients_lock);
+
+	list_for_each_entry(c, &dm_bufio_all_clients, client_list)
+		__evict_old_buffers(c, max_age_hz);
+
 	mutex_unlock(&dm_bufio_clients_lock);
 }
 
@@ -1846,6 +1884,9 @@ MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
 module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
 
+module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
+
 module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
 
-- 
cgit v1.2.3


From a195db2d29a47c2c3a61386009bd400df18c86cf Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Mon, 6 Oct 2014 16:30:06 -0400
Subject: dm bio prison: switch to using a red black tree

Previously it was using a fixed sized hash table.  There are times
when very many concurrent cells are held (such as when processing a very
large discard).  When this happens the hash table performance becomes
very poor.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-bio-prison.c   | 172 ++++++++++++++++++-------------------------
 drivers/md/dm-bio-prison.h   |   7 +-
 drivers/md/dm-cache-target.c |   3 +-
 drivers/md/dm-thin.c         |   3 +-
 4 files changed, 79 insertions(+), 106 deletions(-)

diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index f752d12081ff..90a56625245a 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -14,68 +14,38 @@
 
 /*----------------------------------------------------------------*/
 
-struct bucket {
-	spinlock_t lock;
-	struct hlist_head cells;
-};
+#define MIN_CELLS 1024
 
 struct dm_bio_prison {
+	spinlock_t lock;
 	mempool_t *cell_pool;
-
-	unsigned nr_buckets;
-	unsigned hash_mask;
-	struct bucket *buckets;
+	struct rb_root cells;
 };
 
-/*----------------------------------------------------------------*/
-
-static uint32_t calc_nr_buckets(unsigned nr_cells)
-{
-	uint32_t n = 128;
-
-	nr_cells /= 4;
-	nr_cells = min(nr_cells, 8192u);
-
-	while (n < nr_cells)
-		n <<= 1;
-
-	return n;
-}
-
 static struct kmem_cache *_cell_cache;
 
-static void init_bucket(struct bucket *b)
-{
-	spin_lock_init(&b->lock);
-	INIT_HLIST_HEAD(&b->cells);
-}
+/*----------------------------------------------------------------*/
 
 /*
  * @nr_cells should be the number of cells you want in use _concurrently_.
  * Don't confuse it with the number of distinct keys.
  */
-struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells)
+struct dm_bio_prison *dm_bio_prison_create(void)
 {
-	unsigned i;
-	uint32_t nr_buckets = calc_nr_buckets(nr_cells);
-	size_t len = sizeof(struct dm_bio_prison) +
-		(sizeof(struct bucket) * nr_buckets);
-	struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL);
+	struct dm_bio_prison *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
 
 	if (!prison)
 		return NULL;
 
-	prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
+	spin_lock_init(&prison->lock);
+
+	prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache);
 	if (!prison->cell_pool) {
 		kfree(prison);
 		return NULL;
 	}
 
-	prison->nr_buckets = nr_buckets;
-	prison->hash_mask = nr_buckets - 1;
-	prison->buckets = (struct bucket *) (prison + 1);
-	for (i = 0; i < nr_buckets; i++)
-		init_bucket(prison->buckets + i);
+	prison->cells = RB_ROOT;
 
 	return prison;
 }
@@ -101,68 +71,73 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
 }
 EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell);
 
-static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key)
+static void __setup_new_cell(struct dm_cell_key *key,
+			     struct bio *holder,
+			     struct dm_bio_prison_cell *cell)
 {
-	const unsigned long BIG_PRIME = 4294967291UL;
-	uint64_t hash = key->block * BIG_PRIME;
-
-	return (uint32_t) (hash & prison->hash_mask);
+       memcpy(&cell->key, key, sizeof(cell->key));
+       cell->holder = holder;
+       bio_list_init(&cell->bios);
 }
 
-static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs)
+static int cmp_keys(struct dm_cell_key *lhs,
+		    struct dm_cell_key *rhs)
 {
-	       return (lhs->virtual == rhs->virtual) &&
-		       (lhs->dev == rhs->dev) &&
-		       (lhs->block == rhs->block);
-}
+	if (lhs->virtual < rhs->virtual)
+		return -1;
 
-static struct bucket *get_bucket(struct dm_bio_prison *prison,
-				 struct dm_cell_key *key)
-{
-	return prison->buckets + hash_key(prison, key);
-}
+	if (lhs->virtual > rhs->virtual)
+		return 1;
 
-static struct dm_bio_prison_cell *__search_bucket(struct bucket *b,
-						  struct dm_cell_key *key)
-{
-	struct dm_bio_prison_cell *cell;
+	if (lhs->dev < rhs->dev)
+		return -1;
 
-	hlist_for_each_entry(cell, &b->cells, list)
-		if (keys_equal(&cell->key, key))
-			return cell;
+	if (lhs->dev > rhs->dev)
+		return 1;
 
-	return NULL;
-}
+	if (lhs->block < rhs->block)
+		return -1;
 
-static void __setup_new_cell(struct bucket *b,
-			     struct dm_cell_key *key,
-			     struct bio *holder,
-			     struct dm_bio_prison_cell *cell)
-{
-	memcpy(&cell->key, key, sizeof(cell->key));
-	cell->holder = holder;
-	bio_list_init(&cell->bios);
-	hlist_add_head(&cell->list, &b->cells);
+	if (lhs->block > rhs->block)
+		return 1;
+
+	return 0;
 }
 
-static int __bio_detain(struct bucket *b,
+static int __bio_detain(struct dm_bio_prison *prison,
 			struct dm_cell_key *key,
 			struct bio *inmate,
 			struct dm_bio_prison_cell *cell_prealloc,
 			struct dm_bio_prison_cell **cell_result)
 {
-	struct dm_bio_prison_cell *cell;
-
-	cell = __search_bucket(b, key);
-	if (cell) {
-		if (inmate)
-			bio_list_add(&cell->bios, inmate);
-		*cell_result = cell;
-		return 1;
+	int r;
+	struct rb_node **new = &prison->cells.rb_node, *parent = NULL;
+
+	while (*new) {
+		struct dm_bio_prison_cell *cell =
+			container_of(*new, struct dm_bio_prison_cell, node);
+
+		r = cmp_keys(key, &cell->key);
+
+		parent = *new;
+		if (r < 0)
+			new = &((*new)->rb_left);
+		else if (r > 0)
+			new = &((*new)->rb_right);
+		else {
+			if (inmate)
+				bio_list_add(&cell->bios, inmate);
+			*cell_result = cell;
+			return 1;
+		}
 	}
 
-	__setup_new_cell(b, key, inmate, cell_prealloc);
+	__setup_new_cell(key, inmate, cell_prealloc);
 	*cell_result = cell_prealloc;
+
+	rb_link_node(&cell_prealloc->node, parent, new);
+	rb_insert_color(&cell_prealloc->node, &prison->cells);
+
 	return 0;
 }
 
@@ -174,11 +149,10 @@ static int bio_detain(struct dm_bio_prison *prison,
 {
 	int r;
 	unsigned long flags;
-	struct bucket *b = get_bucket(prison, key);
 
-	spin_lock_irqsave(&b->lock, flags);
-	r = __bio_detain(b, key, inmate, cell_prealloc, cell_result);
-	spin_unlock_irqrestore(&b->lock, flags);
+	spin_lock_irqsave(&prison->lock, flags);
+	r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result);
+	spin_unlock_irqrestore(&prison->lock, flags);
 
 	return r;
 }
@@ -205,10 +179,11 @@ EXPORT_SYMBOL_GPL(dm_get_cell);
 /*
  * @inmates must have been initialised prior to this call
  */
-static void __cell_release(struct dm_bio_prison_cell *cell,
+static void __cell_release(struct dm_bio_prison *prison,
+			   struct dm_bio_prison_cell *cell,
 			   struct bio_list *inmates)
 {
-	hlist_del(&cell->list);
+	rb_erase(&cell->node, &prison->cells);
 
 	if (inmates) {
 		if (cell->holder)
@@ -222,21 +197,21 @@ void dm_cell_release(struct dm_bio_prison *prison,
 		     struct bio_list *bios)
 {
 	unsigned long flags;
-	struct bucket *b = get_bucket(prison, &cell->key);
 
-	spin_lock_irqsave(&b->lock, flags);
-	__cell_release(cell, bios);
-	spin_unlock_irqrestore(&b->lock, flags);
+	spin_lock_irqsave(&prison->lock, flags);
+	__cell_release(prison, cell, bios);
+	spin_unlock_irqrestore(&prison->lock, flags);
 }
 EXPORT_SYMBOL_GPL(dm_cell_release);
 
 /*
  * Sometimes we don't want the holder, just the additional bios.
  */
-static void __cell_release_no_holder(struct dm_bio_prison_cell *cell,
+static void __cell_release_no_holder(struct dm_bio_prison *prison,
+				     struct dm_bio_prison_cell *cell,
 				     struct bio_list *inmates)
 {
-	hlist_del(&cell->list);
+	rb_erase(&cell->node, &prison->cells);
 	bio_list_merge(inmates, &cell->bios);
 }
 
@@ -245,11 +220,10 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
 			       struct bio_list *inmates)
 {
 	unsigned long flags;
-	struct bucket *b = get_bucket(prison, &cell->key);
 
-	spin_lock_irqsave(&b->lock, flags);
-	__cell_release_no_holder(cell, inmates);
-	spin_unlock_irqrestore(&b->lock, flags);
+	spin_lock_irqsave(&prison->lock, flags);
+	__cell_release_no_holder(prison, cell, inmates);
+	spin_unlock_irqrestore(&prison->lock, flags);
 }
 EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
 
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
index 6805a142b750..997a43960e77 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
@@ -10,8 +10,8 @@
 #include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */
 #include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */
 
-#include <linux/list.h>
 #include <linux/bio.h>
+#include <linux/rbtree.h>
 
 /*----------------------------------------------------------------*/
 
@@ -35,13 +35,14 @@ struct dm_cell_key {
  * themselves.
  */
 struct dm_bio_prison_cell {
-	struct hlist_node list;
+	struct rb_node node;
+
 	struct dm_cell_key key;
 	struct bio *holder;
 	struct bio_list bios;
 };
 
-struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells);
+struct dm_bio_prison *dm_bio_prison_create(void);
 void dm_bio_prison_destroy(struct dm_bio_prison *prison);
 
 /*
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 7130505c2425..69de8b43ca12 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -95,7 +95,6 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
 
 /*----------------------------------------------------------------*/
 
-#define PRISON_CELLS 1024
 #define MIGRATION_POOL_SIZE 128
 #define COMMIT_PERIOD HZ
 #define MIGRATION_COUNT_WINDOW 10
@@ -2327,7 +2326,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 	INIT_DELAYED_WORK(&cache->waker, do_waker);
 	cache->last_commit_jiffies = jiffies;
 
-	cache->prison = dm_bio_prison_create(PRISON_CELLS);
+	cache->prison = dm_bio_prison_create();
 	if (!cache->prison) {
 		*error = "could not create bio prison";
 		goto bad;
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 0f86d802b533..eecfe7495232 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -25,7 +25,6 @@
  */
 #define ENDIO_HOOK_POOL_SIZE 1024
 #define MAPPING_POOL_SIZE 1024
-#define PRISON_CELLS 1024
 #define COMMIT_PERIOD HZ
 #define NO_SPACE_TIMEOUT_SECS 60
 
@@ -2193,7 +2192,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 		pool->sectors_per_block_shift = __ffs(block_size);
 	pool->low_water_blocks = 0;
 	pool_features_init(&pool->pf);
-	pool->prison = dm_bio_prison_create(PRISON_CELLS);
+	pool->prison = dm_bio_prison_create();
 	if (!pool->prison) {
 		*error = "Error creating pool's bio prison";
 		err_p = ERR_PTR(-ENOMEM);
-- 
cgit v1.2.3


From e5cfc69a513cdc9d9e753c5ce07f0cc6b496bfd3 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Mon, 6 Oct 2014 15:24:55 +0100
Subject: dm thin metadata: change dm_thin_find_block to allow blocking, but
 not issuing, IO

This change is a prerequisite for allowing metadata to be prefetched.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin-metadata.c | 30 +++++++++++++-----------------
 drivers/md/dm-thin-metadata.h |  4 ++--
 2 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index e9d33ad59df5..ee42d1c52387 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1384,42 +1384,38 @@ static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
 }
 
 int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
-		       int can_block, struct dm_thin_lookup_result *result)
+		       int can_issue_io, struct dm_thin_lookup_result *result)
 {
-	int r = -EINVAL;
-	uint64_t block_time = 0;
+	int r;
 	__le64 value;
 	struct dm_pool_metadata *pmd = td->pmd;
 	dm_block_t keys[2] = { td->id, block };
 	struct dm_btree_info *info;
 
-	if (can_block) {
-		down_read(&pmd->root_lock);
-		info = &pmd->info;
-	} else if (down_read_trylock(&pmd->root_lock))
-		info = &pmd->nb_info;
-	else
-		return -EWOULDBLOCK;
-
 	if (pmd->fail_io)
-		goto out;
+		return -EINVAL;
 
-	r = dm_btree_lookup(info, pmd->root, keys, &value);
-	if (!r)
-		block_time = le64_to_cpu(value);
+	down_read(&pmd->root_lock);
 
-out:
-	up_read(&pmd->root_lock);
+	if (can_issue_io) {
+		info = &pmd->info;
+	} else
+		info = &pmd->nb_info;
 
+	r = dm_btree_lookup(info, pmd->root, keys, &value);
 	if (!r) {
+		uint64_t block_time = 0;
 		dm_block_t exception_block;
 		uint32_t exception_time;
+
+		block_time = le64_to_cpu(value);
 		unpack_block_time(block_time, &exception_block,
 				  &exception_time);
 		result->block = exception_block;
 		result->shared = __snapshotted_since(td, exception_time);
 	}
 
+	up_read(&pmd->root_lock);
 	return r;
 }
 
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index e3c857db195a..efedd5a4cd8f 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -139,12 +139,12 @@ struct dm_thin_lookup_result {
 
 /*
  * Returns:
- *   -EWOULDBLOCK iff @can_block is set and would block.
+ *   -EWOULDBLOCK iff @can_issue_io is not set and the lookup would issue IO
  *   -ENODATA iff that mapping is not present.
  *   0 success
  */
 int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
-		       int can_block, struct dm_thin_lookup_result *result);
+		       int can_issue_io, struct dm_thin_lookup_result *result);
 
 /*
  * Obtain an unused block.
-- 
cgit v1.2.3


From 4646015d7e4ca5a4dc19427fb0a0aeff15a4fd91 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Mon, 6 Oct 2014 15:27:26 +0100
Subject: dm transaction manager: add support for prefetching blocks of
 metadata

Introduce the dm_tm_issue_prefetches interface.  If you're using a
non-blocking clone the tm will build up a list of requested blocks that
weren't in core.  dm_tm_issue_prefetches will request those blocks to be
prefetched.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 .../md/persistent-data/dm-transaction-manager.c    | 77 +++++++++++++++++++++-
 .../md/persistent-data/dm-transaction-manager.h    |  7 ++
 2 files changed, 82 insertions(+), 2 deletions(-)

diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index 3bc30a0ae3d6..9cb797d800cf 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -10,6 +10,8 @@
 #include "dm-persistent-data-internal.h"
 
 #include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/hash.h>
 #include <linux/slab.h>
 #include <linux/device-mapper.h>
 
@@ -17,6 +19,61 @@
 
 /*----------------------------------------------------------------*/
 
+#define PREFETCH_SIZE 128
+#define PREFETCH_BITS 7
+#define PREFETCH_SENTINEL ((dm_block_t) -1ULL)
+
+struct prefetch_set {
+	struct mutex lock;
+	dm_block_t blocks[PREFETCH_SIZE];
+};
+
+static unsigned prefetch_hash(dm_block_t b)
+{
+	return hash_64(b, PREFETCH_BITS);
+}
+
+static void prefetch_wipe(struct prefetch_set *p)
+{
+	unsigned i;
+	for (i = 0; i < PREFETCH_SIZE; i++)
+		p->blocks[i] = PREFETCH_SENTINEL;
+}
+
+static void prefetch_init(struct prefetch_set *p)
+{
+	mutex_init(&p->lock);
+	prefetch_wipe(p);
+}
+
+static void prefetch_add(struct prefetch_set *p, dm_block_t b)
+{
+	unsigned h = prefetch_hash(b);
+
+	mutex_lock(&p->lock);
+	if (p->blocks[h] == PREFETCH_SENTINEL)
+		p->blocks[h] = b;
+
+	mutex_unlock(&p->lock);
+}
+
+static void prefetch_issue(struct prefetch_set *p, struct dm_block_manager *bm)
+{
+	unsigned i;
+
+	mutex_lock(&p->lock);
+
+	for (i = 0; i < PREFETCH_SIZE; i++)
+		if (p->blocks[i] != PREFETCH_SENTINEL) {
+			dm_bm_prefetch(bm, p->blocks[i]);
+			p->blocks[i] = PREFETCH_SENTINEL;
+		}
+
+	mutex_unlock(&p->lock);
+}
+
+/*----------------------------------------------------------------*/
+
 struct shadow_info {
 	struct hlist_node hlist;
 	dm_block_t where;
@@ -37,6 +94,8 @@ struct dm_transaction_manager {
 
 	spinlock_t lock;
 	struct hlist_head buckets[DM_HASH_SIZE];
+
+	struct prefetch_set prefetches;
 };
 
 /*----------------------------------------------------------------*/
@@ -117,6 +176,8 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
 	for (i = 0; i < DM_HASH_SIZE; i++)
 		INIT_HLIST_HEAD(tm->buckets + i);
 
+	prefetch_init(&tm->prefetches);
+
 	return tm;
 }
 
@@ -268,8 +329,14 @@ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
 		    struct dm_block_validator *v,
 		    struct dm_block **blk)
 {
-	if (tm->is_clone)
-		return dm_bm_read_try_lock(tm->real->bm, b, v, blk);
+	if (tm->is_clone) {
+		int r = dm_bm_read_try_lock(tm->real->bm, b, v, blk);
+
+		if (r == -EWOULDBLOCK)
+			prefetch_add(&tm->real->prefetches, b);
+
+		return r;
+	}
 
 	return dm_bm_read_lock(tm->bm, b, v, blk);
 }
@@ -317,6 +384,12 @@ struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm)
 	return tm->bm;
 }
 
+void dm_tm_issue_prefetches(struct dm_transaction_manager *tm)
+{
+	prefetch_issue(&tm->prefetches, tm->bm);
+}
+EXPORT_SYMBOL_GPL(dm_tm_issue_prefetches);
+
 /*----------------------------------------------------------------*/
 
 static int dm_tm_create_internal(struct dm_block_manager *bm,
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
index 2772ed2a781a..2e0d4d66fb1b 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.h
+++ b/drivers/md/persistent-data/dm-transaction-manager.h
@@ -108,6 +108,13 @@ int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
 
 struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm);
 
+/*
+ * If you're using a non-blocking clone the tm will build up a list of
+ * requested blocks that weren't in core.  This call will request those
+ * blocks to be prefetched.
+ */
+void dm_tm_issue_prefetches(struct dm_transaction_manager *tm);
+
 /*
  * A little utility that ties the knot by producing a transaction manager
  * that has a space map managed by the transaction manager...
-- 
cgit v1.2.3


From 8a01a6af75f839ff8eb25dab69b49224e855bfa1 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Mon, 6 Oct 2014 15:28:30 +0100
Subject: dm thin: prefetch missing metadata pages

Prefetch metadata at the start of the worker thread and then again every
128th bio processed from the deferred list.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin-metadata.c |  5 +++++
 drivers/md/dm-thin-metadata.h |  5 +++++
 drivers/md/dm-thin.c          | 10 ++++++----
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index ee42d1c52387..43adbb863f5a 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1809,3 +1809,8 @@ bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
 
 	return needs_check;
 }
+
+void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd)
+{
+	dm_tm_issue_prefetches(pmd->tm);
+}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index efedd5a4cd8f..921d15ee56a0 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -213,6 +213,11 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd);
 bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
 
+/*
+ * Issue any prefetches that may be useful.
+ */
+void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd);
+
 /*----------------------------------------------------------------*/
 
 #endif
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index eecfe7495232..97a7eb4d0412 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1526,6 +1526,7 @@ static void process_thin_deferred_bios(struct thin_c *tc)
 	struct bio *bio;
 	struct bio_list bios;
 	struct blk_plug plug;
+	unsigned count = 0;
 
 	if (tc->requeue_mode) {
 		requeue_bio_list(tc, &tc->deferred_bio_list);
@@ -1567,6 +1568,10 @@ static void process_thin_deferred_bios(struct thin_c *tc)
 			pool->process_discard(tc, bio);
 		else
 			pool->process_bio(tc, bio);
+
+		if ((count++ & 127) == 0) {
+			dm_pool_issue_prefetches(pool->pmd);
+		}
 	}
 	blk_finish_plug(&plug);
 }
@@ -1652,6 +1657,7 @@ static void do_worker(struct work_struct *ws)
 {
 	struct pool *pool = container_of(ws, struct pool, worker);
 
+	dm_pool_issue_prefetches(pool->pmd);
 	process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
 	process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
 	process_deferred_bios(pool);
@@ -1996,10 +2002,6 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
 		/* fall through */
 
 	case -EWOULDBLOCK:
-		/*
-		 * In future, the failed dm_thin_find_block above could
-		 * provide the hint to load the metadata into cache.
-		 */
 		thin_defer_bio(tc, bio);
 		cell_defer_no_holder_no_free(tc, &cell1);
 		return DM_MAPIO_SUBMITTED;
-- 
cgit v1.2.3


From 7d327fe051edcccf54da7b6733c58992473f228b Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Mon, 6 Oct 2014 15:45:59 +0100
Subject: dm thin: throttle incoming IO

Throttle IO based on the time it's taking the worker to do one loop.
There were reports of hung task timeouts occurring and it was observed
that the excessively long avgqu-sz (as reported by iostat) was
contributing to these hung tasks.

Throttling definitely helps dm-thinp perform better under heavy IO load
(without being detrimental by being overzealous).  It reduces avgqu-sz
drastically, e.g.: from 60K to ~6K, and even as low as 150 once metadata
is cached by bufio, when dirty_ratio=5, dirty_background_ratio=2.  And
avgqu-sz stays at or below 30K even with dirty_ratio=20,
dirty_background_ratio=10.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 65 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 97a7eb4d0412..91b430b883fd 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -126,6 +126,53 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 
 /*----------------------------------------------------------------*/
 
+#define THROTTLE_THRESHOLD (1 * HZ)
+
+struct throttle {
+	struct rw_semaphore lock;
+	unsigned long threshold;
+	bool throttle_applied;
+};
+
+static void throttle_init(struct throttle *t)
+{
+	init_rwsem(&t->lock);
+	t->throttle_applied = false;
+}
+
+static void throttle_work_start(struct throttle *t)
+{
+	t->threshold = jiffies + THROTTLE_THRESHOLD;
+}
+
+static void throttle_work_update(struct throttle *t)
+{
+	if (!t->throttle_applied && jiffies > t->threshold) {
+		down_write(&t->lock);
+		t->throttle_applied = true;
+	}
+}
+
+static void throttle_work_complete(struct throttle *t)
+{
+	if (t->throttle_applied) {
+		t->throttle_applied = false;
+		up_write(&t->lock);
+	}
+}
+
+static void throttle_lock(struct throttle *t)
+{
+	down_read(&t->lock);
+}
+
+static void throttle_unlock(struct throttle *t)
+{
+	up_read(&t->lock);
+}
+
+/*----------------------------------------------------------------*/
+
 /*
  * A pool device ties together a metadata device and a data device.  It
  * also provides the interface for creating and destroying internal
@@ -175,6 +222,7 @@ struct pool {
 	struct dm_kcopyd_client *copier;
 
 	struct workqueue_struct *wq;
+	struct throttle throttle;
 	struct work_struct worker;
 	struct delayed_work waker;
 	struct delayed_work no_space_timeout;
@@ -1570,6 +1618,7 @@ static void process_thin_deferred_bios(struct thin_c *tc)
 			pool->process_bio(tc, bio);
 
 		if ((count++ & 127) == 0) {
+			throttle_work_update(&pool->throttle);
 			dm_pool_issue_prefetches(pool->pmd);
 		}
 	}
@@ -1657,10 +1706,15 @@ static void do_worker(struct work_struct *ws)
 {
 	struct pool *pool = container_of(ws, struct pool, worker);
 
+	throttle_work_start(&pool->throttle);
 	dm_pool_issue_prefetches(pool->pmd);
+	throttle_work_update(&pool->throttle);
 	process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
+	throttle_work_update(&pool->throttle);
 	process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
+	throttle_work_update(&pool->throttle);
 	process_deferred_bios(pool);
+	throttle_work_complete(&pool->throttle);
 }
 
 /*
@@ -1900,6 +1954,15 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
 	wake_worker(pool);
 }
 
+static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
+{
+	struct pool *pool = tc->pool;
+
+	throttle_lock(&pool->throttle);
+	thin_defer_bio(tc, bio);
+	throttle_unlock(&pool->throttle);
+}
+
 static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
 {
 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@ -1937,7 +2000,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
 	}
 
 	if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
-		thin_defer_bio(tc, bio);
+		thin_defer_bio_with_throttle(tc, bio);
 		return DM_MAPIO_SUBMITTED;
 	}
 
@@ -2220,6 +2283,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 		goto bad_wq;
 	}
 
+	throttle_init(&pool->throttle);
 	INIT_WORK(&pool->worker, do_worker);
 	INIT_DELAYED_WORK(&pool->waker, do_waker);
 	INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
-- 
cgit v1.2.3


From 604ea90641b45f41f8dee34ce45694f1e0c53a5a Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 9 Oct 2014 18:43:25 -0400
Subject: dm thin: adjust max_sectors_kb based on thinp blocksize

Allows for filesystems to submit bios that are a factor of the thinp
blocksize, improving dm-thinp efficiency (particularly when the data
volume is RAID).

Also set io_min to max_sectors_kb if it is a factor of the thinp
blocksize.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin.c | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 91b430b883fd..de55ae9d4926 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -11,6 +11,7 @@
 #include <linux/device-mapper.h>
 #include <linux/dm-io.h>
 #include <linux/dm-kcopyd.h>
+#include <linux/log2.h>
 #include <linux/list.h>
 #include <linux/rculist.h>
 #include <linux/init.h>
@@ -3242,15 +3243,42 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
 	struct pool_c *pt = ti->private;
 	struct pool *pool = pt->pool;
-	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
+	sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
+
+	/*
+	 * Adjust max_sectors_kb to highest possible power-of-2
+	 * factor of pool->sectors_per_block.
+	 */
+	if (limits->max_hw_sectors & (limits->max_hw_sectors - 1))
+		limits->max_sectors = rounddown_pow_of_two(limits->max_hw_sectors);
+	else
+		limits->max_sectors = limits->max_hw_sectors;
+
+	if (limits->max_sectors < pool->sectors_per_block) {
+		while (!is_factor(pool->sectors_per_block, limits->max_sectors)) {
+			if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
+				limits->max_sectors--;
+			limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
+		}
+	} else if (block_size_is_power_of_two(pool)) {
+		/* max_sectors_kb is >= power-of-2 thinp blocksize */
+		while (!is_factor(limits->max_sectors, pool->sectors_per_block)) {
+			if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
+				limits->max_sectors--;
+			limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
+		}
+	}
 
 	/*
 	 * If the system-determined stacked limits are compatible with the
 	 * pool's blocksize (io_opt is a factor) do not override them.
 	 */
 	if (io_opt_sectors < pool->sectors_per_block ||
-	    do_div(io_opt_sectors, pool->sectors_per_block)) {
-		blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
+	    !is_factor(io_opt_sectors, pool->sectors_per_block)) {
+		if (is_factor(pool->sectors_per_block, limits->max_sectors))
+			blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT);
+		else
+			blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
 		blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
 	}
 
-- 
cgit v1.2.3


From 148e51baf8e7ae2070ec47c2a0ec05ddf6a47da1 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 9 Oct 2014 19:32:22 -0400
Subject: dm: improve documentation and code clarity in dm_merge_bvec

These code changes do not introduce a functional change.

But bio_add_page() will never attempt to build up a bio larger than
queue_max_sectors().  Similarly, bio_get_nr_vecs() is also bound by
queue_max_sectors().  Therefore, there is no point in allowing
dm_merge_bvec() to answer "how many sectors can a bio have at this
offset?" with anything larger than queue_max_sectors().  Using
queue_max_sectors() rather than BIO_MAX_SECTORS serves to more
accurately convey the limits that are being imposed.

Also, use unlikely() to clarify the fact that the defensive code in
dm_merge_bvec() relative to max_size going negative shouldn't ever
happen -- if it does happen there is a bug in the block layer for
requesting larger than dm_merge_bvec()'s initial response for a given
offset.  Also, update a comment in dm_merge_bvec() relative to
max_hw_sectors_kb.  And fix empty newline whitespace.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 58f3927fd7cc..0fee0e54d36f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1607,9 +1607,9 @@ static int dm_merge_bvec(struct request_queue *q,
 	 * Find maximum amount of I/O that won't need splitting
 	 */
 	max_sectors = min(max_io_len(bvm->bi_sector, ti),
-			  (sector_t) BIO_MAX_SECTORS);
+			  (sector_t) queue_max_sectors(q));
 	max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
-	if (max_size < 0)
+	if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */
 		max_size = 0;
 
 	/*
@@ -1621,10 +1621,10 @@ static int dm_merge_bvec(struct request_queue *q,
 		max_size = ti->type->merge(ti, bvm, biovec, max_size);
 	/*
 	 * If the target doesn't support merge method and some of the devices
-	 * provided their merge_bvec method (we know this by looking at
-	 * queue_max_hw_sectors), then we can't allow bios with multiple vector
-	 * entries.  So always set max_size to 0, and the code below allows
-	 * just one page.
+	 * provided their merge_bvec method (we know this by looking for the
+	 * max_hw_sectors that dm_set_device_limits may set), then we can't
+	 * allow bios with multiple vector entries.  So always set max_size
+	 * to 0, and the code below allows just one page.
 	 */
 	else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
 		max_size = 0;
-- 
cgit v1.2.3


From 36f12aeb714fc04752997d6c07b6afb2fa0ac947 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 9 Oct 2014 15:24:12 -0400
Subject: dm thin: implement thin_merge

Introduce thin_merge so that any additional constraints from the data
volume may be taken into account when determining the maximum number of
sectors that can be issued relative to the specified logical offset.

This is particularly important if/when the data volume is layered on top
of a more sophisticated device (e.g. dm-raid or some other DM target).

Reviewed-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index de55ae9d4926..068607828691 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -3307,7 +3307,7 @@ static struct target_type pool_target = {
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 13, 0},
+	.version = {1, 14, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -3634,6 +3634,21 @@ err:
 	DMEMIT("Error");
 }
 
+static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+		      struct bio_vec *biovec, int max_size)
+{
+	struct thin_c *tc = ti->private;
+	struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev);
+
+	if (!q->merge_bvec_fn)
+		return max_size;
+
+	bvm->bi_bdev = tc->pool_dev->bdev;
+	bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
+
+	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
 static int thin_iterate_devices(struct dm_target *ti,
 				iterate_devices_callout_fn fn, void *data)
 {
@@ -3658,7 +3673,7 @@ static int thin_iterate_devices(struct dm_target *ti,
 
 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 13, 0},
+	.version = {1, 14, 0},
 	.module	= THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,
@@ -3668,6 +3683,7 @@ static struct target_type thin_target = {
 	.presuspend = thin_presuspend,
 	.postsuspend = thin_postsuspend,
 	.status = thin_status,
+	.merge = thin_merge,
 	.iterate_devices = thin_iterate_devices,
 };
 
-- 
cgit v1.2.3


From 7a7e97ca580b944d2c89b59bc74a7b9ddd044705 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 12 Sep 2014 11:34:01 +0100
Subject: dm thin: performance improvement to discard processing

When processing a discard bio, if the block is already quiesced do the
discard immediately rather than adding the mapping to a list for the
next iteration of the worker thread.

Discarding a fully provisioned 100G thin volume with 64k block size goes
from 860s to 95s with this change.

Clearly there's something wrong with the worker architecture, more
investigation needed.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 068607828691..8c3d048dd319 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1194,7 +1194,6 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
 static void process_discard(struct thin_c *tc, struct bio *bio)
 {
 	int r;
-	unsigned long flags;
 	struct pool *pool = tc->pool;
 	struct dm_bio_prison_cell *cell, *cell2;
 	struct dm_cell_key key, key2;
@@ -1235,12 +1234,9 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
 			m->cell2 = cell2;
 			m->bio = bio;
 
-			if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
-				spin_lock_irqsave(&pool->lock, flags);
-				list_add_tail(&m->list, &pool->prepared_discards);
-				spin_unlock_irqrestore(&pool->lock, flags);
-				wake_worker(pool);
-			}
+			if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+				pool->process_prepared_discard(m);
+
 		} else {
 			inc_all_io_entry(pool, bio);
 			cell_defer_no_holder(tc, cell);
-- 
cgit v1.2.3


From 452d7a620dc38cb525c403aa4b445028da359268 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 9 Oct 2014 19:20:21 -0400
Subject: dm thin: factor out remap_and_issue_overwrite

Purely cleanup of duplicated code, no functional change.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 8c3d048dd319..52562710f6a0 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -890,6 +890,20 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
 	}
 }
 
+static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
+				      dm_block_t data_block,
+				      struct dm_thin_new_mapping *m)
+{
+	struct pool *pool = tc->pool;
+	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+
+	h->overwrite_mapping = m;
+	m->bio = bio;
+	save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
+	inc_all_io_entry(pool, bio);
+	remap_and_issue(tc, bio, data_block);
+}
+
 /*
  * A partial copy also needs to zero the uncopied region.
  */
@@ -924,15 +938,9 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 	 * If the whole block of data is being overwritten, we can issue the
 	 * bio immediately. Otherwise we use kcopyd to clone the data first.
 	 */
-	if (io_overwrites_block(pool, bio)) {
-		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
-
-		h->overwrite_mapping = m;
-		m->bio = bio;
-		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
-		inc_all_io_entry(pool, bio);
-		remap_and_issue(tc, bio, data_dest);
-	} else {
+	if (io_overwrites_block(pool, bio))
+		remap_and_issue_overwrite(tc, bio, data_dest, m);
+	else {
 		struct dm_io_region from, to;
 
 		from.bdev = origin->bdev;
@@ -1001,16 +1009,10 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 	if (!pool->pf.zero_new_blocks)
 		process_prepared_mapping(m);
 
-	else if (io_overwrites_block(pool, bio)) {
-		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
-
-		h->overwrite_mapping = m;
-		m->bio = bio;
-		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
-		inc_all_io_entry(pool, bio);
-		remap_and_issue(tc, bio, data_block);
+	else if (io_overwrites_block(pool, bio))
+		remap_and_issue_overwrite(tc, bio, data_block, m);
 
-	} else
+	else
 		ll_zero(tc, m,
 			data_block * pool->sectors_per_block,
 			(data_block + 1) * pool->sectors_per_block);
-- 
cgit v1.2.3


From a374bb217b449a00eb96d0584bb833a8b62b672a Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 10 Oct 2014 13:43:14 +0100
Subject: dm thin: defer whole cells rather than individual bios

This avoids dropping the cell, so increases the probability that other
bios will collect within the cell, rather than being passed individually
to the worker.

Also add required process_cell and process_discard_cell error handling
wrappers and set associated pool-mode function pointers accordingly.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-bio-prison.h |   1 +
 drivers/md/dm-thin.c       | 254 ++++++++++++++++++++++++++++++++++++---------
 2 files changed, 208 insertions(+), 47 deletions(-)

diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
index 997a43960e77..c0cddb118582 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
@@ -35,6 +35,7 @@ struct dm_cell_key {
  * themselves.
  */
 struct dm_bio_prison_cell {
+	struct list_head user_list;	/* for client use */
 	struct rb_node node;
 
 	struct dm_cell_key key;
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 52562710f6a0..912d7f4d89d1 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -202,6 +202,7 @@ struct pool_features {
 
 struct thin_c;
 typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
+typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell);
 typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
 
 struct pool {
@@ -246,6 +247,9 @@ struct pool {
 	process_bio_fn process_bio;
 	process_bio_fn process_discard;
 
+	process_cell_fn process_cell;
+	process_cell_fn process_discard_cell;
+
 	process_mapping_fn process_prepared_mapping;
 	process_mapping_fn process_prepared_discard;
 };
@@ -282,6 +286,7 @@ struct thin_c {
 	struct dm_thin_device *td;
 	bool requeue_mode:1;
 	spinlock_t lock;
+	struct list_head deferred_cells;
 	struct bio_list deferred_bio_list;
 	struct bio_list retry_on_resume_list;
 	struct rb_root sort_bio_list; /* sorted list of deferred bios */
@@ -346,19 +351,6 @@ static void cell_release_no_holder(struct pool *pool,
 	dm_bio_prison_free_cell(pool->prison, cell);
 }
 
-static void cell_defer_no_holder_no_free(struct thin_c *tc,
-					 struct dm_bio_prison_cell *cell)
-{
-	struct pool *pool = tc->pool;
-	unsigned long flags;
-
-	spin_lock_irqsave(&tc->lock, flags);
-	dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list);
-	spin_unlock_irqrestore(&tc->lock, flags);
-
-	wake_worker(pool);
-}
-
 static void cell_error_with_code(struct pool *pool,
 				 struct dm_bio_prison_cell *cell, int error_code)
 {
@@ -371,6 +363,16 @@ static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
 	cell_error_with_code(pool, cell, -EIO);
 }
 
+static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
+{
+	cell_error_with_code(pool, cell, 0);
+}
+
+static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
+{
+	cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE);
+}
+
 /*----------------------------------------------------------------*/
 
 /*
@@ -458,10 +460,28 @@ static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 		bio_endio(bio, DM_ENDIO_REQUEUE);
 }
 
+static void requeue_deferred_cells(struct thin_c *tc)
+{
+	struct pool *pool = tc->pool;
+	unsigned long flags;
+	struct list_head cells;
+	struct dm_bio_prison_cell *cell, *tmp;
+
+	INIT_LIST_HEAD(&cells);
+
+	spin_lock_irqsave(&tc->lock, flags);
+	list_splice_init(&tc->deferred_cells, &cells);
+	spin_unlock_irqrestore(&tc->lock, flags);
+
+	list_for_each_entry_safe(cell, tmp, &cells, user_list)
+		cell_requeue(pool, cell);
+}
+
 static void requeue_io(struct thin_c *tc)
 {
 	requeue_bio_list(tc, &tc->deferred_bio_list);
 	requeue_bio_list(tc, &tc->retry_on_resume_list);
+	requeue_deferred_cells(tc);
 }
 
 static void error_thin_retry_list(struct thin_c *tc)
@@ -706,6 +726,28 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c
 	wake_worker(pool);
 }
 
+static void thin_defer_bio(struct thin_c *tc, struct bio *bio);
+
+static void inc_remap_and_issue_cell(struct thin_c *tc,
+				     struct dm_bio_prison_cell *cell,
+				     dm_block_t block)
+{
+	struct bio *bio;
+	struct bio_list bios;
+
+	bio_list_init(&bios);
+	cell_release_no_holder(tc->pool, cell, &bios);
+
+	while ((bio = bio_list_pop(&bios))) {
+		if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA))
+			thin_defer_bio(tc, bio);
+		else {
+			inc_all_io_entry(tc->pool, bio);
+			remap_and_issue(tc, bio, block);
+		}
+	}
+}
+
 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 {
 	if (m->bio) {
@@ -1193,19 +1235,21 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
 			retry_on_resume(bio);
 }
 
-static void process_discard(struct thin_c *tc, struct bio *bio)
+static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
 {
 	int r;
+	struct bio *bio = cell->holder;
 	struct pool *pool = tc->pool;
-	struct dm_bio_prison_cell *cell, *cell2;
-	struct dm_cell_key key, key2;
+	struct dm_bio_prison_cell *cell2;
+	struct dm_cell_key key2;
 	dm_block_t block = get_bio_block(tc, bio);
 	struct dm_thin_lookup_result lookup_result;
 	struct dm_thin_new_mapping *m;
 
-	build_virtual_key(tc->td, block, &key);
-	if (bio_detain(tc->pool, &key, bio, &cell))
+	if (tc->requeue_mode) {
+		cell_requeue(pool, cell);
 		return;
+	}
 
 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
 	switch (r) {
@@ -1273,6 +1317,19 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
 	}
 }
 
+static void process_discard_bio(struct thin_c *tc, struct bio *bio)
+{
+	struct dm_bio_prison_cell *cell;
+	struct dm_cell_key key;
+	dm_block_t block = get_bio_block(tc, bio);
+
+	build_virtual_key(tc->td, block, &key);
+	if (bio_detain(tc->pool, &key, bio, &cell))
+		return;
+
+	process_discard_cell(tc, cell);
+}
+
 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
 			  struct dm_cell_key *key,
 			  struct dm_thin_lookup_result *lookup_result,
@@ -1379,34 +1436,30 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
 	}
 }
 
-static void process_bio(struct thin_c *tc, struct bio *bio)
+static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
 {
 	int r;
 	struct pool *pool = tc->pool;
+	struct bio *bio = cell->holder;
 	dm_block_t block = get_bio_block(tc, bio);
-	struct dm_bio_prison_cell *cell;
-	struct dm_cell_key key;
 	struct dm_thin_lookup_result lookup_result;
 
-	/*
-	 * If cell is already occupied, then the block is already
-	 * being provisioned so we have nothing further to do here.
-	 */
-	build_virtual_key(tc->td, block, &key);
-	if (bio_detain(pool, &key, bio, &cell))
+	if (tc->requeue_mode) {
+		cell_requeue(pool, cell);
 		return;
+	}
 
 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
 	switch (r) {
 	case 0:
 		if (lookup_result.shared) {
 			process_shared_bio(tc, bio, block, &lookup_result);
+			// FIXME: we can't remap because we're waiting on a commit
 			cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */
 		} else {
 			inc_all_io_entry(pool, bio);
-			cell_defer_no_holder(tc, cell);
-
 			remap_and_issue(tc, bio, lookup_result.block);
+			inc_remap_and_issue_cell(tc, cell, lookup_result.block);
 		}
 		break;
 
@@ -1440,7 +1493,26 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
 	}
 }
 
-static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
+static void process_bio(struct thin_c *tc, struct bio *bio)
+{
+	struct pool *pool = tc->pool;
+	dm_block_t block = get_bio_block(tc, bio);
+	struct dm_bio_prison_cell *cell;
+	struct dm_cell_key key;
+
+	/*
+	 * If cell is already occupied, then the block is already
+	 * being provisioned so we have nothing further to do here.
+	 */
+	build_virtual_key(tc->td, block, &key);
+	if (bio_detain(pool, &key, bio, &cell))
+		return;
+
+	process_cell(tc, cell);
+}
+
+static void __process_bio_read_only(struct thin_c *tc, struct bio *bio,
+				    struct dm_bio_prison_cell *cell)
 {
 	int r;
 	int rw = bio_data_dir(bio);
@@ -1450,15 +1522,21 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
 	switch (r) {
 	case 0:
-		if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size)
+		if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
 			handle_unserviceable_bio(tc->pool, bio);
-		else {
+			if (cell)
+				cell_defer_no_holder(tc, cell);
+		} else {
 			inc_all_io_entry(tc->pool, bio);
 			remap_and_issue(tc, bio, lookup_result.block);
+			if (cell)
+				inc_remap_and_issue_cell(tc, cell, lookup_result.block);
 		}
 		break;
 
 	case -ENODATA:
+		if (cell)
+			cell_defer_no_holder(tc, cell);
 		if (rw != READ) {
 			handle_unserviceable_bio(tc->pool, bio);
 			break;
@@ -1477,11 +1555,23 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
 	default:
 		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
 			    __func__, r);
+		if (cell)
+			cell_defer_no_holder(tc, cell);
 		bio_io_error(bio);
 		break;
 	}
 }
 
+static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
+{
+	__process_bio_read_only(tc, bio, NULL);
+}
+
+static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+{
+	__process_bio_read_only(tc, cell->holder, cell);
+}
+
 static void process_bio_success(struct thin_c *tc, struct bio *bio)
 {
 	bio_endio(bio, 0);
@@ -1492,6 +1582,16 @@ static void process_bio_fail(struct thin_c *tc, struct bio *bio)
 	bio_io_error(bio);
 }
 
+static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+{
+	cell_success(tc->pool, cell);
+}
+
+static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+{
+	cell_error(tc->pool, cell);
+}
+
 /*
  * FIXME: should we also commit due to size of transaction, measured in
  * metadata blocks?
@@ -1624,6 +1724,45 @@ static void process_thin_deferred_bios(struct thin_c *tc)
 	blk_finish_plug(&plug);
 }
 
+static void process_thin_deferred_cells(struct thin_c *tc)
+{
+	struct pool *pool = tc->pool;
+	unsigned long flags;
+	struct list_head cells;
+	struct dm_bio_prison_cell *cell, *tmp;
+
+	INIT_LIST_HEAD(&cells);
+
+	spin_lock_irqsave(&tc->lock, flags);
+	list_splice_init(&tc->deferred_cells, &cells);
+	spin_unlock_irqrestore(&tc->lock, flags);
+
+	if (list_empty(&cells))
+		return;
+
+	list_for_each_entry_safe(cell, tmp, &cells, user_list) {
+		BUG_ON(!cell->holder);
+
+		/*
+		 * If we've got no free new_mapping structs, and processing
+		 * this bio might require one, we pause until there are some
+		 * prepared mappings to process.
+		 */
+		if (ensure_next_mapping(pool)) {
+			spin_lock_irqsave(&tc->lock, flags);
+			list_add(&cell->user_list, &tc->deferred_cells);
+			list_splice(&cells, &tc->deferred_cells);
+			spin_unlock_irqrestore(&tc->lock, flags);
+			break;
+		}
+
+		if (cell->holder->bi_rw & REQ_DISCARD)
+			pool->process_discard_cell(tc, cell);
+		else
+			pool->process_cell(tc, cell);
+	}
+}
+
 static void thin_get(struct thin_c *tc);
 static void thin_put(struct thin_c *tc);
 
@@ -1672,6 +1811,7 @@ static void process_deferred_bios(struct pool *pool)
 
 	tc = get_first_thin(pool);
 	while (tc) {
+		process_thin_deferred_cells(tc);
 		process_thin_deferred_bios(tc);
 		tc = get_next_thin(pool, tc);
 	}
@@ -1850,6 +1990,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 		dm_pool_metadata_read_only(pool->pmd);
 		pool->process_bio = process_bio_fail;
 		pool->process_discard = process_bio_fail;
+		pool->process_cell = process_cell_fail;
+		pool->process_discard_cell = process_cell_fail;
 		pool->process_prepared_mapping = process_prepared_mapping_fail;
 		pool->process_prepared_discard = process_prepared_discard_fail;
 
@@ -1862,6 +2004,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 		dm_pool_metadata_read_only(pool->pmd);
 		pool->process_bio = process_bio_read_only;
 		pool->process_discard = process_bio_success;
+		pool->process_cell = process_cell_read_only;
+		pool->process_discard_cell = process_cell_success;
 		pool->process_prepared_mapping = process_prepared_mapping_fail;
 		pool->process_prepared_discard = process_prepared_discard_passdown;
 
@@ -1880,7 +2024,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 		if (old_mode != new_mode)
 			notify_of_pool_mode_change(pool, "out-of-data-space");
 		pool->process_bio = process_bio_read_only;
-		pool->process_discard = process_discard;
+		pool->process_discard = process_discard_bio;
+		pool->process_cell = process_cell_read_only;
+		pool->process_discard_cell = process_discard_cell;
 		pool->process_prepared_mapping = process_prepared_mapping;
 		pool->process_prepared_discard = process_prepared_discard_passdown;
 
@@ -1893,7 +2039,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 			notify_of_pool_mode_change(pool, "write");
 		dm_pool_metadata_read_write(pool->pmd);
 		pool->process_bio = process_bio;
-		pool->process_discard = process_discard;
+		pool->process_discard = process_discard_bio;
+		pool->process_cell = process_cell;
+		pool->process_discard_cell = process_discard_cell;
 		pool->process_prepared_mapping = process_prepared_mapping;
 		pool->process_prepared_discard = process_prepared_discard;
 		break;
@@ -1962,6 +2110,20 @@ static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
 	throttle_unlock(&pool->throttle);
 }
 
+static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+{
+	unsigned long flags;
+	struct pool *pool = tc->pool;
+
+	throttle_lock(&pool->throttle);
+	spin_lock_irqsave(&tc->lock, flags);
+	list_add_tail(&cell->user_list, &tc->deferred_cells);
+	spin_unlock_irqrestore(&tc->lock, flags);
+	throttle_unlock(&pool->throttle);
+
+	wake_worker(pool);
+}
+
 static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
 {
 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@ -1982,8 +2144,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
 	dm_block_t block = get_bio_block(tc, bio);
 	struct dm_thin_device *td = tc->td;
 	struct dm_thin_lookup_result result;
-	struct dm_bio_prison_cell cell1, cell2;
-	struct dm_bio_prison_cell *cell_result;
+	struct dm_bio_prison_cell *virt_cell, *data_cell;
 	struct dm_cell_key key;
 
 	thin_hook_bio(tc, bio);
@@ -2008,7 +2169,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
 	 * there's a race with discard.
 	 */
 	build_virtual_key(tc->td, block, &key);
-	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result))
+	if (bio_detain(tc->pool, &key, bio, &virt_cell))
 		return DM_MAPIO_SUBMITTED;
 
 	r = dm_thin_find_block(td, block, 0, &result);
@@ -2033,20 +2194,19 @@ static