From 6076905b5ef39e0ea58db32583c9e0036c05e47b Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:51:52 +0000
Subject: dm: avoid _hash_lock deadlock

Fix a reported deadlock if there are still unprocessed multipath events
on a device that is being removed.

_hash_lock is held during dev_remove while trying to send the
outstanding events.  Sending the events requests the _hash_lock
again in dm_copy_name_and_uuid.

This patch introduces a separate lock around regions that modify the
link to the hash table (dm_set_mdptr) or the name or uuid so that
dm_copy_name_and_uuid no longer needs _hash_lock.

Additionally, dm_copy_name_and_uuid can only be called if md exists
so we can drop the dm_get() and dm_put() which can lead to a BUG()
while md is being freed.

The deadlock:
 #0 [ffff8106298dfb48] schedule at ffffffff80063035
 #1 [ffff8106298dfc20] __down_read at ffffffff8006475d
 #2 [ffff8106298dfc60] dm_copy_name_and_uuid at ffffffff8824f740
 #3 [ffff8106298dfc90] dm_send_uevents at ffffffff88252685
 #4 [ffff8106298dfcd0] event_callback at ffffffff8824c678
 #5 [ffff8106298dfd00] dm_table_event at ffffffff8824dd01
 #6 [ffff8106298dfd10] __hash_remove at ffffffff882507ad
 #7 [ffff8106298dfd30] dev_remove at ffffffff88250865
 #8 [ffff8106298dfd60] ctl_ioctl at ffffffff88250d80
 #9 [ffff8106298dfee0] do_ioctl at ffffffff800418c4
#10 [ffff8106298dff00] vfs_ioctl at ffffffff8002fab9
#11 [ffff8106298dff40] sys_ioctl at ffffffff8004bdaf
#12 [ffff8106298dff80] tracesys at ffffffff8005d28d (via system_call)

Cc: stable@kernel.org
Reported-by: guy keren <choo@actcom.co.il>
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c  | 17 +++++++++++++----
 drivers/md/dm-uevent.c |  9 ++++-----
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index a67942931582..d19854c98184 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -56,6 +56,11 @@ static void dm_hash_remove_all(int keep_open_devices);
  */
 static DECLARE_RWSEM(_hash_lock);
 
+/*
+ * Protects use of mdptr to obtain hash cell name and uuid from mapped device.
+ */
+static DEFINE_MUTEX(dm_hash_cells_mutex);
+
 static void init_buckets(struct list_head *buckets)
 {
 	unsigned int i;
@@ -206,7 +211,9 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi
 		list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
 	}
 	dm_get(md);
+	mutex_lock(&dm_hash_cells_mutex);
 	dm_set_mdptr(md, cell);
+	mutex_unlock(&dm_hash_cells_mutex);
 	up_write(&_hash_lock);
 
 	return 0;
@@ -224,7 +231,9 @@ static void __hash_remove(struct hash_cell *hc)
 	/* remove from the dev hash */
 	list_del(&hc->uuid_list);
 	list_del(&hc->name_list);
+	mutex_lock(&dm_hash_cells_mutex);
 	dm_set_mdptr(hc->md, NULL);
+	mutex_unlock(&dm_hash_cells_mutex);
 
 	table = dm_get_table(hc->md);
 	if (table) {
@@ -321,7 +330,9 @@ static int dm_hash_rename(uint32_t cookie, const char *old, const char *new)
 	 */
 	list_del(&hc->name_list);
 	old_name = hc->name;
+	mutex_lock(&dm_hash_cells_mutex);
 	hc->name = new_name;
+	mutex_unlock(&dm_hash_cells_mutex);
 	list_add(&hc->name_list, _name_buckets + hash_str(new_name));
 
 	/*
@@ -1582,8 +1593,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
 	if (!md)
 		return -ENXIO;
 
-	dm_get(md);
-	down_read(&_hash_lock);
+	mutex_lock(&dm_hash_cells_mutex);
 	hc = dm_get_mdptr(md);
 	if (!hc || hc->md != md) {
 		r = -ENXIO;
@@ -1596,8 +1606,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
 		strcpy(uuid, hc->uuid ? : "");
 
 out:
-	up_read(&_hash_lock);
-	dm_put(md);
+	mutex_unlock(&dm_hash_cells_mutex);
 
 	return r;
 }
diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c
index 6f65883aef12..c7c555a8c7b2 100644
--- a/drivers/md/dm-uevent.c
+++ b/drivers/md/dm-uevent.c
@@ -139,14 +139,13 @@ void dm_send_uevents(struct list_head *events, struct kobject *kobj)
 		list_del_init(&event->elist);
 
 		/*
-		 * Need to call dm_copy_name_and_uuid from here for now.
-		 * Context of previous var adds and locking used for
-		 * hash_cell not compatable.
+		 * When a device is being removed this copy fails and we
+		 * discard these unsent events.
 		 */
 		if (dm_copy_name_and_uuid(event->md, event->name,
 					  event->uuid)) {
-			DMERR("%s: dm_copy_name_and_uuid() failed",
-			      __func__);
+			DMINFO("%s: skipping sending uevent for lost device",
+			       __func__);
 			goto uevent_free;
 		}
 
-- 
cgit v1.2.3


From 613978f8711c7fd4d0aa856872375d2abd7c92ff Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Thu, 10 Dec 2009 23:51:52 +0000
Subject: dm exception store: free tmp_store on persistent flag error

Error handling code following a kmalloc should free the allocated data.

Cc: stable@kernel.org
Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-exception-store.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 7dbe652efb5a..205215968ff1 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -216,7 +216,8 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
 		type = get_type("N");
 	else {
 		ti->error = "Persistent flag is not P or N";
-		return -EINVAL;
+		r = -EINVAL;
+		goto bad_type;
 	}
 
 	if (!type) {
-- 
cgit v1.2.3


From d2bb7df8cac647b92f51fb84ae735771e7adbfa7 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 10 Dec 2009 23:51:53 +0000
Subject: dm: sysfs add empty release function to avoid debug warning

This patch just removes an unnecessary warning:
 kobject: 'dm': does not have a release() function,
 it is broken and must be fixed.

The kobject is embedded in mapped device struct, so
code does not need to release memory explicitly here.

Cc: stable@kernel.org
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-sysfs.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index 4b045903a4e2..b000de38c99a 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -79,6 +79,13 @@ static struct sysfs_ops dm_sysfs_ops = {
 	.show	= dm_attr_show,
 };
 
+/*
+ * The sysfs structure is embedded in md struct, nothing to do here
+ */
+static void dm_sysfs_release(struct kobject *kobj)
+{
+}
+
 /*
  * dm kobject is embedded in mapped_device structure
  * no need to define release function here
@@ -86,6 +93,7 @@ static struct sysfs_ops dm_sysfs_ops = {
 static struct kobj_type dm_ktype = {
 	.sysfs_ops	= &dm_sysfs_ops,
 	.default_attrs	= dm_attrs,
+	.release	= dm_sysfs_release
 };
 
 /*
-- 
cgit v1.2.3


From 94e76572b5dd37b1f0f4b3742ee8a565daead932 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:51:53 +0000
Subject: dm snapshot: only take lock for statustype info not table

Take snapshot lock only for STATUSTYPE_INFO, not STATUSTYPE_TABLE.

Commit 4c6fff445d7aa753957856278d4d93bcad6e2c14
(dm-snapshot-lock-snapshot-while-supplying-status.patch)
introduced this use of the lock, but userspace applications using
libdevmapper have been found to request STATUSTYPE_TABLE while the device
is suspended and the lock is already held, leading to deadlock.  Since
the lock is not necessary in this case, don't try to take it.

Cc: stable@kernel.org
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 3a3ba46e6d4b..d135212958f1 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1152,10 +1152,11 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
 	unsigned sz = 0;
 	struct dm_snapshot *snap = ti->private;
 
-	down_write(&snap->lock);
-
 	switch (type) {
 	case STATUSTYPE_INFO:
+
+		down_write(&snap->lock);
+
 		if (!snap->valid)
 			DMEMIT("Invalid");
 		else {
@@ -1171,6 +1172,9 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
 			else
 				DMEMIT("Unknown");
 		}
+
+		up_write(&snap->lock);
+
 		break;
 
 	case STATUSTYPE_TABLE:
@@ -1185,8 +1189,6 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
 		break;
 	}
 
-	up_write(&snap->lock);
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From 8e87b9b81b3c370f7e53c1ab6e1c3519ef37a644 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:51:54 +0000
Subject: dm snapshot: cope with chunk size larger than origin

Under some special conditions the snapshot hash_size is calculated as zero.
This patch instead sets a minimum value of 64, the same as for the
pending exception table.

rounddown_pow_of_two(0) is an undefined operation (it expands to shift
by -1).  init_exception_table with an argument of 0 would fail with -ENOMEM.

The way to trigger the problem is to create a snapshot with a chunk size
that is larger than the origin device.

Cc: stable@kernel.org
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index d135212958f1..8a4a9c838afd 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -553,6 +553,8 @@ static int init_hash_tables(struct dm_snapshot *s)
 	hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift;
 	hash_size = min(hash_size, max_buckets);
 
+	if (hash_size < 64)
+		hash_size = 64;
 	hash_size = rounddown_pow_of_two(hash_size);
 	if (init_exception_table(&s->complete, hash_size,
 				 DM_CHUNK_CONSECUTIVE_BITS))
-- 
cgit v1.2.3


From 0b4309581b5be8749afdd5a9087fd82a2a5c9932 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 10 Dec 2009 23:51:55 +0000
Subject: dm crypt: make wipe message also wipe tfm key

The "wipe key" message is used to wipe a volume key from memory
temporarily, for example when suspending to RAM.

There are two instances of the key in memory (inside crypto tfm)
but only one got wiped.  This patch wipes them both.

Cc: stable@kernel.org
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index e412980763bd..f2c139305e13 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -934,14 +934,14 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
 
 	set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
 
-	return 0;
+	return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
 }
 
 static int crypt_wipe_key(struct crypt_config *cc)
 {
 	clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
 	memset(&cc->key, 0, cc->key_size * sizeof(u8));
-	return 0;
+	return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
 }
 
 /*
@@ -983,11 +983,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		return -ENOMEM;
 	}
 
- 	if (crypt_set_key(cc, argv[1])) {
-		ti->error = "Error decoding key";
-		goto bad_cipher;
-	}
-
 	/* Compatibility mode for old dm-crypt cipher strings */
 	if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) {
 		chainmode = "cbc";
@@ -1015,6 +1010,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	strcpy(cc->chainmode, chainmode);
 	cc->tfm = tfm;
 
+	if (crypt_set_key(cc, argv[1]) < 0) {
+		ti->error = "Error decoding and setting key";
+		goto bad_ivmode;
+	}
+
 	/*
 	 * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi".
 	 * See comments at iv code
@@ -1085,11 +1085,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_bs;
 	}
 
-	if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) {
-		ti->error = "Error setting key";
-		goto bad_device;
-	}
-
 	if (sscanf(argv[2], "%llu", &tmpll) != 1) {
 		ti->error = "Invalid iv_offset sector";
 		goto bad_device;
-- 
cgit v1.2.3


From 6047359277517c4e56d8bfd6ea4966d7a3924151 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 10 Dec 2009 23:51:55 +0000
Subject: dm crypt: move private iv fields to structs

Define private structures for IV so it's easy to add further attributes
in a following patch which fixes the way key material is wiped from
memory.  Also move ESSIV destructor and remove unnecessary 'status'
operation.

There are no functional changes in this patch.

Cc: stable@kernel.org
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index f2c139305e13..bec5ac54e23e 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -71,10 +71,17 @@ struct crypt_iv_operations {
 	int (*ctr)(struct crypt_config *cc, struct dm_target *ti,
 		   const char *opts);
 	void (*dtr)(struct crypt_config *cc);
-	const char *(*status)(struct crypt_config *cc);
 	int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
 };
 
+struct iv_essiv_private {
+	struct crypto_cipher *tfm;
+};
+
+struct iv_benbi_private {
+	int shift;
+};
+
 /*
  * Crypt: maps a linear range of a block device
  * and encrypts / decrypts at the same time.
@@ -102,8 +109,8 @@ struct crypt_config {
 	struct crypt_iv_operations *iv_gen_ops;
 	char *iv_mode;
 	union {
-		struct crypto_cipher *essiv_tfm;
-		int benbi_shift;
+		struct iv_essiv_private essiv;
+		struct iv_benbi_private benbi;
 	} iv_gen_private;
 	sector_t iv_offset;
 	unsigned int iv_size;
@@ -169,6 +176,14 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 	return 0;
 }
 
+static void crypt_iv_essiv_dtr(struct crypt_config *cc)
+{
+	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
+
+	crypto_free_cipher(essiv->tfm);
+	essiv->tfm = NULL;
+}
+
 static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 			      const char *opts)
 {
@@ -236,21 +251,15 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 	}
 	kfree(salt);
 
-	cc->iv_gen_private.essiv_tfm = essiv_tfm;
+	cc->iv_gen_private.essiv.tfm = essiv_tfm;
 	return 0;
 }
 
-static void crypt_iv_essiv_dtr(struct crypt_config *cc)
-{
-	crypto_free_cipher(cc->iv_gen_private.essiv_tfm);
-	cc->iv_gen_private.essiv_tfm = NULL;
-}
-
 static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 {
 	memset(iv, 0, cc->iv_size);
 	*(u64 *)iv = cpu_to_le64(sector);
-	crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv);
+	crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv);
 	return 0;
 }
 
@@ -273,7 +282,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
 		return -EINVAL;
 	}
 
-	cc->iv_gen_private.benbi_shift = 9 - log;
+	cc->iv_gen_private.benbi.shift = 9 - log;
 
 	return 0;
 }
@@ -288,7 +297,7 @@ static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 
 	memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
 
-	val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1);
+	val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1);
 	put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
 
 	return 0;
-- 
cgit v1.2.3


From 5861f1be00b3b70f8ab5e5a81392a6cf69666cd2 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 10 Dec 2009 23:51:56 +0000
Subject: dm crypt: restructure essiv error path

Use kzfree for salt deallocation because it is derived from the volume
key.  Use a common error path in ESSIV constructor.

Required by a later patch which fixes the way key material is wiped
from memory.

Cc: stable@kernel.org
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 46 ++++++++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index bec5ac54e23e..2301d223f2ae 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -187,15 +187,15 @@ static void crypt_iv_essiv_dtr(struct crypt_config *cc)
 static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 			      const char *opts)
 {
-	struct crypto_cipher *essiv_tfm;
-	struct crypto_hash *hash_tfm;
+	struct crypto_cipher *essiv_tfm = NULL;
+	struct crypto_hash *hash_tfm = NULL;
 	struct hash_desc desc;
 	struct scatterlist sg;
 	unsigned int saltsize;
-	u8 *salt;
+	u8 *salt = NULL;
 	int err;
 
-	if (opts == NULL) {
+	if (!opts) {
 		ti->error = "Digest algorithm missing for ESSIV mode";
 		return -EINVAL;
 	}
@@ -204,15 +204,16 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 	hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(hash_tfm)) {
 		ti->error = "Error initializing ESSIV hash";
-		return PTR_ERR(hash_tfm);
+		err = PTR_ERR(hash_tfm);
+		goto bad;
 	}
 
 	saltsize = crypto_hash_digestsize(hash_tfm);
-	salt = kmalloc(saltsize, GFP_KERNEL);
-	if (salt == NULL) {
+	salt = kzalloc(saltsize, GFP_KERNEL);
+	if (!salt) {
 		ti->error = "Error kmallocing salt storage in ESSIV";
-		crypto_free_hash(hash_tfm);
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto bad;
 	}
 
 	sg_init_one(&sg, cc->key, cc->key_size);
@@ -220,39 +221,44 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
 	err = crypto_hash_digest(&desc, &sg, cc->key_size, salt);
 	crypto_free_hash(hash_tfm);
+	hash_tfm = NULL;
 
 	if (err) {
 		ti->error = "Error calculating hash in ESSIV";
-		kfree(salt);
-		return err;
+		goto bad;
 	}
 
 	/* Setup the essiv_tfm with the given salt */
 	essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(essiv_tfm)) {
 		ti->error = "Error allocating crypto tfm for ESSIV";
-		kfree(salt);
-		return PTR_ERR(essiv_tfm);
+		err = PTR_ERR(essiv_tfm);
+		goto bad;
 	}
 	if (crypto_cipher_blocksize(essiv_tfm) !=
 	    crypto_ablkcipher_ivsize(cc->tfm)) {
 		ti->error = "Block size of ESSIV cipher does "
 			    "not match IV size of block cipher";
-		crypto_free_cipher(essiv_tfm);
-		kfree(salt);
-		return -EINVAL;
+		err = -EINVAL;
+		goto bad;
 	}
 	err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
 	if (err) {
 		ti->error = "Failed to set key for ESSIV cipher";
-		crypto_free_cipher(essiv_tfm);
-		kfree(salt);
-		return err;
+		goto bad;
 	}
-	kfree(salt);
+	kzfree(salt);
 
 	cc->iv_gen_private.essiv.tfm = essiv_tfm;
 	return 0;
+
+bad:
+	if (essiv_tfm && !IS_ERR(essiv_tfm))
+		crypto_free_cipher(essiv_tfm);
+	if (hash_tfm && !IS_ERR(hash_tfm))
+		crypto_free_hash(hash_tfm);
+	kzfree(salt);
+	return err;
 }
 
 static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
-- 
cgit v1.2.3


From b95bf2d3d5a48b095bffe2a0cd8c40453cf59557 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 10 Dec 2009 23:51:56 +0000
Subject: dm crypt: separate essiv allocation from initialisation

This patch separates the construction of IV from its initialisation.
(For ESSIV it is a hash calculation based on volume key.)

Constructor code now preallocates hash tfm and salt array
and saves it in a private IV structure.

The next patch requires this to reinitialise the wiped IV
without reallocating memory when resuming a suspended device.

Cc: stable@kernel.org
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 69 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 26 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 2301d223f2ae..446153a071d6 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -71,11 +71,14 @@ struct crypt_iv_operations {
 	int (*ctr)(struct crypt_config *cc, struct dm_target *ti,
 		   const char *opts);
 	void (*dtr)(struct crypt_config *cc);
+	int (*init)(struct crypt_config *cc);
 	int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
 };
 
 struct iv_essiv_private {
 	struct crypto_cipher *tfm;
+	struct crypto_hash *hash_tfm;
+	u8 *salt;
 };
 
 struct iv_benbi_private {
@@ -176,12 +179,38 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 	return 0;
 }
 
+/* Initialise ESSIV - compute salt but no local memory allocations */
+static int crypt_iv_essiv_init(struct crypt_config *cc)
+{
+	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
+	struct hash_desc desc;
+	struct scatterlist sg;
+	int err;
+
+	sg_init_one(&sg, cc->key, cc->key_size);
+	desc.tfm = essiv->hash_tfm;
+	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt);
+	if (err)
+		return err;
+
+	return crypto_cipher_setkey(essiv->tfm, essiv->salt,
+				    crypto_hash_digestsize(essiv->hash_tfm));
+}
+
 static void crypt_iv_essiv_dtr(struct crypt_config *cc)
 {
 	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
 
 	crypto_free_cipher(essiv->tfm);
 	essiv->tfm = NULL;
+
+	crypto_free_hash(essiv->hash_tfm);
+	essiv->hash_tfm = NULL;
+
+	kzfree(essiv->salt);
+	essiv->salt = NULL;
 }
 
 static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -189,9 +218,6 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 {
 	struct crypto_cipher *essiv_tfm = NULL;
 	struct crypto_hash *hash_tfm = NULL;
-	struct hash_desc desc;
-	struct scatterlist sg;
-	unsigned int saltsize;
 	u8 *salt = NULL;
 	int err;
 
@@ -200,7 +226,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 		return -EINVAL;
 	}
 
-	/* Hash the cipher key with the given hash algorithm */
+	/* Allocate hash algorithm */
 	hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(hash_tfm)) {
 		ti->error = "Error initializing ESSIV hash";
@@ -208,27 +234,14 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 		goto bad;
 	}
 
-	saltsize = crypto_hash_digestsize(hash_tfm);
-	salt = kzalloc(saltsize, GFP_KERNEL);
+	salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL);
 	if (!salt) {
 		ti->error = "Error kmallocing salt storage in ESSIV";
 		err = -ENOMEM;
 		goto bad;
 	}
 
-	sg_init_one(&sg, cc->key, cc->key_size);
-	desc.tfm = hash_tfm;
-	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
-	err = crypto_hash_digest(&desc, &sg, cc->key_size, salt);
-	crypto_free_hash(hash_tfm);
-	hash_tfm = NULL;
-
-	if (err) {
-		ti->error = "Error calculating hash in ESSIV";
-		goto bad;
-	}
-
-	/* Setup the essiv_tfm with the given salt */
+	/* Allocate essiv_tfm */
 	essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(essiv_tfm)) {
 		ti->error = "Error allocating crypto tfm for ESSIV";
@@ -242,14 +255,11 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 		err = -EINVAL;
 		goto bad;
 	}
-	err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
-	if (err) {
-		ti->error = "Failed to set key for ESSIV cipher";
-		goto bad;
-	}
-	kzfree(salt);
 
+	cc->iv_gen_private.essiv.salt = salt;
 	cc->iv_gen_private.essiv.tfm = essiv_tfm;
+	cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
+
 	return 0;
 
 bad:
@@ -257,7 +267,7 @@ bad:
 		crypto_free_cipher(essiv_tfm);
 	if (hash_tfm && !IS_ERR(hash_tfm))
 		crypto_free_hash(hash_tfm);
-	kzfree(salt);
+	kfree(salt);
 	return err;
 }
 
@@ -323,6 +333,7 @@ static struct crypt_iv_operations crypt_iv_plain_ops = {
 static struct crypt_iv_operations crypt_iv_essiv_ops = {
 	.ctr       = crypt_iv_essiv_ctr,
 	.dtr       = crypt_iv_essiv_dtr,
+	.init      = crypt_iv_essiv_init,
 	.generator = crypt_iv_essiv_gen
 };
 
@@ -1054,6 +1065,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	    cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
 		goto bad_ivmode;
 
+	if (cc->iv_gen_ops && cc->iv_gen_ops->init &&
+	    cc->iv_gen_ops->init(cc) < 0) {
+		ti->error = "Error initialising IV";
+		goto bad_slab_pool;
+	}
+
 	cc->iv_size = crypto_ablkcipher_ivsize(tfm);
 	if (cc->iv_size)
 		/* at least a 64 bit sector number should fit in our buffer */
-- 
cgit v1.2.3


From 542da317668c35036e8471822a564b609d05af66 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 10 Dec 2009 23:51:57 +0000
Subject: dm crypt: make wipe message also wipe essiv key

The "wipe key" message is used to wipe the volume key from memory
temporarily, for example when suspending to RAM.

But the initialisation vector in ESSIV mode is calculated from the
hashed volume key, so the wipe message should wipe this IV key too and
reinitialise it when the volume key is reinstated.

This patch adds an IV wipe method called from a wipe message callback.
ESSIV is then reinitialised using the init function added by the
last patch.

Cc: stable@kernel.org
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 446153a071d6..91e1bf91769f 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
  * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
- * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
@@ -72,6 +72,7 @@ struct crypt_iv_operations {
 		   const char *opts);
 	void (*dtr)(struct crypt_config *cc);
 	int (*init)(struct crypt_config *cc);
+	int (*wipe)(struct crypt_config *cc);
 	int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
 };
 
@@ -199,6 +200,17 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
 				    crypto_hash_digestsize(essiv->hash_tfm));
 }
 
+/* Wipe salt and reset key derived from volume key */
+static int crypt_iv_essiv_wipe(struct crypt_config *cc)
+{
+	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
+	unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
+
+	memset(essiv->salt, 0, salt_size);
+
+	return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size);
+}
+
 static void crypt_iv_essiv_dtr(struct crypt_config *cc)
 {
 	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
@@ -334,6 +346,7 @@ static struct crypt_iv_operations crypt_iv_essiv_ops = {
 	.ctr       = crypt_iv_essiv_ctr,
 	.dtr       = crypt_iv_essiv_dtr,
 	.init      = crypt_iv_essiv_init,
+	.wipe      = crypt_iv_essiv_wipe,
 	.generator = crypt_iv_essiv_gen
 };
 
@@ -1305,6 +1318,7 @@ static void crypt_resume(struct dm_target *ti)
 static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
 {
 	struct crypt_config *cc = ti->private;
+	int ret = -EINVAL;
 
 	if (argc < 2)
 		goto error;
@@ -1314,10 +1328,22 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
 			DMWARN("not suspended during key manipulation.");
 			return -EINVAL;
 		}
-		if (argc == 3 && !strnicmp(argv[1], MESG_STR("set")))
-			return crypt_set_key(cc, argv[2]);
-		if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe")))
+		if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) {
+			ret = crypt_set_key(cc, argv[2]);
+			if (ret)
+				return ret;
+			if (cc->iv_gen_ops && cc->iv_gen_ops->init)
+				ret = cc->iv_gen_ops->init(cc);
+			return ret;
+		}
+		if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) {
+			if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
+				ret = cc->iv_gen_ops->wipe(cc);
+				if (ret)
+					return ret;
+			}
 			return crypt_wipe_key(cc);
+		}
 	}
 
 error:
-- 
cgit v1.2.3


From 952b355760c196ec014dd0b6878f85a11496e3da Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:51:57 +0000
Subject: dm io: use slab for struct io

Allocate "struct io" from a slab.

This patch changes dm-io, so that "struct io" is allocated from a slab cache.
It used to be allocated with kmalloc. Allocating from a slab will be needed
for the next patch, because it requires a special alignment of "struct io"
and kmalloc cannot meet this alignment.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-io.c | 21 ++++++++++++++++++++-
 drivers/md/dm.c    |  2 ++
 drivers/md/dm.h    |  3 +++
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 3a2e6a2f8bdd..b0d264e684fd 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -5,6 +5,8 @@
  * This file is released under the GPL.
  */
 
+#include "dm.h"
+
 #include <linux/device-mapper.h>
 
 #include <linux/bio.h>
@@ -30,6 +32,8 @@ struct io {
 	void *context;
 };
 
+static struct kmem_cache *_dm_io_cache;
+
 /*
  * io contexts are only dynamically allocated for asynchronous
  * io.  Since async io is likely to be the majority of io we'll
@@ -53,7 +57,7 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages)
 	if (!client)
 		return ERR_PTR(-ENOMEM);
 
-	client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io));
+	client->pool = mempool_create_slab_pool(ios, _dm_io_cache);
 	if (!client->pool)
 		goto bad;
 
@@ -472,3 +476,18 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions,
 			&dp, io_req->notify.fn, io_req->notify.context);
 }
 EXPORT_SYMBOL(dm_io);
+
+int __init dm_io_init(void)
+{
+	_dm_io_cache = KMEM_CACHE(io, 0);
+	if (!_dm_io_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void dm_io_exit(void)
+{
+	kmem_cache_destroy(_dm_io_cache);
+	_dm_io_cache = NULL;
+}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 724efc63904d..473f0c3c0192 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -275,6 +275,7 @@ static int (*_inits[])(void) __initdata = {
 	dm_target_init,
 	dm_linear_init,
 	dm_stripe_init,
+	dm_io_init,
 	dm_kcopyd_init,
 	dm_interface_init,
 };
@@ -284,6 +285,7 @@ static void (*_exits[])(void) = {
 	dm_target_exit,
 	dm_linear_exit,
 	dm_stripe_exit,
+	dm_io_exit,
 	dm_kcopyd_exit,
 	dm_interface_exit,
 };
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index a7663eba17e2..4a95e8fa3607 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -118,6 +118,9 @@ int dm_lock_for_deletion(struct mapped_device *md);
 void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
 		       unsigned cookie);
 
+int dm_io_init(void);
+void dm_io_exit(void);
+
 int dm_kcopyd_init(void);
 void dm_kcopyd_exit(void);
 
-- 
cgit v1.2.3


From f1e539874655ae9e74c1644fd54133b19f1b14e2 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:51:58 +0000
Subject: dm io: remove extra bi_io_vec region hack

Remove the hack where we allocate an extra bi_io_vec to store additional
private data.  This hack prevents us from supporting barriers in
dm-raid1 without first making another little block layer change.
Instead of doing that, this patch eliminates the bi_io_vec abuse by
storing the region number directly in the low bits of bi_private.

We need to store two things for each bio, the pointer to the main io
structure and, if parallel writes were requested, an index indicating
which of these writes this bio belongs to.  There can be at most
BITS_PER_LONG regions - 32 or 64.

The index (region number) was stored in the last (hidden) bio vector and
the pointer to struct io was stored in bi_private.

This patch now aligns "struct io" on BITS_PER_LONG bytes and stores the
region number in the low BITS_PER_LONG bits of bi_private.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-io.c | 89 +++++++++++++++++++++++++++++++++---------------------
 1 file changed, 55 insertions(+), 34 deletions(-)

diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index b0d264e684fd..f6a714c5aab0 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -16,12 +16,19 @@
 #include <linux/slab.h>
 #include <linux/dm-io.h>
 
+#define DM_MSG_PREFIX "io"
+
+#define DM_IO_MAX_REGIONS	BITS_PER_LONG
+
 struct dm_io_client {
 	mempool_t *pool;
 	struct bio_set *bios;
 };
 
-/* FIXME: can we shrink this ? */
+/*
+ * Aligning 'struct io' reduces the number of bits required to store
+ * its address.  Refer to store_io_and_region_in_bio() below.
+ */
 struct io {
 	unsigned long error_bits;
 	unsigned long eopnotsupp_bits;
@@ -30,7 +37,7 @@ struct io {
 	struct dm_io_client *client;
 	io_notify_fn callback;
 	void *context;
-};
+} __attribute__((aligned(DM_IO_MAX_REGIONS)));
 
 static struct kmem_cache *_dm_io_cache;
 
@@ -92,18 +99,29 @@ EXPORT_SYMBOL(dm_io_client_destroy);
 
 /*-----------------------------------------------------------------
  * We need to keep track of which region a bio is doing io for.
- * In order to save a memory allocation we store this the last
- * bvec which we know is unused (blech).
- * XXX This is ugly and can OOPS with some configs... find another way.
+ * To avoid a memory allocation to store just 5 or 6 bits, we
+ * ensure the 'struct io' pointer is aligned so enough low bits are
+ * always zero and then combine it with the region number directly in
+ * bi_private.
  *---------------------------------------------------------------*/
-static inline void bio_set_region(struct bio *bio, unsigned region)
+static void store_io_and_region_in_bio(struct bio *bio, struct io *io,
+				       unsigned region)
 {
-	bio->bi_io_vec[bio->bi_max_vecs].bv_len = region;
+	if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) {
+		DMCRIT("Unaligned struct io pointer %p", io);
+		BUG();
+	}
+
+	bio->bi_private = (void *)((unsigned long)io | region);
 }
 
-static inline unsigned bio_get_region(struct bio *bio)
+static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
+				       unsigned *region)
 {
-	return bio->bi_io_vec[bio->bi_max_vecs].bv_len;
+	unsigned long val = (unsigned long)bio->bi_private;
+
+	*io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS);
+	*region = val & (DM_IO_MAX_REGIONS - 1);
 }
 
 /*-----------------------------------------------------------------
@@ -144,10 +162,8 @@ static void endio(struct bio *bio, int error)
 	/*
 	 * The bio destructor in bio_put() may use the io object.
 	 */
-	io = bio->bi_private;
-	region = bio_get_region(bio);
+	retrieve_io_and_region_from_bio(bio, &io, &region);
 
-	bio->bi_max_vecs++;
 	bio_put(bio);
 
 	dec_count(io, region, error);
@@ -247,7 +263,10 @@ static void vm_dp_init(struct dpages *dp, void *data)
 
 static void dm_bio_destructor(struct bio *bio)
 {
-	struct io *io = bio->bi_private;
+	unsigned region;
+	struct io *io;
+
+	retrieve_io_and_region_from_bio(bio, &io, &region);
 
 	bio_free(bio, io->client->bios);
 }
@@ -292,24 +311,17 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
 
 	while (remaining) {
 		/*
-		 * Allocate a suitably sized-bio: we add an extra
-		 * bvec for bio_get/set_region() and decrement bi_max_vecs
-		 * to hide it from bio_add_page().
+		 * Allocate a suitably sized-bio.
 		 */
 		num_bvecs = dm_sector_div_up(remaining,
 					     (PAGE_SIZE >> SECTOR_SHIFT));
-		num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev),
-				      num_bvecs);
-		if (unlikely(num_bvecs > BIO_MAX_PAGES))
-			num_bvecs = BIO_MAX_PAGES;
+		num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs);
 		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
 		bio->bi_sector = where->sector + (where->count - remaining);
 		bio->bi_bdev = where->bdev;
 		bio->bi_end_io = endio;
-		bio->bi_private = io;
 		bio->bi_destructor = dm_bio_destructor;
-		bio->bi_max_vecs--;
-		bio_set_region(bio, region);
+		store_io_and_region_in_bio(bio, io, region);
 
 		/*
 		 * Try and add as many pages as possible.
@@ -337,6 +349,8 @@ static void dispatch_io(int rw, unsigned int num_regions,
 	int i;
 	struct dpages old_pages = *dp;
 
+	BUG_ON(num_regions > DM_IO_MAX_REGIONS);
+
 	if (sync)
 		rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
 
@@ -361,7 +375,14 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 		   struct dm_io_region *where, int rw, struct dpages *dp,
 		   unsigned long *error_bits)
 {
-	struct io io;
+	/*
+	 * gcc <= 4.3 can't do the alignment for stack variables, so we must
+	 * align it on our own.
+	 * volatile prevents the optimizer from removing or reusing
+	 * "io_" field from the stack frame (allowed in ANSI C).
+	 */
+	volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
+	struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
 
 	if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
 		WARN_ON(1);
@@ -369,33 +390,33 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 	}
 
 retry:
-	io.error_bits = 0;
-	io.eopnotsupp_bits = 0;
-	atomic_set(&io.count, 1); /* see dispatch_io() */
-	io.sleeper = current;
-	io.client = client;
+	io->error_bits = 0;
+	io->eopnotsupp_bits = 0;
+	atomic_set(&io->count, 1); /* see dispatch_io() */
+	io->sleeper = current;
+	io->client = client;
 
-	dispatch_io(rw, num_regions, where, dp, &io, 1);
+	dispatch_io(rw, num_regions, where, dp, io, 1);
 
 	while (1) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
 
-		if (!atomic_read(&io.count))
+		if (!atomic_read(&io->count))
 			break;
 
 		io_schedule();
 	}
 	set_current_state(TASK_RUNNING);
 
-	if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
+	if (io->eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
 		rw &= ~(1 << BIO_RW_BARRIER);
 		goto retry;
 	}
 
 	if (error_bits)
-		*error_bits = io.error_bits;
+		*error_bits = io->error_bits;
 
-	return io.error_bits ? -EIO : 0;
+	return io->error_bits ? -EIO : 0;
 }
 
 static int async_io(struct dm_io_client *client, unsigned int num_regions,
-- 
cgit v1.2.3


From 4184153f9e483f9bb63339ed316e059962fe9794 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:51:59 +0000
Subject: dm raid1: support flush

Flush support for dm-raid1.

When it receives an empty barrier, submit it to all the devices via dm-io.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c       | 13 +++++++++++--
 drivers/md/dm-region-hash.c | 25 +++++++++++++++++++++++--
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index cc9dc79b0784..752a29e1855b 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -396,6 +396,8 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio)
  */
 static sector_t map_sector(struct mirror *m, struct bio *bio)
 {
+	if (unlikely(!bio->bi_size))
+		return 0;
 	return m->offset + (bio->bi_sector - m->ms->ti->begin);
 }
 
@@ -562,7 +564,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
 	struct dm_io_region io[ms->nr_mirrors], *dest = io;
 	struct mirror *m;
 	struct dm_io_request io_req = {
-		.bi_rw = WRITE,
+		.bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),
 		.mem.type = DM_IO_BVEC,
 		.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
 		.notify.fn = write_callback,
@@ -603,6 +605,11 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 	bio_list_init(&requeue);
 
 	while ((bio = bio_list_pop(writes))) {
+		if (unlikely(bio_empty_barrier(bio))) {
+			bio_list_add(&sync, bio);
+			continue;
+		}
+
 		region = dm_rh_bio_to_region(ms->rh, bio);
 
 		if (log->type->is_remote_recovering &&
@@ -995,6 +1002,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	ti->private = ms;
 	ti->split_io = dm_rh_get_region_size(ms->rh);
+	ti->num_flush_requests = 1;
 
 	ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
 	if (!ms->kmirrord_wq) {
@@ -1122,7 +1130,8 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 	 * We need to dec pending if this was a write.
 	 */
 	if (rw == WRITE) {
-		dm_rh_dec(ms->rh, map_context->ll);
+		if (likely(!bio_empty_barrier(bio)))
+			dm_rh_dec(ms->rh, map_context->ll);
 		return error;
 	}
 
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 36dbe29f2fd6..00806b760ccd 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -79,6 +79,11 @@ struct dm_region_hash {
 	struct list_head recovered_regions;
 	struct list_head failed_recovered_regions;
 
+	/*
+	 * If there was a barrier failure no regions can be marked clean.
+	 */
+	int barrier_failure;
+
 	void *context;
 	sector_t target_begin;
 
@@ -211,6 +216,7 @@ struct dm_region_hash *dm_region_hash_create(
 	INIT_LIST_HEAD(&rh->quiesced_regions);
 	INIT_LIST_HEAD(&rh->recovered_regions);
 	INIT_LIST_HEAD(&rh->failed_recovered_regions);
+	rh->barrier_failure = 0;
 
 	rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
 						      sizeof(struct dm_region));
@@ -395,6 +401,11 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh,
 	region_t region = dm_rh_bio_to_region(rh, bio);
 	int recovering = 0;
 
+	if (bio_empty_barrier(bio)) {
+		rh->barrier_failure = 1;
+		return;
+	}
+
 	/* We must inform the log that the sync count has changed. */
 	log->type->set_region_sync(log, region, 0);
 
@@ -515,8 +526,11 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
 {
 	struct bio *bio;
 
-	for (bio = bios->head; bio; bio = bio->bi_next)
+	for (bio = bios->head; bio; bio = bio->bi_next) {
+		if (bio_empty_barrier(bio))
+			continue;
 		rh_inc(rh, dm_rh_bio_to_region(rh, bio));
+	}
 }
 EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
 
@@ -544,7 +558,14 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region)
 		 */
 
 		/* do nothing for DM_RH_NOSYNC */
-		if (reg->state == DM_RH_RECOVERING) {
+		if (unlikely(rh->barrier_failure)) {
+			/*
+			 * If a write barrier failed some time ago, we
+			 * don't know whether or not this write made it
+			 * to the disk, so we must resync the device.
+			 */
+			reg->state = DM_RH_NOSYNC;
+		} else if (reg->state == DM_RH_RECOVERING) {
 			list_add_tail(&reg->list, &rh->quiesced_regions);
 		} else if (reg->state == DM_RH_DIRTY) {
 			reg->state = DM_RH_CLEAN;
-- 
cgit v1.2.3


From b09acf1aa79462bdacfe6744b469a17722a52702 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:51:59 +0000
Subject: dm raid1: split touched state into two

Split the variable "touched" into two, "touched_dirtied" and
"touched_cleaned", set when some region was dirtied or cleaned.

This will be used to optimize flushes.

After a transition from "dirty" to "clean" state we don't have flush hardware
cache on the log device. After a transition from "clean" to "dirty" the cache
must be flushed.

Before a transition from "clean" to "dirty" state we don't have to flush all
the raid legs. Before a transition from "dirty" to "clean" we must flush all
the legs to make sure that they are really in sync.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 9443896ede07..31dc33df95c7 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -208,7 +208,8 @@ struct log_header {
 
 struct log_c {
 	struct dm_target *ti;
-	int touched;
+	int touched_dirtied;
+	int touched_cleaned;
 	uint32_t region_size;
 	unsigned int region_count;
 	region_t sync_count;
@@ -253,14 +254,14 @@ static inline void log_set_bit(struct log_c *l,
 			       uint32_t *bs, unsigned bit)
 {
 	ext2_set_bit(bit, (unsigned long *) bs);
-	l->touched = 1;
+	l->touched_cleaned = 1;
 }
 
 static inline void log_clear_bit(struct log_c *l,
 				 uint32_t *bs, unsigned bit)
 {
 	ext2_clear_bit(bit, (unsigned long *) bs);
-	l->touched = 1;
+	l->touched_dirtied = 1;
 }
 
 /*----------------------------------------------------------------
@@ -378,7 +379,8 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 	}
 
 	lc->ti = ti;
-	lc->touched = 0;
+	lc->touched_dirtied = 0;
+	lc->touched_cleaned = 0;
 	lc->region_size = region_size;
 	lc->region_count = region_count;
 	lc->sync = sync;
@@ -660,14 +662,16 @@ static int disk_flush(struct dm_dirty_log *log)
 	struct log_c *lc = (struct log_c *) log->context;
 
 	/* only write if the log has changed */
-	if (!lc->touched)
+	if (!lc->touched_cleaned && !lc->touched_dirtied)
 		return 0;
 
 	r = rw_header(lc, WRITE);
 	if (r)
 		fail_log_device(lc);
-	else
-		lc->touched = 0;
+	else {
+		lc->touched_dirtied = 0;
+		lc->touched_cleaned = 0;
+	}
 
 	return r;
 }
-- 
cgit v1.2.3


From 20a34a8ecc7d03eaa5054f58169ebff12f5f1f8c Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:00 +0000
Subject: dm log: add flush_header function

Introduce flush_header and use it to flush the log device.

Note that we don't have to flush if all the regions transition
from "dirty" to "clean" state.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 31dc33df95c7..6b23631db5b5 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -288,6 +288,19 @@ static int rw_header(struct log_c *lc, int rw)
 	return dm_io(&lc->io_req, 1, &lc->header_location, NULL);
 }
 
+static int flush_header(struct log_c *lc)
+{
+	struct dm_io_region null_location = {
+		.bdev = lc->header_location.bdev,
+		.sector = 0,
+		.count = 0,
+	};
+
+	lc->io_req.bi_rw = WRITE_BARRIER;
+
+	return dm_io(&lc->io_req, 1, &null_location, NULL);
+}
+
 static int read_header(struct log_c *log)
 {
 	int r;
@@ -616,6 +629,8 @@ static int disk_resume(struct dm_dirty_log *log)
 
 	/* write the new header */
 	r = rw_header(lc, WRITE);
+	if (!r)
+		r = flush_header(lc);
 	if (r) {
 		DMWARN("%s: Failed to write header on dirty region log device",
 		       lc->log_dev->name);
@@ -669,7 +684,13 @@ static int disk_flush(struct dm_dirty_log *log)
 	if (r)
 		fail_log_device(lc);
 	else {
-		lc->touched_dirtied = 0;
+		if (lc->touched_dirtied) {
+			r = flush_header(lc);
+			if (r)
+				fail_log_device(lc);
+			else
+				lc->touched_dirtied = 0;
+		}
 		lc->touched_cleaned = 0;
 	}
 
-- 
cgit v1.2.3


From 5adc78d0d231b030405b31759f125f13404fdb64 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:00 +0000
Subject: dm log: introduce flush_failed variable

Introduce "flush failed" variable.  When a flush before clearing a bit
in the log fails, we don't know anything about which which regions are
in-sync and which not.

So we need to set all regions as not-in-sync and set the variable
"flush_failed" to prevent setting the in-sync bit in the future.

A target reload is the only way to get out of this situation.

The variable will be set in following patches.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 6b23631db5b5..d779f8c915dd 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -210,6 +210,7 @@ struct log_c {
 	struct dm_target *ti;
 	int touched_dirtied;
 	int touched_cleaned;
+	int flush_failed;
 	uint32_t region_size;
 	unsigned int region_count;
 	region_t sync_count;
@@ -394,6 +395,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 	lc->ti = ti;
 	lc->touched_dirtied = 0;
 	lc->touched_cleaned = 0;
+	lc->flush_failed = 0;
 	lc->region_size = region_size;
 	lc->region_count = region_count;
 	lc->sync = sync;
@@ -706,7 +708,8 @@ static void core_mark_region(struct dm_dirty_log *log, region_t region)
 static void core_clear_region(struct dm_dirty_log *log, region_t region)
 {
 	struct log_c *lc = (struct log_c *) log->context;
-	log_set_bit(lc, lc->clean_bits, region);
+	if (likely(!lc->flush_failed))
+		log_set_bit(lc, lc->clean_bits, region);
 }
 
 static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
-- 
cgit v1.2.3


From 87a8f240e9bcf025ba45e4563c842b0d59c5e8ef Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:01 +0000
Subject: dm log: add flush callback fn

Introduce a callback pointer from the log to dm-raid1 layer.

Before some region is set as "in-sync", we need to flush hardware cache on
all the disks. But the log module doesn't have access to the mirror_set
structure. So it will use this callback.

So far the callback is unused, it will be used in further patches.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log.c          | 6 ++++--
 drivers/md/dm-raid1.c        | 2 +-
 include/linux/dm-dirty-log.h | 6 ++++--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index d779f8c915dd..666a80e3602e 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -145,8 +145,9 @@ int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type)
 EXPORT_SYMBOL(dm_dirty_log_type_unregister);
 
 struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
-					 struct dm_target *ti,
-					 unsigned int argc, char **argv)
+			struct dm_target *ti,
+			int (*flush_callback_fn)(struct dm_target *ti),
+			unsigned int argc, char **argv)
 {
 	struct dm_dirty_log_type *type;
 	struct dm_dirty_log *log;
@@ -161,6 +162,7 @@ struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
 		return NULL;
 	}
 
+	log->flush_callback_fn = flush_callback_fn;
 	log->type = type;
 	if (type->ctr(log, ti, argc, argv)) {
 		kfree(log);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 752a29e1855b..d44bc497dad9 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -896,7 +896,7 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
 		return NULL;
 	}
 
-	dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2);
+	dl = dm_dirty_log_create(argv[0], ti, NULL, param_count, argv + 2);
 	if (!dl) {
 		ti->error = "Error creating mirror dirty log";
 		return NULL;
diff --git a/include/linux/dm-dirty-log.h b/include/linux/dm-dirty-log.h
index 5e8b11d88f6f..7084503c3405 100644
--- a/include/linux/dm-dirty-log.h
+++ b/include/linux/dm-dirty-log.h
@@ -21,6 +21,7 @@ struct dm_dirty_log_type;
 
 struct dm_dirty_log {
 	struct dm_dirty_log_type *type;
+	int (*flush_callback_fn)(struct dm_target *ti);
 	void *context;
 };
 
@@ -136,8 +137,9 @@ int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type);
  * type->constructor/destructor() directly.
  */
 struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
-					 struct dm_target *ti,
-					 unsigned argc, char **argv);
+			struct dm_target *ti,
+			int (*flush_callback_fn)(struct dm_target *ti),
+			unsigned argc, char **argv);
 void dm_dirty_log_destroy(struct dm_dirty_log *log);
 
 #endif	/* __KERNEL__ */
-- 
cgit v1.2.3


From 076010e2e6ea5b66dfd1f81a6133fb014c9b291d Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:01 +0000
Subject: dm log: use flush callback fn

Call the flush callback from the log.

If flush failed, we have no alternative but to mark the whole log as dirty.
Also we set the variable flush_failed to prevent any bits ever being marked as
clean again.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 666a80e3602e..315e36a96b6f 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -677,13 +677,26 @@ static int core_flush(struct dm_dirty_log *log)
 
 static int disk_flush(struct dm_dirty_log *log)
 {
-	int r;
-	struct log_c *lc = (struct log_c *) log->context;
+	int r, i;
+	struct log_c *lc = log->context;
 
 	/* only write if the log has changed */
 	if (!lc->touched_cleaned && !lc->touched_dirtied)
 		return 0;
 
+	if (lc->touched_cleaned && log->flush_callback_fn &&
+	    log->flush_callback_fn(lc->ti)) {
+		/*
+		 * At this point it is impossible to determine which
+		 * regions are clean and which are dirty (without
+		 * re-reading the log off disk). So mark all of them
+		 * dirty.
+		 */
+		lc->flush_failed = 1;
+		for (i = 0; i < lc->region_count; i++)
+			log_clear_bit(lc, lc->clean_bits, i);
+	}
+
 	r = rw_header(lc, WRITE);
 	if (r)
 		fail_log_device(lc);
-- 
cgit v1.2.3


From c0da3748b9a894b9f9b561ecc2d090a913988a0f Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:02 +0000
Subject: dm raid1: implement mirror_flush

Implement flush callee. It uses dm_io to send zero-size barrier synchronously
and concurrently to all the mirror legs.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index d44bc497dad9..751660b0c574 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -237,6 +237,40 @@ out:
 	schedule_work(&ms->trigger_event);
 }
 
+static int mirror_flush(struct dm_target *ti)
+{
+	struct mirror_set *ms = ti->private;
+	unsigned long error_bits;
+
+	unsigned int i;
+	struct dm_io_region io[ms->nr_mirrors];
+	struct mirror *m;
+	struct dm_io_request io_req = {
+		.bi_rw = WRITE_BARRIER,
+		.mem.type = DM_IO_KMEM,
+		.mem.ptr.bvec = NULL,
+		.client = ms->io_client,
+	};
+
+	for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) {
+		io[i].bdev = m->dev->bdev;
+		io[i].sector = 0;
+		io[i].count = 0;
+	}
+
+	error_bits = -1;
+	dm_io(&io_req, ms->nr_mirrors, io, &error_bits);
+	if (unlikely(error_bits != 0)) {
+		for (i = 0; i < ms->nr_mirrors; i++)
+			if (test_bit(i, &error_bits))
+				fail_mirror(ms->mirror + i,
+					    DM_RAID1_WRITE_ERROR);
+		return -EIO;
+	}
+
+	return 0;
+}
+
 /*-----------------------------------------------------------------
  * Recovery.
  *
@@ -896,7 +930,8 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
 		return NULL;
 	}
 
-	dl = dm_dirty_log_create(argv[0], ti, NULL, param_count, argv + 2);
+	dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count,
+				 argv + 2);
 	if (!dl) {
 		ti->error = "Error creating mirror dirty log";
 		return NULL;
-- 
cgit v1.2.3


From 64b30c46e866bbff8a9e17883a18636adc358455 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:02 +0000
Subject: dm raid1: report flush errors separately in status

Report flush errors as 'F' instead of 'D' for log and mirror devices.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log.c   | 16 ++++++++++++----
 drivers/md/dm-raid1.c |  6 ++++--
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 315e36a96b6f..7035582786fb 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -237,6 +237,7 @@ struct log_c {
 	 * Disk log fields
 	 */
 	int log_dev_failed;
+	int log_dev_flush_failed;
 	struct dm_dev *log_dev;
 	struct log_header header;
 
@@ -425,6 +426,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 	} else {
 		lc->log_dev = dev;
 		lc->log_dev_failed = 0;
+		lc->log_dev_flush_failed = 0;
 		lc->header_location.bdev = lc->log_dev->bdev;
 		lc->header_location.sector = 0;
 
@@ -633,8 +635,11 @@ static int disk_resume(struct dm_dirty_log *log)
 
 	/* write the new header */
 	r = rw_header(lc, WRITE);
-	if (!r)
+	if (!r) {
 		r = flush_header(lc);
+		if (r)
+			lc->log_dev_flush_failed = 1;
+	}
 	if (r) {
 		DMWARN("%s: Failed to write header on dirty region log device",
 		       lc->log_dev->name);
@@ -703,9 +708,10 @@ static int disk_flush(struct dm_dirty_log *log)
 	else {
 		if (lc->touched_dirtied) {
 			r = flush_header(lc);
-			if (r)
+			if (r) {
+				lc->log_dev_flush_failed = 1;
 				fail_log_device(lc);
-			else
+			} else
 				lc->touched_dirtied = 0;
 		}
 		lc->touched_cleaned = 0;
@@ -805,7 +811,9 @@ static int disk_status(struct dm_dirty_log *log, status_type_t status,
 	switch(status) {
 	case STATUSTYPE_INFO:
 		DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name,
-		       lc->log_dev_failed ? 'D' : 'A');
+		       lc->log_dev_flush_failed ? 'F' :
+		       lc->log_dev_failed ? 'D' :
+		       'A');
 		break;
 
 	case STATUSTYPE_TABLE:
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 751660b0c574..85c8704c67bb 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -35,6 +35,7 @@ static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
  *---------------------------------------------------------------*/
 enum dm_raid1_error {
 	DM_RAID1_WRITE_ERROR,
+	DM_RAID1_FLUSH_ERROR,
 	DM_RAID1_SYNC_ERROR,
 	DM_RAID1_READ_ERROR
 };
@@ -264,7 +265,7 @@ static int mirror_flush(struct dm_target *ti)
 		for (i = 0; i < ms->nr_mirrors; i++)
 			if (test_bit(i, &error_bits))
 				fail_mirror(ms->mirror + i,
-					    DM_RAID1_WRITE_ERROR);
+					    DM_RAID1_FLUSH_ERROR);
 		return -EIO;
 	}
 
@@ -1288,7 +1289,8 @@ static char device_status_char(struct mirror *m)
 	if (!atomic_read(&(m->error_count)))
 		return 'A';
 
-	return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
+	return (test_bit(DM_RAID1_FLUSH_ERROR, &(m->error_type))) ? 'F' :
+		(test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
 		(test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' :
 		(test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U';
 }
-- 
cgit v1.2.3


From 04788507686d184d8166918b70ef52311bc36dcb Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:03 +0000
Subject: dm raid1: add framework to hold bios during suspend

Add framework to delay bios until a suspend and then resubmit them with
either DM_ENDIO_REQUEUE (if the suspend was noflush) or complete them
with -EIO.  I/O barrier support will use this.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Takahiro Yasui <tyasui@redhat.com>
Tested-by: Takahiro Yasui <tyasui@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 85c8704c67bb..895cce75eee9 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -58,6 +58,7 @@ struct mirror_set {
 	struct bio_list reads;
 	struct bio_list writes;
 	struct bio_list failures;
+	struct bio_list holds;	/* bios are waiting until suspend */
 
 	struct dm_region_hash *rh;
 	struct dm_kcopyd_client *kcopyd_client;
@@ -450,6 +451,27 @@ static void map_region(struct dm_io_region *io, struct mirror *m,
 	io->count = bio->bi_size >> 9;
 }
 
+static void hold_bio(struct mirror_set *ms, struct bio *bio)
+{
+	/*
+	 * If device is suspended, complete the bio.
+	 */
+	if (atomic_read(&ms->suspend)) {
+		if (dm_noflush_suspending(ms->ti))
+			bio_endio(bio, DM_ENDIO_REQUEUE);
+		else
+			bio_endio(bio, -EIO);
+		return;
+	}
+
+	/*
+	 * Hold bio until the suspend is complete.
+	 */
+	spin_lock_irq(&ms->lock);
+	bio_list_add(&ms->holds, bio);
+	spin_unlock_irq(&ms->lock);
+}
+
 /*-----------------------------------------------------------------
  * Reads
  *---------------------------------------------------------------*/
@@ -1225,6 +1247,9 @@ static void mirror_presuspend(struct dm_target *ti)
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
 	struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
 
+	struct bio_list holds;
+	struct bio *bio;
+
 	atomic_set(&ms->suspend, 1);
 
 	/*
@@ -1247,6 +1272,22 @@ static void mirror_presuspend(struct dm_target *ti)
 	 * we know that all of our I/O has been pushed.
 	 */
 	flush_workqueue(ms->kmirrord_wq);
+
+	/*
+	 * Now set ms->suspend is set and the workqueue flushed, no more
+	 * entries can be added to ms->hold list, so process it.
+	 *
+	 * Bios can still arrive concurrently with or after this
+	 * presuspend function, but they cannot join the hold list
+	 * because ms->suspend is set.
+	 */
+	spin_lock_irq(&ms->lock);
+	holds = ms->holds;
+	bio_list_init(&ms->holds);
+	spin_unlock_irq(&ms->lock);
+
+	while ((bio = bio_list_pop(&holds)))
+		hold_bio(ms, bio);
 }
 
 static void mirror_postsuspend(struct dm_target *ti)
-- 
cgit v1.2.3


From 0f398a8403e31c737b429fddc3850093d0bf58d0 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:04 +0000
Subject: dm raid1: use hold framework in do_failures

Use the hold framework in do_failures.

This patch doesn't change the bio processing logic, it just simplifies
failure handling and avoids periodically polling the failures list.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Takahiro Yasui <tyasui@redhat.com>
Tested-by: Takahiro Yasui <tyasui@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c | 34 +++++++++-------------------------
 1 file changed, 9 insertions(+), 25 deletions(-)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 895cce75eee9..0253130fa13a 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -745,20 +745,12 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 {
 	struct bio *bio;
 
-	if (!failures->head)
+	if (likely(!failures->head))
 		return;
 
-	if (!ms->log_failure) {
-		while ((bio = bio_list_pop(failures))) {
-			ms->in_sync = 0;
-			dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0);
-		}
-		return;
-	}
-
 	/*
 	 * If the log has failed, unattempted writes are being
-	 * put on the failures list.  We can't issue those writes
+	 * put on the holds list.  We can't issue those writes
 	 * until a log has been marked, so we must store them.
 	 *
 	 * If a 'noflush' suspend is in progress, we can requeue
@@ -773,23 +765,15 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 	 * for us to treat them the same and requeue them
 	 * as well.
 	 */
-	if (dm_noflush_suspending(ms->ti)) {
-		while ((bio = bio_list_pop(failures)))
-			bio_endio(bio, DM_ENDIO_REQUEUE);
-		return;
-	}
 
-	if (atomic_read(&ms->suspend)) {
-		while ((bio = bio_list_pop(failures)))
-			bio_endio(bio, -EIO);
-		return;
+	while ((bio = bio_list_pop(failures))) {
+		if (ms->log_failure)
+			hold_bio(ms, bio);
+		else {
+			ms->in_sync = 0;
+			dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0);
+		}
 	}
-
-	spin_lock_irq(&ms->lock);
-	bio_list_merge(&ms->failures, failures);
-	spin_unlock_irq(&ms->lock);
-
-	delayed_wake(ms);
 }
 
 static void trigger_event(struct work_struct *work)
-- 
cgit v1.2.3


From 87968ddd2f3be1c21b932cac30157a83a1c4f935 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:04 +0000
Subject: dm raid1: abstract get_valid_mirror function

Move the logic to get a valid mirror leg into a function for re-use
in a later patch.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Takahiro Yasui <tyasui@redhat.com>
Tested-by: Takahiro Yasui <tyasui@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 0253130fa13a..d1a7f1a4789c 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -181,6 +181,17 @@ static void set_default_mirror(struct mirror *m)
 	atomic_set(&ms->default_mirror, m - m0);
 }
 
+static struct mirror *get_valid_mirror(struct mirror_set *ms)
+{
+	struct mirror *m;
+
+	for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++)
+		if (!atomic_read(&m->error_count))
+			return m;
+
+	return NULL;
+}
+
 /* fail_mirror
  * @m: mirror device to fail
  * @error_type: one of the enum's, DM_RAID1_*_ERROR
@@ -226,13 +237,10 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 		goto out;
 	}
 
-	for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++)
-		if (!atomic_read(&new->error_count)) {
-			set_d