diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-31 11:05:47 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-31 11:05:47 -0800 |
commit | 0be600a5add76e8e8b9e1119f2a7426ff849aca8 (patch) | |
tree | d5fcc2b119f03143f9bed1b9aa5cb85458c8bd03 /drivers | |
parent | 040639b7fcf73ee39c15d38257f652a2048e96f2 (diff) | |
parent | 9614e2ba9161c7f5419f4212fa6057d2a65f6ae6 (diff) | |
download | linux-0be600a5add76e8e8b9e1119f2a7426ff849aca8.tar.gz linux-0be600a5add76e8e8b9e1119f2a7426ff849aca8.tar.bz2 linux-0be600a5add76e8e8b9e1119f2a7426ff849aca8.zip |
Merge tag 'for-4.16/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:
- DM core fixes to ensure that bio submission follows a depth-first
tree walk; this is critical to allow forward progress without the
need to use the bioset's BIOSET_NEED_RESCUER.
- Remove DM core's BIOSET_NEED_RESCUER based dm_offload infrastructure.
- DM core cleanups and improvements to make bio-based DM more efficient
(e.g. reduced memory footprint as well as leveraging per-bio-data more).
- Introduce new bio-based mode (DM_TYPE_NVME_BIO_BASED) that leverages
the more direct IO submission path in the block layer; this mode is
used by DM multipath and also optimizes targets like DM thin-pool
that stack directly on an NVMe data device.
- DM multipath improvements to factor out legacy SCSI-only (e.g.
scsi_dh) code paths to allow for more optimized support for NVMe
multipath.
- A fix for DM multipath path selectors (service-time and queue-length)
to select paths in a more balanced way; largely academic but doesn't
hurt.
- Numerous DM raid target fixes and improvements.
- Add a new DM "unstriped" target that enables Intel to work around
firmware limitations in some NVMe drives that are striped internally
(this target also works when stacked above the DM "striped" target).
- Various Documentation fixes and improvements.
- Misc cleanups and fixes across various DM infrastructure and targets
(e.g. bufio, flakey, log-writes, snapshot).
* tag 'for-4.16/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (69 commits)
dm cache: Documentation: update default migration_throttling value
dm mpath selector: more evenly distribute ties
dm unstripe: fix target length versus number of stripes size check
dm thin: fix trailing semicolon in __remap_and_issue_shared_cell
dm table: fix NVMe bio-based dm_table_determine_type() validation
dm: various cleanups to md->queue initialization code
dm mpath: delay the retry of a request if the target responded as busy
dm mpath: return DM_MAPIO_DELAY_REQUEUE if QUEUE_IO or PG_INIT_REQUIRED
dm mpath: return DM_MAPIO_REQUEUE on blk-mq rq allocation failure
dm log writes: fix max length used for kstrndup
dm: backfill missing calls to mutex_destroy()
dm snapshot: use mutex instead of rw_semaphore
dm flakey: check for null arg_name in parse_features()
dm thin: extend thinpool status format string with omitted fields
dm thin: fixes in thin-provisioning.txt
dm thin: document representation of <highest mapped sector> when there is none
dm thin: fix documentation relative to low water mark threshold
dm cache: be consistent in specifying sectors and SI units in cache.txt
dm cache: delete obsoleted paragraph in cache.txt
dm cache: fix grammar in cache-policies.txt
...
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/md/Kconfig | 7 | ||||
-rw-r--r-- | drivers/md/Makefile | 1 | ||||
-rw-r--r-- | drivers/md/dm-bufio.c | 37 | ||||
-rw-r--r-- | drivers/md/dm-core.h | 5 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 5 | ||||
-rw-r--r-- | drivers/md/dm-delay.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-flakey.c | 5 | ||||
-rw-r--r-- | drivers/md/dm-io.c | 3 | ||||
-rw-r--r-- | drivers/md/dm-kcopyd.c | 6 | ||||
-rw-r--r-- | drivers/md/dm-log-writes.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 297 | ||||
-rw-r--r-- | drivers/md/dm-queue-length.c | 6 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 380 | ||||
-rw-r--r-- | drivers/md/dm-rq.c | 6 | ||||
-rw-r--r-- | drivers/md/dm-service-time.c | 6 | ||||
-rw-r--r-- | drivers/md/dm-snap.c | 84 | ||||
-rw-r--r-- | drivers/md/dm-stats.c | 1 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 114 | ||||
-rw-r--r-- | drivers/md/dm-thin.c | 9 | ||||
-rw-r--r-- | drivers/md/dm-unstripe.c | 219 | ||||
-rw-r--r-- | drivers/md/dm-zoned-metadata.c | 3 | ||||
-rw-r--r-- | drivers/md/dm-zoned-target.c | 3 | ||||
-rw-r--r-- | drivers/md/dm.c | 659 | ||||
-rw-r--r-- | drivers/md/dm.h | 4 |
24 files changed, 1252 insertions, 612 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 83b9362be09c..2c8ac3688815 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -269,6 +269,13 @@ config DM_BIO_PRISON source "drivers/md/persistent-data/Kconfig" +config DM_UNSTRIPED + tristate "Unstriped target" + depends on BLK_DEV_DM + ---help--- + Unstripes I/O so it is issued solely on a single drive in a HW + RAID0 or dm-striped target. + config DM_CRYPT tristate "Crypt target support" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index f701bb211783..63255f3ebd97 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_BCACHE) += bcache/ obj-$(CONFIG_BLK_DEV_MD) += md-mod.o obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o obj-$(CONFIG_BLK_DEV_DM_BUILTIN) += dm-builtin.o +obj-$(CONFIG_DM_UNSTRIPED) += dm-unstripe.o obj-$(CONFIG_DM_BUFIO) += dm-bufio.o obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index c546b567f3b5..414c9af54ded 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -662,7 +662,7 @@ static void submit_io(struct dm_buffer *b, int rw, bio_end_io_t *end_io) sector = (b->block << b->c->sectors_per_block_bits) + b->c->start; - if (rw != WRITE) { + if (rw != REQ_OP_WRITE) { n_sectors = 1 << b->c->sectors_per_block_bits; offset = 0; } else { @@ -740,7 +740,7 @@ static void __write_dirty_buffer(struct dm_buffer *b, b->write_end = b->dirty_end; if (!write_list) - submit_io(b, WRITE, write_endio); + submit_io(b, REQ_OP_WRITE, write_endio); else list_add_tail(&b->write_list, write_list); } @@ -753,7 +753,7 @@ static void __flush_write_list(struct list_head *write_list) struct dm_buffer *b = list_entry(write_list->next, struct dm_buffer, write_list); list_del(&b->write_list); - submit_io(b, WRITE, write_endio); + submit_io(b, REQ_OP_WRITE, write_endio); cond_resched(); } blk_finish_plug(&plug); @@ -1123,7 +1123,7 @@ static void 
*new_read(struct dm_bufio_client *c, sector_t block, return NULL; if (need_submit) - submit_io(b, READ, read_endio); + submit_io(b, REQ_OP_READ, read_endio); wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); @@ -1193,7 +1193,7 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, dm_bufio_unlock(c); if (need_submit) - submit_io(b, READ, read_endio); + submit_io(b, REQ_OP_READ, read_endio); dm_bufio_release(b); cond_resched(); @@ -1454,7 +1454,7 @@ retry: old_block = b->block; __unlink_buffer(b); __link_buffer(b, new_block, b->list_mode); - submit_io(b, WRITE, write_endio); + submit_io(b, REQ_OP_WRITE, write_endio); wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); __unlink_buffer(b); @@ -1716,7 +1716,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign if (!DM_BUFIO_CACHE_NAME(c)) { r = -ENOMEM; mutex_unlock(&dm_bufio_clients_lock); - goto bad_cache; + goto bad; } } @@ -1727,7 +1727,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign if (!DM_BUFIO_CACHE(c)) { r = -ENOMEM; mutex_unlock(&dm_bufio_clients_lock); - goto bad_cache; + goto bad; } } } @@ -1738,27 +1738,28 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign if (!b) { r = -ENOMEM; - goto bad_buffer; + goto bad; } __free_buffer_wake(b); } + c->shrinker.count_objects = dm_bufio_shrink_count; + c->shrinker.scan_objects = dm_bufio_shrink_scan; + c->shrinker.seeks = 1; + c->shrinker.batch = 0; + r = register_shrinker(&c->shrinker); + if (r) + goto bad; + mutex_lock(&dm_bufio_clients_lock); dm_bufio_client_count++; list_add(&c->client_list, &dm_bufio_all_clients); __cache_size_refresh(); mutex_unlock(&dm_bufio_clients_lock); - c->shrinker.count_objects = dm_bufio_shrink_count; - c->shrinker.scan_objects = dm_bufio_shrink_scan; - c->shrinker.seeks = 1; - c->shrinker.batch = 0; - register_shrinker(&c->shrinker); - return c; -bad_buffer: -bad_cache: +bad: while (!list_empty(&c->reserved_buffers)) { 
struct dm_buffer *b = list_entry(c->reserved_buffers.next, struct dm_buffer, lru_list); @@ -1767,6 +1768,7 @@ bad_cache: } dm_io_client_destroy(c->dm_io); bad_dm_io: + mutex_destroy(&c->lock); kfree(c); bad_client: return ERR_PTR(r); @@ -1811,6 +1813,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) BUG_ON(c->n_buffers[i]); dm_io_client_destroy(c->dm_io); + mutex_destroy(&c->lock); kfree(c); } EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 6a14f945783c..3222e21cbbf8 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -91,8 +91,7 @@ struct mapped_device { /* * io objects are allocated from here. */ - mempool_t *io_pool; - + struct bio_set *io_bs; struct bio_set *bs; /* @@ -130,8 +129,6 @@ struct mapped_device { struct srcu_struct io_barrier; }; -void dm_init_md_queue(struct mapped_device *md); -void dm_init_normal_md_queue(struct mapped_device *md); int md_in_flight(struct mapped_device *md); void disable_write_same(struct mapped_device *md); void disable_write_zeroes(struct mapped_device *md); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 2ad429100d25..8168f737590e 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2193,6 +2193,8 @@ static void crypt_dtr(struct dm_target *ti) kzfree(cc->cipher_auth); kzfree(cc->authenc_key); + mutex_destroy(&cc->bio_alloc_lock); + /* Must zero key material before freeing */ kzfree(cc); } @@ -2702,8 +2704,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - cc->bs = bioset_create(MIN_IOS, 0, (BIOSET_NEED_BVECS | - BIOSET_NEED_RESCUER)); + cc->bs = bioset_create(MIN_IOS, 0, BIOSET_NEED_BVECS); if (!cc->bs) { ti->error = "Cannot allocate crypt bioset"; goto bad; diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 288386bfbfb5..1783d80c9cad 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -229,6 +229,8 @@ static void delay_dtr(struct dm_target *ti) 
if (dc->dev_write) dm_put_device(ti, dc->dev_write); + mutex_destroy(&dc->timer_lock); + kfree(dc); } diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index b82cb1ab1eaa..1b907b15f5c3 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -70,6 +70,11 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, arg_name = dm_shift_arg(as); argc--; + if (!arg_name) { + ti->error = "Insufficient feature arguments"; + return -EINVAL; + } + /* * drop_writes */ diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index b4357ed4d541..a8d914d5abbe 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -58,8 +58,7 @@ struct dm_io_client *dm_io_client_create(void) if (!client->pool) goto bad; - client->bios = bioset_create(min_ios, 0, (BIOSET_NEED_BVECS | - BIOSET_NEED_RESCUER)); + client->bios = bioset_create(min_ios, 0, BIOSET_NEED_BVECS); if (!client->bios) goto bad; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index eb45cc3df31d..e6e7c686646d 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -477,8 +477,10 @@ static int run_complete_job(struct kcopyd_job *job) * If this is the master job, the sub jobs have already * completed so we can free everything. */ - if (job->master_job == job) + if (job->master_job == job) { + mutex_destroy(&job->lock); mempool_free(job, kc->job_pool); + } fn(read_err, write_err, context); if (atomic_dec_and_test(&kc->nr_jobs)) @@ -750,6 +752,7 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, * followed by SPLIT_COUNT sub jobs. */ job = mempool_alloc(kc->job_pool, GFP_NOIO); + mutex_init(&job->lock); /* * set up for the read. 
@@ -811,7 +814,6 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, if (job->source.count <= SUB_JOB_SIZE) dispatch_job(job); else { - mutex_init(&job->lock); job->progress = 0; split_job(job); } diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 189badbeddaf..3362d866793b 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -594,7 +594,7 @@ static int log_mark(struct log_writes_c *lc, char *data) return -ENOMEM; } - block->data = kstrndup(data, maxsize, GFP_KERNEL); + block->data = kstrndup(data, maxsize - 1, GFP_KERNEL); if (!block->data) { DMERR("Error copying mark data"); kfree(block); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index ef57c6d1c887..7d3e572072f5 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -64,36 +64,30 @@ struct priority_group { /* Multipath context */ struct multipath { - struct list_head list; - struct dm_target *ti; - - const char *hw_handler_name; - char *hw_handler_params; + unsigned long flags; /* Multipath state flags */ spinlock_t lock; - - unsigned nr_priority_groups; - struct list_head priority_groups; - - wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ + enum dm_queue_mode queue_mode; struct pgpath *current_pgpath; struct priority_group *current_pg; struct priority_group *next_pg; /* Switch to this PG if set */ - unsigned long flags; /* Multipath state flags */ + atomic_t nr_valid_paths; /* Total number of usable paths */ + unsigned nr_priority_groups; + struct list_head priority_groups; + const char *hw_handler_name; + char *hw_handler_params; + wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ unsigned pg_init_retries; /* Number of times to retry pg_init */ unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ - - atomic_t nr_valid_paths; /* Total number of usable paths */ atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ atomic_t pg_init_count; /* 
Number of times pg_init called */ - enum dm_queue_mode queue_mode; - struct mutex work_mutex; struct work_struct trigger_event; + struct dm_target *ti; struct work_struct process_queued_bios; struct bio_list queued_bios; @@ -135,10 +129,10 @@ static struct pgpath *alloc_pgpath(void) { struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL); - if (pgpath) { - pgpath->is_active = true; - INIT_DELAYED_WORK(&pgpath->activate_path, activate_path_work); - } + if (!pgpath) + return NULL; + + pgpath->is_active = true; return pgpath; } @@ -193,13 +187,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti) if (m) { INIT_LIST_HEAD(&m->priority_groups); spin_lock_init(&m->lock); - set_bit(MPATHF_QUEUE_IO, &m->flags); atomic_set(&m->nr_valid_paths, 0); - atomic_set(&m->pg_init_in_progress, 0); - atomic_set(&m->pg_init_count, 0); - m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; INIT_WORK(&m->trigger_event, trigger_event); - init_waitqueue_head(&m->pg_init_wait); mutex_init(&m->work_mutex); m->queue_mode = DM_TYPE_NONE; @@ -221,13 +210,26 @@ static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) m->queue_mode = DM_TYPE_MQ_REQUEST_BASED; else m->queue_mode = DM_TYPE_REQUEST_BASED; - } else if (m->queue_mode == DM_TYPE_BIO_BASED) { + + } else if (m->queue_mode == DM_TYPE_BIO_BASED || + m->queue_mode == DM_TYPE_NVME_BIO_BASED) { INIT_WORK(&m->process_queued_bios, process_queued_bios); - /* - * bio-based doesn't support any direct scsi_dh management; - * it just discovers if a scsi_dh is attached. - */ - set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); + + if (m->queue_mode == DM_TYPE_BIO_BASED) { + /* + * bio-based doesn't support any direct scsi_dh management; + * it just discovers if a scsi_dh is attached. 
+ */ + set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); + } + } + + if (m->queue_mode != DM_TYPE_NVME_BIO_BASED) { + set_bit(MPATHF_QUEUE_IO, &m->flags); + atomic_set(&m->pg_init_in_progress, 0); + atomic_set(&m->pg_init_count, 0); + m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; + init_waitqueue_head(&m->pg_init_wait); } dm_table_set_type(ti->table, m->queue_mode); @@ -246,6 +248,7 @@ static void free_multipath(struct multipath *m) kfree(m->hw_handler_name); kfree(m->hw_handler_params); + mutex_destroy(&m->work_mutex); kfree(m); } @@ -264,29 +267,23 @@ static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio) return dm_per_bio_data(bio, multipath_per_bio_data_size()); } -static struct dm_bio_details *get_bio_details_from_bio(struct bio *bio) +static struct dm_bio_details *get_bio_details_from_mpio(struct dm_mpath_io *mpio) { /* dm_bio_details is immediately after the dm_mpath_io in bio's per-bio-data */ - struct dm_mpath_io *mpio = get_mpio_from_bio(bio); void *bio_details = mpio + 1; - return bio_details; } -static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p, - struct dm_bio_details **bio_details_p) +static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p) { struct dm_mpath_io *mpio = get_mpio_from_bio(bio); - struct dm_bio_details *bio_details = get_bio_details_from_bio(bio); + struct dm_bio_details *bio_details = get_bio_details_from_mpio(mpio); - memset(mpio, 0, sizeof(*mpio)); - memset(bio_details, 0, sizeof(*bio_details)); - dm_bio_record(bio_details, bio); + mpio->nr_bytes = bio->bi_iter.bi_size; + mpio->pgpath = NULL; + *mpio_p = mpio; - if (mpio_p) - *mpio_p = mpio; - if (bio_details_p) - *bio_details_p = bio_details; + dm_bio_record(bio_details, bio); } /*----------------------------------------------- @@ -340,6 +337,9 @@ static void __switch_pg(struct multipath *m, struct priority_group *pg) { m->current_pg = pg; + if (m->queue_mode == DM_TYPE_NVME_BIO_BASED) + return; + /* 
Must we initialise the PG first, and queue I/O till it's ready? */ if (m->hw_handler_name) { set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); @@ -385,7 +385,8 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) unsigned bypassed = 1; if (!atomic_read(&m->nr_valid_paths)) { - clear_bit(MPATHF_QUEUE_IO, &m->flags); + if (m->queue_mode != DM_TYPE_NVME_BIO_BASED) + clear_bit(MPATHF_QUEUE_IO, &m->flags); goto failed; } @@ -516,12 +517,10 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, return DM_MAPIO_KILL; } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { - if (pg_init_all_paths(m)) - return DM_MAPIO_DELAY_REQUEUE; - return DM_MAPIO_REQUEUE; + pg_init_all_paths(m); + return DM_MAPIO_DELAY_REQUEUE; } - memset(mpio, 0, sizeof(*mpio)); mpio->pgpath = pgpath; mpio->nr_bytes = nr_bytes; @@ -530,12 +529,23 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, GFP_ATOMIC); if (IS_ERR(clone)) { /* EBUSY, ENODEV or EWOULDBLOCK: requeue */ - bool queue_dying = blk_queue_dying(q); - if (queue_dying) { + if (blk_queue_dying(q)) { atomic_inc(&m->pg_init_in_progress); activate_or_offline_path(pgpath); + return DM_MAPIO_DELAY_REQUEUE; } - return DM_MAPIO_DELAY_REQUEUE; + + /* + * blk-mq's SCHED_RESTART can cover this requeue, so we + * needn't deal with it by DELAY_REQUEUE. More importantly, + * we have to return DM_MAPIO_REQUEUE so that blk-mq can + * get the queue busy feedback (via BLK_STS_RESOURCE), + * otherwise I/O merging can suffer. 
+ */ + if (q->mq_ops) + return DM_MAPIO_REQUEUE; + else + return DM_MAPIO_DELAY_REQUEUE; } clone->bio = clone->biotail = NULL; clone->rq_disk = bdev->bd_disk; @@ -557,9 +567,9 @@ static void multipath_release_clone(struct request *clone) /* * Map cloned bios (bio-based multipath) */ -static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_mpath_io *mpio) + +static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) { - size_t nr_bytes = bio->bi_iter.bi_size; struct pgpath *pgpath; unsigned long flags; bool queue_io; @@ -568,7 +578,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m pgpath = READ_ONCE(m->current_pgpath); queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); if (!pgpath || !queue_io) - pgpath = choose_pgpath(m, nr_bytes); + pgpath = choose_pgpath(m, bio->bi_iter.bi_size); if ((pgpath && queue_io) || (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { @@ -576,14 +586,62 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m spin_lock_irqsave(&m->lock, flags); bio_list_add(&m->queued_bios, bio); spin_unlock_irqrestore(&m->lock, flags); + /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */ if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) pg_init_all_paths(m); else if (!queue_io) queue_work(kmultipathd, &m->process_queued_bios); - return DM_MAPIO_SUBMITTED; + + return ERR_PTR(-EAGAIN); } + return pgpath; +} + +static struct pgpath *__map_bio_nvme(struct multipath *m, struct bio *bio) +{ + struct pgpath *pgpath; + unsigned long flags; + + /* Do we need to select a new pgpath? 
*/ + /* + * FIXME: currently only switching path if no path (due to failure, etc) + * - which negates the point of using a path selector + */ + pgpath = READ_ONCE(m->current_pgpath); + if (!pgpath) + pgpath = choose_pgpath(m, bio->bi_iter.bi_size); + + if (!pgpath) { + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + /* Queue for the daemon to resubmit */ + spin_lock_irqsave(&m->lock, flags); + bio_list_add(&m->queued_bios, bio); + spin_unlock_irqrestore(&m->lock, flags); + queue_work(kmultipathd, &m->process_queued_bios); + + return ERR_PTR(-EAGAIN); + } + return NULL; + } + + return pgpath; +} + +static int __multipath_map_bio(struct multipath *m, struct bio *bio, + struct dm_mpath_io *mpio) +{ + struct pgpath *pgpath; + + if (m->queue_mode == DM_TYPE_NVME_BIO_BASED) + pgpath = __map_bio_nvme(m, bio); + else + pgpath = __map_bio(m, bio); + + if (IS_ERR(pgpath)) + return DM_MAPIO_SUBMITTED; + if (!pgpath) { if (must_push_back_bio(m)) return DM_MAPIO_REQUEUE; @@ -592,7 +650,6 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m } mpio->pgpath = pgpath; - mpio->nr_bytes = nr_bytes; bio->bi_status = 0; bio_set_dev(bio, pgpath->path.dev->bdev); @@ -601,7 +658,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m if (pgpath->pg->ps.type->start_io) pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path, - nr_bytes); + mpio->nr_bytes); return DM_MAPIO_REMAPPED; } @@ -610,8 +667,7 @@ static int multipath_map_bio(struct dm_target *ti, struct bio *bio) struct multipath *m = ti->private; struct dm_mpath_io *mpio = NULL; - multipath_init_per_bio_data(bio, &mpio, NULL); - + multipath_init_per_bio_data(bio, &mpio); return __multipath_map_bio(m, bio, mpio); } @@ -619,7 +675,8 @@ static void process_queued_io_list(struct multipath *m) { if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED) dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table)); - else if (m->queue_mode == DM_TYPE_BIO_BASED) + else if 
(m->queue_mode == DM_TYPE_BIO_BASED || + m->queue_mode == DM_TYPE_NVME_BIO_BASED) queue_work(kmultipathd, &m->process_queued_bios); } @@ -649,7 +706,9 @@ static void process_queued_bios(struct work_struct *work) blk_start_plug(&plug); while ((bio = bio_list_pop(&bios))) { - r = __multipath_map_bio(m, bio, get_mpio_from_bio(bio)); + struct dm_mpath_io *mpio = get_mpio_from_bio(bio); + dm_bio_restore(get_bio_details_from_mpio(mpio), bio); + r = __multipath_map_bio(m, bio, mpio); switch (r) { case DM_MAPIO_KILL: bio->bi_status = BLK_STS_IOERR; @@ -752,34 +811,11 @@ static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg, return 0; } -static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps, - struct dm_target *ti) +static int setup_scsi_dh(struct block_device *bdev, struct multipath *m, char **error) { - int r; - struct pgpath *p; - struct multipath *m = ti->private; - struct request_queue *q = NULL; + struct request_queue *q = bdev_get_queue(bdev); const char *attached_handler_name; - - /* we need at least a path arg */ - if (as->argc < 1) { - ti->error = "no device given"; - return ERR_PTR(-EINVAL); - } - - p = alloc_pgpath(); - if (!p) - return ERR_PTR(-ENOMEM); - - r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), - &p->path.dev); - if (r) { - ti->error = "error getting device"; - goto bad; - } - - if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) || m->hw_handler_name) - q = bdev_get_queue(p->path.dev->bdev); + int r; if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) { retain: @@ -811,26 +847,59 @@ retain: char b[BDEVNAME_SIZE]; printk(KERN_INFO "dm-mpath: retaining handler on device %s\n", - bdevname(p->path.dev->bdev, b)); + bdevname(bdev, b)); goto retain; } if (r < 0) { - ti->error = "error attaching hardware handler"; - dm_put_device(ti, p->path.dev); - goto bad; + *error = "error attaching hardware handler"; + return r; } if (m->hw_handler_params) { r = 
scsi_dh_set_params(q, m->hw_handler_params); if (r < 0) { - ti->error = "unable to set hardware " - "handler parameters"; - dm_put_device(ti, p->path.dev); - goto bad; + *error = "unable to set hardware handler parameters"; + return r; } } } + return 0; +} + +static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps, + struct dm_target *ti) +{ + int r; + struct pgpath *p; + struct multipath *m = ti->private; + + /* we need at least a path arg */ + if (as->argc < 1) { + ti->error = "no device given"; + return ERR_PTR(-EINVAL); + } + + p = alloc_pgpath(); + if (!p) + return ERR_PTR(-ENOMEM); + + r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), + &p->path.dev); + if (r) { + ti->error = "error getting device"; + goto bad; + } + + if (m->queue_mode != DM_TYPE_NVME_BIO_BASED) { + INIT_DELAYED_WORK(&p->activate_path, activate_path_work); + r = setup_scsi_dh(p->path.dev->bdev, m, &ti->error); + if (r) { + dm_put_device(ti, p->path.dev); + goto bad; + } + } + r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); if (r) { dm_put_device(ti, p->path.dev); @@ -838,7 +907,6 @@ retain: } return p; - bad: free_pgpath(p); return ERR_PTR(r); @@ -933,7 +1001,8 @@ static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) if (!hw_argc) return 0; - if (m->queue_mode == DM_TYPE_BIO_BASED) { + if (m->queue_mode == DM_TYPE_BIO_BASED || + m->queue_mode == DM_TYPE_NVME_BIO_BASED) { dm_consume_args(as, hw_argc); DMERR("bio-based multipath doesn't allow hardware handler args"); return 0; @@ -1022,6 +1091,8 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) if (!strcasecmp(queue_mode_name, "bio")) m->queue_mode = DM_TYPE_BIO_BASED; + else if (!strcasecmp(queue_mode_name, "nvme")) + m->queue_mode = DM_TYPE_NVME_BIO_BASED; else if (!strcasecmp(queue_mode_name, "rq")) m->queue_mode = DM_TYPE_REQUEST_BASED; else if (!strcasecmp(queue_mode_name, "mq")) @@ -1122,7 +1193,7 @@ static int multipath_ctr(struct 
dm_target *ti, unsigned argc, char **argv) ti->num_discard_bios = 1; ti->num_write_same_bios = 1; ti->num_write_zeroes_bios = 1; - if (m->queue_mode == DM_TYPE_BIO_BASED) + if (m->queue_mode == DM_TYPE_BIO_BASED || m->queue_mode == DM_TYPE_NVME_BIO_BASED) ti->per_io_data_size = multipath_per_bio_data_size(); else ti->per_io_data_size = sizeof(struct dm_mpath_io); @@ -1151,16 +1222,19 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m) static void flush_multipath_work(struct multipath *m) { - set_bit(MPATHF_PG_INIT_DISABLED, &m->flags); - smp_mb__after_atomic(); + if (m->hw_handler_name) { + set_bit(MPATHF_PG_INIT_DISABLED, &m->flags); + smp_mb__after_atomic(); |