summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/block/ioprio.rst13
-rw-r--r--block/bio-integrity.c2
-rw-r--r--block/blk-cgroup.c2
-rw-r--r--block/blk-iocost.c2
-rw-r--r--block/blk-mq-debugfs.c18
-rw-r--r--block/blk-mq-sched.c2
-rw-r--r--block/blk-mq.c50
-rw-r--r--block/ioprio.c26
-rw-r--r--block/partitions/core.c4
-rw-r--r--drivers/block/loop.c52
-rw-r--r--drivers/block/nbd.c6
-rw-r--r--drivers/block/null_blk/main.c4
-rw-r--r--drivers/block/virtio_blk.c2
-rw-r--r--drivers/md/md.c40
-rw-r--r--drivers/md/raid1.c12
-rw-r--r--drivers/nvme/common/keyring.c2
-rw-r--r--drivers/nvme/host/core.c41
-rw-r--r--drivers/nvme/host/nvme.h16
-rw-r--r--drivers/nvme/host/pci.c27
-rw-r--r--drivers/nvme/host/pr.c2
-rw-r--r--drivers/nvme/host/rdma.c11
-rw-r--r--drivers/nvme/host/sysfs.c8
-rw-r--r--drivers/nvme/host/tcp.c11
-rw-r--r--drivers/nvme/target/fc.c2
-rw-r--r--drivers/nvme/target/fcloop.c7
-rw-r--r--drivers/nvme/target/rdma.c19
-rw-r--r--drivers/nvme/target/tcp.c48
-rw-r--r--drivers/nvme/target/trace.c6
-rw-r--r--drivers/nvme/target/trace.h33
-rw-r--r--include/linux/bio.h9
-rw-r--r--include/linux/blk-mq.h3
-rw-r--r--include/linux/ioprio.h25
-rw-r--r--include/linux/nvme.h1
-rw-r--r--lib/sbitmap.c5
34 files changed, 287 insertions, 224 deletions
diff --git a/Documentation/block/ioprio.rst b/Documentation/block/ioprio.rst
index a25c6d5df87b..4662e1ff3d81 100644
--- a/Documentation/block/ioprio.rst
+++ b/Documentation/block/ioprio.rst
@@ -6,17 +6,16 @@ Block io priorities
Intro
-----
-With the introduction of cfq v3 (aka cfq-ts or time sliced cfq), basic io
-priorities are supported for reads on files. This enables users to io nice
-processes or process groups, similar to what has been possible with cpu
-scheduling for ages. This document mainly details the current possibilities
-with cfq; other io schedulers do not support io priorities thus far.
+The io priority feature enables users to io nice processes or process groups,
+similar to what has been possible with cpu scheduling for ages. Support for io
+priorities is io scheduler dependent and currently supported by bfq and
+mq-deadline.
Scheduling classes
------------------
-CFQ implements three generic scheduling classes that determine how io is
-served for a process.
+Three generic scheduling classes are implemented for io priorities that
+determine how io is served for a process.
IOPRIO_CLASS_RT: This is the realtime io class. This scheduling class is given
higher priority than any other in the system, processes from this class are
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index feef615e2c9c..c9a16fba58b9 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -336,7 +336,7 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
if (nr_vecs > BIO_MAX_VECS)
return -E2BIG;
if (nr_vecs > UIO_FASTIOV) {
- bvec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
+ bvec = kcalloc(nr_vecs, sizeof(*bvec), GFP_KERNEL);
if (!bvec)
return -ENOMEM;
pages = NULL;
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e303fd317313..ff93c385ba5a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -300,7 +300,7 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
* @disk: gendisk the new blkg is associated with
* @gfp_mask: allocation mask to use
*
- * Allocate a new blkg assocating @blkcg and @q.
+ * Allocate a new blkg associating @blkcg and @disk.
*/
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
gfp_t gfp_mask)
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 089fcb9cfce3..c8beec6d7df0 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -1261,7 +1261,7 @@ static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
{
struct ioc *ioc = iocg->ioc;
- u64 last_period, cur_period;
+ u64 __maybe_unused last_period, cur_period;
u64 vtime, vtarget;
int i;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 5cbeb9344f2f..94668e72ab09 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -479,23 +479,6 @@ out:
return res;
}
-static int hctx_run_show(void *data, struct seq_file *m)
-{
- struct blk_mq_hw_ctx *hctx = data;
-
- seq_printf(m, "%lu\n", hctx->run);
- return 0;
-}
-
-static ssize_t hctx_run_write(void *data, const char __user *buf, size_t count,
- loff_t *ppos)
-{
- struct blk_mq_hw_ctx *hctx = data;
-
- hctx->run = 0;
- return count;
-}
-
static int hctx_active_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
@@ -624,7 +607,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
{"tags_bitmap", 0400, hctx_tags_bitmap_show},
{"sched_tags", 0400, hctx_sched_tags_show},
{"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show},
- {"run", 0600, hctx_run_show, hctx_run_write},
{"active", 0400, hctx_active_show},
{"dispatch_busy", 0400, hctx_dispatch_busy_show},
{"type", 0400, hctx_type_show},
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 67c95f31b15b..451a2c1f1f32 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -324,8 +324,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
return;
- hctx->run++;
-
/*
* A return of -EAGAIN is an indication that hctx->dispatch is not
* empty and we must run again in order to avoid starving flushes.
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c11c97afa0bc..aa87fcfda1ec 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -772,11 +772,16 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
/*
* Partial zone append completions cannot be supported as the
* BIO fragments may end up not being written sequentially.
+ * For such case, force the completed nbytes to be equal to
+ * the BIO size so that bio_advance() sets the BIO remaining
+ * size to 0 and we end up calling bio_endio() before returning.
*/
- if (bio->bi_iter.bi_size != nbytes)
+ if (bio->bi_iter.bi_size != nbytes) {
bio->bi_status = BLK_STS_IOERR;
- else
+ nbytes = bio->bi_iter.bi_size;
+ } else {
bio->bi_iter.bi_sector = rq->__sector;
+ }
}
bio_advance(bio, nbytes);
@@ -1860,6 +1865,22 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
__add_wait_queue(wq, wait);
/*
+ * Add one explicit barrier since blk_mq_get_driver_tag() may
+ * not imply barrier in case of failure.
+ *
+ * Order adding us to wait queue and allocating driver tag.
+ *
+ * The pair is the one implied in sbitmap_queue_wake_up() which
+ * orders clearing sbitmap tag bits and waitqueue_active() in
+ * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless
+ *
+ * Otherwise, re-order of adding wait queue and getting driver tag
+ * may cause __sbitmap_queue_wake_up() to wake up nothing because
+ * the waitqueue_active() may not observe us in wait queue.
+ */
+ smp_mb();
+
+ /*
* It's possible that a tag was freed in the window between the
* allocation failure and adding the hardware queue to the wait
* queue.
@@ -2891,8 +2912,11 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
return NULL;
}
-/* return true if this @rq can be used for @bio */
-static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug,
+/*
+ * Check if we can use the passed on request for submitting the passed in bio,
+ * and remove it from the request list if it can be used.
+ */
+static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
struct bio *bio)
{
enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf);
@@ -2952,12 +2976,6 @@ void blk_mq_submit_bio(struct bio *bio)
blk_status_t ret;
bio = blk_queue_bounce(bio, q);
- if (bio_may_exceed_limits(bio, &q->limits)) {
- bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
- if (!bio)
- return;
- }
-
bio_set_ioprio(bio);
if (plug) {
@@ -2966,16 +2984,26 @@ void blk_mq_submit_bio(struct bio *bio)
rq = NULL;
}
if (rq) {
+ if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
+ bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
+ if (!bio)
+ return;
+ }
if (!bio_integrity_prep(bio))
return;
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
return;
- if (blk_mq_can_use_cached_rq(rq, plug, bio))
+ if (blk_mq_use_cached_rq(rq, plug, bio))
goto done;
percpu_ref_get(&q->q_usage_counter);
} else {
if (unlikely(bio_queue_enter(bio)))
return;
+ if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
+ bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
+ if (!bio)
+ goto fail;
+ }
if (!bio_integrity_prep(bio))
goto fail;
}
diff --git a/block/ioprio.c b/block/ioprio.c
index b5a942519a79..73301a261429 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -139,32 +139,6 @@ out:
return ret;
}
-/*
- * If the task has set an I/O priority, use that. Otherwise, return
- * the default I/O priority.
- *
- * Expected to be called for current task or with task_lock() held to keep
- * io_context stable.
- */
-int __get_task_ioprio(struct task_struct *p)
-{
- struct io_context *ioc = p->io_context;
- int prio;
-
- if (p != current)
- lockdep_assert_held(&p->alloc_lock);
- if (ioc)
- prio = ioc->ioprio;
- else
- prio = IOPRIO_DEFAULT;
-
- if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
- prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
- task_nice_ioprio(p));
- return prio;
-}
-EXPORT_SYMBOL_GPL(__get_task_ioprio);
-
static int get_task_ioprio(struct task_struct *p)
{
int ret;
diff --git a/block/partitions/core.c b/block/partitions/core.c
index e6ac73617f3e..cab0d76a828e 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -562,8 +562,8 @@ static bool blk_add_partition(struct gendisk *disk,
part = add_partition(disk, p, from, size, state->parts[p].flags,
&state->parts[p].info);
if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) {
- printk(KERN_ERR " %s: p%d could not be added: %ld\n",
- disk->disk_name, p, -PTR_ERR(part));
+ printk(KERN_ERR " %s: p%d could not be added: %pe\n",
+ disk->disk_name, p, part);
return true;
}
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 146b32fa7b47..f8145499da38 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -165,39 +165,37 @@ static loff_t get_loop_size(struct loop_device *lo, struct file *file)
return get_size(lo->lo_offset, lo->lo_sizelimit, file);
}
+/*
+ * We support direct I/O only if lo_offset is aligned with the logical I/O size
+ * of backing device, and the logical block size of loop is bigger than that of
+ * the backing device.
+ */
+static bool lo_bdev_can_use_dio(struct loop_device *lo,
+ struct block_device *backing_bdev)
+{
+ unsigned short sb_bsize = bdev_logical_block_size(backing_bdev);
+
+ if (queue_logical_block_size(lo->lo_queue) < sb_bsize)
+ return false;
+ if (lo->lo_offset & (sb_bsize - 1))
+ return false;
+ return true;
+}
+
static void __loop_update_dio(struct loop_device *lo, bool dio)
{
struct file *file = lo->lo_backing_file;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- unsigned short sb_bsize = 0;
- unsigned dio_align = 0;
+ struct inode *inode = file->f_mapping->host;
+ struct block_device *backing_bdev = NULL;
bool use_dio;
- if (inode->i_sb->s_bdev) {
- sb_bsize = bdev_logical_block_size(inode->i_sb->s_bdev);
- dio_align = sb_bsize - 1;
- }
+ if (S_ISBLK(inode->i_mode))
+ backing_bdev = I_BDEV(inode);
+ else if (inode->i_sb->s_bdev)
+ backing_bdev = inode->i_sb->s_bdev;
- /*
- * We support direct I/O only if lo_offset is aligned with the
- * logical I/O size of backing device, and the logical block
- * size of loop is bigger than the backing device's.
- *
- * TODO: the above condition may be loosed in the future, and
- * direct I/O may be switched runtime at that time because most
- * of requests in sane applications should be PAGE_SIZE aligned
- */
- if (dio) {
- if (queue_logical_block_size(lo->lo_queue) >= sb_bsize &&
- !(lo->lo_offset & dio_align) &&
- (file->f_mode & FMODE_CAN_ODIRECT))
- use_dio = true;
- else
- use_dio = false;
- } else {
- use_dio = false;
- }
+ use_dio = dio && (file->f_mode & FMODE_CAN_ODIRECT) &&
+ (!backing_bdev || lo_bdev_can_use_dio(lo, backing_bdev));
if (lo->use_dio == use_dio)
return;
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 4e72ec4e25ac..33a8f37bb6a1 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -508,7 +508,7 @@ static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send,
struct iov_iter *iter, int msg_flags, int *sent)
{
int result;
- struct msghdr msg;
+ struct msghdr msg = {} ;
unsigned int noreclaim_flag;
if (unlikely(!sock)) {
@@ -524,10 +524,6 @@ static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send,
do {
sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
sock->sk->sk_use_task_frag = false;
- msg.msg_name = NULL;
- msg.msg_namelen = 0;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
msg.msg_flags = msg_flags | MSG_NOSIGNAL;
if (send)
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 9f7695f00c2d..36755f263e8e 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1840,7 +1840,7 @@ static void null_del_dev(struct nullb *nullb)
dev = nullb->dev;
- ida_simple_remove(&nullb_indexes, nullb->index);
+ ida_free(&nullb_indexes, nullb->index);
list_del_init(&nullb->list);
@@ -2174,7 +2174,7 @@ static int null_add_dev(struct nullb_device *dev)
blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q);
mutex_lock(&lock);
- rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
+ rv = ida_alloc(&nullb_indexes, GFP_KERNEL);
if (rv < 0) {
mutex_unlock(&lock);
goto out_cleanup_zone;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 3b6b9abb8ce1..5bf98fd6a651 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -367,8 +367,6 @@ static void virtblk_done(struct virtqueue *vq)
blk_mq_complete_request(req);
req_done = true;
}
- if (unlikely(virtqueue_is_broken(vq)))
- break;
} while (!virtqueue_enable_cb(vq));
/* In case queue is stopped waiting for more buffers. */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0a2bd72a6d76..2266358d8074 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8132,6 +8132,19 @@ static void status_unused(struct seq_file *seq)
seq_printf(seq, "\n");
}
+static void status_personalities(struct seq_file *seq)
+{
+ struct md_personality *pers;
+
+ seq_puts(seq, "Personalities : ");
+ spin_lock(&pers_lock);
+ list_for_each_entry(pers, &pers_list, list)
+ seq_printf(seq, "[%s] ", pers->name);
+
+ spin_unlock(&pers_lock);
+ seq_puts(seq, "\n");
+}
+
static int status_resync(struct seq_file *seq, struct mddev *mddev)
{
sector_t max_sectors, resync, res;
@@ -8273,20 +8286,10 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
static void *md_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(&all_mddevs_lock)
{
- struct md_personality *pers;
-
- seq_puts(seq, "Personalities : ");
- spin_lock(&pers_lock);
- list_for_each_entry(pers, &pers_list, list)
- seq_printf(seq, "[%s] ", pers->name);
-
- spin_unlock(&pers_lock);
- seq_puts(seq, "\n");
seq->poll_event = atomic_read(&md_event_count);
-
spin_lock(&all_mddevs_lock);
- return seq_list_start(&all_mddevs, *pos);
+ return seq_list_start_head(&all_mddevs, *pos);
}
static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -8297,16 +8300,23 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static void md_seq_stop(struct seq_file *seq, void *v)
__releases(&all_mddevs_lock)
{
- status_unused(seq);
spin_unlock(&all_mddevs_lock);
}
static int md_seq_show(struct seq_file *seq, void *v)
{
- struct mddev *mddev = list_entry(v, struct mddev, all_mddevs);
+ struct mddev *mddev;
sector_t sectors;
struct md_rdev *rdev;
+ if (v == &all_mddevs) {
+ status_personalities(seq);
+ if (list_empty(&all_mddevs))
+ status_unused(seq);
+ return 0;
+ }
+
+ mddev = list_entry(v, struct mddev, all_mddevs);
if (!mddev_get(mddev))
return 0;
@@ -8382,6 +8392,10 @@ static int md_seq_show(struct seq_file *seq, void *v)
}
spin_unlock(&mddev->lock);
spin_lock(&all_mddevs_lock);
+
+ if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
+ status_unused(seq);
+
if (atomic_dec_and_test(&mddev->active))
__mddev_put(mddev);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index aaa434f0c175..24f0d799fd98 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1968,12 +1968,12 @@ static void end_sync_write(struct bio *bio)
}
static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
- int sectors, struct page *page, int rw)
+ int sectors, struct page *page, blk_opf_t rw)
{
if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
/* success */
return 1;
- if (rw == WRITE) {
+ if (rw == REQ_OP_WRITE) {
set_bit(WriteErrorSeen, &rdev->flags);
if (!test_and_set_bit(WantReplacement,
&rdev->flags))
@@ -2090,7 +2090,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
rdev = conf->mirrors[d].rdev;
if (r1_sync_page_io(rdev, sect, s,
pages[idx],
- WRITE) == 0) {
+ REQ_OP_WRITE) == 0) {
r1_bio->bios[d]->bi_end_io = NULL;
rdev_dec_pending(rdev, mddev);
}
@@ -2105,7 +2105,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
rdev = conf->mirrors[d].rdev;
if (r1_sync_page_io(rdev, sect, s,
pages[idx],
- READ) != 0)
+ REQ_OP_READ) != 0)
atomic_add(s, &rdev->corrected_errors);
}
sectors -= s;
@@ -2321,7 +2321,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
!test_bit(Faulty, &rdev->flags)) {
atomic_inc(&rdev->nr_pending);
r1_sync_page_io(rdev, sect, s,
- conf->tmppage, WRITE);
+ conf->tmppage, REQ_OP_WRITE);
rdev_dec_pending(rdev, mddev);
}
}
@@ -2335,7 +2335,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
!test_bit(Faulty, &rdev->flags)) {
atomic_inc(&rdev->nr_pending);
if (r1_sync_page_io(rdev, sect, s,
- conf->tmppage, READ)) {
+ conf->tmppage, REQ_OP_READ)) {
atomic_add(s, &rdev->corrected_errors);
pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %pg)\n",
mdname(mddev), s,
diff --git a/drivers/nvme/common/keyring.c b/drivers/nvme/common/keyring.c
index ee341b83eeba..a5c0431c101c 100644
--- a/drivers/nvme/common/keyring.c
+++ b/drivers/nvme/common/keyring.c
@@ -111,7 +111,7 @@ static struct key *nvme_tls_psk_lookup(struct key *keyring,
* should be preferred to 'generated' PSKs,
* and SHA-384 should be preferred to SHA-256.
*/
-struct nvme_tls_psk_priority_list {
+static struct nvme_tls_psk_priority_list {
bool generated;
enum nvme_tcp_tls_cipher cipher;
} nvme_tls_psk_prio[] = {
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0af612387083..85ab0fcf9e88 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1740,13 +1740,13 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
struct nvme_ns_head *head)
{
struct request_queue *queue = disk->queue;
- u32 size = queue_logical_block_size(queue);
+ u32 max_discard_sectors;
- if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX))
- ctrl->max_discard_sectors =
- nvme_lba_to_sect(head, ctrl->dmrsl);
-
- if (ctrl->max_discard_sectors == 0) {
+ if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX)) {
+ max_discard_sectors = nvme_lba_to_sect(head, ctrl->dmrsl);
+ } else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
+ max_discard_sectors = UINT_MAX;
+ } else {
blk_queue_max_discard_sectors(queue, 0);
return;
}
@@ -1754,14 +1754,22 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
NVME_DSM_MAX_RANGES);
- queue->limits.discard_granularity = size;
-
- /* If discard is already enabled, don't reset queue limits */
+ /*
+ * If discard is already enabled, don't reset queue limits.
+ *
+ * This works around the fact that the block layer can't cope well with
+ * updating the hardware limits when overridden through sysfs. This is
+ * harmless because discard limits in NVMe are purely advisory.
+ */
if (queue->limits.max_discard_sectors)
return;
- blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors);
- blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);
+ blk_queue_max_discard_sectors(queue, max_discard_sectors);
+ if (ctrl->dmrl)
+ blk_queue_max_discard_segments(queue, ctrl->dmrl);
+ else
+ blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
+ queue->limits.discard_granularity = queue_logical_block_size(queue);
if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
@@ -2930,14 +2938,6 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
struct nvme_id_ctrl_nvm *id;
int ret;
- if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
- ctrl->max_discard_sectors = UINT_MAX;
- ctrl->max_discard_segments = NVME_DSM_MAX_RANGES;
- } else {
- ctrl->max_discard_sectors = 0;
- ctrl->max_discard_segments = 0;
- }
-
/*
* Even though NVMe spec explicitly states that MDTS is not applicable
* to the write-zeroes, we are cautious and limit the size to the
@@ -2967,8 +2967,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
if (ret)
goto free_data;
- if (id->dmrl)
- ctrl->max_discard_segments = id->dmrl;
+ ctrl->dmrl = id->dmrl;
ctrl->dmrsl = le32_to_cpu(id->dmrsl);
if (id->wzsl)
ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 4be7f6822966..030c80818240 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -303,14 +303,13 @@ struct nvme_ctrl {
u32 max_hw_sectors;
u32 max_segments;
u32 max_integrity_segments;
- u32 max_discard_sectors;
- u32 max_discard_segments;
u32 max_zeroes_sectors;
#ifdef CONFIG_BLK_DEV_ZONED
u32 max_zone_append;
#endif
u16 crdt[3];
u16 oncs;
+ u8 dmrl;
u32 dmrsl;
u16 oacs;
u16 sqsize;
@@ -932,6 +931,10 @@ extern struct device_attribute dev_attr_ana_grpid;
extern struct device_attribute dev_attr_ana_state;
extern struct device_attribute subsys_attr_iopolicy;
+static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
+{
+ return disk->fops == &nvme_ns_head_ops;
+}
#else
#define multipath false
static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
@@ -1009,6 +1012,10 @@ static inline void nvme_mpath_start_request(struct request *rq)
static inline void nvme_mpath_end_request(struct request *rq)
{
}
+static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
+{
+ return false;
+}
#endif /* CONFIG_NVME_MULTIPATH */
int nvme_revalidate_zones(struct nvme_ns *ns);
@@ -1037,7 +1044,10 @@ static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
{
- return dev_to_disk(dev)->private_data;
+ struct gendisk *disk = dev_to_disk(dev);
+
+ WARN_ON(nvme_disk_is_ns_head(disk));
+ return disk->private_data;
}
#ifdef CONFIG_NVME_HWMON
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 61af7ff1a9d6..c1d6357ec98a 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1284,6 +1284,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
struct request *abort_req;
struct nvme_command cmd = { };
u32 csts = readl(dev->bar + NVME_REG_CSTS);
+ u8 opcode;
/* If PCI error recovery process is happening, we cannot reset or
* the recovery mechanism will surely fail.
@@ -1310,8 +1311,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT) {
dev_warn(dev->ctrl.device,
- "I/O %d QID %d timeout, completion polled\n",
- req->tag, nvmeq->qid);
+ "I/O tag %d (%04x) QID %d timeout, completion polled\n",
+ req->tag, nvme_cid(req), nvmeq->qid);
return BLK_EH_DONE;
}
@@ -1327,8 +1328,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
fallthrough;
case NVME_CTRL_DELETING:
dev_warn_ratelimited(dev->ctrl.device,
- "I/O %d QID %d timeout, disable controller\n",
- req->tag, nvmeq->qid);
+ "I/O tag %d (%04x) QID %d timeout, disable controller\n",
+ req->tag, nvme_cid(req), nvmeq->qid);
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
nvme_dev_disable(dev, true);
return BLK_EH_DONE;
@@ -1343,10 +1344,12 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
* command was already aborted once before and still hasn't been
* returned to the driver, or if this is the admin queue.
*/
+ opcode = nvme_req(req)->cmd->common.opcode;
if (!nvmeq->qid || iod->aborted) {
dev_warn(dev->ctrl.device,
- "I/O %d QID %d timeout, reset controller\n",
- req->tag, nvmeq->qid);
+ "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
+ req->tag, nvme_cid(req), opcode,
+ nvme_opcode_str(nvmeq->qid, opcode, 0), nvmeq->qid);
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
goto disable;
}
@@ -1362,10 +1365,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
dev_warn(nvmeq->dev->ctrl.device,
- "I/O %d (%s) QID %d timeout, aborting\n",
- req->tag,
- nvme_get_opcode_str(nvme_req(req)->cmd->common.opcode),
- nvmeq->qid);
+ "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, aborting req_op:%s(%u) size:%u\n",
+ req->tag, nvme_cid(req), opcode, nvme_get_opcode_str(opcode),
+ nvmeq->qid, blk_op_str(req_op(req)), req_op(req),
+ blk_rq_bytes(req));
abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd),
BLK_MQ_REQ_NOWAIT);
@@ -2743,10 +2746,10 @@ static void nvme_reset_work(struct work_struct *work)
* controller around but remove all namespaces.
*/
if (dev->online_queues > 1) {
+ nvme_dbbuf_set(dev);
nvme_unquiesce_io_queues(&dev->ctrl);
nvme_wait_freeze(&dev->ctrl);
nvme_pci_update_nr_queues(dev);
- nvme_dbbuf_set(dev);
nvme_unfreeze(&dev->ctrl);
} else {