summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2019-12-06 10:08:59 -0800
committer Linus Torvalds <torvalds@linux-foundation.org> 2019-12-06 10:08:59 -0800
commit 9feb1af97e7366b512ecb9e4dd61d3252074cda3 (patch)
tree b94821803bc3c5a69b3132e82ab6ddd1fcbcbcc0
parent 0aecba6173216931c436a03183f4759a4fd4c2f2 (diff)
parent 8539429917c48c994d2e2cafa02ab06587b3b42c (diff)
download linux-9feb1af97e7366b512ecb9e4dd61d3252074cda3.tar.gz
linux-9feb1af97e7366b512ecb9e4dd61d3252074cda3.tar.bz2
linux-9feb1af97e7366b512ecb9e4dd61d3252074cda3.zip
Merge tag 'for-linus-20191205' of git://git.kernel.dk/linux-block
Pull more block and io_uring updates from Jens Axboe:
 "I wasn't expecting this to be so big, and if I was, I would have used
  separate branches for this. Going forward I'll be doing separate
  branches for the current tree, just like for the next kernel version
  tree. In any case, this contains:

   - Series from Christoph that fixes an inherent race condition with
     zoned devices and revalidation.

   - null_blk zone size fix (Damien)

   - Fix for a regression in this merge window that caused busy spins by
     sending empty disk uevents (Eric)

   - Fix for a regression in this merge window for bfq stats (Hou)

   - Fix for io_uring creds allocation failure handling (me)

   - io_uring -ERESTARTSYS send/recvmsg fix (me)

   - Series that fixes the need for applications to retain state across
     async request punts for io_uring. This one is a bit larger than I
     would have hoped, but I think it's important we get this fixed for
     5.5.

   - connect(2) improvement for io_uring, handling EINPROGRESS instead
     of having applications needing to poll for it (me)

   - Have io_uring use a hash for poll requests instead of an rbtree.
     This turned out to work much better in practice, so I think we
     should make the switch now. For some workloads, even with a fair
     amount of cancellations, the insertion sort is just too expensive.
     (me)

   - Various little io_uring fixes (me, Jackie, Pavel, LimingWu)

   - Fix for brd unaligned IO, and a warning for the future (Ming)

   - Fix for a bio integrity data leak (Justin)

   - bvec_iter_advance() improvement (Pavel)

   - Xen blkback page unmap fix (SeongJae)

  The major items in here are all well tested, and on the liburing side
  we continue to add regression and feature test cases. We're up to 50
  topic cases now, each with anywhere from 1 to more than 10 cases in
  each"

* tag 'for-linus-20191205' of git://git.kernel.dk/linux-block: (33 commits)
  block: fix memleak of bio integrity data
  io_uring: fix a typo in a comment
  bfq-iosched: Ensure bio->bi_blkg is valid before using it
  io_uring: hook all linked requests via link_list
  io_uring: fix error handling in io_queue_link_head
  io_uring: use hash table for poll command lookups
  io-wq: clear node->next on list deletion
  io_uring: ensure deferred timeouts copy necessary data
  io_uring: allow IO_SQE_* flags on IORING_OP_TIMEOUT
  null_blk: remove unused variable warning on !CONFIG_BLK_DEV_ZONED
  brd: warn on un-aligned buffer
  brd: remove max_hw_sectors queue limit
  xen/blkback: Avoid unmapping unmapped grant pages
  io_uring: handle connect -EINPROGRESS like -EAGAIN
  block: set the zone size in blk_revalidate_disk_zones atomically
  block: don't handle bio based drivers in blk_revalidate_disk_zones
  block: allocate the zone bitmaps lazily
  block: replace seq_zones_bitmap with conv_zones_bitmap
  block: simplify blkdev_nr_zones
  block: remove the empty line at the end of blk-zoned.c
  ...
-rw-r--r-- block/bfq-cgroup.c 3
-rw-r--r-- block/bio-integrity.c 2
-rw-r--r-- block/bio.c 3
-rw-r--r-- block/blk-zoned.c 149
-rw-r--r-- block/blk.h 4
-rw-r--r-- block/ioctl.c 2
-rw-r--r-- drivers/block/brd.c 5
-rw-r--r-- drivers/block/null_blk_main.c 40
-rw-r--r-- drivers/block/xen-blkback/blkback.c 2
-rw-r--r-- drivers/md/dm-table.c 12
-rw-r--r-- drivers/md/dm-zoned-target.c 2
-rw-r--r-- drivers/scsi/sd_zbc.c 2
-rw-r--r-- fs/block_dev.c 2
-rw-r--r-- fs/io-wq.c 2
-rw-r--r-- fs/io-wq.h 11
-rw-r--r-- fs/io_uring.c 694
-rw-r--r-- include/linux/blkdev.h 24
-rw-r--r-- include/linux/bvec.h 22
-rw-r--r-- include/linux/socket.h 20
-rw-r--r-- include/uapi/linux/io_uring.h 1
-rw-r--r-- net/socket.c 76
21 files changed, 672 insertions, 406 deletions
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index cea0ae12f937..e1419edde2ec 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -351,6 +351,9 @@ void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq)
{
struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg);
+ if (!bfqg)
+ return;
+
blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq));
blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1);
}
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index fb95dbb21dd8..bf62c25cde8f 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -87,7 +87,7 @@ EXPORT_SYMBOL(bio_integrity_alloc);
* Description: Used to free the integrity portion of a bio. Usually
* called from bio_free().
*/
-static void bio_integrity_free(struct bio *bio)
+void bio_integrity_free(struct bio *bio)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
struct bio_set *bs = bio->bi_pool;
diff --git a/block/bio.c b/block/bio.c
index b1170ec18464..9d54aa37ce6c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -233,6 +233,9 @@ fallback:
void bio_uninit(struct bio *bio)
{
bio_disassociate_blkg(bio);
+
+ if (bio_integrity(bio))
+ bio_integrity_free(bio);
}
EXPORT_SYMBOL(bio_uninit);
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 6fad6f3f6980..d00fcfd71dfe 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -70,30 +70,20 @@ void __blk_req_zone_write_unlock(struct request *rq)
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
-static inline unsigned int __blkdev_nr_zones(struct request_queue *q,
- sector_t nr_sectors)
-{
- sector_t zone_sectors = blk_queue_zone_sectors(q);
-
- return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors);
-}
-
/**
* blkdev_nr_zones - Get number of zones
- * @bdev: Target block device
+ * @disk: Target gendisk
*
- * Description:
- * Return the total number of zones of a zoned block device.
- * For a regular block device, the number of zones is always 0.
+ * Return the total number of zones of a zoned block device. For a block
+ * device without zone capabilities, the number of zones is always 0.
*/
-unsigned int blkdev_nr_zones(struct block_device *bdev)
+unsigned int blkdev_nr_zones(struct gendisk *disk)
{
- struct request_queue *q = bdev_get_queue(bdev);
+ sector_t zone_sectors = blk_queue_zone_sectors(disk->queue);
- if (!blk_queue_is_zoned(q))
+ if (!blk_queue_is_zoned(disk->queue))
return 0;
-
- return __blkdev_nr_zones(q, get_capacity(bdev->bd_disk));
+ return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors);
}
EXPORT_SYMBOL_GPL(blkdev_nr_zones);
@@ -342,16 +332,18 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node,
void blk_queue_free_zone_bitmaps(struct request_queue *q)
{
- kfree(q->seq_zones_bitmap);
- q->seq_zones_bitmap = NULL;
+ kfree(q->conv_zones_bitmap);
+ q->conv_zones_bitmap = NULL;
kfree(q->seq_zones_wlock);
q->seq_zones_wlock = NULL;
}
struct blk_revalidate_zone_args {
struct gendisk *disk;
- unsigned long *seq_zones_bitmap;
+ unsigned long *conv_zones_bitmap;
unsigned long *seq_zones_wlock;
+ unsigned int nr_zones;
+ sector_t zone_sectors;
sector_t sector;
};
@@ -364,25 +356,33 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
struct blk_revalidate_zone_args *args = data;
struct gendisk *disk = args->disk;
struct request_queue *q = disk->queue;
- sector_t zone_sectors = blk_queue_zone_sectors(q);
sector_t capacity = get_capacity(disk);
/*
* All zones must have the same size, with the exception on an eventual
* smaller last zone.
*/
- if (zone->start + zone_sectors < capacity &&
- zone->len != zone_sectors) {
- pr_warn("%s: Invalid zoned device with non constant zone size\n",
- disk->disk_name);
- return false;
- }
+ if (zone->start == 0) {
+ if (zone->len == 0 || !is_power_of_2(zone->len)) {
+ pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n",
+ disk->disk_name, zone->len);
+ return -ENODEV;
+ }
- if (zone->start + zone->len >= capacity &&
- zone->len > zone_sectors) {
- pr_warn("%s: Invalid zoned device with larger last zone size\n",
- disk->disk_name);
- return -ENODEV;
+ args->zone_sectors = zone->len;
+ args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len);
+ } else if (zone->start + args->zone_sectors < capacity) {
+ if (zone->len != args->zone_sectors) {
+ pr_warn("%s: Invalid zoned device with non constant zone size\n",
+ disk->disk_name);
+ return -ENODEV;
+ }
+ } else {
+ if (zone->len > args->zone_sectors) {
+ pr_warn("%s: Invalid zoned device with larger last zone size\n",
+ disk->disk_name);
+ return -ENODEV;
+ }
}
/* Check for holes in the zone report */
@@ -395,8 +395,22 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
/* Check zone type */
switch (zone->type) {
case BLK_ZONE_TYPE_CONVENTIONAL:
+ if (!args->conv_zones_bitmap) {
+ args->conv_zones_bitmap =
+ blk_alloc_zone_bitmap(q->node, args->nr_zones);
+ if (!args->conv_zones_bitmap)
+ return -ENOMEM;
+ }
+ set_bit(idx, args->conv_zones_bitmap);
+ break;
case BLK_ZONE_TYPE_SEQWRITE_REQ:
case BLK_ZONE_TYPE_SEQWRITE_PREF:
+ if (!args->seq_zones_wlock) {
+ args->seq_zones_wlock =
+ blk_alloc_zone_bitmap(q->node, args->nr_zones);
+ if (!args->seq_zones_wlock)
+ return -ENOMEM;
+ }
break;
default:
pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
@@ -404,78 +418,54 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
return -ENODEV;
}
- if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
- set_bit(idx, args->seq_zones_bitmap);
-
args->sector += zone->len;
return 0;
}
-static int blk_update_zone_info(struct gendisk *disk, unsigned int nr_zones,
- struct blk_revalidate_zone_args *args)
-{
- /*
- * Ensure that all memory allocations in this context are done as
- * if GFP_NOIO was specified.
- */
- unsigned int noio_flag = memalloc_noio_save();
- struct request_queue *q = disk->queue;
- int ret;
-
- args->seq_zones_wlock = blk_alloc_zone_bitmap(q->node, nr_zones);
- if (!args->seq_zones_wlock)
- return -ENOMEM;
- args->seq_zones_bitmap = blk_alloc_zone_bitmap(q->node, nr_zones);
- if (!args->seq_zones_bitmap)
- return -ENOMEM;
-
- ret = disk->fops->report_zones(disk, 0, nr_zones,
- blk_revalidate_zone_cb, args);
- memalloc_noio_restore(noio_flag);
- return ret;
-}
-
/**
* blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
* @disk: Target disk
*
* Helper function for low-level device drivers to (re) allocate and initialize
* a disk request queue zone bitmaps. This functions should normally be called
- * within the disk ->revalidate method. For BIO based queues, no zone bitmap
- * is allocated.
+ * within the disk ->revalidate method for blk-mq based drivers. For BIO based
+ * drivers only q->nr_zones needs to be updated so that the sysfs exposed value
+ * is correct.
*/
int blk_revalidate_disk_zones(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
- unsigned int nr_zones = __blkdev_nr_zones(q, get_capacity(disk));
- struct blk_revalidate_zone_args args = { .disk = disk };
- int ret = 0;
+ struct blk_revalidate_zone_args args = {
+ .disk = disk,
+ };
+ unsigned int noio_flag;
+ int ret;
if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
return -EIO;
+ if (WARN_ON_ONCE(!queue_is_mq(q)))
+ return -EIO;
/*
- * BIO based queues do not use a scheduler so only q->nr_zones
- * needs to be updated so that the sysfs exposed value is correct.
+ * Ensure that all memory allocations in this context are done as if
+ * GFP_NOIO was specified.
*/
- if (!queue_is_mq(q)) {
- q->nr_zones = nr_zones;
- return 0;
- }
-
- if (nr_zones)
- ret = blk_update_zone_info(disk, nr_zones, &args);
+ noio_flag = memalloc_noio_save();
+ ret = disk->fops->report_zones(disk, 0, UINT_MAX,
+ blk_revalidate_zone_cb, &args);
+ memalloc_noio_restore(noio_flag);
/*
- * Install the new bitmaps, making sure the queue is stopped and
- * all I/Os are completed (i.e. a scheduler is not referencing the
- * bitmaps).
+ * Install the new bitmaps and update nr_zones only once the queue is
+ * stopped and all I/Os are completed (i.e. a scheduler is not
+ * referencing the bitmaps).
*/
blk_mq_freeze_queue(q);
if (ret >= 0) {
- q->nr_zones = nr_zones;
+ blk_queue_chunk_sectors(q, args.zone_sectors);
+ q->nr_zones = args.nr_zones;
swap(q->seq_zones_wlock, args.seq_zones_wlock);
- swap(q->seq_zones_bitmap, args.seq_zones_bitmap);
+ swap(q->conv_zones_bitmap, args.conv_zones_bitmap);
ret = 0;
} else {
pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
@@ -484,8 +474,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
blk_mq_unfreeze_queue(q);
kfree(args.seq_zones_wlock);
- kfree(args.seq_zones_bitmap);
+ kfree(args.conv_zones_bitmap);
return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
-
diff --git a/block/blk.h b/block/blk.h
index 2bea40180b6f..6842f28c033e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -121,6 +121,7 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *);
+void bio_integrity_free(struct bio *bio);
static inline bool bio_integrity_endio(struct bio *bio)
{
if (bio_integrity(bio))
@@ -166,6 +167,9 @@ static inline bool bio_integrity_endio(struct bio *bio)
{
return true;
}
+static inline void bio_integrity_free(struct bio *bio)
+{
+}
#endif /* CONFIG_BLK_DEV_INTEGRITY */
unsigned long blk_rq_timeout(unsigned long timeout);
diff --git a/block/ioctl.c b/block/ioctl.c
index 7ac8a66c9787..5de98b97af2a 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -512,7 +512,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKGETZONESZ:
return put_uint(arg, bdev_zone_sectors(bdev));
case BLKGETNRZONES:
- return put_uint(arg, blkdev_nr_zones(bdev));
+ return put_uint(arg, blkdev_nr_zones(bdev->bd_disk));
case HDIO_GETGEO:
return blkdev_getgeo(bdev, argp);
case BLKRAGET:
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index c548a5a6c1a0..a8730cc4db10 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -297,6 +297,10 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
unsigned int len = bvec.bv_len;
int err;
+ /* Don't support un-aligned buffer */
+ WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) ||
+ (len & (SECTOR_SIZE - 1)));
+
err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
bio_op(bio), sector);
if (err)
@@ -382,7 +386,6 @@ static struct brd_device *brd_alloc(int i)
goto out_free_dev;
blk_queue_make_request(brd->brd_queue, brd_make_request);
- blk_queue_max_hw_sectors(brd->brd_queue, 1024);
/* This is so fdisk will align partitions on 4k, because of
* direct_access API needing 4k alignment, returning a PFN
diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
index 795fda576824..ae8d4bc532b0 100644
--- a/drivers/block/null_blk_main.c
+++ b/drivers/block/null_blk_main.c
@@ -1559,14 +1559,13 @@ static int init_driver_queues(struct nullb *nullb)
static int null_gendisk_register(struct nullb *nullb)
{
+ sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT;
struct gendisk *disk;
- sector_t size;
disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node);
if (!disk)
return -ENOMEM;
- size = (sector_t)nullb->dev->size * 1024 * 1024ULL;
- set_capacity(disk, size >> 9);
+ set_capacity(disk, size);
disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
disk->major = null_major;
@@ -1576,12 +1575,19 @@ static int null_gendisk_register(struct nullb *nullb)
disk->queue = nullb->q;
strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
+#ifdef CONFIG_BLK_DEV_ZONED
if (nullb->dev->zoned) {
- int ret = blk_revalidate_disk_zones(disk);
-
- if (ret != 0)
- return ret;
+ if (queue_is_mq(nullb->q)) {
+ int ret = blk_revalidate_disk_zones(disk);
+ if (ret)
+ return ret;
+ } else {
+ blk_queue_chunk_sectors(nullb->q,
+ nullb->dev->zone_size_sects);
+ nullb->q->nr_zones = blkdev_nr_zones(disk);
+ }
}
+#endif
add_disk(disk);
return 0;
@@ -1607,7 +1613,7 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
return blk_mq_alloc_tag_set(set);
}
-static void null_validate_conf(struct nullb_device *dev)
+static int null_validate_conf(struct nullb_device *dev)
{
dev->blocksize = round_down(dev->blocksize, 512);
dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
@@ -1634,6 +1640,14 @@ static void null_validate_conf(struct nullb_device *dev)
/* can not stop a queue */
if (dev->queue_mode == NULL_Q_BIO)
dev->mbps = 0;
+
+ if (dev->zoned &&
+ (!dev->zone_size || !is_power_of_2(dev->zone_size))) {
+ pr_err("zone_size must be power-of-two\n");
+ return -EINVAL;
+ }
+
+ return 0;
}
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
@@ -1666,7 +1680,9 @@ static int null_add_dev(struct nullb_device *dev)
struct nullb *nullb;
int rv;
- null_validate_conf(dev);
+ rv = null_validate_conf(dev);
+ if (rv)
+ return rv;
nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node);
if (!nullb) {
@@ -1731,7 +1747,6 @@ static int null_add_dev(struct nullb_device *dev)
if (rv)
goto out_cleanup_blk_queue;
- blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects);
nullb->q->limits.zoned = BLK_ZONED_HM;
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, nullb->q);
blk_queue_required_elevator_features(nullb->q,
@@ -1792,11 +1807,6 @@ static int __init null_init(void)
g_bs = PAGE_SIZE;
}
- if (!is_power_of_2(g_zone_size)) {
- pr_err("zone_size must be power-of-two\n");
- return -EINVAL;
- }
-
if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) {
pr_err("invalid home_node value\n");
g_home_node = NUMA_NO_NODE;
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index fd1e19f1a49f..3666afa639d1 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -936,6 +936,8 @@ next:
out_of_memory:
pr_alert("%s: out of memory\n", __func__);
put_free_pages(ring, pages_to_gnt, segs_to_map);
+ for (i = last_map; i < num; i++)
+ pages[i]->handle = BLKBACK_INVALID_HANDLE;
return -ENOMEM;
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 2ae0c1913766..0a2cc197f62b 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1954,12 +1954,14 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
/*
* For a zoned target, the number of zones should be updated for the
* correct value to be exposed in sysfs queue/nr_zones. For a BIO based
- * target, this is all that is needed. For a request based target, the
- * queue zone bitmaps must also be updated.
- * Use blk_revalidate_disk_zones() to handle this.
+ * target, this is all that is needed.
*/
- if (blk_queue_is_zoned(q))
- blk_revalidate_disk_zones(t->md->disk);
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (blk_queue_is_zoned(q)) {
+ WARN_ON_ONCE(queue_is_mq(q));
+ q->nr_zones = blkdev_nr_zones(t->md->disk);
+ }
+#endif
/* Allow reads to exceed readahead limits */
q->backing_dev_info->io_pages = limits->max_sectors >> (PAGE_SHIFT - 9);
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 4574e0dedbd6..70a1063161c0 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -727,7 +727,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path)
dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors);
dev->zone_nr_blocks_shift = ilog2(dev->zone_nr_blocks);
- dev->nr_zones = blkdev_nr_zones(dev->bdev);
+ dev->nr_zones = blkdev_nr_zones(dev->bdev->bd_disk);
dmz->dev = dev;
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 0e5ede48f045..27d72c1d4654 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -412,8 +412,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
goto err;
/* The drive satisfies the kernel restrictions: set it up */
- blk_queue_chunk_sectors(sdkp->disk->queue,
- logical_to_sectors(sdkp->device, zone_blocks));
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, sdkp->disk->queue);
blk_queue_required_elevator_features(sdkp->disk->queue,
ELEVATOR_F_ZBD_SEQ_WRITE);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ee63c2732fa2..69bf2fb6f7cd 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1531,7 +1531,7 @@ rescan:
ret = blk_add_partitions(disk, bdev);
if (ret == -EAGAIN)
goto rescan;
- } else {
+ } else if (invalidate) {
/*
* Tell userspace that the media / partition table may have
* changed.
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 91b85df0861e..74b40506c5d9 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -111,7 +111,7 @@ struct io_wq {
struct task_struct *manager;
struct user_struct *user;
- struct cred *creds;
+ const struct cred *creds;
struct mm_struct *mm;
refcount_t refs;
struct completion done;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 600e0158cba7..7c333a28e2a7 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -52,6 +52,7 @@ static inline void wq_node_del(struct io_wq_work_list *list,
list->last = prev;
if (prev)
prev->next = node->next;
+ node->next = NULL;
}
#define wq_list_for_each(pos, prv, head) \
@@ -87,7 +88,7 @@ typedef void (put_work_fn)(struct io_wq_work *);
struct io_wq_data {
struct mm_struct *mm;
struct user_struct *user;
- struct cred *creds;
+ const struct cred *creds;
get_work_fn *get_work;
put_work_fn *put_work;
@@ -118,10 +119,6 @@ static inline void io_wq_worker_sleeping(struct task_struct *tsk)
static inline void io_wq_worker_running(struct task_struct *tsk)
{
}
-#endif
+#endif /* CONFIG_IO_WQ */
-static inline bool io_wq_current_is_worker(void)
-{
- return in_task() && (current->flags & PF_IO_WORKER);
-}
-#endif
+#endif /* INTERNAL_IO_WQ_H */
diff --git a/fs/io_uring.c b/fs/io_uring.c
index ec53aa7cdc94..405be10da73d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -145,7 +145,7 @@ struct io_rings {
/*
* Number of completion events lost because the queue was full;
* this should be avoided by the application by making sure
- * there are not more requests pending thatn there is space in
+ * there are not more requests pending than there is space in
* the completion queue.
*
* Written by the kernel, shouldn't be modified by the
@@ -238,7 +238,7 @@ struct io_ring_ctx {
struct user_struct *user;
- struct cred *creds;
+ const struct cred *creds;
/* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
struct completion *completions;
@@ -275,7 +275,8 @@ struct io_ring_ctx {
* manipulate the list, hence no extra locking is needed there.
*/
struct list_head poll_list;
- struct rb_root cancel_tree;
+ struct hlist_head *cancel_hash;
+ unsigned cancel_hash_bits;
spinlock_t inflight_lock;
struct list_head inflight_list;
@@ -303,9 +304,32 @@ struct io_timeout_data {
u32 seq_offset;
};
-struct io_timeout {
- struct file *file;
- struct io_timeout_data *data;
+struct io_async_connect {
+ struct sockaddr_storage address;
+};
+
+struct io_async_msghdr {
+ struct iovec fast_iov[UIO_FASTIOV];
+ struct iovec *iov;
+ struct sockaddr __user *uaddr;
+ struct msghdr msg;
+};
+
+struct io_async_rw {
+ struct iovec fast_iov[UIO_FASTIOV];
+ struct iovec *iov;
+ ssize_t nr_segs;
+ ssize_t size;
+};
+
+struct io_async_ctx {
+ struct io_uring_sqe sqe;
+ union {
+ struct io_async_rw rw;
+ struct io_async_msghdr msg;
+ struct io_async_connect connect;
+ struct io_timeout_data timeout;
+ };
};
/*
@@ -319,10 +343,10 @@ struct io_kiocb {
struct file *file;
struct kiocb rw;
struct io_poll_iocb poll;
- struct io_timeout timeout;
};
const struct io_uring_sqe *sqe;
+ struct io_async_ctx *io;
struct file *ring_file;
int ring_fd;
bool has_user;
@@ -332,7 +356,7 @@ struct io_kiocb {
struct io_ring_ctx *ctx;
union {
struct list_head list;
- struct rb_node rb_node;
+ struct hlist_node hash_node;
};
struct list_head link_list;
unsigned int flags;
@@ -353,7 +377,6 @@ struct io_kiocb {
#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */
#define REQ_F_INFLIGHT 16384 /* on inflight list */
#define REQ_F_COMP_LOCKED 32768 /* completion under lock */
-#define REQ_F_FREE_SQE 65536 /* free sqe if not async queued */
u64 user_data;
u32 result;
u32 sequence;
@@ -422,6 +445,7 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref)
static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
+ int hash_bits;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
@@ -435,6 +459,21 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
if (!ctx->completions)
goto err;
+ /*
+ * Use 5 bits less than the max cq entries, that should give us around
+ * 32 entries per hash list if totally full and uniformly spread.
+ */
+ hash_bits = ilog2(p->cq_entries);
+ hash_bits -= 5;
+ if (hash_bits <= 0)
+ hash_bits = 1;
+ ctx->cancel_hash_bits = hash_bits;
+ ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!ctx->cancel_hash)
+ goto err;
+ __hash_init(ctx->cancel_hash, 1U << hash_bits);
+
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
goto err;
@@ -448,7 +487,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
init_waitqueue_head(&ctx->wait);
spin_lock_init(&ctx->completion_lock);
INIT_LIST_HEAD(&ctx->poll_list);
- ctx->cancel_tree = RB_ROOT;
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
init_waitqueue_head(&ctx->inflight_wait);
@@ -459,6 +497,7 @@ err:
if (ctx->fallback_req)
kmem_cache_free(req_cachep, ctx->fallback_req);
kfree(ctx->completions);
+ kfree(ctx->cancel_hash);
kfree(ctx);
return NULL;
}
@@ -592,7 +631,7 @@ static void io_kill_timeout(struct io_kiocb *req)
{
int ret;
- ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
+ ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
if (ret != -1) {
atomic_inc(&req->ctx->cq_timeouts);
list_del_init(&req->list);
@@ -806,6 +845,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
}
got_it:
+ req->io = NULL;
req->ring_file = NULL;
req->file = NULL;
req->ctx = ctx;
@@ -836,8 +876,8 @@ static void __io_free_req(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- if (req->flags & REQ_F_FREE_SQE)
- kfree(req->sqe);
+ if (req->io)
+ kfree(req->io);
if (req->file && !(req->flags & REQ_F_FIXED_FILE))
fput(req->file);
if (req->flags & REQ_F_INFLIGHT) {
@@ -849,8 +889,6 @@ static void __io_free_req(struct io_kiocb *req)
wake_up(&ctx->inflight_wait);
spin_unlock_irqrestore(&ctx->inflight_lock, flags);
}
- if (req->flags & REQ_F_TIMEOUT)
- kfree(req->timeout.data);
percpu_ref_put(&ctx->refs);
if (likely(!io_is_fallback_req(req)))
kmem_cache_free(req_cachep, req);
@@ -863,7 +901,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
struct io_ring_ctx *ctx = req->ctx;
int ret;
- ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
+ ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
if (ret != -1) {
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(ctx);
@@ -878,7 +916,6 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *nxt;
bool wake_ev = false;
/* Already got next link */
@@ -890,24 +927,21 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
* potentially happen if the chain is messed up, check to be on the
* safe side.
*/
- nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
- while (nxt) {
- list_del_init(&nxt->list);
+ while (!list_empty(&req->link_list)) {
+ struct io_kiocb *nxt = list_first_entry(&req->link_list,
+ struct io_kiocb, link_list);
- if ((req->flags & REQ_F_LINK_TIMEOUT) &&
- (nxt->flags & REQ_F_TIMEOUT)) {
+ if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
+ (nxt->flags & REQ_F_TIMEOUT))) {
+