-rw-r--r--  block/bfq-cgroup.c                   |   3
-rw-r--r--  block/bio-integrity.c                |   2
-rw-r--r--  block/bio.c                          |   3
-rw-r--r--  block/blk-zoned.c                    | 149
-rw-r--r--  block/blk.h                          |   4
-rw-r--r--  block/ioctl.c                        |   2
-rw-r--r--  drivers/block/brd.c                  |   5
-rw-r--r--  drivers/block/null_blk_main.c        |  40
-rw-r--r--  drivers/block/xen-blkback/blkback.c  |   2
-rw-r--r--  drivers/md/dm-table.c                |  12
-rw-r--r--  drivers/md/dm-zoned-target.c         |   2
-rw-r--r--  drivers/scsi/sd_zbc.c                |   2
-rw-r--r--  fs/block_dev.c                       |   2
-rw-r--r--  fs/io-wq.c                           |   2
-rw-r--r--  fs/io-wq.h                           |  11
-rw-r--r--  fs/io_uring.c                        | 694
-rw-r--r--  include/linux/blkdev.h               |  24
-rw-r--r--  include/linux/bvec.h                 |  22
-rw-r--r--  include/linux/socket.h               |  20
-rw-r--r--  include/uapi/linux/io_uring.h        |   1
-rw-r--r--  net/socket.c                         |  76
21 files changed, 672 insertions, 406 deletions
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index cea0ae12f937..e1419edde2ec 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -351,6 +351,9 @@ void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq) { struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg); + if (!bfqg) + return; + blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq)); blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1); } diff --git a/block/bio-integrity.c b/block/bio-integrity.c index fb95dbb21dd8..bf62c25cde8f 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -87,7 +87,7 @@ EXPORT_SYMBOL(bio_integrity_alloc); * Description: Used to free the integrity portion of a bio. Usually * called from bio_free(). */ -static void bio_integrity_free(struct bio *bio) +void bio_integrity_free(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_set *bs = bio->bi_pool; diff --git a/block/bio.c b/block/bio.c index b1170ec18464..9d54aa37ce6c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -233,6 +233,9 @@ fallback: void bio_uninit(struct bio *bio) { bio_disassociate_blkg(bio); + + if (bio_integrity(bio)) + bio_integrity_free(bio); } EXPORT_SYMBOL(bio_uninit); diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 6fad6f3f6980..d00fcfd71dfe 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -70,30 +70,20 @@ void __blk_req_zone_write_unlock(struct request *rq) } EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); -static inline unsigned int __blkdev_nr_zones(struct request_queue *q, - sector_t nr_sectors) -{ - sector_t zone_sectors = blk_queue_zone_sectors(q); - - return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors); -} - /** * blkdev_nr_zones - Get number of zones - * @bdev: Target block device + * @disk: Target gendisk * - * Description: - * Return the total number of zones of a zoned block device. - * For a regular block device, the number of zones is always 0. + * Return the total number of zones of a zoned block device. For a block + * device without zone capabilities, the number of zones is always 0. */ -unsigned int blkdev_nr_zones(struct block_device *bdev) +unsigned int blkdev_nr_zones(struct gendisk *disk) { - struct request_queue *q = bdev_get_queue(bdev); + sector_t zone_sectors = blk_queue_zone_sectors(disk->queue); - if (!blk_queue_is_zoned(q)) + if (!blk_queue_is_zoned(disk->queue)) return 0; - - return __blkdev_nr_zones(q, get_capacity(bdev->bd_disk)); + return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors); } EXPORT_SYMBOL_GPL(blkdev_nr_zones); @@ -342,16 +332,18 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node, void blk_queue_free_zone_bitmaps(struct request_queue *q) { - kfree(q->seq_zones_bitmap); - q->seq_zones_bitmap = NULL; + kfree(q->conv_zones_bitmap); + q->conv_zones_bitmap = NULL; kfree(q->seq_zones_wlock); q->seq_zones_wlock = NULL; } struct blk_revalidate_zone_args { struct gendisk *disk; - unsigned long *seq_zones_bitmap; + unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; + unsigned int nr_zones; + sector_t zone_sectors; sector_t sector; }; @@ -364,25 +356,33 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, struct blk_revalidate_zone_args *args = data; struct gendisk *disk = args->disk; struct request_queue *q = disk->queue; - sector_t zone_sectors = blk_queue_zone_sectors(q); sector_t capacity = get_capacity(disk); /* * All zones must have the same size, with the exception on an eventual * smaller last zone. 
*/ - if (zone->start + zone_sectors < capacity && - zone->len != zone_sectors) { - pr_warn("%s: Invalid zoned device with non constant zone size\n", - disk->disk_name); - return false; - } + if (zone->start == 0) { + if (zone->len == 0 || !is_power_of_2(zone->len)) { + pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n", + disk->disk_name, zone->len); + return -ENODEV; + } - if (zone->start + zone->len >= capacity && - zone->len > zone_sectors) { - pr_warn("%s: Invalid zoned device with larger last zone size\n", - disk->disk_name); - return -ENODEV; + args->zone_sectors = zone->len; + args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len); + } else if (zone->start + args->zone_sectors < capacity) { + if (zone->len != args->zone_sectors) { + pr_warn("%s: Invalid zoned device with non constant zone size\n", + disk->disk_name); + return -ENODEV; + } + } else { + if (zone->len > args->zone_sectors) { + pr_warn("%s: Invalid zoned device with larger last zone size\n", + disk->disk_name); + return -ENODEV; + } } /* Check for holes in the zone report */ @@ -395,8 +395,22 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, /* Check zone type */ switch (zone->type) { case BLK_ZONE_TYPE_CONVENTIONAL: + if (!args->conv_zones_bitmap) { + args->conv_zones_bitmap = + blk_alloc_zone_bitmap(q->node, args->nr_zones); + if (!args->conv_zones_bitmap) + return -ENOMEM; + } + set_bit(idx, args->conv_zones_bitmap); + break; case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: + if (!args->seq_zones_wlock) { + args->seq_zones_wlock = + blk_alloc_zone_bitmap(q->node, args->nr_zones); + if (!args->seq_zones_wlock) + return -ENOMEM; + } break; default: pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", @@ -404,78 +418,54 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, return -ENODEV; } - if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) - set_bit(idx, args->seq_zones_bitmap); - args->sector += zone->len; return 0; } -static int blk_update_zone_info(struct gendisk *disk, unsigned int nr_zones, - struct blk_revalidate_zone_args *args) -{ - /* - * Ensure that all memory allocations in this context are done as - * if GFP_NOIO was specified. - */ - unsigned int noio_flag = memalloc_noio_save(); - struct request_queue *q = disk->queue; - int ret; - - args->seq_zones_wlock = blk_alloc_zone_bitmap(q->node, nr_zones); - if (!args->seq_zones_wlock) - return -ENOMEM; - args->seq_zones_bitmap = blk_alloc_zone_bitmap(q->node, nr_zones); - if (!args->seq_zones_bitmap) - return -ENOMEM; - - ret = disk->fops->report_zones(disk, 0, nr_zones, - blk_revalidate_zone_cb, args); - memalloc_noio_restore(noio_flag); - return ret; -} - /** * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps * @disk: Target disk * * Helper function for low-level device drivers to (re) allocate and initialize * a disk request queue zone bitmaps. This functions should normally be called - * within the disk ->revalidate method. For BIO based queues, no zone bitmap - * is allocated. + * within the disk ->revalidate method for blk-mq based drivers. For BIO based + * drivers only q->nr_zones needs to be updated so that the sysfs exposed value + * is correct. 
*/ int blk_revalidate_disk_zones(struct gendisk *disk) { struct request_queue *q = disk->queue; - unsigned int nr_zones = __blkdev_nr_zones(q, get_capacity(disk)); - struct blk_revalidate_zone_args args = { .disk = disk }; - int ret = 0; + struct blk_revalidate_zone_args args = { + .disk = disk, + }; + unsigned int noio_flag; + int ret; if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) return -EIO; + if (WARN_ON_ONCE(!queue_is_mq(q))) + return -EIO; /* - * BIO based queues do not use a scheduler so only q->nr_zones - * needs to be updated so that the sysfs exposed value is correct. + * Ensure that all memory allocations in this context are done as if + * GFP_NOIO was specified. */ - if (!queue_is_mq(q)) { - q->nr_zones = nr_zones; - return 0; - } - - if (nr_zones) - ret = blk_update_zone_info(disk, nr_zones, &args); + noio_flag = memalloc_noio_save(); + ret = disk->fops->report_zones(disk, 0, UINT_MAX, + blk_revalidate_zone_cb, &args); + memalloc_noio_restore(noio_flag); /* - * Install the new bitmaps, making sure the queue is stopped and - * all I/Os are completed (i.e. a scheduler is not referencing the - * bitmaps). + * Install the new bitmaps and update nr_zones only once the queue is + * stopped and all I/Os are completed (i.e. a scheduler is not + * referencing the bitmaps). */ blk_mq_freeze_queue(q); if (ret >= 0) { - q->nr_zones = nr_zones; + blk_queue_chunk_sectors(q, args.zone_sectors); + q->nr_zones = args.nr_zones; swap(q->seq_zones_wlock, args.seq_zones_wlock); - swap(q->seq_zones_bitmap, args.seq_zones_bitmap); + swap(q->conv_zones_bitmap, args.conv_zones_bitmap); ret = 0; } else { pr_warn("%s: failed to revalidate zones\n", disk->disk_name); @@ -484,8 +474,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk) blk_mq_unfreeze_queue(q); kfree(args.seq_zones_wlock); - kfree(args.seq_zones_bitmap); + kfree(args.conv_zones_bitmap); return ret; } EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); - diff --git a/block/blk.h b/block/blk.h index 2bea40180b6f..6842f28c033e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -121,6 +121,7 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); bool __bio_integrity_endio(struct bio *); +void bio_integrity_free(struct bio *bio); static inline bool bio_integrity_endio(struct bio *bio) { if (bio_integrity(bio)) @@ -166,6 +167,9 @@ static inline bool bio_integrity_endio(struct bio *bio) { return true; } +static inline void bio_integrity_free(struct bio *bio) +{ +} #endif /* CONFIG_BLK_DEV_INTEGRITY */ unsigned long blk_rq_timeout(unsigned long timeout); diff --git a/block/ioctl.c b/block/ioctl.c index 7ac8a66c9787..5de98b97af2a 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -512,7 +512,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKGETZONESZ: return put_uint(arg, bdev_zone_sectors(bdev)); case BLKGETNRZONES: - return put_uint(arg, blkdev_nr_zones(bdev)); + return put_uint(arg, blkdev_nr_zones(bdev->bd_disk)); case HDIO_GETGEO: return blkdev_getgeo(bdev, argp); case BLKRAGET: diff --git a/drivers/block/brd.c b/drivers/block/brd.c index c548a5a6c1a0..a8730cc4db10 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -297,6 +297,10 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio) unsigned int len = bvec.bv_len; int err; + /* Don't support un-aligned buffer */ + WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) || + (len & (SECTOR_SIZE - 1))); + err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset, 
bio_op(bio), sector); if (err) @@ -382,7 +386,6 @@ static struct brd_device *brd_alloc(int i) goto out_free_dev; blk_queue_make_request(brd->brd_queue, brd_make_request); - blk_queue_max_hw_sectors(brd->brd_queue, 1024); /* This is so fdisk will align partitions on 4k, because of * direct_access API needing 4k alignment, returning a PFN diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 795fda576824..ae8d4bc532b0 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1559,14 +1559,13 @@ static int init_driver_queues(struct nullb *nullb) static int null_gendisk_register(struct nullb *nullb) { + sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT; struct gendisk *disk; - sector_t size; disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node); if (!disk) return -ENOMEM; - size = (sector_t)nullb->dev->size * 1024 * 1024ULL; - set_capacity(disk, size >> 9); + set_capacity(disk, size); disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; disk->major = null_major; @@ -1576,12 +1575,19 @@ static int null_gendisk_register(struct nullb *nullb) disk->queue = nullb->q; strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); +#ifdef CONFIG_BLK_DEV_ZONED if (nullb->dev->zoned) { - int ret = blk_revalidate_disk_zones(disk); - - if (ret != 0) - return ret; + if (queue_is_mq(nullb->q)) { + int ret = blk_revalidate_disk_zones(disk); + if (ret) + return ret; + } else { + blk_queue_chunk_sectors(nullb->q, + nullb->dev->zone_size_sects); + nullb->q->nr_zones = blkdev_nr_zones(disk); + } } +#endif add_disk(disk); return 0; @@ -1607,7 +1613,7 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) return blk_mq_alloc_tag_set(set); } -static void null_validate_conf(struct nullb_device *dev) +static int null_validate_conf(struct nullb_device *dev) { dev->blocksize = round_down(dev->blocksize, 512); dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); @@ -1634,6 +1640,14 @@ static void null_validate_conf(struct nullb_device *dev) /* can not stop a queue */ if (dev->queue_mode == NULL_Q_BIO) dev->mbps = 0; + + if (dev->zoned && + (!dev->zone_size || !is_power_of_2(dev->zone_size))) { + pr_err("zone_size must be power-of-two\n"); + return -EINVAL; + } + + return 0; } #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION @@ -1666,7 +1680,9 @@ static int null_add_dev(struct nullb_device *dev) struct nullb *nullb; int rv; - null_validate_conf(dev); + rv = null_validate_conf(dev); + if (rv) + return rv; nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node); if (!nullb) { @@ -1731,7 +1747,6 @@ static int null_add_dev(struct nullb_device *dev) if (rv) goto out_cleanup_blk_queue; - blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects); nullb->q->limits.zoned = BLK_ZONED_HM; blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, nullb->q); blk_queue_required_elevator_features(nullb->q, @@ -1792,11 +1807,6 @@ static int __init null_init(void) g_bs = PAGE_SIZE; } - if (!is_power_of_2(g_zone_size)) { - pr_err("zone_size must be power-of-two\n"); - return -EINVAL; - } - if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) { pr_err("invalid home_node value\n"); g_home_node = NUMA_NO_NODE; diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index fd1e19f1a49f..3666afa639d1 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -936,6 +936,8 @@ next: out_of_memory: pr_alert("%s: out of memory\n", __func__); 
put_free_pages(ring, pages_to_gnt, segs_to_map); + for (i = last_map; i < num; i++) + pages[i]->handle = BLKBACK_INVALID_HANDLE; return -ENOMEM; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 2ae0c1913766..0a2cc197f62b 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1954,12 +1954,14 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, /* * For a zoned target, the number of zones should be updated for the * correct value to be exposed in sysfs queue/nr_zones. For a BIO based - * target, this is all that is needed. For a request based target, the - * queue zone bitmaps must also be updated. - * Use blk_revalidate_disk_zones() to handle this. + * target, this is all that is needed. */ - if (blk_queue_is_zoned(q)) - blk_revalidate_disk_zones(t->md->disk); +#ifdef CONFIG_BLK_DEV_ZONED + if (blk_queue_is_zoned(q)) { + WARN_ON_ONCE(queue_is_mq(q)); + q->nr_zones = blkdev_nr_zones(t->md->disk); + } +#endif /* Allow reads to exceed readahead limits */ q->backing_dev_info->io_pages = limits->max_sectors >> (PAGE_SHIFT - 9); diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 4574e0dedbd6..70a1063161c0 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -727,7 +727,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path) dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors); dev->zone_nr_blocks_shift = ilog2(dev->zone_nr_blocks); - dev->nr_zones = blkdev_nr_zones(dev->bdev); + dev->nr_zones = blkdev_nr_zones(dev->bdev->bd_disk); dmz->dev = dev; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 0e5ede48f045..27d72c1d4654 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -412,8 +412,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) goto err; /* The drive satisfies the kernel restrictions: set it up */ - blk_queue_chunk_sectors(sdkp->disk->queue, - logical_to_sectors(sdkp->device, zone_blocks)); blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, sdkp->disk->queue); blk_queue_required_elevator_features(sdkp->disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE); diff --git a/fs/block_dev.c b/fs/block_dev.c index ee63c2732fa2..69bf2fb6f7cd 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1531,7 +1531,7 @@ rescan: ret = blk_add_partitions(disk, bdev); if (ret == -EAGAIN) goto rescan; - } else { + } else if (invalidate) { /* * Tell userspace that the media / partition table may have * changed. 
diff --git a/fs/io-wq.c b/fs/io-wq.c index 91b85df0861e..74b40506c5d9 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -111,7 +111,7 @@ struct io_wq { struct task_struct *manager; struct user_struct *user; - struct cred *creds; + const struct cred *creds; struct mm_struct *mm; refcount_t refs; struct completion done; diff --git a/fs/io-wq.h b/fs/io-wq.h index 600e0158cba7..7c333a28e2a7 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -52,6 +52,7 @@ static inline void wq_node_del(struct io_wq_work_list *list, list->last = prev; if (prev) prev->next = node->next; + node->next = NULL; } #define wq_list_for_each(pos, prv, head) \ @@ -87,7 +88,7 @@ typedef void (put_work_fn)(struct io_wq_work *); struct io_wq_data { struct mm_struct *mm; struct user_struct *user; - struct cred *creds; + const struct cred *creds; get_work_fn *get_work; put_work_fn *put_work; @@ -118,10 +119,6 @@ static inline void io_wq_worker_sleeping(struct task_struct *tsk) static inline void io_wq_worker_running(struct task_struct *tsk) { } -#endif +#endif /* CONFIG_IO_WQ */ -static inline bool io_wq_current_is_worker(void) -{ - return in_task() && (current->flags & PF_IO_WORKER); -} -#endif +#endif /* INTERNAL_IO_WQ_H */ diff --git a/fs/io_uring.c b/fs/io_uring.c index ec53aa7cdc94..405be10da73d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -145,7 +145,7 @@ struct io_rings { /* * Number of completion events lost because the queue was full; * this should be avoided by the application by making sure - * there are not more requests pending thatn there is space in + * there are not more requests pending than there is space in * the completion queue. * * Written by the kernel, shouldn't be modified by the @@ -238,7 +238,7 @@ struct io_ring_ctx { struct user_struct *user; - struct cred *creds; + const struct cred *creds; /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */ struct completion *completions; @@ -275,7 +275,8 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. 
*/ struct list_head poll_list; - struct rb_root cancel_tree; + struct hlist_head *cancel_hash; + unsigned cancel_hash_bits; spinlock_t inflight_lock; struct list_head inflight_list; @@ -303,9 +304,32 @@ struct io_timeout_data { u32 seq_offset; }; -struct io_timeout { - struct file *file; - struct io_timeout_data *data; +struct io_async_connect { + struct sockaddr_storage address; +}; + +struct io_async_msghdr { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov; + struct sockaddr __user *uaddr; + struct msghdr msg; +}; + +struct io_async_rw { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov; + ssize_t nr_segs; + ssize_t size; +}; + +struct io_async_ctx { + struct io_uring_sqe sqe; + union { + struct io_async_rw rw; + struct io_async_msghdr msg; + struct io_async_connect connect; + struct io_timeout_data timeout; + }; }; /* @@ -319,10 +343,10 @@ struct io_kiocb { struct file *file; struct kiocb rw; struct io_poll_iocb poll; - struct io_timeout timeout; }; const struct io_uring_sqe *sqe; + struct io_async_ctx *io; struct file *ring_file; int ring_fd; bool has_user; @@ -332,7 +356,7 @@ struct io_kiocb { struct io_ring_ctx *ctx; union { struct list_head list; - struct rb_node rb_node; + struct hlist_node hash_node; }; struct list_head link_list; unsigned int flags; @@ -353,7 +377,6 @@ struct io_kiocb { #define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ #define REQ_F_INFLIGHT 16384 /* on inflight list */ #define REQ_F_COMP_LOCKED 32768 /* completion under lock */ -#define REQ_F_FREE_SQE 65536 /* free sqe if not async queued */ u64 user_data; u32 result; u32 sequence; @@ -422,6 +445,7 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref) static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; + int hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -435,6 +459,21 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) if (!ctx->completions) goto err; + /* + * Use 5 bits less than the max cq entries, that should give us around + * 32 entries per hash list if totally full and uniformly spread. 
+ */ + hash_bits = ilog2(p->cq_entries); + hash_bits -= 5; + if (hash_bits <= 0) + hash_bits = 1; + ctx->cancel_hash_bits = hash_bits; + ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), + GFP_KERNEL); + if (!ctx->cancel_hash) + goto err; + __hash_init(ctx->cancel_hash, 1U << hash_bits); + if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) goto err; @@ -448,7 +487,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); INIT_LIST_HEAD(&ctx->poll_list); - ctx->cancel_tree = RB_ROOT; INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); init_waitqueue_head(&ctx->inflight_wait); @@ -459,6 +497,7 @@ err: if (ctx->fallback_req) kmem_cache_free(req_cachep, ctx->fallback_req); kfree(ctx->completions); + kfree(ctx->cancel_hash); kfree(ctx); return NULL; } @@ -592,7 +631,7 @@ static void io_kill_timeout(struct io_kiocb *req) { int ret; - ret = hrtimer_try_to_cancel(&req->timeout.data->timer); + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { atomic_inc(&req->ctx->cq_timeouts); list_del_init(&req->list); @@ -806,6 +845,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, } got_it: + req->io = NULL; req->ring_file = NULL; req->file = NULL; req->ctx = ctx; @@ -836,8 +876,8 @@ static void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - if (req->flags & REQ_F_FREE_SQE) - kfree(req->sqe); + if (req->io) + kfree(req->io); if (req->file && !(req->flags & REQ_F_FIXED_FILE)) fput(req->file); if (req->flags & REQ_F_INFLIGHT) { @@ -849,8 +889,6 @@ static void __io_free_req(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); } - if (req->flags & REQ_F_TIMEOUT) - kfree(req->timeout.data); percpu_ref_put(&ctx->refs); if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); @@ -863,7 +901,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = hrtimer_try_to_cancel(&req->timeout.data->timer); + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(ctx); @@ -878,7 +916,6 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *nxt; bool wake_ev = false; /* Already got next link */ @@ -890,24 +927,21 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) * potentially happen if the chain is messed up, check to be on the * safe side. 
*/ - nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); - while (nxt) { - list_del_init(&nxt->list); + while (!list_empty(&req->link_list)) { + struct io_kiocb *nxt = list_first_entry(&req->link_list, + struct io_kiocb, link_list); - if ((req->flags & REQ_F_LINK_TIMEOUT) && - (nxt->flags & REQ_F_TIMEOUT)) { + if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) && + (nxt->flags & REQ_F_TIMEOUT))) { + list_del_init(&nxt->link_list); wake_ev |= io_link_cancel_timeout(nxt); - nxt = list_first_entry_or_null(&req->link_list, - struct io_kiocb, list); req->flags &= ~REQ_F_LINK_TIMEOUT; continue; } - if (!list_empty(&req->link_list)) { - INIT_LIST_HEAD(&nxt->link_list); - list_splice(&req->link_list, &nxt->link_list); - nxt->flags |= REQ_F_LINK; - } + list_del_init(&req->link_list); + if (!list_empty(&nxt->link_list)) + nxt->flags |= REQ_F_LINK; *nxtptr = nxt; break; } @@ -923,15 +957,15 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) static void io_fail_links(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *link; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&req->link_list)) { - link = list_first_entry(&req->link_list, struct io_kiocb, list); - list_del_init(&link->list); + struct io_kiocb *link = list_first_entry(&req->link_list, + struct io_kiocb, link_list); + list_del_init(&link->link_list); trace_io_uring_fail_link(req, link); if ((req->flags & REQ_F_LINK_TIMEOUT) && @@ -1079,9 +1113,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, * completions for those, only batch free for fixed * file and non-linked commands. */ - if (((req->flags & - (REQ_F_FIXED_FILE|REQ_F_LINK|REQ_F_FREE_SQE)) == - REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) { + if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == + REQ_F_FIXED_FILE) && !io_is_fallback_req(req) && + !req->io) { reqs[to_free++] = req; if (to_free == ARRAY_SIZE(reqs)) io_free_req_many(ctx, reqs, &to_free); @@ -1410,15 +1444,6 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) if (S_ISREG(file_inode(req->file)->i_mode)) req->flags |= REQ_F_ISREG; - /* - * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so - * we know to async punt it even if it was opened O_NONBLOCK - */ - if (force_nonblock && !io_file_supports_async(req->file)) { - req->flags |= REQ_F_MUST_PUNT; - return -EAGAIN; |