author    Linus Torvalds <torvalds@linux-foundation.org>  2019-07-09 10:45:06 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2019-07-09 10:45:06 -0700
commit    3b99107f0e0298e6fe0787f75b8f3d8306dfb230
tree      30536dbc9ca176470a2ae2938f952381e33f5deb /block
parent    0415052db4f92b7e272fc15802ad8b8be672deea
parent    c9b3007feca018d3f7061f5d5a14cb00766ffe9b
Merge tag 'for-5.3/block-20190708' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
"This is the main block updates for 5.3. Nothing earth shattering or
major in here, just fixes, additions, and improvements all over the
map. This contains:
- Series of documentation fixes (Bart)
- Optimization of the blk-mq ctx get/put (Bart)
- null_blk removal race condition fix (Bob)
- req/bio_op() cleanups (Chaitanya)
- Series cleaning up the segment accounting, and request/bio mapping
(Christoph)
- Series cleaning up the page getting/putting for bios (Christoph)
- block cgroup cleanups and moving it to where it is used (Christoph)
- block cgroup fixes (Tejun)
- Series of fixes and improvements to bcache, most notably a write
deadlock fix (Coly)
- blk-iolatency STS_AGAIN and accounting fixes (Dennis)
- Series of improvements and fixes to BFQ (Douglas, Paolo)
- debugfs_create() return value check removal for drbd (Greg)
- Use struct_size(), where appropriate (Gustavo)
- Two lightnvm fixes (Heiner, Geert)
- MD fixes, including a read balance and corruption fix (Guoqing,
Marcos, Xiao, Yufen)
- block opal shadow mbr additions (Jonas, Revanth)
- sbitmap compare-and-exchange improvements (Pavel)
- Fix for potential bio->bi_size overflow (Ming)
- NVMe pull requests:
- improved PCIe suspend support (Keith Busch)
- error injection support for the admin queue (Akinobu Mita)
- Fibre Channel discovery improvements (James Smart)
- tracing improvements including nvmet tracing support (Minwoo Im)
- misc fixes and cleanups (Anton Eidelman, Minwoo Im, Chaitanya
Kulkarni)
- Various little fixes and improvements to drivers and core"
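
For context on the struct_size() conversions noted above: struct_size(),
from <linux/overflow.h>, computes the allocation size of a structure that
ends in a flexible array member, saturating rather than wrapping on integer
overflow. The sketch below only illustrates the pattern; the demo_cmd type
and demo_cmd_alloc() helper are hypothetical and not taken from any of the
converted drivers.

#include <linux/overflow.h>
#include <linux/slab.h>

struct demo_cmd {
	u32 count;
	u64 entries[];	/* flexible array member */
};

static struct demo_cmd *demo_cmd_alloc(u32 n)
{
	struct demo_cmd *cmd;

	/*
	 * struct_size(cmd, entries, n) is sizeof(*cmd) plus
	 * n * sizeof(cmd->entries[0]), saturating to SIZE_MAX on
	 * overflow so that kzalloc() fails instead of under-allocating.
	 */
	cmd = kzalloc(struct_size(cmd, entries, n), GFP_KERNEL);
	if (cmd)
		cmd->count = n;
	return cmd;
}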
* tag 'for-5.3/block-20190708' of git://git.kernel.dk/linux-block: (153 commits)
blk-iolatency: fix STS_AGAIN handling
block: nr_phys_segments needs to be zero for REQ_OP_WRITE_ZEROES
blk-mq: simplify blk_mq_make_request()
blk-mq: remove blk_mq_put_ctx()
sbitmap: Replace cmpxchg with xchg
block: fix .bi_size overflow
block: sed-opal: check size of shadow mbr
block: sed-opal: ioctl for writing to shadow mbr
block: sed-opal: add ioctl for done-mark of shadow mbr
block: never take page references for ITER_BVEC
direct-io: use bio_release_pages in dio_bio_complete
block_dev: use bio_release_pages in bio_unmap_user
block_dev: use bio_release_pages in blkdev_bio_end_io
iomap: use bio_release_pages in iomap_dio_bio_end_io
block: use bio_release_pages in bio_map_user_iov
block: use bio_release_pages in bio_unmap_user
block: optionally mark pages dirty in bio_release_pages
block: move the BIO_NO_PAGE_REF check into bio_release_pages
block: skd_main.c: Remove call to memset after dma_alloc_coherent
block: mtip32xx: Remove call to memset after dma_alloc_coherent
...
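
One item from the list above worth unpacking is "sbitmap: Replace cmpxchg
with xchg": when the value being stored does not depend on the old value
(here, clearing a whole word of bits), a compare-and-exchange retry loop can
collapse into a single unconditional exchange. Below is a minimal userspace
sketch of the idea using C11 atomics; the demo_* functions are illustrative
and are not the kernel code.

#include <stdatomic.h>

/* Clear every bit in *word, returning the bits that were set. */
static unsigned long demo_clear_cmpxchg(_Atomic unsigned long *word)
{
	unsigned long old = atomic_load(word);

	/* retry until the swap observes an unmodified value */
	while (!atomic_compare_exchange_weak(word, &old, 0UL))
		;
	return old;
}

static unsigned long demo_clear_xchg(_Atomic unsigned long *word)
{
	/*
	 * One unconditional swap does the same job: the new value (0)
	 * does not depend on the old one, so there is nothing to
	 * re-check and no retry loop is needed.
	 */
	return atomic_exchange(word, 0UL);
}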
Diffstat (limited to 'block')
 block/Kconfig.iosched  |   7
 block/bfq-cgroup.c     | 212
 block/bfq-iosched.c    | 967
 block/bfq-iosched.h    |  48
 block/bio.c            |  96
 block/blk-cgroup.c     | 139
 block/blk-core.c       | 111
 block/blk-iolatency.c  |  51
 block/blk-map.c        |  10
 block/blk-merge.c      | 112
 block/blk-mq-debugfs.c |  42
 block/blk-mq-sched.c   |  31
 block/blk-mq-sched.h   |  10
 block/blk-mq-tag.c     |   8
 block/blk-mq.c         |  44
 block/blk-mq.h         |   7
 block/blk.h            |  36
 block/genhd.c          |   5
 block/kyber-iosched.c  |   6
 block/mq-deadline.c    |   5
 block/opal_proto.h     |  16
 block/sed-opal.c       | 197
 22 files changed, 1342 insertions(+), 818 deletions(-)
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 4626b88b2d5a..7a6b2f29a582 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -36,6 +36,13 @@ config BFQ_GROUP_IOSCHED
 	  Enable hierarchical scheduling in BFQ, using the blkio
 	  (cgroups-v1) or io (cgroups-v2) controller.
 
+config BFQ_CGROUP_DEBUG
+	bool "BFQ IO controller debugging"
+	depends on BFQ_GROUP_IOSCHED
+	---help---
+	  Enable some debugging help. Currently it exports additional stat
+	  files in a cgroup which can be useful for debugging.
+
 endmenu
 
 endif
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index b3796a40a61a..0f6cd688924f 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -15,7 +15,83 @@
 
 #include "bfq-iosched.h"
 
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
+static int bfq_stat_init(struct bfq_stat *stat, gfp_t gfp)
+{
+	int ret;
+
+	ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp);
+	if (ret)
+		return ret;
+
+	atomic64_set(&stat->aux_cnt, 0);
+	return 0;
+}
+
+static void bfq_stat_exit(struct bfq_stat *stat)
+{
+	percpu_counter_destroy(&stat->cpu_cnt);
+}
+
+/**
+ * bfq_stat_add - add a value to a bfq_stat
+ * @stat: target bfq_stat
+ * @val: value to add
+ *
+ * Add @val to @stat. The caller must ensure that IRQ on the same CPU
+ * don't re-enter this function for the same counter.
+ */
+static inline void bfq_stat_add(struct bfq_stat *stat, uint64_t val)
+{
+	percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
+}
+
+/**
+ * bfq_stat_read - read the current value of a bfq_stat
+ * @stat: bfq_stat to read
+ */
+static inline uint64_t bfq_stat_read(struct bfq_stat *stat)
+{
+	return percpu_counter_sum_positive(&stat->cpu_cnt);
+}
+
+/**
+ * bfq_stat_reset - reset a bfq_stat
+ * @stat: bfq_stat to reset
+ */
+static inline void bfq_stat_reset(struct bfq_stat *stat)
+{
+	percpu_counter_set(&stat->cpu_cnt, 0);
+	atomic64_set(&stat->aux_cnt, 0);
+}
+
+/**
+ * bfq_stat_add_aux - add a bfq_stat into another's aux count
+ * @to: the destination bfq_stat
+ * @from: the source
+ *
+ * Add @from's count including the aux one to @to's aux count.
+ */
+static inline void bfq_stat_add_aux(struct bfq_stat *to,
+				    struct bfq_stat *from)
+{
+	atomic64_add(bfq_stat_read(from) + atomic64_read(&from->aux_cnt),
+		     &to->aux_cnt);
+}
+
+/**
+ * blkg_prfill_stat - prfill callback for bfq_stat
+ * @sf: seq_file to print to
+ * @pd: policy private data of interest
+ * @off: offset to the bfq_stat in @pd
+ *
+ * prfill callback for printing a bfq_stat.
+ */
+static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd,
+			    int off)
+{
+	return __blkg_prfill_u64(sf, pd, bfq_stat_read((void *)pd + off));
+}
 
 /* bfqg stats flags */
 enum bfqg_stats_flags {
@@ -53,7 +129,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
 
 	now = ktime_get_ns();
 	if (now > stats->start_group_wait_time)
-		blkg_stat_add(&stats->group_wait_time,
+		bfq_stat_add(&stats->group_wait_time,
 			      now - stats->start_group_wait_time);
 	bfqg_stats_clear_waiting(stats);
 }
@@ -82,14 +158,14 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
 
 	now = ktime_get_ns();
 	if (now > stats->start_empty_time)
-		blkg_stat_add(&stats->empty_time,
+		bfq_stat_add(&stats->empty_time,
 			      now - stats->start_empty_time);
 	bfqg_stats_clear_empty(stats);
 }
 
 void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
 {
-	blkg_stat_add(&bfqg->stats.dequeue, 1);
+	bfq_stat_add(&bfqg->stats.dequeue, 1);
 }
 
 void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
@@ -119,7 +195,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
 		u64 now = ktime_get_ns();
 
 		if (now > stats->start_idle_time)
-			blkg_stat_add(&stats->idle_time,
+			bfq_stat_add(&stats->idle_time,
 				      now - stats->start_idle_time);
 		bfqg_stats_clear_idling(stats);
 	}
@@ -137,9 +213,9 @@ void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
 {
 	struct bfqg_stats *stats = &bfqg->stats;
 
-	blkg_stat_add(&stats->avg_queue_size_sum,
+	bfq_stat_add(&stats->avg_queue_size_sum,
 		      blkg_rwstat_total(&stats->queued));
-	blkg_stat_add(&stats->avg_queue_size_samples, 1);
+	bfq_stat_add(&stats->avg_queue_size_samples, 1);
 	bfqg_stats_update_group_wait_time(stats);
 }
@@ -176,7 +252,7 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
 				io_start_time_ns - start_time_ns);
 }
 
-#else /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
+#else /* CONFIG_BFQ_CGROUP_DEBUG */
 
 void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
 			      unsigned int op) { }
@@ -190,7 +266,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
 void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
 void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
 
-#endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
 
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
@@ -274,18 +350,18 @@ void bfqg_and_blkg_put(struct bfq_group *bfqg)
 /* @stats = 0 */
 static void bfqg_stats_reset(struct bfqg_stats *stats)
 {
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
 	/* queued stats shouldn't be cleared */
 	blkg_rwstat_reset(&stats->merged);
 	blkg_rwstat_reset(&stats->service_time);
 	blkg_rwstat_reset(&stats->wait_time);
-	blkg_stat_reset(&stats->time);
-	blkg_stat_reset(&stats->avg_queue_size_sum);
-	blkg_stat_reset(&stats->avg_queue_size_samples);
-	blkg_stat_reset(&stats->dequeue);
-	blkg_stat_reset(&stats->group_wait_time);
-	blkg_stat_reset(&stats->idle_time);
-	blkg_stat_reset(&stats->empty_time);
+	bfq_stat_reset(&stats->time);
+	bfq_stat_reset(&stats->avg_queue_size_sum);
+	bfq_stat_reset(&stats->avg_queue_size_samples);
+	bfq_stat_reset(&stats->dequeue);
+	bfq_stat_reset(&stats->group_wait_time);
+	bfq_stat_reset(&stats->idle_time);
+	bfq_stat_reset(&stats->empty_time);
 #endif
 }
@@ -295,19 +371,19 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
 	if (!to || !from)
 		return;
 
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
 	/* queued stats shouldn't be cleared */
 	blkg_rwstat_add_aux(&to->merged, &from->merged);
 	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
 	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
-	blkg_stat_add_aux(&from->time, &from->time);
-	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
-	blkg_stat_add_aux(&to->avg_queue_size_samples,
-			  &from->avg_queue_size_samples);
-	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
-	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
-	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
-	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
+	bfq_stat_add_aux(&from->time, &from->time);
+	bfq_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+	bfq_stat_add_aux(&to->avg_queue_size_samples,
+			 &from->avg_queue_size_samples);
+	bfq_stat_add_aux(&to->dequeue, &from->dequeue);
+	bfq_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+	bfq_stat_add_aux(&to->idle_time, &from->idle_time);
+	bfq_stat_add_aux(&to->empty_time, &from->empty_time);
 #endif
 }
@@ -355,35 +431,35 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
 
 static void bfqg_stats_exit(struct bfqg_stats *stats)
 {
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
 	blkg_rwstat_exit(&stats->merged);
 	blkg_rwstat_exit(&stats->service_time);
 	blkg_rwstat_exit(&stats->wait_time);
 	blkg_rwstat_exit(&stats->queued);
-	blkg_stat_exit(&stats->time);
-	blkg_stat_exit(&stats->avg_queue_size_sum);
-	blkg_stat_exit(&stats->avg_queue_size_samples);
-	blkg_stat_exit(&stats->dequeue);
-	blkg_stat_exit(&stats->group_wait_time);
-	blkg_stat_exit(&stats->idle_time);
-	blkg_stat_exit(&stats->empty_time);
+	bfq_stat_exit(&stats->time);
+	bfq_stat_exit(&stats->avg_queue_size_sum);
+	bfq_stat_exit(&stats->avg_queue_size_samples);
+	bfq_stat_exit(&stats->dequeue);
+	bfq_stat_exit(&stats->group_wait_time);
+	bfq_stat_exit(&stats->idle_time);
+	bfq_stat_exit(&stats->empty_time);
 #endif
 }
 
 static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
 {
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
 	if (blkg_rwstat_init(&stats->merged, gfp) ||
 	    blkg_rwstat_init(&stats->service_time, gfp) ||
 	    blkg_rwstat_init(&stats->wait_time, gfp) ||
 	    blkg_rwstat_init(&stats->queued, gfp) ||
-	    blkg_stat_init(&stats->time, gfp) ||
-	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
-	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
-	    blkg_stat_init(&stats->dequeue, gfp) ||
-	    blkg_stat_init(&stats->group_wait_time, gfp) ||
-	    blkg_stat_init(&stats->idle_time, gfp) ||
-	    blkg_stat_init(&stats->empty_time, gfp)) {
+	    bfq_stat_init(&stats->time, gfp) ||
+	    bfq_stat_init(&stats->avg_queue_size_sum, gfp) ||
+	    bfq_stat_init(&stats->avg_queue_size_samples, gfp) ||
+	    bfq_stat_init(&stats->dequeue, gfp) ||
+	    bfq_stat_init(&stats->group_wait_time, gfp) ||
+	    bfq_stat_init(&stats->idle_time, gfp) ||
+	    bfq_stat_init(&stats->empty_time, gfp)) {
 		bfqg_stats_exit(stats);
 		return -ENOMEM;
 	}
@@ -909,7 +985,7 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
 	return ret ?: nbytes;
 }
 
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
 static int bfqg_print_stat(struct seq_file *sf, void *v)
 {
 	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
@@ -927,17 +1003,34 @@ static int bfqg_print_rwstat(struct seq_file *sf, void *v)
 static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
 				      struct blkg_policy_data *pd, int off)
 {
-	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
-					  &blkcg_policy_bfq, off);
+	struct blkcg_gq *blkg = pd_to_blkg(pd);
+	struct blkcg_gq *pos_blkg;
+	struct cgroup_subsys_state *pos_css;
+	u64 sum = 0;
+
+	lockdep_assert_held(&blkg->q->queue_lock);
+
+	rcu_read_lock();
+	blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+		struct bfq_stat *stat;
+
+		if (!pos_blkg->online)
+			continue;
+
+		stat = (void *)blkg_to_pd(pos_blkg, &blkcg_policy_bfq) + off;
+		sum += bfq_stat_read(stat) + atomic64_read(&stat->aux_cnt);
+	}
+	rcu_read_unlock();
+
 	return __blkg_prfill_u64(sf, pd, sum);
 }
 
 static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
 					struct blkg_policy_data *pd, int off)
 {
-	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
-							   &blkcg_policy_bfq,
-							   off);
+	struct blkg_rwstat_sample sum;
+
+	blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum);
 	return __blkg_prfill_rwstat(sf, pd, &sum);
 }
@@ -975,12 +1068,13 @@ static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
 static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
 					 struct blkg_policy_data *pd, int off)
 {
-	struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
-					offsetof(struct blkcg_gq, stat_bytes));
-	u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
-		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
+	struct blkg_rwstat_sample tmp;
 
-	return __blkg_prfill_u64(sf, pd, sum >> 9);
+	blkg_rwstat_recursive_sum(pd->blkg, NULL,
+			offsetof(struct blkcg_gq, stat_bytes), &tmp);
+
+	return __blkg_prfill_u64(sf, pd,
+		(tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]) >> 9);
 }
 
 static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
@@ -995,11 +1089,11 @@ static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
 				      struct blkg_policy_data *pd, int off)
 {
 	struct bfq_group *bfqg = pd_to_bfqg(pd);
-	u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
+	u64 samples = bfq_stat_read(&bfqg->stats.avg_queue_size_samples);
 	u64 v = 0;
 
 	if (samples) {
-		v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
+		v = bfq_stat_read(&bfqg->stats.avg_queue_size_sum);
 		v = div64_u64(v, samples);
 	}
 	__blkg_prfill_u64(sf, pd, v);
@@ -1014,7 +1108,7 @@ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
 			  0, false);
 	return 0;
 }
-#endif /* CONFIG_DEBUG_BLK_CGROUP */
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
 
 struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
 {
@@ -1062,7 +1156,7 @@ struct cftype bfq_blkcg_legacy_files[] = {
 		.private = (unsigned long)&blkcg_policy_bfq,
 		.seq_show = blkg_print_stat_ios,
 	},
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
 	{
 		.name = "bfq.time",
 		.private = offsetof(struct bfq_group, stats.time),
@@ -1092,7 +1186,7 @@ struct cftype bfq_blkcg_legacy_files[] = {
 		.private = offsetof(struct bfq_group, stats.queued),
 		.seq_show = bfqg_print_rwstat,
 	},
-#endif /* CONFIG_DEBUG_BLK_CGROUP */
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
 
 	/* the same statistics which cover the bfqg and its descendants */
 	{
@@ -1105,7 +1199,7 @@ struct cftype bfq_blkcg_legacy_files[] = {
 		.private = (unsigned long)&blkcg_policy_bfq,
 		.seq_show = blkg_print_stat_ios_recursive,
 	},
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
 	{
 		.name = "bfq.time_recursive",
 		.private = offsetof(struct bfq_group, stats.time),
@@ -1159,7 +1253,7 @@ struct cftype bfq_blkcg_legacy_files[] = {
 		.private = offsetof(struct bfq_group, stats.dequeue),
 		.seq_show = bfqg_print_stat,
 	},
-#endif /* CONFIG_DEBUG_BLK_CGROUP */
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
 	{ }	/* terminate */
 };
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index f9269ae6da9c..50c9d2598500 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -157,6 +157,7 @@ BFQ_BFQQ_FNS(in_large_burst);
 BFQ_BFQQ_FNS(coop);
 BFQ_BFQQ_FNS(split_coop);
 BFQ_BFQQ_FNS(softrt_update);
+BFQ_BFQQ_FNS(has_waker);
 #undef BFQ_BFQQ_FNS						\
 
 /* Expiration time of sync (0) and async (1) requests, in ns. */
@@ -1427,17 +1428,19 @@ static int bfq_min_budget(struct bfq_data *bfqd)
  * mechanism may be re-designed in such a way to make it possible to
  * know whether preemption is needed without needing to update service
  * trees). In addition, queue preemptions almost always cause random
- * I/O, and thus loss of throughput. Because of these facts, the next
- * function adopts the following simple scheme to avoid both costly
- * operations and too frequent preemptions: it requests the expiration
- * of the in-service queue (unconditionally) only for queues that need
- * to recover a hole, or that either are weight-raised or deserve to
- * be weight-raised.
+ * I/O, which may in turn cause loss of throughput. Finally, there may
+ * even be no in-service queue when the next function is invoked (so,
+ * no queue to compare timestamps with). Because of these facts, the
+ * next function adopts the following simple scheme to avoid costly
+ * operations, too frequent preemptions and too many dependencies on
+ * the state of the scheduler: it requests the expiration of the
+ * in-service queue (unconditionally) only for queues that need to
+ * recover a hole. Then it delegates to other parts of the code the
+ * responsibility of handling the above case 2.
  */
 static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
 						struct bfq_queue *bfqq,
-						bool arrived_in_time,
-						bool wr_or_deserves_wr)
+						bool arrived_in_time)
 {
 	struct bfq_entity *entity = &bfqq->entity;
@@ -1492,7 +1495,7 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
 	entity->budget = max_t(unsigned long, bfqq->max_budget,
 			       bfq_serv_to_charge(bfqq->next_rq, bfqq));
 	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
-	return wr_or_deserves_wr;
+	return false;
 }
@@ -1610,6 +1613,36 @@ static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
 		bfqd->bfq_wr_min_idle_time);
 }
 
+
+/*
+ * Return true if bfqq is in a higher priority class, or has a higher
+ * weight than the in-service queue.
+ */
+static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq,
+					    struct bfq_queue *in_serv_bfqq)
+{
+	int bfqq_weight, in_serv_weight;
+
+	if (bfqq->ioprio_class < in_serv_bfqq->ioprio_class)
+		return true;
+
+	if (in_serv_bfqq->entity.parent == bfqq->entity.parent) {
+		bfqq_weight = bfqq->entity.weight;
+		in_serv_weight = in_serv_bfqq->entity.weight;
+	} else {
+		if (bfqq->entity.parent)
+			bfqq_weight = bfqq->entity.parent->weight;
+		else
+			bfqq_weight = bfqq->entity.weight;
+		if (in_serv_bfqq->entity.parent)
+			in_serv_weight = in_serv_bfqq->entity.parent->weight;
+		else
+			in_serv_weight = in_serv_bfqq->entity.weight;
+	}
+
+	return bfqq_weight > in_serv_weight;
+}
+
 static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 					     struct bfq_queue *bfqq,
 					     int old_wr_coeff,
@@ -1654,8 +1687,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 	 */
 	bfqq_wants_to_preempt =
 		bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
-						    arrived_in_time,
-						    wr_or_deserves_wr);
+						    arrived_in_time);
 
 	/*
 	 * If bfqq happened to be activated in a burst, but has been
@@ -1720,21 +1752,111 @@
 	/*
 	 * Expire in-service queue only if preemption may be needed
-	 * for guarantees. In this respect, the function
-	 * next_queue_may_preempt just checks a simple, necessary
-	 * condition, and not a sufficient condition based on
-	 * timestamps. In fact, for the latter condition to be
-	 * evaluated, timestamps would need first to be updated, and
-	 * this operation is quite costly (see the comments on the
-	 * function bfq_bfqq_update_budg_for_activation).
+	 * for guarantees. In particular, we care only about two
+	 * cases. The first is that bfqq has to recover a service
+	 * hole, as explained in the comments on
+	 * bfq_bfqq_update_budg_for_activation(), i.e., that
+	 * bfqq_wants_to_preempt is true. However, if bfqq does not
+	 * carry time-critical I/O, then bfqq's bandwidth is less
+	 * important than that of queues that carry time-critical I/O.
+	 * So, as a further constraint, we consider this case only if
+	 * bfqq is at least as weight-raised, i.e., at least as time
+	 * critical, as the in-service queue.
+	 *
+	 * The second case is that bfqq is in a higher priority class,
+	 * or has a higher weight than the in-service queue. If this
+	 * condition does not hold, we don't care because, even if
+	 * bfqq does not start to be served immediately, the resulting
+	 * delay for bfqq's I/O is however lower or much lower than
+	 * the ideal completion time to be guaranteed to bfqq's I/O.
+	 *
+	 * In both cases, preemption is needed only if, according to
+	 * the timestamps of both bfqq and of the in-service queue,
+	 * bfqq actually is the next queue to serve. So, to reduce
+	 * useless preemptions, the return value of
+	 * next_queue_may_preempt() is considered in the next compound
+	 * condition too. Yet next_queue_may_preempt() just checks a
+	 * simple, necessary condition for bfqq to be the next queue
+	 * to serve. In fact, to evaluate a sufficient condition, the
+	 * timestamps of the in-service queue would need to be
+	 * updated, and this operation is quite costly (see the
+	 * comments on bfq_bfqq_update_budg_for_activation()).
 	 */
-	if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
-	    bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
+	if (bfqd->in_service_queue &&
+	    ((bfqq_wants_to_preempt &&
+	      bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) ||
+	     bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) &&
 	    next_queue_may_preempt(bfqd))
 		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
 				false, BFQQE_PREEMPTED);
 }
 
+static void bfq_reset_inject_limit(struct bfq_data *bfqd,
+				   struct bfq_queue *bfqq)
+{
+	/* invalidate baseline total service time */
+	bfqq->last_serv_time_ns = 0;
+
+	/*
+	 * Reset pointer in case we are waiting for
+	 * some request completion.
+	 */
+	bfqd->waited_rq = NULL;
+
+	/*
+	 * If bfqq has a short think time, then start by setting the
+	 * inject limit to 0 prudentially, because the service time of
+	 * an injected I/O request may be higher than the think time
+	 * of bfqq, and therefore, if one request was injected when
+	 * bfqq remains empty, this injected request might delay the
+	 * service of the next I/O request for bfqq significantly. In
+	 * case bfqq can actually tolerate some injection, then the
+	 * adaptive update will however raise the limit soon. This
+	 * lucky circumstance holds exactly because bfqq has a short
+	 * think time, and thus, after remaining empty, is likely to
+	 * get new I/O enqueued---and then completed---before being
+	 * expired. This is the very pattern that gives the
+	 * limit-update algorithm the chance to measure the effect of
+	 * injection on request service times, and then to update the
+	 * limit accordingly.
+	 *
+	 * However, in the following special case, the inject limit is
+	 * left to 1 even if the think time is short: bfqq's I/O is
+	 * synchronized with that of some other queue, i.e., bfqq may
+	 * receive new I/O only after the I/O of the other queue is
+	 * completed. Keeping the inject limit to 1 allows the
+	 * blocking I/O to be served while bfqq is in service. And
+	 * this is very convenient both for bfqq and for overall
+	 * throughput, as explained in detail in the comments in
+	 * bfq_update_has_short_ttime().
+	 *
+	 * On the opposite end, if bfqq has a long think time, then
+	 * start directly by 1, because:
+	 * a) on the bright side, keeping at most one request in
+	 * service in the drive is unlikely to cause any harm to the
+	 * latency of bfqq's requests, as the service time of a single
+	 * request is likely to be lower than the think time of bfqq;
+	 * b) on the downside, after becoming empty, bfqq is likely to
+	 * expire before getting its next request. With this request
+	 * arrival pattern, it is very hard to sample total service
+	 * times and update the inject limit accordingly (see comments
+	 * on bfq_update_inject_limit()). So the limit is likely to be
+	 * never, or at least seldom, updated. As a consequence, by
+	 * setting the limit to 1, we avoid that no injection ever
+	 * occurs with bfqq. On the downside, this proactive step
+	 * further reduces chances to actually compute the baseline
+	 * total service time. Thus it reduces chances to execute the
+	 * limit-update algorithm and possibly raise the limit to more
+	 * than 1.
+	 */
+	if (bfq_bfqq_has_short_ttime(bfqq))
+		bfqq->inject_limit = 0;
+	else
+		bfqq->inject_limit = 1;
+
+	bfqq->decrease_time_jif = jiffies;
+}
+
 static void bfq_add_request(struct request *rq)
 {
 	struct bfq_queue *bfqq = RQ_BFQQ(rq);
@@ -1749,77 +1871,119 @@ static void bfq_add_request(struct request *rq)
 
 	if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) {
 		/*
+		 * Detect whether bfqq's I/O seems synchronized with
+		 * that of some other queue, i.e., whether bfqq, after
+		 * remaining empty, happens to receive new I/O only
+		 * right after some I/O request of the other queue has
+		 * been completed. We call waker queue the other
+		 * queue, and we assume, for simplicity, that bfqq may
+		 * have at most one waker queue.
+		 *
+		 * A remarkable throughput boost can be reached by
+		 * unconditionally injecting the I/O of the waker
+		 * queue, every time a new bfq_dispatch_request
+		 * happens to be invoked while I/O is being plugged
+		 * for bfqq. In addition to boosting throughput, this
+		 * unblocks bfqq's I/O, thereby improving bandwidth
+		 * and latency for bfqq. Note that these same results
+		 * may be achieved with the general injection
+		 * mechanism, but less effectively. For details on
+		 * this aspect, see the comments on the choice of the
+		 * queue for injection in bfq_select_queue().
+		 *
+		 * Turning back to the detection of a waker queue, a
+		 * queue Q is deemed as a waker queue for bfqq if, for
+		 * two consecutive times, bfqq happens to become non
+		 * empty right after a request of Q has been
+		 * completed. In particular, on the first time, Q is
+		 * tentatively set as a candidate waker queue, while
+		 * on the second time, the flag
+		 * bfq_bfqq_has_waker(bfqq) is set to confirm that Q
+		 * is a waker queue for bfqq. These detection steps
+		 * are performed only if bfqq has a long think time,
+		 * so as to make it more likely that bfqq's I/O is
+		 * actually being blocked by a synchronization. This
+		 * last filter, plus the above two-times requirement,
+		 * make false positives less likely.
+		 *
+		 * NOTE
+		 *
+		 * The sooner a waker queue is detected, the sooner
+		 * throughput can be boosted by injecting I/O from the
+		 * waker queue. Fortunately, detection is likely to be
+		 * actually fast, for the following reasons. While
+		 * blocked by synchronization, bfqq has a long think
+		 * time. This implies that bfqq's inject limit is at
+		 * least equal to 1 (see the comments in
+		 * bfq_update_inject_limit()). So, thanks to
+		 * injection, the waker queue is likely to be served
+		 * during the very first I/O-plugging time interval
+		 * for bfqq. This triggers the first step of the
+		 * detection mechanism. Thanks again to injection, the
+		 * candidate waker queue is then likely to be
+		 * confirmed no later than during the next
+		 * I/O-plugging interval for bfqq.
+		 */
+		if (!bfq_bfqq_has_short_ttime(bfqq) &&
+		    ktime_get_ns() - bfqd->last_completion <
+		    200 * NSEC_PER_USEC) {
+			if (bfqd->last_completed_rq_bfqq != bfqq &&
+			    bfqd->last_completed_rq_bfqq !=
+			    bfqq->waker_bfqq) {
+				/*
+				 * First synchronization detected with
+				 * a candidate waker queue, or with a
+				 * different candidate waker queue
+				 * from the current one.
+				 */
+				bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
+
+				/*
+				 * If the waker queue disappears, then
+				 * bfqq->waker_bfqq must be reset. To
+				 * this goal, we maintain in each
+				 * waker queue a list, woken_list, of
+				 * all the queues that reference the
+				 * waker queue through their
+				 * waker_bfqq pointer. When the waker
+				 * queue exits, the waker_bfqq pointer
+				 * of all the queues in the woken_list
+				 * is reset.
+				 *
+				 * In addition, if bfqq is already in
+				 * the woken_list of a waker queue,
+				 * then, before being inserted into
+				 * the woken_list of a new waker
+				 * queue, bfqq must be removed from
+				 * the woken_list of the old waker
+				 * queue.
+				 */
+				if (!hlist_unhashed(&bfqq->woken_list_node))
+					hlist_del_init(&bfqq->woken_list_node);
+				hlist_add_head(&bfqq->woken_list_node,
+				    &bfqd->last_completed_rq_bfqq->woken_list);
+
+				bfq_clear_bfqq_has_waker(bfqq);
+			} else if (bfqd->last_completed_rq_bfqq ==
+				   bfqq->waker_bfqq &&
+				   !bfq_bfqq_has_waker(bfqq)) {
+				/*
+				 * synchronization with waker_bfqq
+				 * seen for the second time
+				 */
+				bfq_mark_bfqq_has_waker(bfqq);
+			}
+		}
+
+		/*
 		 * Periodically reset inject limit, to make sure that
 		 * the latter eventually drops in case workload
 		 * changes, see step (3) in the comments on
 		 * bfq_update_inject_limit().
 		 */
 		if (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
-					     msecs_to_jiffies(1000))) {
-			/* invalidate baseline total service time */
-			bfqq->last_serv_time_ns = 0;

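The bfq_stat helpers added in the bfq-cgroup.c hunks above follow a common
kernel counter pattern: a per-CPU counter absorbs frequent updates cheaply,
while an atomic64 "aux" count accumulates totals handed over by children
being torn down. Below is a minimal usage sketch, assuming kernel-module
context; the bfq_demo_* names and the batch size are hypothetical, while the
percpu_counter and atomic64 calls are the real APIs used in the diff.

#include <linux/percpu_counter.h>
#include <linux/atomic.h>
#include <linux/gfp.h>

#define DEMO_BATCH	16	/* fold per-CPU deltas after this many */

struct bfq_demo_stat {
	struct percpu_counter cpu_cnt;
	atomic64_t aux_cnt;
};

static int bfq_demo_init(struct bfq_demo_stat *stat)
{
	int ret = percpu_counter_init(&stat->cpu_cnt, 0, GFP_KERNEL);

	if (ret)
		return ret;
	atomic64_set(&stat->aux_cnt, 0);
	return 0;
}

/* hot path: cheap per-CPU add, batched into the shared count */
static void bfq_demo_account(struct bfq_demo_stat *stat, s64 val)
{
	percpu_counter_add_batch(&stat->cpu_cnt, val, DEMO_BATCH);
}

/* slow path: exact sum of per-CPU deltas plus the aux carry-over */
static u64 bfq_demo_total(struct bfq_demo_stat *stat)
{
	return percpu_counter_sum_positive(&stat->cpu_cnt) +
		atomic64_read(&stat->aux_cnt);
}

static void bfq_demo_exit(struct bfq_demo_stat *stat)
{
	percpu_counter_destroy(&stat->cpu_cnt);
}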