// SPDX-License-Identifier: GPL-2.0
/*
* background writeback - scan btree for dirty data and write it to the backing
* device
*
* Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
* Copyright 2012 Google, Inc.
*/
#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "writeback.h"
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/sched/clock.h>
#include <trace/events/bcache.h>
static void update_gc_after_writeback(struct cache_set *c)
{
if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) ||
c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD)
return;
c->gc_after_writeback |= BCH_DO_AUTO_GC;
}
/* Rate limiting */
static uint64_t __calc_target_rate(struct cached_dev *dc)
{
struct cache_set *c = dc->disk.c;
/*
* This is the size of the cache, minus the amount used for
* flash-only devices
*/
uint64_t cache_sectors = c->nbuckets * c->cache->sb.bucket_size -
atomic_long_read(&c->flash_dev_dirty_sectors);
/*
* Unfortunately there is no control of global dirty data. If the
* user states that they want 10% dirty data in the cache, and has,
* e.g., 5 backing volumes of equal size, we try and ensure each
* backing volume uses about 2% of the cache for dirty data.
*/
uint32_t bdev_share =
div64_u64(bdev_nr_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT,
c->cached_dev_sectors);
uint64_t cache_dirty_target =
div_u64(cache_sectors * dc->writeback_percent, 100);
/* Ensure each backing dev gets at least one dirty share */
if (bdev_share < 1)
bdev_share = 1;
return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
}
static void __update_writeback_rate(struct cached_dev *dc)
{
/*
* PI controller:
* Figures out the amount that should be written per second.
*
* First, the error (number of sectors that are dirty beyond our
* target) is calculated. The error is accumulated (numerically
* integrated).
*
* Then, the proportional value and integral value are scaled
* based on configured values. These are stored as inverses to
* avoid fixed point math and to make configuration easy-- e.g.
* the default value of 40 for writeback_rate_p_term_inverse
* attempts to write at a rate that would retire all the dirty
* blocks in 40 seconds.
*
* The writeback_rate_i_inverse value of 10000 means that 1/10000th
* of the error is accumulated in the integral term per second.
* This acts as a slow, long-term average that is not subject to
* variations in usage like the p term.
*/
int64_t target = __calc_target_rate(dc);
int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
int64_t error = dirty - target;
int64_t proportional_scaled =
div_s64(error, dc->writeback_rate_p_term_inverse);
int64_t integral_scaled;
uint32_t new_rate;
/*
* We need to consider the number of dirty buckets as well
* when calculating the proportional_scaled, Otherwise we might
* have an unreasonable small writeback rate at a highly fragmented situation
* when very few dirty sectors consumed a lot dirty buckets, the
* worst case is when dirty buckets reached cutoff_writeback_sync and
* dirty data is still not even reached to writeback percent, so the rate
* still will be at the minimum value, which will cause the write
* stuck at a non-writeback mode.
*/
struct cache_set *c = dc->disk.c;
int64_t dirty_buckets = c->nbuckets - c->avail_nbuckets;
if (dc->writeback_consider_fragment &&
c->gc_stats.in_use > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW && dirty > 0) {
int64_t fragment =
div_s64((dirty_buckets * c->cache->sb.bucket_size), dirty);
int64_t fp_term;
int64_t fps;
if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) {
fp_term = (int64_t)dc->writeback_rate_fp_term_low *
(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW);
} else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) {
fp_term = (int64_t)dc->writeback_rate_fp_term_mid *
(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID);
} else {
fp_term = (int64_t)dc->writeback_rate_fp_term_high *
(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH);
}
fps = div_s64(dirty, dirty_buckets) * fp_term;
if (fragment > 3 && fps > proportional_scaled) {
/* Only overrite the p when fragment > 3 */
proportional_scaled = fps;
}
}
if ((error < 0 && dc->writeback_rate_integral > 0) ||
(error > 0 && time_before64(local_clock(),
dc->writeback_rate.next + NSEC_PER_MSEC))) {
/*
* Only decrease the integral term if it's more than
* zero. Only increase the integral term if the device
* is keeping up. (Don't wind up the integral
* ineffectively in either case).
*
* It's necessary to scale this by
* writeback_rate_update_seconds to keep the integral
* term dimensioned properly.
*/
dc->writeback_rate_integral += error *
dc->writeback_rate_update_seconds;
}
integral_scaled = div_s64(dc->writeback_rate_integral,
dc->writeback_rate_i_term_inverse);
new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled),
dc->writeback_rate_minimum, NSEC_PER_SEC);
dc->writeback_rate_proportional = proportional_scaled;
dc->writeback_rate_integral_scaled = integral_scaled;
dc->writeback_rate_change = new_rate -
atomic_long_read(&dc->writeback_rate.rate);
atomic_long_set(&dc->writeback_rate.rate, new_rate);
dc->writeback_rate_target = target;
}
static bool idle_counter_exceeded(struct cache_set *c)
{
int counter, dev_nr;
/*
* If c->idle_counter is overflow (idel for really long time),
* reset as 0 and not set maxim