Diffstat (limited to 'drivers/md/dm-bufio.c')
 drivers/md/dm-bufio.c | 1940 +++++++++++++++++++++++++++++++----------------
 1 file changed, 1343 insertions(+), 597 deletions(-)
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index cf077f9b30c3..eea977662e81 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -21,6 +21,8 @@
#include <linux/stacktrace.h>
#include <linux/jump_label.h>
+#include "dm.h"
+
#define DM_MSG_PREFIX "bufio"
/*
@@ -66,57 +68,241 @@
#define LIST_DIRTY 1
#define LIST_SIZE 2
+/*--------------------------------------------------------------*/
+
/*
- * Linking of buffers:
- * All buffers are linked to buffer_tree with their node field.
- *
- * Clean buffers that are not being written (B_WRITING not set)
- * are linked to lru[LIST_CLEAN] with their lru_list field.
- *
- * Dirty and clean buffers that are being written are linked to
- * lru[LIST_DIRTY] with their lru_list field. When the write
- * finishes, the buffer cannot be relinked immediately (because we
- * are in an interrupt context and relinking requires process
- * context), so some clean-not-writing buffers can be held on
- * dirty_lru too. They are later added to lru in the process
- * context.
+ * Rather than use an LRU list, we use a clock algorithm where entries
+ * are held in a circular list. When an entry is 'hit', its reference bit
+ * is set. The least recently used entry is approximated by running a
+ * cursor around the list selecting unreferenced entries. Referenced
+ * entries have their reference bit cleared as the cursor passes them.
*/
-struct dm_bufio_client {
- struct mutex lock;
- spinlock_t spinlock;
- bool no_sleep;
+struct lru_entry {
+ struct list_head list;
+ atomic_t referenced;
+};
- struct list_head lru[LIST_SIZE];
- unsigned long n_buffers[LIST_SIZE];
+struct lru_iter {
+ struct lru *lru;
+ struct list_head list;
+ struct lru_entry *stop;
+ struct lru_entry *e;
+};
- struct block_device *bdev;
- unsigned int block_size;
- s8 sectors_per_block_bits;
- void (*alloc_callback)(struct dm_buffer *buf);
- void (*write_callback)(struct dm_buffer *buf);
- struct kmem_cache *slab_buffer;
- struct kmem_cache *slab_cache;
- struct dm_io_client *dm_io;
+struct lru {
+ struct list_head *cursor;
+ unsigned long count;
- struct list_head reserved_buffers;
- unsigned int need_reserved_buffers;
+ struct list_head iterators;
+};
- unsigned int minimum_buffers;
+/*--------------*/
- struct rb_root buffer_tree;
- wait_queue_head_t free_buffer_wait;
+static void lru_init(struct lru *lru)
+{
+ lru->cursor = NULL;
+ lru->count = 0;
+ INIT_LIST_HEAD(&lru->iterators);
+}
- sector_t start;
+static void lru_destroy(struct lru *lru)
+{
+ WARN_ON_ONCE(lru->cursor);
+ WARN_ON_ONCE(!list_empty(&lru->iterators));
+}
- int async_write_error;
+/*
+ * Insert a new entry into the lru.
+ */
+static void lru_insert(struct lru *lru, struct lru_entry *le)
+{
+ /*
+ * Don't be tempted to set this to 1; it makes the lru
+ * aspect perform poorly.
+ */
+ atomic_set(&le->referenced, 0);
- struct list_head client_list;
+ if (lru->cursor) {
+ list_add_tail(&le->list, lru->cursor);
+ } else {
+ INIT_LIST_HEAD(&le->list);
+ lru->cursor = &le->list;
+ }
+ lru->count++;
+}
- struct shrinker shrinker;
- struct work_struct shrink_work;
- atomic_long_t need_shrink;
+/*--------------*/
+
+/*
+ * Convert a list_head pointer to an lru_entry pointer.
+ */
+static inline struct lru_entry *to_le(struct list_head *l)
+{
+ return container_of(l, struct lru_entry, list);
+}
+
+/*
+ * Initialize an lru_iter and add it to the list of cursors in the lru.
+ */
+static void lru_iter_begin(struct lru *lru, struct lru_iter *it)
+{
+ it->lru = lru;
+ it->stop = lru->cursor ? to_le(lru->cursor->prev) : NULL;
+ it->e = lru->cursor ? to_le(lru->cursor) : NULL;
+ list_add(&it->list, &lru->iterators);
+}
+
+/*
+ * Remove an lru_iter from the list of cursors in the lru.
+ */
+static inline void lru_iter_end(struct lru_iter *it)
+{
+ list_del(&it->list);
+}
+
+/* Predicate function type to be used with lru_iter_next */
+typedef bool (*iter_predicate)(struct lru_entry *le, void *context);
+
+/*
+ * Advance the cursor to the next entry that passes the
+ * predicate, and return that entry. Returns NULL if the
+ * iteration is complete.
+ */
+static struct lru_entry *lru_iter_next(struct lru_iter *it,
+ iter_predicate pred, void *context)
+{
+ struct lru_entry *e;
+
+ while (it->e) {
+ e = it->e;
+
+ /* advance the cursor */
+ if (it->e == it->stop)
+ it->e = NULL;
+ else
+ it->e = to_le(it->e->list.next);
+
+ if (pred(e, context))
+ return e;
+ }
+
+ return NULL;
+}
+
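
As a hedged illustration (not part of the patch), a caller might walk the clock list with the iterator helpers above roughly as follows; is_unreferenced() and walk_unreferenced() are hypothetical names used only for this sketch:

	/* Example predicate: only visit entries whose reference bit is clear. */
	static bool is_unreferenced(struct lru_entry *le, void *context)
	{
		return !atomic_read(&le->referenced);
	}

	/* Visit every currently unreferenced entry exactly once. */
	static void walk_unreferenced(struct lru *lru)
	{
		struct lru_iter it;
		struct lru_entry *le;

		lru_iter_begin(lru, &it);
		while ((le = lru_iter_next(&it, is_unreferenced, NULL))) {
			/* ... inspect 'le' here ... */
		}
		lru_iter_end(&it);
	}
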
+/*
+ * Invalidate a specific lru_entry and update all cursors in
+ * the lru accordingly.
+ */
+static void lru_iter_invalidate(struct lru *lru, struct lru_entry *e)
+{
+ struct lru_iter *it;
+
+ list_for_each_entry(it, &lru->iterators, list) {
+ /* Move it->e forwards if necessary. */
+ if (it->e == e) {
+ it->e = to_le(it->e->list.next);
+ if (it->e == e)
+ it->e = NULL;
+ }
+
+ /* Move it->stop backwards if necessary. */
+ if (it->stop == e) {
+ it->stop = to_le(it->stop->list.prev);
+ if (it->stop == e)
+ it->stop = NULL;
+ }
+ }
+}
+
+/*--------------*/
+
+/*
+ * Remove a specific entry from the lru.
+ */
+static void lru_remove(struct lru *lru, struct lru_entry *le)
+{
+ lru_iter_invalidate(lru, le);
+ if (lru->count == 1) {
+ lru->cursor = NULL;
+ } else {
+ if (lru->cursor == &le->list)
+ lru->cursor = lru->cursor->next;
+ list_del(&le->list);
+ }
+ lru->count--;
+}
+
+/*
+ * Mark as referenced.
+ */
+static inline void lru_reference(struct lru_entry *le)
+{
+ atomic_set(&le->referenced, 1);
+}
+
+/*--------------*/
+
+/*
+ * Remove the (approximately) least recently used entry that passes the predicate.
+ * Returns NULL on failure.
+ */
+enum evict_result {
+ ER_EVICT,
+ ER_DONT_EVICT,
+ ER_STOP, /* stop looking for something to evict */
};
+typedef enum evict_result (*le_predicate)(struct lru_entry *le, void *context);
+
+static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context)
+{
+ unsigned long tested = 0;
+ struct list_head *h = lru->cursor;
+ struct lru_entry *le;
+
+ if (!h)
+ return NULL;
+ /*
+ * In the worst case we have to loop around twice: once to clear
+ * the reference flags, and then again to discover that the
+ * predicate fails for all entries.
+ */
+ while (tested < lru->count) {
+ le = container_of(h, struct lru_entry, list);
+
+ if (atomic_read(&le->referenced)) {
+ atomic_set(&le->referenced, 0);
+ } else {
+ tested++;
+ switch (pred(le, context)) {
+ case ER_EVICT:
+ /*
+ * Adjust the cursor, so we start the next
+ * search from here.
+ */
+ lru->cursor = le->list.next;
+ lru_remove(lru, le);
+ return le;
+
+ case ER_DONT_EVICT:
+ break;
+
+ case ER_STOP:
+ lru->cursor = le->list.next;
+ return NULL;
+ }
+ }
+
+ h = h->next;
+
+ cond_resched();
+ }
+
+ return NULL;
+}
+
+/*--------------------------------------------------------------*/
+
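
To make the clock scheme above concrete, here is a hedged sketch (not part of the patch) of how an lru, an embedded lru_entry, and lru_evict() fit together; 'struct item', evict_any() and example() are hypothetical:

	struct item {
		struct lru_entry le;	/* clock-list linkage + reference bit */
		/* ... payload ... */
	};

	static enum evict_result evict_any(struct lru_entry *le, void *context)
	{
		return ER_EVICT;	/* accept the first unreferenced entry */
	}

	static void example(struct lru *lru, struct item *i)
	{
		struct lru_entry *victim;

		lru_insert(lru, &i->le);	/* referenced bit starts at 0 */
		lru_reference(&i->le);		/* mark as recently used on a 'hit' */

		/*
		 * Approximate LRU eviction: entries whose reference bit is set
		 * are skipped, and the bit is cleared as the cursor passes.
		 */
		victim = lru_evict(lru, evict_any, NULL);
		/* On success 'victim' has been unlinked from the circular list. */
	}
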
/*
* Buffer state bits.
*/
@@ -137,26 +323,37 @@ enum data_mode {
};
struct dm_buffer {
+ /* protected by the locks in dm_buffer_cache */
struct rb_node node;
- struct list_head lru_list;
- struct list_head global_list;
+
+ /* immutable, so don't need protecting */
sector_t block;
void *data;
unsigned char data_mode; /* DATA_MODE_* */
+
+ /*
+ * These two fields are used in isolation, so do not need
+ * a surrounding lock.
+ */
+ atomic_t hold_count;
+ unsigned long last_accessed;
+
+ /*
+ * Everything else is protected by the mutex in
+ * dm_bufio_client
+ */
+ unsigned long state;
+ struct lru_entry lru;
unsigned char list_mode; /* LIST_* */
blk_status_t read_error;
blk_status_t write_error;
- unsigned int accessed;
- unsigned int hold_count;
- unsigned long state;
- unsigned long last_accessed;
unsigned int dirty_start;
unsigned int dirty_end;
unsigned int write_start;
unsigned int write_end;
- struct dm_bufio_client *c;
struct list_head write_list;
- void (*end_io)(struct dm_buffer *buf, blk_status_t stat);
+ struct dm_bufio_client *c;
+ void (*end_io)(struct dm_buffer *b, blk_status_t bs);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
#define MAX_STACK 10
unsigned int stack_len;
@@ -164,126 +361,507 @@ struct dm_buffer {
#endif
};
-static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled);
+/*--------------------------------------------------------------*/
-/*----------------------------------------------------------------*/
+/*
+ * The buffer cache manages buffers. In particular it handles:
+ * - inc/dec of the holder count
+ * - setting the last_accessed field
+ * - maintaining clean/dirty state along with the lru
+ * - selecting buffers that match predicates
+ *
+ * It does *not* handle:
+ * - allocation/freeing of buffers.
+ * - IO
+ * - Eviction or cache sizing.
+ *
+ * cache_get() and cache_put() are threadsafe; you do not need to
+ * protect these calls with a surrounding mutex. All the other
+ * methods are not threadsafe; they do use locking primitives, but
+ * only enough to ensure get/put are threadsafe.
+ */
-#define dm_bufio_in_request() (!!current->bio_list)
+struct buffer_tree {
+ struct rw_semaphore lock;
+ struct rb_root root;
+} ____cacheline_aligned_in_smp;
-static void dm_bufio_lock(struct dm_bufio_client *c)
+struct dm_buffer_cache {
+ struct lru lru[LIST_SIZE];
+ /*
+ * We spread entries across multiple trees to reduce contention
+ * on the locks.
+ */
+ unsigned int num_locks;
+ struct buffer_tree trees[];
+};
+
+static inline unsigned int cache_index(sector_t block, unsigned int num_locks)
{
- if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
- spin_lock_bh(&c->spinlock);
- else
- mutex_lock_nested(&c->lock, dm_bufio_in_request());
+ return dm_hash_locks_index(block, num_locks);
}
-static int dm_bufio_trylock(struct dm_bufio_client *c)
+static inline void cache_read_lock(struct dm_buffer_cache *bc, sector_t block)
{
- if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
- return spin_trylock_bh(&c->spinlock);
+ down_read(&bc->trees[cache_index(block, bc->num_locks)].lock);
+}
+
+static inline void cache_read_unlock(struct dm_buffer_cache *bc, sector_t block)
+{
+ up_read(&bc->trees[cache_index(block, bc->num_locks)].lock);
+}
+
+static inline void cache_write_lock(struct dm_buffer_cache *bc, sector_t block)
+{
+ down_write(&bc->trees[cache_index(block, bc->num_locks)].lock);
+}
+
+static inline void cache_write_unlock(struct dm_buffer_cache *bc, sector_t block)
+{
+ up_write(&bc->trees[cache_index(block, bc->num_locks)].lock);
+}
+
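
The per-tree rw_semaphores above implement simple lock striping: a block's tree (and therefore its lock) is chosen by hashing the block number. The real mapping is dm_hash_locks_index(), pulled in via the new #include "dm.h"; the sketch below is not part of the patch and only illustrates the shape of such a mapping, with a made-up power-of-two mask standing in for the real function:

	/* Illustrative stand-in only; assumes num_locks is a power of two. */
	static inline unsigned int example_locks_index(sector_t block,
						       unsigned int num_locks)
	{
		return (unsigned int)(block & (num_locks - 1));
	}
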
+/*
+ * Sometimes we want to repeatedly get and drop locks as part of an iteration.
+ * This struct helps avoid redundant drops and gets of the same lock.
+ */
+struct lock_history {
+ struct dm_buffer_cache *cache;
+ bool write;
+ unsigned int previous;
+ unsigned int no_previous;
+};
+
+static void lh_init(struct lock_history *lh, struct dm_buffer_cache *cache, bool write)
+{
+ lh->cache = cache;
+ lh->write = write;
+ lh->no_previous = cache->num_locks;
+ lh->previous = lh->no_previous;
+}
+
+static void __lh_lock(struct lock_history *lh, unsigned int index)
+{
+ if (lh->write)
+ down_write(&lh->cache->trees[index].lock);
else
- return mutex_trylock(&c->lock);
+ down_read(&lh->cache->trees[index].lock);
}
-static void dm_bufio_unlock(struct dm_bufio_client *c)
+static void __lh_unlock(struct lock_history *lh, unsigned int index)
{
- if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
- spin_unlock_bh(&c->spinlock);
+ if (lh->write)
+ up_write(&lh->cache->trees[index].lock);
else
- mutex_unlock(&c->lock);
+ up_read(&lh->cache->trees[index].lock);
}
-/*----------------------------------------------------------------*/
+/*
+ * Make sure you call this since it will unlock the final lock.
+ */
+static void lh_exit(struct lock_history *lh)
+{
+ if (lh->previous != lh->no_previous) {
+ __lh_unlock(lh, lh->previous);
+ lh->previous = lh->no_previous;
+ }
+}
/*
- * Default cache size: available memory divided by the ratio.
+ * Named 'next' because there is no corresponding
+ * 'up/unlock' call since it's done automatically.
*/
-static unsigned long dm_bufio_default_cache_size;
+static void lh_next(struct lock_history *lh, sector_t b)
+{
+ unsigned int index = cache_index(b, lh->no_previous); /* no_previous is num_locks */
+
+ if (lh->previous != lh->no_previous) {
+ if (lh->previous != index) {
+ __lh_unlock(lh, lh->previous);
+ __lh_lock(lh, index);
+ lh->previous = index;
+ }
+ } else {
+ __lh_lock(lh, index);
+ lh->previous = index;
+ }
+}
+
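
A hedged sketch (not part of the patch) of the lock_history pattern: locks are taken lazily as blocks are visited, and the previous lock is re-used when consecutive blocks hash to the same tree; visit_blocks() is a hypothetical caller:

	static void visit_blocks(struct dm_buffer_cache *bc,
				 sector_t *blocks, unsigned int count)
	{
		struct lock_history lh;
		unsigned int i;

		lh_init(&lh, bc, false);	/* false => take read locks */

		for (i = 0; i < count; i++) {
			/*
			 * Locks the tree for blocks[i]; the previously held
			 * lock is dropped only if it covers a different tree.
			 */
			lh_next(&lh, blocks[i]);
			/* ... safe to look up blocks[i] in its tree here ... */
		}

		lh_exit(&lh);			/* releases the final lock */
	}
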
+static inline struct dm_buffer *le_to_buffer(struct lru_entry *le)
+{
+ return container_of(le, struct dm_buffer, lru);
+}
+
+static struct dm_buffer *list_to_buffer(struct list_head *l)
+{
+ struct lru_entry *le = list_entry(l, struct lru_entry, list);
+
+ if (!le)
+ return NULL;
+
+ return le_to_buffer(le);
+}
+
+static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks)
+{
+ unsigned int i;
+
+ bc->num_locks = num_locks;
+
+ for (i = 0; i < bc->num_locks; i++) {
+ init_rwsem(&bc->trees[i].lock);
+ bc->trees[i].root = RB_ROOT;
+ }
+
+ lru_init(&bc->lru[LIST_CLEAN]);
+ lru_init(&bc->lru[LIST_DIRTY]);
+}
+
+static void cache_destroy(struct dm_buffer_cache *bc)
+{
+ unsigned int i;
+
+ for (i = 0; i < bc->num_locks; i++)
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&bc->trees[i].root));
+
+ lru_destroy(&bc->lru[LIST_CLEAN]);
+ lru_destroy(&bc->lru[LIST_DIRTY]);
+}
+
+/*--------------*/
/*
- * Total cache size set by the user.
+ * Not threadsafe (or racy, depending on how you look at it).
*/
-static unsigned long dm_bufio_cache_size;
+static inline unsigned long cache_count(struct dm_buffer_cache *bc, int list_mode)
+{
+ return bc->lru[list_mode].count;
+}
+
+static inline unsigned long cache_total(struct dm_buffer_cache *bc)
+{
+ return cache_count(bc, LIST_CLEAN) + cache_count(bc, LIST_DIRTY);
+}
+
+/*--------------*/
/*
- * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
- * at any time. If it disagrees, the user has changed cache size.
+ * Gets a specific buffer, indexed by block.
+ * If the buffer is found then its holder count will be incremented and
+ * lru_reference will be called.
+ *
+ * threadsafe
*/
-static unsigned long dm_bufio_cache_size_latch;
+static struct dm_buffer *__cache_get(const struct rb_root *root, sector_t block)
+{
+ struct rb_node *n = root->rb_node;
+ struct dm_buffer *b;
-static DEFINE_SPINLOCK(global_spinlock);
+ while (n) {
+ b = container_of(n, struct dm_buffer, node);
+
+ if (b->block == block)
+ return b;
+
+ n = block < b->block ? n->rb_left : n->rb_right;
+ }
+
+ return NULL;
+}
+
+static void __cache_inc_buffer(struct dm_buffer *b)
+{
+ atomic_inc(&b->hold_count);
+ WRITE_ONCE(b->last_accessed, jiffies);
+}
+
+static struct dm_buffer *cache_get(struct dm_buffer_cache *bc, sector_t block)
+{
+ struct dm_buffer *b;
+
+ cache_read_lock(bc, block);
+ b = __cache_get(&bc->trees[cache_index(block, bc->num_locks)].root, block);
+ if (b) {
+ lru_reference(&b->lru);
+ __cache_inc_buffer(b);
+ }
+ cache_read_unlock(bc, block);
-static LIST_HEAD(global_queue);
+ return b;
+}
-static unsigned long global_num;
+/*--------------*/
/*
- * Buffers are freed after this timeout
+ * Returns true if the hold count hits zero.
+ * threadsafe
*/
-static unsigned int dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
-static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
+static bool cache_put(struct dm_buffer_cache *bc, struct dm_buffer *b)
+{
+ bool r;
-static unsigned long dm_bufio_peak_allocated;
-static unsigned long dm_bufio_allocated_kmem_cache;
-static unsigned long dm_bufio_allocated_get_free_pages;
-static unsigned long dm_bufio_allocated_vmalloc;
-static unsigned long dm_bufio_current_allocated;
+ cache_read_lock(bc, b->block);
+ BUG_ON(!atomic_read(&b->hold_count));
+ r = atomic_dec_and_test(&b->hold_count);
+ cache_read_unlock(bc, b->block);
-/*----------------------------------------------------------------*/
+ return r;
+}
+
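
As a hedged usage sketch (not part of the patch), the get/put pair can be called without any surrounding client mutex; peek_buffer() is a hypothetical caller:

	static void peek_buffer(struct dm_buffer_cache *bc, sector_t block)
	{
		struct dm_buffer *b;

		b = cache_get(bc, block);	/* takes a hold, references the lru entry */
		if (!b)
			return;			/* not cached */

		/* ... read b->data while the hold is held ... */

		if (cache_put(bc, b))
			return;	/* hold count dropped to zero; buffer is evictable again */
	}
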
+/*--------------*/
+
+typedef enum evict_result (*b_predicate)(struct dm_buffer *, void *);
/*
- * The current number of clients.
+ * Evicts a buffer based on a predicate. The oldest buffer that
+ * matches the predicate will be selected. In addition to matching
+ * the predicate, the selected buffer must have a hold_count of zero.
*/
-static int dm_bufio_client_count;
+struct evict_wrapper {
+ struct lock_history *lh;
+ b_predicate pred;
+ void *context;
+};
/*
- * The list of all clients.
+ * Wraps the buffer predicate, turning it into an lru predicate. Adds
+ * an extra test for hold_count.
*/
-static LIST_HEAD(dm_bufio_all_clients);
+static enum evict_result __evict_pred(struct lru_entry *le, void *context)
+{
+ struct evict_wrapper *w = context;
+ struct dm_buffer *b = le_to_buffer(le);
+
+ lh_next(w->lh, b->block);
+
+ if (atomic_read(&b->hold_count))
+ return ER_DONT_EVICT;
+
+ return w->pred(b, w->context);
+}
+
+static struct dm_buffer *__cache_evict(struct dm_buffer_cache *bc, int list_mode,
+ b_predicate pred, void *context,
+ struct lock_history *lh)
+{
+ struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context};
+ struct lru_entry *le;
+ struct dm_buffer *b;
+
+ le = lru_evict(&bc->lru[list_mode], __evict_pred, &w);
+ if (!le)
+ return NULL;
+
+ b = le_to_buffer(le);
+ /* __evict_pred will have locked the appropriate tree. */
+ rb_erase(&b->node, &bc->trees[cache_index(b->block, bc->num_locks)].root);
+
+ return b;
+}
+
+static struct dm_buffer *cache_evict(struct dm_buffer_cache *bc, int list_mode,
+ b_predicate pred, void *context)
+{
+ struct dm_buffer *b;
+ struct lock_history lh;
+
+ lh_init(&lh, bc, true);
+ b = __cache_evict(bc, list_mode, pred, context, &lh);
+ lh_exit(&lh);
+
+ return b;
+}
+
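
A hedged sketch (not part of the patch) of an eviction predicate as used with cache_evict(); the age check and both function names are purely illustrative:

	/* Evict clean buffers that have not been touched for 'max_age' jiffies. */
	static enum evict_result is_old_enough(struct dm_buffer *b, void *context)
	{
		unsigned long max_age = *(unsigned long *)context;

		if (time_after(jiffies, READ_ONCE(b->last_accessed) + max_age))
			return ER_EVICT;

		return ER_DONT_EVICT;
	}

	static struct dm_buffer *evict_one_old_buffer(struct dm_buffer_cache *bc,
						      unsigned long max_age)
	{
		/* Returns the evicted buffer (unlinked from lru and rb-tree), or NULL. */
		return cache_evict(bc, LIST_CLEAN, is_old_enough, &max_age);
	}
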
+/*--------------*/
/*
- * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
+ * Mark a buffer as clean or dirty. Not threadsafe.
*/
-static DEFINE_MUTEX(dm_bufio_clients_lock);
+static void cache_mark(struct dm_buffer_cache *bc, struct dm_buffer *b, int list_mode)
+{
+ cache_write_lock(bc, b->block);
+ if (list_mode != b->list_mode) {
+ lru_remove(&bc->lru[b->list_mode], &b->lru);
+ b->list_mode = list_mode;
+ lru_insert(&bc->lru[b->list_mode], &b->lru);
+ }
+ cache_write_unlock(bc, b->block);
+}
-static struct workqueue_struct *dm_bufio_wq;
-static struct delayed_work dm_bufio_cleanup_old_work;
-static struct work_struct dm_bufio_replacement_work;
+/*--------------*/
+/*
+ * Runs through the lru associated with 'old_mode'; entries that match the
+ * predicate are moved to 'new_mode'. Not threadsafe.
+ */
+static void __cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode,
+ b_predicate pred, void *context, struct lock_history *lh)
+{
+ struct lru_entry *le;
+ struct dm_buffer *b;
+ struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context};
-#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
-static void buffer_record_stack(struct dm_buffer *b)
+ while (true) {
+ le = lru_evict(&bc->lru[old_mode], __evict_pred, &w);
+ if (!le)
+ break;
+
+ b = le_to_buffer(le);
+ b->list_mode = new_mode;
+ lru_insert(&bc->lru[b->list_mode], &b->lru);
+ }
+}
+
+static void cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode,
+ b_predicate pred, void *context)
{
- b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
+ struct lock_history lh;
+
+ lh_init(&lh, bc, true);
+ __cache_mark_many(bc, old_mode, new_mode, pred, context, &lh);
+ lh_exit(&lh);
}
-#endif
+
+/*--------------*/
+
+/*
+ * Iterates through all clean or dirty entries, calling a function for each
+ * entry. The callback may terminate the iteration early. Not threadsafe.
+ */
/*
- *----------------------------------------------------------------
- * A red/black tree acts as an index for all the buffers.
- *----------------------------------------------------------------
+ * Iterator functions should return one of these actions to indicate
+ * how the iteration should proceed.
*/
-static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
+enum it_action {
+ IT_NEXT,
+ IT_COMPLETE,
+};
+
+typedef enum it_action (*iter_fn)(struct dm_buffer *b, void *context);
+
+static void __cache_iterate(struct dm_buffer_cache *bc, int list_mode,
+ iter_fn fn, void *context, struct lock_history *lh)
{
- struct rb_node *n = c->buffer_tree.rb_node;
- struct dm_buffer *b;
+ struct lru *lru = &bc->lru[list_mode];
+ struct lru_entry *le, *first;
- while (n) {
- b = container_of(n, struct dm_buffer, node);
+ if (!lru->cursor)
+ return;
- if (b->block == block)
- return b;
+ first = le = to_le(lru->cursor);
+ do {
+ struct dm_buffer *b = le_to_buffer(le);
- n = block < b->block ? n->rb_left : n->rb_right;
+ lh_next(lh, b->block);
+
+ switch (fn(b, context)) {
+ case IT_NEXT:
+ break;
+
+ case IT_COMPLETE:
+ return;
+ }
+ cond_resched();
+
+ le = to_le(le->list.next);
+ } while (le != first);
+}
+
+static void cache_iterate(struct dm_buffer_cache *bc, int list_mode,
+ iter_fn fn, void *context)
+{
+ struct lock_history lh;
+
+ lh_init(&lh, bc, false);
+ __cache_iterate(bc, list_mode, fn, context, &lh);
+ lh_exit(&lh);
+}
+
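
A hedged sketch (not part of the patch) of an iterator callback for cache_iterate(); count_cb() and count_dirty() are hypothetical helpers:

	static enum it_action count_cb(struct dm_buffer *b, void *context)
	{
		unsigned long *count = context;

		(*count)++;
		return IT_NEXT;		/* keep walking; IT_COMPLETE would stop early */
	}

	static unsigned long count_dirty(struct dm_buffer_cache *bc)
	{
		unsigned long count = 0;

		cache_iterate(bc, LIST_DIRTY, count_cb, &count);
		return count;
	}
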
+/*--------------*/
+
+/*
+ * Passes ownership of the buffer to the cache. Returns false if the
+ * buffer was already present (in which case ownership does not pass),
+ * e.g. because of a race with another thread.
+ *
+ * Holder count should be 1 on insertion.
+ *
+ * Not threadsafe.
+ */
+static bool __cache_insert(struct rb_root *root, struct dm_buffer *b)
+{
+ struct rb_node **new = &root->rb_node, *parent = NULL;
+ struct dm_buffer *found;
+
+ while (*new) {
+ found = container_of(*new, struct dm_buffer, node);
+
+ if (found->block == b->block)
+ return false;
+
+ parent = *new;
+ new = b->block < found->block ?
+ &found->node.rb_left : &found->node.rb_right;
}
- return NULL;
+ rb_link_node(&b->node, parent, new);
+ rb_insert_color(&b->node, root);
+
+ return true;
+}
+
+static bool cache_insert(struct dm_buffer_cache *bc, struct dm_buffer *b)
+{
+ bool r;
+
+ if (WARN_ON_ONCE(b->list_mode >= LIST_SIZE))
+ return false;
+
+ cache_write_lock(bc, b->block);
+ BUG_ON(atomic_read(&b->hold_count) != 1);
+ r = __cache_insert(&bc->trees[cache_index(b->block, bc->num_locks)].root, b);
+ if (r)
+ lru_insert(&bc->lru[b->list_mode], &b->lru);
+ cache_write_unlock(bc, b->block);
+
+ return r;
}
-static struct dm_buffer *__find_next(struct dm_bufio_client *c, sector_t block)
+/*--------------*/
+
+/*
+ * Removes a buffer from the cache; ownership of the buffer passes back to the
+ * caller. Fails unless the hold_count is one (i.e. the caller holds the only
+ * reference).
+ *
+ * Not threadsafe.
+ */
+static bool cache_remove(struct dm_buffer_cache *bc, struct dm_buffer *b)
{
- struct rb_node *n = c->buffer_tree.rb_node;
+ bool r;
+
+ cache_write_lock(bc, b->block);
+
+ if (atomic_read(&b->hold_count) != 1) {
+ r = false;
+ } else {
+ r = true;
+ rb_erase(&b->node, &bc->trees[cache_index(b->block, bc->num_locks)].root);
+ lru_remove(&bc->lru[b->list_mode], &b->lru);
+ }
+
+ cache_write_unlock(bc, b->block);
+
+ return r;
+}
+
+/*--------------*/
+
+typedef void (*b_release)(struct dm_buffer *);
+
+static struct dm_buffer *__find_next(struct rb_root *root, sector_t block)
+{
+ struct rb_node *n = root->rb_node;
struct dm_buffer *b;
struct dm_buffer *best = NULL;
@@ -304,35 +882,188 @@ static struct dm_buffer *__find_next(struct dm_bufio_client *c, sector_t block)
return best;
}
-static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
+static void __remove_range(struct dm_buffer_cache *bc,
+ struct rb_root *root,
+ sector_t begin, sector_t end,
+ b_predicate pred, b_release release)
{
- struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
- struct dm_buffer *found;
+ struct dm_buffer *b;
- while (*new) {
- found = container_of(*new, struct dm_buffer, node);
+ while (true) {
+ cond_resched();
- if (found->block == b->block) {
- BUG_ON(found != b);
- return;
+ b = __find_next(root, begin);
+ if (!b || (b->block >= end))
+ break;
+
+ begin = b->block + 1;
+
+ if (atomic_read(&b->hold_count))
+ continue;
+
+ if (pred(b, NULL) == ER_EVICT) {
+ rb_erase(&b->node, root);
+ lru_remove(&bc->lru[b->list_mode], &b->lru);
+ release(b);
}
+ }
+}
- parent = *new;
- new = b->block < found->block ?
- &found->node.rb_left : &found->node.rb_right;
+static void cache_remove_range(struct dm_buffer_cache *bc,
+ sector_t begin, sector_t end,
+ b_predicate pred, b_release release)
+{
+ unsigned int i;
+
+ for (i = 0; i < bc->num_locks; i++) {
+ down_write(&bc->trees[i].lock);
+ __remove_range(bc, &bc->trees[i].root, begin, end, pred, release);
+ up_write(&bc->trees[i].lock);
}
+}
- rb_link_node(&b->node, parent, new);
- rb_insert_color(&b->node, &c->buffer_tree);
+/*----------------------------------------------------------------*/
+
+/*
+ * Linking of buffers:
+ * All buffers are linked to buffer_cache with their node field.
+ *
+ * Clean buffers that are not being written (B_WRITING not set)
+ * are linked to lru[LIST_CLEAN] with their lru field.
+ *
+ * Dirty and clean buffers that are being written are linked to
+ * lru[LIST_DIRTY] with their lru field. When the write
+ * finishes, the buffer cannot be relinked immediately (because we
+ * are in an interrupt context and relinking requires process
+ * context), so some clean-not-writing buffers can be held on
+ * dirty_lru too. They are later added to lru in the process
+ * context.
+ */
+struct dm_bufio_client {
+ struct block_device *bdev;
+ unsigned int block_size;
+ s8 sectors_per_block_bits;
+
+ bool no_sleep;
+ struct mutex lock;
+ spinlock_t spinlock;
+
+ int async_write_error;
+
+ void (*alloc_callback)(struct dm_buffer *buf);
+ void (*write_callback)(struct dm_buffer *buf);
+ struct kmem_cache *slab_buffer;
+ struct kmem_cache *slab_cache;
+ struct dm_io_client *dm_io;
+
+ struct list_head reserved_buffers;
+ unsigned int need_reserved_buffers;
+
+ unsigned int minimum_buffers;
+
+ sector_t start;
+
+ struct shrinker shrinker;
+ struct work_struct shrink_work;
+ atomic_long_t need_shrink;
+
+ wait_queue_head_t free_buffer_wait;
+
+ struct list_head client_list;
+
+ /*
+ * Used by global_cleanup to sort the clients list.
+ */
+ unsigned long oldest_buffer;
+
+ struct dm_buffer_cache cache; /* must be last member */
+};
+
+static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled);
+
+/*----------------------------------------------------------------*/
+
+#define dm_bufio_in_request() (!!current->bio_list)
+
+static void dm_bufio_lock(struct dm_bufio_client *c)
+{
+ if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
+ spin_lock_bh(&c->spinlock);
+ else
+ mutex_lock_nested(&c->lock, dm_bufio_in_request());
}
-static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
+static void dm_bufio_unlock(struct dm_bufio_client *c)
{
- rb_erase(&b->node, &c->buffer_tree);
+ if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
+ spin_unlock_bh(&c->spinlock);
+ else
+ mutex_unlock(&c->lock);
}
/*----------------------------------------------------------------*/
+/*
+ * Default cache size: available memory divided by the ratio.
+ */
+static unsigned long dm_bufio_default_cache_size;
+
+/*
+ * Total cache size set by the user.
+ */
+static unsigned long dm_bufio_cache_size;
+
+/*
+ * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
+ * at any time. If it disagrees, the user has changed cache size.
+ */
+static unsigned long dm_bufio_cache_size_latch;
+
+static DEFINE_SPINLOCK(global_spinlock);
+
+/*
+ * Buffers are freed after this timeout
+ */
+static unsigned int dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
+static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
+
+static unsigned long dm_bufio_peak_allocated;
+static unsigned long dm_bufio_allocated_kmem_cache;
+static unsigned long dm_bufio_allocated_get_free_pages;
+static unsigned long dm_bufio_allocated_vmalloc;
+static unsigned long dm_bufio_current_allocated;
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The current number of clients.
+ */
+static int dm_bufio_client_count;
+
+/*
+ * The list of all clients.
+ */
+static LIST_HEAD(dm_bufio_all_clients);
+
+/*
+ * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
+ */
+static DEFINE_MUTEX(dm_bufio_clients_lock);
+
+static struct workqueue_struct *dm_bufio_wq;
+static struct delayed_work dm_bufio_cleanup_old_work;
+static struct work_struct dm_bufio_replacement_work;
+
+
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+static void buffer_record_stack(struct dm_buffer *b)
+{
+ b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
+}
+#endif
+
+/*----------------------------------------------------------------*/
+
static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
{
unsigned char data_mode;
@@ -358,16 +1089,9 @@ static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
dm_bufio_peak_allocated = dm_bufio_current_allocated;
- b->accessed = 1;
-
if (!unlink) {
- list_add(&b->global_list, &global_queue);
- global_num++;
if (dm_bufio_current_allocated > dm_bufio_cache_size)
queue_work(dm_bufio_wq, &dm_bufio_replacement_work);
- } else {
- list_del(&b->global_list);
- global_num--;
}
spin_unlock(&global_spinlock);
@@ -378,8 +1102,10 @@ static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
*/
static void __cache_size_refresh(void)
{
- BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
- BUG_ON(dm_bufio_client_count < 0);
+ if (WARN_ON(!mutex_is_locked(&dm_bufio_clients_lock)))
+ return;
+ if (WARN_ON(dm_bufio_client_count < 0))
+ return;
dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);
@@ -408,7 +1134,7 @@ static void __cache_size_refresh(void)
* If the allocation may fail we use __get_free_pages. Memory fragmentation
* won't have a fatal effect here, but it just causes flushes of some other
* buffers and more I/O will be performed. Don't use __get_free_pages if it
- * always fails (i.e. order >= MAX_ORDER).
+ * always fails (i.e. order > MAX_ORDER).
*
* If the allocation shouldn't fail we use __vmalloc. This is only for the
* initial reserve allocation, so there's no risk of wasting all vmalloc
@@ -495,6 +1221,7 @@ static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
kmem_cache_free(c->slab_buffer, b);
return NULL;
}
+ adjust_total_allocated(b, false);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
b->stack_len = 0;
@@ -509,62 +1236,12 @@ static void free_buffer(struct dm_buffer *b)
{
struct dm_bufio_client *c = b->c;
+ adjust_total_allocated(b, true);
free_buffer_data(c, b->data, b->data_mode);
kmem_cache_free(c->slab_buffer, b);
}
/*
- * Link buffer to the buffer tree and clean or dirty queue.
- */
-static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
-{
- struct dm_bufio_client *c = b->c;
-
- c->n_buffers[dirty]++;
- b->block = block;
- b->list_mode = dirty;
- list_a