author	Linus Torvalds <torvalds@linux-foundation.org>	2021-04-26 13:48:02 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2021-04-26 13:48:02 -0700
commit	55ba0fe059a577fa08f23223991b24564962620f (patch)
tree	f3b4ccfd5105c44c4b398be496c8219a97365e35
parent	2a19866b6e4cf554b57660549d12496ea84aa7d7 (diff)
parent	18bb8bbf13c1839b43c9e09e76d397b753989af2 (diff)
Merge tag 'for-5.13-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
 "The updates this time are mostly stabilization, preparation and minor
  improvements.

  User visible improvements:

   - readahead for send, improving run time of full send by 10% and for
     incremental by 25%

   - make reflinks respect O_SYNC, O_DSYNC and S_SYNC flags

   - export supported sectorsize values in sysfs (currently only page
     size, more once full subpage support lands)

   - more graceful errors and warnings on 32bit systems when logical
     addresses for metadata reach the limit posed by unsigned long in
     page::index
       - error: fail mount if there's a metadata block beyond the limit
       - error: new metadata block would be at unreachable address
       - warn when 5/8th of the limit is reached, for 4K page systems
         it's 10T, for 64K page it's 160T

   - zoned mode
       - relocated zones get reset at the end instead of discard
       - automatic background reclaim of zones that have 75%+ of
         unusable space, the threshold is tunable in sysfs

  Fixes:

   - fsync and tree mod log fixes

   - fix inefficient preemptive reclaim calculations

   - fix exhaustion of the system chunk array due to concurrent
     allocations

   - fix fallback to no compression when racing with remount

   - preemptive fix for dm-crypt on zoned device that does not properly
     advertise zoned support

  Core changes:

   - add inode lock to synchronize mmap and other block updates (e.g.
     deduplication, fallocate, fsync)

   - kmap conversions to the new kmap_local API

   - subpage support (continued)
       - new helpers for page state/extent buffer tracking
       - metadata changes now support read and write

   - error handling throughout relocation call paths

   - many other cleanups and code simplifications"

* tag 'for-5.13-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (112 commits)
  btrfs: zoned: automatically reclaim zones
  btrfs: rename delete_unused_bgs_mutex to reclaim_bgs_lock
  btrfs: zoned: reset zones of relocated block groups
  btrfs: more graceful errors/warnings on 32bit systems when reaching limits
  btrfs: zoned: fix unpaired block group unfreeze during device replace
  btrfs: fix race when picking most recent mod log operation for an old root
  btrfs: fix metadata extent leak after failure to create subvolume
  btrfs: handle remount to no compress during compression
  btrfs: zoned: fail mount if the device does not support zone append
  btrfs: fix race between transaction aborts and fsyncs leading to use-after-free
  btrfs: introduce submit_eb_subpage() to submit a subpage metadata page
  btrfs: make lock_extent_buffer_for_io() to be subpage compatible
  btrfs: introduce write_one_subpage_eb() function
  btrfs: introduce end_bio_subpage_eb_writepage() function
  btrfs: check return value of btrfs_commit_transaction in relocation
  btrfs: do proper error handling in merge_reloc_roots
  btrfs: handle extent corruption with select_one_root properly
  btrfs: cleanup error handling in prepare_to_merge
  btrfs: do not panic in __add_reloc_root
  btrfs: handle __add_reloc_root failures in btrfs_recover_relocation
  ...
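A rough sketch of the automatic zone reclaim decision described above, assuming the sysfs tunable lands as fs_info->bg_reclaim_threshold (a percentage, defaulting to 75) and that per-group unusable bytes are tracked in bg->zone_unusable; the names follow the series but should be treated as illustrative rather than exact:

	/*
	 * Sketch: does this zoned block group have enough unusable space
	 * to be worth reclaiming? Reclaim relocates the remaining live
	 * data and resets the zone.
	 */
	static bool bg_should_reclaim(const struct btrfs_block_group *bg)
	{
		const int thresh = bg->fs_info->bg_reclaim_threshold;

		if (!thresh)
			return false;	/* 0 disables automatic reclaim */
		/* e.g. thresh == 75: reclaim once 75%+ of the group is unusable */
		return div_u64(bg->zone_unusable * 100, bg->length) >= thresh;
	}

Groups that pass such a check are handed to btrfs_mark_bg_to_reclaim(), added in the block-group.c hunks below.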
-rw-r--r--	fs/btrfs/Makefile	2
-rw-r--r--	fs/btrfs/backref.c	33
-rw-r--r--	fs/btrfs/block-group.c	207
-rw-r--r--	fs/btrfs/block-group.h	3
-rw-r--r--	fs/btrfs/btrfs_inode.h	33
-rw-r--r--	fs/btrfs/check-integrity.c	14
-rw-r--r--	fs/btrfs/compression.c	15
-rw-r--r--	fs/btrfs/ctree.c	984
-rw-r--r--	fs/btrfs/ctree.h	80
-rw-r--r--	fs/btrfs/delayed-inode.c	35
-rw-r--r--	fs/btrfs/delayed-ref.c	31
-rw-r--r--	fs/btrfs/disk-io.c	162
-rw-r--r--	fs/btrfs/extent-tree.c	21
-rw-r--r--	fs/btrfs/extent_io.c	439
-rw-r--r--	fs/btrfs/extent_io.h	4
-rw-r--r--	fs/btrfs/file-item.c	1
-rw-r--r--	fs/btrfs/file.c	118
-rw-r--r--	fs/btrfs/free-space-cache.c	9
-rw-r--r--	fs/btrfs/inode.c	125
-rw-r--r--	fs/btrfs/ioctl.c	51
-rw-r--r--	fs/btrfs/lzo.c	9
-rw-r--r--	fs/btrfs/ordered-data.c	19
-rw-r--r--	fs/btrfs/ordered-data.h	4
-rw-r--r--	fs/btrfs/qgroup.c	47
-rw-r--r--	fs/btrfs/raid56.c	70
-rw-r--r--	fs/btrfs/reflink.c	65
-rw-r--r--	fs/btrfs/relocation.c	448
-rw-r--r--	fs/btrfs/scrub.c	13
-rw-r--r--	fs/btrfs/send.c	43
-rw-r--r--	fs/btrfs/space-info.c	4
-rw-r--r--	fs/btrfs/subpage.c	140
-rw-r--r--	fs/btrfs/subpage.h	7
-rw-r--r--	fs/btrfs/super.c	26
-rw-r--r--	fs/btrfs/sysfs.c	50
-rw-r--r--	fs/btrfs/transaction.c	59
-rw-r--r--	fs/btrfs/transaction.h	9
-rw-r--r--	fs/btrfs/tree-checker.c	5
-rw-r--r--	fs/btrfs/tree-log.c	21
-rw-r--r--	fs/btrfs/tree-mod-log.c	929
-rw-r--r--	fs/btrfs/tree-mod-log.h	53
-rw-r--r--	fs/btrfs/volumes.c	123
-rw-r--r--	fs/btrfs/volumes.h	1
-rw-r--r--	fs/btrfs/zoned.c	7
-rw-r--r--	fs/btrfs/zoned.h	6
-rw-r--r--	include/linux/pagemap.h	9
-rw-r--r--	include/trace/events/btrfs.h	12
46 files changed, 2964 insertions, 1582 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index b4fb997eda16..cec88a66bd6c 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -30,7 +30,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
- subpage.o
+ subpage.o tree-mod-log.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f47c1528eb9a..117d423fdb93 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -14,6 +14,7 @@
#include "delayed-ref.h"
#include "locking.h"
#include "misc.h"
+#include "tree-mod-log.h"
/* Just an arbitrary number so we can be sure this happened */
#define BACKREF_FOUND_SHARED 6
@@ -452,7 +453,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
if (path->slots[0] >= btrfs_header_nritems(eb) ||
is_shared_data_backref(preftrees, eb->start) ||
ref->root_id != btrfs_header_owner(eb)) {
- if (time_seq == SEQ_LAST)
+ if (time_seq == BTRFS_SEQ_LAST)
ret = btrfs_next_leaf(root, path);
else
ret = btrfs_next_old_leaf(root, path, time_seq);
@@ -476,7 +477,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
if (slot == 0 &&
(is_shared_data_backref(preftrees, eb->start) ||
ref->root_id != btrfs_header_owner(eb))) {
- if (time_seq == SEQ_LAST)
+ if (time_seq == BTRFS_SEQ_LAST)
ret = btrfs_next_leaf(root, path);
else
ret = btrfs_next_old_leaf(root, path, time_seq);
@@ -514,7 +515,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
eie = NULL;
}
next:
- if (time_seq == SEQ_LAST)
+ if (time_seq == BTRFS_SEQ_LAST)
ret = btrfs_next_item(root, path);
else
ret = btrfs_next_old_item(root, path, time_seq);
@@ -574,7 +575,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
if (path->search_commit_root)
root_level = btrfs_header_level(root->commit_root);
- else if (time_seq == SEQ_LAST)
+ else if (time_seq == BTRFS_SEQ_LAST)
root_level = btrfs_header_level(root->node);
else
root_level = btrfs_old_root_level(root, time_seq);
@@ -605,7 +606,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
search_key.offset >= LLONG_MAX)
search_key.offset = 0;
path->lowest_level = level;
- if (time_seq == SEQ_LAST)
+ if (time_seq == BTRFS_SEQ_LAST)
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
else
ret = btrfs_search_old_slot(root, &search_key, path, time_seq);
@@ -1147,8 +1148,8 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
* indirect refs to their parent bytenr.
* When roots are found, they're added to the roots list
*
- * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave
- * much like trans == NULL case, the difference only lies in it will not
+ * If time_seq is set to BTRFS_SEQ_LAST, it will not search delayed_refs, and
+ * will behave much like the trans == NULL case; the only difference is it will not use the
* commit root.
* The special case is for qgroup to search roots in commit_transaction().
*
@@ -1199,7 +1200,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
path->skip_locking = 1;
}
- if (time_seq == SEQ_LAST)
+ if (time_seq == BTRFS_SEQ_LAST)
path->skip_locking = 1;
/*
@@ -1217,9 +1218,9 @@ again:
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (trans && likely(trans->type != __TRANS_DUMMY) &&
- time_seq != SEQ_LAST) {
+ time_seq != BTRFS_SEQ_LAST) {
#else
- if (trans && time_seq != SEQ_LAST) {
+ if (trans && time_seq != BTRFS_SEQ_LAST) {
#endif
/*
* look if there are updates for this ref queued and lock the
@@ -1527,7 +1528,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
struct btrfs_trans_handle *trans;
struct ulist_iterator uiter;
struct ulist_node *node;
- struct seq_list elem = SEQ_LIST_INIT(elem);
+ struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
int ret = 0;
struct share_check shared = {
.root_objectid = root->root_key.objectid,
@@ -1953,7 +1954,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
struct ulist *roots = NULL;
struct ulist_node *ref_node = NULL;
struct ulist_node *root_node = NULL;
- struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
+ struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem);
struct ulist_iterator ref_uiter;
struct ulist_iterator root_uiter;
@@ -1971,12 +1972,12 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
}
if (trans)
- btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ btrfs_get_tree_mod_seq(fs_info, &seq_elem);
else
down_read(&fs_info->commit_root_sem);
ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
- tree_mod_seq_elem.seq, &refs,
+ seq_elem.seq, &refs,
&extent_item_pos, ignore_offset);
if (ret)
goto out;
@@ -1984,7 +1985,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
ULIST_ITER_INIT(&ref_uiter);
while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val,
- tree_mod_seq_elem.seq, &roots,
+ seq_elem.seq, &roots,
ignore_offset);
if (ret)
break;
@@ -2007,7 +2008,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
free_leaf_list(refs);
out:
if (trans) {
- btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ btrfs_put_tree_mod_seq(fs_info, &seq_elem);
btrfs_end_transaction(trans);
} else {
up_read(&fs_info->commit_root_sem);
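The seq_list → btrfs_seq_list rename above keeps the usage pattern intact: a caller pins a tree mod log sequence number for the duration of a backref walk and releases it afterwards, exactly as iterate_extent_inodes() does in this hunk. A minimal sketch of that pairing (the wrapper function is illustrative):

	static void walk_backrefs_pinned(struct btrfs_fs_info *fs_info)
	{
		struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);

		btrfs_get_tree_mod_seq(fs_info, &elem);	/* pin a mod log sequence */
		/* ... resolve backrefs consistent with elem.seq ... */
		btrfs_put_tree_mod_seq(fs_info, &elem);	/* unpin so old entries can be freed */
	}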
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 744b99ddc28c..aa57bdc8fc89 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1289,7 +1289,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* Long running balances can keep us blocked here for eternity, so
* simply skip deletion if we're unable to get the mutex.
*/
- if (!mutex_trylock(&fs_info->delete_unused_bgs_mutex))
+ if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
return;
spin_lock(&fs_info->unused_bgs_lock);
@@ -1462,12 +1462,12 @@ next:
spin_lock(&fs_info->unused_bgs_lock);
}
spin_unlock(&fs_info->unused_bgs_lock);
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
return;
flip_async:
btrfs_end_transaction(trans);
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_put_block_group(block_group);
btrfs_discard_punt_unused_bgs_list(fs_info);
}
@@ -1485,6 +1485,97 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
spin_unlock(&fs_info->unused_bgs_lock);
}
+void btrfs_reclaim_bgs_work(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info =
+ container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
+ struct btrfs_block_group *bg;
+ struct btrfs_space_info *space_info;
+ int ret;
+
+ if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+ return;
+
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
+ return;
+
+ mutex_lock(&fs_info->reclaim_bgs_lock);
+ spin_lock(&fs_info->unused_bgs_lock);
+ while (!list_empty(&fs_info->reclaim_bgs)) {
+ bg = list_first_entry(&fs_info->reclaim_bgs,
+ struct btrfs_block_group,
+ bg_list);
+ list_del_init(&bg->bg_list);
+
+ space_info = bg->space_info;
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ /* Don't race with allocators so take the groups_sem */
+ down_write(&space_info->groups_sem);
+
+ spin_lock(&bg->lock);
+ if (bg->reserved || bg->pinned || bg->ro) {
+ /*
+ * We want to bail if we made new allocations or have
+ * outstanding allocations in this block group. We do
+ * the ro check in case balance is currently acting on
+ * this block group.
+ */
+ spin_unlock(&bg->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+ spin_unlock(&bg->lock);
+
+ /* Get out fast, in case we're unmounting the filesystem */
+ if (btrfs_fs_closing(fs_info)) {
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+
+ ret = inc_block_group_ro(bg, 0);
+ up_write(&space_info->groups_sem);
+ if (ret < 0)
+ goto next;
+
+ btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used",
+ bg->start, div_u64(bg->used * 100, bg->length));
+ trace_btrfs_reclaim_block_group(bg);
+ ret = btrfs_relocate_chunk(fs_info, bg->start);
+ if (ret)
+ btrfs_err(fs_info, "error relocating chunk %llu",
+ bg->start);
+
+next:
+ btrfs_put_block_group(bg);
+ spin_lock(&fs_info->unused_bgs_lock);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ btrfs_exclop_finish(fs_info);
+}
+
+void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
+{
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (!list_empty(&fs_info->reclaim_bgs))
+ queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
+void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (list_empty(&bg->bg_list)) {
+ btrfs_get_block_group(bg);
+ trace_btrfs_add_reclaim_block_group(bg);
+ list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
struct btrfs_path *path)
{
@@ -2267,29 +2358,33 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
struct btrfs_trans_handle *trans;
u64 alloc_flags;
int ret;
+ bool dirty_bg_running;
-again:
- trans = btrfs_join_transaction(fs_info->extent_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ do {
+ trans = btrfs_join_transaction(fs_info->extent_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- /*
- * we're not allowed to set block groups readonly after the dirty
- * block groups cache has started writing. If it already started,
- * back off and let this transaction commit
- */
- mutex_lock(&fs_info->ro_block_group_mutex);
- if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
- u64 transid = trans->transid;
+ dirty_bg_running = false;
- mutex_unlock(&fs_info->ro_block_group_mutex);
- btrfs_end_transaction(trans);
+ /*
+ * We're not allowed to set block groups readonly after the dirty
+ * block group cache has started writing. If it already started,
+ * back off and let this transaction commit.
+ */
+ mutex_lock(&fs_info->ro_block_group_mutex);
+ if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
+ u64 transid = trans->transid;
- ret = btrfs_wait_for_commit(fs_info, transid);
- if (ret)
- return ret;
- goto again;
- }
+ mutex_unlock(&fs_info->ro_block_group_mutex);
+ btrfs_end_transaction(trans);
+
+ ret = btrfs_wait_for_commit(fs_info, transid);
+ if (ret)
+ return ret;
+ dirty_bg_running = true;
+ }
+ } while (dirty_bg_running);
if (do_chunk_alloc) {
/*
@@ -3269,6 +3364,7 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
*/
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
{
+ struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *info;
u64 left;
@@ -3283,6 +3379,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
lockdep_assert_held(&fs_info->chunk_mutex);
info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+again:
spin_lock(&info->lock);
left = info->total_bytes - btrfs_space_info_used(info, true);
spin_unlock(&info->lock);
@@ -3301,6 +3398,58 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
if (left < thresh) {
u64 flags = btrfs_system_alloc_profile(fs_info);
+ u64 reserved = atomic64_read(&cur_trans->chunk_bytes_reserved);
+
+ /*
+ * If there's not available space for the chunk tree (system
+ * space) and there are other tasks that reserved space for
+ * creating a new system block group, wait for them to complete
+ * the creation of their system block group and release excess
+ * reserved space. We do this because:
+ *
+ * *) We can end up allocating more system chunks than necessary
+ * when there are multiple tasks that are concurrently
+ * allocating block groups, which can lead to exhaustion of
+ * the system array in the superblock;
+ *
+ * *) If we allocate extra and unnecessary system block groups,
+ * despite being empty for a long time, and possibly forever,
+ * they end up not being added to the list of unused block groups
+ * because that typically happens only when deallocating the
+ * last extent from a block group - which never happens since
+ * we never allocate from them in the first place. The few
+ * exceptions are when mounting a filesystem or running scrub,
+ * which add unused block groups to the list of unused block
+ * groups, to be deleted by the cleaner kthread.
+ * And even when they are added to the list of unused block
+ * groups, it can take a long time until they get deleted,
+ * since the cleaner kthread might be sleeping or busy with
+ * other work (deleting subvolumes, running delayed iputs,
+ * defrag scheduling, etc);
+ *
+ * This is rare in practice, but can happen when too many tasks
+ * are allocating block groups in parallel (via fallocate())
+ * and before the one that reserved space for a new system block
+ * group finishes the block group creation and releases the space
+ * reserved in excess (at btrfs_create_pending_block_groups()),
+ * other tasks end up here and find that the free system space is
+ * temporarily not enough for updating the chunk tree.
+ *
+ * We unlock the chunk mutex before waiting for such tasks and
+ * lock it again after the wait, otherwise we would deadlock.
+ * It is safe to do so because allocating a system chunk is the
+ * first thing done while allocating a new block group.
+ */
+ if (reserved > trans->chunk_bytes_reserved) {
+ const u64 min_needed = reserved - thresh;
+
+ mutex_unlock(&fs_info->chunk_mutex);
+ wait_event(cur_trans->chunk_reserve_wait,
+ atomic64_read(&cur_trans->chunk_bytes_reserved) <=
+ min_needed);
+ mutex_lock(&fs_info->chunk_mutex);
+ goto again;
+ }
/*
* Ignore failure to create system chunk. We might end up not
@@ -3315,8 +3464,10 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
ret = btrfs_block_rsv_add(fs_info->chunk_root,
&fs_info->chunk_block_rsv,
thresh, BTRFS_RESERVE_NO_FLUSH);
- if (!ret)
+ if (!ret) {
+ atomic64_add(thresh, &cur_trans->chunk_bytes_reserved);
trans->chunk_bytes_reserved += thresh;
+ }
}
}
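The wait above only works if the task that reserved system space wakes the waiters once its block group creation is done. That release side lives elsewhere in the series (around btrfs_create_pending_block_groups()); a hedged sketch of what it presumably looks like, reusing only fields visible in this hunk:

	/*
	 * Sketch of the release side (assumption: the series does this when
	 * the per-transaction chunk reservation is dropped): subtract this
	 * handle's share and wake anyone blocked in check_system_chunk().
	 */
	atomic64_sub(trans->chunk_bytes_reserved,
		     &trans->transaction->chunk_bytes_reserved);
	wake_up(&trans->transaction->chunk_reserve_wait);
	trans->chunk_bytes_reserved = 0;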
@@ -3386,6 +3537,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
spin_unlock(&info->unused_bgs_lock);
+ spin_lock(&info->unused_bgs_lock);
+ while (!list_empty(&info->reclaim_bgs)) {
+ block_group = list_first_entry(&info->reclaim_bgs,
+ struct btrfs_block_group,
+ bg_list);
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group,
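Taken together, the new reclaim pieces in this file form a simple producer/consumer pair. A sketch of the intended flow, using only the functions added above (the trigger and kick sites are wired up elsewhere in the series; the cleaner kthread as caller is an assumption):

	/* Producer: note a reclaim candidate (takes a block group reference) */
	btrfs_mark_bg_to_reclaim(bg);	/* appends bg to fs_info->reclaim_bgs */

	/* Consumer kick, e.g. from the cleaner kthread (assumption): */
	btrfs_reclaim_bgs(fs_info);	/* queues reclaim_bgs_work if the list is non-empty */

	/*
	 * btrfs_reclaim_bgs_work() then marks each listed group read-only
	 * and relocates it via btrfs_relocate_chunk(), as shown above.
	 */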
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 3ecc3372a5ce..7b927425dc71 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -264,6 +264,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
u64 group_start, struct extent_map *em);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
+void btrfs_reclaim_bgs_work(struct work_struct *work);
+void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
+void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
u64 type, u64 chunk_offset, u64 size);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 28e202e89660..c652e19ad74e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -220,6 +220,7 @@ struct btrfs_inode {
/* Hook into fs_info->delayed_iputs */
struct list_head delayed_iput;
+ struct rw_semaphore i_mmap_lock;
struct inode vfs_inode;
};
@@ -299,24 +300,30 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
mod);
}
-static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
+/*
+ * Called after every buffered, direct IO or memory mapped write.
+ *
+ * This is to ensure that if we write to a file that was previously fsynced in
+ * the current transaction, then try to fsync it again in the same transaction,
+ * we will know that there were changes in the file and that it needs to be
+ * logged.
+ */
+static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
+{
+ spin_lock(&inode->lock);
+ inode->last_sub_trans = inode->root->log_transid;
+ spin_unlock(&inode->lock);
+}
+
+static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
{
- int ret = 0;
+ bool ret = false;
spin_lock(&inode->lock);
if (inode->logged_trans == generation &&
inode->last_sub_trans <= inode->last_log_commit &&
- inode->last_sub_trans <= inode->root->last_log_commit) {
- /*
- * After a ranged fsync we might have left some extent maps
- * (that fall outside the fsync's range). So return false
- * here if the list isn't empty, to make sure btrfs_log_inode()
- * will be called and process those extent maps.
- */
- smp_mb();
- if (list_empty(&inode->extent_tree.modified_extents))
- ret = 1;
- }
+ inode->last_sub_trans <= inode->root->last_log_commit)
+ ret = true;
spin_unlock(&inode->lock);
return ret;
}
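The new helper gives every write path a single place to record "this file changed after its last fsync". A sketch of the intended call sites (the surrounding logic is illustrative; btrfs_inode_in_log() is the check fsync relies on):

	/* At the end of any buffered, direct IO or mmap write: */
	btrfs_set_inode_last_sub_trans(inode);

	/*
	 * Later, in fsync, logging can be skipped when nothing changed
	 * since the last log commit in this transaction:
	 */
	if (btrfs_inode_in_log(inode, fs_info->generation))
		return 0;	/* already fully logged */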
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 113cb85c1fd4..169508609324 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1555,10 +1555,11 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
BUG_ON(!block_ctx->pagev);
num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
PAGE_SHIFT;
+ /* Pages must be unmapped in reverse order */
while (num_pages > 0) {
num_pages--;
if (block_ctx->datav[num_pages]) {
- kunmap(block_ctx->pagev[num_pages]);
+ kunmap_local(block_ctx->datav[num_pages]);
block_ctx->datav[num_pages] = NULL;
}
if (block_ctx->pagev[num_pages]) {
@@ -1637,7 +1638,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
i = j;
}
for (i = 0; i < num_pages; i++)
- block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
+ block_ctx->datav[i] = kmap_local_page(block_ctx->pagev[i]);
return block_ctx->len;
}
@@ -2677,7 +2678,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
if (NULL != dev_state &&
(bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
- unsigned int i = 0;
+ int i = 0;
u64 dev_bytenr;
u64 cur_bytenr;
struct bio_vec bvec;
@@ -2702,7 +2703,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
bio_for_each_segment(bvec, bio, iter) {
BUG_ON(bvec.bv_len != PAGE_SIZE);
- mapped_datav[i] = kmap(bvec.bv_page);
+ mapped_datav[i] = kmap_local_page(bvec.bv_page);
i++;
if (dev_state->state->print_mask &
@@ -2715,8 +2716,9 @@ static void __btrfsic_submit_bio(struct bio *bio)
mapped_datav, segs,
bio, &bio_is_patched,
bio->bi_opf);
- bio_for_each_segment(bvec, bio, iter)
- kunmap(bvec.bv_page);
+ /* Unmap in reverse order */
+ for (--i; i >= 0; i--)
+ kunmap_local(mapped_datav[i]);
kfree(mapped_datav);
} else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
if (dev_state->state->print_mask &
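Both hunks above convert kmap()/kunmap() to the kmap_local API, which is why the unmap order suddenly matters: local mappings are stack-like and must be released in reverse (LIFO) order. In isolation:

	static void copy_between_pages(struct page *dst, struct page *src)
	{
		void *d = kmap_local_page(dst);
		void *s = kmap_local_page(src);

		memcpy(d, s, PAGE_SIZE);

		kunmap_local(s);	/* most recently mapped first */
		kunmap_local(d);
	}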
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 3f4c832abfed..17f93fd28f7e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -80,10 +80,15 @@ static int compression_compress_pages(int type, struct list_head *ws,
case BTRFS_COMPRESS_NONE:
default:
/*
- * This can't happen, the type is validated several times
- * before we get here. As a sane fallback, return what the
- * callers will understand as 'no compression happened'.
+ * This can happen when compression races with a remount that
+ * sets the filesystem to 'no compress', while the caller does
+ * not call inode_need_compress() again to check whether
+ * compression is still needed.
+ *
+ * Not a big deal, we just need to inform the caller that we
+ * have not allocated any pages yet.
*/
+ *out_pages = 0;
return -E2BIG;
}
}
@@ -1611