diff options
Diffstat (limited to 'fs')
48 files changed, 410 insertions, 244 deletions
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index fb7ee026d101..adbb3a1edcbf 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -73,6 +73,7 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block) } static const struct address_space_operations adfs_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = adfs_readpage, .writepage = adfs_writepage, .write_begin = adfs_write_begin, diff --git a/fs/affs/file.c b/fs/affs/file.c index d91b0133d95d..75ebd2b576ca 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -453,6 +453,7 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block) } const struct address_space_operations affs_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = affs_readpage, .writepage = affs_writepage, .write_begin = affs_write_begin, @@ -833,6 +834,7 @@ err_bh: } const struct address_space_operations affs_aops_ofs = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = affs_readpage_ofs, //.writepage = affs_writepage_ofs, .write_begin = affs_write_begin_ofs, diff --git a/fs/bfs/file.c b/fs/bfs/file.c index 0dceefc54b48..7f8544abf636 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -188,6 +188,7 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block) } const struct address_space_operations bfs_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = bfs_readpage, .writepage = bfs_writepage, .write_begin = bfs_write_begin, diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 3e84e9bb9084..145917f734fe 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -222,7 +222,7 @@ static int load_aout_binary(struct linux_binprm * bprm) error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, PROT_READ | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, fd_offset); if (error != N_TXTADDR(ex)) @@ -230,7 +230,7 @@ static int load_aout_binary(struct linux_binprm * bprm) error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, fd_offset + ex.a_text); if (error != N_DATADDR(ex)) return error; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3d73cbb439fa..439ed81e755a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1070,7 +1070,7 @@ out_free_interp: elf_prot = make_prot(elf_ppnt->p_flags, &arch_state, !!interpreter, false); - elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE; + elf_flags = MAP_PRIVATE | MAP_DENYWRITE; vaddr = elf_ppnt->p_vaddr; /* diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index ab9c31ddffda..cf4028487dcc 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -928,7 +928,7 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( { struct elf32_fdpic_loadseg *seg; struct elf32_phdr *phdr; - unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0, mflags; + unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0; int loop, ret; load_addr = params->load_addr; @@ -948,12 +948,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( } /* allocate one big anon block for everything */ - mflags = MAP_PRIVATE; - if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) - mflags |= MAP_EXECUTABLE; - maddr = vm_mmap(NULL, load_addr, top - base, - PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0); + PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE, 0); if (IS_ERR_VALUE(maddr)) return (int) maddr; @@ -1046,9 +1042,6 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, if (phdr->p_flags & PF_X) prot |= PROT_EXEC; flags = MAP_PRIVATE | MAP_DENYWRITE; - if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) - flags |= MAP_EXECUTABLE; - maddr = 0; switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) { diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index a1072c6a2341..5d776f80ee50 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -573,7 +573,7 @@ static int load_flat_file(struct linux_binprm *bprm, pr_debug("ROM mapping of file (we hope)\n"); textpos = vm_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, - MAP_PRIVATE|MAP_EXECUTABLE, 0); + MAP_PRIVATE, 0); if (!textpos || IS_ERR_VALUE(textpos)) { ret = textpos; if (!textpos) diff --git a/fs/block_dev.c b/fs/block_dev.c index 6cc4d4cfe0c2..eb34f5c357cf 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1754,6 +1754,7 @@ static int blkdev_writepages(struct address_space *mapping, } static const struct address_space_operations def_blk_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = blkdev_readpage, .readahead = blkdev_readahead, .writepage = blkdev_writepage, diff --git a/fs/buffer.c b/fs/buffer.c index ea48c01fb76b..6290c3afdba4 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -589,31 +589,6 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) EXPORT_SYMBOL(mark_buffer_dirty_inode); /* - * Mark the page dirty, and set it dirty in the page cache, and mark the inode - * dirty. - * - * If warn is true, then emit a warning if the page is not uptodate and has - * not been truncated. - * - * The caller must hold lock_page_memcg(). - */ -void __set_page_dirty(struct page *page, struct address_space *mapping, - int warn) -{ - unsigned long flags; - - xa_lock_irqsave(&mapping->i_pages, flags); - if (page->mapping) { /* Race with truncate? */ - WARN_ON_ONCE(warn && !PageUptodate(page)); - account_page_dirtied(page, mapping); - __xa_set_mark(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_DIRTY); - } - xa_unlock_irqrestore(&mapping->i_pages, flags); -} -EXPORT_SYMBOL_GPL(__set_page_dirty); - -/* * Add a page to the dirty page list. * * It is a sad fact of life that this function is called from several places diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index eb5ec3e46283..b601610e9907 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -28,12 +28,6 @@ static struct lock_class_key default_group_class[MAX_LOCK_DEPTH]; #endif -static const struct address_space_operations configfs_aops = { - .readpage = simple_readpage, - .write_begin = simple_write_begin, - .write_end = simple_write_end, -}; - static const struct inode_operations configfs_inode_operations ={ .setattr = configfs_setattr, }; @@ -114,7 +108,7 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd, struct inode * inode = new_inode(s); if (inode) { inode->i_ino = get_next_ino(); - inode->i_mapping->a_ops = &configfs_aops; + inode->i_mapping->a_ops = &ram_aops; inode->i_op = &configfs_inode_operations; if (sd->s_iattr) { @@ -488,10 +488,11 @@ static void *grab_mapping_entry(struct xa_state *xas, struct address_space *mapping, unsigned int order) { unsigned long index = xas->xa_index; - bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */ + bool pmd_downgrade; /* splitting PMD entry into PTE entries? */ void *entry; retry: + pmd_downgrade = false; xas_lock_irq(xas); entry = get_unlocked_entry(xas, order); diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 392e721b50a3..7d85e64ea62f 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -533,7 +533,20 @@ static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block) return block; } +#include <linux/buffer_head.h> + const struct address_space_operations ecryptfs_aops = { + /* + * XXX: This is pretty broken for multiple reasons: ecryptfs does not + * actually use buffer_heads, and ecryptfs will crash without + * CONFIG_BLOCK. But it matches the behavior before the default for + * address_space_operations without the ->set_page_dirty method was + * cleaned up, so this is the best we can do without maintainer + * feedback. + */ +#ifdef CONFIG_BLOCK + .set_page_dirty = __set_page_dirty_buffers, +#endif .writepage = ecryptfs_writepage, .readpage = ecryptfs_readpage, .write_begin = ecryptfs_write_begin, diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 1803ef3220fd..ca37d4344361 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -491,6 +491,7 @@ int exfat_block_truncate_page(struct inode *inode, loff_t from) } static const struct address_space_operations exfat_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = exfat_readpage, .readahead = exfat_readahead, .writepage = exfat_writepage, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 68178b2234bd..dadb121beb22 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -961,6 +961,7 @@ ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc } const struct address_space_operations ext2_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = ext2_readpage, .readahead = ext2_readahead, .writepage = ext2_writepage, @@ -975,6 +976,7 @@ const struct address_space_operations ext2_aops = { }; const struct address_space_operations ext2_nobh_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = ext2_readpage, .readahead = ext2_readahead, .writepage = ext2_nobh_writepage, @@ -990,7 +992,7 @@ const struct address_space_operations ext2_nobh_aops = { static const struct address_space_operations ext2_dax_aops = { .writepages = ext2_dax_writepages, .direct_IO = noop_direct_IO, - .set_page_dirty = noop_set_page_dirty, + .set_page_dirty = __set_page_dirty_no_writeback, .invalidatepage = noop_invalidatepage, }; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fe6045a46599..b8170a008590 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3701,7 +3701,7 @@ static const struct address_space_operations ext4_da_aops = { static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, .direct_IO = noop_direct_IO, - .set_page_dirty = noop_set_page_dirty, + .set_page_dirty = __set_page_dirty_no_writeback, .bmap = ext4_bmap, .invalidatepage = noop_invalidatepage, .swap_activate = ext4_iomap_swap_activate, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index bab9b202b496..de0c9b013a85 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -342,6 +342,7 @@ int fat_block_truncate_page(struct inode *inode, loff_t from) } static const struct address_space_operations fat_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = fat_readpage, .readahead = fat_readahead, .writepage = fat_writepage, diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e91980f49388..62193106683d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -131,25 +131,6 @@ static bool inode_io_list_move_locked(struct inode *inode, return false; } -/** - * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list - * @inode: inode to be removed - * @wb: bdi_writeback @inode is being removed from - * - * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and - * clear %WB_has_dirty_io if all are empty afterwards. - */ -static void inode_io_list_del_locked(struct inode *inode, - struct bdi_writeback *wb) -{ - assert_spin_locked(&wb->list_lock); - assert_spin_locked(&inode->i_lock); - - inode->i_state &= ~I_SYNC_QUEUED; - list_del_init(&inode->i_io_list); - wb_io_lists_depopulated(wb); -} - static void wb_wakeup(struct bdi_writeback *wb) { spin_lock_bh(&wb->work_lock); @@ -244,6 +225,13 @@ void wb_wait_for_completion(struct wb_completion *done) /* one round can affect upto 5 slots */ #define WB_FRN_MAX_IN_FLIGHT 1024 /* don't queue too many concurrently */ +/* + * Maximum inodes per isw. A specific value has been chosen to make + * struct inode_switch_wbs_context fit into 1024 bytes kmalloc. + */ +#define WB_MAX_INODES_PER_ISW ((1024UL - sizeof(struct inode_switch_wbs_context)) \ + / sizeof(struct inode *)) + static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); static struct workqueue_struct *isw_wq; @@ -279,6 +267,28 @@ void __inode_attach_wb(struct inode *inode, struct page *page) EXPORT_SYMBOL_GPL(__inode_attach_wb); /** + * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list + * @inode: inode of interest with i_lock held + * @wb: target bdi_writeback + * + * Remove the inode from wb's io lists and if necessarily put onto b_attached + * list. Only inodes attached to cgwb's are kept on this list. + */ +static void inode_cgwb_move_to_attached(struct inode *inode, + struct bdi_writeback *wb) +{ + assert_spin_locked(&wb->list_lock); + assert_spin_locked(&inode->i_lock); + + inode->i_state &= ~I_SYNC_QUEUED; + if (wb != &wb->bdi->wb) + list_move(&inode->i_io_list, &wb->b_attached); + else + list_del_init(&inode->i_io_list); + wb_io_lists_depopulated(wb); +} + +/** * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it * @inode: inode of interest with i_lock held * @@ -332,11 +342,18 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) } struct inode_switch_wbs_context { - struct inode *inode; - struct bdi_writeback *new_wb; + struct rcu_work work; - struct rcu_head rcu_head; - struct work_struct work; + /* + * Multiple inodes can be switched at once. The switching procedure + * consists of two parts, separated by a RCU grace period. To make + * sure that the second part is executed for each inode gone through + * the first part, all inode pointers are placed into a NULL-terminated + * array embedded into struct inode_switch_wbs_context. Otherwise + * an inode could be left in a non-consistent state. + */ + struct bdi_writeback *new_wb; + struct inode *inodes[]; }; static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) @@ -349,50 +366,23 @@ static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) up_write(&bdi->wb_switch_rwsem); } -static void inode_switch_wbs_work_fn(struct work_struct *work) +static bool inode_do_switch_wbs(struct inode *inode, + struct bdi_writeback *old_wb, + struct bdi_writeback *new_wb) { - struct inode_switch_wbs_context *isw = - container_of(work, struct inode_switch_wbs_context, work); - struct inode *inode = isw->inode; - struct backing_dev_info *bdi = inode_to_bdi(inode); struct address_space *mapping = inode->i_mapping; - struct bdi_writeback *old_wb = inode->i_wb; - struct bdi_writeback *new_wb = isw->new_wb; XA_STATE(xas, &mapping->i_pages, 0); struct page *page; bool switched = false; - /* - * If @inode switches cgwb membership while sync_inodes_sb() is - * being issued, sync_inodes_sb() might miss it. Synchronize. - */ - down_read(&bdi->wb_switch_rwsem); - - /* - * By the time control reaches here, RCU grace period has passed - * since I_WB_SWITCH assertion and all wb stat update transactions - * between unlocked_inode_to_wb_begin/end() are guaranteed to be - * synchronizing against the i_pages lock. - * - * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock - * gives us exclusion against all wb related operations on @inode - * including IO list manipulations and stat updates. - */ - if (old_wb < new_wb) { - spin_lock(&old_wb->list_lock); - spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); - } else { - spin_lock(&new_wb->list_lock); - spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); - } spin_lock(&inode->i_lock); xa_lock_irq(&mapping->i_pages); /* - * Once I_FREEING is visible under i_lock, the eviction path owns - * the inode and we shouldn't modify ->i_io_list. + * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction + * path owns the inode and we shouldn't modify ->i_io_list. */ - if (unlikely(inode->i_state & I_FREEING)) + if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE))) goto skip_switch; trace_inode_switch_wbs(inode, old_wb, new_wb); @@ -419,21 +409,28 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) wb_get(new_wb); /* - * Transfer to @new_wb's IO list if necessary. The specific list - * @inode was on is ignored and the inode is put on ->b_dirty which - * is always correct including from ->b_dirty_time. The transfer - * preserves @inode->dirtied_when ordering. + * Transfer to @new_wb's IO list if necessary. If the @inode is dirty, + * the specific list @inode was on is ignored and the @inode is put on + * ->b_dirty which is always correct including from ->b_dirty_time. + * The transfer preserves @inode->dirtied_when ordering. If the @inode + * was clean, it means it was on the b_attached list, so move it onto + * the b_attached list of @new_wb. */ if (!list_empty(&inode->i_io_list)) { - struct inode *pos; - - inode_io_list_del_locked(inode, old_wb); inode->i_wb = new_wb; - list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) - if (time_after_eq(inode->dirtied_when, - pos->dirtied_when)) - break; - inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev); + + if (inode->i_state & I_DIRTY_ALL) { + struct inode *pos; + + list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) + if (time_after_eq(inode->dirtied_when, + pos->dirtied_when)) + break; + inode_io_list_move_locked(inode, new_wb, + pos->i_io_list.prev); + } else { + inode_cgwb_move_to_attached(inode, new_wb); + } } else { inode->i_wb = new_wb; } @@ -452,31 +449,91 @@ skip_switch: xa_unlock_irq(&mapping->i_pages); spin_unlock(&inode->i_lock); + + return switched; +} + +static void inode_switch_wbs_work_fn(struct work_struct *work) +{ + struct inode_switch_wbs_context *isw = + container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); + struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); + struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; + struct bdi_writeback *new_wb = isw->new_wb; + unsigned long nr_switched = 0; + struct inode **inodep; + + /* + * If @inode switches cgwb membership while sync_inodes_sb() is + * being issued, sync_inodes_sb() might miss it. Synchronize. + */ + down_read(&bdi->wb_switch_rwsem); + + /* + * By the time control reaches here, RCU grace period has passed + * since I_WB_SWITCH assertion and all wb stat update transactions + * between unlocked_inode_to_wb_begin/end() are guaranteed to be + * synchronizing against the i_pages lock. + * + * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock + * gives us exclusion against all wb related operations on @inode + * including IO list manipulations and stat updates. + */ + if (old_wb < new_wb) { + spin_lock(&old_wb->list_lock); + spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&new_wb->list_lock); + spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); + } + + for (inodep = isw->inodes; *inodep; inodep++) { + WARN_ON_ONCE((*inodep)->i_wb != old_wb); + if (inode_do_switch_wbs(*inodep, old_wb, new_wb)) + nr_switched++; + } + spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); up_read(&bdi->wb_switch_rwsem); - if (switched) { + if (nr_switched) { wb_wakeup(new_wb); - wb_put(old_wb); + wb_put_many(old_wb, nr_switched); } - wb_put(new_wb); - iput(inode); + for (inodep = isw->inodes; *inodep; inodep++) + iput(*inodep); + wb_put(new_wb); kfree(isw); - atomic_dec(&isw_nr_in_flight); } -static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) +static bool inode_prepare_wbs_switch(struct inode *inode, + struct bdi_writeback *new_wb) { - struct inode_switch_wbs_context *isw = container_of(rcu_head, - struct inode_switch_wbs_context, rcu_head); + /* + * Paired with smp_mb() in cgroup_writeback_umount(). + * isw_nr_in_flight must be increased before checking SB_ACTIVE and + * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0 + * in cgroup_writeback_umount() and the isw_wq will be not flushed. + */ + smp_mb(); + + /* while holding I_WB_SWITCH, no one else can update the association */ + spin_lock(&inode->i_lock); + if (!(inode->i_sb->s_flags & SB_ACTIVE) || + inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || + inode_to_wb(inode) == new_wb) { + spin_unlock(&inode->i_lock); + return false; + } + inode->i_state |= I_WB_SWITCH; + __iget(inode); + spin_unlock(&inode->i_lock); - /* needs to grab bh-unsafe locks, bounce to work item */ - INIT_WORK(&isw->work, inode_switch_wbs_work_fn); - queue_work(isw_wq, &isw->work); + return true; } /** @@ -501,10 +558,12 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT) return; - isw = kzalloc(sizeof(*isw), GFP_ATOMIC); + isw = kzalloc(sizeof(*isw) + 2 * sizeof(struct inode *), GFP_ATOMIC); if (!isw) return; + atomic_inc(&isw_nr_in_flight); + /* find and pin the new wb */ rcu_read_lock(); memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys); @@ -514,19 +573,10 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (!isw->new_wb) goto out_free; - /* while holding I_WB_SWITCH, no one else can update the association */ - spin_lock(&inode->i_lock); - if (!(inode->i_sb->s_flags & SB_ACTIVE) || - inode->i_state & (I_WB_SWITCH | I_FREEING) || - inode_to_wb(inode) == isw->new_wb) { - spin_unlock(&inode->i_lock); + if (!inode_prepare_wbs_switch(inode, isw->new_wb)) goto out_free; - } - inode->i_state |= I_WB_SWITCH; - __iget(inode); - spin_unlock(&inode->i_lock); - isw->inode = inode; + isw->inodes[0] = inode; /* * In addition to synchronizing among switchers, I_WB_SWITCH tells @@ -534,18 +584,85 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) * lock so that stat transfer can synchronize against them. * Let's continue after I_WB_SWITCH is guaranteed to be visible. */ - call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); - - atomic_inc(&isw_nr_in_flight); + INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); + queue_rcu_work(isw_wq, &isw->work); return; out_free: + atomic_dec(&isw_nr_in_flight); if (isw->new_wb) wb_put(isw->new_wb); kfree(isw); } /** + * cleanup_offline_cgwb - detach associated inodes + * @wb: target wb + * + * Switch all inodes attached to @wb to a nearest living ancestor's wb in order + * to eventually release the dying @wb. Returns %true if not all inodes were + * switched and the function has to be restarted. + */ +bool cleanup_offline_cgwb(struct bdi_writeback *wb) +{ + struct cgroup_subsys_state *memcg_css; + struct inode_switch_wbs_context *isw; + struct inode *inode; + int nr; + bool restart = false; + + isw = kzalloc(sizeof(*isw) + WB_MAX_INODES_PER_ISW * + sizeof(struct inode *), GFP_KERNEL); + if (!isw) + return restart; + + atomic_inc(&isw |
