summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/adfs/inode.c1
-rw-r--r--fs/affs/file.c2
-rw-r--r--fs/bfs/file.c1
-rw-r--r--fs/binfmt_aout.c4
-rw-r--r--fs/binfmt_elf.c2
-rw-r--r--fs/binfmt_elf_fdpic.c11
-rw-r--r--fs/binfmt_flat.c2
-rw-r--r--fs/block_dev.c1
-rw-r--r--fs/buffer.c25
-rw-r--r--fs/configfs/inode.c8
-rw-r--r--fs/dax.c3
-rw-r--r--fs/ecryptfs/mmap.c13
-rw-r--r--fs/exfat/inode.c1
-rw-r--r--fs/ext2/inode.c4
-rw-r--r--fs/ext4/inode.c2
-rw-r--r--fs/fat/inode.c1
-rw-r--r--fs/fs-writeback.c332
-rw-r--r--fs/fuse/dax.c3
-rw-r--r--fs/gfs2/aops.c2
-rw-r--r--fs/gfs2/meta_io.c2
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/hpfs/file.c1
-rw-r--r--fs/iomap/buffered-io.c27
-rw-r--r--fs/jfs/inode.c1
-rw-r--r--fs/kernfs/inode.c8
-rw-r--r--fs/libfs.c44
-rw-r--r--fs/minix/inode.c1
-rw-r--r--fs/nilfs2/mdt.c1
-rw-r--r--fs/ntfs/inode.c2
-rw-r--r--fs/ocfs2/aops.c4
-rw-r--r--fs/ocfs2/cluster/heartbeat.c7
-rw-r--r--fs/ocfs2/cluster/nodemanager.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c2
-rw-r--r--fs/ocfs2/filecheck.c6
-rw-r--r--fs/ocfs2/stackglue.c8
-rw-r--r--fs/omfs/file.c1
-rw-r--r--fs/proc/task_mmu.c2
-rw-r--r--fs/ramfs/inode.c9
-rw-r--r--fs/squashfs/block.c5
-rw-r--r--fs/squashfs/squashfs_fs_sb.h1
-rw-r--r--fs/squashfs/super.c86
-rw-r--r--fs/sysv/itree.c1
-rw-r--r--fs/udf/file.c1
-rw-r--r--fs/udf/inode.c1
-rw-r--r--fs/ufs/inode.c1
-rw-r--r--fs/xfs/xfs_aops.c4
-rw-r--r--fs/zonefs/super.c4
48 files changed, 410 insertions, 244 deletions
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index fb7ee026d101..adbb3a1edcbf 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -73,6 +73,7 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
}
static const struct address_space_operations adfs_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = adfs_readpage,
.writepage = adfs_writepage,
.write_begin = adfs_write_begin,
diff --git a/fs/affs/file.c b/fs/affs/file.c
index d91b0133d95d..75ebd2b576ca 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -453,6 +453,7 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations affs_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = affs_readpage,
.writepage = affs_writepage,
.write_begin = affs_write_begin,
@@ -833,6 +834,7 @@ err_bh:
}
const struct address_space_operations affs_aops_ofs = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = affs_readpage_ofs,
//.writepage = affs_writepage_ofs,
.write_begin = affs_write_begin_ofs,
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 0dceefc54b48..7f8544abf636 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -188,6 +188,7 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations bfs_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = bfs_readpage,
.writepage = bfs_writepage,
.write_begin = bfs_write_begin,
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 3e84e9bb9084..145917f734fe 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -222,7 +222,7 @@ static int load_aout_binary(struct linux_binprm * bprm)
error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
PROT_READ | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
+ MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
fd_offset);
if (error != N_TXTADDR(ex))
@@ -230,7 +230,7 @@ static int load_aout_binary(struct linux_binprm * bprm)
error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
+ MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
fd_offset + ex.a_text);
if (error != N_DATADDR(ex))
return error;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 3d73cbb439fa..439ed81e755a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1070,7 +1070,7 @@ out_free_interp:
elf_prot = make_prot(elf_ppnt->p_flags, &arch_state,
!!interpreter, false);
- elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;
+ elf_flags = MAP_PRIVATE | MAP_DENYWRITE;
vaddr = elf_ppnt->p_vaddr;
/*
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index ab9c31ddffda..cf4028487dcc 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -928,7 +928,7 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
{
struct elf32_fdpic_loadseg *seg;
struct elf32_phdr *phdr;
- unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0, mflags;
+ unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0;
int loop, ret;
load_addr = params->load_addr;
@@ -948,12 +948,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
}
/* allocate one big anon block for everything */
- mflags = MAP_PRIVATE;
- if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE)
- mflags |= MAP_EXECUTABLE;
-
maddr = vm_mmap(NULL, load_addr, top - base,
- PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0);
+ PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE, 0);
if (IS_ERR_VALUE(maddr))
return (int) maddr;
@@ -1046,9 +1042,6 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
if (phdr->p_flags & PF_X) prot |= PROT_EXEC;
flags = MAP_PRIVATE | MAP_DENYWRITE;
- if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE)
- flags |= MAP_EXECUTABLE;
-
maddr = 0;
switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) {
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index a1072c6a2341..5d776f80ee50 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -573,7 +573,7 @@ static int load_flat_file(struct linux_binprm *bprm,
pr_debug("ROM mapping of file (we hope)\n");
textpos = vm_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC,
- MAP_PRIVATE|MAP_EXECUTABLE, 0);
+ MAP_PRIVATE, 0);
if (!textpos || IS_ERR_VALUE(textpos)) {
ret = textpos;
if (!textpos)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6cc4d4cfe0c2..eb34f5c357cf 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1754,6 +1754,7 @@ static int blkdev_writepages(struct address_space *mapping,
}
static const struct address_space_operations def_blk_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = blkdev_readpage,
.readahead = blkdev_readahead,
.writepage = blkdev_writepage,
diff --git a/fs/buffer.c b/fs/buffer.c
index ea48c01fb76b..6290c3afdba4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -589,31 +589,6 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
EXPORT_SYMBOL(mark_buffer_dirty_inode);
/*
- * Mark the page dirty, and set it dirty in the page cache, and mark the inode
- * dirty.
- *
- * If warn is true, then emit a warning if the page is not uptodate and has
- * not been truncated.
- *
- * The caller must hold lock_page_memcg().
- */
-void __set_page_dirty(struct page *page, struct address_space *mapping,
- int warn)
-{
- unsigned long flags;
-
- xa_lock_irqsave(&mapping->i_pages, flags);
- if (page->mapping) { /* Race with truncate? */
- WARN_ON_ONCE(warn && !PageUptodate(page));
- account_page_dirtied(page, mapping);
- __xa_set_mark(&mapping->i_pages, page_index(page),
- PAGECACHE_TAG_DIRTY);
- }
- xa_unlock_irqrestore(&mapping->i_pages, flags);
-}
-EXPORT_SYMBOL_GPL(__set_page_dirty);
-
-/*
* Add a page to the dirty page list.
*
* It is a sad fact of life that this function is called from several places
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index eb5ec3e46283..b601610e9907 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -28,12 +28,6 @@
static struct lock_class_key default_group_class[MAX_LOCK_DEPTH];
#endif
-static const struct address_space_operations configfs_aops = {
- .readpage = simple_readpage,
- .write_begin = simple_write_begin,
- .write_end = simple_write_end,
-};
-
static const struct inode_operations configfs_inode_operations ={
.setattr = configfs_setattr,
};
@@ -114,7 +108,7 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd,
struct inode * inode = new_inode(s);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_mapping->a_ops = &configfs_aops;
+ inode->i_mapping->a_ops = &ram_aops;
inode->i_op = &configfs_inode_operations;
if (sd->s_iattr) {
diff --git a/fs/dax.c b/fs/dax.c
index 62352cbcf0f4..da41f9363568 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -488,10 +488,11 @@ static void *grab_mapping_entry(struct xa_state *xas,
struct address_space *mapping, unsigned int order)
{
unsigned long index = xas->xa_index;
- bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */
+ bool pmd_downgrade; /* splitting PMD entry into PTE entries? */
void *entry;
retry:
+ pmd_downgrade = false;
xas_lock_irq(xas);
entry = get_unlocked_entry(xas, order);
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 392e721b50a3..7d85e64ea62f 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -533,7 +533,20 @@ static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block)
return block;
}
+#include <linux/buffer_head.h>
+
const struct address_space_operations ecryptfs_aops = {
+ /*
+ * XXX: This is pretty broken for multiple reasons: ecryptfs does not
+ * actually use buffer_heads, and ecryptfs will crash without
+ * CONFIG_BLOCK. But it matches the behavior before the default for
+ * address_space_operations without the ->set_page_dirty method was
+ * cleaned up, so this is the best we can do without maintainer
+ * feedback.
+ */
+#ifdef CONFIG_BLOCK
+ .set_page_dirty = __set_page_dirty_buffers,
+#endif
.writepage = ecryptfs_writepage,
.readpage = ecryptfs_readpage,
.write_begin = ecryptfs_write_begin,
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 1803ef3220fd..ca37d4344361 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -491,6 +491,7 @@ int exfat_block_truncate_page(struct inode *inode, loff_t from)
}
static const struct address_space_operations exfat_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = exfat_readpage,
.readahead = exfat_readahead,
.writepage = exfat_writepage,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 68178b2234bd..dadb121beb22 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -961,6 +961,7 @@ ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc
}
const struct address_space_operations ext2_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = ext2_readpage,
.readahead = ext2_readahead,
.writepage = ext2_writepage,
@@ -975,6 +976,7 @@ const struct address_space_operations ext2_aops = {
};
const struct address_space_operations ext2_nobh_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = ext2_readpage,
.readahead = ext2_readahead,
.writepage = ext2_nobh_writepage,
@@ -990,7 +992,7 @@ const struct address_space_operations ext2_nobh_aops = {
static const struct address_space_operations ext2_dax_aops = {
.writepages = ext2_dax_writepages,
.direct_IO = noop_direct_IO,
- .set_page_dirty = noop_set_page_dirty,
+ .set_page_dirty = __set_page_dirty_no_writeback,
.invalidatepage = noop_invalidatepage,
};
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index fe6045a46599..b8170a008590 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3701,7 +3701,7 @@ static const struct address_space_operations ext4_da_aops = {
static const struct address_space_operations ext4_dax_aops = {
.writepages = ext4_dax_writepages,
.direct_IO = noop_direct_IO,
- .set_page_dirty = noop_set_page_dirty,
+ .set_page_dirty = __set_page_dirty_no_writeback,
.bmap = ext4_bmap,
.invalidatepage = noop_invalidatepage,
.swap_activate = ext4_iomap_swap_activate,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index bab9b202b496..de0c9b013a85 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -342,6 +342,7 @@ int fat_block_truncate_page(struct inode *inode, loff_t from)
}
static const struct address_space_operations fat_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = fat_readpage,
.readahead = fat_readahead,
.writepage = fat_writepage,
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e91980f49388..62193106683d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -131,25 +131,6 @@ static bool inode_io_list_move_locked(struct inode *inode,
return false;
}
-/**
- * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
- * @inode: inode to be removed
- * @wb: bdi_writeback @inode is being removed from
- *
- * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
- * clear %WB_has_dirty_io if all are empty afterwards.
- */
-static void inode_io_list_del_locked(struct inode *inode,
- struct bdi_writeback *wb)
-{
- assert_spin_locked(&wb->list_lock);
- assert_spin_locked(&inode->i_lock);
-
- inode->i_state &= ~I_SYNC_QUEUED;
- list_del_init(&inode->i_io_list);
- wb_io_lists_depopulated(wb);
-}
-
static void wb_wakeup(struct bdi_writeback *wb)
{
spin_lock_bh(&wb->work_lock);
@@ -244,6 +225,13 @@ void wb_wait_for_completion(struct wb_completion *done)
/* one round can affect upto 5 slots */
#define WB_FRN_MAX_IN_FLIGHT 1024 /* don't queue too many concurrently */
+/*
+ * Maximum inodes per isw. A specific value has been chosen to make
+ * struct inode_switch_wbs_context fit into 1024 bytes kmalloc.
+ */
+#define WB_MAX_INODES_PER_ISW ((1024UL - sizeof(struct inode_switch_wbs_context)) \
+ / sizeof(struct inode *))
+
static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
static struct workqueue_struct *isw_wq;
@@ -279,6 +267,28 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
EXPORT_SYMBOL_GPL(__inode_attach_wb);
/**
+ * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
+ * @inode: inode of interest with i_lock held
+ * @wb: target bdi_writeback
+ *
+ * Remove the inode from wb's io lists and if necessarily put onto b_attached
+ * list. Only inodes attached to cgwb's are kept on this list.
+ */
+static void inode_cgwb_move_to_attached(struct inode *inode,
+ struct bdi_writeback *wb)
+{
+ assert_spin_locked(&wb->list_lock);
+ assert_spin_locked(&inode->i_lock);
+
+ inode->i_state &= ~I_SYNC_QUEUED;
+ if (wb != &wb->bdi->wb)
+ list_move(&inode->i_io_list, &wb->b_attached);
+ else
+ list_del_init(&inode->i_io_list);
+ wb_io_lists_depopulated(wb);
+}
+
+/**
* locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
* @inode: inode of interest with i_lock held
*
@@ -332,11 +342,18 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
}
struct inode_switch_wbs_context {
- struct inode *inode;
- struct bdi_writeback *new_wb;
+ struct rcu_work work;
- struct rcu_head rcu_head;
- struct work_struct work;
+ /*
+ * Multiple inodes can be switched at once. The switching procedure
+ * consists of two parts, separated by a RCU grace period. To make
+ * sure that the second part is executed for each inode gone through
+ * the first part, all inode pointers are placed into a NULL-terminated
+ * array embedded into struct inode_switch_wbs_context. Otherwise
+ * an inode could be left in a non-consistent state.
+ */
+ struct bdi_writeback *new_wb;
+ struct inode *inodes[];
};
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
@@ -349,50 +366,23 @@ static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
up_write(&bdi->wb_switch_rwsem);
}
-static void inode_switch_wbs_work_fn(struct work_struct *work)
+static bool inode_do_switch_wbs(struct inode *inode,
+ struct bdi_writeback *old_wb,
+ struct bdi_writeback *new_wb)
{
- struct inode_switch_wbs_context *isw =
- container_of(work, struct inode_switch_wbs_context, work);
- struct inode *inode = isw->inode;
- struct backing_dev_info *bdi = inode_to_bdi(inode);
struct address_space *mapping = inode->i_mapping;
- struct bdi_writeback *old_wb = inode->i_wb;
- struct bdi_writeback *new_wb = isw->new_wb;
XA_STATE(xas, &mapping->i_pages, 0);
struct page *page;
bool switched = false;
- /*
- * If @inode switches cgwb membership while sync_inodes_sb() is
- * being issued, sync_inodes_sb() might miss it. Synchronize.
- */
- down_read(&bdi->wb_switch_rwsem);
-
- /*
- * By the time control reaches here, RCU grace period has passed
- * since I_WB_SWITCH assertion and all wb stat update transactions
- * between unlocked_inode_to_wb_begin/end() are guaranteed to be
- * synchronizing against the i_pages lock.
- *
- * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
- * gives us exclusion against all wb related operations on @inode
- * including IO list manipulations and stat updates.
- */
- if (old_wb < new_wb) {
- spin_lock(&old_wb->list_lock);
- spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
- } else {
- spin_lock(&new_wb->list_lock);
- spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
- }
spin_lock(&inode->i_lock);
xa_lock_irq(&mapping->i_pages);
/*
- * Once I_FREEING is visible under i_lock, the eviction path owns
- * the inode and we shouldn't modify ->i_io_list.
+ * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction
+ * path owns the inode and we shouldn't modify ->i_io_list.
*/
- if (unlikely(inode->i_state & I_FREEING))
+ if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE)))
goto skip_switch;
trace_inode_switch_wbs(inode, old_wb, new_wb);
@@ -419,21 +409,28 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
wb_get(new_wb);
/*
- * Transfer to @new_wb's IO list if necessary. The specific list
- * @inode was on is ignored and the inode is put on ->b_dirty which
- * is always correct including from ->b_dirty_time. The transfer
- * preserves @inode->dirtied_when ordering.
+ * Transfer to @new_wb's IO list if necessary. If the @inode is dirty,
+ * the specific list @inode was on is ignored and the @inode is put on
+ * ->b_dirty which is always correct including from ->b_dirty_time.
+ * The transfer preserves @inode->dirtied_when ordering. If the @inode
+ * was clean, it means it was on the b_attached list, so move it onto
+ * the b_attached list of @new_wb.
*/
if (!list_empty(&inode->i_io_list)) {
- struct inode *pos;
-
- inode_io_list_del_locked(inode, old_wb);
inode->i_wb = new_wb;
- list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
- if (time_after_eq(inode->dirtied_when,
- pos->dirtied_when))
- break;
- inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
+
+ if (inode->i_state & I_DIRTY_ALL) {
+ struct inode *pos;
+
+ list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
+ if (time_after_eq(inode->dirtied_when,
+ pos->dirtied_when))
+ break;
+ inode_io_list_move_locked(inode, new_wb,
+ pos->i_io_list.prev);
+ } else {
+ inode_cgwb_move_to_attached(inode, new_wb);
+ }
} else {
inode->i_wb = new_wb;
}
@@ -452,31 +449,91 @@ skip_switch:
xa_unlock_irq(&mapping->i_pages);
spin_unlock(&inode->i_lock);
+
+ return switched;
+}
+
+static void inode_switch_wbs_work_fn(struct work_struct *work)
+{
+ struct inode_switch_wbs_context *isw =
+ container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
+ struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
+ struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
+ struct bdi_writeback *new_wb = isw->new_wb;
+ unsigned long nr_switched = 0;
+ struct inode **inodep;
+
+ /*
+ * If @inode switches cgwb membership while sync_inodes_sb() is
+ * being issued, sync_inodes_sb() might miss it. Synchronize.
+ */
+ down_read(&bdi->wb_switch_rwsem);
+
+ /*
+ * By the time control reaches here, RCU grace period has passed
+ * since I_WB_SWITCH assertion and all wb stat update transactions
+ * between unlocked_inode_to_wb_begin/end() are guaranteed to be
+ * synchronizing against the i_pages lock.
+ *
+ * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
+ * gives us exclusion against all wb related operations on @inode
+ * including IO list manipulations and stat updates.
+ */
+ if (old_wb < new_wb) {
+ spin_lock(&old_wb->list_lock);
+ spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock(&new_wb->list_lock);
+ spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
+ }
+
+ for (inodep = isw->inodes; *inodep; inodep++) {
+ WARN_ON_ONCE((*inodep)->i_wb != old_wb);
+ if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
+ nr_switched++;
+ }
+
spin_unlock(&new_wb->list_lock);
spin_unlock(&old_wb->list_lock);
up_read(&bdi->wb_switch_rwsem);
- if (switched) {
+ if (nr_switched) {
wb_wakeup(new_wb);
- wb_put(old_wb);
+ wb_put_many(old_wb, nr_switched);
}
- wb_put(new_wb);
- iput(inode);
+ for (inodep = isw->inodes; *inodep; inodep++)
+ iput(*inodep);
+ wb_put(new_wb);
kfree(isw);
-
atomic_dec(&isw_nr_in_flight);
}
-static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
+static bool inode_prepare_wbs_switch(struct inode *inode,
+ struct bdi_writeback *new_wb)
{
- struct inode_switch_wbs_context *isw = container_of(rcu_head,
- struct inode_switch_wbs_context, rcu_head);
+ /*
+ * Paired with smp_mb() in cgroup_writeback_umount().
+ * isw_nr_in_flight must be increased before checking SB_ACTIVE and
+ * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0
+ * in cgroup_writeback_umount() and the isw_wq will be not flushed.
+ */
+ smp_mb();
+
+ /* while holding I_WB_SWITCH, no one else can update the association */
+ spin_lock(&inode->i_lock);
+ if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
+ inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
+ inode_to_wb(inode) == new_wb) {
+ spin_unlock(&inode->i_lock);
+ return false;
+ }
+ inode->i_state |= I_WB_SWITCH;
+ __iget(inode);
+ spin_unlock(&inode->i_lock);
- /* needs to grab bh-unsafe locks, bounce to work item */
- INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
- queue_work(isw_wq, &isw->work);
+ return true;
}
/**
@@ -501,10 +558,12 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
return;
- isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
+ isw = kzalloc(sizeof(*isw) + 2 * sizeof(struct inode *), GFP_ATOMIC);
if (!isw)
return;
+ atomic_inc(&isw_nr_in_flight);
+
/* find and pin the new wb */
rcu_read_lock();
memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
@@ -514,19 +573,10 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
if (!isw->new_wb)
goto out_free;
- /* while holding I_WB_SWITCH, no one else can update the association */
- spin_lock(&inode->i_lock);
- if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
- inode->i_state & (I_WB_SWITCH | I_FREEING) ||
- inode_to_wb(inode) == isw->new_wb) {
- spin_unlock(&inode->i_lock);
+ if (!inode_prepare_wbs_switch(inode, isw->new_wb))
goto out_free;
- }
- inode->i_state |= I_WB_SWITCH;
- __iget(inode);
- spin_unlock(&inode->i_lock);
- isw->inode = inode;
+ isw->inodes[0] = inode;
/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
@@ -534,18 +584,85 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
* lock so that stat transfer can synchronize against them.
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
*/
- call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
-
- atomic_inc(&isw_nr_in_flight);
+ INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
+ queue_rcu_work(isw_wq, &isw->work);
return;
out_free:
+ atomic_dec(&isw_nr_in_flight);
if (isw->new_wb)
wb_put(isw->new_wb);
kfree(isw);
}
/**
+ * cleanup_offline_cgwb - detach associated inodes
+ * @wb: target wb
+ *
+ * Switch all inodes attached to @wb to a nearest living ancestor's wb in order
+ * to eventually release the dying @wb. Returns %true if not all inodes were
+ * switched and the function has to be restarted.
+ */
+bool cleanup_offline_cgwb(struct bdi_writeback *wb)
+{
+ struct cgroup_subsys_state *memcg_css;
+ struct inode_switch_wbs_context *isw;
+ struct inode *inode;
+ int nr;
+ bool restart = false;
+
+ isw = kzalloc(sizeof(*isw) + WB_MAX_INODES_PER_ISW *
+ sizeof(struct inode *), GFP_KERNEL);
+ if (!isw)
+ return restart;
+
+ atomic_inc(&isw