diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-09-05 20:34:28 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-09-05 20:34:28 -0700 |
| commit | 7d9071a095023cd1db8fa18fa0d648dc1a5210e0 (patch) | |
| tree | 072b462e43912b9dfc321136f3367114dcb8f2b3 | |
| parent | bd779669945ed9982890da789ad32e3bd0d41f14 (diff) | |
| parent | 397d425dc26da728396e66d392d5dcb8dac30c37 (diff) | |
| download | linux-7d9071a095023cd1db8fa18fa0d648dc1a5210e0.tar.gz linux-7d9071a095023cd1db8fa18fa0d648dc1a5210e0.tar.bz2 linux-7d9071a095023cd1db8fa18fa0d648dc1a5210e0.zip | |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull vfs updates from Al Viro:
"In this one:
- d_move fixes (Eric Biederman)
- UFS fixes (me; locking is mostly sane now, a bunch of bugs in error
handling ought to be fixed)
- switch of sb_writers to percpu rwsem (Oleg Nesterov)
- superblock scalability (Josef Bacik and Dave Chinner)
- swapon(2) race fix (Hugh Dickins)"
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (65 commits)
vfs: Test for and handle paths that are unreachable from their mnt_root
dcache: Reduce the scope of i_lock in d_splice_alias
dcache: Handle escaped paths in prepend_path
mm: fix potential data race in SyS_swapon
inode: don't softlockup when evicting inodes
inode: rename i_wb_list to i_io_list
sync: serialise per-superblock sync operations
inode: convert inode_sb_list_lock to per-sb
inode: add hlist_fake to avoid the inode hash lock in evict
writeback: plug writeback at a high level
change sb_writers to use percpu_rw_semaphore
shift percpu_counter_destroy() into destroy_super_work()
percpu-rwsem: kill CONFIG_PERCPU_RWSEM
percpu-rwsem: introduce percpu_rwsem_release() and percpu_rwsem_acquire()
percpu-rwsem: introduce percpu_down_read_trylock()
document rwsem_release() in sb_wait_write()
fix the broken lockdep logic in __sb_start_write()
introduce __sb_writers_{acquired,release}() helpers
ufs_inode_get{frag,block}(): get rid of 'phys' argument
ufs_getfrag_block(): tidy up a bit
...
| -rw-r--r-- | arch/Kconfig | 1 | ||||
| -rw-r--r-- | fs/block_dev.c | 12 | ||||
| -rw-r--r-- | fs/btrfs/transaction.c | 8 | ||||
| -rw-r--r-- | fs/dcache.c | 14 | ||||
| -rw-r--r-- | fs/drop_caches.c | 10 | ||||
| -rw-r--r-- | fs/fs-writeback.c | 72 | ||||
| -rw-r--r-- | fs/inode.c | 50 | ||||
| -rw-r--r-- | fs/internal.h | 3 | ||||
| -rw-r--r-- | fs/namei.c | 27 | ||||
| -rw-r--r-- | fs/notify/inode_mark.c | 20 | ||||
| -rw-r--r-- | fs/quota/dquot.c | 16 | ||||
| -rw-r--r-- | fs/super.c | 175 | ||||
| -rw-r--r-- | fs/ufs/Makefile | 2 | ||||
| -rw-r--r-- | fs/ufs/balloc.c | 4 | ||||
| -rw-r--r-- | fs/ufs/inode.c | 948 | ||||
| -rw-r--r-- | fs/ufs/super.c | 36 | ||||
| -rw-r--r-- | fs/ufs/truncate.c | 523 | ||||
| -rw-r--r-- | fs/ufs/ufs.h | 13 | ||||
| -rw-r--r-- | fs/xfs/xfs_aops.c | 6 | ||||
| -rw-r--r-- | include/linux/fs.h | 33 | ||||
| -rw-r--r-- | include/linux/fsnotify_backend.h | 4 | ||||
| -rw-r--r-- | include/linux/list.h | 5 | ||||
| -rw-r--r-- | include/linux/percpu-rwsem.h | 20 | ||||
| -rw-r--r-- | init/Kconfig | 1 | ||||
| -rw-r--r-- | kernel/locking/Makefile | 3 | ||||
| -rw-r--r-- | kernel/locking/percpu-rwsem.c | 13 | ||||
| -rw-r--r-- | lib/Kconfig | 3 | ||||
| -rw-r--r-- | mm/backing-dev.c | 8 | ||||
| -rw-r--r-- | mm/swapfile.c | 25 |
29 files changed, 936 insertions, 1119 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index a71cdbe2a04d..8f3564930580 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -93,7 +93,6 @@ config KPROBES_ON_FTRACE config UPROBES def_bool n - select PERCPU_RWSEM help Uprobes is the user-space counterpart to kprobes: they enable instrumentation applications (such as 'perf probe') diff --git a/fs/block_dev.c b/fs/block_dev.c index 198243717da5..33b813e04f79 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1769,7 +1769,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) { struct inode *inode, *old_inode = NULL; - spin_lock(&inode_sb_list_lock); + spin_lock(&blockdev_superblock->s_inode_list_lock); list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { struct address_space *mapping = inode->i_mapping; @@ -1781,13 +1781,13 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&blockdev_superblock->s_inode_list_lock); /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the - * inode_sb_list_lock. We cannot iput the inode now as we can + * s_inode_list_lock We cannot iput the inode now as we can * be holding the last reference and we cannot iput it under - * inode_sb_list_lock. So we keep the reference and iput it + * s_inode_list_lock. So we keep the reference and iput it * later. */ iput(old_inode); @@ -1795,8 +1795,8 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) func(I_BDEV(inode), arg); - spin_lock(&inode_sb_list_lock); + spin_lock(&blockdev_superblock->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&blockdev_superblock->s_inode_list_lock); iput(old_inode); } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 68ad89e23713..8f259b3a66b3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1640,9 +1640,7 @@ static void do_async_commit(struct work_struct *work) * Tell lockdep about it. */ if (ac->newtrans->type & __TRANS_FREEZABLE) - rwsem_acquire_read( - &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); + __sb_writers_acquired(ac->root->fs_info->sb, SB_FREEZE_FS); current->journal_info = ac->newtrans; @@ -1681,9 +1679,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, * async commit thread will be the one to unlock it. */ if (ac->newtrans->type & __TRANS_FREEZABLE) - rwsem_release( - &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 1, _THIS_IP_); + __sb_writers_release(root->fs_info->sb, SB_FREEZE_FS); schedule_work(&ac->work); diff --git a/fs/dcache.c b/fs/dcache.c index 9b5fe503f6cb..5c33aeb0f68f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2718,7 +2718,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding - * dentry->d_parent->d_inode->i_mutex, inode->i_lock and rename_lock + * dentry->d_parent->d_inode->i_mutex, and rename_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... @@ -2744,7 +2744,6 @@ out_unalias: __d_move(alias, dentry, false); ret = 0; out_err: - spin_unlock(&inode->i_lock); if (m2) mutex_unlock(m2); if (m1) @@ -2790,10 +2789,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) if (S_ISDIR(inode->i_mode)) { struct dentry *new = __d_find_any_alias(inode); if (unlikely(new)) { + /* The reference to new ensures it remains an alias */ + spin_unlock(&inode->i_lock); write_seqlock(&rename_lock); if (unlikely(d_ancestor(new, dentry))) { write_sequnlock(&rename_lock); - spin_unlock(&inode->i_lock); dput(new); new = ERR_PTR(-ELOOP); pr_warn_ratelimited( @@ -2812,7 +2812,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) } else { __d_move(new, dentry, false); write_sequnlock(&rename_lock); - spin_unlock(&inode->i_lock); security_d_instantiate(new, inode); } iput(inode); @@ -2926,6 +2925,13 @@ restart: if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { struct mount *parent = ACCESS_ONCE(mnt->mnt_parent); + /* Escaped? */ + if (dentry != vfsmnt->mnt_root) { + bptr = *buffer; + blen = *buflen; + error = 3; + break; + } /* Global root? */ if (mnt != parent) { dentry = ACCESS_ONCE(mnt->mnt_mountpoint); diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 5718cb9f7273..d72d52b90433 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -17,7 +17,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) { struct inode *inode, *toput_inode = NULL; - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || @@ -27,13 +27,15 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); + invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; - spin_lock(&inode_sb_list_lock); + + spin_lock(&sb->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); iput(toput_inode); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5fa588e933d5..ae0f438c2ee6 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -88,7 +88,7 @@ unsigned int dirtytime_expire_interval = 12 * 60 * 60; static inline struct inode *wb_inode(struct list_head *head) { - return list_entry(head, struct inode, i_wb_list); + return list_entry(head, struct inode, i_io_list); } /* @@ -125,22 +125,22 @@ static void wb_io_lists_depopulated(struct bdi_writeback *wb) } /** - * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list + * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list * @inode: inode to be moved * @wb: target bdi_writeback * @head: one of @wb->b_{dirty|io|more_io} * - * Move @inode->i_wb_list to @list of @wb and set %WB_has_dirty_io. + * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io. * Returns %true if @inode is the first occupant of the !dirty_time IO * lists; otherwise, %false. */ -static bool inode_wb_list_move_locked(struct inode *inode, +static bool inode_io_list_move_locked(struct inode *inode, struct bdi_writeback *wb, struct list_head *head) { assert_spin_locked(&wb->list_lock); - list_move(&inode->i_wb_list, head); + list_move(&inode->i_io_list, head); /* dirty_time doesn't count as dirty_io until expiration */ if (head != &wb->b_dirty_time) @@ -151,19 +151,19 @@ static bool inode_wb_list_move_locked(struct inode *inode, } /** - * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list + * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list * @inode: inode to be removed * @wb: bdi_writeback @inode is being removed from * * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and * clear %WB_has_dirty_io if all are empty afterwards. */ -static void inode_wb_list_del_locked(struct inode *inode, +static void inode_io_list_del_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&wb->list_lock); - list_del_init(&inode->i_wb_list); + list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } @@ -351,7 +351,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) /* * Once I_FREEING is visible under i_lock, the eviction path owns - * the inode and we shouldn't modify ->i_wb_list. + * the inode and we shouldn't modify ->i_io_list. */ if (unlikely(inode->i_state & I_FREEING)) goto skip_switch; @@ -390,16 +390,16 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) * is always correct including from ->b_dirty_time. The transfer * preserves @inode->dirtied_when ordering. */ - if (!list_empty(&inode->i_wb_list)) { + if (!list_empty(&inode->i_io_list)) { struct inode *pos; - inode_wb_list_del_locked(inode, old_wb); + inode_io_list_del_locked(inode, old_wb); inode->i_wb = new_wb; - list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list) + list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) if (time_after_eq(inode->dirtied_when, pos->dirtied_when)) break; - inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev); + inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev); } else { inode->i_wb = new_wb; } @@ -961,12 +961,12 @@ void wb_start_background_writeback(struct bdi_writeback *wb) /* * Remove the inode from the writeback list it is on. */ -void inode_wb_list_del(struct inode *inode) +void inode_io_list_del(struct inode *inode) { struct bdi_writeback *wb; wb = inode_to_wb_and_lock_list(inode); - inode_wb_list_del_locked(inode, wb); + inode_io_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); } @@ -988,7 +988,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } - inode_wb_list_move_locked(inode, wb, &wb->b_dirty); + inode_io_list_move_locked(inode, wb, &wb->b_dirty); } /* @@ -996,7 +996,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) */ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { - inode_wb_list_move_locked(inode, wb, &wb->b_more_io); + inode_io_list_move_locked(inode, wb, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) @@ -1055,7 +1055,7 @@ static int move_expired_inodes(struct list_head *delaying_queue, if (older_than_this && inode_dirtied_after(inode, *older_than_this)) break; - list_move(&inode->i_wb_list, &tmp); + list_move(&inode->i_io_list, &tmp); moved++; if (flags & EXPIRE_DIRTY_ATIME) set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); @@ -1078,7 +1078,7 @@ static int move_expired_inodes(struct list_head *delaying_queue, list_for_each_prev_safe(pos, node, &tmp) { inode = wb_inode(pos); if (inode->i_sb == sb) - list_move(&inode->i_wb_list, dispatch_queue); + list_move(&inode->i_io_list, dispatch_queue); } } out: @@ -1232,10 +1232,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, redirty_tail(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; - inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time); + inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); } else { /* The inode is clean. Remove from writeback lists. */ - inode_wb_list_del_locked(inode, wb); + inode_io_list_del_locked(inode, wb); } } @@ -1378,7 +1378,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * touch it. See comment above for explanation. */ if (!(inode->i_state & I_DIRTY_ALL)) - inode_wb_list_del_locked(inode, wb); + inode_io_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: @@ -1439,7 +1439,9 @@ static long writeback_sb_inodes(struct super_block *sb, unsigned long start_time = jiffies; long write_chunk; long wrote = 0; /* count both pages and inodes */ + struct blk_plug plug; + blk_start_plug(&plug); while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); @@ -1537,6 +1539,7 @@ static long writeback_sb_inodes(struct super_block *sb, break; } } + blk_finish_plug(&plug); return wrote; } @@ -2088,7 +2091,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) else dirty_list = &wb->b_dirty_time; - wakeup_bdi = inode_wb_list_move_locked(inode, wb, + wakeup_bdi = inode_io_list_move_locked(inode, wb, dirty_list); spin_unlock(&wb->list_lock); @@ -2111,6 +2114,15 @@ out_unlock_inode: } EXPORT_SYMBOL(__mark_inode_dirty); +/* + * The @s_sync_lock is used to serialise concurrent sync operations + * to avoid lock contention problems with concurrent wait_sb_inodes() calls. + * Concurrent callers will block on the s_sync_lock rather than doing contending + * walks. The queueing maintains sync(2) required behaviour as all the IO that + * has been issued up to the time this function is enter is guaranteed to be + * completed by the time we have gained the lock and waited for all IO that is + * in progress regardless of the order callers are granted the lock. + */ static void wait_sb_inodes(struct super_block *sb) { struct inode *inode, *old_inode = NULL; @@ -2121,7 +2133,8 @@ static void wait_sb_inodes(struct super_block *sb) */ WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&inode_sb_list_lock); + mutex_lock(&sb->s_sync_lock); + spin_lock(&sb->s_inode_list_lock); /* * Data integrity sync. Must wait for all pages under writeback, @@ -2141,14 +2154,14 @@ static void wait_sb_inodes(struct super_block *sb) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the - * inode_sb_list_lock. We cannot iput the inode now as we can + * s_inode_list_lock. We cannot iput the inode now as we can * be holding the last reference and we cannot iput it under - * inode_sb_list_lock. So we keep the reference and iput it + * s_inode_list_lock. So we keep the reference and iput it * later. */ iput(old_inode); @@ -2158,10 +2171,11 @@ static void wait_sb_inodes(struct super_block *sb) cond_resched(); - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); iput(old_inode); + mutex_unlock(&sb->s_sync_lock); } static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, diff --git a/fs/inode.c b/fs/inode.c index d30640f7a193..78a17b8859e1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -28,16 +28,16 @@ * inode->i_state, inode->i_hash, __iget() * Inode LRU list locks protect: * inode->i_sb->s_inode_lru, inode->i_lru - * inode_sb_list_lock protects: - * sb->s_inodes, inode->i_sb_list + * inode->i_sb->s_inode_list_lock protects: + * inode->i_sb->s_inodes, inode->i_sb_list * bdi->wb.list_lock protects: - * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list + * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash * * Lock ordering: * - * inode_sb_list_lock + * inode->i_sb->s_inode_list_lock * inode->i_lock * Inode LRU list locks * @@ -45,7 +45,7 @@ * inode->i_lock * * inode_hash_lock - * inode_sb_list_lock + * inode->i_sb->s_inode_list_lock * inode->i_lock * * iunique_lock @@ -57,8 +57,6 @@ static unsigned int i_hash_shift __read_mostly; static struct hlist_head *inode_hashtable __read_mostly; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); -__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); - /* * Empty aops. Can be used for the cases where the user does not * define any of the address_space operations. @@ -359,7 +357,7 @@ void inode_init_once(struct inode *inode) memset(inode, 0, sizeof(*inode)); INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_devices); - INIT_LIST_HEAD(&inode->i_wb_list); + INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_lru); address_space_init_once(&inode->i_data); i_size_ordered_init(inode); @@ -426,18 +424,18 @@ static void inode_lru_list_del(struct inode *inode) */ void inode_sb_list_add(struct inode *inode) { - spin_lock(&inode_sb_list_lock); + spin_lock(&inode->i_sb->s_inode_list_lock); list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&inode->i_sb->s_inode_list_lock); } EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { if (!list_empty(&inode->i_sb_list)) { - spin_lock(&inode_sb_list_lock); + spin_lock(&inode->i_sb->s_inode_list_lock); list_del_init(&inode->i_sb_list); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&inode->i_sb->s_inode_list_lock); } } @@ -527,8 +525,8 @@ static void evict(struct inode *inode) BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru)); - if (!list_empty(&inode->i_wb_list)) - inode_wb_list_del(inode); + if (!list_empty(&inode->i_io_list)) + inode_io_list_del(inode); inode_sb_list_del(inode); @@ -577,6 +575,7 @@ static void dispose_list(struct list_head *head) list_del_init(&inode->i_lru); evict(inode); + cond_resched(); } } @@ -594,7 +593,8 @@ void evict_inodes(struct super_block *sb) struct inode *inode, *next; LIST_HEAD(dispose); - spin_lock(&inode_sb_list_lock); +again: + spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count)) continue; @@ -609,8 +609,20 @@ void evict_inodes(struct super_block *sb) inode_lru_list_del(inode); spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); + + /* + * We can have a ton of inodes to evict at unmount time given + * enough memory, check to see if we need to go to sleep for a + * bit so we don't livelock. + */ + if (need_resched()) { + spin_unlock(&sb->s_inode_list_lock); + cond_resched(); + dispose_list(&dispose); + goto again; + } } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); } @@ -631,7 +643,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) struct inode *inode, *next; LIST_HEAD(dispose); - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { @@ -654,7 +666,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); @@ -890,7 +902,7 @@ struct inode *new_inode(struct super_block *sb) { struct inode *inode; - spin_lock_prefetch(&inode_sb_list_lock); + spin_lock_prefetch(&sb->s_inode_list_lock); inode = new_inode_pseudo(sb); if (inode) diff --git a/fs/internal.h b/fs/internal.h index 4d5af583ab03..71859c4d0b41 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -112,14 +112,13 @@ extern int vfs_open(const struct path *, struct file *, const struct cred *); /* * inode.c */ -extern spinlock_t inode_sb_list_lock; extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); extern void inode_add_lru(struct inode *inode); /* * fs-writeback.c */ -extern void inode_wb_list_del(struct inode *inode); +extern void inode_io_list_del(struct inode *inode); extern long get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); diff --git a/fs/namei.c b/fs/namei.c index 1c2105ed20c5..29b927938b8c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -560,6 +560,24 @@ static int __nd_alloc_stack(struct nameidata *nd) return 0; } +/** + * path_connected - Verify that a path->dentry is below path->mnt.mnt_root + * @path: nameidate to verify + * + * Rename can sometimes move a file or directory outside of a bind + * mount, path_connected allows those cases to be detected. + */ +static bool path_connected(const struct path *path) +{ + struct vfsmount *mnt = path->mnt; + + /* Only bind mounts can have disconnected paths */ + if (mnt->mnt_root == mnt->mnt_sb->s_root) + return true; + + return is_subdir(path->dentry, mnt->mnt_root); +} + static inline int nd_alloc_stack(struct nameidata *nd) { if (likely(nd->depth != EMBEDDED_LEVELS)) @@ -1296,6 +1314,8 @@ static int follow_dotdot_rcu(struct nameidata *nd) return -ECHILD; nd->path.dentry = parent; nd->seq = seq; + if (unlikely(!path_connected(&nd->path))) + return -ENOENT; break; } else { struct mount *mnt = real_mount(nd->path.mnt); @@ -1396,7 +1416,7 @@ static void follow_mount(struct path *path) } } -static void follow_dotdot(struct nameidata *nd) +static int follow_dotdot(struct nameidata *nd) { if (!nd->root.mnt) set_root(nd); @@ -1412,6 +1432,8 @@ static void follow_dotdot(struct nameidata *nd) /* rare case of legitimate dget_parent()... */ nd->path.dentry = dget_parent(nd->path.dentry); dput(old); + if (unlikely(!path_connected(&nd->path))) + return -ENOENT; break; } if (!follow_up(&nd->path)) @@ -1419,6 +1441,7 @@ static void follow_dotdot(struct nameidata *nd) } follow_mount(&nd->path); nd->inode = nd->path.dentry->d_inode; + return 0; } /* @@ -1634,7 +1657,7 @@ static inline int handle_dots(struct nameidata *nd, int type) if (nd->flags & LOOKUP_RCU) { return follow_dotdot_rcu(nd); } else - follow_dotdot(nd); + return follow_dotdot(nd); } return 0; } diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 474a3ce1b5e1..e785fd954c30 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -143,17 +143,17 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, /** * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. - * @list: list of inodes being unmounted (sb->s_inodes) + * @sb: superblock being unmounted. * * Called during unmount with no locks held, so needs to be safe against - * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block. + * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. */ -void fsnotify_unmount_inodes(struct list_head *list) +void fsnotify_unmount_inodes(struct super_block *sb) { struct inode *inode, *next_i, *need_iput = NULL; - spin_lock(&inode_sb_list_lock); - list_for_each_entry_safe(inode, next_i, list, i_sb_list) { + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) { struct inode *need_iput_tmp; /* @@ -189,7 +189,7 @@ void fsnotify_unmount_inodes(struct list_head *list) spin_unlock(&inode->i_lock); /* In case the dropping of a reference would nuke next_i. */ - while (&next_i->i_sb_list != list) { + while (&next_i->i_sb_list != &sb->s_inodes) { spin_lock(&next_i->i_lock); if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) && atomic_read(&next_i->i_count)) { @@ -204,12 +204,12 @@ void fsnotify_unmount_inodes(struct list_head *list) } /* - * We can safely drop inode_sb_list_lock here because either + * We can safely drop s_inode_list_lock here because either * we actually hold references on both inode and next_i or * end of list. Also no new inodes will be added since the * umount has begun. */ - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); if (need_iput_tmp) iput(need_iput_tmp); @@ -221,7 +221,7 @@ void fsnotify_unmount_inodes(struct list_head *list) iput(inode); - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index fed66e2c9fe8..ef0d64b2a6d9 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -928,7 +928,7 @@ static void add_dquot_ref(struct super_block *sb, int type) int reserved = 0; #endif - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || @@ -939,7 +939,7 @@ static void add_dquot_ref(struct super_block *sb, int type) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) @@ -951,15 +951,15 @@ static void add_dquot_ref(struct super_block *sb, int type) /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the - * inode_sb_list_lock We cannot iput the inode now as we can be + * s_inode_list_lock. We cannot iput the inode now as we can be * holding the last reference and we cannot iput it under - * inode_sb_list_lock. So we keep the reference and iput it + * s_inode_list_lock. So we keep the reference and iput it * later. */ old_inode = inode; - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); iput(old_inode); #ifdef CONFIG_QUOTA_DEBUG @@ -1028,7 +1028,7 @@ static void remove_dquot_ref(struct super_block *sb, int type, |
