diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-09-16 09:14:02 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-09-16 09:14:02 +0200 |
commit | 3352633ce6b221d64bf40644d412d9670e7d56e3 (patch) | |
tree | f74add2d0a46ac33034955c1fa8dcff1cedd7dc0 | |
parent | 2775df6e5e324be9dc375f7db2c8d3042df72bbf (diff) | |
parent | 24a988f75c8a5f16ef935c51039700e985767eb9 (diff) | |
download | linux-3352633ce6b221d64bf40644d412d9670e7d56e3.tar.gz linux-3352633ce6b221d64bf40644d412d9670e7d56e3.tar.bz2 linux-3352633ce6b221d64bf40644d412d9670e7d56e3.zip |
Merge tag 'vfs-6.12.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs file updates from Christian Brauner:
"This is the work to cleanup and shrink struct file significantly.
Right now, (focusing on x86) struct file is 232 bytes. After this
series struct file will be 184 bytes aka 3 cacheline and a spare 8
bytes for future extensions at the end of the struct.
With struct file being as ubiquitous as it is this should make a
difference for file heavy workloads and allow further optimizations in
the future.
- struct fown_struct was embedded into struct file letting it take up
32 bytes in total when really it shouldn't even be embedded in
struct file in the first place. Instead, actual users of struct
fown_struct now allocate the struct on demand. This frees up 24
bytes.
- Move struct file_ra_state into the union containg the cleanup hooks
and move f_iocb_flags out of the union. This closes a 4 byte hole
we created earlier and brings struct file to 192 bytes. Which means
struct file is 3 cachelines and we managed to shrink it by 40
bytes.
- Reorder struct file so that nothing crosses a cacheline.
I suspect that in the future we will end up reordering some members
to mitigate false sharing issues or just because someone does
actually provide really good perf data.
- Shrinking struct file to 192 bytes is only part of the work.
Files use a slab that is SLAB_TYPESAFE_BY_RCU and when a kmem cache
is created with SLAB_TYPESAFE_BY_RCU the free pointer must be
located outside of the object because the cache doesn't know what
part of the memory can safely be overwritten as it may be needed to
prevent object recycling.
That has the consequence that SLAB_TYPESAFE_BY_RCU may end up
adding a new cacheline.
So this also contains work to add a new kmem_cache_create_rcu()
function that allows the caller to specify an offset where the
freelist pointer is supposed to be placed. Thus avoiding the
implicit addition of a fourth cacheline.
- And finally this removes the f_version member in struct file.
The f_version member isn't particularly well-defined. It is mainly
used as a cookie to detect concurrent seeks when iterating
directories. But it is also abused by some subsystems for
completely unrelated things.
It is mostly a directory and filesystem specific thing that doesn't
really need to live in struct file and with its wonky semantics it
really lacks a specific function.
For pipes, f_version is (ab)used to defer poll notifications until
a write has happened. And struct pipe_inode_info is used by
multiple struct files in their ->private_data so there's no chance
of pushing that down into file->private_data without introducing
another pointer indirection.
But pipes don't rely on f_pos_lock so this adds a union into struct
file encompassing f_pos_lock and a pipe specific f_pipe member that
pipes can use. This union of course can be extended to other file
types and is similar to what we do in struct inode already"
* tag 'vfs-6.12.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (26 commits)
fs: remove f_version
pipe: use f_pipe
fs: add f_pipe
ubifs: store cookie in private data
ufs: store cookie in private data
udf: store cookie in private data
proc: store cookie in private data
ocfs2: store cookie in private data
input: remove f_version abuse
ext4: store cookie in private data
ext2: store cookie in private data
affs: store cookie in private data
fs: add generic_llseek_cookie()
fs: use must_set_pos()
fs: add must_set_pos()
fs: add vfs_setpos_cookie()
s390: remove unused f_version
ceph: remove unused f_version
adi: remove unused f_version
mm: Removed @freeptr_offset to prevent doc warning
...
-rw-r--r-- | drivers/char/adi.c | 1 | ||||
-rw-r--r-- | drivers/input/input.c | 47 | ||||
-rw-r--r-- | drivers/net/tun.c | 6 | ||||
-rw-r--r-- | drivers/s390/char/hmcdrv_dev.c | 3 | ||||
-rw-r--r-- | drivers/tty/tty_io.c | 6 | ||||
-rw-r--r-- | fs/affs/dir.c | 44 | ||||
-rw-r--r-- | fs/ceph/dir.c | 1 | ||||
-rw-r--r-- | fs/ext2/dir.c | 28 | ||||
-rw-r--r-- | fs/ext4/dir.c | 50 | ||||
-rw-r--r-- | fs/ext4/ext4.h | 2 | ||||
-rw-r--r-- | fs/ext4/inline.c | 7 | ||||
-rw-r--r-- | fs/fcntl.c | 166 | ||||
-rw-r--r-- | fs/file_table.c | 16 | ||||
-rw-r--r-- | fs/internal.h | 1 | ||||
-rw-r--r-- | fs/locks.c | 6 | ||||
-rw-r--r-- | fs/notify/dnotify/dnotify.c | 6 | ||||
-rw-r--r-- | fs/ocfs2/dir.c | 3 | ||||
-rw-r--r-- | fs/ocfs2/file.c | 11 | ||||
-rw-r--r-- | fs/ocfs2/file.h | 1 | ||||
-rw-r--r-- | fs/pipe.c | 8 | ||||
-rw-r--r-- | fs/proc/base.c | 30 | ||||
-rw-r--r-- | fs/read_write.c | 171 | ||||
-rw-r--r-- | fs/ubifs/dir.c | 64 | ||||
-rw-r--r-- | fs/udf/dir.c | 28 | ||||
-rw-r--r-- | fs/ufs/dir.c | 28 | ||||
-rw-r--r-- | include/linux/fs.h | 106 | ||||
-rw-r--r-- | include/linux/slab.h | 9 | ||||
-rw-r--r-- | mm/slab.h | 2 | ||||
-rw-r--r-- | mm/slab_common.c | 138 | ||||
-rw-r--r-- | mm/slub.c | 20 | ||||
-rw-r--r-- | net/core/sock.c | 2 | ||||
-rw-r--r-- | security/selinux/hooks.c | 2 | ||||
-rw-r--r-- | security/smack/smack_lsm.c | 2 |
33 files changed, 744 insertions, 271 deletions
diff --git a/drivers/char/adi.c b/drivers/char/adi.c index 1c76c8758f0f..f9bec10a6064 100644 --- a/drivers/char/adi.c +++ b/drivers/char/adi.c @@ -190,7 +190,6 @@ static loff_t adi_llseek(struct file *file, loff_t offset, int whence) if (offset != file->f_pos) { file->f_pos = offset; - file->f_version = 0; ret = offset; } diff --git a/drivers/input/input.c b/drivers/input/input.c index 54c57b267b25..19ea1888da9f 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -1079,33 +1079,31 @@ static inline void input_wakeup_procfs_readers(void) wake_up(&input_devices_poll_wait); } +struct input_seq_state { + unsigned short pos; + bool mutex_acquired; + int input_devices_state; +}; + static __poll_t input_proc_devices_poll(struct file *file, poll_table *wait) { + struct seq_file *seq = file->private_data; + struct input_seq_state *state = seq->private; + poll_wait(file, &input_devices_poll_wait, wait); - if (file->f_version != input_devices_state) { - file->f_version = input_devices_state; + if (state->input_devices_state != input_devices_state) { + state->input_devices_state = input_devices_state; return EPOLLIN | EPOLLRDNORM; } return 0; } -union input_seq_state { - struct { - unsigned short pos; - bool mutex_acquired; - }; - void *p; -}; - static void *input_devices_seq_start(struct seq_file *seq, loff_t *pos) { - union input_seq_state *state = (union input_seq_state *)&seq->private; + struct input_seq_state *state = seq->private; int error; - /* We need to fit into seq->private pointer */ - BUILD_BUG_ON(sizeof(union input_seq_state) != sizeof(seq->private)); - error = mutex_lock_interruptible(&input_mutex); if (error) { state->mutex_acquired = false; @@ -1124,7 +1122,7 @@ static void *input_devices_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void input_seq_stop(struct seq_file *seq, void *v) { - union input_seq_state *state = (union input_seq_state *)&seq->private; + struct input_seq_state *state = seq->private; if (state->mutex_acquired) mutex_unlock(&input_mutex); @@ -1210,7 +1208,8 @@ static const struct seq_operations input_devices_seq_ops = { static int input_proc_devices_open(struct inode *inode, struct file *file) { - return seq_open(file, &input_devices_seq_ops); + return seq_open_private(file, &input_devices_seq_ops, + sizeof(struct input_seq_state)); } static const struct proc_ops input_devices_proc_ops = { @@ -1218,17 +1217,14 @@ static const struct proc_ops input_devices_proc_ops = { .proc_poll = input_proc_devices_poll, .proc_read = seq_read, .proc_lseek = seq_lseek, - .proc_release = seq_release, + .proc_release = seq_release_private, }; static void *input_handlers_seq_start(struct seq_file *seq, loff_t *pos) { - union input_seq_state *state = (union input_seq_state *)&seq->private; + struct input_seq_state *state = seq->private; int error; - /* We need to fit into seq->private pointer */ - BUILD_BUG_ON(sizeof(union input_seq_state) != sizeof(seq->private)); - error = mutex_lock_interruptible(&input_mutex); if (error) { state->mutex_acquired = false; @@ -1243,7 +1239,7 @@ static void *input_handlers_seq_start(struct seq_file *seq, loff_t *pos) static void *input_handlers_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - union input_seq_state *state = (union input_seq_state *)&seq->private; + struct input_seq_state *state = seq->private; state->pos = *pos + 1; return seq_list_next(v, &input_handler_list, pos); @@ -1252,7 +1248,7 @@ static void *input_handlers_seq_next(struct seq_file *seq, void *v, loff_t *pos) static int input_handlers_seq_show(struct seq_file *seq, void *v) { struct input_handler *handler = container_of(v, struct input_handler, node); - union input_seq_state *state = (union input_seq_state *)&seq->private; + struct input_seq_state *state = seq->private; seq_printf(seq, "N: Number=%u Name=%s", state->pos, handler->name); if (handler->filter) @@ -1273,14 +1269,15 @@ static const struct seq_operations input_handlers_seq_ops = { static int input_proc_handlers_open(struct inode *inode, struct file *file) { - return seq_open(file, &input_handlers_seq_ops); + return seq_open_private(file, &input_handlers_seq_ops, + sizeof(struct input_seq_state)); } static const struct proc_ops input_handlers_proc_ops = { .proc_open = input_proc_handlers_open, .proc_read = seq_read, .proc_lseek = seq_lseek, - .proc_release = seq_release, + .proc_release = seq_release_private, }; static int __init input_proc_init(void) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index a645c36f2cee..5f77faef0ff1 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -3452,6 +3452,12 @@ static int tun_chr_fasync(int fd, struct file *file, int on) struct tun_file *tfile = file->private_data; int ret; + if (on) { + ret = file_f_owner_allocate(file); + if (ret) + goto out; + } + if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0) goto out; diff --git a/drivers/s390/char/hmcdrv_dev.c b/drivers/s390/char/hmcdrv_dev.c index 8d50c894711f..e069dd685899 100644 --- a/drivers/s390/char/hmcdrv_dev.c +++ b/drivers/s390/char/hmcdrv_dev.c @@ -186,9 +186,6 @@ static loff_t hmcdrv_dev_seek(struct file *fp, loff_t pos, int whence) if (pos < 0) return -EINVAL; - if (fp->f_pos != pos) - ++fp->f_version; - fp->f_pos = pos; return pos; } diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 407b0d87b7c1..7ae0c8934f42 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2225,6 +2225,12 @@ static int __tty_fasync(int fd, struct file *filp, int on) if (tty_paranoia_check(tty, file_inode(filp), "tty_fasync")) goto out; + if (on) { + retval = file_f_owner_allocate(filp); + if (retval) + goto out; + } + retval = fasync_helper(fd, filp, on, &tty->fasync); if (retval <= 0) goto out; diff --git a/fs/affs/dir.c b/fs/affs/dir.c index b2bf7016e1b3..bd40d5f08810 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -17,13 +17,44 @@ #include <linux/iversion.h> #include "affs.h" +struct affs_dir_data { + unsigned long ino; + u64 cookie; +}; + static int affs_readdir(struct file *, struct dir_context *); +static loff_t affs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct affs_dir_data *data = file->private_data; + + return generic_llseek_cookie(file, offset, whence, &data->cookie); +} + +static int affs_dir_open(struct inode *inode, struct file *file) +{ + struct affs_dir_data *data; + + data = kzalloc(sizeof(struct affs_dir_data), GFP_KERNEL); + if (!data) + return -ENOMEM; + file->private_data = data; + return 0; +} + +static int affs_dir_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + const struct file_operations affs_dir_operations = { + .open = affs_dir_open, .read = generic_read_dir, - .llseek = generic_file_llseek, + .llseek = affs_dir_llseek, .iterate_shared = affs_readdir, .fsync = affs_file_fsync, + .release = affs_dir_release, }; /* @@ -45,6 +76,7 @@ static int affs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); + struct affs_dir_data *data = file->private_data; struct super_block *sb = inode->i_sb; struct buffer_head *dir_bh = NULL; struct buffer_head *fh_bh = NULL; @@ -59,7 +91,7 @@ affs_readdir(struct file *file, struct dir_context *ctx) pr_debug("%s(ino=%lu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos); if (ctx->pos < 2) { - file->private_data = (void *)0; + data->ino = 0; if (!dir_emit_dots(file, ctx)) return 0; } @@ -80,8 +112,8 @@ affs_readdir(struct file *file, struct dir_context *ctx) /* If the directory hasn't changed since the last call to readdir(), * we can jump directly to where we left off. */ - ino = (u32)(long)file->private_data; - if (ino && inode_eq_iversion(inode, file->f_version)) { + ino = data->ino; + if (ino && inode_eq_iversion(inode, data->cookie)) { pr_debug("readdir() left off=%d\n", ino); goto inside; } @@ -131,8 +163,8 @@ inside: } while (ino); } done: - file->f_version = inode_query_iversion(inode); - file->private_data = (void *)(long)ino; + data->cookie = inode_query_iversion(inode); + data->ino = ino; affs_brelse(fh_bh); out_brelse_dir: diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 18c72b305858..ddec8c9244ee 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -707,7 +707,6 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) if (offset != file->f_pos) { file->f_pos = offset; - file->f_version = 0; dfi->file_info.flags &= ~CEPH_F_ATEND; } retval = offset; diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 2f3042d0174c..402fecf90a44 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -263,7 +263,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx) unsigned long n = pos >> PAGE_SHIFT; unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); - bool need_revalidate = !inode_eq_iversion(inode, file->f_version); + bool need_revalidate = !inode_eq_iversion(inode, *(u64 *)file->private_data); bool has_filetype; if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) @@ -290,7 +290,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx) offset = ext2_validate_entry(kaddr, offset, chunk_mask); ctx->pos = (n<<PAGE_SHIFT) + offset; } - file->f_version = inode_query_iversion(inode); + *(u64 *)file->private_data = inode_query_iversion(inode); need_revalidate = false; } de = (ext2_dirent *)(kaddr+offset); @@ -703,8 +703,30 @@ not_empty: return 0; } +static int ext2_dir_open(struct inode *inode, struct file *file) +{ + file->private_data = kzalloc(sizeof(u64), GFP_KERNEL); + if (!file->private_data) + return -ENOMEM; + return 0; +} + +static int ext2_dir_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static loff_t ext2_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return generic_llseek_cookie(file, offset, whence, + (u64 *)file->private_data); +} + const struct file_operations ext2_dir_operations = { - .llseek = generic_file_llseek, + .open = ext2_dir_open, + .release = ext2_dir_release, + .llseek = ext2_dir_llseek, .read = generic_read_dir, .iterate_shared = ext2_readdir, .unlocked_ioctl = ext2_ioctl, diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ff4514e4626b..13196afe55ce 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -133,6 +133,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) struct super_block *sb = inode->i_sb; struct buffer_head *bh = NULL; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); + struct dir_private_info *info = file->private_data; err = fscrypt_prepare_readdir(inode); if (err) @@ -229,7 +230,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ - if (!inode_eq_iversion(inode, file->f_version)) { + if (!inode_eq_iversion(inode, info->cookie)) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext4_dir_entry_2 *) (bh->b_data + i); @@ -249,7 +250,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) offset = i; ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; - file->f_version = inode_query_iversion(inode); + info->cookie = inode_query_iversion(inode); } while (ctx->pos < inode->i_size @@ -384,6 +385,7 @@ static inline loff_t ext4_get_htree_eof(struct file *filp) static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; + struct dir_private_info *info = file->private_data; int dx_dir = is_dx_dir(inode); loff_t ret, htree_max = ext4_get_htree_eof(file); @@ -392,7 +394,7 @@ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) htree_max, htree_max); else ret = ext4_llseek(file, offset, whence); - file->f_version = inode_peek_iversion(inode) - 1; + info->cookie = inode_peek_iversion(inode) - 1; return ret; } @@ -429,18 +431,15 @@ static void free_rb_tree_fname(struct rb_root *root) *root = RB_ROOT; } - -static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, - loff_t pos) +static void ext4_htree_init_dir_info(struct file *filp, loff_t pos) { - struct dir_private_info *p; - - p = kzalloc(sizeof(*p), GFP_KERNEL); - if (!p) - return NULL; - p->curr_hash = pos2maj_hash(filp, pos); - p->curr_minor_hash = pos2min_hash(filp, pos); - return p; + struct dir_private_info *p = filp->private_data; + + if (is_dx_dir(file_inode(filp)) && !p->initialized) { + p->curr_hash = pos2maj_hash(filp, pos); + p->curr_minor_hash = pos2min_hash(filp, pos); + p->initialized = true; + } } void ext4_htree_free_dir_info(struct dir_private_info *p) @@ -552,12 +551,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) struct fname *fname; int ret = 0; - if (!info) { - info = ext4_htree_create_dir_info(file, ctx->pos); - if (!info) - return -ENOMEM; - file->private_data = info; - } + ext4_htree_init_dir_info(file, ctx->pos); if (ctx->pos == ext4_get_htree_eof(file)) return 0; /* EOF */ @@ -590,10 +584,10 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) * cached entries. */ if ((!info->curr_node) || - !inode_eq_iversion(inode, file->f_version)) { + !inode_eq_iversion(inode, info->cookie)) { info->curr_node = NULL; free_rb_tree_fname(&info->root); - file->f_version = inode_query_iversion(inode); + info->cookie = inode_query_iversion(inode); ret = ext4_htree_fill_tree(file, info->curr_hash, info->curr_minor_hash, &info->next_hash); @@ -664,7 +658,19 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, return 0; } +static int ext4_dir_open(struct inode *inode, struct file *file) +{ + struct dir_private_info *info; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + file->private_data = info; + return 0; +} + const struct file_operations ext4_dir_operations = { + .open = ext4_dir_open, .llseek = ext4_dir_llseek, .read = generic_read_dir, .iterate_shared = ext4_readdir, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c3429b664b8d..ecc15e5f1eba 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2553,6 +2553,8 @@ struct dir_private_info { __u32 curr_hash; __u32 curr_minor_hash; __u32 next_hash; + u64 cookie; + bool initialized; }; /* calculate the first block number of the group */ diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d5c2f3b75084..edf4aa99a974 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1460,6 +1460,7 @@ int ext4_read_inline_dir(struct file *file, struct ext4_iloc iloc; void *dir_buf = NULL; int dotdot_offset, dotdot_size, extra_offset, extra_size; + struct dir_private_info *info = file->private_data; ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -1503,12 +1504,12 @@ int ext4_read_inline_dir(struct file *file, extra_size = extra_offset + inline_size; /* - * If the version has changed since the last call to + * If the cookie has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the inline * dir to make sure. */ - if (!inode_eq_iversion(inode, file->f_version)) { + if (!inode_eq_iversion(inode, info->cookie)) { for (i = 0; i < extra_size && i < offset;) { /* * "." is with offset 0 and @@ -1540,7 +1541,7 @@ int ext4_read_inline_dir(struct file *file, } offset = i; ctx->pos = offset; - file->f_version = inode_query_iversion(inode); + info->cookie = inode_query_iversion(inode); } while (ctx->pos < extra_size) { diff --git a/fs/fcntl.c b/fs/fcntl.c index 22ec683ad8f8..f6fde75a3bd5 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -33,6 +33,8 @@ #include <asm/siginfo.h> #include <linux/uaccess.h> +#include "internal.h" + #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) static int setfl(int fd, struct file * filp, unsigned int arg) @@ -87,22 +89,64 @@ static int setfl(int fd, struct file * filp, unsigned int arg) return error; } +/* + * Allocate an file->f_owner struct if it doesn't exist, handling racing + * allocations correctly. + */ +int file_f_owner_allocate(struct file *file) +{ + struct fown_struct *f_owner; + + f_owner = file_f_owner(file); + if (f_owner) + return 0; + + f_owner = kzalloc(sizeof(struct fown_struct), GFP_KERNEL); + if (!f_owner) + return -ENOMEM; + + rwlock_init(&f_owner->lock); + f_owner->file = file; + /* If someone else raced us, drop our allocation. */ + if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner))) + kfree(f_owner); + return 0; +} +EXPORT_SYMBOL(file_f_owner_allocate); + +void file_f_owner_release(struct file *file) +{ + struct fown_struct *f_owner; + + f_owner = file_f_owner(file); + if (f_owner) { + put_pid(f_owner->pid); + kfree(f_owner); + } +} + static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, int force) { - write_lock_irq(&filp->f_owner.lock); - if (force || !filp->f_owner.pid) { - put_pid(filp->f_owner.pid); - filp->f_owner.pid = get_pid(pid); - filp->f_owner.pid_type = type; + struct fown_struct *f_owner; + + f_owner = file_f_owner(filp); + if (WARN_ON_ONCE(!f_owner)) + return; + + write_lock_irq(&f_owner->lock); + if (force || !f_owner->pid) { + put_pid(f_owner->pid); + f_owner->pid = get_pid(pid); + f_owner->pid_type = type; if (pid) { const struct cred *cred = current_cred(); - filp->f_owner.uid = cred->uid; - filp->f_owner.euid = cred->euid; + f_owner->uid = cred->uid; + f_owner->euid = cred->euid; } } - write_unlock_irq(&filp->f_owner.lock); + write_unlock_irq(&f_owner->lock); } void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, @@ -119,6 +163,8 @@ int f_setown(struct file *filp, int who, int force) struct pid *pid = NULL; int ret = 0; + might_sleep(); + type = PIDTYPE_TGID; if (who < 0) { /* avoid overflow below */ @@ -129,6 +175,10 @@ int f_setown(struct file *filp, int who, int force) who = -who; } + ret = file_f_owner_allocate(filp); + if (ret) + return ret; + rcu_read_lock(); if (who) { pid = find_vpid(who); @@ -152,16 +202,21 @@ void f_delown(struct file *filp) pid_t f_getown(struct file *filp) { pid_t pid = 0; + struct fown_struct *f_owner; + + f_owner = file_f_owner(filp); + if (!f_owner) + return pid; - read_lock_irq(&filp->f_owner.lock); + read_lock_irq(&f_owner->lock); rcu_read_lock(); - if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) { - pid = pid_vnr(filp->f_owner.pid); - if (filp->f_owner.pid_type == PIDTYPE_PGID) + if (pid_task(f_owner->pid, f_owner->pid_type)) { + pid = pid_vnr(f_owner->pid); + if (f_owner->pid_type == PIDTYPE_PGID) pid = -pid; } rcu_read_unlock(); - read_unlock_irq(&filp->f_owner.lock); + read_unlock_irq(&f_owner->lock); return pid; } @@ -194,6 +249,10 @@ static int f_setown_ex(struct file *filp, unsigned long arg) return -EINVAL; } + ret = file_f_owner_allocate(filp); + if (ret) + return ret; + rcu_read_lock(); pid = find_vpid(owner.pid); if (owner.pid && !pid) @@ -210,13 +269,20 @@ static int f_getown_ex(struct file *filp, unsigned long arg) struct f_owner_ex __user *owner_p = (void __user *)arg; struct f_owner_ex owner = {}; int ret = 0; + struct fown_struct *f_owner; + enum pid_type pid_type = PIDTYPE_PID; - read_lock_irq(&filp->f_owner.lock); - rcu_read_lock(); - if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) - owner.pid = pid_vnr(filp->f_owner.pid); - rcu_read_unlock(); - switch (filp->f_owner.pid_type) { + f_owner = file_f_owner(filp); + if (f_owner) { + read_lock_irq(&f_owner->lock); + rcu_read_lock(); + if (pid_task(f_owner->pid, f_owner->pid_type)) + owner.pid = pid_vnr(f_owner->pid); + rcu_read_unlock(); + pid_type = f_owner->pid_type; + } + + switch (pid_type) { case PIDTYPE_PID: owner.type = F_OWNER_TID; break; @@ -234,7 +300,8 @@ static int f_getown_ex(struct file *filp, unsigned long arg) ret = -EINVAL; break; } - read_unlock_irq(&filp->f_owner.lock); + if (f_owner) + read_unlock_irq(&f_owner->lock); if (!ret) { ret = copy_to_user(owner_p, &owner, sizeof(owner)); @@ -248,14 +315,18 @@ static int f_getown_ex(struct file *filp, unsigned long arg) static int f_getowner_uids(struct file *filp, unsigned long arg) { struct user_namespace *user_ns = current_user_ns(); + struct fown_struct *f_owner; uid_t __user *dst = (void __user *)arg; - uid_t src[2]; + uid_t src[2] = {0, 0}; int err; - read_lock_irq(&filp->f_owner.lock); - src[0] = from_kuid(user_ns, filp->f_owner.uid); - src[1] = from_kuid(user_ns, filp->f_owner.euid); - read_unlock_irq(&filp->f_owner.lock); + f_owner = file_f_owner(filp); + if (f_owner) { + read_lock_irq(&f_owner->lock); + src[0] = from_kuid(user_ns, f_owner->uid); + src[1] = from_kuid(user_ns, f_owner->euid); + read_unlock_irq(&f_owner->lock); + } err = put_user(src[0], &dst[0]); err |= put_user(src[1], &dst[1]); @@ -349,6 +420,30 @@ static long f_created_query(const struct file *filp) return !!(filp->f_mode & FMODE_CREATED); } +static int f_owner_sig(struct file *filp, int signum, bool setsig) +{ + int ret = 0; + struct fown_struct *f_owner; + + might_sleep(); + + if (setsig) { + if (!valid_signal(signum)) + return -EINVAL; + + ret = file_f_owner_allocate(filp); + if (ret) + return ret; + } + + f_owner = file_f_owner(filp); + if (setsig) + f_owner->signum = signum; + else if (f_owner) + ret = f_owner->signum; + return ret; +} + static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { @@ -430,15 +525,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, err = f_getowner_uids(filp, arg); break; case F_GETSIG: - err = filp->f_owner.signum; + err = f_owner_sig(filp, 0, false); break; case F_SETSIG: - /* arg == 0 restores default behaviour. */ |