summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--fs/binfmt_elf.c4
-rw-r--r--fs/binfmt_elf_fdpic.c4
-rw-r--r--fs/btrfs/ioctl.c9
-rw-r--r--fs/btrfs/super.c2
-rw-r--r--fs/exec.c8
-rw-r--r--fs/ext4/file.c3
-rw-r--r--fs/ext4/super.c3
-rw-r--r--fs/fcntl.c4
-rw-r--r--fs/notify/fanotify/fanotify.c31
-rw-r--r--fs/notify/fanotify/fanotify.h15
-rw-r--r--fs/notify/fanotify/fanotify_user.c150
-rw-r--r--fs/notify/fsnotify.c83
-rw-r--r--fs/open.c62
-rw-r--r--fs/xfs/xfs_file.c13
-rw-r--r--fs/xfs/xfs_super.c2
-rw-r--r--include/linux/fanotify.h18
-rw-r--r--include/linux/fs.h72
-rw-r--r--include/linux/fsnotify.h78
-rw-r--r--include/linux/fsnotify_backend.h53
-rw-r--r--include/linux/mm.h1
-rw-r--r--include/uapi/asm-generic/fcntl.h1
-rw-r--r--include/uapi/linux/fanotify.h18
-rw-r--r--kernel/fork.c12
-rw-r--r--mm/filemap.c86
-rw-r--r--mm/memory.c19
-rw-r--r--mm/nommu.c7
-rw-r--r--mm/readahead.c14
-rw-r--r--security/selinux/hooks.c3
28 files changed, 669 insertions, 106 deletions
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 106f0e8af177..8054f44d39cf 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1257,7 +1257,7 @@ out_free_interp:
}
reloc_func_desc = interp_load_addr;
- allow_write_access(interpreter);
+ exe_file_allow_write_access(interpreter);
fput(interpreter);
kfree(interp_elf_ex);
@@ -1354,7 +1354,7 @@ out_free_dentry:
kfree(interp_elf_ex);
kfree(interp_elf_phdata);
out_free_file:
- allow_write_access(interpreter);
+ exe_file_allow_write_access(interpreter);
if (interpreter)
fput(interpreter);
out_free_ph:
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index f1a7c4875c4a..c13ee8180b17 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -394,7 +394,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
goto error;
}
- allow_write_access(interpreter);
+ exe_file_allow_write_access(interpreter);
fput(interpreter);
interpreter = NULL;
}
@@ -467,7 +467,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
error:
if (interpreter) {
- allow_write_access(interpreter);
+ exe_file_allow_write_access(interpreter);
fput(interpreter);
}
kfree(interpreter_name);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ae98269a5e3a..6c18bad53cd3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2544,6 +2544,15 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
goto out;
}
+ /*
+ * Don't allow defrag on pre-content watched files, as it could
+ * populate the page cache with 0's via readahead.
+ */
+ if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
if (argp) {
if (copy_from_user(&range, argp, sizeof(range))) {
ret = -EFAULT;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f809c3200c21..dc4fee519ca6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -961,7 +961,7 @@ static int btrfs_fill_super(struct super_block *sb,
#endif
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
- sb->s_iflags |= SB_I_CGROUPWB;
+ sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM;
err = super_setup_bdi(sb);
if (err) {
diff --git a/fs/exec.c b/fs/exec.c
index d58b061c5e42..18f25c23b09f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -913,7 +913,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
path_noexec(&file->f_path))
return ERR_PTR(-EACCES);
- err = deny_write_access(file);
+ err = exe_file_deny_write_access(file);
if (err)
return ERR_PTR(err);
@@ -928,7 +928,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
* Returns ERR_PTR on failure or allocated struct file on success.
*
* As this is a wrapper for the internal do_open_execat(), callers
- * must call allow_write_access() before fput() on release. Also see
+ * must call exe_file_allow_write_access() before fput() on release. Also see
* do_close_execat().
*/
struct file *open_exec(const char *name)
@@ -1493,7 +1493,7 @@ static void do_close_execat(struct file *file)
{
if (!file)
return;
- allow_write_access(file);
+ exe_file_allow_write_access(file);
fput(file);
}
@@ -1822,7 +1822,7 @@ static int exec_binprm(struct linux_binprm *bprm)
bprm->file = bprm->interpreter;
bprm->interpreter = NULL;
- allow_write_access(exec);
+ exe_file_allow_write_access(exec);
if (unlikely(bprm->have_execfd)) {
if (bprm->executable) {
fput(exec);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3bd96c3d4cd0..a5205149adba 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -756,6 +756,9 @@ retry:
return VM_FAULT_SIGBUS;
}
} else {
+ result = filemap_fsnotify_fault(vmf);
+ if (unlikely(result))
+ return result;
filemap_invalidate_lock_shared(mapping);
}
result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index fdf4817a7dbc..a50e5c31b937 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5301,6 +5301,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
/* i_version is always enabled now */
sb->s_flags |= SB_I_VERSION;
+ /* HSM events are allowed by default. */
+ sb->s_iflags |= SB_I_ALLOW_HSM;
+
err = ext4_check_feature_compatibility(sb, es, silent);
if (err)
goto failed_mount;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 49884fa3c81d..5598e4d57422 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -1158,10 +1158,10 @@ static int __init fcntl_init(void)
* Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
* is defined as O_NONBLOCK on some platforms and not on others.
*/
- BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
+ BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ !=
HWEIGHT32(
(VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
- __FMODE_EXEC | __FMODE_NONOTIFY));
+ __FMODE_EXEC));
fasync_cache = kmem_cache_create("fasync_cache",
sizeof(struct fasync_struct), 0,
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 24c7c5df4998..95646f7c46ca 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -223,7 +223,7 @@ static int fanotify_get_response(struct fsnotify_group *group,
struct fanotify_perm_event *event,
struct fsnotify_iter_info *iter_info)
{
- int ret;
+ int ret, errno;
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
@@ -262,14 +262,23 @@ static int fanotify_get_response(struct fsnotify_group *group,
ret = 0;
break;
case FAN_DENY:
+ /* Check custom errno from pre-content events */
+ errno = fanotify_get_response_errno(event->response);
+ if (errno) {
+ ret = -errno;
+ break;
+ }
+ fallthrough;
default:
ret = -EPERM;
}
/* Check if the response should be audited */
- if (event->response & FAN_AUDIT)
- audit_fanotify(event->response & ~FAN_AUDIT,
- &event->audit_rule);
+ if (event->response & FAN_AUDIT) {
+ u32 response = event->response &
+ (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS);
+ audit_fanotify(response & ~FAN_AUDIT, &event->audit_rule);
+ }
pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
group, event, ret);
@@ -548,9 +557,13 @@ static struct fanotify_event *fanotify_alloc_path_event(const struct path *path,
return &pevent->fae;
}
-static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path,
+static struct fanotify_event *fanotify_alloc_perm_event(const void *data,
+ int data_type,
gfp_t gfp)
{
+ const struct path *path = fsnotify_data_path(data, data_type);
+ const struct file_range *range =
+ fsnotify_data_file_range(data, data_type);
struct fanotify_perm_event *pevent;
pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp);
@@ -564,6 +577,9 @@ static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path,
pevent->hdr.len = 0;
pevent->state = FAN_EVENT_INIT;
pevent->path = *path;
+ /* NULL ppos means no range info */
+ pevent->ppos = range ? &range->pos : NULL;
+ pevent->count = range ? range->count : 0;
path_get(path);
return &pevent->fae;
@@ -801,7 +817,7 @@ static struct fanotify_event *fanotify_alloc_event(
old_memcg = set_active_memcg(group->memcg);
if (fanotify_is_perm_event(mask)) {
- event = fanotify_alloc_perm_event(path, gfp);
+ event = fanotify_alloc_perm_event(data, data_type, gfp);
} else if (fanotify_is_error_event(mask)) {
event = fanotify_alloc_error_event(group, fsid, data,
data_type, &hash);
@@ -909,8 +925,9 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);
BUILD_BUG_ON(FAN_FS_ERROR != FS_ERROR);
BUILD_BUG_ON(FAN_RENAME != FS_RENAME);
+ BUILD_BUG_ON(FAN_PRE_ACCESS != FS_PRE_ACCESS);
- BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 21);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 22);
mask = fanotify_group_event_mask(group, iter_info, &match_mask,
mask, data, data_type, dir);
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index e5ab33cae6a7..c12cbc270539 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -425,6 +425,8 @@ FANOTIFY_PE(struct fanotify_event *event)
struct fanotify_perm_event {
struct fanotify_event fae;
struct path path;
+ const loff_t *ppos; /* optional file range info */
+ size_t count;
u32 response; /* userspace answer to the event */
unsigned short state; /* state of the event */
int fd; /* fd we passed to userspace for this event */
@@ -446,6 +448,14 @@ static inline bool fanotify_is_perm_event(u32 mask)
mask & FANOTIFY_PERM_EVENTS;
}
+static inline bool fanotify_event_has_access_range(struct fanotify_event *event)
+{
+ if (!(event->mask & FANOTIFY_PRE_CONTENT_EVENTS))
+ return false;
+
+ return FANOTIFY_PERM(event)->ppos;
+}
+
static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse)
{
return container_of(fse, struct fanotify_event, fse);
@@ -518,3 +528,8 @@ static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark)
return mflags;
}
+
+static inline u32 fanotify_get_response_errno(int res)
+{
+ return (res >> FAN_ERRNO_SHIFT) & FAN_ERRNO_MASK;
+}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 2d85c71717d6..6ff94e312232 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -100,8 +100,7 @@ static void __init fanotify_sysctls_init(void)
*
* Internal and external open flags are stored together in field f_flags of
* struct file. Only external open flags shall be allowed in event_f_flags.
- * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be
- * excluded.
+ * Internal flags like FMODE_EXEC shall be excluded.
*/
#define FANOTIFY_INIT_ALL_EVENT_F_BITS ( \
O_ACCMODE | O_APPEND | O_NONBLOCK | \
@@ -118,10 +117,12 @@ struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
#define FANOTIFY_EVENT_ALIGN 4
#define FANOTIFY_FID_INFO_HDR_LEN \
(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
-#define FANOTIFY_PIDFD_INFO_HDR_LEN \
+#define FANOTIFY_PIDFD_INFO_LEN \
sizeof(struct fanotify_event_info_pidfd)
#define FANOTIFY_ERROR_INFO_LEN \
(sizeof(struct fanotify_event_info_error))
+#define FANOTIFY_RANGE_INFO_LEN \
+ (sizeof(struct fanotify_event_info_range))
static int fanotify_fid_info_len(int fh_len, int name_len)
{
@@ -159,9 +160,6 @@ static size_t fanotify_event_len(unsigned int info_mode,
int fh_len;
int dot_len = 0;
- if (!info_mode)
- return event_len;
-
if (fanotify_is_error_event(event->mask))
event_len += FANOTIFY_ERROR_INFO_LEN;
@@ -176,14 +174,17 @@ static size_t fanotify_event_len(unsigned int info_mode,
dot_len = 1;
}
- if (info_mode & FAN_REPORT_PIDFD)
- event_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
-
if (fanotify_event_has_object_fh(event)) {
fh_len = fanotify_event_object_fh_len(event);
event_len += fanotify_fid_info_len(fh_len, dot_len);
}
+ if (info_mode & FAN_REPORT_PIDFD)
+ event_len += FANOTIFY_PIDFD_INFO_LEN;
+
+ if (fanotify_event_has_access_range(event))
+ event_len += FANOTIFY_RANGE_INFO_LEN;
+
return event_len;
}
@@ -258,12 +259,11 @@ static int create_fd(struct fsnotify_group *group, const struct path *path,
return client_fd;
/*
- * we need a new file handle for the userspace program so it can read even if it was
- * originally opened O_WRONLY.
+ * We provide an fd for the userspace program, so it could access the
+ * file without generating fanotify events itself.
*/
- new_file = dentry_open(path,
- group->fanotify_data.f_flags | __FMODE_NONOTIFY,
- current_cred());
+ new_file = dentry_open_nonotify(path, group->fanotify_data.f_flags,
+ current_cred());
if (IS_ERR(new_file)) {
put_unused_fd(client_fd);
client_fd = PTR_ERR(new_file);
@@ -327,11 +327,12 @@ static int process_access_response(struct fsnotify_group *group,
struct fanotify_perm_event *event;
int fd = response_struct->fd;
u32 response = response_struct->response;
+ int errno = fanotify_get_response_errno(response);
int ret = info_len;
struct fanotify_response_info_audit_rule friar;
- pr_debug("%s: group=%p fd=%d response=%u buf=%p size=%zu\n", __func__,
- group, fd, response, info, info_len);
+ pr_debug("%s: group=%p fd=%d response=%x errno=%d buf=%p size=%zu\n",
+ __func__, group, fd, response, errno, info, info_len);
/*
* make sure the response is valid, if invalid we do nothing and either
* userspace can send a valid response or we will clean it up after the
@@ -342,7 +343,31 @@ static int process_access_response(struct fsnotify_group *group,
switch (response & FANOTIFY_RESPONSE_ACCESS) {
case FAN_ALLOW:
+ if (errno)
+ return -EINVAL;
+ break;
case FAN_DENY:
+ /* Custom errno is supported only for pre-content groups */
+ if (errno && group->priority != FSNOTIFY_PRIO_PRE_CONTENT)
+ return -EINVAL;
+
+ /*
+ * Limit errno to values expected on open(2)/read(2)/write(2)
+ * of regular files.
+ */
+ switch (errno) {
+ case 0:
+ case EIO:
+ case EPERM:
+ case EBUSY:
+ case ETXTBSY:
+ case EAGAIN:
+ case ENOSPC:
+ case EDQUOT:
+ break;
+ default:
+ return -EINVAL;
+ }
break;
default:
return -EINVAL;
@@ -506,7 +531,7 @@ static int copy_pidfd_info_to_user(int pidfd,
size_t count)
{
struct fanotify_event_info_pidfd info = { };
- size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
+ size_t info_len = FANOTIFY_PIDFD_INFO_LEN;
if (WARN_ON_ONCE(info_len > count))
return -EFAULT;
@@ -521,6 +546,30 @@ static int copy_pidfd_info_to_user(int pidfd,
return info_len;
}
+static size_t copy_range_info_to_user(struct fanotify_event *event,
+ char __user *buf, int count)
+{
+ struct fanotify_perm_event *pevent = FANOTIFY_PERM(event);
+ struct fanotify_event_info_range info = { };
+ size_t info_len = FANOTIFY_RANGE_INFO_LEN;
+
+ if (WARN_ON_ONCE(info_len > count))
+ return -EFAULT;
+
+ if (WARN_ON_ONCE(!pevent->ppos))
+ return -EINVAL;
+
+ info.hdr.info_type = FAN_EVENT_INFO_TYPE_RANGE;
+ info.hdr.len = info_len;
+ info.offset = *(pevent->ppos);
+ info.count = pevent->count;
+
+ if (copy_to_user(buf, &info, info_len))
+ return -EFAULT;
+
+ return info_len;
+}
+
static int copy_info_records_to_user(struct fanotify_event *event,
struct fanotify_info *info,
unsigned int info_mode, int pidfd,
@@ -642,6 +691,15 @@ static int copy_info_records_to_user(struct fanotify_event *event,
total_bytes += ret;
}
+ if (fanotify_event_has_access_range(event)) {
+ ret = copy_range_info_to_user(event, buf, count);
+ if (ret < 0)
+ return ret;
+ buf += ret;
+ count -= ret;
+ total_bytes += ret;
+ }
+
return total_bytes;
}
@@ -756,12 +814,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
buf += FAN_EVENT_METADATA_LEN;
count -= FAN_EVENT_METADATA_LEN;
- if (info_mode) {
- ret = copy_info_records_to_user(event, info, info_mode, pidfd,
- buf, count);
- if (ret < 0)
- goto out_close_fd;
- }
+ ret = copy_info_records_to_user(event, info, info_mode, pidfd,
+ buf, count);
+ if (ret < 0)
+ goto out_close_fd;
if (f)
fd_install(fd, f);
@@ -1294,7 +1350,7 @@ static int fanotify_group_init_error_pool(struct fsnotify_group *group)
}
static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
- unsigned int fan_flags)
+ __u32 mask, unsigned int fan_flags)
{
/*
* Non evictable mark cannot be downgraded to evictable mark.
@@ -1321,6 +1377,11 @@ static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
return -EEXIST;
+ /* For now pre-content events are not generated for directories */
+ mask |= fsn_mark->mask;
+ if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR)
+ return -EEXIST;
+
return 0;
}
@@ -1347,7 +1408,7 @@ static int fanotify_add_mark(struct fsnotify_group *group,
/*
* Check if requested mark flags conflict with an existing mark flags.
*/
- ret = fanotify_may_update_existing_mark(fsn_mark, fan_flags);
+ ret = fanotify_may_update_existing_mark(fsn_mark, mask, fan_flags);
if (ret)
goto out;
@@ -1409,6 +1470,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
unsigned int class = flags & FANOTIFY_CLASS_BITS;
unsigned int internal_flags = 0;
+ struct file *file;
pr_debug("%s: flags=%x event_f_flags=%x\n",
__func__, flags, event_f_flags);
@@ -1477,7 +1539,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
(!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID)))
return -EINVAL;
- f_flags = O_RDWR | __FMODE_NONOTIFY;
+ f_flags = O_RDWR;
if (flags & FAN_CLOEXEC)
f_flags |= O_CLOEXEC;
if (flags & FAN_NONBLOCK)
@@ -1555,10 +1617,18 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
goto out_destroy_group;
}
- fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
+ fd = get_unused_fd_flags(f_flags);
if (fd < 0)
goto out_destroy_group;
+ file = anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, group,
+ f_flags, FMODE_NONOTIFY);
+ if (IS_ERR(file)) {
+ put_unused_fd(fd);
+ fd = PTR_ERR(file);
+ goto out_destroy_group;
+ }
+ fd_install(fd, file);
return fd;
out_destroy_group:
@@ -1638,12 +1708,24 @@ static int fanotify_events_supported(struct fsnotify_group *group,
unsigned int flags)
{
unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
+ bool is_dir = d_is_dir(path->dentry);
/* Strict validation of events in non-dir inode mask with v5.17+ APIs */
bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) ||
(mask & FAN_RENAME) ||
(flags & FAN_MARK_IGNORE);
/*
+ * Filesystems need to opt-into pre-content evnets (a.k.a HSM)
+ * and they are only supported on regular files and directories.
+ */
+ if (mask & FANOTIFY_PRE_CONTENT_EVENTS) {
+ if (!(path->mnt->mnt_sb->s_iflags & SB_I_ALLOW_HSM))
+ return -EOPNOTSUPP;
+ if (!is_dir && !d_is_reg(path->dentry))
+ return -EINVAL;
+ }
+
+ /*
* Some filesystems such as 'proc' acquire unusual locks when opening
* files. For them fanotify permission events have high chances of
* deadlocking the system - open done when reporting fanotify event
@@ -1675,7 +1757,7 @@ static int fanotify_events_supported(struct fsnotify_group *group,
* but because we always allowed it, error only when using new APIs.
*/
if (strict_dir_events && mark_type == FAN_MARK_INODE &&
- !d_is_dir(path->dentry) && (mask & FANOTIFY_DIRONLY_EVENT_BITS))
+ !is_dir && (mask & FANOTIFY_DIRONLY_EVENT_BITS))
return -ENOTDIR;
return 0;
@@ -1776,10 +1858,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
return -EPERM;
/*
- * Permission events require minimum priority FAN_CLASS_CONTENT.
+ * Permission events are not allowed for FAN_CLASS_NOTIF.
+ * Pre-content permission events are not allowed for FAN_CLASS_CONTENT.
*/
if (mask & FANOTIFY_PERM_EVENTS &&
- group->priority < FSNOTIFY_PRIO_CONTENT)
+ group->priority == FSNOTIFY_PRIO_NORMAL)
+ return -EINVAL;
+ else if (mask & FANOTIFY_PRE_CONTENT_EVENTS &&
+ group->priority == FSNOTIFY_PRIO_CONTENT)
return -EINVAL;
if (mask & FAN_FS_ERROR &&
@@ -1814,6 +1900,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
return -EINVAL;
+ /* Pre-content events are not currently generated for directories. */
+ if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR)
+ return -EINVAL;
+
if (mark_cmd == FAN_MARK_FLUSH) {
if (mark_type == FAN_MARK_MOUNT)
fsnotify_clear_vfsmount_marks_by_group(group);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index f976949d2634..8ee495a58d0a 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -193,7 +193,7 @@ static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask,
return mask & marks_mask;
}
-/* Are there any inode/mount/sb objects that are interested in this event? */
+/* Are there any inode/mount/sb objects that watch for these events? */
static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask,
__u32 mask)
{
@@ -203,6 +203,24 @@ static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask,
return mask & marks_mask & ALL_FSNOTIFY_EVENTS;
}
+/* Report pre-content event with optional range info */
+int fsnotify_pre_content(const struct path *path, const loff_t *ppos,
+ size_t count)
+{
+ struct file_range range;
+
+ /* Report page aligned range only when pos is known */
+ if (!ppos)
+ return fsnotify_path(path, FS_PRE_ACCESS);
+
+ range.path = path;
+ range.pos = PAGE_ALIGN_DOWN(*ppos);
+ range.count = PAGE_ALIGN(*ppos + count) - range.pos;
+
+ return fsnotify_parent(path->dentry, FS_PRE_ACCESS, &range,
+ FSNOTIFY_EVENT_FILE_RANGE);
+}
+
/*
* Notify this dentry's parent about a child's events with child name info
* if parent is watching or if inode/sb/mount are interested in events with
@@ -623,11 +641,72 @@ out:
}
EXPORT_SYMBOL_GPL(fsnotify);
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+/*
+ * At open time we check fsnotify_sb_has_priority_watchers() and set the
+ * FMODE_NONOTIFY_ mode bits accordignly.
+ * Later, fsnotify permission hooks do not check if there are permission event
+ * watches, but that there were permission event watches at open time.
+ */
+void file_set_fsnotify_mode(struct file *file)
+{
+ struct dentry *dentry = file->f_path.dentry, *parent;
+ struct super_block *sb = dentry->d_sb;
+ __u32 mnt_mask, p_mask;
+
+ /* Is it a file opened by fanotify? */
+ if (FMODE_FSNOTIFY_NONE(file->f_mode))
+ return;
+
+ /*
+ * Permission events is a super set of pre-content events, so if there
+ * are no permission event watchers, there are also no pre-content event
+ * watchers and this is implied from the single FMODE_NONOTIFY_PERM bit.
+ */
+ if (likely(!fsnotify_sb_has_priority_watchers(sb,
+ FSNOTIFY_PRIO_CONTENT))) {
+ file->f_mode |= FMODE_NONOTIFY_PERM;
+ return;
+ }
+
+ /*
+ * If there are permission event watchers but no pre-content event
+ * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that.
+ */
+ if ((!d_is_dir(dentry) && !d_is_reg(dentry)) ||
+ likely(!fsnotify_sb_has_priority_watchers(sb,
+ FSNOTIFY_PRIO_PRE_CONTENT))) {
+ file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM;
+ return;
+ }
+
+ /*
+ * OK, there are some pre-content watchers. Check if anybody is
+ * watching for pre-content events on *this* file.
+ */
+ mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask);
+ if (unlikely(fsnotify_object_watched(d_inode(dentry), mnt_mask,
+ FSNOTIFY_PRE_CONTENT_EVENTS)))
+ return;
+
+ /* Is parent watching for pre-content events on this file? */
+ if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) {
+ parent = dget_parent(dentry);
+ p_mask = fsnotify_inode_watches_children(d_inode(parent));
+ dput(parent);
+ if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS)
+ return;
+ }
+ /* Nobody watching for pre-content events from this file */
+ file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM;
+}
+#endif
+
static __init int fsnotify_init(void)
{
int ret;
- BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 24);
ret = init_srcu_struct(&fsnotify_mark_srcu);
if (ret)
diff --git a/fs/open.c b/fs/open.c
index ffcfef67ac86..0a5d2f6061c6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -81,14 +81,18 @@ long vfs_truncate(const struct path *path, loff_t length)
if (!S_ISREG(inode->i_mode))
return -EINVAL;
- error = mnt_want_write(path->mnt);
- if (error)
- goto out;
-
idmap = mnt_idmap(path->mnt);
error = inode_permission(idmap, inode, MAY_WRITE);
if (error)
- goto mnt_drop_write_and_out;
+ return error;
+
+ error = fsnotify_truncate_perm(path, length);
+ if (error)
+ return error;
+
+ error = mnt_want_write(path->mnt);
+ if (error)
+ return error;
error = -EPERM;
if (IS_APPEND(inode))
@@ -114,7 +118,7 @@ put_write_and_out:
put_write_access(inode);
mnt_drop_write_and_out:
mnt_drop_write(path->mnt);
-out:
+
return error;
}
EXPORT_SYMBOL_GPL(vfs_truncate);
@@ -175,11 +179,18 @@ long do_ftruncate(struct file *file, loff_t length, int small)
/* Check IS_APPEND on real upper inode */
if (IS_APPEND(file_inode(file)))
return -EPERM;
- sb_start_write(inode->i_sb);
+
error = security_file_truncate(file);
- if (!error)
- error = do_truncate(file_mnt_idmap(file), dentry, length,
- ATTR_MTIME | ATTR_CTIME, file);
+ if (error)
+ return error;
+
+ error = fsnotify_truncate_perm(&file->f_path, length);
+ if (error)
+ return error;
+
+ sb_start_write(inode->i_sb);
+ error = do_truncate(file_mnt_idmap(file), dentry, length,
+ ATTR_MTIME | ATTR_CTIME, file);
sb_end_write(inode->i_sb);
return error;
@@ -894,7 +905,7 @@ static int do_dentry_open(struct file *f,
f->f_sb_err = file_sample_sb_err(f);
if (unlikely(f->f_flags & O_PATH)) {
- f->f_mode = FMODE_PATH | FMODE_OPENED;
+ f->f_mode = FMODE_PATH | FMODE_OPENED | FMODE_NONOTIFY;
f->f_op = &empty_fops;
return 0;
}
@@ -922,6 +933,12 @@ static int do_dentry_open(struct file *f,
if (error)
goto cleanup_all;
+ /*
+ * Set FMODE_NONOTIFY_* bits according to existing permission watches.
+ * If FMODE_NONOTIFY was already set for an fanotify fd, this doesn't
+ * change anything.
+ */
+ file_set_fsnotify_mode(f);
error = fsnotify_open_perm(f);
if (error)
goto cleanup_all;
@@ -1098,6 +1115,23 @@ struct file *dentry_open(const struct path *path, int flags,
}
EXPORT_SYMBOL(dentry_open);
+struct file *dentry_open_nonotify(const struct path *path, int flags,
+ const struct cred *cred)
+{
+ struct file *f = alloc_empty_file(flags, cred);
+ if (!IS_ERR(f)) {
+ int error;
+
+ f->f_mode |= FMODE_NONOTIFY;
+ error = vfs_open(path, f);
+ if (error) {
+ fput(f);
+ f = ERR_PTR(error);
+ }
+ }
+ return f;
+}
+
/**
* dentry_create - Create and open a file
* @path: path to create
@@ -1195,7 +1229,7 @@ inline struct open_how build_open_how(int flags, umode_t mode)
inline int build_open_flags(const struct open_how *how, struct open_flags *op)
{
u64 flags = how->flags;
- u64 strip = __FMODE_NONOTIFY | O_CLOEXEC;
+ u64 strip = O_CLOEXEC;
int lookup_flags = 0;
int acc_mode = ACC_MODE(flags);
@@ -1203,9 +1237,7 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
"struct open_flags doesn't yet handle flags > 32 bits");
/*
- * Strip flags that either shouldn't be set by userspace like
- * FMODE_NONOTIFY or that aren't relevant in determining struct
- * open_flags like O_CLOEXEC.
+ * Strip flags that aren't relevant in determining struct open_flags.
*/
flags &= ~strip;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 9a435b1ff264..f7a7d89c345e 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1451,6 +1451,9 @@ xfs_dax_read_fault(
trace_xfs_read_fault(ip, order);
+ ret = filemap_fsnotify_fault(vmf);
+ if (unlikely(ret))
+ return ret;
xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
ret = xfs_dax_fault_locked(vmf, order, false);
xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
@@ -1479,6 +1482,16 @@ xfs_write_fault(
vm_fault_t ret;
trace_xfs_write_fault(ip, order);
+ /*
+ * Usually we get here from ->page_mkwrite callback but in case of DAX
+ * we will get here also for ordinary write fault. Handle HSM
+ * notifications for that case.
+ */
+ if (IS_DAX(inode)) {
+ ret = filemap_fsnotify_fault(vmf);
+ if (unlikely(ret))
+ return ret;
+ }
sb_start_pagefault(inode->i_sb);
file_update_time(vmf->vma->vm_file);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 809ac6d1813c..d92d7a07ea89 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1756,7 +1756,7 @@ xfs_fs_fill_super(
sb->s_time_max = XFS_LEGACY_TIME_MAX;
}
trace_xfs_inode_timestamp_range(mp, sb->s_time_min, sb->s_time_max);
- sb->s_iflags |= SB_I_CGROUPWB;
+ sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM;
set_posix_acl_flag(sb);
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 89ff45bd6f01..78f660ebc318 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -89,6 +89,16 @@
#define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE | \
FAN_RENAME)
+/* Content events can be used to inspect file content */
+#define FANOTIFY_CONTENT_PERM_EVENTS (FAN_OPEN_PERM | FAN_OPEN_EXEC_PERM | \
+ FAN_ACCESS_PERM)
+/* Pre-content events can be used to fill file content */
+#define FANOTIFY_PRE_CONTENT_EVENTS (FAN_PRE_ACCESS)
+
+/* Events that require a permission response from user */
+#define FANOTIFY_PERM_EVENTS (FANOTIFY_CONTENT_PERM_EVENTS | \
+ FANOTIFY_PRE_CONTENT_EVENTS)
+
/* Events that can be reported with event->fd */
#define FANOTIFY_FD_EVENTS (FANOTIFY_PATH_EVENTS | FANOTIFY_PERM_EVENTS)
@@ -104,10 +114,6 @@
FANOTIFY_INODE_EVENTS | \
FANOTIFY_ERROR_EVENTS)
-/* Events that require a permission response from user */
-#define FANOTIFY_PERM_EVENTS (FAN_OPEN_PERM | FAN_ACCESS_PERM | \
- FAN_OPEN_EXEC_PERM)
-
/* Extra flags that may be reported with event or control handling of events */
#define FANOTIFY_EVENT_FLAGS (FAN_EVENT_ON_CHILD | FAN_ONDIR)
@@ -126,7 +132,9 @@
/* These masks check for invalid bits in permission responses. */
#define FANOTIFY_RESPONSE_ACCESS (FAN_ALLOW | FAN_DENY)
#define FANOTIFY_RESPONSE_FLAGS (FAN_AUDIT | FAN_INFO)
-#define FANOTIFY_RESPONSE_VALID_MASK (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS)
+#define FANOTIFY_RESPONSE_VALID_MASK \
+ (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS | \
+ (FAN_ERRNO_MASK << FAN_ERRNO_SHIFT))
/* Do not use these old uapi constants internally */
#undef FAN_ALL_CLASS_BITS
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a4af70367f8a..534e652bd05f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -173,13 +173,20 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
#define FMODE_NOREUSE ((__force fmode_t)(1 << 23))
-/* FMODE_* bit 24 */
-
/* File is embedded in backing_file object */
-#define FMODE_BACKING ((__force fmode_t)(1 << 25))
+#define FMODE_BACKING ((__force fmode_t)(1 << 24))
+
+/*
+ * Together with FMODE_NONOTIFY_PERM defines which fsnotify events shouldn't be
+ * generated (see below)
+ */
+#define FMODE_NONOTIFY ((__force fmode_t)(1 << 25))
-/* File was opened by fanotify and shouldn't generate fanotify events */
-#define FMODE_NONOTIFY ((__force fmode_t)(1 << 26))
+/*
+ * Together with FMODE_NONOTIFY defines which fsnotify events shouldn't be
+ * generated (see below)
+ */
+#define FMODE_NONOTIFY_PERM ((__force fmode_t)(1 << 26))
/* File is capable of returning -EAGAIN if I/O will block */
#define FMODE_NOWAIT ((__force fmode_t)(1 << 27))
@@ -191,6 +198,32 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
#define FMODE_NOACCOUNT ((__force fmode_t)(1 << 29))
/*
+ * The two FMODE_NONOTIFY* define which fsnotify events should not be generated
+ * for a file. These are the possible values of (f->f_mode &
+ * FMODE_FSNOTIFY_MASK) and their meaning:
+ *
+ * FMODE_NONOTIFY - suppress all (incl. non-permission) events.
+ * FMODE_NONOTIFY_PERM - suppress permission (incl. pre-content) events.
+ * FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only pre-content events.
+ */
+#define FMODE_FSNOTIFY_MASK \
+ (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM)
+
+#define FMODE_FSNOTIFY_NONE(mode) \
+ ((mode & FMODE_FSNOTIFY_MASK) == FMODE_NONOTIFY)
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+#define FMODE_FSNOTIFY_PERM(mode) \
+ ((mode & FMODE_FSNOTIFY_MASK) == 0 || \
+ (mode & FMODE_FSNOTIFY_MASK) == (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM))
+#define FMODE_FSNOTIFY_HSM(mode) \
+ ((mode & FMODE_FSNOTIFY_MASK) == 0)
+#else
+#define FMODE_FSNOTIFY_PERM(mode) 0
+#define FMODE_FSNOTIFY_HSM(mode) 0
+#endif
+
+
+/*
* Attribute flags. These should be or-ed together to figure out what
* has been changed!
*/
@@ -1246,6 +1279,7 @@ extern int send_sigurg(struct file *file);
#define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */
#define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */
#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */
+#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */
/* Possible states of 'frozen' field */
enum {
@@ -2767,6 +2801,8 @@ static inline struct file *file_open_root_mnt(struct vfsmount *mnt,
}
struct file *dentry_open(const struct path *path, int flags,
const struct cred *creds);
+struct file *dentry_open_nonotify(const struct path *path, int flags,
+ const struct cred *cred);
struct file *dentry_create(const struct path *path, int flags, umode_t mode,
const struct cred *cred);
struct path *backing_file_user_path(struct file *f);
@@ -3075,6 +3111,28 @@ static inline void allow_write_access(struct file *file)
if (file)
atomic_inc(&file_inode(file)->i_writecount);
}
+
+/*
+ * Do not prevent write to executable file when watched by pre-content events.
+ *
+ * Note that FMODE_FSNOTIFY_HSM mode is set depending on pre-content watches at
+ * the time of file open and remains constant for entire lifetime of the file,
+ * so if pre-content watches are added post execution or removed before the end
+ * of the execution, it will not cause i_writecount reference leak.
+ */
+static inline int exe_file_deny_write_access(struct file *exe_file)
+{
+ if (unlikely(FMODE_FSNOTIFY_HSM(exe_file->f_mode)))
+ return 0;
+ return deny_write_access(exe_file);
+}
+static inline void exe_file_allow_write_access(struct file *exe_file)
+{
+ if (unlikely(!exe_file || FMODE_FSNOTIFY_HSM(exe_file->f_mode)))
+ return;
+ allow_write_access(exe_file);
+}
+
static inline bool inode_is_open_for_write(const struct inode *inode)
{
return atomic_read(&inode->i_writecount) > 0;
@@ -3730,11 +3788,9 @@ struct ctl_table;
int __init list_bdev_fs_names(char *buf, size_t size);
#define __FMODE_EXEC ((__force int) FMODE_EXEC)
-#define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY)
#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
-#define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \
- (flag & __FMODE_NONOTIFY)))
+#define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE))
static inline bool is_sxid(umode_t mode)
{
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 278620e063ab..1a9ef8f6784d 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -108,38 +108,35 @@ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask)
fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY);
}
-static inline int fsnotify_file(struct file *file, __u32 mask)
+static inline int fsnotify_path(const struct path *path, __u32 mask)
{
- const struct path *path;
+ return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
+}
+static inline int fsnotify_file(struct file *file, __u32 mask)
+{
/*
* FMODE_NONOTIFY are fds generated by fanotify itself which should not
* generate new events. We also don't want to generate events for
* FMODE_PATH fds (involves open & close events) as they are just
* handle creation / destruction events and not "real" file events.
*/
- if (file->f_mode & (FMODE_NONOTIFY | FMODE_PATH))
+ if (FMODE_FSNOTIFY_NONE(file->f_mode))
return 0;
- path = &file->f_path;
- /* Permission events require group prio >= FSNOTIFY_PRIO_CONTENT */
- if (mask & ALL_FSNOTIFY_PERM_EVENTS &&
- !fsnotify_sb_has_priority_watchers(path->dentry->d_sb,
- FSNOTIFY_PRIO_CONTENT))
- return 0;
-
- return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
+ return fsnotify_path(&file->f_path, mask);
}
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+
+void file_set_fsnotify_mode(struct file *file);
+
/*
* fsnotify_file_area_perm - permission hook before access to file range
*/
static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
const loff_t *ppos, size_t count)
{
- __u32 fsnotify_mask = FS_ACCESS_PERM;
-
/*
* filesystem may be modified in the context of permission events
* (e.g. by HSM filling a file on access), so sb freeze protection
@@ -147,14 +144,49 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
*/
lockdep_assert_once(file_write_not_started(file));
+ if (!(perm_mask & (MAY_READ | MAY_WRITE | MAY_ACCESS)))
+ return 0;
+
+ if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode)))
+ return 0;
+
+ /*
+ * read()/write() and other types of access generate pre-content events.
+ */
+ if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
+ int ret = fsnotify_pre_content(&file->f_path, ppos, count);
+
+ if (ret)
+ return ret;
+ }
+
if (!(perm_mask & MAY_READ))
return 0;
- return fsnotify_file(file, fsnotify_mask);
+ /*
+ * read() also generates the legacy FS_ACCESS_PERM event, so content
+ * scanners can inspect the content filled by pre-content event.
+ */
+ return fsnotify_path(&file->f_path, FS_ACCESS_PERM);
+}
+
+/*
+ * fsnotify_truncate_perm - permission hook before file truncate
+ */
+static inline int fsnotify_truncate_perm(const struct path *path, loff_t length)
+{
+ struct inode *inode = d_inode(path->dentry);
+
+ if (!(inode->i_sb->s_iflags & SB_I_ALLOW_HSM) ||
+ !fsnotify_sb_has_priority_watchers(inode->i_sb,
+ FSNOTIFY_PRIO_PRE_CONTENT))
+ return 0;
+
+ return fsnotify_pre_content(path, &length, 0);
}
/*
- * fsnotify_file_perm - permission hook before file access
+ * fsnotify_file_perm - permission hook before file access (unknown range)
*/
static inline int fsnotify_file_perm(struct file *file, int perm_mask)
{
@@ -168,22 +200,34 @@ static inline int fsnotify_open_perm(struct file *file)
{
int ret;
+ if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode)))
+ return 0;
+
if (file->f_flags & __FMODE_EXEC) {
- ret = fsnotify_file(file, FS_OPEN_EXEC_PERM);
+ ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM);
if (ret)
return ret;
}
- return fsnotify_file(file, FS_OPEN_PERM);
+ return fsnotify_path(&file->f_path, FS_OPEN_PERM);
}
#else
+static inline void file_set_fsnotify_mode(struct file *file)
+{
+}
+
static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
const loff_t *ppos, size_t count)
{
return 0;
}
+static inline int fsnotify_truncate_perm(const struct path *path, loff_t length)
+{
+ return 0;
+}
+
static inline int fsnotify_file_perm(struct file *file, int perm_mask)
{
return 0;
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 3ecf7768e577..0d24a21a8e60 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -55,6 +55,9 @@
#define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */
#define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */
#define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */
+/* #define FS_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */
+
+#define FS_PRE_ACCESS 0x00100000 /* Pre-content access hook */
/*
* Set on inode mark that cares about things that happen to its children.
@@ -77,8 +80,14 @@
*/
#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME)
-#define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \
- FS_OPEN_EXEC_PERM)
+/* Content events can be used to inspect file content */
+#define FSNOTIFY_CONTENT_PERM_EVENTS (FS_OPEN_PERM | FS_OPEN_EXEC_PERM | \
+ FS_ACCESS_PERM)
+/* Pre-content events can be used to fill file content */
+#define FSNOTIFY_PRE_CONTENT_EVENTS (FS_PRE_ACCESS)
+
+#define ALL_FSNOTIFY_PERM_EVENTS (FSNOTIFY_CONTENT_PERM_EVENTS | \
+ FSNOTIFY_PRE_CONTENT_EVENTS)
/*
* This is a list of all events that may get sent to a parent that is watching
@@ -285,6 +294,7 @@ static inline void fsnotify_group_assert_locked(struct fsnotify_group *group)
/* When calling fsnotify tell it if the data is a path or inode */
enum fsnotify_data_type {
FSNOTIFY_EVENT_NONE,
+ FSNOTIFY_EVENT_FILE_RANGE,
FSNOTIFY_EVENT_PATH,
FSNOTIFY_EVENT_INODE,
FSNOTIFY_EVENT_DENTRY,
@@ -297,6 +307,17 @@ struct fs_error_report {
struct super_block *sb;
};
+struct file_range {
+ const struct path *path;
+ loff_t pos;
+ size_t count;
+};
+
+static inline const struct path *file_range_path(const struct file_range *range)
+{
+ return range->path;
+}
+
static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
{
switch (data_type) {
@@ -306,6 +327,8 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
return d_inode(data);
case FSNOTIFY_EVENT_PATH:
return d_inode(((const struct path *)data)->dentry);
+ case FSNOTIFY_EVENT_FILE_RANGE:
+ return d_inode(file_range_path(data)->dentry);
case FSNOTIFY_EVENT_ERROR:
return ((struct fs_error_report *)data)->inode;
default:
@@ -321,6 +344,8 @@ static inline struct dentry *fsnotify_data_dentry(const void *data, int data_typ
return (struct dentry *)data;
case FSNOTIFY_EVENT_PATH:
return ((const struct path *)data)->dentry;
+ case FSNOTIFY_EVENT_FILE_RANGE:
+ return file_range_path(data)->dentry;
default:
return NULL;
}
@@ -332,6 +357,8 @@ static inline const struct path *fsnotify_data_path(const void *data,
switch (data_type) {
case FSNOTIFY_EVENT_PATH:
return data;
+ case FSNOTIFY_EVENT_FILE_RANGE:
+ return file_range_path(data);
default:
return NULL;
}
@@ -347,6 +374,8 @@ static inline struct super_block *fsnotify_data_sb(const void *data,
return ((struct dentry *)data)->d_sb;
case FSNOTIFY_EVENT_PATH:
return ((const struct path *)data)->dentry->d_sb;
+ case FSNOTIFY_EVENT_FILE_RANGE:
+ return file_range_path(data)->dentry->d_sb;
case FSNOTIFY_EVENT_ERROR:
return ((struct fs_error_report *) data)->sb;
default:
@@ -366,6 +395,18 @@ static inline struct fs_error_report *fsnotify_data_error_report(
}
}
+static inline const struct file_range *fsnotify_data_file_range(
+ const void *data,
+ int data_type)
+{
+ switch (data_type) {
+ case FSNOTIFY_EVENT_FILE_RANGE:
+ return (struct file_range *)data;
+ default:
+ return NULL;
+ }
+}
+
/*
* Index to merged marks iterator array that correlates to a type of watch.
* The type of watched object can be deduced from the iterator type, but not
@@ -854,9 +895,17 @@ static inline void fsnotify_init_event(struct fsnotify_event *event)
{
INIT_LIST_HEAD(&event->list);
}
+int fsnotify_pre_content(const struct path *path, const loff_t *ppos,
+ size_t count);
#else
+static inline int fsnotify_pre_content(const struct path *path,
+ const loff_t *ppos, size_t count)
+{
+ return 0;
+}
+
static inline int fsnotify(__u32 mask, const void *data, int data_type,
struct inode *dir, const struct qstr *name,
struct inode *inode, u32 cookie)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f02925447e59..21428d897d76 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3431,6 +3431,7 @@ extern vm_fault_t filemap_fault(struct vm_fault *vmf);
extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff);
extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
+extern vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf);
extern unsigned long stack_guard_gap;
/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index 80f37a0d40d7..613475285643 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -6,7 +6,6 @@
/*
* FMODE_EXEC is 0x20
- * FMODE_NONOTIFY is 0x4000000
* These cannot be used by userspace O_* until internal and external open
* flags are split.
* -Eric Paris
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index 34f221d3a1b9..bd8167979707 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -25,6 +25,9 @@
#define FAN_OPEN_PERM 0x00010000 /* File open in perm check */
#define FAN_ACCESS_PERM 0x00020000 /* File accessed in perm check */
#define FAN_OPEN_EXEC_PERM 0x00040000 /* File open/exec in perm check */
+/* #define FAN_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */
+
+#define FAN_PRE_ACCESS 0x00100000 /* Pre-content access hook */
#define FAN_EVENT_ON_CHILD 0x08000000 /* Interested in child events */
@@ -143,6 +146,7 @@ struct fanotify_event_metadata {
#define FAN_EVENT_INFO_TYPE_DFID 3
#define FAN_EVENT_INFO_TYPE_PIDFD 4
#define FAN_EVENT_INFO_TYPE_ERROR 5
+#define FAN_EVENT_INFO_TYPE_RANGE 6
/* Special info types for FAN_RENAME */
#define FAN_EVENT_INFO_TYPE_OLD_DFID_NAME 10
@@ -189,6 +193,13 @@ struct fanotify_event_info_error {
__u32 error_count;
};
+struct fanotify_event_info_range {
+ struct fanotify_event_info_header hdr;
+ __u32 pad;
+ __u64 offset;
+ __u64 count;
+};
+
/*
* User space may need to record additional information about its decision.
* The extra information type records what kind of information is included.
@@ -224,6 +235,13 @@ struct fanotify_response_info_audit_rule {
/* Legit userspace responses to a _PERM event */
#define FAN_ALLOW 0x01
#define FAN_DENY 0x02
+/* errno other than EPERM can specified in upper byte of deny response */
+#define FAN_ERRNO_BITS 8
+#define FAN_ERRNO_SHIFT (32 - FAN_ERRNO_BITS)
+#define FAN_ERRNO_MASK ((1 << FAN_ERRNO_BITS) - 1)
+#define FAN_DENY_ERRNO(err) \
+ (FAN_DENY | ((((__u32)(err)) & FAN_ERRNO_MASK) << FAN_ERRNO_SHIFT))
+
#define FAN_AUDIT 0x10 /* Bitmask to create audit record for result */
#define FAN_INFO 0x20 /* Bitmask to indicate additional information */
diff --git a/kernel/fork.c b/kernel/fork.c
index ded49f18cd95..2fa2a3582925 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -625,8 +625,8 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
* We depend on the oldmm having properly denied write access to the
* exe_file already.
*/
- if (exe_file && deny_write_access(exe_file))
- pr_warn_once("deny_write_access() failed in %s\n", __func__);
+ if (exe_file && exe_file_deny_write_access(exe_file))
+ pr_warn_once("exe_file_deny_write_access() failed in %s\n", __func__);
}
#ifdef CONFIG_MMU
@@ -1416,13 +1416,13 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
* We expect the caller (i.e., sys_execve) to already denied
* write access, so this is unlikely to fail.
*/
- if (unlikely(deny_write_access(new_exe_file)))
+ if (unlikely(exe_file_deny_write_access(new_exe_file)))
return -EACCES;
get_file(new_exe_file);
}
rcu_assign_pointer(mm->exe_file, new_exe_file);
if (old_exe_file) {
- allow_write_access(old_exe_file);
+ exe_file_allow_write_access(old_exe_file);
fput(old_exe_file);
}
return 0;
@@ -1463,7 +1463,7 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
return ret;
}
- ret = deny_write_access(new_exe_file);
+ ret = exe_file_deny_write_access(new_exe_file);
if (ret)
return -EACCES;
get_file(new_exe_file);
@@ -1475,7 +1475,7 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
mmap_write_unlock(mm);
if (old_exe_file) {
- allow_write_access(old_exe_file);
+ exe_file_allow_write_access(old_exe_file);
fput(old_exe_file);
}
return 0;
diff --git a/mm/filemap.c b/mm/filemap.c
index 440922a7d8f1..b8ed647416e9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -47,6 +47,7 @@
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/mm.h>
+#include <linux/fsnotify.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -3141,6 +3142,14 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
unsigned long vm_flags = vmf->vma->vm_flags;
unsigned int mmap_miss;
+ /*
+ * If we have pre-content watches we need to disable readahead to make
+ * sure that we don't populate our mapping with 0 filled pages that we
+ * never emitted an event for.
+ */
+ if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
+ return fpin;
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Use the readahead code, even if readahead is disabled */
if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
@@ -3209,6 +3218,10 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
struct file *fpin = NULL;
unsigned int mmap_miss;
+ /* See comment in do_sync_mmap_readahead. */
+ if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
+ return fpin;
+
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
return fpin;
@@ -3268,6 +3281,48 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
}
/**
+ * filemap_fsnotify_fault - maybe emit a pre-content event.
+ * @vmf: struct vm_fault containing details of the fault.
+ *
+ * If we have a pre-content watch on this file we will emit an event for this
+ * range. If we return anything the fault caller should return immediately, we
+ * will return VM_FAULT_RETRY if we had to emit an event, which will trigger the
+ * fault again and then the fault handler will run the second time through.
+ *
+ * Return: a bitwise-OR of %VM_FAULT_ codes, 0 if nothing happened.
+ */
+vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
+{
+ struct file *fpin = NULL;
+ int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_ACCESS;
+ loff_t pos = vmf->pgoff >> PAGE_SHIFT;
+ size_t count = PAGE_SIZE;
+ int err;
+
+ /*
+ * We already did this and now we're retrying with everything locked,
+ * don't emit the event and continue.
+ */
+ if (vmf->flags & FAULT_FLAG_TRIED)
+ return 0;
+
+ /* No watches, we're done. */
+ if (likely(!FMODE_FSNOTIFY_HSM(vmf->vma->vm_file->f_mode)))
+ return 0;
+
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+ if (!fpin)
+ return VM_FAULT_SIGBUS;
+
+ err = fsnotify_file_area_perm(fpin, mask, &pos, count);
+ fput(fpin);
+ if (err)
+ return VM_FAULT_SIGBUS;
+ return VM_FAULT_RETRY;
+}
+EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
+
+/**
* filemap_fault - read in file data for page fault handling
* @vmf: struct vm_fault containing details of the fault
*
@@ -3371,6 +3426,37 @@ retry_find:
*/
if (unlikely(!folio_test_uptodate(folio))) {
/*
+ * If this is a precontent file we have can now emit an event to
+ * try and populate the folio.
+ */
+ if (!(vmf->flags & FAULT_FLAG_TRIED) &&
+ unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
+ loff_t pos = folio_pos(folio);
+ size_t count = folio_size(folio);
+
+ /* We're NOWAIT, we have to retry. */
+ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
+ folio_unlock(folio);
+ goto out_retry;
+ }
+
+ if (mapping_locked)
+ filemap_invalidate_unlock_shared(mapping);
+ mapping_locked = false;
+
+ folio_unlock(folio);
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+ if (!fpin)
+ goto out_retry;
+
+ error = fsnotify_file_area_perm(fpin, MAY_ACCESS, &pos,
+ count);
+ if (error)
+ ret = VM_FAULT_SIGBUS;
+ goto out_retry;
+ }
+
+ /*
* If the invalidate lock is not held, the folio was in cache
* and uptodate and now it is not. Strange but possible since we
* didn't hold the page lock all the time. Let's drop
diff --git a/mm/memory.c b/mm/memory.c
index 398c031be9ba..f8bebec145f5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -76,6 +76,7 @@
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
+#include <linux/fsnotify.h>
#include <trace/events/kmem.h>
@@ -5662,8 +5663,17 @@ out_map:
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
+
if (vma_is_anonymous(vma))
return do_huge_pmd_anonymous_page(vmf);
+ /*
+ * Currently we just emit PAGE_SIZE for our fault events, so don't allow
+ * a huge fault if we have a pre content watch on this file. This would
+ * be trivial to support, but there would need to be tests to ensure
+ * this works properly and those don't exist currently.
+ */
+ if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
+ return VM_FAULT_FALLBACK;
if (vma->vm_ops->huge_fault)
return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
return VM_FAULT_FALLBACK;
@@ -5687,6 +5697,9 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
}
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+ /* See comment in create_huge_pmd. */
+ if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
+ goto split;
if (vma->vm_ops->huge_fault) {
ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
if (!(ret & VM_FAULT_FALLBACK))
@@ -5709,6 +5722,9 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf)
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vma))
return VM_FAULT_FALLBACK;
+ /* See comment in create_huge_pmd. */
+ if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
+ return VM_FAULT_FALLBACK;
if (vma->vm_ops->huge_fault)
return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -5726,6 +5742,9 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
if (vma_is_anonymous(vma))
goto split;
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+ /* See comment in create_huge_pmd. */
+ if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
+ goto split;
if (vma->vm_ops->huge_fault) {
ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
if (!(ret & VM_FAULT_FALLBACK))
diff --git a/mm/nommu.c b/mm/nommu.c
index 9cb6e99215e2..baa79abdaf03 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1613,6 +1613,13 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);
+vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
+{
+ BUG();
+ return 0;
+}
+EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
+
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
BUG();
diff --git a/mm/readahead.c b/mm/readahead.c
index e151f4b13ca4..95e3e71abb6b 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -128,6 +128,7 @@
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>
+#include <linux/fsnotify.h>
#include "internal.h"
@@ -549,6 +550,15 @@ void page_cache_sync_ra(struct readahead_control *ractl,
pgoff_t prev_index, miss;
/*
+ * If we have pre-content watches we need to disable readahead to make
+ * sure that we don't find 0 filled pages in cache that we never emitted
+ * events for. Filesystems supporting HSM must make sure to not call
+ * this function with ractl->file unset for files handled by HSM.
+ */
+ if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode)))
+ return;
+
+ /*
* Even if readahead is disabled, issue this request as readahead
* as we'll need it to satisfy the requested range. The forced
* readahead will do the right thing and limit the read to just the
@@ -626,6 +636,10 @@ void page_cache_async_ra(struct readahead_control *ractl,
if (!ra->ra_pages)
return;
+ /* See the comment in page_cache_sync_ra. */
+ if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode)))
+ return;
+
/*
* Same bit is used for PG_readahead and PG_reclaim.
*/
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 22fd7436f372..7b867dfec88b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3404,7 +3404,8 @@ static int selinux_path_notify(const struct path *path, u64 mask,
perm |= FILE__WATCH_WITH_PERM;
/* watches on read-like events need the file:watch_reads permission */
- if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_CLOSE_NOWRITE))
+ if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_PRE_ACCESS |
+ FS_CLOSE_NOWRITE))
perm |= FILE__WATCH_READS;
return path_has_perm(current_cred(), path, perm);