From 7a37f55af7af868119b4fb69285f5fa03ba8cf35 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 5 Aug 2025 17:10:56 +0200 Subject: fuse: add COPY_FILE_RANGE_64 that allows large copies The FUSE protocol uses struct fuse_write_out to convey the return value of copy_file_range, which is restricted to uint32_t. But the COPY_FILE_RANGE interface supports a 64-bit size copies and there's no reason why copies should be limited to 32-bit. Introduce a new op COPY_FILE_RANGE_64, which is identical, except the number of bytes copied is returned in a 64-bit value. If the fuse server does not support COPY_FILE_RANGE_64, fall back to COPY_FILE_RANGE. Reported-by: Florian Weimer Closes: https://lore.kernel.org/all/lhuh5ynl8z5.fsf@oldenburg.str.redhat.com/ Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 44 ++++++++++++++++++++++++++++++++------------ fs/fuse/fuse_i.h | 3 +++ include/uapi/linux/fuse.h | 12 +++++++++++- 3 files changed, 46 insertions(+), 13 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4adcf09d4b01..867b5fde1237 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2960,10 +2960,12 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, .nodeid_out = ff_out->nodeid, .fh_out = ff_out->fh, .off_out = pos_out, - .len = min_t(size_t, len, UINT_MAX & PAGE_MASK), + .len = len, .flags = flags }; struct fuse_write_out outarg; + struct fuse_copy_file_range_out outarg_64; + u64 bytes_copied; ssize_t err; /* mark unstable when write-back is not used, and file_out gets * extended */ @@ -3013,33 +3015,51 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (is_unstable) set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); - args.opcode = FUSE_COPY_FILE_RANGE; + args.opcode = FUSE_COPY_FILE_RANGE_64; args.nodeid = ff_in->nodeid; args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.out_numargs = 1; - args.out_args[0].size = sizeof(outarg); - args.out_args[0].value = &outarg; + args.out_args[0].size = sizeof(outarg_64); + args.out_args[0].value = &outarg_64; + if (fc->no_copy_file_range_64) { +fallback: + /* Fall back to old op that can't handle large copy length */ + args.opcode = FUSE_COPY_FILE_RANGE; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + inarg.len = len = min_t(size_t, len, UINT_MAX & PAGE_MASK); + } err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_copy_file_range = 1; - err = -EOPNOTSUPP; + if (fc->no_copy_file_range_64) { + fc->no_copy_file_range = 1; + err = -EOPNOTSUPP; + } else { + fc->no_copy_file_range_64 = 1; + goto fallback; + } } - if (!err && outarg.size > len) - err = -EIO; - if (err) goto out; + bytes_copied = fc->no_copy_file_range_64 ? + outarg.size : outarg_64.bytes_copied; + + if (bytes_copied > len) { + err = -EIO; + goto out; + } + truncate_inode_pages_range(inode_out->i_mapping, ALIGN_DOWN(pos_out, PAGE_SIZE), - ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1); + ALIGN(pos_out + bytes_copied, PAGE_SIZE) - 1); file_update_time(file_out); - fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size); + fuse_write_update_attr(inode_out, pos_out + bytes_copied, bytes_copied); - err = outarg.size; + err = bytes_copied; out: if (is_unstable) clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index cc428d04be3e..d68eea4dd743 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -856,6 +856,9 @@ struct fuse_conn { /** Does the filesystem support copy_file_range? */ unsigned no_copy_file_range:1; + /** Does the filesystem support copy_file_range_64? */ + unsigned no_copy_file_range_64:1; + /* Send DESTROY request */ unsigned int destroy:1; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 122d6586e8d4..94621f68a5cc 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -235,6 +235,10 @@ * * 7.44 * - add FUSE_NOTIFY_INC_EPOCH + * + * 7.45 + * - add FUSE_COPY_FILE_RANGE_64 + * - add struct fuse_copy_file_range_out */ #ifndef _LINUX_FUSE_H @@ -270,7 +274,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 44 +#define FUSE_KERNEL_MINOR_VERSION 45 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -657,6 +661,7 @@ enum fuse_opcode { FUSE_SYNCFS = 50, FUSE_TMPFILE = 51, FUSE_STATX = 52, + FUSE_COPY_FILE_RANGE_64 = 53, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -1148,6 +1153,11 @@ struct fuse_copy_file_range_in { uint64_t flags; }; +/* For FUSE_COPY_FILE_RANGE_64 */ +struct fuse_copy_file_range_out { + uint64_t bytes_copied; +}; + #define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0) #define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1) struct fuse_setupmapping_in { -- cgit v1.2.3 From e49a6828aba42bfbd4702e0a06aaae6a67f27285 Mon Sep 17 00:00:00 2001 From: Chen Linxuan Date: Tue, 10 Jun 2025 10:11:25 +0800 Subject: doc: fuse: Add max_background and congestion_threshold As I preparing patches adding selftests for fusectl, I notice that documentation of max_background and congestion_threshold is missing. This patch add some descriptions about these two files. Cc: Amir Goldstein Signed-off-by: Chen Linxuan Signed-off-by: Miklos Szeredi --- Documentation/filesystems/fuse.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Documentation/filesystems/fuse.rst b/Documentation/filesystems/fuse.rst index 1e31e87aee68..c589316c8bb3 100644 --- a/Documentation/filesystems/fuse.rst +++ b/Documentation/filesystems/fuse.rst @@ -129,6 +129,20 @@ For each connection the following files exist within this directory: connection. This means that all waiting requests will be aborted an error returned for all aborted and new requests. + max_background + The maximum number of background requests that can be outstanding + at a time. When the number of background requests reaches this limit, + further requests will be blocked until some are completed, potentially + causing I/O operations to stall. + + congestion_threshold + The threshold of background requests at which the kernel considers + the filesystem to be congested. When the number of background requests + exceeds this value, the kernel will skip asynchronous readahead + operations, reducing read-ahead optimizations but preserving essential + I/O, as well as suspending non-synchronous writeback operations + (WB_SYNC_NONE), delaying page cache flushing to the filesystem. + Only the owner of the mount may read or write these files. Interrupting filesystem operations -- cgit v1.2.3 From 1a7b13781b0d48312e0ead2585776b0afd1343cd Mon Sep 17 00:00:00 2001 From: Chen Linxuan Date: Tue, 10 Jun 2025 10:10:03 +0800 Subject: selftests: filesystems: Add functional test for the abort file in fusectl This patch add a simple functional test for the "abort" file in fusectlfs (/sys/fs/fuse/connections/ID/abort). A simple fuse daemon is added for testing. Related discussion can be found in the link below. Link: https://lore.kernel.org/all/CAOQ4uxjKFXOKQxPpxtS6G_nR0tpw95w0GiO68UcWg_OBhmSY=Q@mail.gmail.com/ Signed-off-by: Chen Linxuan Acked-by: Shuah Khan Reviewed-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- MAINTAINERS | 1 + tools/testing/selftests/Makefile | 1 + .../testing/selftests/filesystems/fuse/.gitignore | 3 + tools/testing/selftests/filesystems/fuse/Makefile | 21 +++ .../testing/selftests/filesystems/fuse/fuse_mnt.c | 146 +++++++++++++++++++++ .../selftests/filesystems/fuse/fusectl_test.c | 140 ++++++++++++++++++++ 6 files changed, 312 insertions(+) create mode 100644 tools/testing/selftests/filesystems/fuse/.gitignore create mode 100644 tools/testing/selftests/filesystems/fuse/Makefile create mode 100644 tools/testing/selftests/filesystems/fuse/fuse_mnt.c create mode 100644 tools/testing/selftests/filesystems/fuse/fusectl_test.c diff --git a/MAINTAINERS b/MAINTAINERS index fed6cd812d79..1b7f84ae8fd7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10064,6 +10064,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git F: Documentation/filesystems/fuse* F: fs/fuse/ F: include/uapi/linux/fuse.h +F: tools/testing/selftests/filesystems/fuse/ FUTEX SUBSYSTEM M: Thomas Gleixner diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 030da61dbff3..babed7b1c2d1 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -36,6 +36,7 @@ TARGETS += filesystems/fat TARGETS += filesystems/overlayfs TARGETS += filesystems/statmount TARGETS += filesystems/mount-notify +TARGETS += filesystems/fuse TARGETS += firmware TARGETS += fpu TARGETS += ftrace diff --git a/tools/testing/selftests/filesystems/fuse/.gitignore b/tools/testing/selftests/filesystems/fuse/.gitignore new file mode 100644 index 000000000000..3e72e742d08e --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +fuse_mnt +fusectl_test diff --git a/tools/testing/selftests/filesystems/fuse/Makefile b/tools/testing/selftests/filesystems/fuse/Makefile new file mode 100644 index 000000000000..612aad69a93a --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/Makefile @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) + +TEST_GEN_PROGS := fusectl_test +TEST_GEN_FILES := fuse_mnt + +include ../../lib.mk + +VAR_CFLAGS := $(shell pkg-config fuse --cflags 2>/dev/null) +ifeq ($(VAR_CFLAGS),) +VAR_CFLAGS := -D_FILE_OFFSET_BITS=64 -I/usr/include/fuse +endif + +VAR_LDLIBS := $(shell pkg-config fuse --libs 2>/dev/null) +ifeq ($(VAR_LDLIBS),) +VAR_LDLIBS := -lfuse -pthread +endif + +$(OUTPUT)/fuse_mnt: CFLAGS += $(VAR_CFLAGS) +$(OUTPUT)/fuse_mnt: LDLIBS += $(VAR_LDLIBS) diff --git a/tools/testing/selftests/filesystems/fuse/fuse_mnt.c b/tools/testing/selftests/filesystems/fuse/fuse_mnt.c new file mode 100644 index 000000000000..d12b17f30fad --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/fuse_mnt.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fusectl test file-system + * Creates a simple FUSE filesystem with a single read-write file (/test) + */ + +#define FUSE_USE_VERSION 26 + +#include +#include +#include +#include +#include +#include +#include + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +static char *content; +static size_t content_size = 0; +static const char test_path[] = "/test"; + +static int test_getattr(const char *path, struct stat *st) +{ + memset(st, 0, sizeof(*st)); + + if (!strcmp(path, "/")) { + st->st_mode = S_IFDIR | 0755; + st->st_nlink = 2; + return 0; + } + + if (!strcmp(path, test_path)) { + st->st_mode = S_IFREG | 0664; + st->st_nlink = 1; + st->st_size = content_size; + return 0; + } + + return -ENOENT; +} + +static int test_readdir(const char *path, void *buf, fuse_fill_dir_t filler, + off_t offset, struct fuse_file_info *fi) +{ + if (strcmp(path, "/")) + return -ENOENT; + + filler(buf, ".", NULL, 0); + filler(buf, "..", NULL, 0); + filler(buf, test_path + 1, NULL, 0); + + return 0; +} + +static int test_open(const char *path, struct fuse_file_info *fi) +{ + if (strcmp(path, test_path)) + return -ENOENT; + + return 0; +} + +static int test_read(const char *path, char *buf, size_t size, off_t offset, + struct fuse_file_info *fi) +{ + if (strcmp(path, test_path) != 0) + return -ENOENT; + + if (!content || content_size == 0) + return 0; + + if (offset >= content_size) + return 0; + + if (offset + size > content_size) + size = content_size - offset; + + memcpy(buf, content + offset, size); + + return size; +} + +static int test_write(const char *path, const char *buf, size_t size, + off_t offset, struct fuse_file_info *fi) +{ + size_t new_size; + + if (strcmp(path, test_path) != 0) + return -ENOENT; + + if(offset > content_size) + return -EINVAL; + + new_size = MAX(offset + size, content_size); + + if (new_size > content_size) + content = realloc(content, new_size); + + content_size = new_size; + + if (!content) + return -ENOMEM; + + memcpy(content + offset, buf, size); + + return size; +} + +static int test_truncate(const char *path, off_t size) +{ + if (strcmp(path, test_path) != 0) + return -ENOENT; + + if (size == 0) { + free(content); + content = NULL; + content_size = 0; + return 0; + } + + content = realloc(content, size); + + if (!content) + return -ENOMEM; + + if (size > content_size) + memset(content + content_size, 0, size - content_size); + + content_size = size; + return 0; +} + +static struct fuse_operations memfd_ops = { + .getattr = test_getattr, + .readdir = test_readdir, + .open = test_open, + .read = test_read, + .write = test_write, + .truncate = test_truncate, +}; + +int main(int argc, char *argv[]) +{ + return fuse_main(argc, argv, &memfd_ops, NULL); +} diff --git a/tools/testing/selftests/filesystems/fuse/fusectl_test.c b/tools/testing/selftests/filesystems/fuse/fusectl_test.c new file mode 100644 index 000000000000..8d124d1cacb2 --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/fusectl_test.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2025 Chen Linxuan + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../kselftest_harness.h" + +#define FUSECTL_MOUNTPOINT "/sys/fs/fuse/connections" +#define FUSE_MOUNTPOINT "/tmp/fuse_mnt_XXXXXX" +#define FUSE_DEVICE "/dev/fuse" +#define FUSECTL_TEST_VALUE "1" + +static void write_file(struct __test_metadata *const _metadata, + const char *path, const char *val) +{ + int fd = open(path, O_WRONLY); + size_t len = strlen(val); + + ASSERT_GE(fd, 0); + ASSERT_EQ(write(fd, val, len), len); + ASSERT_EQ(close(fd), 0); +} + +FIXTURE(fusectl){ + char fuse_mountpoint[sizeof(FUSE_MOUNTPOINT)]; + int connection; +}; + +FIXTURE_SETUP(fusectl) +{ + const char *fuse_mnt_prog = "./fuse_mnt"; + int status, pid; + struct stat statbuf; + uid_t uid = getuid(); + gid_t gid = getgid(); + char buf[32]; + + /* Setup userns */ + ASSERT_EQ(unshare(CLONE_NEWNS|CLONE_NEWUSER), 0); + sprintf(buf, "0 %d 1", uid); + write_file(_metadata, "/proc/self/uid_map", buf); + write_file(_metadata, "/proc/self/setgroups", "deny"); + sprintf(buf, "0 %d 1", gid); + write_file(_metadata, "/proc/self/gid_map", buf); + ASSERT_EQ(mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL), 0); + + strcpy(self->fuse_mountpoint, FUSE_MOUNTPOINT); + + if (!mkdtemp(self->fuse_mountpoint)) + SKIP(return, + "Failed to create FUSE mountpoint %s", + strerror(errno)); + + if (access(FUSECTL_MOUNTPOINT, F_OK)) + SKIP(return, + "FUSE control filesystem not mounted"); + + pid = fork(); + if (pid < 0) + SKIP(return, + "Failed to fork FUSE daemon process: %s", + strerror(errno)); + + if (pid == 0) { + execlp(fuse_mnt_prog, fuse_mnt_prog, self->fuse_mountpoint, NULL); + exit(errno); + } + + waitpid(pid, &status, 0); + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + SKIP(return, + "Failed to start FUSE daemon %s", + strerror(WEXITSTATUS(status))); + } + + if (stat(self->fuse_mountpoint, &statbuf)) + SKIP(return, + "Failed to stat FUSE mountpoint %s", + strerror(errno)); + + self->connection = statbuf.st_dev; +} + +FIXTURE_TEARDOWN(fusectl) +{ + umount2(self->fuse_mountpoint, MNT_DETACH); + rmdir(self->fuse_mountpoint); +} + +TEST_F(fusectl, abort) +{ + char path_buf[PATH_MAX]; + int abort_fd, test_fd, ret; + + sprintf(path_buf, "/sys/fs/fuse/connections/%d/abort", self->connection); + + ASSERT_EQ(0, access(path_buf, F_OK)); + + abort_fd = open(path_buf, O_WRONLY); + ASSERT_GE(abort_fd, 0); + + sprintf(path_buf, "%s/test", self->fuse_mountpoint); + + test_fd = open(path_buf, O_RDWR); + ASSERT_GE(test_fd, 0); + + ret = read(test_fd, path_buf, sizeof(path_buf)); + ASSERT_EQ(ret, 0); + + ret = write(test_fd, "test", sizeof("test")); + ASSERT_EQ(ret, sizeof("test")); + + ret = lseek(test_fd, 0, SEEK_SET); + ASSERT_GE(ret, 0); + + ret = write(abort_fd, FUSECTL_TEST_VALUE, sizeof(FUSECTL_TEST_VALUE)); + ASSERT_GT(ret, 0); + + close(abort_fd); + + ret = read(test_fd, path_buf, sizeof(path_buf)); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, ENOTCONN); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From 6be0ddb20200c0e45c1e7db4b817fccd04d53f2d Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Thu, 12 Jun 2025 10:22:39 +0700 Subject: Documentation: fuse: Consolidate FUSE docs into its own subdirectory All four FUSE docs are currently in upper-level Documentation/filesystems/ directory, but these are distinct as a group of its own. Move them into Documentation/filesystems/fuse/ subdirectory. Signed-off-by: Bagas Sanjaya Acked-by: Jeff Layton Signed-off-by: Miklos Szeredi --- Documentation/filesystems/fuse-io-uring.rst | 99 ----- Documentation/filesystems/fuse-io.rst | 45 --- Documentation/filesystems/fuse-passthrough.rst | 133 ------- Documentation/filesystems/fuse.rst | 440 --------------------- Documentation/filesystems/fuse/fuse-io-uring.rst | 99 +++++ Documentation/filesystems/fuse/fuse-io.rst | 45 +++ .../filesystems/fuse/fuse-passthrough.rst | 133 +++++++ Documentation/filesystems/fuse/fuse.rst | 440 +++++++++++++++++++++ Documentation/filesystems/fuse/index.rst | 14 + Documentation/filesystems/index.rst | 5 +- MAINTAINERS | 2 +- 11 files changed, 733 insertions(+), 722 deletions(-) delete mode 100644 Documentation/filesystems/fuse-io-uring.rst delete mode 100644 Documentation/filesystems/fuse-io.rst delete mode 100644 Documentation/filesystems/fuse-passthrough.rst delete mode 100644 Documentation/filesystems/fuse.rst create mode 100644 Documentation/filesystems/fuse/fuse-io-uring.rst create mode 100644 Documentation/filesystems/fuse/fuse-io.rst create mode 100644 Documentation/filesystems/fuse/fuse-passthrough.rst create mode 100644 Documentation/filesystems/fuse/fuse.rst create mode 100644 Documentation/filesystems/fuse/index.rst diff --git a/Documentation/filesystems/fuse-io-uring.rst b/Documentation/filesystems/fuse-io-uring.rst deleted file mode 100644 index d73dd0dbd238..000000000000 --- a/Documentation/filesystems/fuse-io-uring.rst +++ /dev/null @@ -1,99 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -======================================= -FUSE-over-io-uring design documentation -======================================= - -This documentation covers basic details how the fuse -kernel/userspace communication through io-uring is configured -and works. For generic details about FUSE see fuse.rst. - -This document also covers the current interface, which is -still in development and might change. - -Limitations -=========== -As of now not all requests types are supported through io-uring, userspace -is required to also handle requests through /dev/fuse after io-uring setup -is complete. Specifically notifications (initiated from the daemon side) -and interrupts. - -Fuse io-uring configuration -=========================== - -Fuse kernel requests are queued through the classical /dev/fuse -read/write interface - until io-uring setup is complete. - -In order to set up fuse-over-io-uring fuse-server (user-space) -needs to submit SQEs (opcode = IORING_OP_URING_CMD) to the /dev/fuse -connection file descriptor. Initial submit is with the sub command -FUSE_URING_REQ_REGISTER, which will just register entries to be -available in the kernel. - -Once at least one entry per queue is submitted, kernel starts -to enqueue to ring queues. -Note, every CPU core has its own fuse-io-uring queue. -Userspace handles the CQE/fuse-request and submits the result as -subcommand FUSE_URING_REQ_COMMIT_AND_FETCH - kernel completes -the requests and also marks the entry available again. If there are -pending requests waiting the request will be immediately submitted -to the daemon again. - -Initial SQE ------------:: - - | | FUSE filesystem daemon - | | - | | >io_uring_submit() - | | IORING_OP_URING_CMD / - | | FUSE_URING_CMD_REGISTER - | | [wait cqe] - | | >io_uring_wait_cqe() or - | | >io_uring_submit_and_wait() - | | - | >fuse_uring_cmd() | - | >fuse_uring_register() | - - -Sending requests with CQEs ---------------------------:: - - | | FUSE filesystem daemon - | | [waiting for CQEs] - | "rm /mnt/fuse/file" | - | | - | >sys_unlink() | - | >fuse_unlink() | - | [allocate request] | - | >fuse_send_one() | - | ... | - | >fuse_uring_queue_fuse_req | - | [queue request on fg queue] | - | >fuse_uring_add_req_to_ring_ent() | - | ... | - | >fuse_uring_copy_to_ring() | - | >io_uring_cmd_done() | - | >request_wait_answer() | - | [sleep on req->waitq] | - | | [receives and handles CQE] - | | [submit result and fetch next] - | | >io_uring_submit() - | | IORING_OP_URING_CMD/ - | | FUSE_URING_CMD_COMMIT_AND_FETCH - | >fuse_uring_cmd() | - | >fuse_uring_commit_fetch() | - | >fuse_uring_commit() | - | >fuse_uring_copy_from_ring() | - | [ copy the result to the fuse req] | - | >fuse_uring_req_end() | - | >fuse_request_end() | - | [wake up req->waitq] | - | >fuse_uring_next_fuse_req | - | [wait or handle next req] | - | | - | [req->waitq woken up] | - | s_stack_depth`` and ``fc->max_stack_depth``). -For example, during the ``FUSE_INIT`` handshake, the FUSE daemon can negotiate -the ``max_stack_depth`` it supports. When a backing file is registered via -``FUSE_DEV_IOC_BACKING_OPEN``, the kernel checks if the backing file's -filesystem stack depth is within the allowed limit. - -The ``CAP_SYS_ADMIN`` requirement provides an additional layer of security, -ensuring that only privileged users can create these potentially complex -stacking arrangements. - -General Security Posture ------------------------- - -As a general principle for new kernel features that allow userspace to instruct -the kernel to perform direct operations on its behalf based on user-provided -file descriptors, starting with a higher privilege requirement (like -``CAP_SYS_ADMIN``) is a conservative and common security practice. This allows -the feature to be used and tested while further security implications are -evaluated and addressed. diff --git a/Documentation/filesystems/fuse.rst b/Documentation/filesystems/fuse.rst deleted file mode 100644 index c589316c8bb3..000000000000 --- a/Documentation/filesystems/fuse.rst +++ /dev/null @@ -1,440 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -==== -FUSE -==== - -Definitions -=========== - -Userspace filesystem: - A filesystem in which data and metadata are provided by an ordinary - userspace process. The filesystem can be accessed normally through - the kernel interface. - -Filesystem daemon: - The process(es) providing the data and metadata of the filesystem. - -Non-privileged mount (or user mount): - A userspace filesystem mounted by a non-privileged (non-root) user. - The filesystem daemon is running with the privileges of the mounting - user. NOTE: this is not the same as mounts allowed with the "user" - option in /etc/fstab, which is not discussed here. - -Filesystem connection: - A connection between the filesystem daemon and the kernel. The - connection exists until either the daemon dies, or the filesystem is - umounted. Note that detaching (or lazy umounting) the filesystem - does *not* break the connection, in this case it will exist until - the last reference to the filesystem is released. - -Mount owner: - The user who does the mounting. - -User: - The user who is performing filesystem operations. - -What is FUSE? -============= - -FUSE is a userspace filesystem framework. It consists of a kernel -module (fuse.ko), a userspace library (libfuse.*) and a mount utility -(fusermount). - -One of the most important features of FUSE is allowing secure, -non-privileged mounts. This opens up new possibilities for the use of -filesystems. A good example is sshfs: a secure network filesystem -using the sftp protocol. - -The userspace library and utilities are available from the -`FUSE homepage: `_ - -Filesystem type -=============== - -The filesystem type given to mount(2) can be one of the following: - - fuse - This is the usual way to mount a FUSE filesystem. The first - argument of the mount system call may contain an arbitrary string, - which is not interpreted by the kernel. - - fuseblk - The filesystem is block device based. The first argument of the - mount system call is interpreted as the name of the device. - -Mount options -============= - -fd=N - The file descriptor to use for communication between the userspace - filesystem and the kernel. The file descriptor must have been - obtained by opening the FUSE device ('/dev/fuse'). - -rootmode=M - The file mode of the filesystem's root in octal representation. - -user_id=N - The numeric user id of the mount owner. - -group_id=N - The numeric group id of the mount owner. - -default_permissions - By default FUSE doesn't check file access permissions, the - filesystem is free to implement its access policy or leave it to - the underlying file access mechanism (e.g. in case of network - filesystems). This option enables permission checking, restricting - access based on file mode. It is usually useful together with the - 'allow_other' mount option. - -allow_other - This option overrides the security measure restricting file access - to the user mounting the filesystem. This option is by default only - allowed to root, but this restriction can be removed with a - (userspace) configuration option. - -max_read=N - With this option the maximum size of read operations can be set. - The default is infinite. Note that the size of read requests is - limited anyway to 32 pages (which is 128kbyte on i386). - -blksize=N - Set the block size for the filesystem. The default is 512. This - option is only valid for 'fuseblk' type mounts. - -Control filesystem -================== - -There's a control filesystem for FUSE, which can be mounted by:: - - mount -t fusectl none /sys/fs/fuse/connections - -Mounting it under the '/sys/fs/fuse/connections' directory makes it -backwards compatible with earlier versions. - -Under the fuse control filesystem each connection has a directory -named by a unique number. - -For each connection the following files exist within this directory: - - waiting - The number of requests which are waiting to be transferred to - userspace or being processed by the filesystem daemon. If there is - no filesystem activity and 'waiting' is non-zero, then the - filesystem is hung or deadlocked. - - abort - Writing anything into this file will abort the filesystem - connection. This means that all waiting requests will be aborted an - error returned for all aborted and new requests. - - max_background - The maximum number of background requests that can be outstanding - at a time. When the number of background requests reaches this limit, - further requests will be blocked until some are completed, potentially - causing I/O operations to stall. - - congestion_threshold - The threshold of background requests at which the kernel considers - the filesystem to be congested. When the number of background requests - exceeds this value, the kernel will skip asynchronous readahead - operations, reducing read-ahead optimizations but preserving essential - I/O, as well as suspending non-synchronous writeback operations - (WB_SYNC_NONE), delaying page cache flushing to the filesystem. - -Only the owner of the mount may read or write these files. - -Interrupting filesystem operations -################################## - -If a process issuing a FUSE filesystem request is interrupted, the -following will happen: - - - If the request is not yet sent to userspace AND the signal is - fatal (SIGKILL or unhandled fatal signal), then the request is - dequeued and returns immediately. - - - If the request is not yet sent to userspace AND the signal is not - fatal, then an interrupted flag is set for the request. When - the request has been successfully transferred to userspace and - this flag is set, an INTERRUPT request is queued. - - - If the request is already sent to userspace, then an INTERRUPT - request is queued. - -INTERRUPT requests take precedence over other requests, so the -userspace filesystem will receive queued INTERRUPTs before any others. - -The userspace filesystem may ignore the INTERRUPT requests entirely, -or may honor them by sending a reply to the *original* request, with -the error set to EINTR. - -It is also possible that there's a race between processing the -original request and its INTERRUPT request. There are two possibilities: - - 1. The INTERRUPT request is processed before the original request is - processed - - 2. The INTERRUPT request is processed after the original request has - been answered - -If the filesystem cannot find the original request, it should wait for -some timeout and/or a number of new requests to arrive, after which it -should reply to the INTERRUPT request with an EAGAIN error. In case -1) the INTERRUPT request will be requeued. In case 2) the INTERRUPT -reply will be ignored. - -Aborting a filesystem connection -================================ - -It is possible to get into certain situations where the filesystem is -not responding. Reasons for this may be: - - a) Broken userspace filesystem implementation - - b) Network connection down - - c) Accidental deadlock - - d) Malicious deadlock - -(For more on c) and d) see later sections) - -In either of these cases it may be useful to abort the connection to -the filesystem. There are several ways to do this: - - - Kill the filesystem daemon. Works in case of a) and b) - - - Kill the filesystem daemon and all users of the filesystem. Works - in all cases except some malicious deadlocks - - - Use forced umount (umount -f). Works in all cases but only if - filesystem is still attached (it hasn't been lazy unmounted) - - - Abort filesystem through the FUSE control filesystem. Most - powerful method, always works. - -How do non-privileged mounts work? -================================== - -Since the mount() system call is a privileged operation, a helper -program (fusermount) is needed, which is installed setuid root. - -The implication of providing non-privileged mounts is that the mount -owner must not be able to use this capability to compromise the -system. Obvious requirements arising from this are: - - A) mount owner should not be able to get elevated privileges with the - help of the mounted filesystem - - B) mount owner should not get illegitimate access to information from - other users' and the super user's processes - - C) mount owner should not be able to induce undesired behavior in - other users' or the super user's processes - -How are requirements fulfilled? -=============================== - - A) The mount owner could gain elevated privileges by either: - - 1. creating a filesystem containing a device file, then opening this device - - 2. creating a filesystem containing a suid or sgid application, then executing this application - - The solution is not to allow opening device files and ignore - setuid and setgid bits when executing programs. To ensure this - fusermount always adds "nosuid" and "nodev" to the mount options - for non-privileged mounts. - - B) If another user is accessing files or directories in the - filesystem, the filesystem daemon serving requests can record the - exact sequence and timing of operations performed. This - information is otherwise inaccessible to the mount owner, so this - counts as an information leak. - - The solution to this problem will be presented in point 2) of C). - - C) There are several ways in which the mount owner can induce - undesired behavior in other users' processes, such as: - - 1) mounting a filesystem over a file or directory which the mount - owner could otherwise not be able to modify (or could only - make limited modifications). - - This is solved in fusermount, by checking the access - permissions on the mountpoint and only allowing the mount if - the mount owner can do unlimited modification (has write - access to the mountpoint, and mountpoint is not a "sticky" - directory) - - 2) Even if 1) is solved the mount owner can change the behavior - of other users' processes. - - i) It can slow down or indefinitely delay the execution of a - filesystem operation creating a DoS against the user or the - whole system. For example a suid application locking a - system file, and then accessing a file on the mount owner's - filesystem could be stopped, and thus causing the system - file to be locked forever. - - ii) It can present files or directories of unlimited length, or - directory structures of unlimited depth, possibly causing a - system process to eat up diskspace, memory or other - resources, again causing *DoS*. - - The solution to this as well as B) is not to allow processes - to access the filesystem, which could otherwise not be - monitored or manipulated by the mount owner. Since if the - mount owner can ptrace a process, it can do all of the above - without using a FUSE mount, the same criteria as used in - ptrace can be used to check if a process is allowed to access - the filesystem or not. - - Note that the *ptrace* check is not strictly necessary to - prevent C/2/i, it is enough to check if mount owner has enough - privilege to send signal to the process accessing the - filesystem, since *SIGSTOP* can be used to get a similar effect. - -I think these limitations are unacceptable? -=========================================== - -If a sysadmin trusts the users enough, or can ensure through other -measures, that system processes will never enter non-privileged -mounts, it can relax the last limitation in several ways: - - - With the 'user_allow_other' config option. If this config option is - set, the mounting user can add the 'allow_other' mount option which - disables the check for other users' processes. - - User namespaces have an unintuitive interaction with 'allow_other': - an unprivileged user - normally restricted from mounting with - 'allow_other' - could do so in a user namespace where they're - privileged. If any process could access such an 'allow_other' mount - this would give the mounting user the ability to manipulate - processes in user namespaces where they're unprivileged. For this - reason 'allow_other' restricts access to users in the same userns - or a descendant. - - - With the 'allow_sys_admin_access' module option. If this option is - set, super user's processes have unrestricted access to mounts - irrespective of allow_other setting or user namespace of the - mounting user. - -Note that both of these relaxations expose the system to potential -information leak or *DoS* as described in points B and C/2/i-ii in the -preceding section. - -Kernel - userspace interface -============================ - -The following diagram shows how a filesystem operation (in this -example unlink) is performed in FUSE. :: - - - | "rm /mnt/fuse/file" | FUSE filesystem daemon - | | - | | >sys_read() - | | >fuse_dev_read() - | | >request_wait() - | | [sleep on fc->waitq] - | | - | >sys_unlink() | - | >fuse_unlink() | - | [get request from | - | fc->unused_list] | - | >request_send() | - | [queue req on fc->pending] | - | [wake up fc->waitq] | [woken up] - | >request_wait_answer() | - | [sleep on req->waitq] | - | | pending] - | | [copy req to read buffer] - | | [add req to fc->processing] - | | sys_write() - | | >fuse_dev_write() - | | [look up req in fc->processing] - | | [remove from fc->processing] - | | [copy write buffer to req] - | [woken up] | [wake up req->waitq] - | | unused_list] | - | sys_unlink("/mnt/fuse/file") | - | [acquire inode semaphore | - | for "file"] | - | >fuse_unlink() | - | [sleep on req->waitq] | - | | sys_unlink("/mnt/fuse/file") - | | [acquire inode semaphore - | | for "file"] - | | *DEADLOCK* - -The solution for this is to allow the filesystem to be aborted. - -**Scenario 2 - Tricky deadlock** - - -This one needs a carefully crafted filesystem. It's a variation on -the above, only the call back to the filesystem is not explicit, -but is caused by a pagefault. :: - - | Kamikaze filesystem thread 1 | Kamikaze filesystem thread 2 - | | - | [fd = open("/mnt/fuse/file")] | [request served normally] - | [mmap fd to 'addr'] | - | [close fd] | [FLUSH triggers 'magic' flag] - | [read a byte from addr] | - | >do_page_fault() | - | [find or create page] | - | [lock page] | - | >fuse_readpage() | - | [queue READ request] | - | [sleep on req->waitq] | - | | [read request to buffer] - | | [create reply header before addr] - | | >sys_write(addr - headerlength) - | | >fuse_dev_write() - | | [look up req in fc->processing] - | | [remove from fc->processing] - | | [copy write buffer to req] - | | >do_page_fault() - | | [find or create page] - | | [lock page] - | | * DEADLOCK * - -The solution is basically the same as above. - -An additional problem is that while the write buffer is being copied -to the request, the request must not be interrupted/aborted. This is -because the destination address of the copy may not be valid after the -request has returned. - -This is solved with doing the copy atomically, and allowing abort -while the page(s) belonging to the write buffer are faulted with -get_user_pages(). The 'req->locked' flag indicates when the copy is -taking place, and abort is delayed until this flag is unset. diff --git a/Documentation/filesystems/fuse/fuse-io-uring.rst b/Documentation/filesystems/fuse/fuse-io-uring.rst new file mode 100644 index 000000000000..d73dd0dbd238 --- /dev/null +++ b/Documentation/filesystems/fuse/fuse-io-uring.rst @@ -0,0 +1,99 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================================= +FUSE-over-io-uring design documentation +======================================= + +This documentation covers basic details how the fuse +kernel/userspace communication through io-uring is configured +and works. For generic details about FUSE see fuse.rst. + +This document also covers the current interface, which is +still in development and might change. + +Limitations +=========== +As of now not all requests types are supported through io-uring, userspace +is required to also handle requests through /dev/fuse after io-uring setup +is complete. Specifically notifications (initiated from the daemon side) +and interrupts. + +Fuse io-uring configuration +=========================== + +Fuse kernel requests are queued through the classical /dev/fuse +read/write interface - until io-uring setup is complete. + +In order to set up fuse-over-io-uring fuse-server (user-space) +needs to submit SQEs (opcode = IORING_OP_URING_CMD) to the /dev/fuse +connection file descriptor. Initial submit is with the sub command +FUSE_URING_REQ_REGISTER, which will just register entries to be +available in the kernel. + +Once at least one entry per queue is submitted, kernel starts +to enqueue to ring queues. +Note, every CPU core has its own fuse-io-uring queue. +Userspace handles the CQE/fuse-request and submits the result as +subcommand FUSE_URING_REQ_COMMIT_AND_FETCH - kernel completes +the requests and also marks the entry available again. If there are +pending requests waiting the request will be immediately submitted +to the daemon again. + +Initial SQE +-----------:: + + | | FUSE filesystem daemon + | | + | | >io_uring_submit() + | | IORING_OP_URING_CMD / + | | FUSE_URING_CMD_REGISTER + | | [wait cqe] + | | >io_uring_wait_cqe() or + | | >io_uring_submit_and_wait() + | | + | >fuse_uring_cmd() | + | >fuse_uring_register() | + + +Sending requests with CQEs +--------------------------:: + + | | FUSE filesystem daemon + | | [waiting for CQEs] + | "rm /mnt/fuse/file" | + | | + | >sys_unlink() | + | >fuse_unlink() | + | [allocate request] | + | >fuse_send_one() | + | ... | + | >fuse_uring_queue_fuse_req | + | [queue request on fg queue] | + | >fuse_uring_add_req_to_ring_ent() | + | ... | + | >fuse_uring_copy_to_ring() | + | >io_uring_cmd_done() | + | >request_wait_answer() | + | [sleep on req->waitq] | + | | [receives and handles CQE] + | | [submit result and fetch next] + | | >io_uring_submit() + | | IORING_OP_URING_CMD/ + | | FUSE_URING_CMD_COMMIT_AND_FETCH + | >fuse_uring_cmd() | + | >fuse_uring_commit_fetch() | + | >fuse_uring_commit() | + | >fuse_uring_copy_from_ring() | + | [ copy the result to the fuse req] | + | >fuse_uring_req_end() | + | >fuse_request_end() | + | [wake up req->waitq] | + | >fuse_uring_next_fuse_req | + | [wait or handle next req] | + | | + | [req->waitq woken up] | + | s_stack_depth`` and ``fc->max_stack_depth``). +For example, during the ``FUSE_INIT`` handshake, the FUSE daemon can negotiate +the ``max_stack_depth`` it supports. When a backing file is registered via +``FUSE_DEV_IOC_BACKING_OPEN``, the kernel checks if the backing file's +filesystem stack depth is within the allowed limit. + +The ``CAP_SYS_ADMIN`` requirement provides an additional layer of security, +ensuring that only privileged users can create these potentially complex +stacking arrangements. + +General Security Posture +------------------------ + +As a general principle for new kernel features that allow userspace to instruct +the kernel to perform direct operations on its behalf based on user-provided +file descriptors, starting with a higher privilege requirement (like +``CAP_SYS_ADMIN``) is a conservative and common security practice. This allows +the feature to be used and tested while further security implications are +evaluated and addressed. diff --git a/Documentation/filesystems/fuse/fuse.rst b/Documentation/filesystems/fuse/fuse.rst new file mode 100644 index 000000000000..0fbd5a03fdc9 --- /dev/null +++ b/Documentation/filesystems/fuse/fuse.rst @@ -0,0 +1,440 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============= +FUSE Overview +============= + +Definitions +=========== + +Userspace filesystem: + A filesystem in which data and metadata are provided by an ordinary + userspace process. The filesystem can be accessed normally through + the kernel interface. + +Filesystem daemon: + The process(es) providing the data and metadata of the filesystem. + +Non-privileged mount (or user mount): + A userspace filesystem mounted by a non-privileged (non-root) user. + The filesystem daemon is running with the privileges of the mounting + user. NOTE: this is not the same as mounts allowed with the "user" + option in /etc/fstab, which is not discussed here. + +Filesystem connection: + A connection between the filesystem daemon and the kernel. The + connection exists until either the daemon dies, or the filesystem is + umounted. Note that detaching (or lazy umounting) the filesystem + does *not* break the connection, in this case it will exist until + the last reference to the filesystem is released. + +Mount owner: + The user who does the mounting. + +User: + The user who is performing filesystem operations. + +What is FUSE? +============= + +FUSE is a userspace filesystem framework. It consists of a kernel +module (fuse.ko), a userspace library (libfuse.*) and a mount utility +(fusermount). + +One of the most important features of FUSE is allowing secure, +non-privileged mounts. This opens up new possibilities for the use of +filesystems. A good example is sshfs: a secure network filesystem +using the sftp protocol. + +The userspace library and utilities are available from the +`FUSE homepage: `_ + +Filesystem type +=============== + +The filesystem type given to mount(2) can be one of the following: + + fuse + This is the usual way to mount a FUSE filesystem. The first + argument of the mount system call may contain an arbitrary string, + which is not interpreted by the kernel. + + fuseblk + The filesystem is block device based. The first argument of the + mount system call is interpreted as the name of the device. + +Mount options +============= + +fd=N + The file descriptor to use for communication between the userspace + filesystem and the kernel. The file descriptor must have been + obtained by opening the FUSE device ('/dev/fuse'). + +rootmode=M + The file mode of the filesystem's root in octal representation. + +user_id=N + The numeric user id of the mount owner. + +group_id=N + The numeric group id of the mount owner. + +default_permissions + By default FUSE doesn't check file access permissions, the + filesystem is free to implement its access policy or leave it to + the underlying file access mechanism (e.g. in case of network + filesystems). This option enables permission checking, restricting + access based on file mode. It is usually useful together with the + 'allow_other' mount option. + +allow_other + This option overrides the security measure restricting file access + to the user mounting the filesystem. This option is by default only + allowed to root, but this restriction can be removed with a + (userspace) configuration option. + +max_read=N + With this option the maximum size of read operations can be set. + The default is infinite. Note that the size of read requests is + limited anyway to 32 pages (which is 128kbyte on i386). + +blksize=N + Set the block size for the filesystem. The default is 512. This + option is only valid for 'fuseblk' type mounts. + +Control filesystem +================== + +There's a control filesystem for FUSE, which can be mounted by:: + + mount -t fusectl none /sys/fs/fuse/connections + +Mounting it under the '/sys/fs/fuse/connections' directory makes it +backwards compatible with earlier versions. + +Under the fuse control filesystem each connection has a directory +named by a unique number. + +For each connection the following files exist within this directory: + + waiting + The number of requests which are waiting to be transferred to + userspace or being processed by the filesystem daemon. If there is + no filesystem activity and 'waiting' is non-zero, then the + filesystem is hung or deadlocked. + + abort + Writing anything into this file will abort the filesystem + connection. This means that all waiting requests will be aborted an + error returned for all aborted and new requests. + + max_background + The maximum number of background requests that can be outstanding + at a time. When the number of background requests reaches this limit, + further requests will be blocked until some are completed, potentially + causing I/O operations to stall. + + congestion_threshold + The threshold of background requests at which the kernel considers + the filesystem to be congested. When the number of background requests + exceeds this value, the kernel will skip asynchronous readahead + operations, reducing read-ahead optimizations but preserving essential + I/O, as well as suspending non-synchronous writeback operations + (WB_SYNC_NONE), delaying page cache flushing to the filesystem. + +Only the owner of the mount may read or write these files. + +Interrupting filesystem operations +################################## + +If a process issuing a FUSE filesystem request is interrupted, the +following will happen: + + - If the request is not yet sent to userspace AND the signal is + fatal (SIGKILL or unhandled fatal signal), then the request is + dequeued and returns immediately. + + - If the request is not yet sent to userspace AND the signal is not + fatal, then an interrupted flag is set for the request. When + the request has been successfully transferred to userspace and + this flag is set, an INTERRUPT request is queued. + + - If the request is already sent to userspace, then an INTERRUPT + request is queued. + +INTERRUPT requests take precedence over other requests, so the +userspace filesystem will receive queued INTERRUPTs before any others. + +The userspace filesystem may ignore the INTERRUPT requests entirely, +or may honor them by sending a reply to the *original* request, with +the error set to EINTR. + +It is also possible that there's a race between processing the +original request and its INTERRUPT request. There are two possibilities: + + 1. The INTERRUPT request is processed before the original request is + processed + + 2. The INTERRUPT request is processed after the original request has + been answered + +If the filesystem cannot find the original request, it should wait for +some timeout and/or a number of new requests to arrive, after which it +should reply to the INTERRUPT request with an EAGAIN error. In case +1) the INTERRUPT request will be requeued. In case 2) the INTERRUPT +reply will be ignored. + +Aborting a filesystem connection +================================ + +It is possible to get into certain situations where the filesystem is +not responding. Reasons for this may be: + + a) Broken userspace filesystem implementation + + b) Network connection down + + c) Accidental deadlock + + d) Malicious deadlock + +(For more on c) and d) see later sections) + +In either of these cases it may be useful to abort the connection to +the filesystem. There are several ways to do this: + + - Kill the filesystem daemon. Works in case of a) and b) + + - Kill the filesystem daemon and all users of the filesystem. Works + in all cases except some malicious deadlocks + + - Use forced umount (umount -f). Works in all cases but only if + filesystem is still attached (it hasn't been lazy unmounted) + + - Abort filesystem through the FUSE control filesystem. Most + powerful method, always works. + +How do non-privileged mounts work? +================================== + +Since the mount() system call is a privileged operation, a helper +program (fusermount) is needed, which is installed setuid root. + +The implication of providing non-privileged mounts is that the mount +owner must not be able to use this capability to compromise the +system. Obvious requirements arising from this are: + + A) mount owner should not be able to get elevated privileges with the + help of the mounted filesystem + + B) mount owner should not get illegitimate access to information from + other users' and the super user's processes + + C) mount owner should not be able to induce undesired behavior in + other users' or the super user's processes + +How are requirements fulfilled? +=============================== + + A) The mount owner could gain elevated privileges by either: + + 1. creating a filesystem containing a device file, then opening this device + + 2. creating a filesystem containing a suid or sgid application, then executing this application + + The solution is not to allow opening device files and ignore + setuid and setgid bits when executing programs. To ensure this + fusermount always adds "nosuid" and "nodev" to the mount options + for non-privileged mounts. + + B) If another user is accessing files or directories in the + filesystem, the filesystem daemon serving requests can record the + exact sequence and timing of operations performed. This + information is otherwise inaccessible to the mount owner, so this + counts as an information leak. + + The solution to this problem will be presented in point 2) of C). + + C) There are several ways in which the mount owner can induce + undesired behavior in other users' processes, such as: + + 1) mounting a filesystem over a file or directory which the mount + owner could otherwise not be able to modify (or could only + make limited modifications). + + This is solved in fusermount, by checking the access + permissions on the mountpoint and only allowing the mount if + the mount owner can do unlimited modification (has write + access to the mountpoint, and mountpoint is not a "sticky" + directory) + + 2) Even if 1) is solved the mount owner can change the behavior + of other users' processes. + + i) It can slow down or indefinitely delay the execution of a + filesystem operation creating a DoS against the user or the + whole system. For example a suid application locking a + system file, and then accessing a file on the mount owner's + filesystem could be stopped, and thus causing the system + file to be locked forever. + + ii) It can present files or directories of unlimited length, or + directory structures of unlimited depth, possibly causing a + system process to eat up diskspace, memory or other + resources, again causing *DoS*. + + The solution to this as well as B) is not to allow processes + to access the filesystem, which could otherwise not be + monitored or manipulated by the mount owner. Since if the + mount owner can ptrace a process, it can do all of the above + without using a FUSE mount, the same criteria as used in + ptrace can be used to check if a process is allowed to access + the filesystem or not. + + Note that the *ptrace* check is not strictly necessary to + prevent C/2/i, it is enough to check if mount owner has enough + privilege to send signal to the process accessing the + filesystem, since *SIGSTOP* can be used to get a similar effect. + +I think these limitations are unacceptable? +=========================================== + +If a sysadmin trusts the users enough, or can ensure through other +measures, that system processes will never enter non-privileged +mounts, it can relax the last limitation in several ways: + + - With the 'user_allow_other' config option. If this config option is + set, the mounting user can add the 'allow_other' mount option which + disables the check for other users' processes. + + User namespaces have an unintuitive interaction with 'allow_other': + an unprivileged user - normally restricted from mounting with + 'allow_other' - could do so in a user namespace where they're + privileged. If any process could access such an 'allow_other' mount + this would give the mounting user the ability to manipulate + processes in user namespaces where they're unprivileged. For this + reason 'allow_other' restricts access to users in the same userns + or a descendant. + + - With the 'allow_sys_admin_access' module option. If this option is + set, super user's processes have unrestricted access to mounts + irrespective of allow_other setting or user namespace of the + mounting user. + +Note that both of these relaxations expose the system to potential +information leak or *DoS* as described in points B and C/2/i-ii in the +preceding section. + +Kernel - userspace interface +============================ + +The following diagram shows how a filesystem operation (in this +example unlink) is performed in FUSE. :: + + + | "rm /mnt/fuse/file" | FUSE filesystem daemon + | | + | | >sys_read() + | | >fuse_dev_read() + | | >request_wait() + | | [sleep on fc->waitq] + | | + | >sys_unlink() | + | >fuse_unlink() | + | [get request from | + | fc->unused_list] | + | >request_send() | + | [queue req on fc->pending] | + | [wake up fc->waitq] | [woken up] + | >request_wait_answer() | + | [sleep on req->waitq] | + | | pending] + | | [copy req to read buffer] + | | [add req to fc->processing] + | | sys_write() + | | >fuse_dev_write() + | | [look up req in fc->processing] + | | [remove from fc->processing] + | | [copy write buffer to req] + | [woken up] | [wake up req->waitq] + | | unused_list] | + | sys_unlink("/mnt/fuse/file") | + | [acquire inode semaphore | + | for "file"] | + | >fuse_unlink() | + | [sleep on req->waitq] | + | | sys_unlink("/mnt/fuse/file") + | | [acquire inode semaphore + | | for "file"] + | | *DEADLOCK* + +The solution for this is to allow the filesystem to be aborted. + +**Scenario 2 - Tricky deadlock** + + +This one needs a carefully crafted filesystem. It's a variation on +the above, only the call back to the filesystem is not explicit, +but is caused by a pagefault. :: + + | Kamikaze filesystem thread 1 | Kamikaze filesystem thread 2 + | | + | [fd = open("/mnt/fuse/file")] | [request served normally] + | [mmap fd to 'addr'] | + | [close fd] | [FLUSH triggers 'magic' flag] + | [read a byte from addr] | + | >do_page_fault() | + | [find or create page] | + | [lock page] | + | >fuse_readpage() | + | [queue READ request] | + | [sleep on req->waitq] | + | | [read request to buffer] + | | [create reply header before addr] + | | >sys_write(addr - headerlength) + | | >fuse_dev_write() + | | [look up req in fc->processing] + | | [remove from fc->processing] + | | [copy write buffer to req] + | | >do_page_fault() + | | [find or create page] + | | [lock page] + | | * DEADLOCK * + +The solution is basically the same as above. + +An additional problem is that while the write buffer is being copied +to the request, the request must not be interrupted/aborted. This is +because the destination address of the copy may not be valid after the +request has returned. + +This is solved with doing the copy atomically, and allowing abort +while the page(s) belonging to the write buffer are faulted with +get_user_pages(). The 'req->locked' flag indicates when the copy is +taking place, and abort is delayed until this flag is unset. diff --git a/Documentation/filesystems/fuse/index.rst b/Documentation/filesystems/fuse/index.rst new file mode 100644 index 000000000000..393a845214da --- /dev/null +++ b/Documentation/filesystems/fuse/index.rst @@ -0,0 +1,14 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================================================== +FUSE (Filesystem in Userspace) Technical Documentation +====================================================== + +.. toctree:: + :maxdepth: 2 + :numbered: + + fuse + fuse-io + fuse-io-uring + fuse-passthrough diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 11a599387266..84c5a0d11b6d 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -96,10 +96,7 @@ Documentation for filesystem implementations. hfs hfsplus hpfs - fuse - fuse-io - fuse-io-uring - fuse-passthrough + fuse/index inotify isofs nilfs2 diff --git a/MAINTAINERS b/MAINTAINERS index 1b7f84ae8fd7..51600a98a785 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10061,7 +10061,7 @@ L: linux-fsdevel@vger.kernel.org S: Maintained W: https://github.com/libfuse/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git -F: Documentation/filesystems/fuse* +F: Documentation/filesystems/fuse/* F: fs/fuse/ F: include/uapi/linux/fuse.h F: tools/testing/selftests/filesystems/fuse/ -- cgit v1.2.3 From dd6a5a71c811289eec234e78cb9ca34d055d2ad5 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 10 Jun 2025 13:52:28 +0900 Subject: sched/wait: Add wait_event_state_exclusive() Allows exclusive waits with a custom @state. Signed-off-by: Sergey Senozhatsky Acked-by: Peter Zijlstra (Intel) Signed-off-by: Miklos Szeredi --- include/linux/wait.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/linux/wait.h b/include/linux/wait.h index 09855d819418..f648044466d5 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -965,6 +965,18 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); __ret; \ }) +#define __wait_event_state_exclusive(wq, condition, state) \ + ___wait_event(wq, condition, state, 1, 0, schedule()) + +#define wait_event_state_exclusive(wq, condition, state) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_event_state_exclusive(wq, condition, state); \ + __ret; \ +}) + #define __wait_event_killable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_KILLABLE, 0, timeout, \ -- cgit v1.2.3 From 14cbb72d7595f3a962a7b206f75a32b1bd9fa335 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 10 Jun 2025 13:52:29 +0900 Subject: fuse: use freezable wait in fuse_get_req() Use freezable wait in fuse_get_req() so that it won't block the system from entering suspend: Freezing user space processes failed after 20.009 seconds Call trace: __switch_to+0xcc/0x168 schedule+0x57c/0x1138 fuse_get_req+0xd0/0x2b0 fuse_simple_request+0x120/0x620 fuse_getxattr+0xe4/0x158 fuse_xattr_get+0x2c/0x48 __vfs_getxattr+0x160/0x1d8 get_vfs_caps_from_disk+0x74/0x1a8 __audit_inode+0x244/0x4d8 user_path_at_empty+0x2e0/0x390 __arm64_sys_faccessat+0xdc/0x260 Signed-off-by: Sergey Senozhatsky Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5150aa25e64b..3d8a3edebc23 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -207,8 +207,9 @@ static struct fuse_req *fuse_get_req(struct mnt_idmap *idmap, if (fuse_block_alloc(fc, for_background)) { err = -EINTR; - if (wait_event_killable_exclusive(fc->blocked_waitq, - !fuse_block_alloc(fc, for_background))) + if (wait_event_state_exclusive(fc->blocked_waitq, + !fuse_block_alloc(fc, for_background), + (TASK_KILLABLE | TASK_FREEZABLE))) goto out; } /* Matches smp_wmb() in fuse_set_initialized() */ -- cgit v1.2.3 From 6fd26f50857698c6f07a9e6b149247925fadb8fd Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 20 May 2025 13:16:54 -0700 Subject: fuse: remove unneeded offset assignment when filling write pages With the change in aee03ea7ff98 ("fuse: support large folios for writethrough writes"), this old line for setting ap->descs[0].offset is now obsolete and unneeded. This should have been removed as part of aee03ea7ff98. Signed-off-by: Joanne Koong Fixes: aee03ea7ff98 ("fuse: support large folios for writethrough writes") Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 867b5fde1237..ffda61c9bc0c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1175,7 +1175,6 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, num = min(iov_iter_count(ii), fc->max_write); ap->args.in_pages = true; - ap->descs[0].offset = offset; while (num && ap->num_folios < max_folios) { size_t tmp; -- cgit v1.2.3 From b4da63cea158f050865220a05ab691cfe8fb6450 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Sun, 15 Jun 2025 21:20:39 +0800 Subject: virtio_fs: Remove redundant spinlock in virtio_fs_request_complete() Since clear_bit is an atomic operation, the spinlock is redundant and can be removed, reducing lock contention is good for performance. Signed-off-by: Li RongQing Reviewed-by: Stefan Hajnoczi Signed-off-by: Miklos Szeredi --- fs/fuse/virtio_fs.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index c826e7ca49f5..aeb488750fa6 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -761,7 +761,6 @@ static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req) static void virtio_fs_request_complete(struct fuse_req *req, struct virtio_fs_vq *fsvq) { - struct fuse_pqueue *fpq = &fsvq->fud->pq; struct fuse_args *args; struct fuse_args_pages *ap; unsigned int len, i, thislen; @@ -790,9 +789,7 @@ static void virtio_fs_request_complete(struct fuse_req *req, } } - spin_lock(&fpq->lock); clear_bit(FR_SENT, &req->flags); - spin_unlock(&fpq->lock); fuse_request_end(req); spin_lock(&fsvq->lock); -- cgit v1.2.3 From 494d2f508883a6e5c4530e5c6b3c8b2bbfb7318d Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 7 Jul 2025 16:46:05 -0700 Subject: fuse: use default writeback accounting commit 0c58a97f919c ("fuse: remove tmp folio for writebacks and internal rb tree") removed temp folios for dirty page writeback. Consequently, fuse can now use the default writeback accounting. With switching fuse to use default writeback accounting, there are some added benefits. This updates wb->writeback_inodes tracking as well now and updates writeback throughput estimates after writeback completion. This commit also removes inc_wb_stat() and dec_wb_stat(). These have no callers anymore now that fuse does not call them. Signed-off-by: Joanne Koong Reviewed-by: David Hildenbrand Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 9 +-------- fs/fuse/inode.c | 2 -- include/linux/backing-dev.h | 10 ---------- 3 files changed, 1 insertion(+), 20 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ffda61c9bc0c..0698bcb2f992 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1822,19 +1822,15 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa) struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); - struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - for (i = 0; i < ap->num_folios; i++) { + for (i = 0; i < ap->num_folios; i++) /* * Benchmarks showed that ending writeback within the * scope of the fi->lock alleviates xarray lock * contention and noticeably improves performance. */ iomap_finish_folio_write(inode, ap->folios[i], 1); - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - wb_writeout_inc(&bdi->wb); - } wake_up(&fi->page_waitq); } @@ -2009,14 +2005,11 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, uint32_t folio_index, loff_t offset, unsigned len) { - struct inode *inode = folio->mapping->host; struct fuse_args_pages *ap = &wpa->ia.ap; ap->folios[folio_index] = folio; ap->descs[folio_index].offset = offset; ap->descs[folio_index].length = len; - - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); } static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 7ddfd2b3cc9c..19fc58cb84dc 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1561,8 +1561,6 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) if (err) return err; - /* fuse does it's own writeback accounting */ - sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT; sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT; /* diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e721148c95d0..9a1e895dd5df 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -66,16 +66,6 @@ static inline void wb_stat_mod(struct bdi_writeback *wb, percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH); } -static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) -{ - wb_stat_mod(wb, item, 1); -} - -static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) -{ - wb_stat_mod(wb, item, -1); -} - static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { return percpu_counter_read_positive(&wb->stat[item]); -- cgit v1.2.3 From 2841808f35eebfd07150333f3af3007cb2904a09 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 7 Jul 2025 16:46:06 -0700 Subject: mm: remove BDI_CAP_WRITEBACK_ACCT There are no users of BDI_CAP_WRITEBACK_ACCT now that fuse doesn't do its own writeback accounting. This commit removes BDI_CAP_WRITEBACK_ACCT. Signed-off-by: Joanne Koong Acked-by: David Hildenbrand Signed-off-by: Miklos Szeredi --- include/linux/backing-dev.h | 4 +--- mm/backing-dev.c | 2 +- mm/page-writeback.c | 43 ++++++++++++++++++------------------------- 3 files changed, 20 insertions(+), 29 deletions(-) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 9a1e895dd5df..3e64f14739dd 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -108,12 +108,10 @@ int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit * * BDI_CAP_WRITEBACK: Supports dirty page writeback, and dirty pages * should contribute to accounting - * BDI_CAP_WRITEBACK_ACCT: Automatically account writeback pages * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold */ #define BDI_CAP_WRITEBACK (1 << 0) -#define BDI_CAP_WRITEBACK_ACCT (1 << 1) -#define BDI_CAP_STRICTLIMIT (1 << 2) +#define BDI_CAP_STRICTLIMIT (1 << 1) extern struct backing_dev_info noop_backing_dev_info; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 783904d8c5ef..35f11e75e30e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1026,7 +1026,7 @@ struct backing_dev_info *bdi_alloc(int node_id) kfree(bdi); return NULL; } - bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT; + bdi->capabilities = BDI_CAP_WRITEBACK; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3e248d1c3969..de669636120d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -3014,26 +3014,22 @@ bool __folio_end_writeback(struct folio *folio) if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; - struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb = inode_to_wb(inode); unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); __xa_clear_mark(&mapping->i_pages, folio_index(folio), PAGECACHE_TAG_WRITEBACK); - if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { - struct bdi_writeback *wb = inode_to_wb(inode); - wb_stat_mod(wb, WB_WRITEBACK, -nr); - __wb_writeout_add(wb, nr); - if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) - wb_inode_writeback_end(wb); + wb_stat_mod(wb, WB_WRITEBACK, -nr); + __wb_writeout_add(wb, nr); + if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { + wb_inode_writeback_end(wb); + if (mapping->host) + sb_clear_inode_writeback(mapping->host); } - if (mapping->host && !mapping_tagged(mapping, - PAGECACHE_TAG_WRITEBACK)) - sb_clear_inode_writeback(mapping->host); - xa_unlock_irqrestore(&mapping->i_pages, flags); } else { ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); @@ -3058,7 +3054,7 @@ void __folio_start_writeback(struct folio *folio, bool keep_write) if (mapping && mapping_use_writeback_tags(mapping)) { XA_STATE(xas, &mapping->i_pages, folio_index(folio)); struct inode *inode = mapping->host; - struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb = inode_to_wb(inode); unsigned long flags; bool on_wblist; @@ -3069,21 +3065,18 @@ void __folio_start_writeback(struct folio *folio, bool keep_write) on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); - if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { - struct bdi_writeback *wb = inode_to_wb(inode); - - wb_stat_mod(wb, WB_WRITEBACK, nr); - if (!on_wblist) - wb_inode_writeback_start(wb); + wb_stat_mod(wb, WB_WRITEBACK, nr); + if (!on_wblist) { + wb_inode_writeback_start(wb); + /* + * We can come through here when swapping anonymous + * folios, so we don't necessarily have an inode to + * track for sync. + */ + if (mapping->host) + sb_mark_inode_writeback(mapping->host); } - /* - * We can come through here when swapping anonymous - * folios, so we don't necessarily have an inode to - * track for sync. - */ - if (mapping->host && !on_wblist) - sb_mark_inode_writeback(mapping->host); if (!folio_test_dirty(folio)) xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); if (!keep_write) -- cgit v1.2.3 From 7dbe6442487743ad492d9143f1f404c1f4a05e0e Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 3 Jul 2025 14:47:38 +0800 Subject: virtio_fs: fix the hash table using in virtio_fs_enqueue_req() The original commit be2ff42c5d6e ("fuse: Use hash table to link processing request") converted fuse_pqueue->processing to a hash table, but virtio_fs_enqueue_req() was not updated to use it correctly. So use fuse_pqueue->processing as a hash table, this make the code more coherent Co-developed-by: Fushuai Wang Signed-off-by: Fushuai Wang Signed-off-by: Li RongQing Reviewed-by: Stefan Hajnoczi Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 1 + fs/fuse/virtio_fs.c | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 3d8a3edebc23..f6259711ca1c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -323,6 +323,7 @@ unsigned int fuse_req_hash(u64 unique) { return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); } +EXPORT_SYMBOL_GPL(fuse_req_hash); /* * A new request is available, wake fiq->waitq diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index aeb488750fa6..c3a39061363e 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -20,6 +20,7 @@ #include #include #include "fuse_i.h" +#include "fuse_dev_i.h" /* Used to help calculate the FUSE connection's max_pages limit for a request's * size. Parts of the struct fuse_req are sliced into scattergather lists in @@ -1381,7 +1382,7 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, unsigned int out_sgs = 0; unsigned int in_sgs = 0; unsigned int total_sgs; - unsigned int i; + unsigned int i, hash; int ret; bool notify; struct fuse_pqueue *fpq; @@ -1441,8 +1442,9 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, /* Request successfully sent. */ fpq = &fsvq->fud->pq; + hash = fuse_req_hash(req->in.h.unique); spin_lock(&fpq->lock); - list_add_tail(&req->list, fpq->processing); + list_add_tail(&req->list, &fpq->processing[hash]); spin_unlock(&fpq->lock); set_bit(FR_SENT, &req->flags); /* matches barrier in request_wait_answer() */ -- cgit v1.2.3 From 8c14f2086b94818e4a30e82c810443604f2f5a2b Mon Sep 17 00:00:00 2001 From: Chunsheng Luo Date: Wed, 30 Jul 2025 21:06:03 +0800 Subject: fuse: remove unused 'inode' parameter in fuse_passthrough_open The 'inode' parameter in fuse_passthrough_open() is never referenced in the function implementation. Signed-off-by: Chunsheng Luo Reviewed-by: Joanne Koong Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 4 +--- fs/fuse/iomode.c | 3 +-- fs/fuse/passthrough.c | 4 +--- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d68eea4dd743..486fa550c951 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1555,9 +1555,7 @@ void fuse_backing_files_free(struct fuse_conn *fc); int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map); int fuse_backing_close(struct fuse_conn *fc, int backing_id); -struct fuse_backing *fuse_passthrough_open(struct file *file, - struct inode *inode, - int backing_id); +struct fuse_backing *fuse_passthrough_open(struct file *file, int backing_id); void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb); static inline struct file *fuse_file_passthrough(struct fuse_file *ff) diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c index c99e285f3183..3728933188f3 100644 --- a/fs/fuse/iomode.c +++ b/fs/fuse/iomode.c @@ -177,8 +177,7 @@ static int fuse_file_passthrough_open(struct inode *inode, struct file *file) (ff->open_flags & ~FOPEN_PASSTHROUGH_MASK)) return -EINVAL; - fb = fuse_passthrough_open(file, inode, - ff->args->open_outarg.backing_id); + fb = fuse_passthrough_open(file, ff->args->open_outarg.backing_id); if (IS_ERR(fb)) return PTR_ERR(fb); diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index eb97ac009e75..0bb80112741a 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -306,9 +306,7 @@ out: * * Returns an fb object with elevated refcount to be stored in fuse inode. */ -struct fuse_backing *fuse_passthrough_open(struct file *file, - struct inode *inode, - int backing_id) +struct fuse_backing *fuse_passthrough_open(struct file *file, int backing_id) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fm->fc; -- cgit v1.2.3 From 3ca1b311181072415b6432a169de765ac2034e5a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 19 Aug 2025 16:44:02 +0200 Subject: fuse: zero initialize inode private data This is slightly tricky, since the VFS uses non-zeroing allocation to preserve some fields that are left in a consistent state. Reported-by: Chunsheng Luo Closes: https://lore.kernel.org/all/20250818083224.229-1-luochunsheng@ustc.edu/ Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 19fc58cb84dc..9d26a5bc394d 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -101,14 +101,11 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) if (!fi) return NULL; - fi->i_time = 0; + /* Initialize private data (i.e. everything except fi->inode) */ + BUILD_BUG_ON(offsetof(struct fuse_inode, inode) != 0); + memset((void *) fi + sizeof(fi->inode), 0, sizeof(*fi) - sizeof(fi->inode)); + fi->inval_mask = ~0; - fi->nodeid = 0; - fi->nlookup = 0; - fi->attr_version = 0; - fi->orig_ino = 0; - fi->state = 0; - fi->submount_lookup = NULL; mutex_init(&fi->mutex); spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); -- cgit v1.2.3 From dfb84c33079497bf27058b15780e1c7bba4c371b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Aug 2025 13:10:44 +0200 Subject: fuse: allow synchronous FUSE_INIT FUSE_INIT has always been asynchronous with mount. That means that the server processed this request after the mount syscall returned. This means that FUSE_INIT can't supply the root inode's ID, hence it currently has a hardcoded value. There are other limitations such as not being able to perform getxattr during mount, which is needed by selinux. To remove these limitations allow server to process FUSE_INIT while initializing the in-core super block for the fuse filesystem. This can only be done if the server is prepared to handle this, so add FUSE_DEV_IOC_SYNC_INIT ioctl, which a) lets the server know whether this feature is supported, returning ENOTTY othewrwise. b) lets the kernel know to perform a synchronous initialization The implementation is slightly tricky, since fuse_dev/fuse_conn are set up only during super block creation. This is solved by setting the private data of the fuse device file to a special value ((struct fuse_dev *) 1) and waiting for this to be turned into a proper fuse_dev before commecing with operations on the device file. Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 3 +- fs/fuse/dev.c | 71 +++++++++++++++++++++++++++++++++++------------ fs/fuse/dev_uring.c | 4 +-- fs/fuse/fuse_dev_i.h | 13 +++++++-- fs/fuse/fuse_i.h | 5 +++- fs/fuse/inode.c | 51 +++++++++++++++++++++++++++------- include/uapi/linux/fuse.h | 1 + 7 files changed, 114 insertions(+), 34 deletions(-) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b39844d75a80..28c96961e85d 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -52,6 +52,7 @@ #include #include "fuse_i.h" +#include "fuse_dev_i.h" #define CUSE_CONNTBL_LEN 64 @@ -547,7 +548,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file) */ static int cuse_channel_release(struct inode *inode, struct file *file) { - struct fuse_dev *fud = file->private_data; + struct fuse_dev *fud = __fuse_get_dev(file); struct cuse_conn *cc = fc_to_cc(fud->fc); /* remove from the conntbl, no more access from this point on */ diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f6259711ca1c..2f1619e266e1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1530,14 +1530,34 @@ static int fuse_dev_open(struct inode *inode, struct file *file) return 0; } +struct fuse_dev *fuse_get_dev(struct file *file) +{ + struct fuse_dev *fud = __fuse_get_dev(file); + int err; + + if (likely(fud)) + return fud; + + err = wait_event_interruptible(fuse_dev_waitq, + READ_ONCE(file->private_data) != FUSE_DEV_SYNC_INIT); + if (err) + return ERR_PTR(err); + + fud = __fuse_get_dev(file); + if (!fud) + return ERR_PTR(-EPERM); + + return fud; +} + static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) { struct fuse_copy_state cs; struct file *file = iocb->ki_filp; struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!user_backed_iter(to)) return -EINVAL; @@ -1557,8 +1577,8 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, struct fuse_copy_state cs; struct fuse_dev *fud = fuse_get_dev(in); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer), GFP_KERNEL); @@ -2231,7 +2251,7 @@ copy_finish: static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) { struct fuse_copy_state cs; - struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp); + struct fuse_dev *fud = __fuse_get_dev(iocb->ki_filp); if (!fud) return -EPERM; @@ -2253,11 +2273,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, unsigned idx; struct pipe_buffer *bufs; struct fuse_copy_state cs; - struct fuse_dev *fud; + struct fuse_dev *fud = __fuse_get_dev(out); size_t rem; ssize_t ret; - fud = fuse_get_dev(out); if (!fud) return -EPERM; @@ -2343,7 +2362,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) struct fuse_iqueue *fiq; struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) + if (IS_ERR(fud)) return EPOLLERR; fiq = &fud->fc->iq; @@ -2490,7 +2509,7 @@ void fuse_wait_aborted(struct fuse_conn *fc) int fuse_dev_release(struct inode *inode, struct file *file) { - struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_dev *fud = __fuse_get_dev(file); if (fud) { struct fuse_conn *fc = fud->fc; @@ -2521,8 +2540,8 @@ static int fuse_dev_fasync(int fd, struct file *file, int on) { struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); /* No locking - fasync_helper does its own locking */ return fasync_helper(fd, file, on, &fud->fc->iq.fasync); @@ -2532,7 +2551,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) { struct fuse_dev *fud; - if (new->private_data) + if (__fuse_get_dev(new)) return -EINVAL; fud = fuse_dev_alloc_install(fc); @@ -2563,7 +2582,7 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) * uses the same ioctl handler. */ if (fd_file(f)->f_op == file->f_op) - fud = fuse_get_dev(fd_file(f)); + fud = __fuse_get_dev(fd_file(f)); res = -EINVAL; if (fud) { @@ -2581,8 +2600,8 @@ static long fuse_dev_ioctl_backing_open(struct file *file, struct fuse_dev *fud = fuse_get_dev(file); struct fuse_backing_map map; - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) return -EOPNOTSUPP; @@ -2598,8 +2617,8 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) struct fuse_dev *fud = fuse_get_dev(file); int backing_id; - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) return -EOPNOTSUPP; @@ -2610,6 +2629,19 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) return fuse_backing_close(fud->fc, backing_id); } +static long fuse_dev_ioctl_sync_init(struct file *file) +{ + int err = -EINVAL; + + mutex_lock(&fuse_mutex); + if (!__fuse_get_dev(file)) { + WRITE_ONCE(file->private_data, FUSE_DEV_SYNC_INIT); + err = 0; + } + mutex_unlock(&fuse_mutex); + return err; +} + static long fuse_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2625,6 +2657,9 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, case FUSE_DEV_IOC_BACKING_CLOSE: return fuse_dev_ioctl_backing_close(file, argp); + case FUSE_DEV_IOC_SYNC_INIT: + return fuse_dev_ioctl_sync_init(file); + default: return -ENOTTY; } @@ -2633,7 +2668,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, #ifdef CONFIG_PROC_FS static void fuse_dev_show_fdinfo(struct seq_file *seq, struct file *file) { - struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_dev *fud = __fuse_get_dev(file); if (!fud) return; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 249b210becb1..bef38ed78249 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1139,9 +1139,9 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -EINVAL; fud = fuse_get_dev(cmd->file); - if (!fud) { + if (IS_ERR(fud)) { pr_info_ratelimited("No fuse device found\n"); - return -ENOTCONN; + return PTR_ERR(fud); } fc = fud->fc; diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 5a9bd771a319..6e8373f97040 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -12,6 +12,8 @@ #define FUSE_INT_REQ_BIT (1ULL << 0) #define FUSE_REQ_ID_STEP (1ULL << 1) +extern struct wait_queue_head fuse_dev_waitq; + struct fuse_arg; struct fuse_args; struct fuse_pqueue; @@ -37,15 +39,22 @@ struct fuse_copy_state { } ring; }; -static inline struct fuse_dev *fuse_get_dev(struct file *file) +#define FUSE_DEV_SYNC_INIT ((struct fuse_dev *) 1) +#define FUSE_DEV_PTR_MASK (~1UL) + +static inline struct fuse_dev *__fuse_get_dev(struct file *file) { /* * Lockless access is OK, because file->private data is set * once during mount and is valid until the file is released. */ - return READ_ONCE(file->private_data); + struct fuse_dev *fud = READ_ONCE(file->private_data); + + return (typeof(fud)) ((unsigned long) fud & FUSE_DEV_PTR_MASK); } +struct fuse_dev *fuse_get_dev(struct file *file); + unsigned int fuse_req_hash(u64 unique); struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 486fa550c951..233c6111f768 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -904,6 +904,9 @@ struct fuse_conn { /* Is link not implemented by fs? */ unsigned int no_link:1; + /* Is synchronous FUSE_INIT allowed? */ + unsigned int sync_init:1; + /* Use io_uring for communication */ unsigned int io_uring; @@ -1318,7 +1321,7 @@ struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc); struct fuse_dev *fuse_dev_alloc(void); void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc); void fuse_dev_free(struct fuse_dev *fud); -void fuse_send_init(struct fuse_mount *fm); +int fuse_send_init(struct fuse_mount *fm); /** * Fill in superblock and initialize fuse connection diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9d26a5bc394d..5b7897bf7e45 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "fuse_dev_i.h" #include "dev_uring_i.h" #include @@ -34,6 +35,7 @@ MODULE_LICENSE("GPL"); static struct kmem_cache *fuse_inode_cachep; struct list_head fuse_conn_list; DEFINE_MUTEX(fuse_mutex); +DECLARE_WAIT_QUEUE_HEAD(fuse_dev_waitq); static int set_global_limit(const char *val, const struct kernel_param *kp); @@ -1466,7 +1468,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, wake_up_all(&fc->blocked_waitq); } -void fuse_send_init(struct fuse_mount *fm) +static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm) { struct fuse_init_args *ia; u64 flags; @@ -1525,10 +1527,30 @@ void fuse_send_init(struct fuse_mount *fm) ia->args.out_args[0].value = &ia->out; ia->args.force = true; ia->args.nocreds = true; - ia->args.end = process_init_reply; - if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0) - process_init_reply(fm, &ia->args, -ENOTCONN); + return ia; +} + +int fuse_send_init(struct fuse_mount *fm) +{ + struct fuse_init_args *ia = fuse_new_init(fm); + int err; + + if (fm->fc->sync_init) { + err = fuse_simple_request(fm, &ia->args); + /* Ignore size of init reply */ + if (err > 0) + err = 0; + } else { + ia->args.end = process_init_reply; + err = fuse_simple_background(fm, &ia->args, GFP_KERNEL); + if (!err) + return 0; + } + process_init_reply(fm, &ia->args, err); + if (fm->fc->conn_error) + return -ENOTCONN; + return 0; } EXPORT_SYMBOL_GPL(fuse_send_init); @@ -1867,8 +1889,12 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) mutex_lock(&fuse_mutex); err = -EINVAL; - if (ctx->fudptr && *ctx->fudptr) - goto err_unlock; + if (ctx->fudptr && *ctx->fudptr) { + if (*ctx->fudptr == FUSE_DEV_SYNC_INIT) + fc->sync_init = 1; + else + goto err_unlock; + } err = fuse_ctl_add_conn(fc); if (err) @@ -1876,8 +1902,10 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - if (ctx->fudptr) + if (ctx->fudptr) { *ctx->fudptr = fud; + wake_up_all(&fuse_dev_waitq); + } mutex_unlock(&fuse_mutex); return 0; @@ -1898,6 +1926,7 @@ EXPORT_SYMBOL_GPL(fuse_fill_super_common); static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; + struct fuse_mount *fm; int err; if (!ctx->file || !ctx->rootmode_present || @@ -1918,8 +1947,10 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) return err; /* file->private_data shall be visible on all CPUs after this */ smp_mb(); - fuse_send_init(get_fuse_mount_super(sb)); - return 0; + + fm = get_fuse_mount_super(sb); + + return fuse_send_init(fm); } /* @@ -1980,7 +2011,7 @@ static int fuse_get_tree(struct fs_context *fsc) * Allow creating a fuse mount with an already initialized fuse * connection */ - fud = READ_ONCE(ctx->file->private_data); + fud = __fuse_get_dev(ctx->file); if (ctx->file->f_op == &fuse_dev_operations && fud) { fsc->sget_key = fud->fc; sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 94621f68a5cc..6b9fb8b08768 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -1131,6 +1131,7 @@ struct fuse_backing_map { #define FUSE_DEV_IOC_BACKING_OPEN _IOW(FUSE_DEV_IOC_MAGIC, 1, \ struct fuse_backing_map) #define FUSE_DEV_IOC_BACKING_CLOSE _IOW(FUSE_DEV_IOC_MAGIC, 2, uint32_t) +#define FUSE_DEV_IOC_SYNC_INIT _IO(FUSE_DEV_IOC_MAGIC, 3) struct fuse_lseek_in { uint64_t fh; -- cgit v1.2.3 From b3c7ab1d2593213c2d3339830b3950a689e83867 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 1 Sep 2025 10:43:50 +0200 Subject: fuse: fix references to fuse.rst -> fuse/fuse.rst Commit 6be0ddb20200 ("Documentation: fuse: Consolidate FUSE docs into its own subdirectory") moved fuse docs to a subdirectory but didn't update references inside the kernel tree. Fixes: 6be0ddb20200 ("Documentation: fuse: Consolidate FUSE docs into its own subdirectory") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202508261621.EaNMWVjm-lkp@intel.com/ Signed-off-by: Miklos Szeredi --- Documentation/filesystems/sysfs.rst | 2 +- Documentation/translations/zh_CN/filesystems/sysfs.txt | 2 +- Documentation/translations/zh_TW/filesystems/sysfs.txt | 2 +- fs/fuse/Kconfig | 2 +- fs/fuse/dev.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/sysfs.rst b/Documentation/filesystems/sysfs.rst index c32993bc83c7..be966f27f284 100644 --- a/Documentation/filesystems/sysfs.rst +++ b/Documentation/filesystems/sysfs.rst @@ -321,7 +321,7 @@ span multiple bus types). fs/ contains a directory for some filesystems. Currently each filesystem wanting to export attributes must create its own hierarchy -below fs/ (see ./fuse.rst for an example). +below fs/ (see fuse/fuse.rst for an example). module/ contains parameter values and state information for all loaded system modules, for both builtin and loadable modules. diff --git a/Documentation/translations/zh_CN/filesystems/sysfs.txt b/Documentation/translations/zh_CN/filesystems/sysfs.txt index 547062759e60..b17c9f638628 100644 --- a/Documentation/translations/zh_CN/filesystems/sysfs.txt +++ b/Documentation/translations/zh_CN/filesystems/sysfs.txt @@ -282,7 +282,7 @@ drivers/ 包含了每个已为特定总线上的设备而挂载的驱动程序 假定驱动没有跨越多个总线类型)。 fs/ 包含了一个为文件系统设立的目录。现在每个想要导出属性的文件系统必须 -在 fs/ 下创建自己的层次结构(参见Documentation/filesystems/fuse.rst)。 +在 fs/ 下创建自己的层次结构(参见Documentation/filesystems/fuse/fuse.rst)。 dev/ 包含两个子目录: char/ 和 block/。在这两个子目录中,有以 : 格式命名的符号链接。这些符号链接指向 sysfs 目录 diff --git a/Documentation/translations/zh_TW/filesystems/sysfs.txt b/Documentation/translations/zh_TW/filesystems/sysfs.txt index 978462d5fe14..d1cee02ef1de 100644 --- a/Documentation/translations/zh_TW/filesystems/sysfs.txt +++ b/Documentation/translations/zh_TW/filesystems/sysfs.txt @@ -285,7 +285,7 @@ drivers/ 包含了每個已爲特定總線上的設備而掛載的驅動程序 假定驅動沒有跨越多個總線類型)。 fs/ 包含了一個爲文件系統設立的目錄。現在每個想要導出屬性的文件系統必須 -在 fs/ 下創建自己的層次結構(參見Documentation/filesystems/fuse.rst)。 +在 fs/ 下創建自己的層次結構(參見Documentation/filesystems/fuse/fuse.rst)。 dev/ 包含兩個子目錄: char/ 和 block/。在這兩個子目錄中,有以 : 格式命名的符號鏈接。這些符號鏈接指向 sysfs 目錄 diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index a774166264de..3a4ae632c94a 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -13,7 +13,7 @@ config FUSE_FS although chances are your distribution already has that library installed if you've installed the "fuse" package itself. - See for more information. + See for more information. See for needed library/utility version. If you want to develop a userspace FS, or if you want to use diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 2f1619e266e1..df793003eb0c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2415,7 +2415,7 @@ static void end_polls(struct fuse_conn *fc) * The same effect is usually achievable through killing the filesystem daemon * and all users of the filesystem. The exception is the combination of an * asynchronous request and the tricky deadlock (see - * Documentation/filesystems/fuse.rst). + * Documentation/filesystems/fuse/fuse.rst). * * Aborting requests under I/O goes as follows: 1: Separate out unlocked * requests, they should be finished off immediately. Locked requests will be -- cgit v1.2.3 From 02d47e213dce6e2fa197d0cff66e92f1bc55f00f Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Fri, 29 Aug 2025 16:56:27 -0700 Subject: fuse: remove fuse_readpages_end() null mapping check Remove extra logic in fuse_readpages_end() that checks against null folio mappings. This was added in commit ce534fb05292 ("fuse: allow splice to move pages"): "Since the remove_from_page_cache() + add_to_page_cache_locked() are non-atomic it is possible that the page cache is repopulated in between the two and add_to_page_cache_locked() will fail. This could be fixed by creating a new atomic replace_page_cache_page() function. fuse_readpages_end() needed to be reworked so it works even if page->mapping is NULL for some or all pages which can happen if the add_to_page_cache_locked() failed." Commit ef6a3c63112e ("mm: add replace_page_cache_page() function") added atomic page cache replacement, which means the check against null mappings can be removed. Signed-off-by: Joanne Koong Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 0698bcb2f992..54786f62a9d8 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -865,22 +865,20 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, struct fuse_args_pages *ap = &ia->ap; size_t count = ia->read.in.size; size_t num_read = args->out_args[0].size; - struct address_space *mapping = NULL; - - for (i = 0; mapping == NULL && i < ap->num_folios; i++) - mapping = ap->folios[i]->mapping; + struct address_space *mapping; + struct inode *inode; - if (mapping) { - struct inode *inode = mapping->host; + WARN_ON_ONCE(!ap->num_folios); + mapping = ap->folios[0]->mapping; + inode = mapping->host; - /* - * Short read means EOF. If file size is larger, truncate it - */ - if (!err && num_read < count) - fuse_short_read(inode, ia->read.attr_ver, num_read, ap); + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (!err && num_read < count) + fuse_short_read(inode, ia->read.attr_ver, num_read, ap); - fuse_invalidate_atime(inode); - } + fuse_invalidate_atime(inode); for (i = 0; i < ap->num_folios; i++) { folio_end_read(ap->folios[i], !err); -- cgit v1.2.3 From 0a0fdb98d16e334e259352893462030f15fb887f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 26 Aug 2025 17:08:19 +0200 Subject: fuse: remove FUSE_NOTIFY_CODE_MAX from Constants that change value from version to version have no place in an interface definition. Hopefully this won't break anything. Reviewed-by: Joanne Koong Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 6b9fb8b08768..30bf0846547f 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -680,7 +680,6 @@ enum fuse_notify_code { FUSE_NOTIFY_DELETE = 6, FUSE_NOTIFY_RESEND = 7, FUSE_NOTIFY_INC_EPOCH = 8, - FUSE_NOTIFY_CODE_MAX, }; /* The read buffer is required to be at least 8k, but may be much larger */ -- cgit v1.2.3 From 0b563aad1c0a05dc7d123f68a9f82f79de206dad Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 1 Sep 2025 17:16:26 +0200 Subject: fuse: fix possibly missing fuse_copy_finish() call in fuse_notify() In case of FUSE_NOTIFY_RESEND and FUSE_NOTIFY_INC_EPOCH fuse_copy_finish() isn't called. Fix by always calling fuse_copy_finish() after fuse_notify(). It's a no-op if called a second time. Fixes: 760eac73f9f6 ("fuse: Introduce a new notification type for resend pending requests") Fixes: 2396356a945b ("fuse: add more control over cache invalidation behaviour") Cc: # v6.9 Reviewed-by: Joanne Koong Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index df793003eb0c..85d05a5e40e9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2178,7 +2178,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, */ if (!oh.unique) { err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs); - goto out; + goto copy_finish; } err = -EINVAL; -- cgit v1.2.3 From 60e1579a0dcf2c432286ef83ee470173d6db2f13 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 1 Sep 2025 17:09:40 +0200 Subject: fuse: remove redundant calls to fuse_copy_finish() in fuse_notify() Remove tail calls of fuse_copy_finish(), since it's now done from fuse_dev_do_write(). No functional change. Reviewed-by: Joanne Koong Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 79 +++++++++++++++++------------------------------------------ 1 file changed, 23 insertions(+), 56 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 85d05a5e40e9..1258acee9704 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1622,35 +1622,31 @@ static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_poll_wakeup_out outarg; - int err = -EINVAL; + int err; if (size != sizeof(outarg)) - goto err; + return -EINVAL; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) - goto err; + return err; fuse_copy_finish(cs); return fuse_notify_poll_wakeup(fc, &outarg); - -err: - fuse_copy_finish(cs); - return err; } static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_inval_inode_out outarg; - int err = -EINVAL; + int err; if (size != sizeof(outarg)) - goto err; + return -EINVAL; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) - goto err; + return err; fuse_copy_finish(cs); down_read(&fc->killsb); @@ -1658,10 +1654,6 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size, outarg.off, outarg.len); up_read(&fc->killsb); return err; - -err: - fuse_copy_finish(cs); - return err; } static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, @@ -1669,29 +1661,26 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, { struct fuse_notify_inval_entry_out outarg; int err; - char *buf = NULL; + char *buf; struct qstr name; - err = -EINVAL; if (size < sizeof(outarg)) - goto err; + return -EINVAL; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) - goto err; + return err; - err = -ENAMETOOLONG; if (outarg.namelen > fc->name_max) - goto err; + return -ENAMETOOLONG; err = -EINVAL; if (size != sizeof(outarg) + outarg.namelen + 1) - goto err; + return -EINVAL; - err = -ENOMEM; buf = kzalloc(outarg.namelen + 1, GFP_KERNEL); if (!buf) - goto err; + return -ENOMEM; name.name = buf; name.len = outarg.namelen; @@ -1704,12 +1693,8 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, down_read(&fc->killsb); err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags); up_read(&fc->killsb); - kfree(buf); - return err; - err: kfree(buf); - fuse_copy_finish(cs); return err; } @@ -1718,29 +1703,25 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, { struct fuse_notify_delete_out outarg; int err; - char *buf = NULL; + char *buf; struct qstr name; - err = -EINVAL; if (size < sizeof(outarg)) - goto err; + return -EINVAL; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) - goto err; + return err; - err = -ENAMETOOLONG; if (outarg.namelen > fc->name_max) - goto err; + return -ENAMETOOLONG; - err = -EINVAL; if (size != sizeof(outarg) + outarg.namelen + 1) - goto err; + return -EINVAL; - err = -ENOMEM; buf = kzalloc(outarg.namelen + 1, GFP_KERNEL); if (!buf) - goto err; + return -ENOMEM; name.name = buf; name.len = outarg.namelen; @@ -1753,12 +1734,8 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, down_read(&fc->killsb); err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0); up_read(&fc->killsb); - kfree(buf); - return err; - err: kfree(buf); - fuse_copy_finish(cs); return err; } @@ -1776,17 +1753,15 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, loff_t file_size; loff_t end; - err = -EINVAL; if (size < sizeof(outarg)) - goto out_finish; + return -EINVAL; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) - goto out_finish; + return err; - err = -EINVAL; if (size - sizeof(outarg) != outarg.size) - goto out_finish; + return -EINVAL; nodeid = outarg.nodeid; @@ -1846,8 +1821,6 @@ out_iput: iput(inode); out_up_killsb: up_read(&fc->killsb); -out_finish: - fuse_copy_finish(cs); return err; } @@ -1962,13 +1935,12 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, u64 nodeid; int err; - err = -EINVAL; if (size != sizeof(outarg)) - goto copy_finish; + return -EINVAL; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) - goto copy_finish; + return err; fuse_copy_finish(cs); @@ -1984,10 +1956,6 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, up_read(&fc->killsb); return err; - -copy_finish: - fuse_copy_finish(cs); - return err; } /* @@ -2098,7 +2066,6 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, return fuse_notify_inc_epoch(fc); default: - fuse_copy_finish(cs); return -EINVAL; } } -- cgit v1.2.3 From 3f29d59e92a96d843c2ff10ebfed92ac26878658 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 2 Sep 2025 10:22:06 +0200 Subject: fuse: add prune notification Some fuse servers need to prune their caches, which can only be done if the kernel's own dentry/inode caches are pruned first to avoid dangling references. Add FUSE_NOTIFY_PRUNE, which takes an array of node ID's to try and get rid of. Inodes with active references are skipped. A similar functionality is already provided by FUSE_NOTIFY_INVAL_ENTRY with the FUSE_EXPIRE_ONLY flag. Differences in the interface are FUSE_NOTIFY_INVAL_ENTRY: - can only prune one dentry - dentry is determined by parent ID and name - if inode has multiple aliases (cached hard links), then they would have to be invalidated individually to be able to get rid of the inode FUSE_NOTIFY_PRUNE: - can prune multiple inodes - inodes determined by their node ID - aliases are taken care of automatically Reviewed-by: Joanne Koong Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 39 +++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_i.h | 6 ++++++ fs/fuse/inode.c | 11 +++++++++++ include/uapi/linux/fuse.h | 8 ++++++++ 4 files changed, 64 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1258acee9704..4229b38546bb 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2034,6 +2034,42 @@ static int fuse_notify_inc_epoch(struct fuse_conn *fc) return 0; } +static int fuse_notify_prune(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_prune_out outarg; + const unsigned int batch = 512; + u64 *nodeids __free(kfree) = kmalloc(sizeof(u64) * batch, GFP_KERNEL); + unsigned int num, i; + int err; + + if (!nodeids) + return -ENOMEM; + + if (size < sizeof(outarg)) + return -EINVAL; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + return err; + + if (size - sizeof(outarg) != outarg.count * sizeof(u64)) + return -EINVAL; + + for (; outarg.count; outarg.count -= num) { + num = min(batch, outarg.count); + err = fuse_copy_one(cs, nodeids, num * sizeof(u64)); + if (err) + return err; + + scoped_guard(rwsem_read, &fc->killsb) { + for (i = 0; i < num; i++) + fuse_try_prune_one_inode(fc, nodeids[i]); + } + } + return 0; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { @@ -2065,6 +2101,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_INC_EPOCH: return fuse_notify_inc_epoch(fc); + case FUSE_NOTIFY_PRUNE: + return fuse_notify_prune(fc, size, cs); + default: return -EINVAL; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 233c6111f768..fb6604120b53 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1413,6 +1413,12 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, u64 child_nodeid, struct qstr *name, u32 flags); +/* + * Try to prune this inode. If neither the inode itself nor dentries associated + * with this inode have any external reference, then the inode can be freed. + */ +void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid); + int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 5b7897bf7e45..a4d361b34d06 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -585,6 +585,17 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, return 0; } +void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid) +{ + struct inode *inode; + + inode = fuse_ilookup(fc, nodeid, NULL); + if (!inode) + return; + d_prune_aliases(inode); + iput(inode); +} + bool fuse_lock_inode(struct inode *inode) { bool locked = false; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 30bf0846547f..c13e1f9a2f12 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -239,6 +239,7 @@ * 7.45 * - add FUSE_COPY_FILE_RANGE_64 * - add struct fuse_copy_file_range_out + * - add FUSE_NOTIFY_PRUNE */ #ifndef _LINUX_FUSE_H @@ -680,6 +681,7 @@ enum fuse_notify_code { FUSE_NOTIFY_DELETE = 6, FUSE_NOTIFY_RESEND = 7, FUSE_NOTIFY_INC_EPOCH = 8, + FUSE_NOTIFY_PRUNE = 9, }; /* The read buffer is required to be at least 8k, but may be much larger */ @@ -1118,6 +1120,12 @@ struct fuse_notify_retrieve_in { uint64_t dummy4; }; +struct fuse_notify_prune_out { + uint32_t count; + uint32_t padding; + uint64_t spare; +}; + struct fuse_backing_map { int32_t fd; uint32_t flags; -- cgit v1.2.3 From ebbe7d7bb1eae4db32554562f0e228824ab1f135 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Tue, 26 Aug 2025 15:09:48 +0200 Subject: mm: fix lockdep issues in writeback handling Commit 2841808f35ee ("mm: remove BDI_CAP_WRITEBACK_ACCT") removed BDI_CAP_WRITEBACK_ACCT flag and refactored code that depend on it. Unfortunately it also moved some variable intialization out of guarded scope in writeback handling, what triggers a true lockdep warning. Fix this by moving initialization to the proper place. Fixes: 2841808f35ee ("mm: remove BDI_CAP_WRITEBACK_ACCT") Signed-off-by: Marek Szyprowski Acked-by: David Hildenbrand Reviewed-by: Joanne Koong Signed-off-by: Miklos Szeredi --- mm/page-writeback.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index de669636120d..e359f141639d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -3014,7 +3014,7 @@ bool __folio_end_writeback(struct folio *folio) if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; - struct bdi_writeback *wb = inode_to_wb(inode); + struct bdi_writeback *wb; unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); @@ -3022,6 +3022,7 @@ bool __folio_end_writeback(struct folio *folio) __xa_clear_mark(&mapping->i_pages, folio_index(folio), PAGECACHE_TAG_WRITEBACK); + wb = inode_to_wb(inode); wb_stat_mod(wb, WB_WRITEBACK, -nr); __wb_writeout_add(wb, nr); if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { @@ -3054,7 +3055,7 @@ void __folio_start_writeback(struct folio *folio, bool keep_write) if (mapping && mapping_use_writeback_tags(mapping)) { XA_STATE(xas, &mapping->i_pages, folio_index(folio)); struct inode *inode = mapping->host; - struct bdi_writeback *wb = inode_to_wb(inode); + struct bdi_writeback *wb; unsigned long flags; bool on_wblist; @@ -3065,6 +3066,7 @@ void __folio_start_writeback(struct folio *folio, bool keep_write) on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); + wb = inode_to_wb(inode); wb_stat_mod(wb, WB_WRITEBACK, nr); if (!on_wblist) { wb_inode_writeback_start(wb); -- cgit v1.2.3 From 26e5c67deb2e1f42a951f022fdf5b9f7eb747b01 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 15 Sep 2025 17:24:17 -0700 Subject: fuse: fix livelock in synchronous file put from fuseblk workers I observed a hang when running generic/323 against a fuseblk server. This test opens a file, initiates a lot of AIO writes to that file descriptor, and closes the file descriptor before the writes complete. Unsurprisingly, the AIO exerciser threads are mostly stuck waiting for responses from the fuseblk server: # cat /proc/372265/task/372313/stack [<0>] request_wait_answer+0x1fe/0x2a0 [fuse] [<0>] __fuse_simple_request+0xd3/0x2b0 [fuse] [<0>] fuse_do_getattr+0xfc/0x1f0 [fuse] [<0>] fuse_file_read_iter+0xbe/0x1c0 [fuse] [<0>] aio_read+0x130/0x1e0 [<0>] io_submit_one+0x542/0x860 [<0>] __x64_sys_io_submit+0x98/0x1a0 [<0>] do_syscall_64+0x37/0xf0 [<0>] entry_SYSCALL_64_after_hwframe+0x4b/0x53 But the /weird/ part is that the fuseblk server threads are waiting for responses from itself: # cat /proc/372210/task/372232/stack [<0>] request_wait_answer+0x1fe/0x2a0 [fuse] [<0>] __fuse_simple_request+0xd3/0x2b0 [fuse] [<0>] fuse_file_put+0x9a/0xd0 [fuse] [<0>] fuse_release+0x36/0x50 [fuse] [<0>] __fput+0xec/0x2b0 [<0>] task_work_run+0x55/0x90 [<0>] syscall_exit_to_user_mode+0xe9/0x100 [<0>] do_syscall_64+0x43/0xf0 [<0>] entry_SYSCALL_64_after_hwframe+0x4b/0x53 The fuseblk server is fuse2fs so there's nothing all that exciting in the server itself. So why is the fuse server calling fuse_file_put? The commit message for the fstest sheds some light on that: "By closing the file descriptor before calling io_destroy, you pretty much guarantee that the last put on the ioctx will be done in interrupt context (during I/O completion). Aha. AIO fgets a new struct file from the fd when it queues the ioctx. The completion of the FUSE_WRITE command from userspace causes the fuse server to call the AIO completion function. The completion puts the struct file, queuing a delayed fput to the fuse server task. When the fuse server task returns to userspace, it has to run the delayed fput, which in the case of a fuseblk server, it does synchronously. Sending the FUSE_RELEASE command sychronously from fuse server threads is a bad idea because a client program can initiate enough simultaneous AIOs such that all the fuse server threads end up in delayed_fput, and now there aren't any threads left to handle the queued fuse commands. Fix this by only using asynchronous fputs when closing files, and leave a comment explaining why. Cc: stable@vger.kernel.org # v2.6.38 Fixes: 5a18ec176c934c ("fuse: fix hang of single threaded fuseblk filesystem") Signed-off-by: Darrick J. Wong Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 54786f62a9d8..f1ef77a0be05 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -356,8 +356,14 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff, * Make the release synchronous if this is a fuseblk mount, * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. + * + * Always use the asynchronous file put because the current thread + * might be the fuse server. This can happen if a process starts some + * aio and closes the fd before the aio completes. Since aio takes its + * own ref to the file, the IO completion has to drop the ref, which is + * how the fuse server can end up closing its clients' files. */ - fuse_file_put(ff, ff->fm->fc->destroy); + fuse_file_put(ff, false); } void fuse_release_common(struct file *file, bool isdir) -- cgit v1.2.3 From 0d375a1385ed80d8c84433fb54062a9253ccf7e5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 15 Sep 2025 17:24:48 -0700 Subject: fuse: capture the unique id of fuse commands being sent The fuse_request_{send,end} tracepoints capture the value of req->in.h.unique in the trace output. It would be really nice if we could use this to match a request to its response for debugging and latency analysis, but the call to trace_fuse_request_send occurs before the unique id has been set: fuse_request_send: connection 8388608 req 0 opcode 1 (FUSE_LOOKUP) len 107 fuse_request_end: connection 8388608 req 6 len 16 error -2 (Notice that req moves from 0 to 6) Move the callsites to trace_fuse_request_send to after the unique id has been set by introducing a helper to do that for standard fuse_req requests. FUSE_FORGET requests are not covered by this because they appear to be synthesized into the event stream without a fuse_req object and are never replied to. Requests that are aborted without ever having been submitted to the fuse server retain the behavior that only the fuse_request_end tracepoint shows up in the trace record, and with req==0. Signed-off-by: Darrick J. Wong Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 27 +++++++++++++++++++++++---- fs/fuse/dev_uring.c | 4 ++-- fs/fuse/fuse_i.h | 5 +++++ fs/fuse/virtio_fs.c | 3 +-- 4 files changed, 31 insertions(+), 8 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 4229b38546bb..1941f93190e4 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -371,12 +371,32 @@ void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) } } +static inline void fuse_request_assign_unique_locked(struct fuse_iqueue *fiq, + struct fuse_req *req) +{ + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique_locked(fiq); + + /* tracepoint captures in.h.unique and in.h.len */ + trace_fuse_request_send(req); +} + +inline void fuse_request_assign_unique(struct fuse_iqueue *fiq, + struct fuse_req *req) +{ + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique(fiq); + + /* tracepoint captures in.h.unique and in.h.len */ + trace_fuse_request_send(req); +} +EXPORT_SYMBOL_GPL(fuse_request_assign_unique); + static void fuse_dev_queue_req(struct fuse_iqueue *fiq, struct fuse_req *req) { spin_lock(&fiq->lock); if (fiq->connected) { - if (req->in.h.opcode != FUSE_NOTIFY_REPLY) - req->in.h.unique = fuse_get_unique_locked(fiq); + fuse_request_assign_unique_locked(fiq, req); list_add_tail(&req->list, &fiq->pending); fuse_dev_wake_and_unlock(fiq); } else { @@ -399,7 +419,6 @@ static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req) req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); - trace_fuse_request_send(req); fiq->ops->send_req(fiq, req); } @@ -689,10 +708,10 @@ static bool fuse_request_queue_background_uring(struct fuse_conn *fc, { struct fuse_iqueue *fiq = &fc->iq; - req->in.h.unique = fuse_get_unique(fiq); req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); + fuse_request_assign_unique(fiq, req); return fuse_uring_queue_bq_req(req); } diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index bef38ed78249..6862fe6b7799 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -7,6 +7,7 @@ #include "fuse_i.h" #include "dev_uring_i.h" #include "fuse_dev_i.h" +#include "fuse_trace.h" #include #include @@ -1268,8 +1269,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) if (!queue) goto err; - if (req->in.h.opcode != FUSE_NOTIFY_REPLY) - req->in.h.unique = fuse_get_unique(fiq); + fuse_request_assign_unique(fiq, req); spin_lock(&queue->lock); err = -ENOTCONN; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index fb6604120b53..48ec7812a1e9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1260,6 +1260,11 @@ static inline ssize_t fuse_simple_idmap_request(struct mnt_idmap *idmap, int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, gfp_t gfp_flags); +/** + * Assign a unique id to a fuse request + */ +void fuse_request_assign_unique(struct fuse_iqueue *fiq, struct fuse_req *req); + /** * End a finished request */ diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index c3a39061363e..7164961cb053 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1479,8 +1479,7 @@ static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req) struct virtio_fs_vq *fsvq; int ret; - if (req->in.h.opcode != FUSE_NOTIFY_REPLY) - req->in.h.unique = fuse_get_unique(fiq); + fuse_request_assign_unique(fiq, req); clear_bit(FR_PENDING, &req->flags); -- cgit v1.2.3 From d3906d8f3cee0279d459dc88b5a871fcdcd4b236 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 15 Sep 2025 17:26:09 -0700 Subject: fuse: enable FUSE_SYNCFS for all fuseblk servers Turn on syncfs for all fuseblk servers so that the ones in the know can flush cached intermediate data and logs to disk. Signed-off-by: Darrick J. Wong Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index a4d361b34d06..6fcfa15da868 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1849,6 +1849,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) !sb_set_blocksize(sb, PAGE_SIZE)) goto err; #endif + fc->sync_fs = 1; } else { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; -- cgit v1.2.3 From c4331e19a6b0f9b8de5921cc2f3253e572945564 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 15 Sep 2025 17:27:11 -0700 Subject: fuse: move the backing file idr and code into a new source file iomap support for fuse is also going to want the ability to attach backing files to a fuse filesystem. Move the fuse_backing code into a separate file so that both can use it. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/Makefile | 2 +- fs/fuse/backing.c | 179 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_i.h | 47 +++++++------ fs/fuse/passthrough.c | 163 +-------------------------------------------- 4 files changed, 208 insertions(+), 183 deletions(-) create mode 100644 fs/fuse/backing.c diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 3f0f312a31c1..8ddd8f0b204e 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_VIRTIO_FS) += virtiofs.o fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o -fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o +fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o backing.o fuse-$(CONFIG_SYSCTL) += sysctl.o fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o diff --git a/fs/fuse/backing.c b/fs/fuse/backing.c new file mode 100644 index 000000000000..4afda419dd14 --- /dev/null +++ b/fs/fuse/backing.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE passthrough to backing file. + * + * Copyright (c) 2023 CTERA Networks. + */ + +#include "fuse_i.h" + +#include + +struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) +{ + if (fb && refcount_inc_not_zero(&fb->count)) + return fb; + return NULL; +} + +static void fuse_backing_free(struct fuse_backing *fb) +{ + pr_debug("%s: fb=0x%p\n", __func__, fb); + + if (fb->file) + fput(fb->file); + put_cred(fb->cred); + kfree_rcu(fb, rcu); +} + +void fuse_backing_put(struct fuse_backing *fb) +{ + if (fb && refcount_dec_and_test(&fb->count)) + fuse_backing_free(fb); +} + +void fuse_backing_files_init(struct fuse_conn *fc) +{ + idr_init(&fc->backing_files_map); +} + +static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb) +{ + int id; + + idr_preload(GFP_KERNEL); + spin_lock(&fc->lock); + /* FIXME: xarray might be space inefficient */ + id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC); + spin_unlock(&fc->lock); + idr_preload_end(); + + WARN_ON_ONCE(id == 0); + return id; +} + +static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc, + int id) +{ + struct fuse_backing *fb; + + spin_lock(&fc->lock); + fb = idr_remove(&fc->backing_files_map, id); + spin_unlock(&fc->lock); + + return fb; +} + +static int fuse_backing_id_free(int id, void *p, void *data) +{ + struct fuse_backing *fb = p; + + WARN_ON_ONCE(refcount_read(&fb->count) != 1); + fuse_backing_free(fb); + return 0; +} + +void fuse_backing_files_free(struct fuse_conn *fc) +{ + idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL); + idr_destroy(&fc->backing_files_map); +} + +int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) +{ + struct file *file; + struct super_block *backing_sb; + struct fuse_backing *fb = NULL; + int res; + + pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags); + + /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ + res = -EPERM; + if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) + goto out; + + res = -EINVAL; + if (map->flags || map->padding) + goto out; + + file = fget_raw(map->fd); + res = -EBADF; + if (!file) + goto out; + + /* read/write/splice/mmap passthrough only relevant for regular files */ + res = d_is_dir(file->f_path.dentry) ? -EISDIR : -EINVAL; + if (!d_is_reg(file->f_path.dentry)) + goto out_fput; + + backing_sb = file_inode(file)->i_sb; + res = -ELOOP; + if (backing_sb->s_stack_depth >= fc->max_stack_depth) + goto out_fput; + + fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL); + res = -ENOMEM; + if (!fb) + goto out_fput; + + fb->file = file; + fb->cred = prepare_creds(); + refcount_set(&fb->count, 1); + + res = fuse_backing_id_alloc(fc, fb); + if (res < 0) { + fuse_backing_free(fb); + fb = NULL; + } + +out: + pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res); + + return res; + +out_fput: + fput(file); + goto out; +} + +int fuse_backing_close(struct fuse_conn *fc, int backing_id) +{ + struct fuse_backing *fb = NULL; + int err; + + pr_debug("%s: backing_id=%d\n", __func__, backing_id); + + /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ + err = -EPERM; + if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) + goto out; + + err = -EINVAL; + if (backing_id <= 0) + goto out; + + err = -ENOENT; + fb = fuse_backing_id_remove(fc, backing_id); + if (!fb) + goto out; + + fuse_backing_put(fb); + err = 0; +out: + pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err); + + return err; +} + +struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc, int backing_id) +{ + struct fuse_backing *fb; + + rcu_read_lock(); + fb = idr_find(&fc->backing_files_map, backing_id); + fb = fuse_backing_get(fb); + rcu_read_unlock(); + + return fb; +} diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 48ec7812a1e9..c2f2a48156d6 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1529,29 +1529,11 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir); -/* passthrough.c */ -static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi) -{ -#ifdef CONFIG_FUSE_PASSTHROUGH - return READ_ONCE(fi->fb); -#else - return NULL; -#endif -} - -static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi, - struct fuse_backing *fb) -{ -#ifdef CONFIG_FUSE_PASSTHROUGH - return xchg(&fi->fb, fb); -#else - return NULL; -#endif -} - +/* backing.c */ #ifdef CONFIG_FUSE_PASSTHROUGH struct fuse_backing *fuse_backing_get(struct fuse_backing *fb); void fuse_backing_put(struct fuse_backing *fb); +struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc, int backing_id); #else static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) @@ -1562,6 +1544,11 @@ static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) static inline void fuse_backing_put(struct fuse_backing *fb) { } +static inline struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc, + int backing_id) +{ + return NULL; +} #endif void fuse_backing_files_init(struct fuse_conn *fc); @@ -1569,6 +1556,26 @@ void fuse_backing_files_free(struct fuse_conn *fc); int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map); int fuse_backing_close(struct fuse_conn *fc, int backing_id); +/* passthrough.c */ +static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return READ_ONCE(fi->fb); +#else + return NULL; +#endif +} + +static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi, + struct fuse_backing *fb) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return xchg(&fi->fb, fb); +#else + return NULL; +#endif +} + struct fuse_backing *fuse_passthrough_open(struct file *file, int backing_id); void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb); diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index 0bb80112741a..72de97c03d0e 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -144,163 +144,6 @@ ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma) return backing_file_mmap(backing_file, vma, &ctx); } -struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) -{ - if (fb && refcount_inc_not_zero(&fb->count)) - return fb; - return NULL; -} - -static void fuse_backing_free(struct fuse_backing *fb) -{ - pr_debug("%s: fb=0x%p\n", __func__, fb); - - if (fb->file) - fput(fb->file); - put_cred(fb->cred); - kfree_rcu(fb, rcu); -} - -void fuse_backing_put(struct fuse_backing *fb) -{ - if (fb && refcount_dec_and_test(&fb->count)) - fuse_backing_free(fb); -} - -void fuse_backing_files_init(struct fuse_conn *fc) -{ - idr_init(&fc->backing_files_map); -} - -static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb) -{ - int id; - - idr_preload(GFP_KERNEL); - spin_lock(&fc->lock); - /* FIXME: xarray might be space inefficient */ - id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC); - spin_unlock(&fc->lock); - idr_preload_end(); - - WARN_ON_ONCE(id == 0); - return id; -} - -static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc, - int id) -{ - struct fuse_backing *fb; - - spin_lock(&fc->lock); - fb = idr_remove(&fc->backing_files_map, id); - spin_unlock(&fc->lock); - - return fb; -} - -static int fuse_backing_id_free(int id, void *p, void *data) -{ - struct fuse_backing *fb = p; - - WARN_ON_ONCE(refcount_read(&fb->count) != 1); - fuse_backing_free(fb); - return 0; -} - -void fuse_backing_files_free(struct fuse_conn *fc) -{ - idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL); - idr_destroy(&fc->backing_files_map); -} - -int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) -{ - struct file *file; - struct super_block *backing_sb; - struct fuse_backing *fb = NULL; - int res; - - pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags); - - /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ - res = -EPERM; - if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) - goto out; - - res = -EINVAL; - if (map->flags || map->padding) - goto out; - - file = fget_raw(map->fd); - res = -EBADF; - if (!file) - goto out; - - /* read/write/splice/mmap passthrough only relevant for regular files */ - res = d_is_dir(file->f_path.dentry) ? -EISDIR : -EINVAL; - if (!d_is_reg(file->f_path.dentry)) - goto out_fput; - - backing_sb = file_inode(file)->i_sb; - res = -ELOOP; - if (backing_sb->s_stack_depth >= fc->max_stack_depth) - goto out_fput; - - fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL); - res = -ENOMEM; - if (!fb) - goto out_fput; - - fb->file = file; - fb->cred = prepare_creds(); - refcount_set(&fb->count, 1); - - res = fuse_backing_id_alloc(fc, fb); - if (res < 0) { - fuse_backing_free(fb); - fb = NULL; - } - -out: - pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res); - - return res; - -out_fput: - fput(file); - goto out; -} - -int fuse_backing_close(struct fuse_conn *fc, int backing_id) -{ - struct fuse_backing *fb = NULL; - int err; - - pr_debug("%s: backing_id=%d\n", __func__, backing_id); - - /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ - err = -EPERM; - if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) - goto out; - - err = -EINVAL; - if (backing_id <= 0) - goto out; - - err = -ENOENT; - fb = fuse_backing_id_remove(fc, backing_id); - if (!fb) - goto out; - - fuse_backing_put(fb); - err = 0; -out: - pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err); - - return err; -} - /* * Setup passthrough to a backing file. * @@ -318,12 +161,8 @@ struct fuse_backing *fuse_passthrough_open(struct file *file, int backing_id) if (backing_id <= 0) goto out; - rcu_read_lock(); - fb = idr_find(&fc->backing_files_map, backing_id); - fb = fuse_backing_get(fb); - rcu_read_unlock(); - err = -ENOENT; + fb = fuse_backing_lookup(fc, backing_id); if (!fb) goto out; -- cgit v1.2.3 From cb403594701cd36f7f3f868258655d56f9afaf8e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 15 Sep 2025 17:27:58 -0700 Subject: fuse: move CREATE_TRACE_POINTS to a separate file Before we start adding new tracepoints for fuse+iomap, move the tracepoint creation itself to a separate source file so that we don't have to start pulling iomap dependencies into dev.c just for the iomap structures. Signed-off-by: Darrick J. Wong Signed-off-by: Miklos Szeredi --- fs/fuse/Makefile | 3 ++- fs/fuse/dev.c | 1 - fs/fuse/trace.c | 13 +++++++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) create mode 100644 fs/fuse/trace.c diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 8ddd8f0b204e..22ad9538dfc4 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -10,7 +10,8 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o -fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o +fuse-y := trace.o # put trace.o first so we see ftrace errors sooner +fuse-y += dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o backing.o diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1941f93190e4..f316a9ef9ff8 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -25,7 +25,6 @@ #include #include -#define CREATE_TRACE_POINTS #include "fuse_trace.h" MODULE_ALIAS_MISCDEV(FUSE_MINOR); diff --git a/fs/fuse/trace.c b/fs/fuse/trace.c new file mode 100644 index 000000000000..93bd72efc98c --- /dev/null +++ b/fs/fuse/trace.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "dev_uring_i.h" +#include "fuse_i.h" +#include "fuse_dev_i.h" + +#include + +#define CREATE_TRACE_POINTS +#include "fuse_trace.h" -- cgit v1.2.3