summaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_inode.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-07-17 12:57:48 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2024-07-17 12:57:48 -0700
commitbf3aa9de7ba57c2c7b5ea70c1ad3a6670cd6fcb0 (patch)
tree791228dc4eb6d90e2c27295930449b06f6952ad3 /fs/xfs/xfs_inode.c
parent0260b0a7445c62a08938fa66fad256e5d0779817 (diff)
parent2bf6e353542d233486195953dc9c346331f82dcb (diff)
downloadlinux-bf3aa9de7ba57c2c7b5ea70c1ad3a6670cd6fcb0.tar.gz
linux-bf3aa9de7ba57c2c7b5ea70c1ad3a6670cd6fcb0.tar.bz2
linux-bf3aa9de7ba57c2c7b5ea70c1ad3a6670cd6fcb0.zip
Merge tag 'xfs-6.11-merge-3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
Pull xfs updates from Chandan Babu: "Major changes in this release are limited to enabling FITRIM on realtime devices and Byte-based grant head log reservation tracking. The remaining changes are limited to fixes and cleanups included in this pull request. Core: - Enable FITRIM on the realtime device - Introduce byte-based grant head log reservation tracking instead of physical log location tracking. This allows grant head to track a full 64 bit bytes space and hence overcome the limit of 4GB indexing that has been present until now Fixes: - xfs_flush_unmap_range() and xfs_prepare_shift() should consider RT extents in the flush unmap range - Implement bounds check when traversing log operations during log replay - Prevent out of bounds access when traversing a directory data block - Prevent incorrect ENOSPC when concurrently performing file creation and file writes - Fix rtalloc rotoring when delalloc is in use Cleanups: - Clean up I/O path inode locking helpers and the page fault handler - xfs: hoist inode operations to libxfs in anticipation of the metadata inode directory feature, which maintains a directory tree of metadata inodes. This will be necessary for further enhancements to the realtime feature, subvolume support - Clean up some warts in the extent freeing log intent code - Clean up the refcount and rmap intent code before adding support for realtime devices - Provide the correct email address for sysfs ABI documentation" * tag 'xfs-6.11-merge-3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (80 commits) xfs: fix rtalloc rotoring when delalloc is in use xfs: get rid of xfs_ag_resv_rmapbt_alloc xfs: skip flushing log items during push xfs: grant heads track byte counts, not LSNs xfs: pass the full grant head to accounting functions xfs: track log space pinned by the AIL xfs: collapse xlog_state_set_callback in caller xfs: l_last_sync_lsn is really AIL state xfs: ensure log tail is always up to date xfs: background AIL push should target physical space xfs: AIL doesn't need manual pushing xfs: move and rename xfs_trans_committed_bulk xfs: fix the contact address for the sysfs ABI documentation xfs: Avoid races with cnt_btree lastrec updates xfs: move xfs_refcount_update_defer_add to xfs_refcount_item.c xfs: simplify usage of the rcur local variable in xfs_refcount_finish_one xfs: don't bother calling xfs_refcount_finish_one_cleanup in xfs_refcount_finish_one xfs: reuse xfs_refcount_update_cancel_item xfs: add a ci_entry helper xfs: remove xfs_trans_set_refcount_flags ...
Diffstat (limited to 'fs/xfs/xfs_inode.c')
-rw-r--r--fs/xfs/xfs_inode.c1488
1 files changed, 161 insertions, 1327 deletions
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a4e3cd8971fc..7dc6f326936c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -42,55 +42,11 @@
#include "xfs_pnfs.h"
#include "xfs_parent.h"
#include "xfs_xattr.h"
-#include "xfs_sb.h"
+#include "xfs_inode_util.h"
struct kmem_cache *xfs_inode_cache;
/*
- * helper function to extract extent size hint from inode
- */
-xfs_extlen_t
-xfs_get_extsz_hint(
- struct xfs_inode *ip)
-{
- /*
- * No point in aligning allocations if we need to COW to actually
- * write to them.
- */
- if (xfs_is_always_cow_inode(ip))
- return 0;
- if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
- return ip->i_extsize;
- if (XFS_IS_REALTIME_INODE(ip) &&
- ip->i_mount->m_sb.sb_rextsize > 1)
- return ip->i_mount->m_sb.sb_rextsize;
- return 0;
-}
-
-/*
- * Helper function to extract CoW extent size hint from inode.
- * Between the extent size hint and the CoW extent size hint, we
- * return the greater of the two. If the value is zero (automatic),
- * use the default size.
- */
-xfs_extlen_t
-xfs_get_cowextsz_hint(
- struct xfs_inode *ip)
-{
- xfs_extlen_t a, b;
-
- a = 0;
- if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
- a = ip->i_cowextsize;
- b = xfs_get_extsz_hint(ip);
-
- a = max(a, b);
- if (a == 0)
- return XFS_DEFAULT_COWEXTSZ_HINT;
- return a;
-}
-
-/*
* These two are wrapper routines around the xfs_ilock() routine used to
* centralize some grungy code. They are used in places that wish to lock the
* inode solely for reading the extents. The reason these places can't just
@@ -567,55 +523,6 @@ xfs_lock_two_inodes(
}
}
-uint
-xfs_ip2xflags(
- struct xfs_inode *ip)
-{
- uint flags = 0;
-
- if (ip->i_diflags & XFS_DIFLAG_ANY) {
- if (ip->i_diflags & XFS_DIFLAG_REALTIME)
- flags |= FS_XFLAG_REALTIME;
- if (ip->i_diflags & XFS_DIFLAG_PREALLOC)
- flags |= FS_XFLAG_PREALLOC;
- if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
- flags |= FS_XFLAG_IMMUTABLE;
- if (ip->i_diflags & XFS_DIFLAG_APPEND)
- flags |= FS_XFLAG_APPEND;
- if (ip->i_diflags & XFS_DIFLAG_SYNC)
- flags |= FS_XFLAG_SYNC;
- if (ip->i_diflags & XFS_DIFLAG_NOATIME)
- flags |= FS_XFLAG_NOATIME;
- if (ip->i_diflags & XFS_DIFLAG_NODUMP)
- flags |= FS_XFLAG_NODUMP;
- if (ip->i_diflags & XFS_DIFLAG_RTINHERIT)
- flags |= FS_XFLAG_RTINHERIT;
- if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT)
- flags |= FS_XFLAG_PROJINHERIT;
- if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS)
- flags |= FS_XFLAG_NOSYMLINKS;
- if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
- flags |= FS_XFLAG_EXTSIZE;
- if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)
- flags |= FS_XFLAG_EXTSZINHERIT;
- if (ip->i_diflags & XFS_DIFLAG_NODEFRAG)
- flags |= FS_XFLAG_NODEFRAG;
- if (ip->i_diflags & XFS_DIFLAG_FILESTREAM)
- flags |= FS_XFLAG_FILESTREAM;
- }
-
- if (ip->i_diflags2 & XFS_DIFLAG2_ANY) {
- if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
- flags |= FS_XFLAG_DAX;
- if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
- flags |= FS_XFLAG_COWEXTSIZE;
- }
-
- if (xfs_inode_has_attr_fork(ip))
- flags |= FS_XFLAG_HASATTR;
- return flags;
-}
-
/*
* Lookups up an inode from "name". If ci_name is not NULL, then a CI match
* is allowed, otherwise it has to be an exact match. If a CI match is found,
@@ -657,97 +564,6 @@ out_unlock:
return error;
}
-/* Propagate di_flags from a parent inode to a child inode. */
-static void
-xfs_inode_inherit_flags(
- struct xfs_inode *ip,
- const struct xfs_inode *pip)
-{
- unsigned int di_flags = 0;
- xfs_failaddr_t failaddr;
- umode_t mode = VFS_I(ip)->i_mode;
-
- if (S_ISDIR(mode)) {
- if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
- di_flags |= XFS_DIFLAG_RTINHERIT;
- if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
- di_flags |= XFS_DIFLAG_EXTSZINHERIT;
- ip->i_extsize = pip->i_extsize;
- }
- if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
- di_flags |= XFS_DIFLAG_PROJINHERIT;
- } else if (S_ISREG(mode)) {
- if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
- xfs_has_realtime(ip->i_mount))
- di_flags |= XFS_DIFLAG_REALTIME;
- if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
- di_flags |= XFS_DIFLAG_EXTSIZE;
- ip->i_extsize = pip->i_extsize;
- }
- }
- if ((pip->i_diflags & XFS_DIFLAG_NOATIME) &&
- xfs_inherit_noatime)
- di_flags |= XFS_DIFLAG_NOATIME;
- if ((pip->i_diflags & XFS_DIFLAG_NODUMP) &&
- xfs_inherit_nodump)
- di_flags |= XFS_DIFLAG_NODUMP;
- if ((pip->i_diflags & XFS_DIFLAG_SYNC) &&
- xfs_inherit_sync)
- di_flags |= XFS_DIFLAG_SYNC;
- if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) &&
- xfs_inherit_nosymlinks)
- di_flags |= XFS_DIFLAG_NOSYMLINKS;
- if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) &&
- xfs_inherit_nodefrag)
- di_flags |= XFS_DIFLAG_NODEFRAG;
- if (pip->i_diflags & XFS_DIFLAG_FILESTREAM)
- di_flags |= XFS_DIFLAG_FILESTREAM;
-
- ip->i_diflags |= di_flags;
-
- /*
- * Inode verifiers on older kernels only check that the extent size
- * hint is an integer multiple of the rt extent size on realtime files.
- * They did not check the hint alignment on a directory with both
- * rtinherit and extszinherit flags set. If the misaligned hint is
- * propagated from a directory into a new realtime file, new file
- * allocations will fail due to math errors in the rt allocator and/or
- * trip the verifiers. Validate the hint settings in the new file so
- * that we don't let broken hints propagate.
- */
- failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
- VFS_I(ip)->i_mode, ip->i_diflags);
- if (failaddr) {
- ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
- XFS_DIFLAG_EXTSZINHERIT);
- ip->i_extsize = 0;
- }
-}
-
-/* Propagate di_flags2 from a parent inode to a child inode. */
-static void
-xfs_inode_inherit_flags2(
- struct xfs_inode *ip,
- const struct xfs_inode *pip)
-{
- xfs_failaddr_t failaddr;
-
- if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
- ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
- ip->i_cowextsize = pip->i_cowextsize;
- }
- if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
- ip->i_diflags2 |= XFS_DIFLAG2_DAX;
-
- /* Don't let invalid cowextsize hints propagate. */
- failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
- VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
- if (failaddr) {
- ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
- ip->i_cowextsize = 0;
- }
-}
-
/*
* Initialise a newly allocated inode and return the in-core inode to the
* caller locked exclusively.
@@ -755,39 +571,15 @@ xfs_inode_inherit_flags2(
* Caller is responsible for unlocking the inode manually upon return
*/
int
-xfs_init_new_inode(
- struct mnt_idmap *idmap,
+xfs_icreate(
struct xfs_trans *tp,
- struct xfs_inode *pip,
xfs_ino_t ino,
- umode_t mode,
- xfs_nlink_t nlink,
- dev_t rdev,
- prid_t prid,
- bool init_xattrs,
+ const struct xfs_icreate_args *args,
struct xfs_inode **ipp)
{
- struct inode *dir = pip ? VFS_I(pip) : NULL;
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_inode *ip;
- unsigned int flags;
+ struct xfs_inode *ip = NULL;
int error;
- struct timespec64 tv;
- struct inode *inode;
-
- /*
- * Protect against obviously corrupt allocation btree records. Later
- * xfs_iget checks will catch re-allocation of other active in-memory
- * and on-disk inodes. If we don't catch reallocating the parent inode
- * here we will deadlock in xfs_iget() so we have to do these checks
- * first.
- */
- if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
- xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
- xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino),
- XFS_SICK_AG_INOBT);
- return -EFSCORRUPTED;
- }
/*
* Get the in-core inode with the lock held exclusively to prevent
@@ -798,96 +590,8 @@ xfs_init_new_inode(
return error;
ASSERT(ip != NULL);
- inode = VFS_I(ip);
- set_nlink(inode, nlink);
- inode->i_rdev = rdev;
- ip->i_projid = prid;
-
- if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
- inode_fsuid_set(inode, idmap);
- inode->i_gid = dir->i_gid;
- inode->i_mode = mode;
- } else {
- inode_init_owner(idmap, inode, dir, mode);
- }
-
- /*
- * If the group ID of the new file does not match the effective group
- * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
- * (and only if the irix_sgid_inherit compatibility variable is set).
- */
- if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
- !vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)))
- inode->i_mode &= ~S_ISGID;
-
- ip->i_disk_size = 0;
- ip->i_df.if_nextents = 0;
- ASSERT(ip->i_nblocks == 0);
-
- tv = inode_set_ctime_current(inode);
- inode_set_mtime_to_ts(inode, tv);
- inode_set_atime_to_ts(inode, tv);
-
- ip->i_extsize = 0;
- ip->i_diflags = 0;
-
- if (xfs_has_v3inodes(mp)) {
- inode_set_iversion(inode, 1);
- ip->i_cowextsize = 0;
- ip->i_crtime = tv;
- }
-
- flags = XFS_ILOG_CORE;
- switch (mode & S_IFMT) {
- case S_IFIFO:
- case S_IFCHR:
- case S_IFBLK:
- case S_IFSOCK:
- ip->i_df.if_format = XFS_DINODE_FMT_DEV;
- flags |= XFS_ILOG_DEV;
- break;
- case S_IFREG:
- case S_IFDIR:
- if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
- xfs_inode_inherit_flags(ip, pip);
- if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
- xfs_inode_inherit_flags2(ip, pip);
- fallthrough;
- case S_IFLNK:
- ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
- ip->i_df.if_bytes = 0;
- ip->i_df.if_data = NULL;
- break;
- default:
- ASSERT(0);
- }
-
- /*
- * If we need to create attributes immediately after allocating the
- * inode, initialise an empty attribute fork right now. We use the
- * default fork offset for attributes here as we don't know exactly what
- * size or how many attributes we might be adding. We can do this
- * safely here because we know the data fork is completely empty and
- * this saves us from needing to run a separate transaction to set the
- * fork offset in the immediate future.
- */
- if (init_xattrs) {
- ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
- xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
-
- if (!xfs_has_attr(mp)) {
- spin_lock(&mp->m_sb_lock);
- xfs_add_attr(mp);
- spin_unlock(&mp->m_sb_lock);
- xfs_log_sb(tp);
- }
- }
-
- /*
- * Log the new values stuffed into the inode.
- */
xfs_trans_ijoin(tp, ip, 0);
- xfs_trans_log_inode(tp, ip, flags);
+ xfs_inode_init(tp, args, ip);
/* now that we have an i_mode we can setup the inode structure */
xfs_setup_inode(ip);
@@ -896,158 +600,60 @@ xfs_init_new_inode(
return 0;
}
-/*
- * Decrement the link count on an inode & log the change. If this causes the
- * link count to go to zero, move the inode to AGI unlinked list so that it can
- * be freed when the last active reference goes away via xfs_inactive().
- */
+/* Return dquots for the ids that will be assigned to a new file. */
int
-xfs_droplink(
- struct xfs_trans *tp,
- struct xfs_inode *ip)
-{
- struct inode *inode = VFS_I(ip);
-
- xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
-
- if (inode->i_nlink == 0) {
- xfs_info_ratelimited(tp->t_mountp,
- "Inode 0x%llx link count dropped below zero. Pinning link count.",
- ip->i_ino);
- set_nlink(inode, XFS_NLINK_PINNED);
- }
- if (inode->i_nlink != XFS_NLINK_PINNED)
- drop_nlink(inode);
-
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
- if (inode->i_nlink)
- return 0;
-
- return xfs_iunlink(tp, ip);
-}
-
-/*
- * Increment the link count on an inode & log the change.
- */
-void
-xfs_bumplink(
- struct xfs_trans *tp,
- struct xfs_inode *ip)
-{
- struct inode *inode = VFS_I(ip);
-
- xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
-
- if (inode->i_nlink == XFS_NLINK_PINNED - 1)
- xfs_info_ratelimited(tp->t_mountp,
- "Inode 0x%llx link count exceeded maximum. Pinning link count.",
- ip->i_ino);
- if (inode->i_nlink != XFS_NLINK_PINNED)
- inc_nlink(inode);
-
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-}
-
-#ifdef CONFIG_XFS_LIVE_HOOKS
-/*
- * Use a static key here to reduce the overhead of directory live update hooks.
- * If the compiler supports jump labels, the static branch will be replaced by
- * a nop sled when there are no hook users. Online fsck is currently the only
- * caller, so this is a reasonable tradeoff.
- *
- * Note: Patching the kernel code requires taking the cpu hotplug lock. Other
- * parts of the kernel allocate memory with that lock held, which means that
- * XFS callers cannot hold any locks that might be used by memory reclaim or
- * writeback when calling the static_branch_{inc,dec} functions.
- */
-DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch);
-
-void
-xfs_dir_hook_disable(void)
-{
- xfs_hooks_switch_off(&xfs_dir_hooks_switch);
-}
-
-void
-xfs_dir_hook_enable(void)
-{
- xfs_hooks_switch_on(&xfs_dir_hooks_switch);
-}
-
-/* Call hooks for a directory update relating to a child dirent update. */
-inline void
-xfs_dir_update_hook(
- struct xfs_inode *dp,
- struct xfs_inode *ip,
- int delta,
- const struct xfs_name *name)
-{
- if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) {
- struct xfs_dir_update_params p = {
- .dp = dp,
- .ip = ip,
- .delta = delta,
- .name = name,
- };
- struct xfs_mount *mp = ip->i_mount;
-
- xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p);
+xfs_icreate_dqalloc(
+ const struct xfs_icreate_args *args,
+ struct xfs_dquot **udqpp,
+ struct xfs_dquot **gdqpp,
+ struct xfs_dquot **pdqpp)
+{
+ struct inode *dir = VFS_I(args->pip);
+ kuid_t uid = GLOBAL_ROOT_UID;
+ kgid_t gid = GLOBAL_ROOT_GID;
+ prid_t prid = 0;
+ unsigned int flags = XFS_QMOPT_QUOTALL;
+
+ if (args->idmap) {
+ /*
+ * The uid/gid computation code must match what the VFS uses to
+ * assign i_[ug]id. INHERIT adjusts the gid computation for
+ * setgid/grpid systems.
+ */
+ uid = mapped_fsuid(args->idmap, i_user_ns(dir));
+ gid = mapped_fsgid(args->idmap, i_user_ns(dir));
+ prid = xfs_get_initial_prid(args->pip);
+ flags |= XFS_QMOPT_INHERIT;
}
-}
-/* Call the specified function during a directory update. */
-int
-xfs_dir_hook_add(
- struct xfs_mount *mp,
- struct xfs_dir_hook *hook)
-{
- return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook);
-}
+ *udqpp = *gdqpp = *pdqpp = NULL;
-/* Stop calling the specified function during a directory update. */
-void
-xfs_dir_hook_del(
- struct xfs_mount *mp,
- struct xfs_dir_hook *hook)
-{
- xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook);
+ return xfs_qm_vop_dqalloc(args->pip, uid, gid, prid, flags, udqpp,
+ gdqpp, pdqpp);
}
-/* Configure directory update hook functions. */
-void
-xfs_dir_hook_setup(
- struct xfs_dir_hook *hook,
- notifier_fn_t mod_fn)
-{
- xfs_hook_setup(&hook->dirent_hook, mod_fn);
-}
-#endif /* CONFIG_XFS_LIVE_HOOKS */
-
int
xfs_create(
- struct mnt_idmap *idmap,
- struct xfs_inode *dp,
+ const struct xfs_icreate_args *args,
struct xfs_name *name,
- umode_t mode,
- dev_t rdev,
- bool init_xattrs,
- xfs_inode_t **ipp)
+ struct xfs_inode **ipp)
{
- int is_dir = S_ISDIR(mode);
+ struct xfs_inode *dp = args->pip;
+ struct xfs_dir_update du = {
+ .dp = dp,
+ .name = name,
+ };
struct xfs_mount *mp = dp->i_mount;
- struct xfs_inode *ip = NULL;
struct xfs_trans *tp = NULL;
- int error;
- bool unlock_dp_on_error = false;
- prid_t prid;
- struct xfs_dquot *udqp = NULL;
- struct xfs_dquot *gdqp = NULL;
- struct xfs_dquot *pdqp = NULL;
+ struct xfs_dquot *udqp;
+ struct xfs_dquot *gdqp;
+ struct xfs_dquot *pdqp;
struct xfs_trans_res *tres;
- uint resblks;
xfs_ino_t ino;
- struct xfs_parent_args *ppargs;
+ bool unlock_dp_on_error = false;
+ bool is_dir = S_ISDIR(args->mode);
+ uint resblks;
+ int error;
trace_xfs_create(dp, name);
@@ -1056,15 +662,8 @@ xfs_create(
if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
return -EIO;
- prid = xfs_get_initial_prid(dp);
-
- /*
- * Make sure that we have allocated dquot(s) on disk.
- */
- error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
- mapped_fsgid(idmap, &init_user_ns), prid,
- XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
- &udqp, &gdqp, &pdqp);
+ /* Make sure that we have allocated dquot(s) on disk. */
+ error = xfs_icreate_dqalloc(args, &udqp, &gdqp, &pdqp);
if (error)
return error;
@@ -1076,7 +675,7 @@ xfs_create(
tres = &M_RES(mp)->tr_create;
}
- error = xfs_parent_start(mp, &ppargs);
+ error = xfs_parent_start(mp, &du.ppargs);
if (error)
goto out_release_dquots;
@@ -1105,10 +704,9 @@ xfs_create(
* entry pointing to them, but a directory also the "." entry
* pointing to itself.
*/
- error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
+ error = xfs_dialloc(&tp, dp->i_ino, args->mode, &ino);
if (!error)
- error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
- is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip);
+ error = xfs_icreate(tp, ino, args, &du.ip);
if (error)
goto out_trans_cancel;
@@ -1121,38 +719,9 @@ xfs_create(
*/
xfs_trans_ijoin(tp, dp, 0);
- error = xfs_dir_createname(tp, dp, name, ip->i_ino,
- resblks - XFS_IALLOC_SPACE_RES(mp));
- if (error) {
- ASSERT(error != -ENOSPC);
+ error = xfs_dir_create_child(tp, resblks, &du);
+ if (error)
goto out_trans_cancel;
- }
- xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-
- if (is_dir) {
- error = xfs_dir_init(tp, ip, dp);
- if (error)
- goto out_trans_cancel;
-
- xfs_bumplink(tp, dp);
- }
-
- /*
- * If we have parent pointers, we need to add the attribute containing
- * the parent information now.
- */
- if (ppargs) {
- error = xfs_parent_addname(tp, ppargs, dp, name, ip);
- if (error)
- goto out_trans_cancel;
- }
-
- /*
- * Create ip with a reference from dp, and add '.' and '..' references
- * if it's a directory.
- */
- xfs_dir_update_hook(dp, ip, 1, name);
/*
* If this is a synchronous mount, make sure that the
@@ -1167,7 +736,7 @@ xfs_create(
* These ids of the inode couldn't have changed since the new
* inode has been locked ever since it was created.
*/
- xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+ xfs_qm_vop_create_dqattach(tp, du.ip, udqp, gdqp, pdqp);
error = xfs_trans_commit(tp);
if (error)
@@ -1177,10 +746,10 @@ xfs_create(
xfs_qm_dqrele(gdqp);
xfs_qm_dqrele(pdqp);
- *ipp = ip;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ *ipp = du.ip;
+ xfs_iunlock(du.ip, XFS_ILOCK_EXCL);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- xfs_parent_finish(mp, ppargs);
+ xfs_parent_finish(mp, du.ppargs);
return 0;
out_trans_cancel:
@@ -1191,13 +760,13 @@ xfs_create(
* setup of the inode and release the inode. This prevents recursive
* transactions and deadlocks from xfs_inactive.
*/
- if (ip) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_finish_inode_setup(ip);
- xfs_irele(ip);
+ if (du.ip) {
+ xfs_iunlock(du.ip, XFS_ILOCK_EXCL);
+ xfs_finish_inode_setup(du.ip);
+ xfs_irele(du.ip);
}
out_parent:
- xfs_parent_finish(mp, ppargs);
+ xfs_parent_finish(mp, du.ppargs);
out_release_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
@@ -1210,36 +779,28 @@ xfs_create(
int
xfs_create_tmpfile(
- struct mnt_idmap *idmap,
- struct xfs_inode *dp,
- umode_t mode,
- bool init_xattrs,
+ const struct xfs_icreate_args *args,
struct xfs_inode **ipp)
{
+ struct xfs_inode *dp = args->pip;
struct xfs_mount *mp = dp->i_mount;
struct xfs_inode *ip = NULL;
struct xfs_trans *tp = NULL;
- int error;
- prid_t prid;
- struct xfs_dquot *udqp = NULL;
- struct xfs_dquot *gdqp = NULL;
- struct xfs_dquot *pdqp = NULL;
+ struct xfs_dquot *udqp;
+ struct xfs_dquot *gdqp;
+ struct xfs_dquot *pdqp;
struct xfs_trans_res *tres;
- uint resblks;
xfs_ino_t ino;
+ uint resblks;
+ int error;
+
+ ASSERT(args->flags & XFS_ICREATE_TMPFILE);
if (xfs_is_shutdown(mp))
return -EIO;
- prid = xfs_get_initial_prid(dp);
-
- /*
- * Make sure that we have allocated dquot(s) on disk.
- */
- error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
- mapped_fsgid(idmap, &init_user_ns), prid,
- XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
- &udqp, &gdqp, &pdqp);
+ /* Make sure that we have allocated dquot(s) on disk. */
+ error = xfs_icreate_dqalloc(args, &udqp, &gdqp, &pdqp);
if (error)
return error;
@@ -1251,10 +812,9 @@ xfs_create_tmpfile(
if (error)
goto out_release_dquots;
- error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
+ error = xfs_dialloc(&tp, dp->i_ino, args->mode, &ino);
if (!error)
- error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
- 0, 0, prid, init_xattrs, &ip);
+ error = xfs_icreate(tp, ino, args, &ip);
if (error)
goto out_trans_cancel;
@@ -1311,11 +871,15 @@ xfs_link(
struct xfs_inode *sip,
struct xfs_name *target_name)
{
+ struct xfs_dir_update du = {
+ .dp = tdp,
+ .name = target_name,
+ .ip = sip,
+ };
struct xfs_mount *mp = tdp->i_mount;
struct xfs_trans *tp;
int error, nospace_error = 0;
int resblks;
- struct xfs_parent_args *ppargs;
trace_xfs_link(tdp, target_name);
@@ -1334,7 +898,7 @@ xfs_link(
if (error)
goto std_return;
- error = xfs_parent_start(mp, &ppargs);
+ error = xfs_parent_start(mp, &du.ppargs);
if (error)
goto std_return;
@@ -1349,7 +913,7 @@ xfs_link(
* pointers are enabled because we can't back out if the xattrs must
* grow.
*/
- if (ppargs && nospace_error) {
+ if (du.ppargs && nospace_error) {
error = nospace_error;
goto error_return;
}
@@ -1376,47 +940,9 @@ xfs_link(
}
}
- if (!resblks) {
- error = xfs_dir_canenter(tp, tdp, target_name);
- if (error)
- goto error_return;
- }
-
- /*
- * Handle initial link state of O_TMPFILE inode
- */
- if (VFS_I(sip)->i_nlink == 0) {
- struct xfs_perag *pag;
-
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino));
- error = xfs_iunlink_remove(tp, pag, sip);
- xfs_perag_put(pag);
- if (error)
- goto error_return;
- }
-
- error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
- resblks);
+ error = xfs_dir_add_child(tp, resblks, &du);
if (error)
goto error_return;
- xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
-
- xfs_bumplink(tp, sip);
-
- /*
- * If we have parent pointers, we now need to add the parent record to
- * the attribute fork of the inode. If this is the initial parent
- * attribute, we need to create it correctly, otherwise we can just add
- * the parent to the inode.
- */
- if (ppargs) {
- error = xfs_parent_addname(tp, ppargs, tdp, target_name, sip);
- if (error)
- goto error_return;
- }
-
- xfs_dir_update_hook(tdp, sip, 1, target_name);
/*
* If this is a synchronous mount, make sure that the
@@ -1429,7 +955,7 @@ xfs_link(
error = xfs_trans_commit(tp);
xfs_iunlock(tdp, XFS_ILOCK_EXCL);
xfs_iunlock(sip, XFS_ILOCK_EXCL);
- xfs_parent_finish(mp, ppargs);
+ xfs_parent_finish(mp, du.ppargs);
return error;
error_return:
@@ -1437,7 +963,7 @@ xfs_link(
xfs_iunlock(tdp, XFS_ILOCK_EXCL);
xfs_iunlock(sip, XFS_ILOCK_EXCL);
out_parent:
- xfs_parent_finish(mp, ppargs);
+ xfs_parent_finish(mp, du.ppargs);
std_return:
if (error == -ENOSPC && nospace_error)
error = nospace_error;
@@ -2024,39 +1550,6 @@ out:
}
/*
- * In-Core Unlinked List Lookups
- * =============================
- *
- * Every inode is supposed to be reachable from some other piece of metadata
- * with the exception of the root directory. Inodes with a connection to a
- * file descriptor but not linked from anywhere in the on-disk directory tree
- * are collectively known as unlinked inodes, though the filesystem itself
- * maintains links to these inodes so that on-disk metadata are consistent.
- *
- * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI
- * header contains a number of buckets that point to an inode, and each inode
- * record has a pointer to the next inode in the hash chain. This
- * singly-linked list causes scaling problems in the iunlink remove function
- * because we must walk that list to find the inode that points to the inode
- * being removed from the unlinked hash bucket list.
- *
- * Hence we keep an in-memory double linked list to link each inode on an
- * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer
- * based lists would require having 64 list heads in the perag, one for each
- * list. This is expensive in terms of memory (think millions of AGs) and cache
- * misses on lookups. Instead, use the fact that inodes on the unlinked list
- * must be referenced at the VFS level to keep them on the list and hence we
- * have an existence guarantee for inodes on the unlinked list.
- *
- * Given we have an existence guarantee, we can use lockless inode cache lookups
- * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode
- * for the double linked unlinked list, and we don't need any extra locking to
- * keep the list safe as all manipulations are done under the AGI buffer lock.
- * Keeping the list up to date does not require memory allocation, just finding
- * the XFS inode and updating the next/prev unlinked list aginos.
- */
-
-/*
* Find an inode on the unlinked list. This does not take references to the
* inode as we have existence guarantees by holding the AGI buffer lock and that
* only unlinked, referenced inodes can be on the unlinked inode list. If we
@@ -2091,75 +1584,11 @@ xfs_iunlink_lookup(
}
/*
- * Update the prev pointer of the next agino. Returns -ENOLINK if the inode
- * is not in cache.
- */
-static int
-xfs_iunlink_update_backref(
- struct xfs_perag *pag,
- xfs_agino_t prev_agino,
- xfs_agino_t next_agino)
-{
- struct xfs_inode *ip;
-
- /* No update necessary if we are at the end of the list. */
- if (next_agino == NULLAGINO)
- return 0;
-
- ip = xfs_iunlink_lookup(pag, next_agino);
- if (!ip)
- return -ENOLINK;
-
- ip->i_prev_unlinked = prev_agino;
- return 0;
-}
-
-/*
- * Point the AGI unlinked bucket at an inode and log the results. The caller
- * is responsible for validating the old value.
- */
-STATIC int
-xfs_iunlink_update_bucket(
- struct xfs_trans *tp,
- struct xfs_perag *pag,
- struct xfs_buf *agibp,
- unsigned int bucket_index,
- xfs_agino_t new_agino)
-{
- struct xfs_agi *agi = agibp->b_addr;
- xfs_agino_t old_value;
- int offset;
-
- ASSERT(xfs_verify_agino_or_null(pag, new_agino));
-
- old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index,
- old_value, new_agino);
-
- /*
- * We should never find the head of the list already set to the value
- * passed in because either we're adding or removing ourselves from the
- * head of the list.
- */
- if (old_value == new_agino) {
- xfs_buf_mark_corrupt(agibp);
- xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
- return -EFSCORRUPTED;
- }
-
- agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
- offset = offsetof(struct xfs_agi, agi_unlinked) +
- (sizeof(xfs_agino_t) * bucket_index);
- xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
- return 0;
-}
-
-/*
* Load the inode @next_agino into the cache and set its prev_unlinked pointer
* to @prev_agino. Caller must hold the AGI to synchronize with other changes
* to the unlinked list.
*/
-STATIC int
+int
xfs_iunlink_reload_next(
struct xfs_trans *tp,
struct xfs_buf *agibp,
@@ -2215,187 +1644,6 @@ rele:
return error;
}
-static int
-xfs_iunlink_insert_inode(
- struct xfs_trans *tp,
- struct xfs_perag *pag,
- struct xfs_buf *agibp,
- struct xfs_inode *ip)
-{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_agi *agi = agibp->b_addr;
- xfs_agino_t next_agino;
- xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
- int error;
-
- /*
- * Get the index into the agi hash table for the list this inode will
- * go on. Make sure the pointer isn't garbage and that this inode
- * isn't already on the list.
- */
- next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- if (next_agino == agino ||
- !xfs_verify_agino_or_null(pag, next_agino)) {
- xfs_buf_mark_corrupt(agibp);
- xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
- return -EFSCORRUPTED;
- }
-
- /*
- * Update the prev pointer in the next inode to point back to this
- * inode.
- */
- error = xfs_iunlink_update_backref(pag, agino, next_agino);
- if (error == -ENOLINK)
- error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino);
- if (error)
- return error;
-
- if (next_agino != NULLAGINO) {
- /*
- * There is already another inode in the bucket, so point this
- * inode to the current head of the list.
- */
- error = xfs_iunlink_log_inode(tp, ip, pag, next_agino);
- if (error)
- return error;
- ip->i_next_unlinked = next_agino;
- }
-
- /* Point the head of the list to point to this inode. */
- ip->i_prev_unlinked = NULLAGINO;
- return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
-}
-
-/*
- * This is called when the inode's link count has gone to 0 or we are creating
- * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
- *
- * We place the on-disk inode on a list in the AGI. It will be pulled from this
- * list when the inode is freed.
- */
-int
-xfs_iunlink(
- struct xfs_trans *tp,
- struct xfs_inode *ip)
-{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_perag *pag;
- struct xfs_buf *agibp;
- int error;
-
- ASSERT(VFS_I(ip)->i_nlink == 0);
- ASSERT(VFS_I(ip)->i_mode != 0);
- trace_xfs_iunlink(ip);
-
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-
- /* Get the agi buffer first. It ensures lock ordering on the list. */
- error = xfs_read_agi(pag, tp, 0, &agibp);
- if (error)
- goto out;
-
- error = xfs_iunlink_insert_inode(tp, pag, agibp, ip);
-out:
- xfs_perag_put(pag);
- return error;
-}
-
-static int
-xfs_iunlink_remove_inode(
- struct xfs_trans *tp,
- struct xfs_perag *pag,
- struct xfs_buf *agibp,
- struct xfs_inode *ip)
-{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_agi *agi = agibp->b_addr;
- xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- xfs_agino_t head_agino;
- short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
- int error;
-