diff options
64 files changed, 6267 insertions, 915 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 52c288514be1..fc593c869493 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -39,6 +39,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_btree.o \ xfs_da_btree.o \ xfs_da_format.o \ + xfs_defer.o \ xfs_dir2.o \ xfs_dir2_block.o \ xfs_dir2_data.o \ @@ -51,6 +52,8 @@ xfs-y += $(addprefix libxfs/, \ xfs_inode_fork.o \ xfs_inode_buf.o \ xfs_log_rlimit.o \ + xfs_rmap.o \ + xfs_rmap_btree.o \ xfs_sb.o \ xfs_symlink_remote.o \ xfs_trans_resv.o \ @@ -100,11 +103,13 @@ xfs-y += xfs_log.o \ xfs_extfree_item.o \ xfs_icreate_item.o \ xfs_inode_item.o \ + xfs_rmap_item.o \ xfs_log_recover.o \ xfs_trans_ail.o \ xfs_trans_buf.o \ xfs_trans_extfree.o \ xfs_trans_inode.o \ + xfs_trans_rmap.o \ # optional features xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 88c26b827a2d..776ae2f325d1 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -24,8 +24,10 @@ #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" +#include "xfs_rmap.h" #include "xfs_alloc_btree.h" #include "xfs_alloc.h" #include "xfs_extent_busy.h" @@ -49,6 +51,81 @@ STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); +xfs_extlen_t +xfs_prealloc_blocks( + struct xfs_mount *mp) +{ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + return XFS_RMAP_BLOCK(mp) + 1; + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + return XFS_FIBT_BLOCK(mp) + 1; + return XFS_IBT_BLOCK(mp) + 1; +} + +/* + * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of + * AGF buffer (PV 947395), we place constraints on the relationship among + * actual allocations for data blocks, freelist blocks, and potential file data + * bmap btree blocks. However, these restrictions may result in no actual space + * allocated for a delayed extent, for example, a data block in a certain AG is + * allocated but there is no additional block for the additional bmap btree + * block due to a split of the bmap btree of the file. The result of this may + * lead to an infinite loop when the file gets flushed to disk and all delayed + * extents need to be actually allocated. To get around this, we explicitly set + * aside a few blocks which will not be reserved in delayed allocation. + * + * When rmap is disabled, we need to reserve 4 fsbs _per AG_ for the freelist + * and 4 more to handle a potential split of the file's bmap btree. + * + * When rmap is enabled, we must also be able to handle two rmap btree inserts + * to record both the file data extent and a new bmbt block. The bmbt block + * might not be in the same AG as the file data extent. In the worst case + * the bmap btree splits multiple levels and all the new blocks come from + * different AGs, so set aside enough to handle rmap btree splits in all AGs. + */ +unsigned int +xfs_alloc_set_aside( + struct xfs_mount *mp) +{ + unsigned int blocks; + + blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE); + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + blocks += mp->m_sb.sb_agcount * mp->m_rmap_maxlevels; + return blocks; +} + +/* + * When deciding how much space to allocate out of an AG, we limit the + * allocation maximum size to the size the AG. However, we cannot use all the + * blocks in the AG - some are permanently used by metadata. These + * blocks are generally: + * - the AG superblock, AGF, AGI and AGFL + * - the AGF (bno and cnt) and AGI btree root blocks, and optionally + * the AGI free inode and rmap btree root blocks. + * - blocks on the AGFL according to xfs_alloc_set_aside() limits + * - the rmapbt root block + * + * The AG headers are sector sized, so the amount of space they take up is + * dependent on filesystem geometry. The others are all single blocks. + */ +unsigned int +xfs_alloc_ag_max_usable( + struct xfs_mount *mp) +{ + unsigned int blocks; + + blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */ + blocks += XFS_ALLOC_AGFL_RESERVE; + blocks += 3; /* AGF, AGI btree root blocks */ + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + blocks++; /* finobt root block */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + blocks++; /* rmap root block */ + + return mp->m_sb.sb_agblocks - blocks; +} + /* * Lookup the record equal to [bno, len] in the btree given by cur. */ @@ -636,6 +713,14 @@ xfs_alloc_ag_vextent( ASSERT(!args->wasfromfl || !args->isfl); ASSERT(args->agbno % args->alignment == 0); + /* if not file data, insert new block into the reverse map btree */ + if (args->oinfo.oi_owner != XFS_RMAP_OWN_UNKNOWN) { + error = xfs_rmap_alloc(args->tp, args->agbp, args->agno, + args->agbno, args->len, &args->oinfo); + if (error) + return error; + } + if (!args->wasfromfl) { error = xfs_alloc_update_counters(args->tp, args->pag, args->agbp, @@ -1577,14 +1662,15 @@ error0: /* * Free the extent starting at agno/bno for length. */ -STATIC int /* error */ +STATIC int xfs_free_ag_extent( - xfs_trans_t *tp, /* transaction pointer */ - xfs_buf_t *agbp, /* buffer for a.g. freelist header */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t bno, /* starting block number */ - xfs_extlen_t len, /* length of extent */ - int isfl) /* set if is freelist blocks - no sb acctg */ + xfs_trans_t *tp, + xfs_buf_t *agbp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo, + int isfl) { xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */ xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */ @@ -1601,12 +1687,19 @@ xfs_free_ag_extent( xfs_extlen_t nlen; /* new length of freespace */ xfs_perag_t *pag; /* per allocation group data */ + bno_cur = cnt_cur = NULL; mp = tp->t_mountp; + + if (oinfo->oi_owner != XFS_RMAP_OWN_UNKNOWN) { + error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo); + if (error) + goto error0; + } + /* * Allocate and initialize a cursor for the by-block btree. */ bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO); - cnt_cur = NULL; /* * Look for a neighboring block on the left (lower block numbers) * that is contiguous with this space. @@ -1875,6 +1968,11 @@ xfs_alloc_min_freelist( /* space needed by-size freespace btree */ min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1, mp->m_ag_maxlevels); + /* space needed reverse mapping used space btree */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + min_free += min_t(unsigned int, + pag->pagf_levels[XFS_BTNUM_RMAPi] + 1, + mp->m_rmap_maxlevels); return min_free; } @@ -1992,21 +2090,34 @@ xfs_alloc_fix_freelist( * anything other than extra overhead when we need to put more blocks * back on the free list? Maybe we should only do this when space is * getting low or the AGFL is more than half full? + * + * The NOSHRINK flag prevents the AGFL from being shrunk if it's too + * big; the NORMAP flag prevents AGFL expand/shrink operations from + * updating the rmapbt. Both flags are used in xfs_repair while we're + * rebuilding the rmapbt, and neither are used by the kernel. They're + * both required to ensure that rmaps are correctly recorded for the + * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and + * repair/rmap.c in xfsprogs for details. */ - while (pag->pagf_flcount > need) { + memset(&targs, 0, sizeof(targs)); + if (flags & XFS_ALLOC_FLAG_NORMAP) + xfs_rmap_skip_owner_update(&targs.oinfo); + else + xfs_rmap_ag_owner(&targs.oinfo, XFS_RMAP_OWN_AG); + while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) { struct xfs_buf *bp; error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); if (error) goto out_agbp_relse; - error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1); + error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, + &targs.oinfo, 1); if (error) goto out_agbp_relse; bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); xfs_trans_binval(tp, bp); } - memset(&targs, 0, sizeof(targs)); targs.tp = tp; targs.mp = mp; targs.agbp = agbp; @@ -2271,6 +2382,10 @@ xfs_agf_verify( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS) return false; + if (xfs_sb_version_hasrmapbt(&mp->m_sb) && + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS) + return false; + /* * during growfs operations, the perag is not fully initialised, * so we can't use it for any useful checking. growfs ensures we can't @@ -2402,6 +2517,8 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); pag->pagf_levels[XFS_BTNUM_CNTi] = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); + pag->pagf_levels[XFS_BTNUM_RMAPi] = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); spin_lock_init(&pag->pagb_lock); pag->pagb_count = 0; pag->pagb_tree = RB_ROOT; @@ -2691,7 +2808,8 @@ int /* error */ xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t bno, /* starting block number of extent */ - xfs_extlen_t len) /* length of extent */ + xfs_extlen_t len, /* length of extent */ + struct xfs_owner_info *oinfo) /* extent owner */ { struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *agbp; @@ -2701,6 +2819,11 @@ xfs_free_extent( ASSERT(len != 0); + if (XFS_TEST_ERROR(false, mp, + XFS_ERRTAG_FREE_EXTENT, + XFS_RANDOM_FREE_EXTENT)) + return -EIO; + error = xfs_free_extent_fix_freelist(tp, agno, &agbp); if (error) return error; @@ -2712,7 +2835,7 @@ xfs_free_extent( agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length), err); - error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, 0); + error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, 0); if (error) goto err; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index cf268b2d0b6c..6fe2d6b7cfe9 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -54,41 +54,8 @@ typedef unsigned int xfs_alloctype_t; */ #define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */ #define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/ - -/* - * In order to avoid ENOSPC-related deadlock caused by - * out-of-order locking of AGF buffer (PV 947395), we place - * constraints on the relationship among actual allocations for - * data blocks, freelist blocks, and potential file data bmap - * btree blocks. However, these restrictions may result in no - * actual space allocated for a delayed extent, for example, a data - * block in a certain AG is allocated but there is no additional - * block for the additional bmap btree block due to a split of the - * bmap btree of the file. The result of this may lead to an - * infinite loop in xfssyncd when the file gets flushed to disk and - * all delayed extents need to be actually allocated. To get around - * this, we explicitly set aside a few blocks which will not be - * reserved in delayed allocation. Considering the minimum number of - * needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap - * btree requires 1 fsb, so we set the number of set-aside blocks - * to 4 + 4*agcount. - */ -#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) - -/* - * When deciding how much space to allocate out of an AG, we limit the - * allocation maximum size to the size the AG. However, we cannot use all the - * blocks in the AG - some are permanently used by metadata. These - * blocks are generally: - * - the AG superblock, AGF, AGI and AGFL - * - the AGF (bno and cnt) and AGI btree root blocks - * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits - * - * The AG headers are sector sized, so the amount of space they take up is - * dependent on filesystem geometry. The others are all single blocks. - */ -#define XFS_ALLOC_AG_MAX_USABLE(mp) \ - ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7) +#define XFS_ALLOC_FLAG_NORMAP 0x00000004 /* don't modify the rmapbt */ +#define XFS_ALLOC_FLAG_NOSHRINK 0x00000008 /* don't shrink the freelist */ /* @@ -123,6 +90,7 @@ typedef struct xfs_alloc_arg { char isfl; /* set if is freelist blocks - !acctg */ char userdata; /* mask defining userdata treatment */ xfs_fsblock_t firstblock; /* io first block allocated */ + struct xfs_owner_info oinfo; /* owner of blocks being allocated */ } xfs_alloc_arg_t; /* @@ -132,6 +100,11 @@ typedef struct xfs_alloc_arg { #define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ #define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ +/* freespace limit calculations */ +#define XFS_ALLOC_AGFL_RESERVE 4 +unsigned int xfs_alloc_set_aside(struct xfs_mount *mp); +unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp); + xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, struct xfs_perag *pag, xfs_extlen_t need); unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, @@ -208,9 +181,10 @@ xfs_alloc_vextent( */ int /* error */ xfs_free_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_fsblock_t bno, /* starting block number of extent */ - xfs_extlen_t len); /* length of extent */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t bno, /* starting block number of extent */ + xfs_extlen_t len, /* length of extent */ + struct xfs_owner_info *oinfo);/* extent owner */ int /* error */ xfs_alloc_lookup_ge( @@ -232,4 +206,6 @@ int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **agbp); +xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp); + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index d9b42425291e..5ba2dac5e67c 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -212,17 +212,6 @@ xfs_allocbt_init_key_from_rec( } STATIC void -xfs_allocbt_init_rec_from_key( - union xfs_btree_key *key, - union xfs_btree_rec *rec) -{ - ASSERT(key->alloc.ar_startblock != 0); - - rec->alloc.ar_startblock = key->alloc.ar_startblock; - rec->alloc.ar_blockcount = key->alloc.ar_blockcount; -} - -STATIC void xfs_allocbt_init_rec_from_cur( struct xfs_btree_cur *cur, union xfs_btree_rec *rec) @@ -406,7 +395,6 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .get_minrecs = xfs_allocbt_get_minrecs, .get_maxrecs = xfs_allocbt_get_maxrecs, .init_key_from_rec = xfs_allocbt_init_key_from_rec, - .init_rec_from_key = xfs_allocbt_init_rec_from_key, .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_allocbt_key_diff, diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 4e126f41a0aa..af1ecb19121e 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -23,6 +23,7 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr_sf.h" @@ -203,7 +204,7 @@ xfs_attr_set( { struct xfs_mount *mp = dp->i_mount; struct xfs_da_args args; - struct xfs_bmap_free flist; + struct xfs_defer_ops dfops; struct xfs_trans_res tres; xfs_fsblock_t firstblock; int rsvd = (flags & ATTR_ROOT) != 0; @@ -221,7 +222,7 @@ xfs_attr_set( args.value = value; args.valuelen = valuelen; args.firstblock = &firstblock; - args.flist = &flist; + args.dfops = &dfops; args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; args.total = xfs_attr_calc_size(&args, &local); @@ -316,13 +317,13 @@ xfs_attr_set( * It won't fit in the shortform, transform to a leaf block. * GROT: another possible req'mt for a double-split btree op. */ - xfs_bmap_init(args.flist, args.firstblock); + xfs_defer_init(args.dfops, args.firstblock); error = xfs_attr_shortform_to_leaf(&args); if (!error) - error = xfs_bmap_finish(&args.trans, args.flist, dp); + error = xfs_defer_finish(&args.trans, args.dfops, dp); if (error) { args.trans = NULL; - xfs_bmap_cancel(&flist); + xfs_defer_cancel(&dfops); goto out; } @@ -382,7 +383,7 @@ xfs_attr_remove( { struct xfs_mount *mp = dp->i_mount; struct xfs_da_args args; - struct xfs_bmap_free flist; + struct xfs_defer_ops dfops; xfs_fsblock_t firstblock; int error; @@ -399,7 +400,7 @@ xfs_attr_remove( return error; args.firstblock = &firstblock; - args.flist = &flist; + args.dfops = &dfops; /* * we have no control over the attribute names that userspace passes us @@ -584,13 +585,13 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Commit that transaction so that the node_addname() call * can manage its own transactions. */ - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_node(args); if (!error) - error = xfs_bmap_finish(&args->trans, args->flist, dp); + error = xfs_defer_finish(&args->trans, args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); return error; } @@ -674,15 +675,15 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * If the result is small enough, shrink it all into the inode. */ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) - error = xfs_bmap_finish(&args->trans, - args->flist, dp); + error = xfs_defer_finish(&args->trans, + args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); return error; } } @@ -737,14 +738,14 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) * If the result is small enough, shrink it all into the inode. */ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) - error = xfs_bmap_finish(&args->trans, args->flist, dp); + error = xfs_defer_finish(&args->trans, args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); return error; } } @@ -863,14 +864,14 @@ restart: */ xfs_da_state_free(state); state = NULL; - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_node(args); if (!error) - error = xfs_bmap_finish(&args->trans, - args->flist, dp); + error = xfs_defer_finish(&args->trans, + args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); goto out; } @@ -891,13 +892,13 @@ restart: * in the index/blkno/rmtblkno/rmtblkcnt fields and * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields. */ - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_da3_split(state); if (!error) - error = xfs_bmap_finish(&args->trans, args->flist, dp); + error = xfs_defer_finish(&args->trans, args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); |
