From d0b2f91bede3bd5e3d24dd6803e56eee959c1797 Mon Sep 17 00:00:00 2001 From: AndrĂ© Fabian Silva Delgado Date: Thu, 20 Oct 2016 00:10:27 -0300 Subject: Linux-libre 4.8.2-gnu --- fs/xfs/Kconfig | 1 + fs/xfs/Makefile | 8 +- fs/xfs/libxfs/xfs_alloc.c | 262 +++++-- fs/xfs/libxfs/xfs_alloc.h | 59 +- fs/xfs/libxfs/xfs_alloc_btree.c | 12 - fs/xfs/libxfs/xfs_attr.c | 71 +- fs/xfs/libxfs/xfs_attr_leaf.c | 4 +- fs/xfs/libxfs/xfs_attr_leaf.h | 3 - fs/xfs/libxfs/xfs_attr_remote.c | 19 +- fs/xfs/libxfs/xfs_bmap.c | 256 +++---- fs/xfs/libxfs/xfs_bmap.h | 54 +- fs/xfs/libxfs/xfs_bmap_btree.c | 32 +- fs/xfs/libxfs/xfs_btree.c | 953 +++++++++++++++++++++----- fs/xfs/libxfs/xfs_btree.h | 90 ++- fs/xfs/libxfs/xfs_da_btree.c | 65 +- fs/xfs/libxfs/xfs_da_btree.h | 4 +- fs/xfs/libxfs/xfs_da_format.c | 31 +- fs/xfs/libxfs/xfs_da_format.h | 44 +- fs/xfs/libxfs/xfs_defer.c | 454 +++++++++++++ fs/xfs/libxfs/xfs_defer.h | 97 +++ fs/xfs/libxfs/xfs_dir2.c | 15 +- fs/xfs/libxfs/xfs_dir2.h | 8 +- fs/xfs/libxfs/xfs_dir2_sf.c | 38 +- fs/xfs/libxfs/xfs_format.h | 210 ++++-- fs/xfs/libxfs/xfs_fs.h | 9 +- fs/xfs/libxfs/xfs_ialloc.c | 43 +- fs/xfs/libxfs/xfs_ialloc.h | 2 +- fs/xfs/libxfs/xfs_ialloc_btree.c | 18 +- fs/xfs/libxfs/xfs_inode_buf.c | 1 + fs/xfs/libxfs/xfs_log_format.h | 63 +- fs/xfs/libxfs/xfs_rmap.c | 1399 ++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_rmap.h | 209 ++++++ fs/xfs/libxfs/xfs_rmap_btree.c | 517 ++++++++++++++ fs/xfs/libxfs/xfs_rmap_btree.h | 61 ++ fs/xfs/libxfs/xfs_rtbitmap.c | 2 +- fs/xfs/libxfs/xfs_sb.c | 9 + fs/xfs/libxfs/xfs_shared.h | 2 + fs/xfs/libxfs/xfs_trans_resv.c | 62 +- fs/xfs/libxfs/xfs_trans_resv.h | 10 - fs/xfs/libxfs/xfs_types.h | 4 +- fs/xfs/xfs_aops.c | 323 ++------- fs/xfs/xfs_aops.h | 3 + fs/xfs/xfs_attr_inactive.c | 2 +- fs/xfs/xfs_attr_list.c | 2 +- fs/xfs/xfs_bmap_util.c | 458 +++++-------- fs/xfs/xfs_bmap_util.h | 7 +- fs/xfs/xfs_buf.c | 267 +++++--- fs/xfs/xfs_buf.h | 7 +- fs/xfs/xfs_buf_item.c | 31 +- fs/xfs/xfs_discard.c | 2 +- fs/xfs/xfs_dquot.c | 14 +- fs/xfs/xfs_dquot_item.c | 2 + fs/xfs/xfs_error.c | 5 +- fs/xfs/xfs_error.h | 8 +- fs/xfs/xfs_export.c | 2 +- fs/xfs/xfs_extfree_item.c | 71 ++ fs/xfs/xfs_extfree_item.h | 3 + fs/xfs/xfs_file.c | 442 ++++++------ fs/xfs/xfs_filestream.c | 3 +- fs/xfs/xfs_fsops.c | 212 ++++-- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_icache.h | 1 + fs/xfs/xfs_inode.c | 115 ++-- fs/xfs/xfs_inode.h | 24 +- fs/xfs/xfs_inode_item.c | 1 + fs/xfs/xfs_ioctl.c | 35 +- fs/xfs/xfs_ioctl.h | 3 - fs/xfs/xfs_ioctl32.c | 6 +- fs/xfs/xfs_iomap.c | 251 ++++++- fs/xfs/xfs_iomap.h | 8 + fs/xfs/xfs_iops.c | 120 +--- fs/xfs/xfs_linux.h | 7 - fs/xfs/xfs_log.c | 13 +- fs/xfs/xfs_log.h | 5 - fs/xfs/xfs_log_cil.c | 258 +++++-- fs/xfs/xfs_log_recover.c | 336 ++++++--- fs/xfs/xfs_mount.c | 17 +- fs/xfs/xfs_mount.h | 6 + fs/xfs/xfs_ondisk.h | 34 +- fs/xfs/xfs_pnfs.c | 27 +- fs/xfs/xfs_pnfs.h | 4 +- fs/xfs/xfs_rmap_item.c | 536 +++++++++++++++ fs/xfs/xfs_rmap_item.h | 95 +++ fs/xfs/xfs_rtalloc.c | 11 +- fs/xfs/xfs_rtalloc.h | 2 - fs/xfs/xfs_stats.c | 2 +- fs/xfs/xfs_stats.h | 18 +- fs/xfs/xfs_super.c | 50 +- fs/xfs/xfs_super.h | 2 - fs/xfs/xfs_symlink.c | 25 +- fs/xfs/xfs_sysfs.c | 3 + fs/xfs/xfs_trace.c | 2 + fs/xfs/xfs_trace.h | 398 ++++++++++- fs/xfs/xfs_trans.h | 27 +- fs/xfs/xfs_trans_extfree.c | 215 ++++-- fs/xfs/xfs_trans_rmap.c | 271 ++++++++ 96 files changed, 7698 insertions(+), 2297 deletions(-) create mode 100644 fs/xfs/libxfs/xfs_defer.c create mode 100644 fs/xfs/libxfs/xfs_defer.h create mode 100644 fs/xfs/libxfs/xfs_rmap.c create mode 100644 fs/xfs/libxfs/xfs_rmap.h create mode 100644 fs/xfs/libxfs/xfs_rmap_btree.c create mode 100644 fs/xfs/libxfs/xfs_rmap_btree.h create mode 100644 fs/xfs/xfs_rmap_item.c create mode 100644 fs/xfs/xfs_rmap_item.h create mode 100644 fs/xfs/xfs_trans_rmap.c (limited to 'fs/xfs') diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 5d47b4df6..35faf128f 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -4,6 +4,7 @@ config XFS_FS depends on (64BIT || LBDAF) select EXPORTFS select LIBCRC32C + select FS_IOMAP help XFS is a high performance journaling filesystem which originated on the SGI IRIX platform. It is completely multi-threaded, can diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 3542d94fd..fc593c869 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -39,6 +39,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_btree.o \ xfs_da_btree.o \ xfs_da_format.o \ + xfs_defer.o \ xfs_dir2.o \ xfs_dir2_block.o \ xfs_dir2_data.o \ @@ -51,6 +52,8 @@ xfs-y += $(addprefix libxfs/, \ xfs_inode_fork.o \ xfs_inode_buf.o \ xfs_log_rlimit.o \ + xfs_rmap.o \ + xfs_rmap_btree.o \ xfs_sb.o \ xfs_symlink_remote.o \ xfs_trans_resv.o \ @@ -100,11 +103,13 @@ xfs-y += xfs_log.o \ xfs_extfree_item.o \ xfs_icreate_item.o \ xfs_inode_item.o \ + xfs_rmap_item.o \ xfs_log_recover.o \ xfs_trans_ail.o \ xfs_trans_buf.o \ xfs_trans_extfree.o \ xfs_trans_inode.o \ + xfs_trans_rmap.o \ # optional features xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ @@ -121,5 +126,4 @@ xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o -xfs-$(CONFIG_NFSD_BLOCKLAYOUT) += xfs_pnfs.o -xfs-$(CONFIG_NFSD_SCSILAYOUT) += xfs_pnfs.o +xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index a708e38b4..05b5243d8 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -24,8 +24,10 @@ #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" +#include "xfs_rmap.h" #include "xfs_alloc_btree.h" #include "xfs_alloc.h" #include "xfs_extent_busy.h" @@ -49,6 +51,81 @@ STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); +xfs_extlen_t +xfs_prealloc_blocks( + struct xfs_mount *mp) +{ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + return XFS_RMAP_BLOCK(mp) + 1; + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + return XFS_FIBT_BLOCK(mp) + 1; + return XFS_IBT_BLOCK(mp) + 1; +} + +/* + * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of + * AGF buffer (PV 947395), we place constraints on the relationship among + * actual allocations for data blocks, freelist blocks, and potential file data + * bmap btree blocks. However, these restrictions may result in no actual space + * allocated for a delayed extent, for example, a data block in a certain AG is + * allocated but there is no additional block for the additional bmap btree + * block due to a split of the bmap btree of the file. The result of this may + * lead to an infinite loop when the file gets flushed to disk and all delayed + * extents need to be actually allocated. To get around this, we explicitly set + * aside a few blocks which will not be reserved in delayed allocation. + * + * When rmap is disabled, we need to reserve 4 fsbs _per AG_ for the freelist + * and 4 more to handle a potential split of the file's bmap btree. + * + * When rmap is enabled, we must also be able to handle two rmap btree inserts + * to record both the file data extent and a new bmbt block. The bmbt block + * might not be in the same AG as the file data extent. In the worst case + * the bmap btree splits multiple levels and all the new blocks come from + * different AGs, so set aside enough to handle rmap btree splits in all AGs. + */ +unsigned int +xfs_alloc_set_aside( + struct xfs_mount *mp) +{ + unsigned int blocks; + + blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE); + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + blocks += mp->m_sb.sb_agcount * mp->m_rmap_maxlevels; + return blocks; +} + +/* + * When deciding how much space to allocate out of an AG, we limit the + * allocation maximum size to the size the AG. However, we cannot use all the + * blocks in the AG - some are permanently used by metadata. These + * blocks are generally: + * - the AG superblock, AGF, AGI and AGFL + * - the AGF (bno and cnt) and AGI btree root blocks, and optionally + * the AGI free inode and rmap btree root blocks. + * - blocks on the AGFL according to xfs_alloc_set_aside() limits + * - the rmapbt root block + * + * The AG headers are sector sized, so the amount of space they take up is + * dependent on filesystem geometry. The others are all single blocks. + */ +unsigned int +xfs_alloc_ag_max_usable( + struct xfs_mount *mp) +{ + unsigned int blocks; + + blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */ + blocks += XFS_ALLOC_AGFL_RESERVE; + blocks += 3; /* AGF, AGI btree root blocks */ + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + blocks++; /* finobt root block */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + blocks++; /* rmap root block */ + + return mp->m_sb.sb_agblocks - blocks; +} + /* * Lookup the record equal to [bno, len] in the btree given by cur. */ @@ -84,7 +161,7 @@ xfs_alloc_lookup_ge( * Lookup the first record less than or equal to [bno, len] * in the btree given by cur. */ -int /* error */ +static int /* error */ xfs_alloc_lookup_le( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ @@ -636,6 +713,14 @@ xfs_alloc_ag_vextent( ASSERT(!args->wasfromfl || !args->isfl); ASSERT(args->agbno % args->alignment == 0); + /* if not file data, insert new block into the reverse map btree */ + if (args->oinfo.oi_owner != XFS_RMAP_OWN_UNKNOWN) { + error = xfs_rmap_alloc(args->tp, args->agbp, args->agno, + args->agbno, args->len, &args->oinfo); + if (error) + return error; + } + if (!args->wasfromfl) { error = xfs_alloc_update_counters(args->tp, args->pag, args->agbp, @@ -1497,6 +1582,7 @@ xfs_alloc_ag_vextent_small( xfs_extlen_t *flenp, /* result length */ int *stat) /* status: 0-freelist, 1-normal/none */ { + struct xfs_owner_info oinfo; int error; xfs_agblock_t fbno; xfs_extlen_t flen; @@ -1539,6 +1625,18 @@ xfs_alloc_ag_vextent_small( error0); args->wasfromfl = 1; trace_xfs_alloc_small_freelist(args); + + /* + * If we're feeding an AGFL block to something that + * doesn't live in the free space, we need to clear + * out the OWN_AG rmap. + */ + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); + error = xfs_rmap_free(args->tp, args->agbp, args->agno, + fbno, 1, &oinfo); + if (error) + goto error0; + *stat = 0; return 0; } @@ -1577,14 +1675,15 @@ error0: /* * Free the extent starting at agno/bno for length. */ -STATIC int /* error */ +STATIC int xfs_free_ag_extent( - xfs_trans_t *tp, /* transaction pointer */ - xfs_buf_t *agbp, /* buffer for a.g. freelist header */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t bno, /* starting block number */ - xfs_extlen_t len, /* length of extent */ - int isfl) /* set if is freelist blocks - no sb acctg */ + xfs_trans_t *tp, + xfs_buf_t *agbp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo, + int isfl) { xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */ xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */ @@ -1601,12 +1700,19 @@ xfs_free_ag_extent( xfs_extlen_t nlen; /* new length of freespace */ xfs_perag_t *pag; /* per allocation group data */ + bno_cur = cnt_cur = NULL; mp = tp->t_mountp; + + if (oinfo->oi_owner != XFS_RMAP_OWN_UNKNOWN) { + error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo); + if (error) + goto error0; + } + /* * Allocate and initialize a cursor for the by-block btree. */ bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO); - cnt_cur = NULL; /* * Look for a neighboring block on the left (lower block numbers) * that is contiguous with this space. @@ -1839,19 +1945,8 @@ void xfs_alloc_compute_maxlevels( xfs_mount_t *mp) /* file system mount structure */ { - int level; - uint maxblocks; - uint maxleafents; - int minleafrecs; - int minnoderecs; - - maxleafents = (mp->m_sb.sb_agblocks + 1) / 2; - minleafrecs = mp->m_alloc_mnr[0]; - minnoderecs = mp->m_alloc_mnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; - for (level = 1; maxblocks > 1; level++) - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; - mp->m_ag_maxlevels = level; + mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_alloc_mnr, + (mp->m_sb.sb_agblocks + 1) / 2); } /* @@ -1886,6 +1981,11 @@ xfs_alloc_min_freelist( /* space needed by-size freespace btree */ min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1, mp->m_ag_maxlevels); + /* space needed reverse mapping used space btree */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + min_free += min_t(unsigned int, + pag->pagf_levels[XFS_BTNUM_RMAPi] + 1, + mp->m_rmap_maxlevels); return min_free; } @@ -2003,21 +2103,34 @@ xfs_alloc_fix_freelist( * anything other than extra overhead when we need to put more blocks * back on the free list? Maybe we should only do this when space is * getting low or the AGFL is more than half full? + * + * The NOSHRINK flag prevents the AGFL from being shrunk if it's too + * big; the NORMAP flag prevents AGFL expand/shrink operations from + * updating the rmapbt. Both flags are used in xfs_repair while we're + * rebuilding the rmapbt, and neither are used by the kernel. They're + * both required to ensure that rmaps are correctly recorded for the + * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and + * repair/rmap.c in xfsprogs for details. */ - while (pag->pagf_flcount > need) { + memset(&targs, 0, sizeof(targs)); + if (flags & XFS_ALLOC_FLAG_NORMAP) + xfs_rmap_skip_owner_update(&targs.oinfo); + else + xfs_rmap_ag_owner(&targs.oinfo, XFS_RMAP_OWN_AG); + while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) { struct xfs_buf *bp; error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); if (error) goto out_agbp_relse; - error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1); + error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, + &targs.oinfo, 1); if (error) goto out_agbp_relse; bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); xfs_trans_binval(tp, bp); } - memset(&targs, 0, sizeof(targs)); targs.tp = tp; targs.mp = mp; targs.agbp = agbp; @@ -2164,6 +2277,9 @@ xfs_alloc_log_agf( offsetof(xfs_agf_t, agf_longest), offsetof(xfs_agf_t, agf_btreeblks), offsetof(xfs_agf_t, agf_uuid), + offsetof(xfs_agf_t, agf_rmap_blocks), + /* needed so that we don't log the whole rest of the structure: */ + offsetof(xfs_agf_t, agf_spare64), sizeof(xfs_agf_t) }; @@ -2282,6 +2398,10 @@ xfs_agf_verify( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS) return false; + if (xfs_sb_version_hasrmapbt(&mp->m_sb) && + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS) + return false; + /* * during growfs operations, the perag is not fully initialised, * so we can't use it for any useful checking. growfs ensures we can't @@ -2413,6 +2533,8 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); pag->pagf_levels[XFS_BTNUM_CNTi] = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); + pag->pagf_levels[XFS_BTNUM_RMAPi] = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); spin_lock_init(&pag->pagb_lock); pag->pagb_count = 0; pag->pagb_tree = RB_ROOT; @@ -2658,55 +2780,85 @@ error0: return error; } -/* - * Free an extent. - * Just break up the extent address and hand off to xfs_free_ag_extent - * after fixing up the freelist. - */ -int /* error */ -xfs_free_extent( - xfs_trans_t *tp, /* transaction pointer */ - xfs_fsblock_t bno, /* starting block number of extent */ - xfs_extlen_t len) /* length of extent */ +/* Ensure that the freelist is at full capacity. */ +int +xfs_free_extent_fix_freelist( + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf **agbp) { - xfs_alloc_arg_t args; - int error; + struct xfs_alloc_arg args; + int error; - ASSERT(len != 0); - memset(&args, 0, sizeof(xfs_alloc_arg_t)); + memset(&args, 0, sizeof(struct xfs_alloc_arg)); args.tp = tp; args.mp = tp->t_mountp; + args.agno = agno; /* * validate that the block number is legal - the enables us to detect * and handle a silent filesystem corruption rather than crashing. */ - args.agno = XFS_FSB_TO_AGNO(args.mp, bno); if (args.agno >= args.mp->m_sb.sb_agcount) return -EFSCORRUPTED; - args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); - if (args.agbno >= args.mp->m_sb.sb_agblocks) - return -EFSCORRUPTED; - args.pag = xfs_perag_get(args.mp, args.agno); ASSERT(args.pag); error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); if (error) - goto error0; + goto out; + + *agbp = args.agbp; +out: + xfs_perag_put(args.pag); + return error; +} + +/* + * Free an extent. + * Just break up the extent address and hand off to xfs_free_ag_extent + * after fixing up the freelist. + */ +int /* error */ +xfs_free_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t bno, /* starting block number of extent */ + xfs_extlen_t len, /* length of extent */ + struct xfs_owner_info *oinfo) /* extent owner */ +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *agbp; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); + int error; + + ASSERT(len != 0); + + if (XFS_TEST_ERROR(false, mp, + XFS_ERRTAG_FREE_EXTENT, + XFS_RANDOM_FREE_EXTENT)) + return -EIO; + + error = xfs_free_extent_fix_freelist(tp, agno, &agbp); + if (error) + return error; + + XFS_WANT_CORRUPTED_GOTO(mp, agbno < mp->m_sb.sb_agblocks, err); /* validate the extent size is legal now we have the agf locked */ - if (args.agbno + len > - be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) { - error = -EFSCORRUPTED; - goto error0; - } + XFS_WANT_CORRUPTED_GOTO(mp, + agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length), + err); - error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); - if (!error) - xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0); -error0: - xfs_perag_put(args.pag); + error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, 0); + if (error) + goto err; + + xfs_extent_busy_insert(tp, agno, agbno, len, 0); + return 0; + +err: + xfs_trans_brelse(tp, agbp); return error; } diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 135eb3d24..6fe2d6b7c 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -54,41 +54,8 @@ typedef unsigned int xfs_alloctype_t; */ #define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */ #define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/ - -/* - * In order to avoid ENOSPC-related deadlock caused by - * out-of-order locking of AGF buffer (PV 947395), we place - * constraints on the relationship among actual allocations for - * data blocks, freelist blocks, and potential file data bmap - * btree blocks. However, these restrictions may result in no - * actual space allocated for a delayed extent, for example, a data - * block in a certain AG is allocated but there is no additional - * block for the additional bmap btree block due to a split of the - * bmap btree of the file. The result of this may lead to an - * infinite loop in xfssyncd when the file gets flushed to disk and - * all delayed extents need to be actually allocated. To get around - * this, we explicitly set aside a few blocks which will not be - * reserved in delayed allocation. Considering the minimum number of - * needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap - * btree requires 1 fsb, so we set the number of set-aside blocks - * to 4 + 4*agcount. - */ -#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) - -/* - * When deciding how much space to allocate out of an AG, we limit the - * allocation maximum size to the size the AG. However, we cannot use all the - * blocks in the AG - some are permanently used by metadata. These - * blocks are generally: - * - the AG superblock, AGF, AGI and AGFL - * - the AGF (bno and cnt) and AGI btree root blocks - * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits - * - * The AG headers are sector sized, so the amount of space they take up is - * dependent on filesystem geometry. The others are all single blocks. - */ -#define XFS_ALLOC_AG_MAX_USABLE(mp) \ - ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7) +#define XFS_ALLOC_FLAG_NORMAP 0x00000004 /* don't modify the rmapbt */ +#define XFS_ALLOC_FLAG_NOSHRINK 0x00000008 /* don't shrink the freelist */ /* @@ -123,6 +90,7 @@ typedef struct xfs_alloc_arg { char isfl; /* set if is freelist blocks - !acctg */ char userdata; /* mask defining userdata treatment */ xfs_fsblock_t firstblock; /* io first block allocated */ + struct xfs_owner_info oinfo; /* owner of blocks being allocated */ } xfs_alloc_arg_t; /* @@ -132,6 +100,11 @@ typedef struct xfs_alloc_arg { #define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ #define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ +/* freespace limit calculations */ +#define XFS_ALLOC_AGFL_RESERVE 4 +unsigned int xfs_alloc_set_aside(struct xfs_mount *mp); +unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp); + xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, struct xfs_perag *pag, xfs_extlen_t need); unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, @@ -208,16 +181,10 @@ xfs_alloc_vextent( */ int /* error */ xfs_free_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_fsblock_t bno, /* starting block number of extent */ - xfs_extlen_t len); /* length of extent */ - -int /* error */ -xfs_alloc_lookup_le( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_agblock_t bno, /* starting block of extent */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len, /* length of extent */ - int *stat); /* success/failure */ + struct xfs_owner_info *oinfo);/* extent owner */ int /* error */ xfs_alloc_lookup_ge( @@ -236,5 +203,9 @@ xfs_alloc_get_rec( int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); +int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno, + struct xfs_buf **agbp); + +xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp); #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index d9b424252..5ba2dac5e 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -211,17 +211,6 @@ xfs_allocbt_init_key_from_rec( key->alloc.ar_blockcount = rec->alloc.ar_blockcount; } -STATIC void -xfs_allocbt_init_rec_from_key( - union xfs_btree_key *key, - union xfs_btree_rec *rec) -{ - ASSERT(key->alloc.ar_startblock != 0); - - rec->alloc.ar_startblock = key->alloc.ar_startblock; - rec->alloc.ar_blockcount = key->alloc.ar_blockcount; -} - STATIC void xfs_allocbt_init_rec_from_cur( struct xfs_btree_cur *cur, @@ -406,7 +395,6 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .get_minrecs = xfs_allocbt_get_minrecs, .get_maxrecs = xfs_allocbt_get_maxrecs, .init_key_from_rec = xfs_allocbt_init_key_from_rec, - .init_rec_from_key = xfs_allocbt_init_rec_from_key, .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_allocbt_key_diff, diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 4e126f41a..af1ecb191 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -23,6 +23,7 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr_sf.h" @@ -203,7 +204,7 @@ xfs_attr_set( { struct xfs_mount *mp = dp->i_mount; struct xfs_da_args args; - struct xfs_bmap_free flist; + struct xfs_defer_ops dfops; struct xfs_trans_res tres; xfs_fsblock_t firstblock; int rsvd = (flags & ATTR_ROOT) != 0; @@ -221,7 +222,7 @@ xfs_attr_set( args.value = value; args.valuelen = valuelen; args.firstblock = &firstblock; - args.flist = &flist; + args.dfops = &dfops; args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; args.total = xfs_attr_calc_size(&args, &local); @@ -316,13 +317,13 @@ xfs_attr_set( * It won't fit in the shortform, transform to a leaf block. * GROT: another possible req'mt for a double-split btree op. */ - xfs_bmap_init(args.flist, args.firstblock); + xfs_defer_init(args.dfops, args.firstblock); error = xfs_attr_shortform_to_leaf(&args); if (!error) - error = xfs_bmap_finish(&args.trans, args.flist, dp); + error = xfs_defer_finish(&args.trans, args.dfops, dp); if (error) { args.trans = NULL; - xfs_bmap_cancel(&flist); + xfs_defer_cancel(&dfops); goto out; } @@ -382,7 +383,7 @@ xfs_attr_remove( { struct xfs_mount *mp = dp->i_mount; struct xfs_da_args args; - struct xfs_bmap_free flist; + struct xfs_defer_ops dfops; xfs_fsblock_t firstblock; int error; @@ -399,7 +400,7 @@ xfs_attr_remove( return error; args.firstblock = &firstblock; - args.flist = &flist; + args.dfops = &dfops; /* * we have no control over the attribute names that userspace passes us @@ -584,13 +585,13 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Commit that transaction so that the node_addname() call * can manage its own transactions. */ - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_node(args); if (!error) - error = xfs_bmap_finish(&args->trans, args->flist, dp); + error = xfs_defer_finish(&args->trans, args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); return error; } @@ -674,15 +675,15 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * If the result is small enough, shrink it all into the inode. */ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) - error = xfs_bmap_finish(&args->trans, - args->flist, dp); + error = xfs_defer_finish(&args->trans, + args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); return error; } } @@ -737,14 +738,14 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) * If the result is small enough, shrink it all into the inode. */ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) - error = xfs_bmap_finish(&args->trans, args->flist, dp); + error = xfs_defer_finish(&args->trans, args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); return error; } } @@ -863,14 +864,14 @@ restart: */ xfs_da_state_free(state); state = NULL; - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_node(args); if (!error) - error = xfs_bmap_finish(&args->trans, - args->flist, dp); + error = xfs_defer_finish(&args->trans, + args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); goto out; } @@ -891,13 +892,13 @@ restart: * in the index/blkno/rmtblkno/rmtblkcnt fields and * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields. */ - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_da3_split(state); if (!error) - error = xfs_bmap_finish(&args->trans, args->flist, dp); + error = xfs_defer_finish(&args->trans, args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); goto out; } } else { @@ -990,14 +991,14 @@ restart: * Check to see if the tree needs to be collapsed. */ if (retval && (state->path.active > 1)) { - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_da3_join(state); if (!error) - error = xfs_bmap_finish(&args->trans, - args->flist, dp); + error = xfs_defer_finish(&args->trans, + args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); goto out; } } @@ -1113,13 +1114,13 @@ xfs_attr_node_removename(xfs_da_args_t *args) * Check to see if the tree needs to be collapsed. */ if (retval && (state->path.active > 1)) { - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_da3_join(state); if (!error) - error = xfs_bmap_finish(&args->trans, args->flist, dp); + error = xfs_defer_finish(&args->trans, args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); goto out; } /* @@ -1146,15 +1147,15 @@ xfs_attr_node_removename(xfs_da_args_t *args) goto out; if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) - error = xfs_bmap_finish(&args->trans, - args->flist, dp); + error = xfs_defer_finish(&args->trans, + args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); goto out; } } else diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 01a5ecfed..8ea91f363 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -792,7 +792,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) nargs.dp = dp; nargs.geo = args->geo; nargs.firstblock = args->firstblock; - nargs.flist = args->flist; + nargs.dfops = args->dfops; nargs.total = args->total; nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; @@ -922,7 +922,7 @@ xfs_attr3_leaf_to_shortform( nargs.geo = args->geo; nargs.dp = dp; nargs.firstblock = args->firstblock; - nargs.flist = args->flist; + nargs.dfops = args->dfops; nargs.total = args->total; nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 882c8d338..4f2aed04f 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -50,7 +50,6 @@ int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_shortform_remove(struct xfs_da_args *args); -int xfs_attr_shortform_list(struct xfs_attr_list_context *context); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); @@ -88,8 +87,6 @@ int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval); void xfs_attr3_leaf_unbalance(struct xfs_da_state *state, struct xfs_da_state_blk *drop_blk, struct xfs_da_state_blk *save_blk); -int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); - /* * Utility routines. */ diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index a572532a5..d52f525f5 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -24,6 +24,7 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_inode.h" @@ -460,16 +461,16 @@ xfs_attr_rmtval_set( * extent and then crash then the block may not contain the * correct metadata after log recovery occurs. */ - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); nmap = 1; error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock, - args->total, &map, &nmap, args->flist); + args->total, &map, &nmap, args->dfops); if (!error) - error = xfs_bmap_finish(&args->trans, args->flist, dp); + error = xfs_defer_finish(&args->trans, args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); return error; } @@ -503,7 +504,7 @@ xfs_attr_rmtval_set( ASSERT(blkcnt > 0); - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); nmap = 1; error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, blkcnt, &map, &nmap, @@ -603,16 +604,16 @@ xfs_attr_rmtval_remove( blkcnt = args->rmtblkcnt; done = 0; while (!done) { - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK, 1, args->firstblock, - args->flist, &done); + args->dfops, &done); if (!error) - error = xfs_bmap_finish(&args->trans, args->flist, + error = xfs_defer_finish(&args->trans, args->dfops, args->dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 932381cae..b060bca93 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -24,6 +24,7 @@ #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_dir2.h" @@ -45,6 +46,7 @@ #include "xfs_symlink.h" #include "xfs_attr_leaf.h" #include "xfs_filestream.h" +#include "xfs_rmap.h" kmem_zone_t *xfs_bmap_free_item_zone; @@ -570,14 +572,13 @@ xfs_bmap_validate_ret( */ void xfs_bmap_add_free( - xfs_fsblock_t bno, /* fs block number of extent */ - xfs_filblks_t len, /* length of extent */ - xfs_bmap_free_t *flist, /* list of extents */ - xfs_mount_t *mp) /* mount point structure */ + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + xfs_fsblock_t bno, + xfs_filblks_t len, + struct xfs_owner_info *oinfo) { - xfs_bmap_free_item_t *cur; /* current (next) element */ - xfs_bmap_free_item_t *new; /* new element */ - xfs_bmap_free_item_t *prev; /* previous element */ + struct xfs_extent_free_item *new; /* new element */ #ifdef DEBUG xfs_agnumber_t agno; xfs_agblock_t agbno; @@ -594,59 +595,17 @@ xfs_bmap_add_free( ASSERT(agbno + len <= mp->m_sb.sb_agblocks); #endif ASSERT(xfs_bmap_free_item_zone != NULL); - new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); - new->xbfi_startblock = bno; - new->xbfi_blockcount = (xfs_extlen_t)len; - for (prev = NULL, cur = flist->xbf_first; - cur != NULL; - prev = cur, cur = cur->xbfi_next) { - if (cur->xbfi_startblock >= bno) - break; - } - if (prev) - prev->xbfi_next = new; - else - flist->xbf_first = new; - new->xbfi_next = cur; - flist->xbf_count++; -} -/* - * Remove the entry "free" from the free item list. Prev points to the - * previous entry, unless "free" is the head of the list. - */ -void -xfs_bmap_del_free( - xfs_bmap_free_t *flist, /* free item list header */ - xfs_bmap_free_item_t *prev, /* previous item on list, if any */ - xfs_bmap_free_item_t *free) /* list item to be freed */ -{ - if (prev) - prev->xbfi_next = free->xbfi_next; + new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); + new->xefi_startblock = bno; + new->xefi_blockcount = (xfs_extlen_t)len; + if (oinfo) + new->xefi_oinfo = *oinfo; else - flist->xbf_first = free->xbfi_next; - flist->xbf_count--; - kmem_zone_free(xfs_bmap_free_item_zone, free); -} - -/* - * Free up any items left in the list. - */ -void -xfs_bmap_cancel( - xfs_bmap_free_t *flist) /* list of bmap_free_items */ -{ - xfs_bmap_free_item_t *free; /* free list item */ - xfs_bmap_free_item_t *next; - - if (flist->xbf_count == 0) - return; - ASSERT(flist->xbf_first != NULL); - for (free = flist->xbf_first; free; free = next) { - next = free->xbfi_next; - xfs_bmap_del_free(flist, NULL, free); - } - ASSERT(flist->xbf_count == 0); + xfs_rmap_skip_owner_update(&new->xefi_oinfo); + trace_xfs_bmap_free_defer(mp, XFS_FSB_TO_AGNO(mp, bno), 0, + XFS_FSB_TO_AGBNO(mp, bno), len); + xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list); } /* @@ -676,6 +635,7 @@ xfs_bmap_btree_to_extents( xfs_mount_t *mp; /* mount point structure */ __be64 *pp; /* ptr to block address */ struct xfs_btree_block *rblock;/* root btree block */ + struct xfs_owner_info oinfo; mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); @@ -699,7 +659,8 @@ xfs_bmap_btree_to_extents( cblock = XFS_BUF_TO_BLOCK(cbp); if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) return error; - xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); + xfs_bmap_add_free(mp, cur->bc_private.b.dfops, cbno, 1, &oinfo); ip->i_d.di_nblocks--; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); xfs_trans_binval(tp, cbp); @@ -722,7 +683,7 @@ xfs_bmap_extents_to_btree( xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode pointer */ xfs_fsblock_t *firstblock, /* first-block-allocated */ - xfs_bmap_free_t *flist, /* blocks freed in xaction */ + struct xfs_defer_ops *dfops, /* blocks freed in xaction */ xfs_btree_cur_t **curp, /* cursor returned to caller */ int wasdel, /* converting a delayed alloc */ int *logflagsp, /* inode logging flags */ @@ -771,7 +732,7 @@ xfs_bmap_extents_to_btree( */ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; - cur->bc_private.b.flist = flist; + cur->bc_private.b.dfops = dfops; cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; /* * Convert to a btree with two levels, one record in root. @@ -780,11 +741,12 @@ xfs_bmap_extents_to_btree( memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = mp; + xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, whichfork); args.firstblock = *firstblock; if (*firstblock == NULLFSBLOCK) { args.type = XFS_ALLOCTYPE_START_BNO; args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); - } else if (flist->xbf_low) { + } else if (dfops->dop_low) { args.type = XFS_ALLOCTYPE_START_BNO; args.fsbno = *firstblock; } else { @@ -805,7 +767,7 @@ xfs_bmap_extents_to_btree( ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(*firstblock == NULLFSBLOCK || args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || - (flist->xbf_low && + (dfops->dop_low && args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); *firstblock = cur->bc_private.b.firstblock = args.fsbno; cur->bc_private.b.allocated++; @@ -926,6 +888,7 @@ xfs_bmap_local_to_extents( memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = ip->i_mount; + xfs_rmap_ino_owner(&args.oinfo, ip->i_ino, whichfork, 0); args.firstblock = *firstblock; /* * Allocate a block. We know we need only one, since the @@ -990,7 +953,7 @@ xfs_bmap_add_attrfork_btree( xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode pointer */ xfs_fsblock_t *firstblock, /* first block allocated */ - xfs_bmap_free_t *flist, /* blocks to free at commit */ + struct xfs_defer_ops *dfops, /* blocks to free at commit */ int *flags) /* inode logging flags */ { xfs_btree_cur_t *cur; /* btree cursor */ @@ -1003,7 +966,7 @@ xfs_bmap_add_attrfork_btree( *flags |= XFS_ILOG_DBROOT; else { cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); - cur->bc_private.b.flist = flist; + cur->bc_private.b.dfops = dfops; cur->bc_private.b.firstblock = *firstblock; if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) goto error0; @@ -1033,7 +996,7 @@ xfs_bmap_add_attrfork_extents( xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode pointer */ xfs_fsblock_t *firstblock, /* first block allocated */ - xfs_bmap_free_t *flist, /* blocks to free at commit */ + struct xfs_defer_ops *dfops, /* blocks to free at commit */ int *flags) /* inode logging flags */ { xfs_btree_cur_t *cur; /* bmap btree cursor */ @@ -1042,7 +1005,7 @@ xfs_bmap_add_attrfork_extents( if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip)) return 0; cur = NULL; - error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0, + error = xfs_bmap_extents_to_btree(tp, ip, firstblock, dfops, &cur, 0, flags, XFS_DATA_FORK); if (cur) { cur->bc_private.b.allocated = 0; @@ -1068,7 +1031,7 @@ xfs_bmap_add_attrfork_local( xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode pointer */ xfs_fsblock_t *firstblock, /* first block allocated */ - xfs_bmap_free_t *flist, /* blocks to free at commit */ + struct xfs_defer_ops *dfops, /* blocks to free at commit */ int *flags) /* inode logging flags */ { xfs_da_args_t dargs; /* args for dir/attr code */ @@ -1081,7 +1044,7 @@ xfs_bmap_add_attrfork_local( dargs.geo = ip->i_mount->m_dir_geo; dargs.dp = ip; dargs.firstblock = firstblock; - dargs.flist = flist; + dargs.dfops = dfops; dargs.total = dargs.geo->fsbcount; dargs.whichfork = XFS_DATA_FORK; dargs.trans = tp; @@ -1109,7 +1072,7 @@ xfs_bmap_add_attrfork( int rsvd) /* xact may use reserved blks */ { xfs_fsblock_t firstblock; /* 1st block/ag allocated */ - xfs_bmap_free_t flist; /* freed extent records */ + struct xfs_defer_ops dfops; /* freed extent records */ xfs_mount_t *mp; /* mount structure */ xfs_trans_t *tp; /* transaction pointer */ int blks; /* space reservation */ @@ -1175,18 +1138,18 @@ xfs_bmap_add_attrfork( ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); ip->i_afp->if_flags = XFS_IFEXTENTS; logflags = 0; - xfs_bmap_init(&flist, &firstblock); + xfs_defer_init(&dfops, &firstblock); switch (ip->i_d.di_format) { case XFS_DINODE_FMT_LOCAL: - error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist, + error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &dfops, &logflags); break; case XFS_DINODE_FMT_EXTENTS: error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock, - &flist, &logflags); + &dfops, &logflags); break; case XFS_DINODE_FMT_BTREE: - error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist, + error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &dfops, &logflags); break; default: @@ -1215,7 +1178,7 @@ xfs_bmap_add_attrfork( xfs_log_sb(tp); } - error = xfs_bmap_finish(&tp, &flist, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto bmap_cancel; error = xfs_trans_commit(tp); @@ -1223,7 +1186,7 @@ xfs_bmap_add_attrfork( return error; bmap_cancel: - xfs_bmap_cancel(&flist); + xfs_defer_cancel(&dfops); trans_cancel: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -2020,7 +1983,7 @@ xfs_bmap_add_extent_delay_real( if (xfs_bmap_needs_btree(bma->ip, whichfork)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, - bma->firstblock, bma->flist, + bma->firstblock, bma->dfops, &bma->cur, 1, &tmp_rval, whichfork); rval |= tmp_rval; if (error) @@ -2104,7 +2067,7 @@ xfs_bmap_add_extent_delay_real( if (xfs_bmap_needs_btree(bma->ip, whichfork)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, - bma->firstblock, bma->flist, &bma->cur, 1, + bma->firstblock, bma->dfops, &bma->cur, 1, &tmp_rval, whichfork); rval |= tmp_rval; if (error) @@ -2173,7 +2136,7 @@ xfs_bmap_add_extent_delay_real( if (xfs_bmap_needs_btree(bma->ip, whichfork)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, - bma->firstblock, bma->flist, &bma->cur, + bma->firstblock, bma->dfops, &bma->cur, 1, &tmp_rval, whichfork); rval |= tmp_rval; if (error) @@ -2216,13 +2179,18 @@ xfs_bmap_add_extent_delay_real( ASSERT(0); } + /* add reverse mapping */ + error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new); + if (error) + goto done; + /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(bma->ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ ASSERT(bma->cur == NULL); error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, - bma->firstblock, bma->flist, &bma->cur, + bma->firstblock, bma->dfops, &bma->cur, da_old > 0, &tmp_logflags, whichfork); bma->logflags |= tmp_logflags; if (error) @@ -2264,7 +2232,7 @@ xfs_bmap_add_extent_unwritten_real( xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ + struct xfs_defer_ops *dfops, /* list of extents to be freed */ int *logflagsp) /* inode logging flags */ { xfs_btree_cur_t *cur; /* btree cursor */ @@ -2752,12 +2720,17 @@ xfs_bmap_add_extent_unwritten_real( ASSERT(0); } + /* update reverse mappings */ + error = xfs_rmap_convert_extent(mp, dfops, ip, XFS_DATA_FORK, new); + if (error) + goto done; + /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) { int tmp_logflags; /* partial log flag return val */ ASSERT(cur == NULL); - error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur, + error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, &cur, 0, &tmp_logflags, XFS_DATA_FORK); *logflagsp |= tmp_logflags; if (error) @@ -3144,13 +3117,18 @@ xfs_bmap_add_extent_hole_real( break; } + /* add reverse mapping */ + error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new); + if (error) + goto done; + /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(bma->ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ ASSERT(bma->cur == NULL); error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, - bma->firstblock, bma->flist, &bma->cur, + bma->firstblock, bma->dfops, &bma->cur, 0, &tmp_logflags, whichfork); bma->logflags |= tmp_logflags; if (error) @@ -3708,9 +3686,10 @@ xfs_bmap_btalloc( args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; + xfs_rmap_skip_owner_update(&args.oinfo); /* Trim the allocation back to the maximum an AG can fit. */ - args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); + args.maxlen = MIN(ap->length, mp->m_ag_max_usable); args.firstblock = *ap->firstblock; blen = 0; if (nullfb) { @@ -3725,7 +3704,7 @@ xfs_bmap_btalloc( error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); if (error) return error; - } else if (ap->flist->xbf_low) { + } else if (ap->dfops->dop_low) { if (xfs_inode_is_filestream(ap->ip)) args.type = XFS_ALLOCTYPE_FIRST_AG; else @@ -3758,7 +3737,7 @@ xfs_bmap_btalloc( * is >= the stripe unit and the allocation offset is * at the end of file. */ - if (!ap->flist->xbf_low && ap->aeof) { + if (!ap->dfops->dop_low && ap->aeof) { if (!ap->offset) { args.alignment = stripe_align; atype = args.type; @@ -3851,7 +3830,7 @@ xfs_bmap_btalloc( args.minleft = 0; if ((error = xfs_alloc_vextent(&args))) return error; - ap->flist->xbf_low = 1; + ap->dfops->dop_low = true; } if (args.fsbno != NULLFSBLOCK) { /* @@ -3861,7 +3840,7 @@ xfs_bmap_btalloc( ASSERT(*ap->firstblock == NULLFSBLOCK || XFS_FSB_TO_AGNO(mp, *ap->firstblock) == XFS_FSB_TO_AGNO(mp, args.fsbno) || - (ap->flist->xbf_low && + (ap->dfops->dop_low && XFS_FSB_TO_AGNO(mp, *ap->firstblock) < XFS_FSB_TO_AGNO(mp, args.fsbno))); @@ -3869,7 +3848,7 @@ xfs_bmap_btalloc( if (*ap->firstblock == NULLFSBLOCK) *ap->firstblock = args.fsbno; ASSERT(nullfb || fb_agno == args.agno || - (ap->flist->xbf_low && fb_agno < args.agno)); + (ap->dfops->dop_low && fb_agno < args.agno)); ap->length = args.len; ap->ip->i_d.di_nblocks += args.len; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); @@ -4336,7 +4315,7 @@ xfs_bmapi_allocate( if (error) return error; - if (bma->flist->xbf_low) + if (bma->dfops->dop_low) bma->minleft = 0; if (bma->cur) bma->cur->bc_private.b.firstblock = *bma->firstblock; @@ -4345,7 +4324,7 @@ xfs_bmapi_allocate( if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); bma->cur->bc_private.b.firstblock = *bma->firstblock; - bma->cur->bc_private.b.flist = bma->flist; + bma->cur->bc_private.b.dfops = bma->dfops; } /* * Bump the number of extents we've allocated @@ -4426,7 +4405,7 @@ xfs_bmapi_convert_unwritten( bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp, bma->ip, whichfork); bma->cur->bc_private.b.firstblock = *bma->firstblock; - bma->cur->bc_private.b.flist = bma->flist; + bma->cur->bc_private.b.dfops = bma->dfops; } mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN; @@ -4443,7 +4422,7 @@ xfs_bmapi_convert_unwritten( } error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, - &bma->cur, mval, bma->firstblock, bma->flist, + &bma->cur, mval, bma->firstblock, bma->dfops, &tmp_logflags); /* * Log the inode core unconditionally in the unwritten extent conversion @@ -4497,7 +4476,7 @@ xfs_bmapi_write( xfs_extlen_t total, /* total blocks needed */ struct xfs_bmbt_irec *mval, /* output: map values */ int *nmap, /* i/o: mval size/count */ - struct xfs_bmap_free *flist) /* i/o: list extents to free */ + struct xfs_defer_ops *dfops) /* i/o: list extents to free */ { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; @@ -4587,7 +4566,7 @@ xfs_bmapi_write( bma.ip = ip; bma.total = total; bma.userdata = 0; - bma.flist = flist; + bma.dfops = dfops; bma.firstblock = firstblock; while (bno < end && n < *nmap) { @@ -4701,7 +4680,7 @@ error0: XFS_FSB_TO_AGNO(mp, *firstblock) == XFS_FSB_TO_AGNO(mp, bma.cur->bc_private.b.firstblock) || - (flist->xbf_low && + (dfops->dop_low && XFS_FSB_TO_AGNO(mp, *firstblock) < XFS_FSB_TO_AGNO(mp, bma.cur->bc_private.b.firstblock))); @@ -4785,7 +4764,7 @@ xfs_bmap_del_extent( xfs_inode_t *ip, /* incore inode pointer */ xfs_trans_t *tp, /* current transaction pointer */ xfs_extnum_t *idx, /* extent number to update/delete */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ + struct xfs_defer_ops *dfops, /* list of extents to be freed */ xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *del, /* data to remove from extents */ int *logflagsp, /* inode logging flags */ @@ -4887,6 +4866,7 @@ xfs_bmap_del_extent( nblks = 0; do_fx = 0; } + /* * Set flag value to use in switch statement. * Left-contig is 2, right-contig is 1. @@ -5069,12 +5049,20 @@ xfs_bmap_del_extent( ++*idx; break; } + + /* remove reverse mapping */ + if (!delay) { + error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, del); + if (error) + goto done; + } + /* * If we need to, add to list of extents to delete. */ if (do_fx) - xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, - mp); + xfs_bmap_add_free(mp, dfops, del->br_startblock, + del->br_blockcount, NULL); /* * Adjust inode # blocks in the file. */ @@ -5114,7 +5102,7 @@ xfs_bunmapi( xfs_extnum_t nexts, /* number of extents max */ xfs_fsblock_t *firstblock, /* first allocated block controls a.g. for allocs */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ + struct xfs_defer_ops *dfops, /* i/o: list extents to free */ int *done) /* set if not done yet */ { xfs_btree_cur_t *cur; /* bmap btree cursor */ @@ -5187,7 +5175,7 @@ xfs_bunmapi( ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; - cur->bc_private.b.flist = flist; + cur->bc_private.b.dfops = dfops; cur->bc_private.b.flags = 0; } else cur = NULL; @@ -5196,8 +5184,10 @@ xfs_bunmapi( /* * Synchronize by locking the bitmap inode. */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); + xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); } extno = 0; @@ -5279,7 +5269,7 @@ xfs_bunmapi( } del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, ip, - &lastx, &cur, &del, firstblock, flist, + &lastx, &cur, &del, firstblock, dfops, &logflags); if (error) goto error0; @@ -5338,7 +5328,7 @@ xfs_bunmapi( lastx--; error = xfs_bmap_add_extent_unwritten_real(tp, ip, &lastx, &cur, &prev, - firstblock, flist, &logflags); + firstblock, dfops, &logflags); if (error) goto error0; goto nodelete; @@ -5347,7 +5337,7 @@ xfs_bunmapi( del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, ip, &lastx, &cur, &del, - firstblock, flist, &logflags); + firstblock, dfops, &logflags); if (error) goto error0; goto nodelete; @@ -5405,7 +5395,7 @@ xfs_bunmapi( } else if (cur) cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; - error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del, + error = xfs_bmap_del_extent(ip, tp, &lastx, dfops, cur, &del, &tmp_logflags, whichfork); logflags |= tmp_logflags; if (error) @@ -5439,7 +5429,7 @@ nodelete: */ if (xfs_bmap_needs_btree(ip, whichfork)) { ASSERT(cur == NULL); - error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, + error = xfs_bmap_extents_to_btree(tp, ip, firstblock, dfops, &cur, 0, &tmp_logflags, whichfork); logflags |= tmp_logflags; if (error) @@ -5606,7 +5596,8 @@ xfs_bmse_shift_one( struct xfs_bmbt_rec_host *gotp, struct xfs_btree_cur *cur, int *logflags, - enum shift_direction direction) + enum shift_direction direction, + struct xfs_defer_ops *dfops) { struct xfs_ifork *ifp; struct xfs_mount *mp; @@ -5654,9 +5645,13 @@ xfs_bmse_shift_one( /* check whether to merge the extent or shift it down */ if (xfs_bmse_can_merge(&adj_irec, &got, offset_shift_fsb)) { - return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, - *current_ext, gotp, adj_irecp, - cur, logflags); + error = xfs_bmse_merge(ip, whichfork, offset_shift_fsb, + *current_ext, gotp, adj_irecp, + cur, logflags); + if (error) + return error; + adj_irec = got; + goto update_rmap; } } else { startoff = got.br_startoff + offset_shift_fsb; @@ -5693,9 +5688,10 @@ update_current_ext: (*current_ext)--; xfs_bmbt_set_startoff(gotp, startoff); *logflags |= XFS_ILOG_CORE; + adj_irec = got; if (!cur) { *logflags |= XFS_ILOG_DEXT; - return 0; + goto update_rmap; } error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, @@ -5705,8 +5701,18 @@ update_current_ext: XFS_WANT_CORRUPTED_RETURN(mp, i == 1); got.br_startoff = startoff; - return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, - got.br_blockcount, got.br_state); + error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, + got.br_blockcount, got.br_state); + if (error) + return error; + +update_rmap: + /* update reverse mapping */ + error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, &adj_irec); + if (error) + return error; + adj_irec.br_startoff = startoff; + return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &adj_irec); } /* @@ -5728,7 +5734,7 @@ xfs_bmap_shift_extents( int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, - struct xfs_bmap_free *flist, + struct xfs_defer_ops *dfops, enum shift_direction direction, int num_exts) { @@ -5773,7 +5779,7 @@ xfs_bmap_shift_extents( if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; - cur->bc_private.b.flist = flist; + cur->bc_private.b.dfops = dfops; cur->bc_private.b.flags = 0; } @@ -5834,7 +5840,7 @@ xfs_bmap_shift_extents( while (nexts++ < num_exts) { error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb, ¤t_ext, gotp, cur, &logflags, - direction); + direction, dfops); if (error) goto del_cursor; /* @@ -5882,7 +5888,7 @@ xfs_bmap_split_extent_at( struct xfs_inode *ip, xfs_fileoff_t split_fsb, xfs_fsblock_t *firstfsb, - struct xfs_bmap_free *free_list) + struct xfs_defer_ops *dfops) { int whichfork = XFS_DATA_FORK; struct xfs_btree_cur *cur = NULL; @@ -5944,7 +5950,7 @@ xfs_bmap_split_extent_at( if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstfsb; - cur->bc_private.b.flist = free_list; + cur->bc_private.b.dfops = dfops; cur->bc_private.b.flags = 0; error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, @@ -5997,7 +6003,7 @@ xfs_bmap_split_extent_at( int tmp_logflags; /* partial log flag return val */ ASSERT(cur == NULL); - error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, free_list, + error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, dfops, &cur, 0, &tmp_logflags, whichfork); logflags |= tmp_logflags; } @@ -6021,7 +6027,7 @@ xfs_bmap_split_extent( { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; - struct xfs_bmap_free free_list; + struct xfs_defer_ops dfops; xfs_fsblock_t firstfsb; int error; @@ -6033,21 +6039,21 @@ xfs_bmap_split_extent( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_bmap_init(&free_list, &firstfsb); + xfs_defer_init(&dfops, &firstfsb); error = xfs_bmap_split_extent_at(tp, ip, split_fsb, - &firstfsb, &free_list); + &firstfsb, &dfops); if (error) goto out; - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto out; return xfs_trans_commit(tp); out: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 423a34e83..254034f96 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -32,7 +32,7 @@ extern kmem_zone_t *xfs_bmap_free_item_zone; */ struct xfs_bmalloca { xfs_fsblock_t *firstblock; /* i/o first block allocated */ - struct xfs_bmap_free *flist; /* bmap freelist */ + struct xfs_defer_ops *dfops; /* bmap freelist */ struct xfs_trans *tp; /* transaction pointer */ struct xfs_inode *ip; /* incore inode pointer */ struct xfs_bmbt_irec prev; /* extent before the new one */ @@ -62,33 +62,13 @@ struct xfs_bmalloca { * List of extents to be free "later". * The list is kept sorted on xbf_startblock. */ -typedef struct xfs_bmap_free_item +struct xfs_extent_free_item { - xfs_fsblock_t xbfi_startblock;/* starting fs block number */ - xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */ - struct xfs_bmap_free_item *xbfi_next; /* link to next entry */ -} xfs_bmap_free_item_t; - -/* - * Header for free extent list. - * - * xbf_low is used by the allocator to activate the lowspace algorithm - - * when free space is running low the extent allocator may choose to - * allocate an extent from an AG without leaving sufficient space for - * a btree split when inserting the new extent. In this case the allocator - * will enable the lowspace algorithm which is supposed to allow further - * allocations (such as btree splits and newroots) to allocate from - * sequential AGs. In order to avoid locking AGs out of order the lowspace - * algorithm will start searching for free space from AG 0. If the correct - * transaction reservations have been made then this algorithm will eventually - * find all the space it needs. - */ -typedef struct xfs_bmap_free -{ - xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */ - int xbf_count; /* count of items on list */ - int xbf_low; /* alloc in low mode */ -} xfs_bmap_free_t; + xfs_fsblock_t xefi_startblock;/* starting fs block number */ + xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ + struct list_head xefi_list; + struct xfs_owner_info xefi_oinfo; /* extent owner */ +}; #define XFS_BMAP_MAX_NMAP 4 @@ -139,12 +119,6 @@ static inline int xfs_bmapi_aflag(int w) #define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL) #define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL) -static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) -{ - ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \ - (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK); -} - /* * Flags for xfs_bmap_add_extent*. */ @@ -191,11 +165,9 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); -void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, - struct xfs_bmap_free *flist, struct xfs_mount *mp); -void xfs_bmap_cancel(struct xfs_bmap_free *flist); -int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, - struct xfs_inode *ip); +void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + xfs_fsblock_t bno, xfs_filblks_t len, + struct xfs_owner_info *oinfo); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); @@ -216,18 +188,18 @@ int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, int flags, xfs_fsblock_t *firstblock, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap, - struct xfs_bmap_free *flist); + struct xfs_defer_ops *dfops); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, int flags, xfs_extnum_t nexts, xfs_fsblock_t *firstblock, - struct xfs_bmap_free *flist, int *done); + struct xfs_defer_ops *dfops, int *done); int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, - struct xfs_bmap_free *flist, enum shift_direction direction, + struct xfs_defer_ops *dfops, enum shift_direction direction, int num_exts); int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 6282f6e70..cd85274e8 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -23,6 +23,7 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_inode_item.h" @@ -34,6 +35,7 @@ #include "xfs_quota.h" #include "xfs_trace.h" #include "xfs_cksum.h" +#include "xfs_rmap.h" /* * Determine the extent state. @@ -406,11 +408,11 @@ xfs_bmbt_dup_cursor( cur->bc_private.b.ip, cur->bc_private.b.whichfork); /* - * Copy the firstblock, flist, and flags values, + * Copy the firstblock, dfops, and flags values, * since init cursor doesn't get them. */ new->bc_private.b.firstblock = cur->bc_private.b.firstblock; - new->bc_private.b.flist = cur->bc_private.b.flist; + new->bc_private.b.dfops = cur->bc_private.b.dfops; new->bc_private.b.flags = cur->bc_private.b.flags; return new; @@ -423,7 +425,7 @@ xfs_bmbt_update_cursor( { ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) || (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME)); - ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist); + ASSERT(dst->bc_private.b.dfops == src->bc_private.b.dfops); dst->bc_private.b.allocated += src->bc_private.b.allocated; dst->bc_private.b.firstblock = src->bc_private.b.firstblock; @@ -446,6 +448,8 @@ xfs_bmbt_alloc_block( args.mp = cur->bc_mp; args.fsbno = cur->bc_private.b.firstblock; args.firstblock = args.fsbno; + xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_private.b.ip->i_ino, + cur->bc_private.b.whichfork); if (args.fsbno == NULLFSBLOCK) { args.fsbno = be64_to_cpu(start->l); @@ -462,7 +466,7 @@ xfs_bmbt_alloc_block( * block allocation here and corrupt the filesystem. */ args.minleft = args.tp->t_blk_res; - } else if (cur->bc_private.b.flist->xbf_low) { + } else if (cur->bc_private.b.dfops->dop_low) { args.type = XFS_ALLOCTYPE_START_BNO; } else { args.type = XFS_ALLOCTYPE_NEAR_BNO; @@ -490,7 +494,7 @@ xfs_bmbt_alloc_block( error = xfs_alloc_vextent(&args); if (error) goto error0; - cur->bc_private.b.flist->xbf_low = 1; + cur->bc_private.b.dfops->dop_low = true; } if (args.fsbno == NULLFSBLOCK) { XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); @@ -525,8 +529,10 @@ xfs_bmbt_free_block( struct xfs_inode *ip = cur->bc_private.b.ip; struct xfs_trans *tp = cur->bc_tp; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); + struct xfs_owner_info oinfo; - xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp); + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_private.b.whichfork); + xfs_bmap_add_free(mp, cur->bc_private.b.dfops, fsbno, 1, &oinfo); ip->i_d.di_nblocks--; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -599,17 +605,6 @@ xfs_bmbt_init_key_from_rec( cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt)); } -STATIC void -xfs_bmbt_init_rec_from_key( - union xfs_btree_key *key, - union xfs_btree_rec *rec) -{ - ASSERT(key->bmbt.br_startoff != 0); - - xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff), - 0, 0, XFS_EXT_NORM); -} - STATIC void xfs_bmbt_init_rec_from_cur( struct xfs_btree_cur *cur, @@ -760,7 +755,6 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .get_minrecs = xfs_bmbt_get_minrecs, .get_dmaxrecs = xfs_bmbt_get_dmaxrecs, .init_key_from_rec = xfs_bmbt_init_key_from_rec, - .init_rec_from_key = xfs_bmbt_init_rec_from_key, .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, @@ -800,7 +794,7 @@ xfs_bmbt_init_cursor( cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); cur->bc_private.b.ip = ip; cur->bc_private.b.firstblock = NULLFSBLOCK; - cur->bc_private.b.flist = NULL; + cur->bc_private.b.dfops = NULL; cur->bc_private.b.allocated = 0; cur->bc_private.b.flags = 0; cur->bc_private.b.whichfork = whichfork; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 1f88e1ce7..08569792f 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -23,6 +23,7 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_inode_item.h" @@ -43,15 +44,14 @@ kmem_zone_t *xfs_btree_cur_zone; * Btree magic numbers. */ static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { - { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, + { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, XFS_FIBT_MAGIC }, - { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, + { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC, XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC } }; #define xfs_btree_magic(cur) \ xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] - STATIC int /* error (0 or EFSCORRUPTED) */ xfs_btree_check_lblock( struct xfs_btree_cur *cur, /* btree cursor */ @@ -428,6 +428,50 @@ xfs_btree_dup_cursor( * into a btree block (xfs_btree_*_offset) or return a pointer to the given * record, key or pointer (xfs_btree_*_addr). Note that all addressing * inside the btree block is done using indices starting at one, not zero! + * + * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing + * overlapping intervals. In such a tree, records are still sorted lowest to + * highest and indexed by the smallest key value that refers to the record. + * However, nodes are different: each pointer has two associated keys -- one + * indexing the lowest key available in the block(s) below (the same behavior + * as the key in a regular btree) and another indexing the highest key + * available in the block(s) below. Because records are /not/ sorted by the + * highest key, all leaf block updates require us to compute the highest key + * that matches any record in the leaf and to recursively update the high keys + * in the nodes going further up in the tree, if necessary. Nodes look like + * this: + * + * +--------+-----+-----+-----+-----+-----+-------+-------+-----+ + * Non-Leaf: | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... | + * +--------+-----+-----+-----+-----+-----+-------+-------+-----+ + * + * To perform an interval query on an overlapped tree, perform the usual + * depth-first search and use the low and high keys to decide if we can skip + * that particular node. If a leaf node is reached, return the records that + * intersect the interval. Note that an interval query may return numerous + * entries. For a non-overlapped tree, simply search for the record associated + * with the lowest key and iterate forward until a non-matching record is + * found. Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by + * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in + * more detail. + * + * Why do we care about overlapping intervals? Let's say you have a bunch of + * reverse mapping records on a reflink filesystem: + * + * 1: +- file A startblock B offset C length D -----------+ + * 2: +- file E startblock F offset G length H --------------+ + * 3: +- file I startblock F offset J length K --+ + * 4: +- file L... --+ + * + * Now say we want to map block (B+D) into file A at offset (C+D). Ideally, + * we'd simply increment the length of record 1. But how do we find the record + * that ends at (B+D-1) (i.e. record 1)? A LE lookup of (B+D-1) would return + * record 3 because the keys are ordered first by startblock. An interval + * query would return records 1 and 2 because they both overlap (B+D-1), and + * from that we can pick out record 1 as the appropriate left neighbor. + * + * In the non-overlapped case you can do a LE lookup and decrement the cursor + * because a record's interval must end before the next record. */ /* @@ -478,6 +522,18 @@ xfs_btree_key_offset( (n - 1) * cur->bc_ops->key_len; } +/* + * Calculate offset of the n-th high key in a btree block. + */ +STATIC size_t +xfs_btree_high_key_offset( + struct xfs_btree_cur *cur, + int n) +{ + return xfs_btree_block_len(cur) + + (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2); +} + /* * Calculate offset of the n-th block pointer in a btree block. */ @@ -518,6 +574,19 @@ xfs_btree_key_addr( ((char *)block + xfs_btree_key_offset(cur, n)); } +/* + * Return a pointer to the n-th high key in the btree block. + */ +STATIC union xfs_btree_key * +xfs_btree_high_key_addr( + struct xfs_btree_cur *cur, + int n, + struct xfs_btree_block *block) +{ + return (union xfs_btree_key *) + ((char *)block + xfs_btree_high_key_offset(cur, n)); +} + /* * Return a pointer to the n-th block pointer in the btree block. */ @@ -543,12 +612,12 @@ xfs_btree_ptr_addr( */ STATIC struct xfs_btree_block * xfs_btree_get_iroot( - struct xfs_btree_cur *cur) + struct xfs_btree_cur *cur) { - struct xfs_ifork *ifp; + struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); - return (struct xfs_btree_block *)ifp->if_broot; + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); + return (struct xfs_btree_block *)ifp->if_broot; } /* @@ -1144,6 +1213,9 @@ xfs_btree_set_refs( case XFS_BTNUM_BMAP: xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); break; + case XFS_BTNUM_RMAP: + xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF); + break; default: ASSERT(0); } @@ -1742,6 +1814,10 @@ xfs_btree_lookup( XFS_BTREE_STATS_INC(cur, lookup); + /* No such thing as a zero-level tree. */ + if (cur->bc_nlevels == 0) + return -EFSCORRUPTED; + block = NULL; keyno = 0; @@ -1879,32 +1955,214 @@ error0: return error; } +/* Find the high key storage area from a regular key. */ +STATIC union xfs_btree_key * +xfs_btree_high_key_from_key( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING); + return (union xfs_btree_key *)((char *)key + + (cur->bc_ops->key_len / 2)); +} + +/* Determine the low (and high if overlapped) keys of a leaf block */ +STATIC void +xfs_btree_get_leaf_keys( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_key *key) +{ + union xfs_btree_key max_hkey; + union xfs_btree_key hkey; + union xfs_btree_rec *rec; + union xfs_btree_key *high; + int n; + + rec = xfs_btree_rec_addr(cur, 1, block); + cur->bc_ops->init_key_from_rec(key, rec); + + if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + + cur->bc_ops->init_high_key_from_rec(&max_hkey, rec); + for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { + rec = xfs_btree_rec_addr(cur, n, block); + cur->bc_ops->init_high_key_from_rec(&hkey, rec); + if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey) + > 0) + max_hkey = hkey; + } + + high = xfs_btree_high_key_from_key(cur, key); + memcpy(high, &max_hkey, cur->bc_ops->key_len / 2); + } +} + +/* Determine the low (and high if overlapped) keys of a node block */ +STATIC void +xfs_btree_get_node_keys( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_key *key) +{ + union xfs_btree_key *hkey; + union xfs_btree_key *max_hkey; + union xfs_btree_key *high; + int n; + + if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + memcpy(key, xfs_btree_key_addr(cur, 1, block), + cur->bc_ops->key_len / 2); + + max_hkey = xfs_btree_high_key_addr(cur, 1, block); + for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { + hkey = xfs_btree_high_key_addr(cur, n, block); + if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0) + max_hkey = hkey; + } + + high = xfs_btree_high_key_from_key(cur, key); + memcpy(high, max_hkey, cur->bc_ops->key_len / 2); + } else { + memcpy(key, xfs_btree_key_addr(cur, 1, block), + cur->bc_ops->key_len); + } +} + +/* Derive the keys for any btree block. */ +STATIC void +xfs_btree_get_keys( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_key *key) +{ + if (be16_to_cpu(block->bb_level) == 0) + xfs_btree_get_leaf_keys(cur, block, key); + else + xfs_btree_get_node_keys(cur, block, key); +} + +/* + * Decide if we need to update the parent keys of a btree block. For + * a standard btree this is only necessary if we're updating the first + * record/key. For an overlapping btree, we must always update the + * keys because the highest key can be in any of the records or keys + * in the block. + */ +static inline bool +xfs_btree_needs_key_update( + struct xfs_btree_cur *cur, + int ptr) +{ + return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1; +} + +/* + * Update the low and high parent keys of the given level, progressing + * towards the root. If force_all is false, stop if the keys for a given + * level do not need updating. + */ +STATIC int +__xfs_btree_updkeys( + struct xfs_btree_cur *cur, + int level, + struct xfs_btree_block *block, + struct xfs_buf *bp0, + bool force_all) +{ + union xfs_btree_bigkey key; /* keys from current level */ + union xfs_btree_key *lkey; /* keys from the next level up */ + union xfs_btree_key *hkey; + union xfs_btree_key *nlkey; /* keys from the next level up */ + union xfs_btree_key *nhkey; + struct xfs_buf *bp; + int ptr; + + ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING); + + /* Exit if there aren't any parent levels to update. */ + if (level + 1 >= cur->bc_nlevels) + return 0; + + trace_xfs_btree_updkeys(cur, level, bp0); + + lkey = (union xfs_btree_key *)&key; + hkey = xfs_btree_high_key_from_key(cur, lkey); + xfs_btree_get_keys(cur, block, lkey); + for (level++; level < cur->bc_nlevels; level++) { +#ifdef DEBUG + int error; +#endif + block = xfs_btree_get_block(cur, level, &bp); + trace_xfs_btree_updkeys(cur, level, bp); +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } +#endif + ptr = cur->bc_ptrs[level]; + nlkey = xfs_btree_key_addr(cur, ptr, block); + nhkey = xfs_btree_high_key_addr(cur, ptr, block); + if (!force_all && + !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 || + cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0)) + break; + xfs_btree_copy_keys(cur, nlkey, lkey, 1); + xfs_btree_log_keys(cur, bp, ptr, ptr); + if (level + 1 >= cur->bc_nlevels) + break; + xfs_btree_get_node_keys(cur, block, lkey); + } + + return 0; +} + +/* Update all the keys from some level in cursor back to the root. */ +STATIC int +xfs_btree_updkeys_force( + struct xfs_btree_cur *cur, + int level) +{ + struct xfs_buf *bp; + struct xfs_btree_block *block; + + block = xfs_btree_get_block(cur, level, &bp); + return __xfs_btree_updkeys(cur, level, block, bp, true); +} + /* - * Update keys at all levels from here to the root along the cursor's path. + * Update the parent keys of the given level, progressing towards the root. */ STATIC int -xfs_btree_updkey( +xfs_btree_update_keys( struct xfs_btree_cur *cur, - union xfs_btree_key *keyp, int level) { struct xfs_btree_block *block; struct xfs_buf *bp; union xfs_btree_key *kp; + union xfs_btree_key key; int ptr; + ASSERT(level >= 0); + + block = xfs_btree_get_block(cur, level, &bp); + if (cur->bc_flags & XFS_BTREE_OVERLAPPING) + return __xfs_btree_updkeys(cur, level, block, bp, false); + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); XFS_BTREE_TRACE_ARGIK(cur, level, keyp); - ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1); - /* * Go up the tree from this level toward the root. * At each level, update the key value to the value input. * Stop when we reach a level where the cursor isn't pointing * at the first entry in the block. */ - for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { + xfs_btree_get_keys(cur, block, &key); + for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { #ifdef DEBUG int error; #endif @@ -1918,7 +2176,7 @@ xfs_btree_updkey( #endif ptr = cur->bc_ptrs[level]; kp = xfs_btree_key_addr(cur, ptr, block); - xfs_btree_copy_keys(cur, kp, keyp, 1); + xfs_btree_copy_keys(cur, kp, &key, 1); xfs_btree_log_keys(cur, bp, ptr, ptr); } @@ -1970,12 +2228,9 @@ xfs_btree_update( ptr, LASTREC_UPDATE); } - /* Updating first rec in leaf. Pass new key value up to our parent. */ - if (ptr == 1) { - union xfs_btree_key key; - - cur->bc_ops->init_key_from_rec(&key, rec); - error = xfs_btree_updkey(cur, &key, 1); + /* Pass new key value up to our parent. */ + if (xfs_btree_needs_key_update(cur, ptr)) { + error = xfs_btree_update_keys(cur, 0); if (error) goto error0; } @@ -1998,18 +2253,19 @@ xfs_btree_lshift( int level, int *stat) /* success/failure */ { - union xfs_btree_key key; /* btree key */ struct xfs_buf *lbp; /* left buffer pointer */ struct xfs_btree_block *left; /* left btree block */ int lrecs; /* left record count */ struct xfs_buf *rbp; /* right buffer pointer */ struct xfs_btree_block *right; /* right btree block */ + struct xfs_btree_cur *tcur; /* temporary btree cursor */ int rrecs; /* right record count */ union xfs_btree_ptr lptr; /* left btree pointer */ union xfs_btree_key *rkp = NULL; /* right btree key */ union xfs_btree_ptr *rpp = NULL; /* right address pointer */ union xfs_btree_rec *rrp = NULL; /* right record pointer */ int error; /* error return value */ + int i; XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); XFS_BTREE_TRACE_ARGI(cur, level); @@ -2139,18 +2395,33 @@ xfs_btree_lshift( xfs_btree_rec_addr(cur, 2, right), -1, rrecs); xfs_btree_log_recs(cur, rbp, 1, rrecs); + } - /* - * If it's the first record in the block, we'll need a key - * structure to pass up to the next level (updkey). - */ - cur->bc_ops->init_key_from_rec(&key, - xfs_btree_rec_addr(cur, 1, right)); - rkp = &key; + /* + * Using a temporary cursor, update the parent key values of the + * block on the left. + */ + if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + error = xfs_btree_dup_cursor(cur, &tcur); + if (error) + goto error0; + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0); + + error = xfs_btree_decrement(tcur, level, &i); + if (error) + goto error1; + + /* Update the parent high keys of the left block, if needed. */ + error = xfs_btree_update_keys(tcur, level); + if (error) + goto error1; + + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); } - /* Update the parent key values of right. */ - error = xfs_btree_updkey(cur, rkp, level + 1); + /* Update the parent keys of the right block. */ + error = xfs_btree_update_keys(cur, level); if (error) goto error0; @@ -2169,6 +2440,11 @@ out0: error0: XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; + +error1: + XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR); + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; } /* @@ -2181,7 +2457,6 @@ xfs_btree_rshift( int level, int *stat) /* success/failure */ { - union xfs_btree_key key; /* btree key */ struct xfs_buf *lbp; /* left buffer pointer */ struct xfs_btree_block *left; /* left btree block */ struct xfs_buf *rbp; /* right buffer pointer */ @@ -2290,12 +2565,6 @@ xfs_btree_rshift( /* Now put the new data in, and log it. */ xfs_btree_copy_recs(cur, rrp, lrp, 1); xfs_btree_log_recs(cur, rbp, 1, rrecs + 1); - - cur->bc_ops->init_key_from_rec(&key, rrp); - rkp = &key; - - ASSERT(cur->bc_ops->recs_inorder(cur, rrp, - xfs_btree_rec_addr(cur, 2, right))); } /* @@ -2315,13 +2584,21 @@ xfs_btree_rshift( if (error) goto error0; i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0); error = xfs_btree_increment(tcur, level, &i); if (error) goto error1; - error = xfs_btree_updkey(tcur, rkp, level + 1); + /* Update the parent high keys of the left block, if needed. */ + if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + error = xfs_btree_update_keys(cur, level); + if (error) + goto error1; + } + + /* Update the parent keys of the right block. */ + error = xfs_btree_update_keys(tcur, level); if (error) goto error1; @@ -2422,6 +2699,11 @@ __xfs_btree_split( XFS_BTREE_STATS_ADD(cur, moves, rrecs); + /* Adjust numrecs for the later get_*_keys() calls. */ + lrecs -= rrecs; + xfs_btree_set_numrecs(left, lrecs); + xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs); + /* * Copy btree block entries from the left block over to the * new block, the right. Update the right block and log the @@ -2447,14 +2729,15 @@ __xfs_btree_split( } #endif + /* Copy the keys & pointers to the new block. */ xfs_btree_copy_keys(cur, rkp, lkp, rrecs); xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs); xfs_btree_log_keys(cur, rbp, 1, rrecs); xfs_btree_log_ptrs(cur, rbp, 1, rrecs); - /* Grab the keys to the entries moved to the right block */ - xfs_btree_copy_keys(cur, key, rkp, 1); + /* Stash the keys of the new block for later insertion. */ + xfs_btree_get_node_keys(cur, right, key); } else { /* It's a leaf. Move records. */ union xfs_btree_rec *lrp; /* left record pointer */ @@ -2463,27 +2746,23 @@ __xfs_btree_split( lrp = xfs_btree_rec_addr(cur, src_index, left); rrp = xfs_btree_rec_addr(cur, 1, right); + /* Copy records to the new block. */ xfs_btree_copy_recs(cur, rrp, lrp, rrecs); xfs_btree_log_recs(cur, rbp, 1, rrecs); - cur->bc_ops->init_key_from_rec(key, - xfs_btree_rec_addr(cur, 1, right)); + /* Stash the keys of the new block for later insertion. */ + xfs_btree_get_leaf_keys(cur, right, key); } - /* * Find the left block number by looking in the buffer. - * Adjust numrecs, sibling pointers. + * Adjust sibling pointers. */ xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB); xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB); xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); - lrecs -= rrecs; - xfs_btree_set_numrecs(left, lrecs); - xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs); - xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS); xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); @@ -2499,6 +2778,14 @@ __xfs_btree_split( xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB); xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); } + + /* Update the parent high keys of the left block, if needed. */ + if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + error = xfs_btree_update_keys(cur, level); + if (error) + goto error0; + } + /* * If the cursor is really in the right block, move it there. * If it's just pointing past the last entry in left, then we'll @@ -2802,6 +3089,7 @@ xfs_btree_new_root( bp = lbp; nptr = 2; } + /* Fill in the new block's btree header and log it. */ xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2); xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); @@ -2810,19 +3098,24 @@ xfs_btree_new_root( /* Fill in the key data in the new root. */ if (xfs_btree_get_level(left) > 0) { - xfs_btree_copy_keys(cur, - xfs_btree_key_addr(cur, 1, new), - xfs_btree_key_addr(cur, 1, left), 1); - xfs_btree_copy_keys(cur, - xfs_btree_key_addr(cur, 2, new), - xfs_btree_key_addr(cur, 1, right), 1); + /* + * Get the keys for the left block's keys and put them directly + * in the parent block. Do the same for the right block. + */ + xfs_btree_get_node_keys(cur, left, + xfs_btree_key_addr(cur, 1, new)); + xfs_btree_get_node_keys(cur, right, + xfs_btree_key_addr(cur, 2, new)); } else { - cur->bc_ops->init_key_from_rec( - xfs_btree_key_addr(cur, 1, new), - xfs_btree_rec_addr(cur, 1, left)); - cur->bc_ops->init_key_from_rec( - xfs_btree_key_addr(cur, 2, new), - xfs_btree_rec_addr(cur, 1, right)); + /* + * Get the keys for the left block's records and put them + * directly in the parent block. Do the same for the right + * block. + */ + xfs_btree_get_leaf_keys(cur, left, + xfs_btree_key_addr(cur, 1, new)); + xfs_btree_get_leaf_keys(cur, right, + xfs_btree_key_addr(cur, 2, new)); } xfs_btree_log_keys(cur, nbp, 1, 2); @@ -2858,10 +3151,9 @@ xfs_btree_make_block_unfull( int *index, /* new tree index */ union xfs_btree_ptr *nptr, /* new btree ptr */ struct xfs_btree_cur **ncur, /* new btree cursor */ - union xfs_btree_rec *nrec, /* new record */ + union xfs_btree_key *key, /* key of new block */ int *stat) { - union xfs_btree_key key; /* new btree key value */ int error = 0; if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && @@ -2871,6 +3163,7 @@ xfs_btree_make_block_unfull( if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { /* A root block that can be made bigger. */ xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork); + *stat = 1; } else { /* A root block that needs replacing */ int logflags = 0; @@ -2906,13 +3199,12 @@ xfs_btree_make_block_unfull( * If this works we have to re-set our variables because we * could be in a different block now. */ - error = xfs_btree_split(cur, level, nptr, &key, ncur, stat); + error = xfs_btree_split(cur, level, nptr, key, ncur, stat); if (error || *stat == 0) return error; *index = cur->bc_ptrs[level]; - cur->bc_ops->init_rec_from_key(&key, nrec); return 0; } @@ -2925,16 +3217,17 @@ xfs_btree_insrec( struct xfs_btree_cur *cur, /* btree cursor */ int level, /* level to insert record at */ union xfs_btree_ptr *ptrp, /* i/o: block number inserted */ - union xfs_btree_rec *recp, /* i/o: record data inserted */ + union xfs_btree_rec *rec, /* record to insert */ + union xfs_btree_key *key, /* i/o: block key for ptrp */ struct xfs_btree_cur **curp, /* output: new cursor replacing cur */ int *stat) /* success/failure */ { struct xfs_btree_block *block; /* btree block */ struct xfs_buf *bp; /* buffer for block */ - union xfs_btree_key key; /* btree key */ union xfs_btree_ptr nptr; /* new block ptr */ struct xfs_btree_cur *ncur; /* new btree cursor */ - union xfs_btree_rec nrec; /* new record count */ + union xfs_btree_bigkey nkey; /* new block key */ + union xfs_btree_key *lkey; int optr; /* old key/record index */ int ptr; /* key/record index */ int numrecs;/* number of records */ @@ -2942,11 +3235,13 @@ xfs_btree_insrec( #ifdef DEBUG int i; #endif + xfs_daddr_t old_bn; XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp); + XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec); ncur = NULL; + lkey = (union xfs_btree_key *)&nkey; /* * If we have an external root pointer, and we've made it to the @@ -2969,15 +3264,13 @@ xfs_btree_insrec( return 0; } - /* Make a key out of the record data to be inserted, and save it. */ - cur->bc_ops->init_key_from_rec(&key, recp); - optr = ptr; XFS_BTREE_STATS_INC(cur, insrec); /* Get pointers to the btree buffer and block. */ block = xfs_btree_get_block(cur, level, &bp); + old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL; numrecs = xfs_btree_get_numrecs(block); #ifdef DEBUG @@ -2988,10 +3281,10 @@ xfs_btree_insrec( /* Check that the new entry is being inserted in the right place. */ if (ptr <= numrecs) { if (level == 0) { - ASSERT(cur->bc_ops->recs_inorder(cur, recp, + ASSERT(cur->bc_ops->recs_inorder(cur, rec, xfs_btree_rec_addr(cur, ptr, block))); } else { - ASSERT(cur->bc_ops->keys_inorder(cur, &key, + ASSERT(cur->bc_ops->keys_inorder(cur, key, xfs_btree_key_addr(cur, ptr, block))); } } @@ -3004,7 +3297,7 @@ xfs_btree_insrec( xfs_btree_set_ptr_null(cur, &nptr); if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) { error = xfs_btree_make_block_unfull(cur, level, numrecs, - &optr, &ptr, &nptr, &ncur, &nrec, stat); + &optr, &ptr, &nptr, &ncur, lkey, stat); if (error || *stat == 0) goto error0; } @@ -3054,7 +3347,7 @@ xfs_btree_insrec( #endif /* Now put the new data in, bump numrecs and log it. */ - xfs_btree_copy_keys(cur, kp, &key, 1); + xfs_btree_copy_keys(cur, kp, key, 1); xfs_btree_copy_ptrs(cur, pp, ptrp, 1); numrecs++; xfs_btree_set_numrecs(block, numrecs); @@ -3075,7 +3368,7 @@ xfs_btree_insrec( xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1); /* Now put the new data in, bump numrecs and log it. */ - xfs_btree_copy_recs(cur, rp, recp, 1); + xfs_btree_copy_recs(cur, rp, rec, 1); xfs_btree_set_numrecs(block, ++numrecs); xfs_btree_log_recs(cur, bp, ptr, numrecs); #ifdef DEBUG @@ -3089,9 +3382,18 @@ xfs_btree_insrec( /* Log the new number of records in the btree header. */ xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); - /* If we inserted at the start of a block, update the parents' keys. */ - if (optr == 1) { - error = xfs_btree_updkey(cur, &key, level + 1); + /* + * If we just inserted into a new tree block, we have to + * recalculate nkey here because nkey is out of date. + * + * Otherwise we're just updating an existing block (having shoved + * some records into the new tree block), so use the regular key + * update mechanism. + */ + if (bp && bp->b_bn != old_bn) { + xfs_btree_get_keys(cur, block, lkey); + } else if (xfs_btree_needs_key_update(cur, optr)) { + error = xfs_btree_update_keys(cur, level); if (error) goto error0; } @@ -3101,7 +3403,7 @@ xfs_btree_insrec( * we are at the far right edge of the tree, update it. */ if (xfs_btree_is_lastrec(cur, block, level)) { - cur->bc_ops->update_lastrec(cur, block, recp, + cur->bc_ops->update_lastrec(cur, block, rec, ptr, LASTREC_INSREC); } @@ -3111,7 +3413,7 @@ xfs_btree_insrec( */ *ptrp = nptr; if (!xfs_btree_ptr_is_null(cur, &nptr)) { - *recp = nrec; + xfs_btree_copy_keys(cur, key, lkey, 1); *curp = ncur; } @@ -3142,14 +3444,20 @@ xfs_btree_insert( union xfs_btree_ptr nptr; /* new block number (split result) */ struct xfs_btree_cur *ncur; /* new cursor (split result) */ struct xfs_btree_cur *pcur; /* previous level's cursor */ + union xfs_btree_bigkey bkey; /* key of block to insert */ + union xfs_btree_key *key; union xfs_btree_rec rec; /* record to insert */ level = 0; ncur = NULL; pcur = cur; + key = (union xfs_btree_key *)&bkey; xfs_btree_set_ptr_null(cur, &nptr); + + /* Make a key out of the record data to be inserted, and save it. */ cur->bc_ops->init_rec_from_cur(cur, &rec); + cur->bc_ops->init_key_from_rec(key, &rec); /* * Loop going up the tree, starting at the leaf level. @@ -3161,7 +3469,8 @@ xfs_btree_insert( * Insert nrec/nptr into this level of the tree. * Note if we fail, nptr will be null. */ - error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i); + error = xfs_btree_insrec(pcur, level, &nptr, &rec, key, + &ncur, &i); if (error) { if (pcur != cur) xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); @@ -3385,8 +3694,6 @@ xfs_btree_delrec( struct xfs_buf *bp; /* buffer for block */ int error; /* error return value */ int i; /* loop counter */ - union xfs_btree_key key; /* storage for keyp */ - union xfs_btree_key *keyp = &key; /* passed to the next level */ union xfs_btree_ptr lptr; /* left sibling block ptr */ struct xfs_buf *lbp; /* left buffer pointer */ struct xfs_btree_block *left; /* left btree block */ @@ -3457,13 +3764,6 @@ xfs_btree_delrec( xfs_btree_log_keys(cur, bp, ptr, numrecs - 1); xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1); } - - /* - * If it's the first record in the block, we'll need to pass a - * key up to the next level (updkey). - */ - if (ptr == 1) - keyp = xfs_btree_key_addr(cur, 1, block); } else { /* It's a leaf. operate on records */ if (ptr < numrecs) { @@ -3472,16 +3772,6 @@ xfs_btree_delrec( -1, numrecs - ptr); xfs_btree_log_recs(cur, bp, ptr, numrecs - 1); } - - /* - * If it's the first record in the block, we'll need a key - * structure to pass up to the next level (updkey). - */ - if (ptr == 1) { - cur->bc_ops->init_key_from_rec(&key, - xfs_btree_rec_addr(cur, 1, block)); - keyp = &key; - } } /* @@ -3548,8 +3838,8 @@ xfs_btree_delrec( * If we deleted the leftmost entry in the block, update the * key values above us in the tree. */ - if (ptr == 1) { - error = xfs_btree_updkey(cur, keyp, level + 1); + if (xfs_btree_needs_key_update(cur, ptr)) { + error = xfs_btree_update_keys(cur, level); if (error) goto error0; } @@ -3878,6 +4168,16 @@ xfs_btree_delrec( if (level > 0) cur->bc_ptrs[level]--; + /* + * We combined blocks, so we have to update the parent keys if the + * btree supports overlapped intervals. However, bc_ptrs[level + 1] + * points to the old block so that the caller knows which record to + * delete. Therefore, the caller must be savvy enough to call updkeys + * for us if we return stat == 2. The other exit points from this + * function don't require deletions further up the tree, so they can + * call updkeys directly. + */ + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); /* Return value means the next level up has something to do. */ *stat = 2; @@ -3903,6 +4203,7 @@ xfs_btree_delete( int error; /* error return value */ int level; int i; + bool joined = false; XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); @@ -3916,6 +4217,18 @@ xfs_btree_delete( error = xfs_btree_delrec(cur, level, &i); if (error) goto error0; + if (i == 2) + joined = true; + } + + /* + * If we combined blocks as part of deleting the record, delrec won't + * have updated the parent high keys so we have to do that here. + */ + if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) { + error = xfs_btree_updkeys_force(cur, 0); + if (error) + goto error0; } if (i == 0) { @@ -3978,6 +4291,81 @@ xfs_btree_get_rec( return 0; } +/* Visit a block in a btree. */ +STATIC int +xfs_btree_visit_block( + struct xfs_btree_cur *cur, + int level, + xfs_btree_visit_blocks_fn fn, + void *data) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + union xfs_btree_ptr rptr; + int error; + + /* do right sibling readahead */ + xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); + block = xfs_btree_get_block(cur, level, &bp); + + /* process the block */ + error = fn(cur, level, data); + if (error) + return error; + + /* now read rh sibling block for next iteration */ + xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); + if (xfs_btree_ptr_is_null(cur, &rptr)) + return -ENOENT; + + return xfs_btree_lookup_get_block(cur, level, &rptr, &block); +} + + +/* Visit every block in a btree. */ +int +xfs_btree_visit_blocks( + struct xfs_btree_cur *cur, + xfs_btree_visit_blocks_fn fn, + void *data) +{ + union xfs_btree_ptr lptr; + int level; + struct xfs_btree_block *block = NULL; + int error = 0; + + cur->bc_ops->init_ptr_from_cur(cur, &lptr); + + /* for each level */ + for (level = cur->bc_nlevels - 1; level >= 0; level--) { + /* grab the left hand block */ + error = xfs_btree_lookup_get_block(cur, level, &lptr, &block); + if (error) + return error; + + /* readahead the left most block for the next level down */ + if (level > 0) { + union xfs_btree_ptr *ptr; + + ptr = xfs_btree_ptr_addr(cur, 1, block); + xfs_btree_readahead_ptr(cur, ptr, 1); + + /* save for the next iteration of the loop */ + lptr = *ptr; + } + + /* for each buffer in the level */ + do { + error = xfs_btree_visit_block(cur, level, fn, data); + } while (!error); + + if (error != -ENOENT) + return error; + } + + return 0; +} + /* * Change the owner of a btree. * @@ -4002,26 +4390,27 @@ xfs_btree_get_rec( * just queue the modified buffer as delayed write buffer so the transaction * recovery completion writes the changes to disk. */ +struct xfs_btree_block_change_owner_info { + __uint64_t new_owner; + struct list_head *buffer_list; +}; + static int xfs_btree_block_change_owner( struct xfs_btree_cur *cur, int level, - __uint64_t new_owner, - struct list_head *buffer_list) + void *data) { + struct xfs_btree_block_change_owner_info *bbcoi = data; struct xfs_btree_block *block; struct xfs_buf *bp; - union xfs_btree_ptr rptr; - - /* do right sibling readahead */ - xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); /* modify the owner */ block = xfs_btree_get_block(cur, level, &bp); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - block->bb_u.l.bb_owner = cpu_to_be64(new_owner); + block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); else - block->bb_u.s.bb_owner = cpu_to_be32(new_owner); + block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner); /* * If the block is a root block hosted in an inode, we might not have a @@ -4035,19 +4424,14 @@ xfs_btree_block_change_owner( xfs_trans_ordered_buf(cur->bc_tp, bp); xfs_btree_log_block(cur, bp, XFS_BB_OWNER); } else { - xfs_buf_delwri_queue(bp, buffer_list); + xfs_buf_delwri_queue(bp, bbcoi->buffer_list); } } else { ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); ASSERT(level == cur->bc_nlevels - 1); } - /* now read rh sibling block for next iteration */ - xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); - if (xfs_btree_ptr_is_null(cur, &rptr)) - return -ENOENT; - - return xfs_btree_lookup_get_block(cur, level, &rptr, &block); + return 0; } int @@ -4056,43 +4440,13 @@ xfs_btree_change_owner( __uint64_t new_owner, struct list_head *buffer_list) { - union xfs_btree_ptr lptr; - int level; - struct xfs_btree_block *block = NULL; - int error = 0; - - cur->bc_ops->init_ptr_from_cur(cur, &lptr); - - /* for each level */ - for (level = cur->bc_nlevels - 1; level >= 0; level--) { - /* grab the left hand block */ - error = xfs_btree_lookup_get_block(cur, level, &lptr, &block); - if (error) - return error; - - /* readahead the left most block for the next level down */ - if (level > 0) { - union xfs_btree_ptr *ptr; - - ptr = xfs_btree_ptr_addr(cur, 1, block); - xfs_btree_readahead_ptr(cur, ptr, 1); - - /* save for the next iteration of the loop */ - lptr = *ptr; - } - - /* for each buffer in the level */ - do { - error = xfs_btree_block_change_owner(cur, level, - new_owner, - buffer_list); - } while (!error); + struct xfs_btree_block_change_owner_info bbcoi; - if (error != -ENOENT) - return error; - } + bbcoi.new_owner = new_owner; + bbcoi.buffer_list = buffer_list; - return 0; + return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner, + &bbcoi); } /** @@ -4152,3 +4506,294 @@ xfs_btree_sblock_verify( return true; } + +/* + * Calculate the number of btree levels needed to store a given number of + * records in a short-format btree. + */ +uint +xfs_btree_compute_maxlevels( + struct xfs_mount *mp, + uint *limits, + unsigned long len) +{ + uint level; + unsigned long maxblocks; + + maxblocks = (len + limits[0] - 1) / limits[0]; + for (level = 1; maxblocks > 1; level++) + maxblocks = (maxblocks + limits[1] - 1) / limits[1]; + return level; +} + +/* + * Query a regular btree for all records overlapping a given interval. + * Start with a LE lookup of the key of low_rec and return all records + * until we find a record with a key greater than the key of high_rec. + */ +STATIC int +xfs_btree_simple_query_range( + struct xfs_btree_cur *cur, + union xfs_btree_key *low_key, + union xfs_btree_key *high_key, + xfs_btree_query_range_fn fn, + void *priv) +{ + union xfs_btree_rec *recp; + union xfs_btree_key rec_key; + __int64_t diff; + int stat; + bool firstrec = true; + int error; + + ASSERT(cur->bc_ops->init_high_key_from_rec); + ASSERT(cur->bc_ops->diff_two_keys); + + /* + * Find the leftmost record. The btree cursor must be set + * to the low record used to generate low_key. + */ + stat = 0; + error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); + if (error) + goto out; + + /* Nothing? See if there's anything to the right. */ + if (!stat) { + error = xfs_btree_increment(cur, 0, &stat); + if (error) + goto out; + } + + while (stat) { + /* Find the record. */ + error = xfs_btree_get_rec(cur, &recp, &stat); + if (error || !stat) + break; + + /* Skip if high_key(rec) < low_key. */ + if (firstrec) { + cur->bc_ops->init_high_key_from_rec(&rec_key, recp); + firstrec = false; + diff = cur->bc_ops->diff_two_keys(cur, low_key, + &rec_key); + if (diff > 0) + goto advloop; + } + + /* Stop if high_key < low_key(rec). */ + cur->bc_ops->init_key_from_rec(&rec_key, recp); + diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key); + if (diff > 0) + break; + + /* Callback */ + error = fn(cur, recp, priv); + if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT) + break; + +advloop: + /* Move on to the next record. */ + error = xfs_btree_increment(cur, 0, &stat); + if (error) + break; + } + +out: + return error; +} + +/* + * Query an overlapped interval btree for all records overlapping a given + * interval. This function roughly follows the algorithm given in + * "Interval Trees" of _Introduction to Algorithms_, which is section + * 14.3 in the 2nd and 3rd editions. + * + * First, generate keys for the low and high records passed in. + * + * For any leaf node, generate the high and low keys for the record. + * If the record keys overlap with the query low/high keys, pass the + * record to the function iterator. + * + * For any internal node, compare the low and high keys of each + * pointer against the query low/high keys. If there's an overlap, + * follow the pointer. + * + * As an optimization, we stop scanning a block when we find a low key + * that is greater than the query's high key. + */ +STATIC int +xfs_btree_overlapped_query_range( + struct xfs_btree_cur *cur, + union xfs_btree_key *low_key, + union xfs_btree_key *high_key, + xfs_btree_query_range_fn fn, + void *priv) +{ + union xfs_btree_ptr ptr; + union xfs_btree_ptr *pp; + union xfs_btree_key rec_key; + union xfs_btree_key rec_hkey; + union xfs_btree_key *lkp; + union xfs_btree_key *hkp; + union xfs_btree_rec *recp; + struct xfs_btree_block *block; + __int64_t ldiff; + __int64_t hdiff; + int level; + struct xfs_buf *bp; + int i; + int error; + + /* Load the root of the btree. */ + level = cur->bc_nlevels - 1; + cur->bc_ops->init_ptr_from_cur(cur, &ptr); + error = xfs_btree_lookup_get_block(cur, level, &ptr, &block); + if (error) + return error; + xfs_btree_get_block(cur, level, &bp); + trace_xfs_btree_overlapped_query_range(cur, level, bp); +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto out; +#endif + cur->bc_ptrs[level] = 1; + + while (level < cur->bc_nlevels) { + block = xfs_btree_get_block(cur, level, &bp); + + /* End of node, pop back towards the root. */ + if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) { +pop_up: + if (level < cur->bc_nlevels - 1) + cur->bc_ptrs[level + 1]++; + level++; + continue; + } + + if (level == 0) { + /* Handle a leaf node. */ + recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block); + + cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp); + ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey, + low_key); + + cur->bc_ops->init_key_from_rec(&rec_key, recp); + hdiff = cur->bc_ops->diff_two_keys(cur, high_key, + &rec_key); + + /* + * If (record's high key >= query's low key) and + * (query's high key >= record's low key), then + * this record overlaps the query range; callback. + */ + if (ldiff >= 0 && hdiff >= 0) { + error = fn(cur, recp, priv); + if (error < 0 || + error == XFS_BTREE_QUERY_RANGE_ABORT) + break; + } else if (hdiff < 0) { + /* Record is larger than high key; pop. */ + goto pop_up; + } + cur->bc_ptrs[level]++; + continue; + } + + /* Handle an internal node. */ + lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block); + hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block); + pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block); + + ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key); + hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp); + + /* + * If (pointer's high key >= query's low key) and + * (query's high key >= pointer's low key), then + * this record overlaps the query range; follow pointer. + */ + if (ldiff >= 0 && hdiff >= 0) { + level--; + error = xfs_btree_lookup_get_block(cur, level, pp, + &block); + if (error) + goto out; + xfs_btree_get_block(cur, level, &bp); + trace_xfs_btree_overlapped_query_range(cur, level, bp); +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto out; +#endif + cur->bc_ptrs[level] = 1; + continue; + } else if (hdiff < 0) { + /* The low key is larger than the upper range; pop. */ + goto pop_up; + } + cur->bc_ptrs[level]++; + } + +out: + /* + * If we don't end this function with the cursor pointing at a record + * block, a subsequent non-error cursor deletion will not release + * node-level buffers, causing a buffer leak. This is quite possible + * with a zero-results range query, so release the buffers if we + * failed to return any results. + */ + if (cur->bc_bufs[0] == NULL) { + for (i = 0; i < cur->bc_nlevels; i++) { + if (cur->bc_bufs[i]) { + xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]); + cur->bc_bufs[i] = NULL; + cur->bc_ptrs[i] = 0; + cur->bc_ra[i] = 0; + } + } + } + + return error; +} + +/* + * Query a btree for all records overlapping a given interval of keys. The + * supplied function will be called with each record found; return one of the + * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error + * code. This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a + * negative error code. + */ +int +xfs_btree_query_range( + struct xfs_btree_cur *cur, + union xfs_btree_irec *low_rec, + union xfs_btree_irec *high_rec, + xfs_btree_query_range_fn fn, + void *priv) +{ + union xfs_btree_rec rec; + union xfs_btree_key low_key; + union xfs_btree_key high_key; + + /* Find the keys of both ends of the interval. */ + cur->bc_rec = *high_rec; + cur->bc_ops->init_rec_from_cur(cur, &rec); + cur->bc_ops->init_key_from_rec(&high_key, &rec); + + cur->bc_rec = *low_rec; + cur->bc_ops->init_rec_from_cur(cur, &rec); + cur->bc_ops->init_key_from_rec(&low_key, &rec); + + /* Enforce low key < high key. */ + if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0) + return -EINVAL; + + if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + return xfs_btree_simple_query_range(cur, &low_key, + &high_key, fn, priv); + return xfs_btree_overlapped_query_range(cur, &low_key, &high_key, + fn, priv); +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 2e874be70..04d0865e5 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -19,7 +19,7 @@ #define __XFS_BTREE_H__ struct xfs_buf; -struct xfs_bmap_free; +struct xfs_defer_ops; struct xfs_inode; struct xfs_mount; struct xfs_trans; @@ -38,17 +38,37 @@ union xfs_btree_ptr { }; union xfs_btree_key { - xfs_bmbt_key_t bmbt; - xfs_bmdr_key_t bmbr; /* bmbt root block */ - xfs_alloc_key_t alloc; - xfs_inobt_key_t inobt; + struct xfs_bmbt_key bmbt; + xfs_bmdr_key_t bmbr; /* bmbt root block */ + xfs_alloc_key_t alloc; + struct xfs_inobt_key inobt; + struct xfs_rmap_key rmap; +}; + +/* + * In-core key that holds both low and high keys for overlapped btrees. + * The two keys are packed next to each other on disk, so do the same + * in memory. Preserve the existing xfs_btree_key as a single key to + * avoid the mental model breakage that would happen if we passed a + * bigkey into a function that operates on a single key. + */ +union xfs_btree_bigkey { + struct xfs_bmbt_key bmbt; + xfs_bmdr_key_t bmbr; /* bmbt root block */ + xfs_alloc_key_t alloc; + struct xfs_inobt_key inobt; + struct { + struct xfs_rmap_key rmap; + struct xfs_rmap_key rmap_hi; + }; }; union xfs_btree_rec { - xfs_bmbt_rec_t bmbt; - xfs_bmdr_rec_t bmbr; /* bmbt root block */ - xfs_alloc_rec_t alloc; - xfs_inobt_rec_t inobt; + struct xfs_bmbt_rec bmbt; + xfs_bmdr_rec_t bmbr; /* bmbt root block */ + struct xfs_alloc_rec alloc; + struct xfs_inobt_rec inobt; + struct xfs_rmap_rec rmap; }; /* @@ -63,6 +83,7 @@ union xfs_btree_rec { #define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) #define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) #define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi) +#define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi) /* * For logging record fields. @@ -95,6 +116,7 @@ do { \ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \ case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \ + case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) @@ -115,11 +137,13 @@ do { \ __XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \ case XFS_BTNUM_FINO: \ __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \ + case XFS_BTNUM_RMAP: \ + __XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) -#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */ +#define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */ struct xfs_btree_ops { /* size of the key and record structures */ @@ -158,17 +182,25 @@ struct xfs_btree_ops { /* init values of btree structures */ void (*init_key_from_rec)(union xfs_btree_key *key, union xfs_btree_rec *rec); - void (*init_rec_from_key)(union xfs_btree_key *key, - union xfs_btree_rec *rec); void (*init_rec_from_cur)(struct xfs_btree_cur *cur, union xfs_btree_rec *rec); void (*init_ptr_from_cur)(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); + void (*init_high_key_from_rec)(union xfs_btree_key *key, + union xfs_btree_rec *rec); /* difference between key value and cursor value */ __int64_t (*key_diff)(struct xfs_btree_cur *cur, union xfs_btree_key *key); + /* + * Difference between key2 and key1 -- positive if key1 > key2, + * negative if key1 < key2, and zero if equal. + */ + __int64_t (*diff_two_keys)(struct xfs_btree_cur *cur, + union xfs_btree_key *key1, + union xfs_btree_key *key2); + const struct xfs_buf_ops *buf_ops; #if defined(DEBUG) || defined(XFS_WARN) @@ -192,6 +224,13 @@ struct xfs_btree_ops { #define LASTREC_DELREC 2 +union xfs_btree_irec { + struct xfs_alloc_rec_incore a; + struct xfs_bmbt_irec b; + struct xfs_inobt_rec_incore i; + struct xfs_rmap_irec r; +}; + /* * Btree cursor structure. * This collects all information needed by the btree code in one place. @@ -202,11 +241,7 @@ typedef struct xfs_btree_cur struct xfs_mount *bc_mp; /* file system mount struct */ const struct xfs_btree_ops *bc_ops; uint bc_flags; /* btree features - below */ - union { - xfs_alloc_rec_incore_t a; - xfs_bmbt_irec_t b; - xfs_inobt_rec_incore_t i; - } bc_rec; /* current insert/search record value */ + union xfs_btree_irec bc_rec; /* current insert/search record value */ struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */ int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */ __uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */ @@ -218,11 +253,12 @@ typedef struct xfs_btree_cur union { struct { /* needed for BNO, CNT, INO */ struct xfs_buf *agbp; /* agf/agi buffer pointer */ + struct xfs_defer_ops *dfops; /* deferred updates */ xfs_agnumber_t agno; /* ag number */ } a; struct { /* needed for BMAP */ struct xfs_inode *ip; /* pointer to our inode */ - struct xfs_bmap_free *flist; /* list to free after */ + struct xfs_defer_ops *dfops; /* deferred updates */ xfs_fsblock_t firstblock; /* 1st blk allocated */ int allocated; /* count of alloced */ short forksize; /* fork's inode space */ @@ -238,6 +274,7 @@ typedef struct xfs_btree_cur #define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */ #define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ #define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */ +#define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */ #define XFS_BTREE_NOERROR 0 @@ -474,5 +511,22 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); +uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits, + unsigned long len); + +/* return codes */ +#define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */ +#define XFS_BTREE_QUERY_RANGE_ABORT 1 /* stop iterating */ +typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, void *priv); + +int xfs_btree_query_range(struct xfs_btree_cur *cur, + union xfs_btree_irec *low_rec, union xfs_btree_irec *high_rec, + xfs_btree_query_range_fn fn, void *priv); + +typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, + void *data); +int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, + xfs_btree_visit_blocks_fn fn, void *data); #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 097bf7717..f2dc1a950 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -356,7 +356,6 @@ xfs_da3_split( struct xfs_da_state_blk *newblk; struct xfs_da_state_blk *addblk; struct xfs_da_intnode *node; - struct xfs_buf *bp; int max; int action = 0; int error; @@ -397,7 +396,9 @@ xfs_da3_split( break; } /* - * Entry wouldn't fit, split the leaf again. + * Entry wouldn't fit, split the leaf again. The new + * extrablk will be consumed by xfs_da3_node_split if + * the node is split. */ state->extravalid = 1; if (state->inleaf) { @@ -445,6 +446,14 @@ xfs_da3_split( if (!addblk) return 0; + /* + * xfs_da3_node_split() should have consumed any extra blocks we added + * during a double leaf split in the attr fork. This is guaranteed as + * we can't be here if the attr fork only has a single leaf block. + */ + ASSERT(state->extravalid == 0 || + state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC); + /* * Split the root node. */ @@ -457,43 +466,33 @@ xfs_da3_split( } /* - * Update pointers to the node which used to be block 0 and - * just got bumped because of the addition of a new root node. - * There might be three blocks involved if a double split occurred, - * and the original block 0 could be at any position in the list. + * Update pointers to the node which used to be block 0 and just got + * bumped because of the addition of a new root node. Note that the + * original block 0 could be at any position in the list of blocks in + * the tree. * - * Note: the magic numbers and sibling pointers are in the same - * physical place for both v2 and v3 headers (by design). Hence it - * doesn't matter which version of the xfs_da_intnode structure we use - * here as the result will be the same using either structure. + * Note: the magic numbers and sibling pointers are in the same physical + * place for both v2 and v3 headers (by design). Hence it doesn't matter + * which version of the xfs_da_intnode structure we use here as the + * result will be the same using either structure. */ node = oldblk->bp->b_addr; if (node->hdr.info.forw) { - if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) { - bp = addblk->bp; - } else { - ASSERT(state->extravalid); - bp = state->extrablk.bp; - } - node = bp->b_addr; + ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno); + node = addblk->bp->b_addr; node->hdr.info.back = cpu_to_be32(oldblk->blkno); - xfs_trans_log_buf(state->args->trans, bp, - XFS_DA_LOGRANGE(node, &node->hdr.info, - sizeof(node->hdr.info))); + xfs_trans_log_buf(state->args->trans, addblk->bp, + XFS_DA_LOGRANGE(node, &node->hdr.info, + sizeof(node->hdr.info))); } node = oldblk->bp->b_addr; if (node->hdr.info.back) { - if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) { - bp = addblk->bp; - } else { - ASSERT(state->extravalid); - bp = state->extrablk.bp; - } - node = bp->b_addr; + ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno); + node = addblk->bp->b_addr; node->hdr.info.forw = cpu_to_be32(oldblk->blkno); - xfs_trans_log_buf(state->args->trans, bp, - XFS_DA_LOGRANGE(node, &node->hdr.info, - sizeof(node->hdr.info))); + xfs_trans_log_buf(state->args->trans, addblk->bp, + XFS_DA_LOGRANGE(node, &node->hdr.info, + sizeof(node->hdr.info))); } addblk->bp = NULL; return 0; @@ -2030,7 +2029,7 @@ xfs_da_grow_inode_int( error = xfs_bmapi_write(tp, dp, *bno, count, xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, args->firstblock, args->total, &map, &nmap, - args->flist); + args->dfops); if (error) return error; @@ -2053,7 +2052,7 @@ xfs_da_grow_inode_int( error = xfs_bmapi_write(tp, dp, b, c, xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, args->firstblock, args->total, - &mapp[mapi], &nmap, args->flist); + &mapp[mapi], &nmap, args->dfops); if (error) goto out_free_map; if (nmap < 1) @@ -2363,7 +2362,7 @@ xfs_da_shrink_inode( */ error = xfs_bunmapi(tp, dp, dead_blkno, count, xfs_bmapi_aflag(w), 0, args->firstblock, - args->flist, &done); + args->dfops, &done); if (error == -ENOSPC) { if (w != XFS_DATA_FORK) break; diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 6e153e399..98c75cbe6 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -19,7 +19,7 @@ #ifndef __XFS_DA_BTREE_H__ #define __XFS_DA_BTREE_H__ -struct xfs_bmap_free; +struct xfs_defer_ops; struct xfs_inode; struct xfs_trans; struct zone; @@ -70,7 +70,7 @@ typedef struct xfs_da_args { xfs_ino_t inumber; /* input/output inode number */ struct xfs_inode *dp; /* directory inode to manipulate */ xfs_fsblock_t *firstblock; /* ptr to firstblock for bmap calls */ - struct xfs_bmap_free *flist; /* ptr to freelist for bmap_finish */ + struct xfs_defer_ops *dfops; /* ptr to freelist for bmap_finish */ struct xfs_trans *trans; /* current trans (changes over time) */ xfs_extlen_t total; /* total blocks needed, for 1st bmap */ int whichfork; /* data or attribute fork */ diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c index 9d624a622..f1e8d4dbb 100644 --- a/fs/xfs/libxfs/xfs_da_format.c +++ b/fs/xfs/libxfs/xfs_da_format.c @@ -40,8 +40,7 @@ xfs_dir2_sf_entsize( int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ count += len; /* name */ - count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) : - sizeof(xfs_dir2_ino4_t); /* ino # */ + count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */ return count; } @@ -125,33 +124,33 @@ xfs_dir3_sfe_put_ftype( static xfs_ino_t xfs_dir2_sf_get_ino( struct xfs_dir2_sf_hdr *hdr, - xfs_dir2_inou_t *from) + __uint8_t *from) { if (hdr->i8count) - return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL; + return get_unaligned_be64(from) & 0x00ffffffffffffffULL; else - return get_unaligned_be32(&from->i4.i); + return get_unaligned_be32(from); } static void xfs_dir2_sf_put_ino( struct xfs_dir2_sf_hdr *hdr, - xfs_dir2_inou_t *to, + __uint8_t *to, xfs_ino_t ino) { ASSERT((ino & 0xff00000000000000ULL) == 0); if (hdr->i8count) - put_unaligned_be64(ino, &to->i8.i); + put_unaligned_be64(ino, to); else - put_unaligned_be32(ino, &to->i4.i); + put_unaligned_be32(ino, to); } static xfs_ino_t xfs_dir2_sf_get_parent_ino( struct xfs_dir2_sf_hdr *hdr) { - return xfs_dir2_sf_get_ino(hdr, &hdr->parent); + return xfs_dir2_sf_get_ino(hdr, hdr->parent); } static void @@ -159,7 +158,7 @@ xfs_dir2_sf_put_parent_ino( struct xfs_dir2_sf_hdr *hdr, xfs_ino_t ino) { - xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino); + xfs_dir2_sf_put_ino(hdr, hdr->parent, ino); } /* @@ -173,8 +172,7 @@ xfs_dir2_sfe_get_ino( struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep) { - return xfs_dir2_sf_get_ino(hdr, - (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]); + return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen]); } static void @@ -183,8 +181,7 @@ xfs_dir2_sfe_put_ino( struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino) { - xfs_dir2_sf_put_ino(hdr, - (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino); + xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen], ino); } static xfs_ino_t @@ -192,8 +189,7 @@ xfs_dir3_sfe_get_ino( struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep) { - return xfs_dir2_sf_get_ino(hdr, - (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]); + return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen + 1]); } static void @@ -202,8 +198,7 @@ xfs_dir3_sfe_put_ino( struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino) { - xfs_dir2_sf_put_ino(hdr, - (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino); + xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen + 1], ino); } diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 8d4d8bce4..9a492a9e1 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -191,12 +191,6 @@ typedef __uint16_t xfs_dir2_data_off_t; #define NULLDATAOFF 0xffffU typedef uint xfs_dir2_data_aoff_t; /* argument form */ -/* - * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t. - * Only need 16 bits, this is the byte offset into the single block form. - */ -typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t; - /* * Offset in data space of a data entry. */ @@ -214,22 +208,10 @@ typedef xfs_off_t xfs_dir2_off_t; */ typedef __uint32_t xfs_dir2_db_t; -/* - * Inode number stored as 8 8-bit values. - */ -typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; - -/* - * Inode number stored as 4 8-bit values. - * Works a lot of the time, when all the inode numbers in a directory - * fit in 32 bits. - */ -typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t; +#define XFS_INO32_SIZE 4 +#define XFS_INO64_SIZE 8 +#define XFS_INO64_DIFF (XFS_INO64_SIZE - XFS_INO32_SIZE) -typedef union { - xfs_dir2_ino8_t i8; - xfs_dir2_ino4_t i4; -} xfs_dir2_inou_t; #define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL) /* @@ -246,39 +228,38 @@ typedef union { typedef struct xfs_dir2_sf_hdr { __uint8_t count; /* count of entries */ __uint8_t i8count; /* count of 8-byte inode #s */ - xfs_dir2_inou_t parent; /* parent dir inode number */ -} __arch_pack xfs_dir2_sf_hdr_t; + __uint8_t parent[8]; /* parent dir inode number */ +} __packed xfs_dir2_sf_hdr_t; typedef struct xfs_dir2_sf_entry { __u8 namelen; /* actual name length */ - xfs_dir2_sf_off_t offset; /* saved offset */ + __u8 offset[2]; /* saved offset */ __u8 name[]; /* name, variable size */ /* * A single byte containing the file type field follows the inode * number for version 3 directory entries. * - * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a - * variable offset after the name. + * A 64-bit or 32-bit inode number follows here, at a variable offset + * after the name. */ -} __arch_pack xfs_dir2_sf_entry_t; +} xfs_dir2_sf_entry_t; static inline int xfs_dir2_sf_hdr_size(int i8count) { return sizeof(struct xfs_dir2_sf_hdr) - - (i8count == 0) * - (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t)); + (i8count == 0) * XFS_INO64_DIFF; } static inline xfs_dir2_data_aoff_t xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) { - return get_unaligned_be16(&sfep->offset.i); + return get_unaligned_be16(sfep->offset); } static inline void xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) { - put_unaligned_be16(off, &sfep->offset.i); + put_unaligned_be16(off, sfep->offset); } static inline struct xfs_dir2_sf_entry * @@ -648,6 +629,7 @@ typedef struct xfs_attr_shortform { struct xfs_attr_sf_hdr { /* constant-structure header block */ __be16 totsize; /* total bytes in shortform list */ __u8 count; /* count of active entries */ + __u8 padding; } hdr; struct xfs_attr_sf_entry { __uint8_t namelen; /* actual length of name (no NULL) */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c new file mode 100644 index 000000000..c221d0ecd --- /dev/null +++ b/fs/xfs/libxfs/xfs_defer.c @@ -0,0 +1,454 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_trace.h" + +/* + * Deferred Operations in XFS + * + * Due to the way locking rules work in XFS, certain transactions (block + * mapping and unmapping, typically) have permanent reservations so that + * we can roll the transaction to adhere to AG locking order rules and + * to unlock buffers between metadata updates. Prior to rmap/reflink, + * the mapping code had a mechanism to perform these deferrals for + * extents that were going to be freed; this code makes that facility + * more generic. + * + * When adding the reverse mapping and reflink features, it became + * necessary to perform complex remapping multi-transactions to comply + * with AG locking order rules, and to be able to spread a single + * refcount update operation (an operation on an n-block extent can + * update as many as n records!) among multiple transactions. XFS can + * roll a transaction to facilitate this, but using this facility + * requires us to log "intent" items in case log recovery needs to + * redo the operation, and to log "done" items to indicate that redo + * is not necessary. + * + * Deferred work is tracked in xfs_defer_pending items. Each pending + * item tracks one type of deferred work. Incoming work items (which + * have not yet had an intent logged) are attached to a pending item + * on the dop_intake list, where they wait for the caller to finish + * the deferred operations. + * + * Finishing a set of deferred operations is an involved process. To + * start, we define "rolling a deferred-op transaction" as follows: + * + * > For each xfs_defer_pending item on the dop_intake list, + * - Sort the work items in AG order. XFS locking + * order rules require us to lock buffers in AG order. + * - Create a log intent item for that type. + * - Attach it to the pending item. + * - Move the pending item from the dop_intake list to the + * dop_pending list. + * > Roll the transaction. + * + * NOTE: To avoid exceeding the transaction reservation, we limit the + * number of items that we attach to a given xfs_defer_pending. + * + * The actual finishing process looks like this: + * + * > For each xfs_defer_pending in the dop_pending list, + * - Roll the deferred-op transaction as above. + * - Create a log done item for that type, and attach it to the + * log intent item. + * - For each work item attached to the log intent item, + * * Perform the described action. + * * Attach the work item to the log done item. + * + * The key here is that we must log an intent item for all pending + * work items every time we roll the transaction, and that we must log + * a done item as soon as the work is completed. With this mechanism + * we can perform complex remapping operations, chaining intent items + * as needed. + * + * This is an example of remapping the extent (E, E+B) into file X at + * offset A and dealing with the extent (C, C+B) already being mapped + * there: + * +-------------------------------------------------+ + * | Unmap file X startblock C offset A length B | t0 + * | Intent to reduce refcount for extent (C, B) | + * | Intent to remove rmap (X, C, A, B) | + * | Intent to free extent (D, 1) (bmbt block) | + * | Intent to map (X, A, B) at startblock E | + * +-------------------------------------------------+ + * | Map file X startblock E offset A length B | t1 + * | Done mapping (X, E, A, B) | + * | Intent to increase refcount for extent (E, B) | + * | Intent to add rmap (X, E, A, B) | + * +-------------------------------------------------+ + * | Reduce refcount for extent (C, B) | t2 + * | Done reducing refcount for extent (C, B) | + * | Increase refcount for extent (E, B) | + * | Done increasing refcount for extent (E, B) | + * | Intent to free extent (C, B) | + * | Intent to free extent (F, 1) (refcountbt block) | + * | Intent to remove rmap (F, 1, REFC) | + * +-------------------------------------------------+ + * | Remove rmap (X, C, A, B) | t3 + * | Done removing rmap (X, C, A, B) | + * | Add rmap (X, E, A, B) | + * | Done adding rmap (X, E, A, B) | + * | Remove rmap (F, 1, REFC) | + * | Done removing rmap (F, 1, REFC) | + * +-------------------------------------------------+ + * | Free extent (C, B) | t4 + * | Done freeing extent (C, B) | + * | Free extent (D, 1) | + * | Done freeing extent (D, 1) | + * | Free extent (F, 1) | + * | Done freeing extent (F, 1) | + * +-------------------------------------------------+ + * + * If we should crash before t2 commits, log recovery replays + * the following intent items: + * + * - Intent to reduce refcount for extent (C, B) + * - Intent to remove rmap (X, C, A, B) + * - Intent to free extent (D, 1) (bmbt block) + * - Intent to increase refcount for extent (E, B) + * - Intent to add rmap (X, E, A, B) + * + * In the process of recovering, it should also generate and take care + * of these intent items: + * + * - Intent to free extent (C, B) + * - Intent to free extent (F, 1) (refcountbt block) + * - Intent to remove rmap (F, 1, REFC) + */ + +static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX]; + +/* + * For each pending item in the intake list, log its intent item and the + * associated extents, then add the entire intake list to the end of + * the pending list. + */ +STATIC void +xfs_defer_intake_work( + struct xfs_trans *tp, + struct xfs_defer_ops *dop) +{ + struct list_head *li; + struct xfs_defer_pending *dfp; + + list_for_each_entry(dfp, &dop->dop_intake, dfp_list) { + trace_xfs_defer_intake_work(tp->t_mountp, dfp); + dfp->dfp_intent = dfp->dfp_type->create_intent(tp, + dfp->dfp_count); + list_sort(tp->t_mountp, &dfp->dfp_work, + dfp->dfp_type->diff_items); + list_for_each(li, &dfp->dfp_work) + dfp->dfp_type->log_item(tp, dfp->dfp_intent, li); + } + + list_splice_tail_init(&dop->dop_intake, &dop->dop_pending); +} + +/* Abort all the intents that were committed. */ +STATIC void +xfs_defer_trans_abort( + struct xfs_trans *tp, + struct xfs_defer_ops *dop, + int error) +{ + struct xfs_defer_pending *dfp; + + trace_xfs_defer_trans_abort(tp->t_mountp, dop); + /* + * If the transaction was committed, drop the intent reference + * since we're bailing out of here. The other reference is + * dropped when the intent hits the AIL. If the transaction + * was not committed, the intent is freed by the intent item + * unlock handler on abort. + */ + if (!dop->dop_committed) + return; + + /* Abort intent items. */ + list_for_each_entry(dfp, &dop->dop_pending, dfp_list) { + trace_xfs_defer_pending_abort(tp->t_mountp, dfp); + if (!dfp->dfp_done) + dfp->dfp_type->abort_intent(dfp->dfp_intent); + } + + /* Shut down FS. */ + xfs_force_shutdown(tp->t_mountp, (error == -EFSCORRUPTED) ? + SHUTDOWN_CORRUPT_INCORE : SHUTDOWN_META_IO_ERROR); +} + +/* Roll a transaction so we can do some deferred op processing. */ +STATIC int +xfs_defer_trans_roll( + struct xfs_trans **tp, + struct xfs_defer_ops *dop, + struct xfs_inode *ip) +{ + int i; + int error; + + /* Log all the joined inodes except the one we passed in. */ + for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) { + if (dop->dop_inodes[i] == ip) + continue; + xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE); + } + + trace_xfs_defer_trans_roll((*tp)->t_mountp, dop); + + /* Roll the transaction. */ + error = xfs_trans_roll(tp, ip); + if (error) { + trace_xfs_defer_trans_roll_error((*tp)->t_mountp, dop, error); + xfs_defer_trans_abort(*tp, dop, error); + return error; + } + dop->dop_committed = true; + + /* Rejoin the joined inodes except the one we passed in. */ + for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) { + if (dop->dop_inodes[i] == ip) + continue; + xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0); + } + + return error; +} + +/* Do we have any work items to finish? */ +bool +xfs_defer_has_unfinished_work( + struct xfs_defer_ops *dop) +{ + return !list_empty(&dop->dop_pending) || !list_empty(&dop->dop_intake); +} + +/* + * Add this inode to the deferred op. Each joined inode is relogged + * each time we roll the transaction, in addition to any inode passed + * to xfs_defer_finish(). + */ +int +xfs_defer_join( + struct xfs_defer_ops *dop, + struct xfs_inode *ip) +{ + int i; + + for (i = 0; i < XFS_DEFER_OPS_NR_INODES; i++) { + if (dop->dop_inodes[i] == ip) + return 0; + else if (dop->dop_inodes[i] == NULL) { + dop->dop_inodes[i] = ip; + return 0; + } + } + + return -EFSCORRUPTED; +} + +/* + * Finish all the pending work. This involves logging intent items for + * any work items that wandered in since the last transaction roll (if + * one has even happened), rolling the transaction, and finishing the + * work items in the first item on the logged-and-pending list. + * + * If an inode is provided, relog it to the new transaction. + */ +int +xfs_defer_finish( + struct xfs_trans **tp, + struct xfs_defer_ops *dop, + struct xfs_inode *ip) +{ + struct xfs_defer_pending *dfp; + struct list_head *li; + struct list_head *n; + void *state; + int error = 0; + void (*cleanup_fn)(struct xfs_trans *, void *, int); + + ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); + + trace_xfs_defer_finish((*tp)->t_mountp, dop); + + /* Until we run out of pending work to finish... */ + while (xfs_defer_has_unfinished_work(dop)) { + /* Log intents for work items sitting in the intake. */ + xfs_defer_intake_work(*tp, dop); + + /* Roll the transaction. */ + error = xfs_defer_trans_roll(tp, dop, ip); + if (error) + goto out; + + /* Log an intent-done item for the first pending item. */ + dfp = list_first_entry(&dop->dop_pending, + struct xfs_defer_pending, dfp_list); + trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp); + dfp->dfp_done = dfp->dfp_type->create_done(*tp, dfp->dfp_intent, + dfp->dfp_count); + cleanup_fn = dfp->dfp_type->finish_cleanup; + + /* Finish the work items. */ + state = NULL; + list_for_each_safe(li, n, &dfp->dfp_work) { + list_del(li); + dfp->dfp_count--; + error = dfp->dfp_type->finish_item(*tp, dop, li, + dfp->dfp_done, &state); + if (error) { + /* + * Clean up after ourselves and jump out. + * xfs_defer_cancel will take care of freeing + * all these lists and stuff. + */ + if (cleanup_fn) + cleanup_fn(*tp, state, error); + xfs_defer_trans_abort(*tp, dop, error); + goto out; + } + } + /* Done with the dfp, free it. */ + list_del(&dfp->dfp_list); + kmem_free(dfp); + + if (cleanup_fn) + cleanup_fn(*tp, state, error); + } + +out: + if (error) + trace_xfs_defer_finish_error((*tp)->t_mountp, dop, error); + else + trace_xfs_defer_finish_done((*tp)->t_mountp, dop); + return error; +} + +/* + * Free up any items left in the list. + */ +void +xfs_defer_cancel( + struct xfs_defer_ops *dop) +{ + struct xfs_defer_pending *dfp; + struct xfs_defer_pending *pli; + struct list_head *pwi; + struct list_head *n; + + trace_xfs_defer_cancel(NULL, dop); + + /* + * Free the pending items. Caller should already have arranged + * for the intent items to be released. + */ + list_for_each_entry_safe(dfp, pli, &dop->dop_intake, dfp_list) { + trace_xfs_defer_intake_cancel(NULL, dfp); + list_del(&dfp->dfp_list); + list_for_each_safe(pwi, n, &dfp->dfp_work) { + list_del(pwi); + dfp->dfp_count--; + dfp->dfp_type->cancel_item(pwi); + } + ASSERT(dfp->dfp_count == 0); + kmem_free(dfp); + } + list_for_each_entry_safe(dfp, pli, &dop->dop_pending, dfp_list) { + trace_xfs_defer_pending_cancel(NULL, dfp); + list_del(&dfp->dfp_list); + list_for_each_safe(pwi, n, &dfp->dfp_work) { + list_del(pwi); + dfp->dfp_count--; + dfp->dfp_type->cancel_item(pwi); + } + ASSERT(dfp->dfp_count == 0); + kmem_free(dfp); + } +} + +/* Add an item for later deferred processing. */ +void +xfs_defer_add( + struct xfs_defer_ops *dop, + enum xfs_defer_ops_type type, + struct list_head *li) +{ + struct xfs_defer_pending *dfp = NULL; + + /* + * Add the item to a pending item at the end of the intake list. + * If the last pending item has the same type, reuse it. Else, + * create a new pending item at the end of the intake list. + */ + if (!list_empty(&dop->dop_intake)) { + dfp = list_last_entry(&dop->dop_intake, + struct xfs_defer_pending, dfp_list); + if (dfp->dfp_type->type != type || + (dfp->dfp_type->max_items && + dfp->dfp_count >= dfp->dfp_type->max_items)) + dfp = NULL; + } + if (!dfp) { + dfp = kmem_alloc(sizeof(struct xfs_defer_pending), + KM_SLEEP | KM_NOFS); + dfp->dfp_type = defer_op_types[type]; + dfp->dfp_intent = NULL; + dfp->dfp_done = NULL; + dfp->dfp_count = 0; + INIT_LIST_HEAD(&dfp->dfp_work); + list_add_tail(&dfp->dfp_list, &dop->dop_intake); + } + + list_add_tail(li, &dfp->dfp_work); + dfp->dfp_count++; +} + +/* Initialize a deferred operation list. */ +void +xfs_defer_init_op_type( + const struct xfs_defer_op_type *type) +{ + defer_op_types[type->type] = type; +} + +/* Initialize a deferred operation. */ +void +xfs_defer_init( + struct xfs_defer_ops *dop, + xfs_fsblock_t *fbp) +{ + dop->dop_committed = false; + dop->dop_low = false; + memset(&dop->dop_inodes, 0, sizeof(dop->dop_inodes)); + *fbp = NULLFSBLOCK; + INIT_LIST_HEAD(&dop->dop_intake); + INIT_LIST_HEAD(&dop->dop_pending); + trace_xfs_defer_init(NULL, dop); +} diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h new file mode 100644 index 000000000..e96533d17 --- /dev/null +++ b/fs/xfs/libxfs/xfs_defer.h @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_DEFER_H__ +#define __XFS_DEFER_H__ + +struct xfs_defer_op_type; + +/* + * Save a log intent item and a list of extents, so that we can replay + * whatever action had to happen to the extent list and file the log done + * item. + */ +struct xfs_defer_pending { + const struct xfs_defer_op_type *dfp_type; /* function pointers */ + struct list_head dfp_list; /* pending items */ + void *dfp_intent; /* log intent item */ + void *dfp_done; /* log done item */ + struct list_head dfp_work; /* work items */ + unsigned int dfp_count; /* # extent items */ +}; + +/* + * Header for deferred operation list. + * + * dop_low is used by the allocator to activate the lowspace algorithm - + * when free space is running low the extent allocator may choose to + * allocate an extent from an AG without leaving sufficient space for + * a btree split when inserting the new extent. In this case the allocator + * will enable the lowspace algorithm which is supposed to allow further + * allocations (such as btree splits and newroots) to allocate from + * sequential AGs. In order to avoid locking AGs out of order the lowspace + * algorithm will start searching for free space from AG 0. If the correct + * transaction reservations have been made then this algorithm will eventually + * find all the space it needs. + */ +enum xfs_defer_ops_type { + XFS_DEFER_OPS_TYPE_RMAP, + XFS_DEFER_OPS_TYPE_FREE, + XFS_DEFER_OPS_TYPE_MAX, +}; + +#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */ + +struct xfs_defer_ops { + bool dop_committed; /* did any trans commit? */ + bool dop_low; /* alloc in low mode */ + struct list_head dop_intake; /* unlogged pending work */ + struct list_head dop_pending; /* logged pending work */ + + /* relog these inodes with each roll */ + struct xfs_inode *dop_inodes[XFS_DEFER_OPS_NR_INODES]; +}; + +void xfs_defer_add(struct xfs_defer_ops *dop, enum xfs_defer_ops_type type, + struct list_head *h); +int xfs_defer_finish(struct xfs_trans **tp, struct xfs_defer_ops *dop, + struct xfs_inode *ip); +void xfs_defer_cancel(struct xfs_defer_ops *dop); +void xfs_defer_init(struct xfs_defer_ops *dop, xfs_fsblock_t *fbp); +bool xfs_defer_has_unfinished_work(struct xfs_defer_ops *dop); +int xfs_defer_join(struct xfs_defer_ops *dop, struct xfs_inode *ip); + +/* Description of a deferred type. */ +struct xfs_defer_op_type { + enum xfs_defer_ops_type type; + unsigned int max_items; + void (*abort_intent)(void *); + void *(*create_done)(struct xfs_trans *, void *, unsigned int); + int (*finish_item)(struct xfs_trans *, struct xfs_defer_ops *, + struct list_head *, void *, void **); + void (*finish_cleanup)(struct xfs_trans *, void *, int); + void (*cancel_item)(struct list_head *); + int (*diff_items)(void *, struct list_head *, struct list_head *); + void *(*create_intent)(struct xfs_trans *, uint); + void (*log_item)(struct xfs_trans *, void *, struct list_head *); +}; + +void xfs_defer_init_op_type(const struct xfs_defer_op_type *type); + +#endif /* __XFS_DEFER_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index af0f9d171..20a96dd5a 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -21,6 +21,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_inode.h" @@ -259,7 +260,7 @@ xfs_dir_createname( struct xfs_name *name, xfs_ino_t inum, /* new entry inode number */ xfs_fsblock_t *first, /* bmap's firstblock */ - xfs_bmap_free_t *flist, /* bmap's freeblock list */ + struct xfs_defer_ops *dfops, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; @@ -286,7 +287,7 @@ xfs_dir_createname( args->inumber = inum; args->dp = dp; args->firstblock = first; - args->flist = flist; + args->dfops = dfops; args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; @@ -436,7 +437,7 @@ xfs_dir_removename( struct xfs_name *name, xfs_ino_t ino, xfs_fsblock_t *first, /* bmap's firstblock */ - xfs_bmap_free_t *flist, /* bmap's freeblock list */ + struct xfs_defer_ops *dfops, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; @@ -458,7 +459,7 @@ xfs_dir_removename( args->inumber = ino; args->dp = dp; args->firstblock = first; - args->flist = flist; + args->dfops = dfops; args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; @@ -498,7 +499,7 @@ xfs_dir_replace( struct xfs_name *name, /* name of entry to replace */ xfs_ino_t inum, /* new inode number */ xfs_fsblock_t *first, /* bmap's firstblock */ - xfs_bmap_free_t *flist, /* bmap's freeblock list */ + struct xfs_defer_ops *dfops, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; @@ -523,7 +524,7 @@ xfs_dir_replace( args->inumber = inum; args->dp = dp; args->firstblock = first; - args->flist = flist; + args->dfops = dfops; args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; @@ -680,7 +681,7 @@ xfs_dir2_shrink_inode( /* Unmap the fsblock(s). */ error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, 0, 0, - args->firstblock, args->flist, &done); + args->firstblock, args->dfops, &done); if (error) { /* * ENOSPC actually can happen if we're in a removename with no diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index e55353651..becc926c3 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -18,7 +18,7 @@ #ifndef __XFS_DIR2_H__ #define __XFS_DIR2_H__ -struct xfs_bmap_free; +struct xfs_defer_ops; struct xfs_da_args; struct xfs_inode; struct xfs_mount; @@ -129,18 +129,18 @@ extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t inum, xfs_fsblock_t *first, - struct xfs_bmap_free *flist, xfs_extlen_t tot); + struct xfs_defer_ops *dfops, xfs_extlen_t tot); extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t *inum, struct xfs_name *ci_name); extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t ino, xfs_fsblock_t *first, - struct xfs_bmap_free *flist, xfs_extlen_t tot); + struct xfs_defer_ops *dfops, xfs_extlen_t tot); extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t inum, xfs_fsblock_t *first, - struct xfs_bmap_free *flist, xfs_extlen_t tot); + struct xfs_defer_ops *dfops, xfs_extlen_t tot); extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name); diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index e5bb9cc3b..c6809ff41 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -126,13 +126,12 @@ xfs_dir2_block_sfsize( /* * Calculate the new size, see if we should give up yet. */ - size = xfs_dir2_sf_hdr_size(i8count) + /* header */ - count + /* namelen */ - count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */ - namelen + /* name */ - (i8count ? /* inumber */ - (uint)sizeof(xfs_dir2_ino8_t) * count : - (uint)sizeof(xfs_dir2_ino4_t) * count); + size = xfs_dir2_sf_hdr_size(i8count) + /* header */ + count * 3 * sizeof(u8) + /* namelen + offset */ + namelen + /* name */ + (i8count ? /* inumber */ + count * XFS_INO64_SIZE : + count * XFS_INO32_SIZE); if (size > XFS_IFORK_DSIZE(dp)) return size; /* size value is a failure */ } @@ -319,10 +318,7 @@ xfs_dir2_sf_addname( /* * Yes, adjust the inode size. old count + (parent + new) */ - incr_isize += - (sfp->count + 2) * - ((uint)sizeof(xfs_dir2_ino8_t) - - (uint)sizeof(xfs_dir2_ino4_t)); + incr_isize += (sfp->count + 2) * XFS_INO64_DIFF; objchange = 1; } @@ -897,11 +893,7 @@ xfs_dir2_sf_replace( int error; /* error return value */ int newsize; /* new inode size */ - newsize = - dp->i_df.if_bytes + - (sfp->count + 1) * - ((uint)sizeof(xfs_dir2_ino8_t) - - (uint)sizeof(xfs_dir2_ino4_t)); + newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF; /* * Won't fit as shortform, convert to block then do replace. */ @@ -1022,10 +1014,7 @@ xfs_dir2_sf_toino4( /* * Compute the new inode size. */ - newsize = - oldsize - - (oldsfp->count + 1) * - ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); + newsize = oldsize - (oldsfp->count + 1) * XFS_INO64_DIFF; xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); /* @@ -1048,7 +1037,7 @@ xfs_dir2_sf_toino4( i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; - sfep->offset = oldsfep->offset; + memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset)); memcpy(sfep->name, oldsfep->name, sfep->namelen); dp->d_ops->sf_put_ino(sfp, sfep, dp->d_ops->sf_get_ino(oldsfp, oldsfep)); @@ -1098,10 +1087,7 @@ xfs_dir2_sf_toino8( /* * Compute the new inode size (nb: entry count + 1 for parent) */ - newsize = - oldsize + - (oldsfp->count + 1) * - ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); + newsize = oldsize + (oldsfp->count + 1) * XFS_INO64_DIFF; xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); /* @@ -1124,7 +1110,7 @@ xfs_dir2_sf_toino8( i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; - sfep->offset = oldsfep->offset; + memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset)); memcpy(sfep->name, oldsfep->name, sfep->namelen); dp->d_ops->sf_put_ino(sfp, sfep, dp->d_ops->sf_get_ino(oldsfp, oldsfep)); diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index dc97eb21a..270fb5cf4 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -455,8 +455,10 @@ xfs_sb_has_compat_feature( } #define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ +#define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ #define XFS_SB_FEAT_RO_COMPAT_ALL \ - (XFS_SB_FEAT_RO_COMPAT_FINOBT) + (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ + XFS_SB_FEAT_RO_COMPAT_RMAPBT) #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( @@ -538,6 +540,12 @@ static inline bool xfs_sb_version_hasmetauuid(struct xfs_sb *sbp) (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID); } +static inline bool xfs_sb_version_hasrmapbt(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && + (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT); +} + /* * end of superblock version macros */ @@ -598,10 +606,10 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) #define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION) /* - * Btree number 0 is bno, 1 is cnt. This value gives the size of the + * Btree number 0 is bno, 1 is cnt, 2 is rmap. This value gives the size of the * arrays below. */ -#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1) +#define XFS_BTNUM_AGF ((int)XFS_BTNUM_RMAPi + 1) /* * The second word of agf_levels in the first a.g. overlaps the EFS @@ -618,12 +626,10 @@ typedef struct xfs_agf { __be32 agf_seqno; /* sequence # starting from 0 */ __be32 agf_length; /* size in blocks of a.g. */ /* - * Freespace information + * Freespace and rmap information */ __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */ - __be32 agf_spare0; /* spare field */ __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */ - __be32 agf_spare1; /* spare field */ __be32 agf_flfirst; /* first freelist block's index */ __be32 agf_fllast; /* last freelist block's index */ @@ -634,12 +640,15 @@ typedef struct xfs_agf { __be32 agf_btreeblks; /* # of blocks held in AGF btrees */ uuid_t agf_uuid; /* uuid of filesystem */ + __be32 agf_rmap_blocks; /* rmapbt blocks used */ + __be32 agf_padding; /* padding */ + /* * reserve some contiguous space for future logged fields before we add * the unlogged fields. This makes the range logging via flags and * structure offsets much simpler. */ - __be64 agf_spare64[16]; + __be64 agf_spare64[15]; /* unlogged fields, written during buffer writeback. */ __be64 agf_lsn; /* last write sequence */ @@ -664,7 +673,9 @@ typedef struct xfs_agf { #define XFS_AGF_LONGEST 0x00000400 #define XFS_AGF_BTREEBLKS 0x00000800 #define XFS_AGF_UUID 0x00001000 -#define XFS_AGF_NUM_BITS 13 +#define XFS_AGF_RMAP_BLOCKS 0x00002000 +#define XFS_AGF_SPARE64 0x00004000 +#define XFS_AGF_NUM_BITS 15 #define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) #define XFS_AGF_FLAGS \ @@ -680,7 +691,9 @@ typedef struct xfs_agf { { XFS_AGF_FREEBLKS, "FREEBLKS" }, \ { XFS_AGF_LONGEST, "LONGEST" }, \ { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \ - { XFS_AGF_UUID, "UUID" } + { XFS_AGF_UUID, "UUID" }, \ + { XFS_AGF_RMAP_BLOCKS, "RMAP_BLOCKS" }, \ + { XFS_AGF_SPARE64, "SPARE64" } /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) @@ -1308,17 +1321,118 @@ typedef __be32 xfs_inobt_ptr_t; #define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) /* - * The first data block of an AG depends on whether the filesystem was formatted - * with the finobt feature. If so, account for the finobt reserved root btree - * block. + * Reverse mapping btree format definitions + * + * There is a btree for the reverse map per allocation group + */ +#define XFS_RMAP_CRC_MAGIC 0x524d4233 /* 'RMB3' */ + +/* + * Ownership info for an extent. This is used to create reverse-mapping + * entries. + */ +#define XFS_OWNER_INFO_ATTR_FORK (1 << 0) +#define XFS_OWNER_INFO_BMBT_BLOCK (1 << 1) +struct xfs_owner_info { + uint64_t oi_owner; + xfs_fileoff_t oi_offset; + unsigned int oi_flags; +}; + +/* + * Special owner types. + * + * Seeing as we only support up to 8EB, we have the upper bit of the owner field + * to tell us we have a special owner value. We use these for static metadata + * allocated at mkfs/growfs time, as well as for freespace management metadata. + */ +#define XFS_RMAP_OWN_NULL (-1ULL) /* No owner, for growfs */ +#define XFS_RMAP_OWN_UNKNOWN (-2ULL) /* Unknown owner, for EFI recovery */ +#define XFS_RMAP_OWN_FS (-3ULL) /* static fs metadata */ +#define XFS_RMAP_OWN_LOG (-4ULL) /* static fs metadata */ +#define XFS_RMAP_OWN_AG (-5ULL) /* AG freespace btree blocks */ +#define XFS_RMAP_OWN_INOBT (-6ULL) /* Inode btree blocks */ +#define XFS_RMAP_OWN_INODES (-7ULL) /* Inode chunk */ +#define XFS_RMAP_OWN_MIN (-8ULL) /* guard */ + +#define XFS_RMAP_NON_INODE_OWNER(owner) (!!((owner) & (1ULL << 63))) + +/* + * Data record structure + */ +struct xfs_rmap_rec { + __be32 rm_startblock; /* extent start block */ + __be32 rm_blockcount; /* extent length */ + __be64 rm_owner; /* extent owner */ + __be64 rm_offset; /* offset within the owner */ +}; + +/* + * rmap btree record + * rm_offset:63 is the attribute fork flag + * rm_offset:62 is the bmbt block flag + * rm_offset:61 is the unwritten extent flag (same as l0:63 in bmbt) + * rm_offset:54-60 aren't used and should be zero + * rm_offset:0-53 is the block offset within the inode + */ +#define XFS_RMAP_OFF_ATTR_FORK ((__uint64_t)1ULL << 63) +#define XFS_RMAP_OFF_BMBT_BLOCK ((__uint64_t)1ULL << 62) +#define XFS_RMAP_OFF_UNWRITTEN ((__uint64_t)1ULL << 61) + +#define XFS_RMAP_LEN_MAX ((__uint32_t)~0U) +#define XFS_RMAP_OFF_FLAGS (XFS_RMAP_OFF_ATTR_FORK | \ + XFS_RMAP_OFF_BMBT_BLOCK | \ + XFS_RMAP_OFF_UNWRITTEN) +#define XFS_RMAP_OFF_MASK ((__uint64_t)0x3FFFFFFFFFFFFFULL) + +#define XFS_RMAP_OFF(off) ((off) & XFS_RMAP_OFF_MASK) + +#define XFS_RMAP_IS_BMBT_BLOCK(off) (!!((off) & XFS_RMAP_OFF_BMBT_BLOCK)) +#define XFS_RMAP_IS_ATTR_FORK(off) (!!((off) & XFS_RMAP_OFF_ATTR_FORK)) +#define XFS_RMAP_IS_UNWRITTEN(len) (!!((off) & XFS_RMAP_OFF_UNWRITTEN)) + +#define RMAPBT_STARTBLOCK_BITLEN 32 +#define RMAPBT_BLOCKCOUNT_BITLEN 32 +#define RMAPBT_OWNER_BITLEN 64 +#define RMAPBT_ATTRFLAG_BITLEN 1 +#define RMAPBT_BMBTFLAG_BITLEN 1 +#define RMAPBT_EXNTFLAG_BITLEN 1 +#define RMAPBT_UNUSED_OFFSET_BITLEN 7 +#define RMAPBT_OFFSET_BITLEN 54 + +#define XFS_RMAP_ATTR_FORK (1 << 0) +#define XFS_RMAP_BMBT_BLOCK (1 << 1) +#define XFS_RMAP_UNWRITTEN (1 << 2) +#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \ + XFS_RMAP_BMBT_BLOCK) +#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN) +struct xfs_rmap_irec { + xfs_agblock_t rm_startblock; /* extent start block */ + xfs_extlen_t rm_blockcount; /* extent length */ + __uint64_t rm_owner; /* extent owner */ + __uint64_t rm_offset; /* offset within the owner */ + unsigned int rm_flags; /* state flags */ +}; + +/* + * Key structure + * + * We don't use the length for lookups */ -#define XFS_PREALLOC_BLOCKS(mp) \ +struct xfs_rmap_key { + __be32 rm_startblock; /* extent start block */ + __be64 rm_owner; /* extent owner */ + __be64 rm_offset; /* offset within the owner */ +} __attribute__((packed)); + +/* btree pointer type */ +typedef __be32 xfs_rmap_ptr_t; + +#define XFS_RMAP_BLOCK(mp) \ (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \ XFS_FIBT_BLOCK(mp) + 1 : \ XFS_IBT_BLOCK(mp) + 1) - - /* * BMAP Btree format definitions * @@ -1435,41 +1549,57 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; * with the crc feature bit, and all accesses to them must be conditional on * that flag. */ +/* short form block header */ +struct xfs_btree_block_shdr { + __be32 bb_leftsib; + __be32 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be32 bb_owner; + __le32 bb_crc; +}; + +/* long form block header */ +struct xfs_btree_block_lhdr { + __be64 bb_leftsib; + __be64 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be64 bb_owner; + __le32 bb_crc; + __be32 bb_pad; /* padding for alignment */ +}; + struct xfs_btree_block { __be32 bb_magic; /* magic number for block type */ __be16 bb_level; /* 0 is a leaf */ __be16 bb_numrecs; /* current # of data records */ union { - struct { - __be32 bb_leftsib; - __be32 bb_rightsib; - - __be64 bb_blkno; - __be64 bb_lsn; - uuid_t bb_uuid; - __be32 bb_owner; - __le32 bb_crc; - } s; /* short form pointers */ - struct { - __be64 bb_leftsib; - __be64 bb_rightsib; - - __be64 bb_blkno; - __be64 bb_lsn; - uuid_t bb_uuid; - __be64 bb_owner; - __le32 bb_crc; - __be32 bb_pad; /* padding for alignment */ - } l; /* long form pointers */ + struct xfs_btree_block_shdr s; + struct xfs_btree_block_lhdr l; } bb_u; /* rest */ }; -#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ -#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ +/* size of a short form block */ +#define XFS_BTREE_SBLOCK_LEN \ + (offsetof(struct xfs_btree_block, bb_u) + \ + offsetof(struct xfs_btree_block_shdr, bb_blkno)) +/* size of a long form block */ +#define XFS_BTREE_LBLOCK_LEN \ + (offsetof(struct xfs_btree_block, bb_u) + \ + offsetof(struct xfs_btree_block_lhdr, bb_blkno)) /* sizes of CRC enabled btree blocks */ -#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) -#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) +#define XFS_BTREE_SBLOCK_CRC_LEN \ + (offsetof(struct xfs_btree_block, bb_u) + \ + sizeof(struct xfs_btree_block_shdr)) +#define XFS_BTREE_LBLOCK_CRC_LEN \ + (offsetof(struct xfs_btree_block, bb_u) + \ + sizeof(struct xfs_btree_block_lhdr)) #define XFS_BTREE_SBLOCK_CRC_OFF \ offsetof(struct xfs_btree_block, bb_u.s.bb_crc) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index fffe3d01b..79455058b 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -206,6 +206,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ #define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ #define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */ +#define XFS_FSOP_GEOM_FLAGS_RMAPBT 0x80000 /* Reverse mapping btree */ /* * Minimum and maximum sizes need for growth checks. @@ -521,12 +522,8 @@ typedef struct xfs_swapext #define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) /* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ -/* XFS_IOC_FREEZE -- FIFREEZE 119 */ -/* XFS_IOC_THAW -- FITHAW 120 */ -#ifndef FIFREEZE -#define XFS_IOC_FREEZE _IOWR('X', 119, int) -#define XFS_IOC_THAW _IOWR('X', 120, int) -#endif +#define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */ +#define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */ #define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 22297f9b0..51b4e0de1 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -24,6 +24,7 @@ #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_ialloc.h" @@ -39,6 +40,7 @@ #include "xfs_icache.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_rmap.h" /* @@ -614,6 +616,7 @@ xfs_ialloc_ag_alloc( args.tp = tp; args.mp = tp->t_mountp; args.fsbno = NULLFSBLOCK; + xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INODES); #ifdef DEBUG /* randomly do sparse inode allocations */ @@ -1817,20 +1820,21 @@ xfs_difree_inode_chunk( struct xfs_mount *mp, xfs_agnumber_t agno, struct xfs_inobt_rec_incore *rec, - struct xfs_bmap_free *flist) + struct xfs_defer_ops *dfops) { xfs_agblock_t sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino); int startidx, endidx; int nextbit; xfs_agblock_t agbno; int contigblk; + struct xfs_owner_info oinfo; DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, - XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)), - mp->m_ialloc_blks, flist, mp); + xfs_bmap_add_free(mp, dfops, XFS_AGB_TO_FSB(mp, agno, sagbno), + mp->m_ialloc_blks, &oinfo); return; } @@ -1873,8 +1877,8 @@ xfs_difree_inode_chunk( ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, - flist, mp); + xfs_bmap_add_free(mp, dfops, XFS_AGB_TO_FSB(mp, agno, agbno), + contigblk, &oinfo); /* reset range to current bit and carry on... */ startidx = endidx = nextbit; @@ -1890,7 +1894,7 @@ xfs_difree_inobt( struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agino_t agino, - struct xfs_bmap_free *flist, + struct xfs_defer_ops *dfops, struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { @@ -1977,7 +1981,7 @@ xfs_difree_inobt( goto error0; } - xfs_difree_inode_chunk(mp, agno, &rec, flist); + xfs_difree_inode_chunk(mp, agno, &rec, dfops); } else { xic->deleted = 0; @@ -2122,7 +2126,7 @@ int xfs_difree( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t inode, /* inode to be freed */ - struct xfs_bmap_free *flist, /* extents to free */ + struct xfs_defer_ops *dfops, /* extents to free */ struct xfs_icluster *xic) /* cluster info if deleted */ { /* REFERENCED */ @@ -2174,7 +2178,7 @@ xfs_difree( /* * Fix up the inode allocation btree. */ - error = xfs_difree_inobt(mp, tp, agbp, agino, flist, xic, &rec); + error = xfs_difree_inobt(mp, tp, agbp, agino, dfops, xic, &rec); if (error) goto error0; @@ -2395,20 +2399,11 @@ void xfs_ialloc_compute_maxlevels( xfs_mount_t *mp) /* file system mount structure */ { - int level; - uint maxblocks; - uint maxleafents; - int minleafrecs; - int minnoderecs; - - maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >> - XFS_INODES_PER_CHUNK_LOG; - minleafrecs = mp->m_inobt_mnr[0]; - minnoderecs = mp->m_inobt_mnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; - for (level = 1; maxblocks > 1; level++) - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; - mp->m_in_maxlevels = level; + uint inodes; + + inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG; + mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_inobt_mnr, + inodes); } /* diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 6e450df29..0bb89669f 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -95,7 +95,7 @@ int /* error */ xfs_difree( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t inode, /* inode to be freed */ - struct xfs_bmap_free *flist, /* extents to free */ + struct xfs_defer_ops *dfops, /* extents to free */ struct xfs_icluster *ifree); /* cluster info if deleted */ /* diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 89c21d771..31ca2208c 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -32,6 +32,7 @@ #include "xfs_trace.h" #include "xfs_cksum.h" #include "xfs_trans.h" +#include "xfs_rmap.h" STATIC int @@ -96,6 +97,7 @@ xfs_inobt_alloc_block( memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; + xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INOBT); args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno); args.minlen = 1; args.maxlen = 1; @@ -125,8 +127,12 @@ xfs_inobt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { + struct xfs_owner_info oinfo; + + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); return xfs_free_extent(cur->bc_tp, - XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1); + XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1, + &oinfo); } STATIC int @@ -145,14 +151,6 @@ xfs_inobt_init_key_from_rec( key->inobt.ir_startino = rec->inobt.ir_startino; } -STATIC void -xfs_inobt_init_rec_from_key( - union xfs_btree_key *key, - union xfs_btree_rec *rec) -{ - rec->inobt.ir_startino = key->inobt.ir_startino; -} - STATIC void xfs_inobt_init_rec_from_cur( struct xfs_btree_cur *cur, @@ -314,7 +312,6 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .get_minrecs = xfs_inobt_get_minrecs, .get_maxrecs = xfs_inobt_get_maxrecs, .init_key_from_rec = xfs_inobt_init_key_from_rec, - .init_rec_from_key = xfs_inobt_init_rec_from_key, .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, @@ -336,7 +333,6 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .get_minrecs = xfs_inobt_get_minrecs, .get_maxrecs = xfs_inobt_get_maxrecs, .init_key_from_rec = xfs_inobt_init_key_from_rec, - .init_rec_from_key = xfs_inobt_init_rec_from_key, .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 9d9559eb2..4b9769e23 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -22,6 +22,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_error.h" #include "xfs_cksum.h" diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index e8f49c029..a6eed43fa 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -110,7 +110,9 @@ static inline uint xlog_get_cycle(char *ptr) #define XLOG_REG_TYPE_COMMIT 18 #define XLOG_REG_TYPE_TRANSHDR 19 #define XLOG_REG_TYPE_ICREATE 20 -#define XLOG_REG_TYPE_MAX 20 +#define XLOG_REG_TYPE_RUI_FORMAT 21 +#define XLOG_REG_TYPE_RUD_FORMAT 22 +#define XLOG_REG_TYPE_MAX 22 /* * Flags to log operation header @@ -227,6 +229,8 @@ typedef struct xfs_trans_header { #define XFS_LI_DQUOT 0x123d #define XFS_LI_QUOTAOFF 0x123e #define XFS_LI_ICREATE 0x123f +#define XFS_LI_RUI 0x1240 /* rmap update intent */ +#define XFS_LI_RUD 0x1241 #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -236,7 +240,9 @@ typedef struct xfs_trans_header { { XFS_LI_BUF, "XFS_LI_BUF" }, \ { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \ { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \ - { XFS_LI_ICREATE, "XFS_LI_ICREATE" } + { XFS_LI_ICREATE, "XFS_LI_ICREATE" }, \ + { XFS_LI_RUI, "XFS_LI_RUI" }, \ + { XFS_LI_RUD, "XFS_LI_RUD" } /* * Inode Log Item Format definitions. @@ -603,6 +609,59 @@ typedef struct xfs_efd_log_format_64 { xfs_extent_64_t efd_extents[1]; /* array of extents freed */ } xfs_efd_log_format_64_t; +/* + * RUI/RUD (reverse mapping) log format definitions + */ +struct xfs_map_extent { + __uint64_t me_owner; + __uint64_t me_startblock; + __uint64_t me_startoff; + __uint32_t me_len; + __uint32_t me_flags; +}; + +/* rmap me_flags: upper bits are flags, lower byte is type code */ +#define XFS_RMAP_EXTENT_MAP 1 +#define XFS_RMAP_EXTENT_UNMAP 3 +#define XFS_RMAP_EXTENT_CONVERT 5 +#define XFS_RMAP_EXTENT_ALLOC 7 +#define XFS_RMAP_EXTENT_FREE 8 +#define XFS_RMAP_EXTENT_TYPE_MASK 0xFF + +#define XFS_RMAP_EXTENT_ATTR_FORK (1U << 31) +#define XFS_RMAP_EXTENT_BMBT_BLOCK (1U << 30) +#define XFS_RMAP_EXTENT_UNWRITTEN (1U << 29) + +#define XFS_RMAP_EXTENT_FLAGS (XFS_RMAP_EXTENT_TYPE_MASK | \ + XFS_RMAP_EXTENT_ATTR_FORK | \ + XFS_RMAP_EXTENT_BMBT_BLOCK | \ + XFS_RMAP_EXTENT_UNWRITTEN) + +/* + * This is the structure used to lay out an rui log item in the + * log. The rui_extents field is a variable size array whose + * size is given by rui_nextents. + */ +struct xfs_rui_log_format { + __uint16_t rui_type; /* rui log item type */ + __uint16_t rui_size; /* size of this item */ + __uint32_t rui_nextents; /* # extents to free */ + __uint64_t rui_id; /* rui identifier */ + struct xfs_map_extent rui_extents[1]; /* array of extents to rmap */ +}; + +/* + * This is the structure used to lay out an rud log item in the + * log. The rud_extents array is a variable size array whose + * size is given by rud_nextents; + */ +struct xfs_rud_log_format { + __uint16_t rud_type; /* rud log item type */ + __uint16_t rud_size; /* size of this item */ + __uint32_t __pad; + __uint64_t rud_rui_id; /* id of corresponding rui */ +}; + /* * Dquot Log format definitions. * diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c new file mode 100644 index 000000000..73d05407d --- /dev/null +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -0,0 +1,1399 @@ +/* + * Copyright (c) 2014 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_btree.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_error.h" +#include "xfs_extent_busy.h" +#include "xfs_bmap.h" +#include "xfs_inode.h" + +/* + * Lookup the first record less than or equal to [bno, len, owner, offset] + * in the btree given by cur. + */ +int +xfs_rmap_lookup_le( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + uint64_t owner, + uint64_t offset, + unsigned int flags, + int *stat) +{ + cur->bc_rec.r.rm_startblock = bno; + cur->bc_rec.r.rm_blockcount = len; + cur->bc_rec.r.rm_owner = owner; + cur->bc_rec.r.rm_offset = offset; + cur->bc_rec.r.rm_flags = flags; + return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); +} + +/* + * Lookup the record exactly matching [bno, len, owner, offset] + * in the btree given by cur. + */ +int +xfs_rmap_lookup_eq( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + uint64_t owner, + uint64_t offset, + unsigned int flags, + int *stat) +{ + cur->bc_rec.r.rm_startblock = bno; + cur->bc_rec.r.rm_blockcount = len; + cur->bc_rec.r.rm_owner = owner; + cur->bc_rec.r.rm_offset = offset; + cur->bc_rec.r.rm_flags = flags; + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); +} + +/* + * Update the record referred to by cur to the value given + * by [bno, len, owner, offset]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int +xfs_rmap_update( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *irec) +{ + union xfs_btree_rec rec; + int error; + + trace_xfs_rmap_update(cur->bc_mp, cur->bc_private.a.agno, + irec->rm_startblock, irec->rm_blockcount, + irec->rm_owner, irec->rm_offset, irec->rm_flags); + + rec.rmap.rm_startblock = cpu_to_be32(irec->rm_startblock); + rec.rmap.rm_blockcount = cpu_to_be32(irec->rm_blockcount); + rec.rmap.rm_owner = cpu_to_be64(irec->rm_owner); + rec.rmap.rm_offset = cpu_to_be64( + xfs_rmap_irec_offset_pack(irec)); + error = xfs_btree_update(cur, &rec); + if (error) + trace_xfs_rmap_update_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +int +xfs_rmap_insert( + struct xfs_btree_cur *rcur, + xfs_agblock_t agbno, + xfs_extlen_t len, + uint64_t owner, + uint64_t offset, + unsigned int flags) +{ + int i; + int error; + + trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_private.a.agno, agbno, + len, owner, offset, flags); + + error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 0, done); + + rcur->bc_rec.r.rm_startblock = agbno; + rcur->bc_rec.r.rm_blockcount = len; + rcur->bc_rec.r.rm_owner = owner; + rcur->bc_rec.r.rm_offset = offset; + rcur->bc_rec.r.rm_flags = flags; + error = xfs_btree_insert(rcur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done); +done: + if (error) + trace_xfs_rmap_insert_error(rcur->bc_mp, + rcur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +static int +xfs_rmap_btrec_to_irec( + union xfs_btree_rec *rec, + struct xfs_rmap_irec *irec) +{ + irec->rm_flags = 0; + irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock); + irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount); + irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner); + return xfs_rmap_irec_offset_unpack(be64_to_cpu(rec->rmap.rm_offset), + irec); +} + +/* + * Get the data from the pointed-to record. + */ +int +xfs_rmap_get_rec( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *irec, + int *stat) +{ + union xfs_btree_rec *rec; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (error || !*stat) + return error; + + return xfs_rmap_btrec_to_irec(rec, irec); +} + +/* + * Find the extent in the rmap btree and remove it. + * + * The record we find should always be an exact match for the extent that we're + * looking for, since we insert them into the btree without modification. + * + * Special Case #1: when growing the filesystem, we "free" an extent when + * growing the last AG. This extent is new space and so it is not tracked as + * used space in the btree. The growfs code will pass in an owner of + * XFS_RMAP_OWN_NULL to indicate that it expected that there is no owner of this + * extent. We verify that - the extent lookup result in a record that does not + * overlap. + * + * Special Case #2: EFIs do not record the owner of the extent, so when + * recovering EFIs from the log we pass in XFS_RMAP_OWN_UNKNOWN to tell the rmap + * btree to ignore the owner (i.e. wildcard match) so we don't trigger + * corruption checks during log recovery. + */ +STATIC int +xfs_rmap_unmap( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec ltrec; + uint64_t ltoff; + int error = 0; + int i; + uint64_t owner; + uint64_t offset; + unsigned int flags; + bool ignore_off; + + xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); + ignore_off = XFS_RMAP_NON_INODE_OWNER(owner) || + (flags & XFS_RMAP_BMBT_BLOCK); + if (unwritten) + flags |= XFS_RMAP_UNWRITTEN; + trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); + + /* + * We should always have a left record because there's a static record + * for the AG headers at rm_startblock == 0 created by mkfs/growfs that + * will not ever be removed from the tree. + */ + error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + + error = xfs_rmap_get_rec(cur, <rec, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, + cur->bc_private.a.agno, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags); + ltoff = ltrec.rm_offset; + + /* + * For growfs, the incoming extent must be beyond the left record we + * just found as it is new space and won't be used by anyone. This is + * just a corruption check as we don't actually do anything with this + * extent. Note that we need to use >= instead of > because it might + * be the case that the "left" extent goes all the way to EOFS. + */ + if (owner == XFS_RMAP_OWN_NULL) { + XFS_WANT_CORRUPTED_GOTO(mp, bno >= ltrec.rm_startblock + + ltrec.rm_blockcount, out_error); + goto out_done; + } + + /* Make sure the unwritten flag matches. */ + XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == + (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error); + + /* Make sure the extent we found covers the entire freeing range. */ + XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno && + ltrec.rm_startblock + ltrec.rm_blockcount >= + bno + len, out_error); + + /* Make sure the owner matches what we expect to find in the tree. */ + XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner || + XFS_RMAP_NON_INODE_OWNER(owner), out_error); + + /* Check the offset, if necessary. */ + if (!XFS_RMAP_NON_INODE_OWNER(owner)) { + if (flags & XFS_RMAP_BMBT_BLOCK) { + XFS_WANT_CORRUPTED_GOTO(mp, + ltrec.rm_flags & XFS_RMAP_BMBT_BLOCK, + out_error); + } else { + XFS_WANT_CORRUPTED_GOTO(mp, + ltrec.rm_offset <= offset, out_error); + XFS_WANT_CORRUPTED_GOTO(mp, + ltoff + ltrec.rm_blockcount >= offset + len, + out_error); + } + } + + if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { + /* exact match, simply remove the record from rmap tree */ + trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + ltrec.rm_startblock, ltrec.rm_blockcount, + ltrec.rm_owner, ltrec.rm_offset, + ltrec.rm_flags); + error = xfs_btree_delete(cur, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + } else if (ltrec.rm_startblock == bno) { + /* + * overlap left hand side of extent: move the start, trim the + * length and update the current record. + * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrrrrrrr| + * bno len + */ + ltrec.rm_startblock += len; + ltrec.rm_blockcount -= len; + if (!ignore_off) + ltrec.rm_offset += len; + error = xfs_rmap_update(cur, <rec); + if (error) + goto out_error; + } else if (ltrec.rm_startblock + ltrec.rm_blockcount == bno + len) { + /* + * overlap right hand side of extent: trim the length and update + * the current record. + * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrrrrrrr| + * bno len + */ + ltrec.rm_blockcount -= len; + error = xfs_rmap_update(cur, <rec); + if (error) + goto out_error; + } else { + + /* + * overlap middle of extent: trim the length of the existing + * record to the length of the new left-extent size, increment + * the insertion position so we can insert a new record + * containing the remaining right-extent space. + * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrr| |rrrr| + * bno len + */ + xfs_extlen_t orig_len = ltrec.rm_blockcount; + + ltrec.rm_blockcount = bno - ltrec.rm_startblock; + error = xfs_rmap_update(cur, <rec); + if (error) + goto out_error; + + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto out_error; + + cur->bc_rec.r.rm_startblock = bno + len; + cur->bc_rec.r.rm_blockcount = orig_len - len - + ltrec.rm_blockcount; + cur->bc_rec.r.rm_owner = ltrec.rm_owner; + if (ignore_off) + cur->bc_rec.r.rm_offset = 0; + else + cur->bc_rec.r.rm_offset = offset + len; + cur->bc_rec.r.rm_flags = flags; + trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, + cur->bc_rec.r.rm_startblock, + cur->bc_rec.r.rm_blockcount, + cur->bc_rec.r.rm_owner, + cur->bc_rec.r.rm_offset, + cur->bc_rec.r.rm_flags); + error = xfs_btree_insert(cur, &i); + if (error) + goto out_error; + } + +out_done: + trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); +out_error: + if (error) + trace_xfs_rmap_unmap_error(mp, cur->bc_private.a.agno, + error, _RET_IP_); + return error; +} + +/* + * Remove a reference to an extent in the rmap btree. + */ +int +xfs_rmap_free( + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_btree_cur *cur; + int error; + + if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + return 0; + + cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno); + + error = xfs_rmap_unmap(cur, bno, len, false, oinfo); + if (error) + goto out_error; + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; + +out_error: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* + * A mergeable rmap must have the same owner and the same values for + * the unwritten, attr_fork, and bmbt flags. The startblock and + * offset are checked separately. + */ +static bool +xfs_rmap_is_mergeable( + struct xfs_rmap_irec *irec, + uint64_t owner, + unsigned int flags) +{ + if (irec->rm_owner == XFS_RMAP_OWN_NULL) + return false; + if (irec->rm_owner != owner) + return false; + if ((flags & XFS_RMAP_UNWRITTEN) ^ + (irec->rm_flags & XFS_RMAP_UNWRITTEN)) + return false; + if ((flags & XFS_RMAP_ATTR_FORK) ^ + (irec->rm_flags & XFS_RMAP_ATTR_FORK)) + return false; + if ((flags & XFS_RMAP_BMBT_BLOCK) ^ + (irec->rm_flags & XFS_RMAP_BMBT_BLOCK)) + return false; + return true; +} + +/* + * When we allocate a new block, the first thing we do is add a reference to + * the extent in the rmap btree. This takes the form of a [agbno, length, + * owner, offset] record. Flags are encoded in the high bits of the offset + * field. + */ +STATIC int +xfs_rmap_map( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec ltrec; + struct xfs_rmap_irec gtrec; + int have_gt; + int have_lt; + int error = 0; + int i; + uint64_t owner; + uint64_t offset; + unsigned int flags = 0; + bool ignore_off; + + xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); + ASSERT(owner != 0); + ignore_off = XFS_RMAP_NON_INODE_OWNER(owner) || + (flags & XFS_RMAP_BMBT_BLOCK); + if (unwritten) + flags |= XFS_RMAP_UNWRITTEN; + trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); + + /* + * For the initial lookup, look for an exact match or the left-adjacent + * record for our insertion point. This will also give us the record for + * start block contiguity tests. + */ + error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, + &have_lt); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, have_lt == 1, out_error); + + error = xfs_rmap_get_rec(cur, <rec, &have_lt); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, have_lt == 1, out_error); + trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, + cur->bc_private.a.agno, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags); + + if (!xfs_rmap_is_mergeable(<rec, owner, flags)) + have_lt = 0; + + XFS_WANT_CORRUPTED_GOTO(mp, + have_lt == 0 || + ltrec.rm_startblock + ltrec.rm_blockcount <= bno, out_error); + + /* + * Increment the cursor to see if we have a right-adjacent record to our + * insertion point. This will give us the record for end block + * contiguity tests. + */ + error = xfs_btree_increment(cur, 0, &have_gt); + if (error) + goto out_error; + if (have_gt) { + error = xfs_rmap_get_rec(cur, >rec, &have_gt); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error); + XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= gtrec.rm_startblock, + out_error); + trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, + cur->bc_private.a.agno, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); + if (!xfs_rmap_is_mergeable(>rec, owner, flags)) + have_gt = 0; + } + + /* + * Note: cursor currently points one record to the right of ltrec, even + * if there is no record in the tree to the right. + */ + if (have_lt && + ltrec.rm_startblock + ltrec.rm_blockcount == bno && + (ignore_off || ltrec.rm_offset + ltrec.rm_blockcount == offset)) { + /* + * left edge contiguous, merge into left record. + * + * ltbno ltlen + * orig: |ooooooooo| + * adding: |aaaaaaaaa| + * result: |rrrrrrrrrrrrrrrrrrr| + * bno len + */ + ltrec.rm_blockcount += len; + if (have_gt && + bno + len == gtrec.rm_startblock && + (ignore_off || offset + len == gtrec.rm_offset) && + (unsigned long)ltrec.rm_blockcount + len + + gtrec.rm_blockcount <= XFS_RMAP_LEN_MAX) { + /* + * right edge also contiguous, delete right record + * and merge into left record. + * + * ltbno ltlen gtbno gtlen + * orig: |ooooooooo| |ooooooooo| + * adding: |aaaaaaaaa| + * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr| + */ + ltrec.rm_blockcount += gtrec.rm_blockcount; + trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + gtrec.rm_startblock, + gtrec.rm_blockcount, + gtrec.rm_owner, + gtrec.rm_offset, + gtrec.rm_flags); + error = xfs_btree_delete(cur, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + } + + /* point the cursor back to the left record and update */ + error = xfs_btree_decrement(cur, 0, &have_gt); + if (error) + goto out_error; + error = xfs_rmap_update(cur, <rec); + if (error) + goto out_error; + } else if (have_gt && + bno + len == gtrec.rm_startblock && + (ignore_off || offset + len == gtrec.rm_offset)) { + /* + * right edge contiguous, merge into right record. + * + * gtbno gtlen + * Orig: |ooooooooo| + * adding: |aaaaaaaaa| + * Result: |rrrrrrrrrrrrrrrrrrr| + * bno len + */ + gtrec.rm_startblock = bno; + gtrec.rm_blockcount += len; + if (!ignore_off) + gtrec.rm_offset = offset; + error = xfs_rmap_update(cur, >rec); + if (error) + goto out_error; + } else { + /* + * no contiguous edge with identical owner, insert + * new record at current cursor position. + */ + cur->bc_rec.r.rm_startblock = bno; + cur->bc_rec.r.rm_blockcount = len; + cur->bc_rec.r.rm_owner = owner; + cur->bc_rec.r.rm_offset = offset; + cur->bc_rec.r.rm_flags = flags; + trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len, + owner, offset, flags); + error = xfs_btree_insert(cur, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + } + + trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); +out_error: + if (error) + trace_xfs_rmap_map_error(mp, cur->bc_private.a.agno, + error, _RET_IP_); + return error; +} + +/* + * Add a reference to an extent in the rmap btree. + */ +int +xfs_rmap_alloc( + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_btree_cur *cur; + int error; + + if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + return 0; + + cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno); + error = xfs_rmap_map(cur, bno, len, false, oinfo); + if (error) + goto out_error; + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; + +out_error: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +#define RMAP_LEFT_CONTIG (1 << 0) +#define RMAP_RIGHT_CONTIG (1 << 1) +#define RMAP_LEFT_FILLING (1 << 2) +#define RMAP_RIGHT_FILLING (1 << 3) +#define RMAP_LEFT_VALID (1 << 6) +#define RMAP_RIGHT_VALID (1 << 7) + +#define LEFT r[0] +#define RIGHT r[1] +#define PREV r[2] +#define NEW r[3] + +/* + * Convert an unwritten extent to a real extent or vice versa. + * Does not handle overlapping extents. + */ +STATIC int +xfs_rmap_convert( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec r[4]; /* neighbor extent entries */ + /* left is 0, right is 1, prev is 2 */ + /* new is 3 */ + uint64_t owner; + uint64_t offset; + uint64_t new_endoff; + unsigned int oldext; + unsigned int newext; + unsigned int flags = 0; + int i; + int state = 0; + int error; + + xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); + ASSERT(!(XFS_RMAP_NON_INODE_OWNER(owner) || + (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); + oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; + new_endoff = offset + len; + trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); + + /* + * For the initial lookup, look for an exact match or the left-adjacent + * record for our insertion point. This will also give us the record for + * start block contiguity tests. + */ + error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + + error = xfs_rmap_get_rec(cur, &PREV, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, + cur->bc_private.a.agno, PREV.rm_startblock, + PREV.rm_blockcount, PREV.rm_owner, + PREV.rm_offset, PREV.rm_flags); + + ASSERT(PREV.rm_offset <= offset); + ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff); + ASSERT((PREV.rm_flags & XFS_RMAP_UNWRITTEN) == oldext); + newext = ~oldext & XFS_RMAP_UNWRITTEN; + + /* + * Set flags determining what part of the previous oldext allocation + * extent is being replaced by a newext allocation. + */ + if (PREV.rm_offset == offset) + state |= RMAP_LEFT_FILLING; + if (PREV.rm_offset + PREV.rm_blockcount == new_endoff) + state |= RMAP_RIGHT_FILLING; + + /* + * Decrement the cursor to see if we have a left-adjacent record to our + * insertion point. This will give us the record for end block + * contiguity tests. + */ + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto done; + if (i) { + state |= RMAP_LEFT_VALID; + error = xfs_rmap_get_rec(cur, &LEFT, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, + LEFT.rm_startblock + LEFT.rm_blockcount <= bno, + done); + trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, + cur->bc_private.a.agno, LEFT.rm_startblock, + LEFT.rm_blockcount, LEFT.rm_owner, + LEFT.rm_offset, LEFT.rm_flags); + if (LEFT.rm_startblock + LEFT.rm_blockcount == bno && + LEFT.rm_offset + LEFT.rm_blockcount == offset && + xfs_rmap_is_mergeable(&LEFT, owner, newext)) + state |= RMAP_LEFT_CONTIG; + } + + /* + * Increment the cursor to see if we have a right-adjacent record to our + * insertion point. This will give us the record for end block + * contiguity tests. + */ + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto done; + if (i) { + state |= RMAP_RIGHT_VALID; + error = xfs_rmap_get_rec(cur, &RIGHT, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock, + done); + trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, + cur->bc_private.a.agno, RIGHT.rm_startblock, + RIGHT.rm_blockcount, RIGHT.rm_owner, + RIGHT.rm_offset, RIGHT.rm_flags); + if (bno + len == RIGHT.rm_startblock && + offset + len == RIGHT.rm_offset && + xfs_rmap_is_mergeable(&RIGHT, owner, newext)) + state |= RMAP_RIGHT_CONTIG; + } + + /* check that left + prev + right is not too long */ + if ((state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) == + (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG) && + (unsigned long)LEFT.rm_blockcount + len + + RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) + state &= ~RMAP_RIGHT_CONTIG; + + trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state, + _RET_IP_); + + /* reset the cursor back to PREV */ + error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + + /* + * Switch out based on the FILLING and CONTIG state bits. + */ + switch (state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) { + case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The left and right neighbors are both contiguous with new. + */ + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + RIGHT.rm_startblock, RIGHT.rm_blockcount, + RIGHT.rm_owner, RIGHT.rm_offset, + RIGHT.rm_flags); + error = xfs_btree_delete(cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + PREV.rm_startblock, PREV.rm_blockcount, + PREV.rm_owner, PREV.rm_offset, + PREV.rm_flags); + error = xfs_btree_delete(cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW = LEFT; + NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The left neighbor is contiguous, the right is not. + */ + trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + PREV.rm_startblock, PREV.rm_blockcount, + PREV.rm_owner, PREV.rm_offset, + PREV.rm_flags); + error = xfs_btree_delete(cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW = LEFT; + NEW.rm_blockcount += PREV.rm_blockcount; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The right neighbor is contiguous, the left is not. + */ + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + RIGHT.rm_startblock, RIGHT.rm_blockcount, + RIGHT.rm_owner, RIGHT.rm_offset, + RIGHT.rm_flags); + error = xfs_btree_delete(cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW = PREV; + NEW.rm_blockcount = len + RIGHT.rm_blockcount; + NEW.rm_flags = newext; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING: + /* + * Setting all of a previous oldext extent to newext. + * Neither the left nor right neighbors are contiguous with + * the new one. + */ + NEW = PREV; + NEW.rm_flags = newext; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG: + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is contiguous. + */ + NEW = PREV; + NEW.rm_offset += len; + NEW.rm_startblock += len; + NEW.rm_blockcount -= len; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto done; + NEW = LEFT; + NEW.rm_blockcount += len; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING: + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is not contiguous. + */ + NEW = PREV; + NEW.rm_startblock += len; + NEW.rm_offset += len; + NEW.rm_blockcount -= len; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + NEW.rm_startblock = bno; + NEW.rm_owner = owner; + NEW.rm_offset = offset; + NEW.rm_blockcount = len; + NEW.rm_flags = newext; + cur->bc_rec.r = NEW; + trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, + len, owner, offset, newext); + error = xfs_btree_insert(cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + break; + + case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is contiguous with the new allocation. + */ + NEW = PREV; + NEW.rm_blockcount -= len; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto done; + NEW = RIGHT; + NEW.rm_offset = offset; + NEW.rm_startblock = bno; + NEW.rm_blockcount += len; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_RIGHT_FILLING: + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is not contiguous. + */ + NEW = PREV; + NEW.rm_blockcount -= len; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + error = xfs_rmap_lookup_eq(cur, bno, len, owner, offset, + oldext, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + NEW.rm_startblock = bno; + NEW.rm_owner = owner; + NEW.rm_offset = offset; + NEW.rm_blockcount = len; + NEW.rm_flags = newext; + cur->bc_rec.r = NEW; + trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, + len, owner, offset, newext); + error = xfs_btree_insert(cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + break; + + case 0: + /* + * Setting the middle part of a previous oldext extent to + * newext. Contiguity is impossible here. + * One extent becomes three extents. + */ + /* new right extent - oldext */ + NEW.rm_startblock = bno + len; + NEW.rm_owner = owner; + NEW.rm_offset = new_endoff; + NEW.rm_blockcount = PREV.rm_offset + PREV.rm_blockcount - + new_endoff; + NEW.rm_flags = PREV.rm_flags; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + /* new left extent - oldext */ + NEW = PREV; + NEW.rm_blockcount = offset - PREV.rm_offset; + cur->bc_rec.r = NEW; + trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, + NEW.rm_startblock, NEW.rm_blockcount, + NEW.rm_owner, NEW.rm_offset, + NEW.rm_flags); + error = xfs_btree_insert(cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + /* + * Reset the cursor to the position of the new extent + * we are about to insert as we can't trust it after + * the previous insert. + */ + error = xfs_rmap_lookup_eq(cur, bno, len, owner, offset, + oldext, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + /* new middle extent - newext */ + cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN; + cur->bc_rec.r.rm_flags |= newext; + trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len, + owner, offset, newext); + error = xfs_btree_insert(cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + break; + + case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: + case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: + case RMAP_LEFT_FILLING | RMAP_RIGHT_CONTIG: + case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG: + case RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: + case RMAP_LEFT_CONTIG: + case RMAP_RIGHT_CONTIG: + /* + * These cases are all impossible. + */ + ASSERT(0); + } + + trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); +done: + if (error) + trace_xfs_rmap_convert_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +#undef NEW +#undef LEFT +#undef RIGHT +#undef PREV + +struct xfs_rmap_query_range_info { + xfs_rmap_query_range_fn fn; + void *priv; +}; + +/* Format btree record and pass to our callback. */ +STATIC int +xfs_rmap_query_range_helper( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_rmap_query_range_info *query = priv; + struct xfs_rmap_irec irec; + int error; + + error = xfs_rmap_btrec_to_irec(rec, &irec); + if (error) + return error; + return query->fn(cur, &irec, query->priv); +} + +/* Find all rmaps between two keys. */ +int +xfs_rmap_query_range( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *low_rec, + struct xfs_rmap_irec *high_rec, + xfs_rmap_query_range_fn fn, + void *priv) +{ + union xfs_btree_irec low_brec; + union xfs_btree_irec high_brec; + struct xfs_rmap_query_range_info query; + + low_brec.r = *low_rec; + high_brec.r = *high_rec; + query.priv = priv; + query.fn = fn; + return xfs_btree_query_range(cur, &low_brec, &high_brec, + xfs_rmap_query_range_helper, &query); +} + +/* Clean up after calling xfs_rmap_finish_one. */ +void +xfs_rmap_finish_one_cleanup( + struct xfs_trans *tp, + struct xfs_btree_cur *rcur, + int error) +{ + struct xfs_buf *agbp; + + if (rcur == NULL) + return; + agbp = rcur->bc_private.a.agbp; + xfs_btree_del_cursor(rcur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + if (error) + xfs_trans_brelse(tp, agbp); +} + +/* + * Process one of the deferred rmap operations. We pass back the + * btree cursor to maintain our lock on the rmapbt between calls. + * This saves time and eliminates a buffer deadlock between the + * superblock and the AGF because we'll always grab them in the same + * order. + */ +int +xfs_rmap_finish_one( + struct xfs_trans *tp, + enum xfs_rmap_intent_type type, + __uint64_t owner, + int whichfork, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state, + struct xfs_btree_cur **pcur) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_btree_cur *rcur; + struct xfs_buf *agbp = NULL; + int error = 0; + xfs_agnumber_t agno; + struct xfs_owner_info oinfo; + xfs_agblock_t bno; + bool unwritten; + + agno = XFS_FSB_TO_AGNO(mp, startblock); + ASSERT(agno != NULLAGNUMBER); + bno = XFS_FSB_TO_AGBNO(mp, startblock); + + trace_xfs_rmap_deferred(mp, agno, type, bno, owner, whichfork, + startoff, blockcount, state); + + if (XFS_TEST_ERROR(false, mp, + XFS_ERRTAG_RMAP_FINISH_ONE, + XFS_RANDOM_RMAP_FINISH_ONE)) + return -EIO; + + /* + * If we haven't gotten a cursor or the cursor AG doesn't match + * the startblock, get one now. + */ + rcur = *pcur; + if (rcur != NULL && rcur->bc_private.a.agno != agno) { + xfs_rmap_finish_one_cleanup(tp, rcur, 0); + rcur = NULL; + *pcur = NULL; + } + if (rcur == NULL) { + /* + * Refresh the freelist before we start changing the + * rmapbt, because a shape change could cause us to + * allocate blocks. + */ + error = xfs_free_extent_fix_freelist(tp, agno, &agbp); + if (error) + return error; + if (!agbp) + return -EFSCORRUPTED; + + rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno); + if (!rcur) { + error = -ENOMEM; + goto out_cur; + } + } + *pcur = rcur; + + xfs_rmap_ino_owner(&oinfo, owner, whichfork, startoff); + unwritten = state == XFS_EXT_UNWRITTEN; + bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, startblock); + + switch (type) { + case XFS_RMAP_ALLOC: + case XFS_RMAP_MAP: + error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo); + break; + case XFS_RMAP_FREE: + case XFS_RMAP_UNMAP: + error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten, + &oinfo); + break; + case XFS_RMAP_CONVERT: + error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten, + &oinfo); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + } + return error; + +out_cur: + xfs_trans_brelse(tp, agbp); + + return error; +} + +/* + * Don't defer an rmap if we aren't an rmap filesystem. + */ +static bool +xfs_rmap_update_is_needed( + struct xfs_mount *mp) +{ + return xfs_sb_version_hasrmapbt(&mp->m_sb); +} + +/* + * Record a rmap intent; the list is kept sorted first by AG and then by + * increasing age. + */ +static int +__xfs_rmap_add( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + enum xfs_rmap_intent_type type, + __uint64_t owner, + int whichfork, + struct xfs_bmbt_irec *bmap) +{ + struct xfs_rmap_intent *ri; + + trace_xfs_rmap_defer(mp, XFS_FSB_TO_AGNO(mp, bmap->br_startblock), + type, + XFS_FSB_TO_AGBNO(mp, bmap->br_startblock), + owner, whichfork, + bmap->br_startoff, + bmap->br_blockcount, + bmap->br_state); + + ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_SLEEP | KM_NOFS); + INIT_LIST_HEAD(&ri->ri_list); + ri->ri_type = type; + ri->ri_owner = owner; + ri->ri_whichfork = whichfork; + ri->ri_bmap = *bmap; + + xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list); + return 0; +} + +/* Map an extent into a file. */ +int +xfs_rmap_map_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_rmap_update_is_needed(mp)) + return 0; + + return __xfs_rmap_add(mp, dfops, XFS_RMAP_MAP, ip->i_ino, + whichfork, PREV); +} + +/* Unmap an extent out of a file. */ +int +xfs_rmap_unmap_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_rmap_update_is_needed(mp)) + return 0; + + return __xfs_rmap_add(mp, dfops, XFS_RMAP_UNMAP, ip->i_ino, + whichfork, PREV); +} + +/* Convert a data fork extent from unwritten to real or vice versa. */ +int +xfs_rmap_convert_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_rmap_update_is_needed(mp)) + return 0; + + return __xfs_rmap_add(mp, dfops, XFS_RMAP_CONVERT, ip->i_ino, + whichfork, PREV); +} + +/* Schedule the creation of an rmap for non-file data. */ +int +xfs_rmap_alloc_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + __uint64_t owner) +{ + struct xfs_bmbt_irec bmap; + + if (!xfs_rmap_update_is_needed(mp)) + return 0; + + bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno); + bmap.br_blockcount = len; + bmap.br_startoff = 0; + bmap.br_state = XFS_EXT_NORM; + + return __xfs_rmap_add(mp, dfops, XFS_RMAP_ALLOC, owner, + XFS_DATA_FORK, &bmap); +} + +/* Schedule the deletion of an rmap for non-file data. */ +int +xfs_rmap_free_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + __uint64_t owner) +{ + struct xfs_bmbt_irec bmap; + + if (!xfs_rmap_update_is_needed(mp)) + return 0; + + bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno); + bmap.br_blockcount = len; + bmap.br_startoff = 0; + bmap.br_state = XFS_EXT_NORM; + + return __xfs_rmap_add(mp, dfops, XFS_RMAP_FREE, owner, + XFS_DATA_FORK, &bmap); +} diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h new file mode 100644 index 000000000..71cf99a4a --- /dev/null +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -0,0 +1,209 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_RMAP_H__ +#define __XFS_RMAP_H__ + +static inline void +xfs_rmap_ag_owner( + struct xfs_owner_info *oi, + uint64_t owner) +{ + oi->oi_owner = owner; + oi->oi_offset = 0; + oi->oi_flags = 0; +} + +static inline void +xfs_rmap_ino_bmbt_owner( + struct xfs_owner_info *oi, + xfs_ino_t ino, + int whichfork) +{ + oi->oi_owner = ino; + oi->oi_offset = 0; + oi->oi_flags = XFS_OWNER_INFO_BMBT_BLOCK; + if (whichfork == XFS_ATTR_FORK) + oi->oi_flags |= XFS_OWNER_INFO_ATTR_FORK; +} + +static inline void +xfs_rmap_ino_owner( + struct xfs_owner_info *oi, + xfs_ino_t ino, + int whichfork, + xfs_fileoff_t offset) +{ + oi->oi_owner = ino; + oi->oi_offset = offset; + oi->oi_flags = 0; + if (whichfork == XFS_ATTR_FORK) + oi->oi_flags |= XFS_OWNER_INFO_ATTR_FORK; +} + +static inline void +xfs_rmap_skip_owner_update( + struct xfs_owner_info *oi) +{ + oi->oi_owner = XFS_RMAP_OWN_UNKNOWN; +} + +/* Reverse mapping functions. */ + +struct xfs_buf; + +static inline __u64 +xfs_rmap_irec_offset_pack( + const struct xfs_rmap_irec *irec) +{ + __u64 x; + + x = XFS_RMAP_OFF(irec->rm_offset); + if (irec->rm_flags & XFS_RMAP_ATTR_FORK) + x |= XFS_RMAP_OFF_ATTR_FORK; + if (irec->rm_flags & XFS_RMAP_BMBT_BLOCK) + x |= XFS_RMAP_OFF_BMBT_BLOCK; + if (irec->rm_flags & XFS_RMAP_UNWRITTEN) + x |= XFS_RMAP_OFF_UNWRITTEN; + return x; +} + +static inline int +xfs_rmap_irec_offset_unpack( + __u64 offset, + struct xfs_rmap_irec *irec) +{ + if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS)) + return -EFSCORRUPTED; + irec->rm_offset = XFS_RMAP_OFF(offset); + if (offset & XFS_RMAP_OFF_ATTR_FORK) + irec->rm_flags |= XFS_RMAP_ATTR_FORK; + if (offset & XFS_RMAP_OFF_BMBT_BLOCK) + irec->rm_flags |= XFS_RMAP_BMBT_BLOCK; + if (offset & XFS_RMAP_OFF_UNWRITTEN) + irec->rm_flags |= XFS_RMAP_UNWRITTEN; + return 0; +} + +static inline void +xfs_owner_info_unpack( + struct xfs_owner_info *oinfo, + uint64_t *owner, + uint64_t *offset, + unsigned int *flags) +{ + unsigned int r = 0; + + *owner = oinfo->oi_owner; + *offset = oinfo->oi_offset; + if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) + r |= XFS_RMAP_ATTR_FORK; + if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) + r |= XFS_RMAP_BMBT_BLOCK; + *flags = r; +} + +static inline void +xfs_owner_info_pack( + struct xfs_owner_info *oinfo, + uint64_t owner, + uint64_t offset, + unsigned int flags) +{ + oinfo->oi_owner = owner; + oinfo->oi_offset = XFS_RMAP_OFF(offset); + oinfo->oi_flags = 0; + if (flags & XFS_RMAP_ATTR_FORK) + oinfo->oi_flags |= XFS_OWNER_INFO_ATTR_FORK; + if (flags & XFS_RMAP_BMBT_BLOCK) + oinfo->oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; +} + +int xfs_rmap_alloc(struct xfs_trans *tp, struct xfs_buf *agbp, + xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, + struct xfs_owner_info *oinfo); +int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp, + xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, + struct xfs_owner_info *oinfo); + +int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, + xfs_extlen_t len, uint64_t owner, uint64_t offset, + unsigned int flags, int *stat); +int xfs_rmap_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno, + xfs_extlen_t len, uint64_t owner, uint64_t offset, + unsigned int flags, int *stat); +int xfs_rmap_insert(struct xfs_btree_cur *rcur, xfs_agblock_t agbno, + xfs_extlen_t len, uint64_t owner, uint64_t offset, + unsigned int flags); +int xfs_rmap_get_rec(struct xfs_btree_cur *cur, struct xfs_rmap_irec *irec, + int *stat); + +typedef int (*xfs_rmap_query_range_fn)( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv); + +int xfs_rmap_query_range(struct xfs_btree_cur *cur, + struct xfs_rmap_irec *low_rec, struct xfs_rmap_irec *high_rec, + xfs_rmap_query_range_fn fn, void *priv); + +enum xfs_rmap_intent_type { + XFS_RMAP_MAP, + XFS_RMAP_MAP_SHARED, + XFS_RMAP_UNMAP, + XFS_RMAP_UNMAP_SHARED, + XFS_RMAP_CONVERT, + XFS_RMAP_CONVERT_SHARED, + XFS_RMAP_ALLOC, + XFS_RMAP_FREE, +}; + +struct xfs_rmap_intent { + struct list_head ri_list; + enum xfs_rmap_intent_type ri_type; + __uint64_t ri_owner; + int ri_whichfork; + struct xfs_bmbt_irec ri_bmap; +}; + +/* functions for updating the rmapbt based on bmbt map/unmap operations */ +int xfs_rmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, int whichfork, + struct xfs_bmbt_irec *imap); +int xfs_rmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, int whichfork, + struct xfs_bmbt_irec *imap); +int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, int whichfork, + struct xfs_bmbt_irec *imap); +int xfs_rmap_alloc_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, + __uint64_t owner); +int xfs_rmap_free_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, + __uint64_t owner); + +void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, + struct xfs_btree_cur *rcur, int error); +int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type, + __uint64_t owner, int whichfork, xfs_fileoff_t startoff, + xfs_fsblock_t startblock, xfs_filblks_t blockcount, + xfs_exntst_t state, struct xfs_btree_cur **pcur); + +#endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c new file mode 100644 index 000000000..17b8eeb34 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -0,0 +1,517 @@ +/* + * Copyright (c) 2014 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_error.h" +#include "xfs_extent_busy.h" + +/* + * Reverse map btree. + * + * This is a per-ag tree used to track the owner(s) of a given extent. With + * reflink it is possible for there to be multiple owners, which is a departure + * from classic XFS. Owner records for data extents are inserted when the + * extent is mapped and removed when an extent is unmapped. Owner records for + * all other block types (i.e. metadata) are inserted when an extent is + * allocated and removed when an extent is freed. There can only be one owner + * of a metadata extent, usually an inode or some other metadata structure like + * an AG btree. + * + * The rmap btree is part of the free space management, so blocks for the tree + * are sourced from the agfl. Hence we need transaction reservation support for + * this tree so that the freelist is always large enough. This also impacts on + * the minimum space we need to leave free in the AG. + * + * The tree is ordered by [ag block, owner, offset]. This is a large key size, + * but it is the only way to enforce unique keys when a block can be owned by + * multiple files at any offset. There's no need to order/search by extent + * size for online updating/management of the tree. It is intended that most + * reverse lookups will be to find the owner(s) of a particular block, or to + * try to recover tree and file data from corrupt primary metadata. + */ + +static struct xfs_btree_cur * +xfs_rmapbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agbp, cur->bc_private.a.agno); +} + +STATIC void +xfs_rmapbt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int inc) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + int btnum = cur->bc_btnum; + struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); + + ASSERT(ptr->s != 0); + + agf->agf_roots[btnum] = ptr->s; + be32_add_cpu(&agf->agf_levels[btnum], inc); + pag->pagf_levels[btnum] += inc; + xfs_perag_put(pag); + + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); +} + +STATIC int +xfs_rmapbt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + int error; + xfs_agblock_t bno; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + /* Allocate the new block from the freelist. If we can't, give up. */ + error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + &bno, 1); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + + trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, + bno, 1); + if (bno == NULLAGBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, + false); + + xfs_trans_agbtree_delta(cur->bc_tp, 1); + new->s = cpu_to_be32(bno); + be32_add_cpu(&agf->agf_rmap_blocks, 1); + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +} + +STATIC int +xfs_rmapbt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agblock_t bno; + int error; + + bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); + trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno, + bno, 1); + be32_add_cpu(&agf->agf_rmap_blocks, -1); + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); + error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); + if (error) + return error; + + xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, + XFS_EXTENT_BUSY_SKIP_DISCARD); + xfs_trans_agbtree_delta(cur->bc_tp, -1); + + return 0; +} + +STATIC int +xfs_rmapbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_rmap_mnr[level != 0]; +} + +STATIC int +xfs_rmapbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_rmap_mxr[level != 0]; +} + +STATIC void +xfs_rmapbt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->rmap.rm_startblock = rec->rmap.rm_startblock; + key->rmap.rm_owner = rec->rmap.rm_owner; + key->rmap.rm_offset = rec->rmap.rm_offset; +} + +/* + * The high key for a reverse mapping record can be computed by shifting + * the startblock and offset to the highest value that would still map + * to that record. In practice this means that we add blockcount-1 to + * the startblock for all records, and if the record is for a data/attr + * fork mapping, we add blockcount-1 to the offset too. + */ +STATIC void +xfs_rmapbt_init_high_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + __uint64_t off; + int adj; + + adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1; + + key->rmap.rm_startblock = rec->rmap.rm_startblock; + be32_add_cpu(&key->rmap.rm_startblock, adj); + key->rmap.rm_owner = rec->rmap.rm_owner; + key->rmap.rm_offset = rec->rmap.rm_offset; + if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) || + XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset))) + return; + off = be64_to_cpu(key->rmap.rm_offset); + off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK); + key->rmap.rm_offset = cpu_to_be64(off); +} + +STATIC void +xfs_rmapbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock); + rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount); + rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner); + rec->rmap.rm_offset = cpu_to_be64( + xfs_rmap_irec_offset_pack(&cur->bc_rec.r)); +} + +STATIC void +xfs_rmapbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(agf->agf_roots[cur->bc_btnum] != 0); + + ptr->s = agf->agf_roots[cur->bc_btnum]; +} + +STATIC __int64_t +xfs_rmapbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + struct xfs_rmap_irec *rec = &cur->bc_rec.r; + struct xfs_rmap_key *kp = &key->rmap; + __u64 x, y; + __int64_t d; + + d = (__int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; + if (d) + return d; + + x = be64_to_cpu(kp->rm_owner); + y = rec->rm_owner; + if (x > y) + return 1; + else if (y > x) + return -1; + + x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset)); + y = rec->rm_offset; + if (x > y) + return 1; + else if (y > x) + return -1; + return 0; +} + +STATIC __int64_t +xfs_rmapbt_diff_two_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + struct xfs_rmap_key *kp1 = &k1->rmap; + struct xfs_rmap_key *kp2 = &k2->rmap; + __int64_t d; + __u64 x, y; + + d = (__int64_t)be32_to_cpu(kp1->rm_startblock) - + be32_to_cpu(kp2->rm_startblock); + if (d) + return d; + + x = be64_to_cpu(kp1->rm_owner); + y = be64_to_cpu(kp2->rm_owner); + if (x > y) + return 1; + else if (y > x) + return -1; + + x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset)); + y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset)); + if (x > y) + return 1; + else if (y > x) + return -1; + return 0; +} + +static bool +xfs_rmapbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + unsigned int level; + + /* + * magic number and level verification + * + * During growfs operations, we can't verify the exact level or owner as + * the perag is not fully initialised and hence not attached to the + * buffer. In this case, check against the maximum tree depth. + * + * Similarly, during log recovery we will have a perag structure + * attached, but the agf information will not yet have been initialised + * from the on disk AGF. Again, we can only check against maximum limits + * in this case. + */ + if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC)) + return false; + + if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + return false; + if (!xfs_btree_sblock_v5hdr_verify(bp)) + return false; + + level = be16_to_cpu(block->bb_level); + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi]) + return false; + } else if (level >= mp->m_rmap_maxlevels) + return false; + + return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]); +} + +static void +xfs_rmapbt_read_verify( + struct xfs_buf *bp) +{ + if (!xfs_btree_sblock_verify_crc(bp)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_rmapbt_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); + } +} + +static void +xfs_rmapbt_write_verify( + struct xfs_buf *bp) +{ + if (!xfs_rmapbt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + xfs_btree_sblock_calc_crc(bp); + +} + +const struct xfs_buf_ops xfs_rmapbt_buf_ops = { + .name = "xfs_rmapbt", + .verify_read = xfs_rmapbt_read_verify, + .verify_write = xfs_rmapbt_write_verify, +}; + +#if defined(DEBUG) || defined(XFS_WARN) +STATIC int +xfs_rmapbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + __uint32_t x; + __uint32_t y; + __uint64_t a; + __uint64_t b; + + x = be32_to_cpu(k1->rmap.rm_startblock); + y = be32_to_cpu(k2->rmap.rm_startblock); + if (x < y) + return 1; + else if (x > y) + return 0; + a = be64_to_cpu(k1->rmap.rm_owner); + b = be64_to_cpu(k2->rmap.rm_owner); + if (a < b) + return 1; + else if (a > b) + return 0; + a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset)); + b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset)); + if (a <= b) + return 1; + return 0; +} + +STATIC int +xfs_rmapbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + __uint32_t x; + __uint32_t y; + __uint64_t a; + __uint64_t b; + + x = be32_to_cpu(r1->rmap.rm_startblock); + y = be32_to_cpu(r2->rmap.rm_startblock); + if (x < y) + return 1; + else if (x > y) + return 0; + a = be64_to_cpu(r1->rmap.rm_owner); + b = be64_to_cpu(r2->rmap.rm_owner); + if (a < b) + return 1; + else if (a > b) + return 0; + a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset)); + b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset)); + if (a <= b) + return 1; + return 0; +} +#endif /* DEBUG */ + +static const struct xfs_btree_ops xfs_rmapbt_ops = { + .rec_len = sizeof(struct xfs_rmap_rec), + .key_len = 2 * sizeof(struct xfs_rmap_key), + + .dup_cursor = xfs_rmapbt_dup_cursor, + .set_root = xfs_rmapbt_set_root, + .alloc_block = xfs_rmapbt_alloc_block, + .free_block = xfs_rmapbt_free_block, + .get_minrecs = xfs_rmapbt_get_minrecs, + .get_maxrecs = xfs_rmapbt_get_maxrecs, + .init_key_from_rec = xfs_rmapbt_init_key_from_rec, + .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur, + .key_diff = xfs_rmapbt_key_diff, + .buf_ops = &xfs_rmapbt_buf_ops, + .diff_two_keys = xfs_rmapbt_diff_two_keys, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_rmapbt_keys_inorder, + .recs_inorder = xfs_rmapbt_recs_inorder, +#endif +}; + +/* + * Allocate a new allocation btree cursor. + */ +struct xfs_btree_cur * +xfs_rmapbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_btree_cur *cur; + + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); + cur->bc_tp = tp; + cur->bc_mp = mp; + /* Overlapping btree; 2 keys per pointer. */ + cur->bc_btnum = XFS_BTNUM_RMAP; + cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_ops = &xfs_rmapbt_ops; + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); + + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + + return cur; +} + +/* + * Calculate number of records in an rmap btree block. + */ +int +xfs_rmapbt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) +{ + blocklen -= XFS_RMAP_BLOCK_LEN; + + if (leaf) + return blocklen / sizeof(struct xfs_rmap_rec); + return blocklen / + (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t)); +} + +/* Compute the maximum height of an rmap btree. */ +void +xfs_rmapbt_compute_maxlevels( + struct xfs_mount *mp) +{ + mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp, + mp->m_rmap_mnr, mp->m_sb.sb_agblocks); +} diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h new file mode 100644 index 000000000..e73a55357 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2014 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_RMAP_BTREE_H__ +#define __XFS_RMAP_BTREE_H__ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; + +/* rmaps only exist on crc enabled filesystems */ +#define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN + +/* + * Record, key, and pointer address macros for btree blocks. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define XFS_RMAP_REC_ADDR(block, index) \ + ((struct xfs_rmap_rec *) \ + ((char *)(block) + XFS_RMAP_BLOCK_LEN + \ + (((index) - 1) * sizeof(struct xfs_rmap_rec)))) + +#define XFS_RMAP_KEY_ADDR(block, index) \ + ((struct xfs_rmap_key *) \ + ((char *)(block) + XFS_RMAP_BLOCK_LEN + \ + ((index) - 1) * 2 * sizeof(struct xfs_rmap_key))) + +#define XFS_RMAP_HIGH_KEY_ADDR(block, index) \ + ((struct xfs_rmap_key *) \ + ((char *)(block) + XFS_RMAP_BLOCK_LEN + \ + sizeof(struct xfs_rmap_key) + \ + ((index) - 1) * 2 * sizeof(struct xfs_rmap_key))) + +#define XFS_RMAP_PTR_ADDR(block, index, maxrecs) \ + ((xfs_rmap_ptr_t *) \ + ((char *)(block) + XFS_RMAP_BLOCK_LEN + \ + (maxrecs) * 2 * sizeof(struct xfs_rmap_key) + \ + ((index) - 1) * sizeof(xfs_rmap_ptr_t))) + +struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, + struct xfs_trans *tp, struct xfs_buf *bp, + xfs_agnumber_t agno); +int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf); +extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); + +#endif /* __XFS_RMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 951c044e2..e2e1106c9 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -70,7 +70,7 @@ const struct xfs_buf_ops xfs_rtbuf_ops = { * Get a buffer for the bitmap or summary file block specified. * The buffer is returned read and locked. */ -int +static int xfs_rtbuf_get( xfs_mount_t *mp, /* file system mount structure */ xfs_trans_t *tp, /* transaction pointer */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 85bdf3de2..4aecc5fef 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -24,6 +24,7 @@ #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" @@ -36,6 +37,7 @@ #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" #include "xfs_log.h" +#include "xfs_rmap_btree.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. @@ -730,6 +732,11 @@ xfs_sb_mount_common( mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; + mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2; + mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2; + mp->m_bsize = XFS_FSB_TO_BB(mp, 1); mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, sbp->sb_inopblock); @@ -739,6 +746,8 @@ xfs_sb_mount_common( mp->m_ialloc_min_blks = sbp->sb_spino_align; else mp->m_ialloc_min_blks = mp->m_ialloc_blks; + mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); + mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp); } /* diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 16002b5ec..0c5b30bd8 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -38,6 +38,7 @@ extern const struct xfs_buf_ops xfs_agi_buf_ops; extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agfl_buf_ops; extern const struct xfs_buf_ops xfs_allocbt_buf_ops; +extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; extern const struct xfs_buf_ops xfs_bmbt_buf_ops; @@ -116,6 +117,7 @@ int xfs_log_calc_minimum_size(struct xfs_mount *); #define XFS_INO_BTREE_REF 3 #define XFS_ALLOC_BTREE_REF 2 #define XFS_BMAP_BTREE_REF 2 +#define XFS_RMAP_BTREE_REF 2 #define XFS_DIR_BTREE_REF 2 #define XFS_INO_REF 2 #define XFS_ATTR_BTREE_REF 1 diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 68cb1e7bf..301ef2f4d 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -63,6 +63,30 @@ xfs_calc_buf_res( return nbufs * (size + xfs_buf_log_overhead()); } +/* + * Per-extent log reservation for the btree changes involved in freeing or + * allocating an extent. In classic XFS there were two trees that will be + * modified (bnobt + cntbt). With rmap enabled, there are three trees + * (rmapbt). The number of blocks reserved is based on the formula: + * + * num trees * ((2 blocks/level * max depth) - 1) + * + * Keep in mind that max depth is calculated separately for each type of tree. + */ +static uint +xfs_allocfree_log_count( + struct xfs_mount *mp, + uint num_ops) +{ + uint blocks; + + blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1); + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1); + + return blocks; +} + /* * Logging inodes is really tricksy. They are logged in memory format, * which means that what we write into the log doesn't directly translate into @@ -126,7 +150,7 @@ xfs_calc_inode_res( */ STATIC uint xfs_calc_finobt_res( - struct xfs_mount *mp, + struct xfs_mount *mp, int alloc, int modify) { @@ -137,7 +161,7 @@ xfs_calc_finobt_res( res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)); if (alloc) - res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + res += xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), XFS_FSB_TO_B(mp, 1)); if (modify) res += (uint)XFS_FSB_TO_B(mp, 1); @@ -153,9 +177,9 @@ xfs_calc_finobt_res( * item logged to try to account for the overhead of the transaction mechanism. * * Note: Most of the reservations underestimate the number of allocation - * groups into which they could free extents in the xfs_bmap_finish() call. + * groups into which they could free extents in the xfs_defer_finish() call. * This is because the number in the worst case is quite high and quite - * unusual. In order to fix this we need to change xfs_bmap_finish() to free + * unusual. In order to fix this we need to change xfs_defer_finish() to free * extents in only a single AG at a time. This will require changes to the * EFI code as well, however, so that the EFI for the extents not freed is * logged again in each transaction. See SGI PV #261917. @@ -188,10 +212,10 @@ xfs_calc_write_reservation( xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), XFS_FSB_TO_B(mp, 1)))); } @@ -217,10 +241,10 @@ xfs_calc_itruncate_reservation( xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(5, 0) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(2 + mp->m_ialloc_blks + mp->m_in_maxlevels, 0))); @@ -247,7 +271,7 @@ xfs_calc_rename_reservation( xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 3), XFS_FSB_TO_B(mp, 1)))); } @@ -286,7 +310,7 @@ xfs_calc_link_reservation( xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), XFS_FSB_TO_B(mp, 1)))); } @@ -324,7 +348,7 @@ xfs_calc_remove_reservation( xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), XFS_FSB_TO_B(mp, 1)))); } @@ -371,7 +395,7 @@ xfs_calc_create_resv_alloc( mp->m_sb.sb_sectsize + xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -399,7 +423,7 @@ xfs_calc_icreate_resv_alloc( return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + mp->m_sb.sb_sectsize + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), XFS_FSB_TO_B(mp, 1)) + xfs_calc_finobt_res(mp, 0, 0); } @@ -483,7 +507,7 @@ xfs_calc_ifree_reservation( xfs_calc_buf_res(1, 0) + xfs_calc_buf_res(2 + mp->m_ialloc_blks + mp->m_in_maxlevels, 0) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), XFS_FSB_TO_B(mp, 1)) + xfs_calc_finobt_res(mp, 0, 1); } @@ -513,7 +537,7 @@ xfs_calc_growdata_reservation( struct xfs_mount *mp) { return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -535,7 +559,7 @@ xfs_calc_growrtalloc_reservation( xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), XFS_FSB_TO_B(mp, 1)) + xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -611,7 +635,7 @@ xfs_calc_addafork_reservation( xfs_calc_buf_res(1, mp->m_dir_geo->blksize) + xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -634,7 +658,7 @@ xfs_calc_attrinval_reservation( xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), XFS_FSB_TO_B(mp, 1)))); } @@ -701,7 +725,7 @@ xfs_calc_attrrm_reservation( XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)), (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), XFS_FSB_TO_B(mp, 1)))); } diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 797815012..0eb46ed6d 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -67,16 +67,6 @@ struct xfs_trans_resv { /* shorthand way of accessing reservation structure */ #define M_RES(mp) (&(mp)->m_resv) -/* - * Per-extent log reservation for the allocation btree changes - * involved in freeing or allocating an extent. - * 2 trees * (2 blocks/level * max depth - 1) * block size - */ -#define XFS_ALLOCFREE_LOG_RES(mp,nx) \ - ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * (mp)->m_ag_maxlevels - 1))) -#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ - ((nx) * (2 * (2 * (mp)->m_ag_maxlevels - 1))) - /* * Per-directory log reservation for any directory change. * dir blocks: (1 btree block per level + data block + free block) * dblock size diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index b79dc66b2..3d503647f 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -108,8 +108,8 @@ typedef enum { } xfs_lookup_t; typedef enum { - XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi, - XFS_BTNUM_FINOi, XFS_BTNUM_MAX + XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi, + XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_MAX } xfs_btnum_t; struct xfs_name { diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index b15db62b3..a68645abd 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -447,6 +447,7 @@ xfs_submit_ioend( ioend->io_bio->bi_private = ioend; ioend->io_bio->bi_end_io = xfs_end_bio; + bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, wbc_to_write_flags(wbc)); /* * If we are failing the IO now, just mark the ioend with an @@ -460,7 +461,7 @@ xfs_submit_ioend( return status; } - submit_bio(wbc_to_write_cmd(wbc), ioend->io_bio); + submit_bio(ioend->io_bio); return 0; } @@ -518,7 +519,8 @@ xfs_chain_bio( bio_chain(ioend->io_bio, new); bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ - submit_bio(wbc_to_write_cmd(wbc), ioend->io_bio); + bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, wbc_to_write_flags(wbc)); + submit_bio(ioend->io_bio); ioend->io_bio = new; } @@ -1047,6 +1049,20 @@ xfs_vm_releasepage( trace_xfs_releasepage(page->mapping->host, page, 0, 0); + /* + * mm accommodates an old ext3 case where clean pages might not have had + * the dirty bit cleared. Thus, it can send actual dirty pages to + * ->releasepage() via shrink_active_list(). Conversely, + * block_invalidatepage() can send pages that are still marked dirty + * but otherwise have invalidated buffers. + * + * We've historically freed buffers on the latter. Instead, quietly + * filter out all dirty pages to avoid spurious buffer state warnings. + * This can likely be removed once shrink_active_list() is fixed. + */ + if (PageDirty(page)) + return 0; + xfs_count_page_state(page, &delalloc, &unwritten); if (WARN_ON_ONCE(delalloc)) @@ -1150,6 +1166,8 @@ __xfs_get_blocks( ssize_t size; int new = 0; + BUG_ON(create && !direct); + if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1157,22 +1175,14 @@ __xfs_get_blocks( ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); size = bh_result->b_size; - if (!create && direct && offset >= i_size_read(inode)) + if (!create && offset >= i_size_read(inode)) return 0; /* * Direct I/O is usually done on preallocated files, so try getting - * a block mapping without an exclusive lock first. For buffered - * writes we already have the exclusive iolock anyway, so avoiding - * a lock roundtrip here by taking the ilock exclusive from the - * beginning is a useful micro optimization. + * a block mapping without an exclusive lock first. */ - if (create && !direct) { - lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, lockmode); - } else { - lockmode = xfs_ilock_data_map_shared(ip); - } + lockmode = xfs_ilock_data_map_shared(ip); ASSERT(offset <= mp->m_super->s_maxbytes); if (offset + size > mp->m_super->s_maxbytes) @@ -1191,37 +1201,19 @@ __xfs_get_blocks( (imap.br_startblock == HOLESTARTBLOCK || imap.br_startblock == DELAYSTARTBLOCK) || (IS_DAX(inode) && ISUNWRITTEN(&imap)))) { - if (direct || xfs_get_extsz_hint(ip)) { - /* - * xfs_iomap_write_direct() expects the shared lock. It - * is unlocked on return. - */ - if (lockmode == XFS_ILOCK_EXCL) - xfs_ilock_demote(ip, lockmode); - - error = xfs_iomap_write_direct(ip, offset, size, - &imap, nimaps); - if (error) - return error; - new = 1; + /* + * xfs_iomap_write_direct() expects the shared lock. It + * is unlocked on return. + */ + if (lockmode == XFS_ILOCK_EXCL) + xfs_ilock_demote(ip, lockmode); - } else { - /* - * Delalloc reservations do not require a transaction, - * we can go on without dropping the lock here. If we - * are allocating a new delalloc block, make sure that - * we set the new flag so that we mark the buffer new so - * that we know that it is newly allocated if the write - * fails. - */ - if (nimaps && imap.br_startblock == HOLESTARTBLOCK) - new = 1; - error = xfs_iomap_write_delay(ip, offset, size, &imap); - if (error) - goto out_unlock; + error = xfs_iomap_write_direct(ip, offset, size, + &imap, nimaps); + if (error) + return error; + new = 1; - xfs_iunlock(ip, lockmode); - } trace_xfs_get_blocks_alloc(ip, offset, size, ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN : XFS_IO_DELALLOC, &imap); @@ -1242,9 +1234,7 @@ __xfs_get_blocks( } /* trim mapping down to size requested */ - if (direct || size > (1 << inode->i_blkbits)) - xfs_map_trim_size(inode, iblock, bh_result, - &imap, offset, size); + xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size); /* * For unwritten extents do not report a disk address in the buffered @@ -1257,7 +1247,7 @@ __xfs_get_blocks( if (ISUNWRITTEN(&imap)) set_buffer_unwritten(bh_result); /* direct IO needs special help */ - if (create && direct) { + if (create) { if (dax_fault) ASSERT(!ISUNWRITTEN(&imap)); else @@ -1286,14 +1276,7 @@ __xfs_get_blocks( (new || ISUNWRITTEN(&imap)))) set_buffer_new(bh_result); - if (imap.br_startblock == DELAYSTARTBLOCK) { - BUG_ON(direct); - if (create) { - set_buffer_uptodate(bh_result); - set_buffer_mapped(bh_result); - set_buffer_delay(bh_result); - } - } + BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK); return 0; @@ -1343,7 +1326,7 @@ xfs_get_blocks_dax_fault( * whereas if we have flags set we will always be called in task context * (i.e. from a workqueue). */ -STATIC int +int xfs_end_io_direct_write( struct kiocb *iocb, loff_t offset, @@ -1414,234 +1397,10 @@ xfs_vm_direct_IO( struct kiocb *iocb, struct iov_iter *iter) { - struct inode *inode = iocb->ki_filp->f_mapping->host; - dio_iodone_t *endio = NULL; - int flags = 0; - struct block_device *bdev; - - if (iov_iter_rw(iter) == WRITE) { - endio = xfs_end_io_direct_write; - flags = DIO_ASYNC_EXTEND; - } - - if (IS_DAX(inode)) { - return dax_do_io(iocb, inode, iter, - xfs_get_blocks_direct, endio, 0); - } - - bdev = xfs_find_bdev_for_inode(inode); - return __blockdev_direct_IO(iocb, inode, bdev, iter, - xfs_get_blocks_direct, endio, NULL, flags); -} - -/* - * Punch out the delalloc blocks we have already allocated. - * - * Don't bother with xfs_setattr given that nothing can have made it to disk yet - * as the page is still locked at this point. - */ -STATIC void -xfs_vm_kill_delalloc_range( - struct inode *inode, - loff_t start, - loff_t end) -{ - struct xfs_inode *ip = XFS_I(inode); - xfs_fileoff_t start_fsb; - xfs_fileoff_t end_fsb; - int error; - - start_fsb = XFS_B_TO_FSB(ip->i_mount, start); - end_fsb = XFS_B_TO_FSB(ip->i_mount, end); - if (end_fsb <= start_fsb) - return; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_alert(ip->i_mount, - "xfs_vm_write_failed: unable to clean up ino %lld", - ip->i_ino); - } - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); -} - -STATIC void -xfs_vm_write_failed( - struct inode *inode, - struct page *page, - loff_t pos, - unsigned len) -{ - loff_t block_offset; - loff_t block_start; - loff_t block_end; - loff_t from = pos & (PAGE_SIZE - 1); - loff_t to = from + len; - struct buffer_head *bh, *head; - struct xfs_mount *mp = XFS_I(inode)->i_mount; - /* - * The request pos offset might be 32 or 64 bit, this is all fine - * on 64-bit platform. However, for 64-bit pos request on 32-bit - * platform, the high 32-bit will be masked off if we evaluate the - * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is - * 0xfffff000 as an unsigned long, hence the result is incorrect - * which could cause the following ASSERT failed in most cases. - * In order to avoid this, we can evaluate the block_offset of the - * start of the page by using shifts rather than masks the mismatch - * problem. + * We just need the method present so that open/fcntl allow direct I/O. */ - block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT; - - ASSERT(block_offset + from == pos); - - head = page_buffers(page); - block_start = 0; - for (bh = head; bh != head || !block_start; - bh = bh->b_this_page, block_start = block_end, - block_offset += bh->b_size) { - block_end = block_start + bh->b_size; - - /* skip buffers before the write */ - if (block_end <= from) - continue; - - /* if the buffer is after the write, we're done */ - if (block_start >= to) - break; - - /* - * Process delalloc and unwritten buffers beyond EOF. We can - * encounter unwritten buffers in the event that a file has - * post-EOF unwritten extents and an extending write happens to - * fail (e.g., an unaligned write that also involves a delalloc - * to the same page). - */ - if (!buffer_delay(bh) && !buffer_unwritten(bh)) - continue; - - if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) && - block_offset < i_size_read(inode)) - continue; - - if (buffer_delay(bh)) - xfs_vm_kill_delalloc_range(inode, block_offset, - block_offset + bh->b_size); - - /* - * This buffer does not contain data anymore. make sure anyone - * who finds it knows that for certain. - */ - clear_buffer_delay(bh); - clear_buffer_uptodate(bh); - clear_buffer_mapped(bh); - clear_buffer_new(bh); - clear_buffer_dirty(bh); - clear_buffer_unwritten(bh); - } - -} - -/* - * This used to call block_write_begin(), but it unlocks and releases the page - * on error, and we need that page to be able to punch stale delalloc blocks out - * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at - * the appropriate point. - */ -STATIC int -xfs_vm_write_begin( - struct file *file, - struct address_space *mapping, - loff_t pos, - unsigned len, - unsigned flags, - struct page **pagep, - void **fsdata) -{ - pgoff_t index = pos >> PAGE_SHIFT; - struct page *page; - int status; - struct xfs_mount *mp = XFS_I(mapping->host)->i_mount; - - ASSERT(len <= PAGE_SIZE); - - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; - - status = __block_write_begin(page, pos, len, xfs_get_blocks); - if (xfs_mp_fail_writes(mp)) - status = -EIO; - if (unlikely(status)) { - struct inode *inode = mapping->host; - size_t isize = i_size_read(inode); - - xfs_vm_write_failed(inode, page, pos, len); - unlock_page(page); - - /* - * If the write is beyond EOF, we only want to kill blocks - * allocated in this write, not blocks that were previously - * written successfully. - */ - if (xfs_mp_fail_writes(mp)) - isize = 0; - if (pos + len > isize) { - ssize_t start = max_t(ssize_t, pos, isize); - - truncate_pagecache_range(inode, start, pos + len); - } - - put_page(page); - page = NULL; - } - - *pagep = page; - return status; -} - -/* - * On failure, we only need to kill delalloc blocks beyond EOF in the range of - * this specific write because they will never be written. Previous writes - * beyond EOF where block allocation succeeded do not need to be trashed, so - * only new blocks from this write should be trashed. For blocks within - * EOF, generic_write_end() zeros them so they are safe to leave alone and be - * written with all the other valid data. - */ -STATIC int -xfs_vm_write_end( - struct file *file, - struct address_space *mapping, - loff_t pos, - unsigned len, - unsigned copied, - struct page *page, - void *fsdata) -{ - int ret; - - ASSERT(len <= PAGE_SIZE); - - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (unlikely(ret < len)) { - struct inode *inode = mapping->host; - size_t isize = i_size_read(inode); - loff_t to = pos + len; - - if (to > isize) { - /* only kill blocks in this write beyond EOF */ - if (pos > isize) - isize = pos; - xfs_vm_kill_delalloc_range(inode, isize, to); - truncate_pagecache_range(inode, isize, to); - } - } - return ret; + return -EINVAL; } STATIC sector_t @@ -1754,8 +1513,6 @@ const struct address_space_operations xfs_address_space_operations = { .set_page_dirty = xfs_vm_set_page_dirty, .releasepage = xfs_vm_releasepage, .invalidatepage = xfs_vm_invalidatepage, - .write_begin = xfs_vm_write_begin, - .write_end = xfs_vm_write_end, .bmap = xfs_vm_bmap, .direct_IO = xfs_vm_direct_IO, .migratepage = buffer_migrate_page, diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 814aab790..bf2d9a141 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -60,6 +60,9 @@ int xfs_get_blocks_direct(struct inode *inode, sector_t offset, int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, struct buffer_head *map_bh, int create); +int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset, + ssize_t size, void *private); + extern void xfs_count_page_state(struct page *, int *, int *); extern struct block_device *xfs_find_bdev_for_inode(struct inode *); diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 55d214981..be0b79d89 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -322,7 +322,7 @@ xfs_attr3_node_inactive( * Recurse (gasp!) through the attribute nodes until we find leaves. * We're doing a depth-first traversal in order to invalidate everything. */ -int +static int xfs_attr3_root_inactive( struct xfs_trans **trans, struct xfs_inode *dp) diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index d25f26b22..25e76cd6c 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -65,7 +65,7 @@ xfs_attr_shortform_compare(const void *a, const void *b) * we have to calculate each entries' hashvalue and sort them before * we can begin returning them to the user. */ -int +static int xfs_attr_shortform_list(xfs_attr_list_context_t *context) { attrlist_cursor_kern_t *cursor; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 586bb64e6..4ece4f2ff 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -25,6 +25,7 @@ #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_da_format.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_trans.h" @@ -40,6 +41,7 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_log.h" +#include "xfs_rmap_btree.h" /* Kernel only BMAP related definitions and functions */ @@ -79,79 +81,6 @@ xfs_zero_extent( GFP_NOFS, true); } -/* - * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi - * caller. Frees all the extents that need freeing, which must be done - * last due to locking considerations. We never free any extents in - * the first transaction. - * - * If an inode *ip is provided, rejoin it to the transaction if - * the transaction was committed. - */ -int /* error */ -xfs_bmap_finish( - struct xfs_trans **tp, /* transaction pointer addr */ - struct xfs_bmap_free *flist, /* i/o: list extents to free */ - struct xfs_inode *ip) -{ - struct xfs_efd_log_item *efd; /* extent free data */ - struct xfs_efi_log_item *efi; /* extent free intention */ - int error; /* error return value */ - int committed;/* xact committed or not */ - struct xfs_bmap_free_item *free; /* free extent item */ - struct xfs_bmap_free_item *next; /* next item on free list */ - - ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); - if (flist->xbf_count == 0) - return 0; - - efi = xfs_trans_get_efi(*tp, flist->xbf_count); - for (free = flist->xbf_first; free; free = free->xbfi_next) - xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, - free->xbfi_blockcount); - - error = __xfs_trans_roll(tp, ip, &committed); - if (error) { - /* - * If the transaction was committed, drop the EFD reference - * since we're bailing out of here. The other reference is - * dropped when the EFI hits the AIL. - * - * If the transaction was not committed, the EFI is freed by the - * EFI item unlock handler on abort. Also, we have a new - * transaction so we should return committed=1 even though we're - * returning an error. - */ - if (committed) { - xfs_efi_release(efi); - xfs_force_shutdown((*tp)->t_mountp, - (error == -EFSCORRUPTED) ? - SHUTDOWN_CORRUPT_INCORE : - SHUTDOWN_META_IO_ERROR); - } - return error; - } - - /* - * Get an EFD and free each extent in the list, logging to the EFD in - * the process. The remaining bmap free list is cleaned up by the caller - * on error. - */ - efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count); - for (free = flist->xbf_first; free != NULL; free = next) { - next = free->xbfi_next; - - error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock, - free->xbfi_blockcount); - if (error) - return error; - - xfs_bmap_del_free(flist, NULL, free); - } - - return 0; -} - int xfs_bmap_rtalloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ @@ -198,9 +127,9 @@ xfs_bmap_rtalloc( /* * Lock out modifications to both the RT bitmap and summary inodes */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL); /* @@ -409,7 +338,7 @@ xfs_bmap_count_tree( /* * Count fsblocks of the given fork. */ -int /* error */ +static int /* error */ xfs_bmap_count_blocks( xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode */ @@ -757,7 +686,7 @@ xfs_bmap_punch_delalloc_range( xfs_bmbt_irec_t imap; int nimaps = 1; xfs_fsblock_t firstblock; - xfs_bmap_free_t flist; + struct xfs_defer_ops dfops; /* * Map the range first and check that it is a delalloc extent @@ -788,18 +717,18 @@ xfs_bmap_punch_delalloc_range( WARN_ON(imap.br_blockcount == 0); /* - * Note: while we initialise the firstblock/flist pair, they + * Note: while we initialise the firstblock/dfops pair, they * should never be used because blocks should never be * allocated or freed for a delalloc extent and hence we need * don't cancel or finish them after the xfs_bunmapi() call. */ - xfs_bmap_init(&flist, &firstblock); + xfs_defer_init(&dfops, &firstblock); error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock, - &flist, &done); + &dfops, &done); if (error) break; - ASSERT(!flist.xbf_count && !flist.xbf_first); + ASSERT(!xfs_defer_has_unfinished_work(&dfops)); next_block: start_fsb++; remaining--; @@ -956,7 +885,7 @@ xfs_alloc_file_space( int rt; xfs_trans_t *tp; xfs_bmbt_irec_t imaps[1], *imapp; - xfs_bmap_free_t free_list; + struct xfs_defer_ops dfops; uint qblocks, resblks, resrtextents; int error; @@ -1047,17 +976,17 @@ xfs_alloc_file_space( xfs_trans_ijoin(tp, ip, 0); - xfs_bmap_init(&free_list, &firstfsb); + xfs_defer_init(&dfops, &firstfsb); error = xfs_bmapi_write(tp, ip, startoffset_fsb, allocatesize_fsb, alloc_type, &firstfsb, - resblks, imapp, &nimaps, &free_list); + resblks, imapp, &nimaps, &dfops); if (error) goto error0; /* * Complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto error0; @@ -1080,7 +1009,7 @@ xfs_alloc_file_space( return error; error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); error1: /* Just cancel transaction */ @@ -1089,99 +1018,120 @@ error1: /* Just cancel transaction */ return error; } -/* - * Zero file bytes between startoff and endoff inclusive. - * The iolock is held exclusive and no blocks are buffered. - * - * This function is used by xfs_free_file_space() to zero - * partial blocks when the range to free is not block aligned. - * When unreserving space with boundaries that are not block - * aligned we round up the start and round down the end - * boundaries and then use this function to zero the parts of - * the blocks that got dropped during the rounding. - */ -STATIC int -xfs_zero_remaining_bytes( - xfs_inode_t *ip, - xfs_off_t startoff, - xfs_off_t endoff) +static int +xfs_unmap_extent( + struct xfs_inode *ip, + xfs_fileoff_t startoffset_fsb, + xfs_filblks_t len_fsb, + int *done) { - xfs_bmbt_irec_t imap; - xfs_fileoff_t offset_fsb; - xfs_off_t lastoffset; - xfs_off_t offset; - xfs_buf_t *bp; - xfs_mount_t *mp = ip->i_mount; - int nimap; - int error = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_defer_ops dfops; + xfs_fsblock_t firstfsb; + uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + int error; - /* - * Avoid doing I/O beyond eof - it's not necessary - * since nothing can read beyond eof. The space will - * be zeroed when the file is extended anyway. - */ - if (startoff >= XFS_ISIZE(ip)) - return 0; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + if (error) { + ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + return error; + } - if (endoff > XFS_ISIZE(ip)) - endoff = XFS_ISIZE(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot, + ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS); + if (error) + goto out_trans_cancel; - for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { - uint lock_mode; + xfs_trans_ijoin(tp, ip, 0); - offset_fsb = XFS_B_TO_FSBT(mp, offset); - nimap = 1; + xfs_defer_init(&dfops, &firstfsb); + error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, &firstfsb, + &dfops, done); + if (error) + goto out_bmap_cancel; - lock_mode = xfs_ilock_data_map_shared(ip); - error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); - xfs_iunlock(ip, lock_mode); + error = xfs_defer_finish(&tp, &dfops, ip); + if (error) + goto out_bmap_cancel; - if (error || nimap < 1) - break; - ASSERT(imap.br_blockcount >= 1); - ASSERT(imap.br_startoff == offset_fsb); - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + error = xfs_trans_commit(tp); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; - if (imap.br_startblock == HOLESTARTBLOCK || - imap.br_state == XFS_EXT_UNWRITTEN) { - /* skip the entire extent */ - lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + - imap.br_blockcount) - 1; - continue; - } +out_bmap_cancel: + xfs_defer_cancel(&dfops); +out_trans_cancel: + xfs_trans_cancel(tp); + goto out_unlock; +} - lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; - if (lastoffset > endoff) - lastoffset = endoff; +static int +xfs_adjust_extent_unmap_boundaries( + struct xfs_inode *ip, + xfs_fileoff_t *startoffset_fsb, + xfs_fileoff_t *endoffset_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + int nimap, error; + xfs_extlen_t mod = 0; - /* DAX can just zero the backing device directly */ - if (IS_DAX(VFS_I(ip))) { - error = dax_zero_page_range(VFS_I(ip), offset, - lastoffset - offset + 1, - xfs_get_blocks_direct); - if (error) - return error; - continue; - } + nimap = 1; + error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0); + if (error) + return error; - error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp, - xfs_fsb_to_db(ip, imap.br_startblock), - BTOBB(mp->m_sb.sb_blocksize), - 0, &bp, NULL); - if (error) - return error; + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + xfs_daddr_t block; - memset(bp->b_addr + - (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), - 0, lastoffset - offset + 1); + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + block = imap.br_startblock; + mod = do_div(block, mp->m_sb.sb_rextsize); + if (mod) + *startoffset_fsb += mp->m_sb.sb_rextsize - mod; + } - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - return error; + nimap = 1; + error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0); + if (error) + return error; + + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + mod++; + if (mod && mod != mp->m_sb.sb_rextsize) + *endoffset_fsb -= mod; } - return error; + + return 0; +} + +static int +xfs_flush_unmap_range( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + xfs_off_t rounding, start, end; + int error; + + /* wait for the completion of any pending DIOs */ + inode_dio_wait(inode); + + rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE); + start = round_down(offset, rounding); + end = round_up(offset + len, rounding) - 1; + + error = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (error) + return error; + truncate_pagecache_range(inode, start, end); + return 0; } int @@ -1190,24 +1140,10 @@ xfs_free_file_space( xfs_off_t offset, xfs_off_t len) { - int done; - xfs_fileoff_t endoffset_fsb; - int error; - xfs_fsblock_t firstfsb; - xfs_bmap_free_t free_list; - xfs_bmbt_irec_t imap; - xfs_off_t ioffset; - xfs_off_t iendoffset; - xfs_extlen_t mod=0; - xfs_mount_t *mp; - int nimap; - uint resblks; - xfs_off_t rounding; - int rt; + struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t startoffset_fsb; - xfs_trans_t *tp; - - mp = ip->i_mount; + xfs_fileoff_t endoffset_fsb; + int done = 0, error; trace_xfs_free_file_space(ip); @@ -1215,135 +1151,45 @@ xfs_free_file_space( if (error) return error; - error = 0; if (len <= 0) /* if nothing being freed */ - return error; - rt = XFS_IS_REALTIME_INODE(ip); - startoffset_fsb = XFS_B_TO_FSB(mp, offset); - endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); - - /* wait for the completion of any pending DIOs */ - inode_dio_wait(VFS_I(ip)); + return 0; - rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE); - ioffset = round_down(offset, rounding); - iendoffset = round_up(offset + len, rounding) - 1; - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset, - iendoffset); + error = xfs_flush_unmap_range(ip, offset, len); if (error) - goto out; - truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset); + return error; + + startoffset_fsb = XFS_B_TO_FSB(mp, offset); + endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); /* - * Need to zero the stuff we're not freeing, on disk. - * If it's a realtime file & can't use unwritten extents then we - * actually need to zero the extent edges. Otherwise xfs_bunmapi - * will take care of it for us. + * Need to zero the stuff we're not freeing, on disk. If it's a RT file + * and we can't use unwritten extents then we actually need to ensure + * to zero the whole extent, otherwise we just need to take of block + * boundaries, and xfs_bunmapi will handle the rest. */ - if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { - nimap = 1; - error = xfs_bmapi_read(ip, startoffset_fsb, 1, - &imap, &nimap, 0); - if (error) - goto out; - ASSERT(nimap == 0 || nimap == 1); - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - xfs_daddr_t block; - - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - block = imap.br_startblock; - mod = do_div(block, mp->m_sb.sb_rextsize); - if (mod) - startoffset_fsb += mp->m_sb.sb_rextsize - mod; - } - nimap = 1; - error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1, - &imap, &nimap, 0); + if (XFS_IS_REALTIME_INODE(ip) && + !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb, + &endoffset_fsb); if (error) - goto out; - ASSERT(nimap == 0 || nimap == 1); - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - mod++; - if (mod && (mod != mp->m_sb.sb_rextsize)) - endoffset_fsb -= mod; - } - } - if ((done = (endoffset_fsb <= startoffset_fsb))) - /* - * One contiguous piece to clear - */ - error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1); - else { - /* - * Some full blocks, possibly two pieces to clear - */ - if (offset < XFS_FSB_TO_B(mp, startoffset_fsb)) - error = xfs_zero_remaining_bytes(ip, offset, - XFS_FSB_TO_B(mp, startoffset_fsb) - 1); - if (!error && - XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len) - error = xfs_zero_remaining_bytes(ip, - XFS_FSB_TO_B(mp, endoffset_fsb), - offset + len - 1); + return error; } - /* - * free file space until done or until there is an error - */ - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - while (!error && !done) { - - /* - * allocate and setup the transaction. Allow this - * transaction to dip into the reserve blocks to ensure - * the freeing of the space succeeds at ENOSPC. - */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, - &tp); - if (error) { - ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - break; + if (endoffset_fsb > startoffset_fsb) { + while (!done) { + error = xfs_unmap_extent(ip, startoffset_fsb, + endoffset_fsb - startoffset_fsb, &done); + if (error) + return error; } - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota(tp, mp, - ip->i_udquot, ip->i_gdquot, ip->i_pdquot, - resblks, 0, XFS_QMOPT_RES_REGBLKS); - if (error) - goto error1; - - xfs_trans_ijoin(tp, ip, 0); - - /* - * issue the bunmapi() call to free the blocks - */ - xfs_bmap_init(&free_list, &firstfsb); - error = xfs_bunmapi(tp, ip, startoffset_fsb, - endoffset_fsb - startoffset_fsb, - 0, 2, &firstfsb, &free_list, &done); - if (error) - goto error0; - - /* - * complete the transaction - */ - error = xfs_bmap_finish(&tp, &free_list, NULL); - if (error) - goto error0; - - error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); } - out: - return error; - - error0: - xfs_bmap_cancel(&free_list); - error1: - xfs_trans_cancel(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto out; + /* + * Now that we've unmap all full blocks we'll have to zero out any + * partial block at the beginning and/or end. xfs_zero_range is + * smart enough to skip any holes, including those we just created. + */ + return xfs_zero_range(ip, offset, len, NULL); } /* @@ -1405,7 +1251,7 @@ xfs_shift_file_space( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error; - struct xfs_bmap_free free_list; + struct xfs_defer_ops dfops; xfs_fsblock_t first_block; xfs_fileoff_t stop_fsb; xfs_fileoff_t next_fsb; @@ -1483,19 +1329,19 @@ xfs_shift_file_space( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_bmap_init(&free_list, &first_block); + xfs_defer_init(&dfops, &first_block); /* * We are using the write transaction in which max 2 bmbt * updates are allowed */ error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb, - &done, stop_fsb, &first_block, &free_list, + &done, stop_fsb, &first_block, &dfops, direction, XFS_BMAP_MAX_SHIFT_EXTENTS); if (error) goto out_bmap_cancel; - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto out_bmap_cancel; @@ -1505,7 +1351,7 @@ xfs_shift_file_space( return error; out_bmap_cancel: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); out_trans_cancel: xfs_trans_cancel(tp); return error; @@ -1689,6 +1535,10 @@ xfs_swap_extents( __uint64_t tmp; int lock_flags; + /* XXX: we can't do this with rmap, will fix later */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + return -EOPNOTSUPP; + tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); if (!tempifp) { error = -ENOMEM; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index af97d9a1d..68a621a8e 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -21,7 +21,7 @@ /* Kernel only BMAP related definitions and functions */ struct xfs_bmbt_irec; -struct xfs_bmap_free_item; +struct xfs_extent_free_item; struct xfs_ifork; struct xfs_inode; struct xfs_mount; @@ -31,8 +31,6 @@ struct xfs_bmalloca; int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, int whichfork, int *eof); -int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, - int whichfork, int *count); int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, xfs_fileoff_t start_fsb, xfs_fileoff_t length); @@ -42,9 +40,6 @@ int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, xfs_bmap_format_t formatter, void *arg); /* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */ -void xfs_bmap_del_free(struct xfs_bmap_free *flist, - struct xfs_bmap_free_item *prev, - struct xfs_bmap_free_item *free); int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz, int rt, int eof, int delay, int convert, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 41c20b66d..b5b9bffe3 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -79,6 +79,46 @@ xfs_buf_vmap_len( return (bp->b_page_count * PAGE_SIZE) - bp->b_offset; } +/* + * Bump the I/O in flight count on the buftarg if we haven't yet done so for + * this buffer. The count is incremented once per buffer (per hold cycle) + * because the corresponding decrement is deferred to buffer release. Buffers + * can undergo I/O multiple times in a hold-release cycle and per buffer I/O + * tracking adds unnecessary overhead. This is used for sychronization purposes + * with unmount (see xfs_wait_buftarg()), so all we really need is a count of + * in-flight buffers. + * + * Buffers that are never released (e.g., superblock, iclog buffers) must set + * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count + * never reaches zero and unmount hangs indefinitely. + */ +static inline void +xfs_buf_ioacct_inc( + struct xfs_buf *bp) +{ + if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT)) + return; + + ASSERT(bp->b_flags & XBF_ASYNC); + bp->b_flags |= _XBF_IN_FLIGHT; + percpu_counter_inc(&bp->b_target->bt_io_count); +} + +/* + * Clear the in-flight state on a buffer about to be released to the LRU or + * freed and unaccount from the buftarg. + */ +static inline void +xfs_buf_ioacct_dec( + struct xfs_buf *bp) +{ + if (!(bp->b_flags & _XBF_IN_FLIGHT)) + return; + + bp->b_flags &= ~_XBF_IN_FLIGHT; + percpu_counter_dec(&bp->b_target->bt_io_count); +} + /* * When we mark a buffer stale, we remove the buffer from the LRU and clear the * b_lru_ref count so that the buffer is freed immediately when the buffer @@ -102,6 +142,14 @@ xfs_buf_stale( */ bp->b_flags &= ~_XBF_DELWRI_Q; + /* + * Once the buffer is marked stale and unlocked, a subsequent lookup + * could reset b_flags. There is no guarantee that the buffer is + * unaccounted (released to LRU) before that occurs. Drop in-flight + * status now to preserve accounting consistency. + */ + xfs_buf_ioacct_dec(bp); + spin_lock(&bp->b_lock); atomic_set(&bp->b_lru_ref, 0); if (!(bp->b_state & XFS_BSTATE_DISPOSE) && @@ -815,7 +863,8 @@ xfs_buf_get_uncached( struct xfs_buf *bp; DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); - bp = _xfs_buf_alloc(target, &map, 1, 0); + /* flags might contain irrelevant bits, pass only what we care about */ + bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT); if (unlikely(bp == NULL)) goto fail; @@ -866,63 +915,85 @@ xfs_buf_hold( } /* - * Releases a hold on the specified buffer. If the - * the hold count is 1, calls xfs_buf_free. + * Release a hold on the specified buffer. If the hold count is 1, the buffer is + * placed on LRU or freed (depending on b_lru_ref). */ void xfs_buf_rele( xfs_buf_t *bp) { struct xfs_perag *pag = bp->b_pag; + bool release; + bool freebuf = false; trace_xfs_buf_rele(bp, _RET_IP_); if (!pag) { ASSERT(list_empty(&bp->b_lru)); ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); - if (atomic_dec_and_test(&bp->b_hold)) + if (atomic_dec_and_test(&bp->b_hold)) { + xfs_buf_ioacct_dec(bp); xfs_buf_free(bp); + } return; } ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); ASSERT(atomic_read(&bp->b_hold) > 0); - if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { - spin_lock(&bp->b_lock); - if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { - /* - * If the buffer is added to the LRU take a new - * reference to the buffer for the LRU and clear the - * (now stale) dispose list state flag - */ - if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { - bp->b_state &= ~XFS_BSTATE_DISPOSE; - atomic_inc(&bp->b_hold); - } - spin_unlock(&bp->b_lock); - spin_unlock(&pag->pag_buf_lock); - } else { - /* - * most of the time buffers will already be removed from - * the LRU, so optimise that case by checking for the - * XFS_BSTATE_DISPOSE flag indicating the last list the - * buffer was on was the disposal list - */ - if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { - list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); - } else { - ASSERT(list_empty(&bp->b_lru)); - } - spin_unlock(&bp->b_lock); - ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); - spin_unlock(&pag->pag_buf_lock); - xfs_perag_put(pag); - xfs_buf_free(bp); + release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); + spin_lock(&bp->b_lock); + if (!release) { + /* + * Drop the in-flight state if the buffer is already on the LRU + * and it holds the only reference. This is racy because we + * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT + * ensures the decrement occurs only once per-buf. + */ + if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru)) + xfs_buf_ioacct_dec(bp); + goto out_unlock; + } + + /* the last reference has been dropped ... */ + xfs_buf_ioacct_dec(bp); + if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { + /* + * If the buffer is added to the LRU take a new reference to the + * buffer for the LRU and clear the (now stale) dispose list + * state flag + */ + if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { + bp->b_state &= ~XFS_BSTATE_DISPOSE; + atomic_inc(&bp->b_hold); } + spin_unlock(&pag->pag_buf_lock); + } else { + /* + * most of the time buffers will already be removed from the + * LRU, so optimise that case by checking for the + * XFS_BSTATE_DISPOSE flag indicating the last list the buffer + * was on was the disposal list + */ + if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { + list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); + } else { + ASSERT(list_empty(&bp->b_lru)); + } + + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); + freebuf = true; } + +out_unlock: + spin_unlock(&bp->b_lock); + + if (freebuf) + xfs_buf_free(bp); } @@ -944,10 +1015,12 @@ xfs_buf_trylock( int locked; locked = down_trylock(&bp->b_sema) == 0; - if (locked) + if (locked) { XB_SET_OWNER(bp); - - trace_xfs_buf_trylock(bp, _RET_IP_); + trace_xfs_buf_trylock(bp, _RET_IP_); + } else { + trace_xfs_buf_trylock_fail(bp, _RET_IP_); + } return locked; } @@ -1127,7 +1200,8 @@ xfs_buf_ioapply_map( int map, int *buf_offset, int *count, - int rw) + int op, + int op_flags) { int page_index; int total_nr_pages = bp->b_page_count; @@ -1157,16 +1231,14 @@ xfs_buf_ioapply_map( next_chunk: atomic_inc(&bp->b_io_remaining); - nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); - if (nr_pages > total_nr_pages) - nr_pages = total_nr_pages; + nr_pages = min(total_nr_pages, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOIO, nr_pages); bio->bi_bdev = bp->b_target->bt_bdev; bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; - + bio_set_op_attrs(bio, op, op_flags); for (; size && nr_pages; nr_pages--, page_index++) { int rbytes, nbytes = PAGE_SIZE - offset; @@ -1190,7 +1262,7 @@ next_chunk: flush_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); } - submit_bio(rw, bio); + submit_bio(bio); if (size) goto next_chunk; } else { @@ -1210,7 +1282,8 @@ _xfs_buf_ioapply( struct xfs_buf *bp) { struct blk_plug plug; - int rw; + int op; + int op_flags = 0; int offset; int size; int i; @@ -1229,14 +1302,13 @@ _xfs_buf_ioapply( bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue; if (bp->b_flags & XBF_WRITE) { + op = REQ_OP_WRITE; if (bp->b_flags & XBF_SYNCIO) - rw = WRITE_SYNC; - else - rw = WRITE; + op_flags = WRITE_SYNC; if (bp->b_flags & XBF_FUA) - rw |= REQ_FUA; + op_flags |= REQ_FUA; if (bp->b_flags & XBF_FLUSH) - rw |= REQ_FLUSH; + op_flags |= REQ_PREFLUSH; /* * Run the write verifier callback function if it exists. If @@ -1266,13 +1338,14 @@ _xfs_buf_ioapply( } } } else if (bp->b_flags & XBF_READ_AHEAD) { - rw = READA; + op = REQ_OP_READ; + op_flags = REQ_RAHEAD; } else { - rw = READ; + op = REQ_OP_READ; } /* we only use the buffer cache for meta-data */ - rw |= REQ_META; + op_flags |= REQ_META; /* * Walk all the vectors issuing IO on them. Set up the initial offset @@ -1284,7 +1357,7 @@ _xfs_buf_ioapply( size = BBTOB(bp->b_io_length); blk_start_plug(&plug); for (i = 0; i < bp->b_map_count; i++) { - xfs_buf_ioapply_map(bp, i, &offset, &size, rw); + xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags); if (bp->b_error) break; if (size <= 0) @@ -1339,6 +1412,7 @@ xfs_buf_submit( * xfs_buf_ioend too early. */ atomic_set(&bp->b_io_remaining, 1); + xfs_buf_ioacct_inc(bp); _xfs_buf_ioapply(bp); /* @@ -1524,13 +1598,19 @@ xfs_wait_buftarg( int loop = 0; /* - * We need to flush the buffer workqueue to ensure that all IO - * completion processing is 100% done. Just waiting on buffer locks is - * not sufficient for async IO as the reference count held over IO is - * not released until after the buffer lock is dropped. Hence we need to - * ensure here that all reference counts have been dropped before we - * start walking the LRU list. + * First wait on the buftarg I/O count for all in-flight buffers to be + * released. This is critical as new buffers do not make the LRU until + * they are released. + * + * Next, flush the buffer workqueue to ensure all completion processing + * has finished. Just waiting on buffer locks is not sufficient for + * async IO as the reference count held over IO is not released until + * after the buffer lock is dropped. Hence we need to ensure here that + * all reference counts have been dropped before we start walking the + * LRU list. */ + while (percpu_counter_sum(&btp->bt_io_count)) + delay(100); flush_workqueue(btp->bt_mount->m_buf_workqueue); /* loop until there is nothing left on the lru list. */ @@ -1627,6 +1707,8 @@ xfs_free_buftarg( struct xfs_buftarg *btp) { unregister_shrinker(&btp->bt_shrinker); + ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); + percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); if (mp->m_flags & XFS_MOUNT_BARRIER) @@ -1691,6 +1773,9 @@ xfs_alloc_buftarg( if (list_lru_init(&btp->bt_lru)) goto error; + if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) + goto error; + btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; @@ -1774,18 +1859,33 @@ xfs_buf_cmp( return 0; } +/* + * submit buffers for write. + * + * When we have a large buffer list, we do not want to hold all the buffers + * locked while we block on the request queue waiting for IO dispatch. To avoid + * this problem, we lock and submit buffers in groups of 50, thereby minimising + * the lock hold times for lists which may contain thousands of objects. + * + * To do this, we sort the buffer list before we walk the list to lock and + * submit buffers, and we plug and unplug around each group of buffers we + * submit. + */ static int -__xfs_buf_delwri_submit( +xfs_buf_delwri_submit_buffers( struct list_head *buffer_list, - struct list_head *io_list, - bool wait) + struct list_head *wait_list) { - struct blk_plug plug; struct xfs_buf *bp, *n; + LIST_HEAD (submit_list); int pinned = 0; + struct blk_plug plug; + list_sort(NULL, buffer_list, xfs_buf_cmp); + + blk_start_plug(&plug); list_for_each_entry_safe(bp, n, buffer_list, b_list) { - if (!wait) { + if (!wait_list) { if (xfs_buf_ispinned(bp)) { pinned++; continue; @@ -1808,25 +1908,21 @@ __xfs_buf_delwri_submit( continue; } - list_move_tail(&bp->b_list, io_list); trace_xfs_buf_delwri_split(bp, _RET_IP_); - } - - list_sort(NULL, io_list, xfs_buf_cmp); - - blk_start_plug(&plug); - list_for_each_entry_safe(bp, n, io_list, b_list) { - bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); - bp->b_flags |= XBF_WRITE | XBF_ASYNC; /* - * we do all Io submission async. This means if we need to wait - * for IO completion we need to take an extra reference so the - * buffer is still valid on the other side. + * We do all IO submission async. This means if we need + * to wait for IO completion we need to take an extra + * reference so the buffer is still valid on the other + * side. We need to move the buffer onto the io_list + * at this point so the caller can still access it. */ - if (wait) + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL); + bp->b_flags |= XBF_WRITE | XBF_ASYNC; + if (wait_list) { xfs_buf_hold(bp); - else + list_move_tail(&bp->b_list, wait_list); + } else list_del_init(&bp->b_list); xfs_buf_submit(bp); @@ -1849,8 +1945,7 @@ int xfs_buf_delwri_submit_nowait( struct list_head *buffer_list) { - LIST_HEAD (io_list); - return __xfs_buf_delwri_submit(buffer_list, &io_list, false); + return xfs_buf_delwri_submit_buffers(buffer_list, NULL); } /* @@ -1865,15 +1960,15 @@ int xfs_buf_delwri_submit( struct list_head *buffer_list) { - LIST_HEAD (io_list); + LIST_HEAD (wait_list); int error = 0, error2; struct xfs_buf *bp; - __xfs_buf_delwri_submit(buffer_list, &io_list, true); + xfs_buf_delwri_submit_buffers(buffer_list, &wait_list); /* Wait for IO to complete. */ - while (!list_empty(&io_list)) { - bp = list_first_entry(&io_list, struct xfs_buf, b_list); + while (!list_empty(&wait_list)) { + bp = list_first_entry(&wait_list, struct xfs_buf, b_list); list_del_init(&bp->b_list); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 8bfb974f0..1c2e52b2d 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -43,6 +43,7 @@ typedef enum { #define XBF_READ (1 << 0) /* buffer intended for reading from device */ #define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ #define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ +#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */ #define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ #define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ @@ -62,6 +63,7 @@ typedef enum { #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ #define _XBF_COMPOUND (1 << 23)/* compound buffer */ +#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */ typedef unsigned int xfs_buf_flags_t; @@ -81,7 +83,8 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_COMPOUND, "COMPOUND" } + { _XBF_COMPOUND, "COMPOUND" }, \ + { _XBF_IN_FLIGHT, "IN_FLIGHT" } /* @@ -115,6 +118,8 @@ typedef struct xfs_buftarg { /* LRU control structures */ struct shrinker bt_shrinker; struct list_lru bt_lru; + + struct percpu_counter bt_io_count; } xfs_buftarg_t; struct xfs_buf; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 342579929..e455f9098 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -359,7 +359,7 @@ xfs_buf_item_format( for (i = 0; i < bip->bli_format_count; i++) { xfs_buf_item_format_segment(bip, lv, &vecp, offset, &bip->bli_formats[i]); - offset += bp->b_maps[i].bm_len; + offset += BBTOB(bp->b_maps[i].bm_len); } /* @@ -915,20 +915,28 @@ xfs_buf_item_log( for (i = 0; i < bip->bli_format_count; i++) { if (start > last) break; - end = start + BBTOB(bp->b_maps[i].bm_len); + end = start + BBTOB(bp->b_maps[i].bm_len) - 1; + + /* skip to the map that includes the first byte to log */ if (first > end) { start += BBTOB(bp->b_maps[i].bm_len); continue; } + + /* + * Trim the range to this segment and mark it in the bitmap. + * Note that we must convert buffer offsets to segment relative + * offsets (e.g., the first byte of each segment is byte 0 of + * that segment). + */ if (first < start) first = start; if (end > last) end = last; - - xfs_buf_item_log_segment(first, end, + xfs_buf_item_log_segment(first - start, end - start, &bip->bli_formats[i].blf_data_map[0]); - start += bp->b_maps[i].bm_len; + start += BBTOB(bp->b_maps[i].bm_len); } } @@ -949,6 +957,7 @@ xfs_buf_item_free( xfs_buf_log_item_t *bip) { xfs_buf_item_free_format(bip); + kmem_free(bip->bli_item.li_lv_shadow); kmem_zone_free(xfs_buf_item_zone, bip); } @@ -1073,6 +1082,8 @@ xfs_buf_iodone_callback_error( trace_xfs_buf_item_iodone_async(bp, _RET_IP_); ASSERT(bp->b_iodone != NULL); + cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); + /* * If the write was asynchronous then no one will be looking for the * error. If this is the first failure of this type, clear the error @@ -1080,13 +1091,12 @@ xfs_buf_iodone_callback_error( * async write failure at least once, but we also need to set the buffer * up to behave correctly now for repeated failures. */ - if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL)) || + if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) || bp->b_last_error != bp->b_error) { - bp->b_flags |= (XBF_WRITE | XBF_ASYNC | - XBF_DONE | XBF_WRITE_FAIL); + bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); bp->b_last_error = bp->b_error; - bp->b_retries = 0; - bp->b_first_retry_time = jiffies; + if (cfg->retry_timeout && !bp->b_first_retry_time) + bp->b_first_retry_time = jiffies; xfs_buf_ioerror(bp, 0); xfs_buf_submit(bp); @@ -1097,7 +1107,6 @@ xfs_buf_iodone_callback_error( * Repeated failure on an async write. Take action according to the * error configuration we have been set up to use. */ - cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && ++bp->b_retries > cfg->max_retries) diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 272c3f8b6..4ff499aa7 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -179,7 +179,7 @@ xfs_ioc_trim( * matter as trimming blocks is an advisory interface. */ if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || - range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) || + range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) || range.len < mp->m_sb.sb_blocksize) return -EINVAL; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index e0646659c..7a30b8f11 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -23,6 +23,7 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" @@ -74,6 +75,7 @@ xfs_qm_dqdestroy( { ASSERT(list_empty(&dqp->q_lru)); + kmem_free(dqp->q_logitem.qli_item.li_lv_shadow); mutex_destroy(&dqp->q_qlock); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot); @@ -306,7 +308,7 @@ xfs_qm_dqalloc( xfs_buf_t **O_bpp) { xfs_fsblock_t firstblock; - xfs_bmap_free_t flist; + struct xfs_defer_ops dfops; xfs_bmbt_irec_t map; int nmaps, error; xfs_buf_t *bp; @@ -319,7 +321,7 @@ xfs_qm_dqalloc( /* * Initialize the bmap freelist prior to calling bmapi code. */ - xfs_bmap_init(&flist, &firstblock); + xfs_defer_init(&dfops, &firstblock); xfs_ilock(quotip, XFS_ILOCK_EXCL); /* * Return if this type of quotas is turned off while we didn't @@ -335,7 +337,7 @@ xfs_qm_dqalloc( error = xfs_bmapi_write(tp, quotip, offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), - &map, &nmaps, &flist); + &map, &nmaps, &dfops); if (error) goto error0; ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); @@ -367,7 +369,7 @@ xfs_qm_dqalloc( dqp->dq_flags & XFS_DQ_ALLTYPES, bp); /* - * xfs_bmap_finish() may commit the current transaction and + * xfs_defer_finish() may commit the current transaction and * start a second transaction if the freelist is not empty. * * Since we still want to modify this buffer, we need to @@ -381,7 +383,7 @@ xfs_qm_dqalloc( xfs_trans_bhold(tp, bp); - error = xfs_bmap_finish(tpp, &flist, NULL); + error = xfs_defer_finish(tpp, &dfops, NULL); if (error) goto error1; @@ -397,7 +399,7 @@ xfs_qm_dqalloc( return 0; error1: - xfs_bmap_cancel(&flist); + xfs_defer_cancel(&dfops); error0: xfs_iunlock(quotip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 814cff94e..2c7a1629e 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -370,6 +370,8 @@ xfs_qm_qoffend_logitem_committed( spin_lock(&ailp->xa_lock); xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); + kmem_free(qfs->qql_item.li_lv_shadow); + kmem_free(lip->li_lv_shadow); kmem_free(qfs); kmem_free(qfe); return (xfs_lsn_t)-1; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 88693a98f..ed7ee4e8a 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -55,12 +55,15 @@ xfs_error_test(int error_tag, int *fsidp, char *expression, } int -xfs_errortag_add(int error_tag, xfs_mount_t *mp) +xfs_errortag_add(unsigned int error_tag, xfs_mount_t *mp) { int i; int len; int64_t fsid; + if (error_tag >= XFS_ERRTAG_MAX) + return -EINVAL; + memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 4ed3042a0..3d224702f 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -90,7 +90,9 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_ERRTAG_STRATCMPL_IOERR 19 #define XFS_ERRTAG_DIOWRITE_IOERR 20 #define XFS_ERRTAG_BMAPIFORMAT 21 -#define XFS_ERRTAG_MAX 22 +#define XFS_ERRTAG_FREE_EXTENT 22 +#define XFS_ERRTAG_RMAP_FINISH_ONE 23 +#define XFS_ERRTAG_MAX 24 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -117,6 +119,8 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10) #define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) #define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT +#define XFS_RANDOM_FREE_EXTENT 1 +#define XFS_RANDOM_RMAP_FINISH_ONE 1 #ifdef DEBUG extern int xfs_error_test_active; @@ -128,7 +132,7 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ (rf)))) -extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp); +extern int xfs_errortag_add(unsigned int error_tag, struct xfs_mount *mp); extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); #else #define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index a1b2dd828..fe1bfee35 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -246,7 +246,7 @@ const struct export_operations xfs_export_operations = { .fh_to_parent = xfs_fs_fh_to_parent, .get_parent = xfs_fs_get_parent, .commit_metadata = xfs_fs_nfs_commit_metadata, -#ifdef CONFIG_NFSD_BLOCKLAYOUT +#ifdef CONFIG_EXPORTFS_BLOCK_OPS .get_uuid = xfs_fs_get_uuid, .map_blocks = xfs_fs_map_blocks, .commit_blocks = xfs_fs_commit_blocks, diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 4aa015321..d7bc14906 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -20,12 +20,15 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" +#include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_extfree_item.h" #include "xfs_log.h" +#include "xfs_btree.h" +#include "xfs_rmap.h" kmem_zone_t *xfs_efi_zone; @@ -40,6 +43,7 @@ void xfs_efi_item_free( struct xfs_efi_log_item *efip) { + kmem_free(efip->efi_item.li_lv_shadow); if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) kmem_free(efip); else @@ -300,6 +304,7 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) STATIC void xfs_efd_item_free(struct xfs_efd_log_item *efdp) { + kmem_free(efdp->efd_item.li_lv_shadow); if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) kmem_free(efdp); else @@ -484,3 +489,69 @@ xfs_efd_init( return efdp; } + +/* + * Process an extent free intent item that was recovered from + * the log. We need to free the extents that it describes. + */ +int +xfs_efi_recover( + struct xfs_mount *mp, + struct xfs_efi_log_item *efip) +{ + struct xfs_efd_log_item *efdp; + struct xfs_trans *tp; + int i; + int error = 0; + xfs_extent_t *extp; + xfs_fsblock_t startblock_fsb; + struct xfs_owner_info oinfo; + + ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); + + /* + * First check the validity of the extents described by the + * EFI. If any are bad, then assume that all are bad and + * just toss the EFI. + */ + for (i = 0; i < efip->efi_format.efi_nextents; i++) { + extp = &efip->efi_format.efi_extents[i]; + startblock_fsb = XFS_BB_TO_FSB(mp, + XFS_FSB_TO_DADDR(mp, extp->ext_start)); + if (startblock_fsb == 0 || + extp->ext_len == 0 || + startblock_fsb >= mp->m_sb.sb_dblocks || + extp->ext_len >= mp->m_sb.sb_agblocks) { + /* + * This will pull the EFI from the AIL and + * free the memory associated with it. + */ + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); + xfs_efi_release(efip); + return -EIO; + } + } + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + if (error) + return error; + efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); + + xfs_rmap_skip_owner_update(&oinfo); + for (i = 0; i < efip->efi_format.efi_nextents; i++) { + extp = &efip->efi_format.efi_extents[i]; + error = xfs_trans_free_extent(tp, efdp, extp->ext_start, + extp->ext_len, &oinfo); + if (error) + goto abort_error; + + } + + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); + error = xfs_trans_commit(tp); + return error; + +abort_error: + xfs_trans_cancel(tp); + return error; +} diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 8fa865170..a32c794a8 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -98,4 +98,7 @@ int xfs_efi_copy_format(xfs_log_iovec_t *buf, void xfs_efi_item_free(xfs_efi_log_item_t *); void xfs_efi_release(struct xfs_efi_log_item *); +int xfs_efi_recover(struct xfs_mount *mp, + struct xfs_efi_log_item *efip); + #endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 47fc63295..e612a0233 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -37,6 +37,7 @@ #include "xfs_log.h" #include "xfs_icache.h" #include "xfs_pnfs.h" +#include "xfs_iomap.h" #include #include @@ -80,61 +81,17 @@ xfs_rw_ilock_demote( } /* - * xfs_iozero clears the specified range supplied via the page cache (except in - * the DAX case). Writes through the page cache will allocate blocks over holes, - * though the callers usually map the holes first and avoid them. If a block is - * not completely zeroed, then it will be read from disk before being partially - * zeroed. - * - * In the DAX case, we can just directly write to the underlying pages. This - * will not allocate blocks, but will avoid holes and unwritten extents and so - * not do unnecessary work. + * Clear the specified ranges to zero through either the pagecache or DAX. + * Holes and unwritten extents will be left as-is as they already are zeroed. */ int -xfs_iozero( - struct xfs_inode *ip, /* inode */ - loff_t pos, /* offset in file */ - size_t count) /* size of data to zero */ +xfs_zero_range( + struct xfs_inode *ip, + xfs_off_t pos, + xfs_off_t count, + bool *did_zero) { - struct page *page; - struct address_space *mapping; - int status = 0; - - - mapping = VFS_I(ip)->i_mapping; - do { - unsigned offset, bytes; - void *fsdata; - - offset = (pos & (PAGE_SIZE -1)); /* Within page */ - bytes = PAGE_SIZE - offset; - if (bytes > count) - bytes = count; - - if (IS_DAX(VFS_I(ip))) { - status = dax_zero_page_range(VFS_I(ip), pos, bytes, - xfs_get_blocks_direct); - if (status) - break; - } else { - status = pagecache_write_begin(NULL, mapping, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); - if (status) - break; - - zero_user(page, offset, bytes); - - status = pagecache_write_end(NULL, mapping, pos, bytes, - bytes, page, fsdata); - WARN_ON(status <= 0); /* can't return less than zero! */ - status = 0; - } - pos += bytes; - count -= bytes; - } while (count); - - return status; + return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops); } int @@ -282,48 +239,35 @@ xfs_file_fsync( } STATIC ssize_t -xfs_file_read_iter( +xfs_file_dio_aio_read( struct kiocb *iocb, struct iov_iter *to) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - size_t size = iov_iter_count(to); + loff_t isize = i_size_read(inode); + size_t count = iov_iter_count(to); + struct iov_iter data; + struct xfs_buftarg *target; ssize_t ret = 0; - int ioflags = 0; - xfs_fsize_t n; - loff_t pos = iocb->ki_pos; - XFS_STATS_INC(mp, xs_read_calls); - - if (unlikely(iocb->ki_flags & IOCB_DIRECT)) - ioflags |= XFS_IO_ISDIRECT; - if (file->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - - if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - /* DIO must be aligned to device logical sector size */ - if ((pos | size) & target->bt_logical_sectormask) { - if (pos == i_size_read(inode)) - return 0; - return -EINVAL; - } - } + trace_xfs_file_direct_read(ip, count, iocb->ki_pos); - n = mp->m_super->s_maxbytes - pos; - if (n <= 0 || size == 0) - return 0; + if (!count) + return 0; /* skip atime */ - if (n < size) - size = n; + if (XFS_IS_REALTIME_INODE(ip)) + target = ip->i_mount->m_rtdev_targp; + else + target = ip->i_mount->m_ddev_targp; - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; + /* DIO must be aligned to device logical sector size */ + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { + if (iocb->ki_pos == isize) + return 0; + return -EINVAL; + } /* * Locking is a bit tricky here. If we take an exclusive lock for direct @@ -336,7 +280,7 @@ xfs_file_read_iter( * serialisation. */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) { + if (mapping->nrpages) { xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); @@ -351,8 +295,8 @@ xfs_file_read_iter( * flush and reduce the chances of repeated iolock cycles going * forward. */ - if (inode->i_mapping->nrpages) { - ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (mapping->nrpages) { + ret = filemap_write_and_wait(mapping); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; @@ -363,20 +307,95 @@ xfs_file_read_iter( * we fail to invalidate a page, but this should never * happen on XFS. Warn if it does fail. */ - ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); + ret = invalidate_inode_pages2(mapping); WARN_ON_ONCE(ret); ret = 0; } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } - trace_xfs_file_read(ip, size, pos, ioflags); + data = *to; + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, NULL, NULL, 0); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(to, ret); + } + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + file_accessed(iocb->ki_filp); + return ret; +} + +static noinline ssize_t +xfs_file_dax_read( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct iov_iter data = *to; + size_t count = iov_iter_count(to); + ssize_t ret = 0; + + trace_xfs_file_dax_read(ip, count, iocb->ki_pos); + + if (!count) + return 0; /* skip atime */ + + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(to, ret); + } + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + file_accessed(iocb->ki_filp); + return ret; +} + +STATIC ssize_t +xfs_file_buffered_aio_read( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); + ssize_t ret; + + trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); ret = generic_file_read_iter(iocb, to); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + return ret; +} + +STATIC ssize_t +xfs_file_read_iter( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_mount *mp = XFS_I(inode)->i_mount; + ssize_t ret = 0; + + XFS_STATS_INC(mp, xs_read_calls); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + if (IS_DAX(inode)) + ret = xfs_file_dax_read(iocb, to); + else if (iocb->ki_flags & IOCB_DIRECT) + ret = xfs_file_dio_aio_read(iocb, to); + else + ret = xfs_file_buffered_aio_read(iocb, to); + if (ret > 0) XFS_STATS_ADD(mp, xs_read_bytes, ret); - - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -389,18 +408,14 @@ xfs_file_splice_read( unsigned int flags) { struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); - int ioflags = 0; ssize_t ret; XFS_STATS_INC(ip->i_mount, xs_read_calls); - if (infilp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - trace_xfs_file_splice_read(ip, count, *ppos, ioflags); + trace_xfs_file_splice_read(ip, count, *ppos); /* * DAX inodes cannot ues the page cache for splice, so we have to push @@ -423,49 +438,6 @@ out: return ret; } -/* - * This routine is called to handle zeroing any space in the last block of the - * file that is beyond the EOF. We do this since the size is being increased - * without writing anything to that block and we don't want to read the - * garbage on the disk. - */ -STATIC int /* error (positive) */ -xfs_zero_last_block( - struct xfs_inode *ip, - xfs_fsize_t offset, - xfs_fsize_t isize, - bool *did_zeroing) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); - int zero_offset = XFS_B_FSB_OFFSET(mp, isize); - int zero_len; - int nimaps = 1; - int error = 0; - struct xfs_bmbt_irec imap; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; - - ASSERT(nimaps > 0); - - /* - * If the block underlying isize is just a hole, then there - * is nothing to zero. - */ - if (imap.br_startblock == HOLESTARTBLOCK) - return 0; - - zero_len = mp->m_sb.sb_blocksize - zero_offset; - if (isize + zero_len > offset) - zero_len = offset - isize; - *did_zeroing = true; - return xfs_iozero(ip, isize, zero_len); -} - /* * Zero any on disk space between the current EOF and the new, larger EOF. * @@ -484,94 +456,11 @@ xfs_zero_eof( xfs_fsize_t isize, /* current inode size */ bool *did_zeroing) { - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t start_zero_fsb; - xfs_fileoff_t end_zero_fsb; - xfs_fileoff_t zero_count_fsb; - xfs_fileoff_t last_fsb; - xfs_fileoff_t zero_off; - xfs_fsize_t zero_len; - int nimaps; - int error = 0; - struct xfs_bmbt_irec imap; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(offset > isize); trace_xfs_zero_eof(ip, isize, offset - isize); - - /* - * First handle zeroing the block on which isize resides. - * - * We only zero a part of that block so it is handled specially. - */ - if (XFS_B_FSB_OFFSET(mp, isize) != 0) { - error = xfs_zero_last_block(ip, offset, isize, did_zeroing); - if (error) - return error; - } - - /* - * Calculate the range between the new size and the old where blocks - * needing to be zeroed may exist. - * - * To get the block where the last byte in the file currently resides, - * we need to subtract one from the size and truncate back to a block - * boundary. We subtract 1 in case the size is exactly on a block - * boundary. - */ - last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; - start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); - end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); - ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); - if (last_fsb == end_zero_fsb) { - /* - * The size was only incremented on its last block. - * We took care of that above, so just return. - */ - return 0; - } - - ASSERT(start_zero_fsb <= end_zero_fsb); - while (start_zero_fsb <= end_zero_fsb) { - nimaps = 1; - zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb, - &imap, &nimaps, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; - - ASSERT(nimaps > 0); - - if (imap.br_state == XFS_EXT_UNWRITTEN || - imap.br_startblock == HOLESTARTBLOCK) { - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - continue; - } - - /* - * There are blocks we need to zero. - */ - zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); - zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); - - if ((zero_off + zero_len) > offset) - zero_len = offset - zero_off; - - error = xfs_iozero(ip, zero_off, zero_len); - if (error) - return error; - - *did_zeroing = true; - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - } - - return 0; + return xfs_zero_range(ip, isize, offset - isize, did_zeroing); } /* @@ -722,8 +611,7 @@ xfs_file_dio_aio_write( mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ - if (!IS_DAX(inode) && - ((iocb->ki_pos | count) & target->bt_logical_sectormask)) + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; /* "unaligned" here means not aligned to a filesystem block */ @@ -762,7 +650,7 @@ xfs_file_dio_aio_write( end = iocb->ki_pos + count - 1; /* - * See xfs_file_read_iter() for why we do a full-file flush here. + * See xfs_file_dio_aio_read() for why we do a full-file flush here. */ if (mapping->nrpages) { ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); @@ -789,10 +677,12 @@ xfs_file_dio_aio_write( iolock = XFS_IOLOCK_SHARED; } - trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); + trace_xfs_file_direct_write(ip, count, iocb->ki_pos); data = *from; - ret = mapping->a_ops->direct_IO(iocb, &data); + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, xfs_end_io_direct_write, + NULL, DIO_ASYNC_EXTEND); /* see generic_file_direct_write() for why this is necessary */ if (mapping->nrpages) { @@ -809,10 +699,81 @@ out: xfs_rw_iunlock(ip, iolock); /* - * No fallback to buffered IO on errors for XFS. DAX can result in - * partial writes, but direct IO will either complete fully or fail. + * No fallback to buffered IO on errors for XFS, direct IO will either + * complete fully or fail. + */ + ASSERT(ret < 0 || ret == count); + return ret; +} + +static noinline ssize_t +xfs_file_dax_write( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + int unaligned_io = 0; + int iolock; + struct iov_iter data; + + /* "unaligned" here means not aligned to a filesystem block */ + if ((iocb->ki_pos & mp->m_blockmask) || + ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) { + unaligned_io = 1; + iolock = XFS_IOLOCK_EXCL; + } else if (mapping->nrpages) { + iolock = XFS_IOLOCK_EXCL; + } else { + iolock = XFS_IOLOCK_SHARED; + } + xfs_rw_ilock(ip, iolock); + + ret = xfs_file_aio_write_checks(iocb, from, &iolock); + if (ret) + goto out; + + /* + * Yes, even DAX files can have page cache attached to them: A zeroed + * page is inserted into the pagecache when we have to serve a write + * fault on a hole. It should never be dirtied and can simply be + * dropped from the pagecache once we get real data for the page. + * + * XXX: This is racy against mmap, and there's nothing we can do about + * it. dax_do_io() should really do this invalidation internally as + * it will know if we've allocated over a holei for this specific IO and + * if so it needs to update the mapping tree and invalidate existing + * PTEs over the newly allocated range. Remove this invalidation when + * dax_do_io() is fixed up. */ - ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); + if (mapping->nrpages) { + loff_t end = iocb->ki_pos + iov_iter_count(from) - 1; + + ret = invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, + end >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + } + + if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) { + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; + } + + trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos); + + data = *from; + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, + xfs_end_io_direct_write, 0); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(from, ret); + } +out: + xfs_rw_iunlock(ip, iolock); return ret; } @@ -839,9 +800,8 @@ xfs_file_buffered_aio_write( current->backing_dev_info = inode_to_bdi(inode); write_retry: - trace_xfs_file_buffered_write(ip, iov_iter_count(from), - iocb->ki_pos, 0); - ret = generic_perform_write(file, from, iocb->ki_pos); + trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); + ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); if (likely(ret >= 0)) iocb->ki_pos += ret; @@ -895,7 +855,9 @@ xfs_file_write_iter( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) + if (IS_DAX(inode)) + ret = xfs_file_dax_write(iocb, from); + else if (iocb->ki_flags & IOCB_DIRECT) ret = xfs_file_dio_aio_write(iocb, from); else ret = xfs_file_buffered_aio_write(iocb, from); @@ -1551,9 +1513,9 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); + ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); } else { - ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); + ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); } @@ -1585,7 +1547,7 @@ xfs_filemap_fault( * changes to xfs_get_blocks_direct() to map unwritten extent * ioend for conversion on read-only mappings. */ - ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault); + ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault); } else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1622,7 +1584,7 @@ xfs_filemap_pmd_fault( } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault); + ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (flags & FAULT_FLAG_WRITE) diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index a51353a1f..4a33a3304 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -22,6 +22,7 @@ #include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" @@ -385,7 +386,7 @@ xfs_filestream_new_ag( } flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | - (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0); + (ap->dfops->dop_low ? XFS_PICK_LOWSPACE : 0); err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index b4d75825a..0b7f98674 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -23,6 +23,7 @@ #include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_inode.h" @@ -32,6 +33,7 @@ #include "xfs_btree.h" #include "xfs_alloc_btree.h" #include "xfs_alloc.h" +#include "xfs_rmap_btree.h" #include "xfs_ialloc.h" #include "xfs_fsops.h" #include "xfs_itable.h" @@ -40,6 +42,7 @@ #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_filestream.h" +#include "xfs_rmap.h" /* * File system operations @@ -103,7 +106,9 @@ xfs_fs_geometry( (xfs_sb_version_hasfinobt(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_FINOBT : 0) | (xfs_sb_version_hassparseinodes(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_SPINODES : 0); + XFS_FSOP_GEOM_FLAGS_SPINODES : 0) | + (xfs_sb_version_hasrmapbt(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_RMAPBT : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; @@ -239,10 +244,17 @@ xfs_growfs_data_private( agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + agf->agf_roots[XFS_BTNUM_RMAPi] = + cpu_to_be32(XFS_RMAP_BLOCK(mp)); + agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1); + agf->agf_rmap_blocks = cpu_to_be32(1); + } + agf->agf_flfirst = cpu_to_be32(1); agf->agf_fllast = 0; agf->agf_flcount = 0; - tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); + tmpsize = agsize - mp->m_ag_prealloc_blocks; agf->agf_freeblks = cpu_to_be32(tmpsize); agf->agf_longest = cpu_to_be32(tmpsize); if (xfs_sb_version_hascrc(&mp->m_sb)) @@ -339,7 +351,7 @@ xfs_growfs_data_private( agno, 0); arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); - arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); + arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); @@ -368,7 +380,7 @@ xfs_growfs_data_private( agno, 0); arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); - arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); + arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); nfree += be32_to_cpu(arec->ar_blockcount); @@ -378,6 +390,72 @@ xfs_growfs_data_private( if (error) goto error0; + /* RMAP btree root block */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + struct xfs_rmap_rec *rrec; + struct xfs_btree_block *block; + + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_RMAP_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_rmapbt_buf_ops); + if (!bp) { + error = -ENOMEM; + goto error0; + } + + xfs_btree_init_block(mp, bp, XFS_RMAP_CRC_MAGIC, 0, 0, + agno, XFS_BTREE_CRC_BLOCKS); + block = XFS_BUF_TO_BLOCK(bp); + + + /* + * mark the AG header regions as static metadata The BNO + * btree block is the first block after the headers, so + * it's location defines the size of region the static + * metadata consumes. + * + * Note: unlike mkfs, we never have to account for log + * space when growing the data regions + */ + rrec = XFS_RMAP_REC_ADDR(block, 1); + rrec->rm_startblock = 0; + rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp)); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS); + rrec->rm_offset = 0; + be16_add_cpu(&block->bb_numrecs, 1); + + /* account freespace btree root blocks */ + rrec = XFS_RMAP_REC_ADDR(block, 2); + rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp)); + rrec->rm_blockcount = cpu_to_be32(2); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); + rrec->rm_offset = 0; + be16_add_cpu(&block->bb_numrecs, 1); + + /* account inode btree root blocks */ + rrec = XFS_RMAP_REC_ADDR(block, 3); + rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp)); + rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) - + XFS_IBT_BLOCK(mp)); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT); + rrec->rm_offset = 0; + be16_add_cpu(&block->bb_numrecs, 1); + + /* account for rmap btree root */ + rrec = XFS_RMAP_REC_ADDR(block, 4); + rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp)); + rrec->rm_blockcount = cpu_to_be32(1); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); + rrec->rm_offset = 0; + be16_add_cpu(&block->bb_numrecs, 1); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + } + /* * INO btree root block */ @@ -435,6 +513,8 @@ xfs_growfs_data_private( * There are new blocks in the old last a.g. */ if (new) { + struct xfs_owner_info oinfo; + /* * Change the agi length. */ @@ -462,14 +542,20 @@ xfs_growfs_data_private( be32_to_cpu(agi->agi_length)); xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); + /* * Free the new space. + * + * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that + * this doesn't actually exist in the rmap btree. */ - error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno, - be32_to_cpu(agf->agf_length) - new), new); - if (error) { + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL); + error = xfs_free_extent(tp, + XFS_AGB_TO_FSB(mp, agno, + be32_to_cpu(agf->agf_length) - new), + new, &oinfo); + if (error) goto error0; - } } /* @@ -501,6 +587,7 @@ xfs_growfs_data_private( } else mp->m_maxicount = 0; xfs_set_low_space_thresholds(mp); + mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { @@ -638,7 +725,7 @@ xfs_fs_counts( cnt->allocino = percpu_counter_read_positive(&mp->m_icount); cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - XFS_ALLOC_SET_ASIDE(mp); + mp->m_alloc_set_aside; spin_lock(&mp->m_sb_lock); cnt->freertx = mp->m_sb.sb_frextents; @@ -667,8 +754,11 @@ xfs_reserve_blocks( __uint64_t *inval, xfs_fsop_resblks_t *outval) { - __int64_t lcounter, delta, fdblks_delta; + __int64_t lcounter, delta; + __int64_t fdblks_delta = 0; __uint64_t request; + __int64_t free; + int error = 0; /* If inval is null, report current values and return */ if (inval == (__uint64_t *)NULL) { @@ -682,24 +772,23 @@ xfs_reserve_blocks( request = *inval; /* - * With per-cpu counters, this becomes an interesting - * problem. we needto work out if we are freeing or allocation - * blocks first, then we can do the modification as necessary. + * With per-cpu counters, this becomes an interesting problem. we need + * to work out if we are freeing or allocation blocks first, then we can + * do the modification as necessary. * - * We do this under the m_sb_lock so that if we are near - * ENOSPC, we will hold out any changes while we work out - * what to do. This means that the amount of free space can - * change while we do this, so we need to retry if we end up - * trying to reserve more space than is available. + * We do this under the m_sb_lock so that if we are near ENOSPC, we will + * hold out any changes while we work out what to do. This means that + * the amount of free space can change while we do this, so we need to + * retry if we end up trying to reserve more space than is available. */ -retry: spin_lock(&mp->m_sb_lock); /* * If our previous reservation was larger than the current value, - * then move any unused blocks back to the free pool. + * then move any unused blocks back to the free pool. Modify the resblks + * counters directly since we shouldn't have any problems unreserving + * space. */ - fdblks_delta = 0; if (mp->m_resblks > request) { lcounter = mp->m_resblks_avail - request; if (lcounter > 0) { /* release unused blocks */ @@ -707,54 +796,67 @@ retry: mp->m_resblks_avail -= lcounter; } mp->m_resblks = request; - } else { - __int64_t free; + if (fdblks_delta) { + spin_unlock(&mp->m_sb_lock); + error = xfs_mod_fdblocks(mp, fdblks_delta, 0); + spin_lock(&mp->m_sb_lock); + } + + goto out; + } + /* + * If the request is larger than the current reservation, reserve the + * blocks before we update the reserve counters. Sample m_fdblocks and + * perform a partial reservation if the request exceeds free space. + */ + error = -ENOSPC; + do { free = percpu_counter_sum(&mp->m_fdblocks) - - XFS_ALLOC_SET_ASIDE(mp); + mp->m_alloc_set_aside; if (!free) - goto out; /* ENOSPC and fdblks_delta = 0 */ + break; delta = request - mp->m_resblks; lcounter = free - delta; - if (lcounter < 0) { + if (lcounter < 0) /* We can't satisfy the request, just get what we can */ - mp->m_resblks += free; - mp->m_resblks_avail += free; - fdblks_delta = -free; - } else { - fdblks_delta = -delta; - mp->m_resblks = request; - mp->m_resblks_avail += delta; - } - } -out: - if (outval) { - outval->resblks = mp->m_resblks; - outval->resblks_avail = mp->m_resblks_avail; - } - spin_unlock(&mp->m_sb_lock); + fdblks_delta = free; + else + fdblks_delta = delta; - if (fdblks_delta) { /* - * If we are putting blocks back here, m_resblks_avail is - * already at its max so this will put it in the free pool. - * - * If we need space, we'll either succeed in getting it - * from the free block count or we'll get an enospc. If - * we get a ENOSPC, it means things changed while we were - * calculating fdblks_delta and so we should try again to - * see if there is anything left to reserve. + * We'll either succeed in getting space from the free block + * count or we'll get an ENOSPC. If we get a ENOSPC, it means + * things changed while we were calculating fdblks_delta and so + * we should try again to see if there is anything left to + * reserve. * * Don't set the reserved flag here - we don't want to reserve * the extra reserve blocks from the reserve..... */ - int error; - error = xfs_mod_fdblocks(mp, fdblks_delta, 0); - if (error == -ENOSPC) - goto retry; + spin_unlock(&mp->m_sb_lock); + error = xfs_mod_fdblocks(mp, -fdblks_delta, 0); + spin_lock(&mp->m_sb_lock); + } while (error == -ENOSPC); + + /* + * Update the reserve counters if blocks have been successfully + * allocated. + */ + if (!error && fdblks_delta) { + mp->m_resblks += fdblks_delta; + mp->m_resblks_avail += fdblks_delta; } - return 0; + +out: + if (outval) { + outval->resblks = mp->m_resblks; + outval->resblks_avail = mp->m_resblks_avail; + } + + spin_unlock(&mp->m_sb_lock); + return error; } int diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 99ee6eee5..fb39a6691 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -765,7 +765,7 @@ restart: * Background scanning to trim post-EOF preallocated space. This is queued * based on the 'speculative_prealloc_lifetime' tunable (5m by default). */ -STATIC void +void xfs_queue_eofblocks( struct xfs_mount *mp) { diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 62f1f91c3..05bac99be 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -68,6 +68,7 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip); void xfs_eofblocks_worker(struct work_struct *); +void xfs_queue_eofblocks(struct xfs_mount *); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ee6799e04..e08eaea63 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -25,6 +25,7 @@ #include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -431,7 +432,7 @@ xfs_lock_inumorder(int lock_mode, int subclass) * lock more than one at a time, lockdep will report false positives saying we * have violated locking orders. */ -void +static void xfs_lock_inodes( xfs_inode_t **ips, int inodes, @@ -667,14 +668,6 @@ xfs_ip2xflags( return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip)); } -uint -xfs_dic2xflags( - struct xfs_dinode *dip) -{ - return _xfs_dic2xflags(be16_to_cpu(dip->di_flags), - be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip)); -} - /* * Lookups up an inode from "name". If ci_name is not NULL, then a CI match * is allowed, otherwise it has to be an exact match. If a CI match is found, @@ -748,7 +741,7 @@ out_unlock: * are not linked into the directory structure - they are attached * directly to the superblock - and so have no parent. */ -int +static int xfs_ialloc( xfs_trans_t *tp, xfs_inode_t *pip, @@ -1085,7 +1078,7 @@ xfs_dir_ialloc( * link count to go to zero, move the inode to AGI unlinked list so that it can * be freed when the last active reference goes away via xfs_inactive(). */ -int /* error */ +static int /* error */ xfs_droplink( xfs_trans_t *tp, xfs_inode_t *ip) @@ -1104,7 +1097,7 @@ xfs_droplink( /* * Increment the link count on an inode & log the change. */ -int +static int xfs_bumplink( xfs_trans_t *tp, xfs_inode_t *ip) @@ -1130,7 +1123,7 @@ xfs_create( struct xfs_inode *ip = NULL; struct xfs_trans *tp = NULL; int error; - xfs_bmap_free_t free_list; + struct xfs_defer_ops dfops; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; prid_t prid; @@ -1190,7 +1183,7 @@ xfs_create( XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); unlock_dp_on_error = true; - xfs_bmap_init(&free_list, &first_block); + xfs_defer_init(&dfops, &first_block); /* * Reserve disk quota and the inode. @@ -1227,7 +1220,7 @@ xfs_create( unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, - &first_block, &free_list, resblks ? + &first_block, &dfops, resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0); if (error) { ASSERT(error != -ENOSPC); @@ -1261,7 +1254,7 @@ xfs_create( */ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto out_bmap_cancel; @@ -1277,7 +1270,7 @@ xfs_create( return 0; out_bmap_cancel: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); out_trans_cancel: xfs_trans_cancel(tp); out_release_inode: @@ -1409,7 +1402,7 @@ xfs_link( xfs_mount_t *mp = tdp->i_mount; xfs_trans_t *tp; int error; - xfs_bmap_free_t free_list; + struct xfs_defer_ops dfops; xfs_fsblock_t first_block; int resblks; @@ -1460,7 +1453,7 @@ xfs_link( goto error_return; } - xfs_bmap_init(&free_list, &first_block); + xfs_defer_init(&dfops, &first_block); /* * Handle initial link state of O_TMPFILE inode @@ -1472,7 +1465,7 @@ xfs_link( } error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, - &first_block, &free_list, resblks); + &first_block, &dfops, resblks); if (error) goto error_return; xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -1490,9 +1483,9 @@ xfs_link( if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) { - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); goto error_return; } @@ -1534,7 +1527,7 @@ xfs_itruncate_extents( { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; - xfs_bmap_free_t free_list; + struct xfs_defer_ops dfops; xfs_fsblock_t first_block; xfs_fileoff_t first_unmap_block; xfs_fileoff_t last_block; @@ -1570,12 +1563,12 @@ xfs_itruncate_extents( ASSERT(first_unmap_block < last_block); unmap_len = last_block - first_unmap_block + 1; while (!done) { - xfs_bmap_init(&free_list, &first_block); + xfs_defer_init(&dfops, &first_block); error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, xfs_bmapi_aflag(whichfork), XFS_ITRUNC_MAX_EXTENTS, - &first_block, &free_list, + &first_block, &dfops, &done); if (error) goto out_bmap_cancel; @@ -1584,7 +1577,7 @@ xfs_itruncate_extents( * Duplicate the transaction that has the permanent * reservation and commit the old transaction. */ - error = xfs_bmap_finish(&tp, &free_list, ip); + error = xfs_defer_finish(&tp, &dfops, ip); if (error) goto out_bmap_cancel; @@ -1610,7 +1603,7 @@ out_bmap_cancel: * the transaction can be properly aborted. We just need to make sure * we're not holding any resources that we were not when we came in. */ - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); goto out; } @@ -1751,7 +1744,7 @@ STATIC int xfs_inactive_ifree( struct xfs_inode *ip) { - xfs_bmap_free_t free_list; + struct xfs_defer_ops dfops; xfs_fsblock_t first_block; struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; @@ -1788,8 +1781,8 @@ xfs_inactive_ifree( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - xfs_bmap_init(&free_list, &first_block); - error = xfs_ifree(tp, ip, &free_list); + xfs_defer_init(&dfops, &first_block); + error = xfs_ifree(tp, ip, &dfops); if (error) { /* * If we fail to free the inode, shut down. The cancel @@ -1815,11 +1808,11 @@ xfs_inactive_ifree( * Just ignore errors at this point. There is nothing we can do except * to try to keep going. Make sure it's not a silent error. */ - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) { - xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", + xfs_notice(mp, "%s: xfs_defer_finish returned error %d", __func__, error); - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); } error = xfs_trans_commit(tp); if (error) @@ -2375,7 +2368,7 @@ int xfs_ifree( xfs_trans_t *tp, xfs_inode_t *ip, - xfs_bmap_free_t *flist) + struct xfs_defer_ops *dfops) { int error; struct xfs_icluster xic = { 0 }; @@ -2394,7 +2387,7 @@ xfs_ifree( if (error) return error; - error = xfs_difree(tp, ip->i_ino, flist, &xic); + error = xfs_difree(tp, ip->i_ino, dfops, &xic); if (error) return error; @@ -2482,7 +2475,7 @@ xfs_iunpin_wait( * directory entry. * * This is still safe from a transactional point of view - it is not until we - * get to xfs_bmap_finish() that we have the possibility of multiple + * get to xfs_defer_finish() that we have the possibility of multiple * transactions in this operation. Hence as long as we remove the directory * entry and drop the link count in the first transaction of the remove * operation, there are no transactional constraints on the ordering here. @@ -2497,7 +2490,7 @@ xfs_remove( xfs_trans_t *tp = NULL; int is_dir = S_ISDIR(VFS_I(ip)->i_mode); int error = 0; - xfs_bmap_free_t free_list; + struct xfs_defer_ops dfops; xfs_fsblock_t first_block; uint resblks; @@ -2579,9 +2572,9 @@ xfs_remove( if (error) goto out_trans_cancel; - xfs_bmap_init(&free_list, &first_block); + xfs_defer_init(&dfops, &first_block); error = xfs_dir_removename(tp, dp, name, ip->i_ino, - &first_block, &free_list, resblks); + &first_block, &dfops, resblks); if (error) { ASSERT(error != -ENOENT); goto out_bmap_cancel; @@ -2595,7 +2588,7 @@ xfs_remove( if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto out_bmap_cancel; @@ -2609,7 +2602,7 @@ xfs_remove( return 0; out_bmap_cancel: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); out_trans_cancel: xfs_trans_cancel(tp); std_return: @@ -2670,7 +2663,7 @@ xfs_sort_for_rename( static int xfs_finish_rename( struct xfs_trans *tp, - struct xfs_bmap_free *free_list) + struct xfs_defer_ops *dfops) { int error; @@ -2681,9 +2674,9 @@ xfs_finish_rename( if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_bmap_finish(&tp, free_list, NULL); + error = xfs_defer_finish(&tp, dfops, NULL); if (error) { - xfs_bmap_cancel(free_list); + xfs_defer_cancel(dfops); xfs_trans_cancel(tp); return error; } @@ -2705,7 +2698,7 @@ xfs_cross_rename( struct xfs_inode *dp2, struct xfs_name *name2, struct xfs_inode *ip2, - struct xfs_bmap_free *free_list, + struct xfs_defer_ops *dfops, xfs_fsblock_t *first_block, int spaceres) { @@ -2717,14 +2710,14 @@ xfs_cross_rename( /* Swap inode number for dirent in first parent */ error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, - first_block, free_list, spaceres); + first_block, dfops, spaceres); if (error) goto out_trans_abort; /* Swap inode number for dirent in second parent */ error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, - first_block, free_list, spaceres); + first_block, dfops, spaceres); if (error) goto out_trans_abort; @@ -2739,7 +2732,7 @@ xfs_cross_rename( if (S_ISDIR(VFS_I(ip2)->i_mode)) { error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, dp1->i_ino, first_block, - free_list, spaceres); + dfops, spaceres); if (error) goto out_trans_abort; @@ -2766,7 +2759,7 @@ xfs_cross_rename( if (S_ISDIR(VFS_I(ip1)->i_mode)) { error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, dp2->i_ino, first_block, - free_list, spaceres); + dfops, spaceres); if (error) goto out_trans_abort; @@ -2805,10 +2798,10 @@ xfs_cross_rename( } xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); - return xfs_finish_rename(tp, free_list); + return xfs_finish_rename(tp, dfops); out_trans_abort: - xfs_bmap_cancel(free_list); + xfs_defer_cancel(dfops); xfs_trans_cancel(tp); return error; } @@ -2863,7 +2856,7 @@ xfs_rename( { struct xfs_mount *mp = src_dp->i_mount; struct xfs_trans *tp; - struct xfs_bmap_free free_list; + struct xfs_defer_ops dfops; xfs_fsblock_t first_block; struct xfs_inode *wip = NULL; /* whiteout inode */ struct xfs_inode *inodes[__XFS_SORT_INODES]; @@ -2952,13 +2945,13 @@ xfs_rename( goto out_trans_cancel; } - xfs_bmap_init(&free_list, &first_block); + xfs_defer_init(&dfops, &first_block); /* RENAME_EXCHANGE is unique from here on. */ if (flags & RENAME_EXCHANGE) return xfs_cross_rename(tp, src_dp, src_name, src_ip, target_dp, target_name, target_ip, - &free_list, &first_block, spaceres); + &dfops, &first_block, spaceres); /* * Set up the target. @@ -2980,7 +2973,7 @@ xfs_rename( */ error = xfs_dir_createname(tp, target_dp, target_name, src_ip->i_ino, &first_block, - &free_list, spaceres); + &dfops, spaceres); if (error) goto out_bmap_cancel; @@ -3020,7 +3013,7 @@ xfs_rename( */ error = xfs_dir_replace(tp, target_dp, target_name, src_ip->i_ino, - &first_block, &free_list, spaceres); + &first_block, &dfops, spaceres); if (error) goto out_bmap_cancel; @@ -3055,7 +3048,7 @@ xfs_rename( */ error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, target_dp->i_ino, - &first_block, &free_list, spaceres); + &first_block, &dfops, spaceres); ASSERT(error != -EEXIST); if (error) goto out_bmap_cancel; @@ -3094,10 +3087,10 @@ xfs_rename( */ if (wip) { error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, - &first_block, &free_list, spaceres); + &first_block, &dfops, spaceres); } else error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, - &first_block, &free_list, spaceres); + &first_block, &dfops, spaceres); if (error) goto out_bmap_cancel; @@ -3132,13 +3125,13 @@ xfs_rename( if (new_parent) xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); - error = xfs_finish_rename(tp, &free_list); + error = xfs_finish_rename(tp, &dfops); if (wip) IRELE(wip); return error; out_bmap_cancel: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); out_trans_cancel: xfs_trans_cancel(tp); out_release_wip: diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e52d7c7ae..e1a411e08 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -27,7 +27,7 @@ struct xfs_dinode; struct xfs_inode; struct xfs_buf; -struct xfs_bmap_free; +struct xfs_defer_ops; struct xfs_bmbt_irec; struct xfs_inode_log_item; struct xfs_mount; @@ -395,14 +395,10 @@ void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); uint xfs_ilock_data_map_shared(struct xfs_inode *); uint xfs_ilock_attr_map_shared(struct xfs_inode *); -int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, - xfs_nlink_t, xfs_dev_t, prid_t, int, - struct xfs_buf **, xfs_inode_t **); uint xfs_ip2xflags(struct xfs_inode *); -uint xfs_dic2xflags(struct xfs_dinode *); int xfs_ifree(struct xfs_trans *, xfs_inode_t *, - struct xfs_bmap_free *); + struct xfs_defer_ops *); int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, int, xfs_fsize_t); void xfs_iext_realloc(xfs_inode_t *, int, int); @@ -411,7 +407,6 @@ void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) int xfs_iflush(struct xfs_inode *, struct xfs_buf **); -void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); @@ -419,8 +414,6 @@ xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_inode **, int *); -int xfs_droplink(struct xfs_trans *, struct xfs_inode *); -int xfs_bumplink(struct xfs_trans *, struct xfs_inode *); /* from xfs_file.c */ enum xfs_prealloc_flags { @@ -434,7 +427,8 @@ int xfs_update_prealloc_flags(struct xfs_inode *ip, enum xfs_prealloc_flags flags); int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, xfs_fsize_t isize, bool *did_zeroing); -int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); +int xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count, + bool *did_zero); loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start, loff_t eof, int whence); @@ -479,14 +473,4 @@ do { \ extern struct kmem_zone *xfs_inode_zone; -/* - * Flags for read/write calls - */ -#define XFS_IO_ISDIRECT 0x00001 /* bypass page cache */ -#define XFS_IO_INVIS 0x00002 /* don't update inode timestamps */ - -#define XFS_IO_FLAGS \ - { XFS_IO_ISDIRECT, "DIRECT" }, \ - { XFS_IO_INVIS, "INVIS"} - #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index a1b076122..892c2aced 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -651,6 +651,7 @@ void xfs_inode_item_destroy( xfs_inode_t *ip) { + kmem_free(ip->i_itemp->ili_item.li_lv_shadow); kmem_zone_free(xfs_ili_zone, ip->i_itemp); } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 63a6ff2cf..96a70fd1f 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -232,7 +232,7 @@ xfs_open_by_handle( } if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) { - error = -EACCES; + error = -EPERM; goto out_dput; } @@ -387,6 +387,7 @@ xfs_attrlist_by_handle( { int error = -ENOMEM; attrlist_cursor_kern_t *cursor; + struct xfs_fsop_attrlist_handlereq __user *p = arg; xfs_fsop_attrlist_handlereq_t al_hreq; struct dentry *dentry; char *kbuf; @@ -419,6 +420,11 @@ xfs_attrlist_by_handle( if (error) goto out_kfree; + if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) { + error = -EFAULT; + goto out_kfree; + } + if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen)) error = -EFAULT; @@ -595,13 +601,12 @@ xfs_attrmulti_by_handle( int xfs_ioc_space( - struct xfs_inode *ip, - struct inode *inode, struct file *filp, - int ioflags, unsigned int cmd, xfs_flock64_t *bf) { + struct inode *inode = file_inode(filp); + struct xfs_inode *ip = XFS_I(inode); struct iattr iattr; enum xfs_prealloc_flags flags = 0; uint iolock = XFS_IOLOCK_EXCL; @@ -626,7 +631,7 @@ xfs_ioc_space( if (filp->f_flags & O_DSYNC) flags |= XFS_PREALLOC_SYNC; - if (ioflags & XFS_IO_INVIS) + if (filp->f_mode & FMODE_NOCMTIME) flags |= XFS_PREALLOC_INVISIBLE; error = mnt_want_write_file(filp); @@ -1464,8 +1469,7 @@ xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) STATIC int xfs_ioc_getbmap( - struct xfs_inode *ip, - int ioflags, + struct file *file, unsigned int cmd, void __user *arg) { @@ -1479,10 +1483,10 @@ xfs_ioc_getbmap( return -EINVAL; bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); - if (ioflags & XFS_IO_INVIS) + if (file->f_mode & FMODE_NOCMTIME) bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; - error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, + error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, xfs_getbmap_format, (__force struct getbmap *)arg+1); if (error) return error; @@ -1575,6 +1579,11 @@ xfs_ioc_swapext( goto out_put_tmp_file; } + /* + * We need to ensure that the fds passed in point to XFS inodes + * before we cast and access them as XFS structures as we have no + * control over what the user passes us here. + */ if (f.file->f_op != &xfs_file_operations || tmp.file->f_op != &xfs_file_operations) { error = -EINVAL; @@ -1625,12 +1634,8 @@ xfs_file_ioctl( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; void __user *arg = (void __user *)p; - int ioflags = 0; int error; - if (filp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - trace_xfs_file_ioctl(ip); switch (cmd) { @@ -1649,7 +1654,7 @@ xfs_file_ioctl( if (copy_from_user(&bf, arg, sizeof(bf))) return -EFAULT; - return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + return xfs_ioc_space(filp, cmd, &bf); } case XFS_IOC_DIOINFO: { struct dioattr da; @@ -1708,7 +1713,7 @@ xfs_file_ioctl( case XFS_IOC_GETBMAP: case XFS_IOC_GETBMAPA: - return xfs_ioc_getbmap(ip, ioflags, cmd, arg); + return xfs_ioc_getbmap(filp, cmd, arg); case XFS_IOC_GETBMAPX: return xfs_ioc_getbmapx(ip, arg); diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 77c02c790..8b52881bf 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -20,10 +20,7 @@ extern int xfs_ioc_space( - struct xfs_inode *ip, - struct inode *inode, struct file *filp, - int ioflags, unsigned int cmd, xfs_flock64_t *bf); diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 1a05d8ae3..321f57721 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -532,12 +532,8 @@ xfs_file_compat_ioctl( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; void __user *arg = (void __user *)p; - int ioflags = 0; int error; - if (filp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - trace_xfs_file_compat_ioctl(ip); switch (cmd) { @@ -589,7 +585,7 @@ xfs_file_compat_ioctl( if (xfs_compat_flock64_copyin(&bf, arg)) return -EFAULT; cmd = _NATIVE_IOC(cmd, struct xfs_flock64); - return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + return xfs_ioc_space(filp, cmd, &bf); } case XFS_IOC_FSGEOMETRY_V1_32: return xfs_compat_ioc_fsgeometry_v1(mp, arg); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 58391355a..2af0dda1c 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -15,6 +15,7 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +#include #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" @@ -22,6 +23,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_bmap_btree.h" @@ -127,7 +129,7 @@ xfs_iomap_write_direct( int quota_flag; int rt; xfs_trans_t *tp; - xfs_bmap_free_t free_list; + struct xfs_defer_ops dfops; uint qblocks, resblks, resrtextents; int error; int lockmode; @@ -230,18 +232,18 @@ xfs_iomap_write_direct( * From this point onwards we overwrite the imap pointer that the * caller gave to us. */ - xfs_bmap_init(&free_list, &firstfsb); + xfs_defer_init(&dfops, &firstfsb); nimaps = 1; error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, &firstfsb, resblks, imap, - &nimaps, &free_list); + &nimaps, &dfops); if (error) goto out_bmap_cancel; /* * Complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto out_bmap_cancel; @@ -265,7 +267,7 @@ out_unlock: return error; out_bmap_cancel: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); out_trans_cancel: xfs_trans_cancel(tp); @@ -684,7 +686,7 @@ xfs_iomap_write_allocate( xfs_fileoff_t offset_fsb, last_block; xfs_fileoff_t end_fsb, map_start_fsb; xfs_fsblock_t first_block; - xfs_bmap_free_t free_list; + struct xfs_defer_ops dfops; xfs_filblks_t count_fsb; xfs_trans_t *tp; int nimaps; @@ -713,12 +715,16 @@ xfs_iomap_write_allocate( * is in the delayed allocation extent on which we sit * but before our buffer starts. */ - nimaps = 0; while (nimaps == 0) { nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, nres, + /* + * We have already reserved space for the extent and any + * indirect blocks when creating the delalloc extent, + * there is no need to reserve space in this transaction + * again. + */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, XFS_TRANS_RESERVE, &tp); if (error) return error; @@ -726,7 +732,7 @@ xfs_iomap_write_allocate( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - xfs_bmap_init(&free_list, &first_block); + xfs_defer_init(&dfops, &first_block); /* * it is possible that the extents have changed since @@ -782,11 +788,11 @@ xfs_iomap_write_allocate( error = xfs_bmapi_write(tp, ip, map_start_fsb, count_fsb, 0, &first_block, nres, imap, &nimaps, - &free_list); + &dfops); if (error) goto trans_cancel; - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto trans_cancel; @@ -820,7 +826,7 @@ xfs_iomap_write_allocate( } trans_cancel: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); xfs_trans_cancel(tp); error0: xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -841,7 +847,7 @@ xfs_iomap_write_unwritten( int nimaps; xfs_trans_t *tp; xfs_bmbt_irec_t imap; - xfs_bmap_free_t free_list; + struct xfs_defer_ops dfops; xfs_fsize_t i_size; uint resblks; int error; @@ -885,11 +891,11 @@ xfs_iomap_write_unwritten( /* * Modify the unwritten extent state of the buffer. */ - xfs_bmap_init(&free_list, &firstfsb); + xfs_defer_init(&dfops, &firstfsb); nimaps = 1; error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, XFS_BMAPI_CONVERT, &firstfsb, resblks, - &imap, &nimaps, &free_list); + &imap, &nimaps, &dfops); if (error) goto error_on_bmapi_transaction; @@ -908,7 +914,7 @@ xfs_iomap_write_unwritten( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto error_on_bmapi_transaction; @@ -935,8 +941,217 @@ xfs_iomap_write_unwritten( return 0; error_on_bmapi_transaction: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } + +void +xfs_bmbt_to_iomap( + struct xfs_inode *ip, + struct iomap *iomap, + struct xfs_bmbt_irec *imap) +{ + struct xfs_mount *mp = ip->i_mount; + + if (imap->br_startblock == HOLESTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_HOLE; + } else if (imap->br_startblock == DELAYSTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_DELALLOC; + } else { + iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock); + if (imap->br_state == XFS_EXT_UNWRITTEN) + iomap->type = IOMAP_UNWRITTEN; + else + iomap->type = IOMAP_MAPPED; + } + iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); + iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); + iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); +} + +static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps) +{ + return !nimaps || + imap->br_startblock == HOLESTARTBLOCK || + imap->br_startblock == DELAYSTARTBLOCK; +} + +static int +xfs_file_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + xfs_fileoff_t offset_fsb, end_fsb; + int nimaps = 1, error = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + ASSERT(offset <= mp->m_super->s_maxbytes); + if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) + length = mp->m_super->s_maxbytes - offset; + offset_fsb = XFS_B_TO_FSBT(mp, offset); + end_fsb = XFS_B_TO_FSB(mp, offset + length); + + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, + &nimaps, XFS_BMAPI_ENTIRE); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + } + + if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) { + /* + * We cap the maximum length we map here to MAX_WRITEBACK_PAGES + * pages to keep the chunks of work done where somewhat symmetric + * with the work writeback does. This is a completely arbitrary + * number pulled out of thin air as a best guess for initial + * testing. + * + * Note that the values needs to be less than 32-bits wide until + * the lower level functions are updated. + */ + length = min_t(loff_t, length, 1024 * PAGE_SIZE); + if (xfs_get_extsz_hint(ip)) { + /* + * xfs_iomap_write_direct() expects the shared lock. It + * is unlocked on return. + */ + xfs_ilock_demote(ip, XFS_ILOCK_EXCL); + error = xfs_iomap_write_direct(ip, offset, length, &imap, + nimaps); + } else { + error = xfs_iomap_write_delay(ip, offset, length, &imap); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + if (error) + return error; + + trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); + } else { + ASSERT(nimaps); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + trace_xfs_iomap_found(ip, offset, length, 0, &imap); + } + + xfs_bmbt_to_iomap(ip, iomap, &imap); + return 0; +} + +static int +xfs_file_iomap_end_delalloc( + struct xfs_inode *ip, + loff_t offset, + loff_t length, + ssize_t written) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t start_fsb; + xfs_fileoff_t end_fsb; + int error = 0; + + start_fsb = XFS_B_TO_FSB(mp, offset + written); + end_fsb = XFS_B_TO_FSB(mp, offset + length); + + /* + * Trim back delalloc blocks if we didn't manage to write the whole + * range reserved. + * + * We don't need to care about racing delalloc as we hold i_mutex + * across the reserve/allocate/unreserve calls. If there are delalloc + * blocks in the range, they are ours. + */ + if (start_fsb < end_fsb) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + if (error && !XFS_FORCED_SHUTDOWN(mp)) { + xfs_alert(mp, "%s: unable to clean up ino %lld", + __func__, ip->i_ino); + return error; + } + } + + return 0; +} + +static int +xfs_file_iomap_end( + struct inode *inode, + loff_t offset, + loff_t length, + ssize_t written, + unsigned flags, + struct iomap *iomap) +{ + if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) + return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, + length, written); + return 0; +} + +struct iomap_ops xfs_iomap_ops = { + .iomap_begin = xfs_file_iomap_begin, + .iomap_end = xfs_file_iomap_end, +}; + +static int +xfs_xattr_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); + struct xfs_bmbt_irec imap; + int nimaps = 1, error = 0; + unsigned lockmode; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + lockmode = xfs_ilock_data_map_shared(ip); + + /* if there are no attribute fork or extents, return ENOENT */ + if (XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) { + error = -ENOENT; + goto out_unlock; + } + + ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, + &nimaps, XFS_BMAPI_ENTIRE | XFS_BMAPI_ATTRFORK); +out_unlock: + xfs_iunlock(ip, lockmode); + + if (!error) { + ASSERT(nimaps); + xfs_bmbt_to_iomap(ip, iomap, &imap); + } + + return error; +} + +struct iomap_ops xfs_xattr_iomap_ops = { + .iomap_begin = xfs_xattr_iomap_begin, +}; diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 8688e663d..fb8aca3d6 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -18,6 +18,8 @@ #ifndef __XFS_IOMAP_H__ #define __XFS_IOMAP_H__ +#include + struct xfs_inode; struct xfs_bmbt_irec; @@ -29,4 +31,10 @@ int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, struct xfs_bmbt_irec *); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); +void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, + struct xfs_bmbt_irec *); + +extern struct iomap_ops xfs_iomap_ops; +extern struct iomap_ops xfs_xattr_iomap_ops; + #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index c5d4eba69..b24c3102f 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -38,12 +38,13 @@ #include "xfs_dir2.h" #include "xfs_trans_space.h" #include "xfs_pnfs.h" +#include "xfs_iomap.h" #include #include #include #include -#include +#include #include /* @@ -800,21 +801,31 @@ xfs_setattr_size( if (error) return error; + /* + * Wait for all direct I/O to complete. + */ + inode_dio_wait(inode); + /* * File data changes must be complete before we start the transaction to * modify the inode. This needs to be done before joining the inode to * the transaction because the inode cannot be unlocked once it is a * part of the transaction. * - * Start with zeroing any data block beyond EOF that we may expose on - * file extension. + * Start with zeroing any data beyond EOF that we may expose on file + * extension, or zeroing out the rest of the block on a downward + * truncate. */ if (newsize > oldsize) { error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); - if (error) - return error; + } else { + error = iomap_truncate_page(inode, newsize, &did_zeroing, + &xfs_iomap_ops); } + if (error) + return error; + /* * We are going to log the inode size change in this transaction so * any previous writes that are beyond the on disk EOF and the new @@ -823,17 +834,14 @@ xfs_setattr_size( * problem. Note that this includes any block zeroing we did above; * otherwise those blocks may not be zeroed after a crash. */ - if (newsize > ip->i_d.di_size && - (oldsize != ip->i_d.di_size || did_zeroing)) { + if (did_zeroing || + (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) { error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ip->i_d.di_size, newsize); if (error) return error; } - /* Now wait for all direct I/O to complete. */ - inode_dio_wait(inode); - /* * We've already locked out new page faults, so now we can safely remove * pages from the page cache knowing they won't get refaulted until we @@ -851,13 +859,6 @@ xfs_setattr_size( * to hope that the caller sees ENOMEM and retries the truncate * operation. */ - if (IS_DAX(inode)) - error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct); - else - error = block_truncate_page(inode->i_mapping, newsize, - xfs_get_blocks); - if (error) - return error; truncate_setsize(inode, newsize); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); @@ -998,51 +999,6 @@ xfs_vn_update_time( return xfs_trans_commit(tp); } -#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) - -/* - * Call fiemap helper to fill in user data. - * Returns positive errors to xfs_getbmap. - */ -STATIC int -xfs_fiemap_format( - void **arg, - struct getbmapx *bmv, - int *full) -{ - int error; - struct fiemap_extent_info *fieinfo = *arg; - u32 fiemap_flags = 0; - u64 logical, physical, length; - - /* Do nothing for a hole */ - if (bmv->bmv_block == -1LL) - return 0; - - logical = BBTOB(bmv->bmv_offset); - physical = BBTOB(bmv->bmv_block); - length = BBTOB(bmv->bmv_length); - - if (bmv->bmv_oflags & BMV_OF_PREALLOC) - fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN; - else if (bmv->bmv_oflags & BMV_OF_DELALLOC) { - fiemap_flags |= (FIEMAP_EXTENT_DELALLOC | - FIEMAP_EXTENT_UNKNOWN); - physical = 0; /* no block yet */ - } - if (bmv->bmv_oflags & BMV_OF_LAST) - fiemap_flags |= FIEMAP_EXTENT_LAST; - - error = fiemap_fill_next_extent(fieinfo, logical, physical, - length, fiemap_flags); - if (error > 0) { - error = 0; - *full = 1; /* user array now full */ - } - - return error; -} - STATIC int xfs_vn_fiemap( struct inode *inode, @@ -1050,38 +1006,20 @@ xfs_vn_fiemap( u64 start, u64 length) { - xfs_inode_t *ip = XFS_I(inode); - struct getbmapx bm; int error; - error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS); - if (error) - return error; - - /* Set up bmap header for xfs internal routine */ - bm.bmv_offset = BTOBBT(start); - /* Special case for whole file */ - if (length == FIEMAP_MAX_OFFSET) - bm.bmv_length = -1LL; - else - bm.bmv_length = BTOBB(start + length) - bm.bmv_offset; - - /* We add one because in getbmap world count includes the header */ - bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM : - fieinfo->fi_extents_max + 1; - bm.bmv_count = min_t(__s32, bm.bmv_count, - (PAGE_SIZE * 16 / sizeof(struct getbmapx))); - bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES; - if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) - bm.bmv_iflags |= BMV_IF_ATTRFORK; - if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) - bm.bmv_iflags |= BMV_IF_DELALLOC; - - error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo); - if (error) - return error; + xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED); + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { + fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; + error = iomap_fiemap(inode, fieinfo, start, length, + &xfs_xattr_iomap_ops); + } else { + error = iomap_fiemap(inode, fieinfo, start, length, + &xfs_iomap_ops); + } + xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED); - return 0; + return error; } STATIC int diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index a8192dc79..b8d64d520 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -328,13 +328,6 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) return x; } -/* ARM old ABI has some weird alignment/padding */ -#if defined(__arm__) && !defined(__ARM_EABI__) -#define __arch_pack __attribute__((packed)) -#else -#define __arch_pack -#endif - #define ASSERT_ALWAYS(expr) \ (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index bde02f1fb..3b74fa011 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -788,7 +788,7 @@ xfs_log_mount_cancel( * As far as I know, there weren't any dependencies on the old behaviour. */ -int +static int xfs_log_unmount_write(xfs_mount_t *mp) { struct xlog *log = mp->m_log; @@ -1036,7 +1036,7 @@ xfs_log_space_wake( * there's no point in running a dummy transaction at this point because we * can't start trying to idle the log until both the CIL and AIL are empty. */ -int +static int xfs_log_need_covered(xfs_mount_t *mp) { struct xlog *log = mp->m_log; @@ -1177,7 +1177,7 @@ xlog_space_left( * The log manager needs its own routine, in order to control what * happens with the buffer after the write completes. */ -void +static void xlog_iodone(xfs_buf_t *bp) { struct xlog_in_core *iclog = bp->b_fspriv; @@ -1302,7 +1302,7 @@ xfs_log_work_queue( * disk. If there is nothing dirty, then we might need to cover the log to * indicate that the filesystem is idle. */ -void +static void xfs_log_worker( struct work_struct *work) { @@ -1415,7 +1415,7 @@ xlog_alloc_log( */ error = -ENOMEM; bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL, - BTOBB(log->l_iclog_size), 0); + BTOBB(log->l_iclog_size), XBF_NO_IOACCT); if (!bp) goto out_free_log; @@ -1454,7 +1454,8 @@ xlog_alloc_log( prev_iclog = iclog; bp = xfs_buf_get_uncached(mp->m_logdev_targp, - BTOBB(log->l_iclog_size), 0); + BTOBB(log->l_iclog_size), + XBF_NO_IOACCT); if (!bp) goto out_free_iclog; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 80ba0c047..b5e71072f 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -163,12 +163,8 @@ int xfs_log_reserve(struct xfs_mount *mp, __uint8_t clientid, bool permanent); int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); -int xfs_log_unmount_write(struct xfs_mount *mp); void xfs_log_unmount(struct xfs_mount *mp); int xfs_log_force_umount(struct xfs_mount *mp, int logerror); -int xfs_log_need_covered(struct xfs_mount *mp); - -void xlog_iodone(struct xfs_buf *); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); @@ -178,7 +174,6 @@ void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); -void xfs_log_worker(struct work_struct *work); void xfs_log_quiesce(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 5e54e7955..a4ab192e1 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -78,6 +78,157 @@ xlog_cil_init_post_recovery( log->l_cilp->xc_ctx->sequence = 1; } +static inline int +xlog_cil_iovec_space( + uint niovecs) +{ + return round_up((sizeof(struct xfs_log_vec) + + niovecs * sizeof(struct xfs_log_iovec)), + sizeof(uint64_t)); +} + +/* + * Allocate or pin log vector buffers for CIL insertion. + * + * The CIL currently uses disposable buffers for copying a snapshot of the + * modified items into the log during a push. The biggest problem with this is + * the requirement to allocate the disposable buffer during the commit if: + * a) does not exist; or + * b) it is too small + * + * If we do this allocation within xlog_cil_insert_format_items(), it is done + * under the xc_ctx_lock, which means that a CIL push cannot occur during + * the memory allocation. This means that we have a potential deadlock situation + * under low memory conditions when we have lots of dirty metadata pinned in + * the CIL and we need a CIL commit to occur to free memory. + * + * To avoid this, we need to move the memory allocation outside the + * xc_ctx_lock, but because the log vector buffers are disposable, that opens + * up a TOCTOU race condition w.r.t. the CIL committing and removing the log + * vector buffers between the check and the formatting of the item into the + * log vector buffer within the xc_ctx_lock. + * + * Because the log vector buffer needs to be unchanged during the CIL push + * process, we cannot share the buffer between the transaction commit (which + * modifies the buffer) and the CIL push context that is writing the changes + * into the log. This means skipping preallocation of buffer space is + * unreliable, but we most definitely do not want to be allocating and freeing + * buffers unnecessarily during commits when overwrites can be done safely. + * + * The simplest solution to this problem is to allocate a shadow buffer when a + * log item is committed for the second time, and then to only use this buffer + * if necessary. The buffer can remain attached to the log item until such time + * it is needed, and this is the buffer that is reallocated to match the size of + * the incoming modification. Then during the formatting of the item we can swap + * the active buffer with the new one if we can't reuse the existing buffer. We + * don't free the old buffer as it may be reused on the next modification if + * it's size is right, otherwise we'll free and reallocate it at that point. + * + * This function builds a vector for the changes in each log item in the + * transaction. It then works out the length of the buffer needed for each log + * item, allocates them and attaches the vector to the log item in preparation + * for the formatting step which occurs under the xc_ctx_lock. + * + * While this means the memory footprint goes up, it avoids the repeated + * alloc/free pattern that repeated modifications of an item would otherwise + * cause, and hence minimises the CPU overhead of such behaviour. + */ +static void +xlog_cil_alloc_shadow_bufs( + struct xlog *log, + struct xfs_trans *tp) +{ + struct xfs_log_item_desc *lidp; + + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + struct xfs_log_vec *lv; + int niovecs = 0; + int nbytes = 0; + int buf_size; + bool ordered = false; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + /* get number of vecs and size of data to be stored */ + lip->li_ops->iop_size(lip, &niovecs, &nbytes); + + /* + * Ordered items need to be tracked but we do not wish to write + * them. We need a logvec to track the object, but we do not + * need an iovec or buffer to be allocated for copying data. + */ + if (niovecs == XFS_LOG_VEC_ORDERED) { + ordered = true; + niovecs = 0; + nbytes = 0; + } + + /* + * We 64-bit align the length of each iovec so that the start + * of the next one is naturally aligned. We'll need to + * account for that slack space here. Then round nbytes up + * to 64-bit alignment so that the initial buffer alignment is + * easy to calculate and verify. + */ + nbytes += niovecs * sizeof(uint64_t); + nbytes = round_up(nbytes, sizeof(uint64_t)); + + /* + * The data buffer needs to start 64-bit aligned, so round up + * that space to ensure we can align it appropriately and not + * overrun the buffer. + */ + buf_size = nbytes + xlog_cil_iovec_space(niovecs); + + /* + * if we have no shadow buffer, or it is too small, we need to + * reallocate it. + */ + if (!lip->li_lv_shadow || + buf_size > lip->li_lv_shadow->lv_size) { + + /* + * We free and allocate here as a realloc would copy + * unecessary data. We don't use kmem_zalloc() for the + * same reason - we don't need to zero the data area in + * the buffer, only the log vector header and the iovec + * storage. + */ + kmem_free(lip->li_lv_shadow); + + lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS); + memset(lv, 0, xlog_cil_iovec_space(niovecs)); + + lv->lv_item = lip; + lv->lv_size = buf_size; + if (ordered) + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + else + lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; + lip->li_lv_shadow = lv; + } else { + /* same or smaller, optimise common overwrite case */ + lv = lip->li_lv_shadow; + if (ordered) + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + else + lv->lv_buf_len = 0; + lv->lv_bytes = 0; + lv->lv_next = NULL; + } + + /* Ensure the lv is set up according to ->iop_size */ + lv->lv_niovecs = niovecs; + + /* The allocated data region lies beyond the iovec region */ + lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs); + } + +} + /* * Prepare the log item for insertion into the CIL. Calculate the difference in * log space and vectors it will consume, and if it is a new item pin it as @@ -100,16 +251,19 @@ xfs_cil_prepare_item( /* * If there is no old LV, this is the first time we've seen the item in * this CIL context and so we need to pin it. If we are replacing the - * old_lv, then remove the space it accounts for and free it. + * old_lv, then remove the space it accounts for and make it the shadow + * buffer for later freeing. In both cases we are now switching to the + * shadow buffer, so update the the pointer to it appropriately. */ - if (!old_lv) + if (!old_lv) { lv->lv_item->li_ops->iop_pin(lv->lv_item); - else if (old_lv != lv) { + lv->lv_item->li_lv_shadow = NULL; + } else if (old_lv != lv) { ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); *diff_len -= old_lv->lv_bytes; *diff_iovecs -= old_lv->lv_niovecs; - kmem_free(old_lv); + lv->lv_item->li_lv_shadow = old_lv; } /* attach new log vector to log item */ @@ -133,11 +287,13 @@ xfs_cil_prepare_item( * write it out asynchronously without needing to relock the object that was * modified at the time it gets written into the iclog. * - * This function builds a vector for the changes in each log item in the - * transaction. It then works out the length of the buffer needed for each log - * item, allocates them and formats the vector for the item into the buffer. - * The buffer is then attached to the log item are then inserted into the - * Committed Item List for tracking until the next checkpoint is written out. + * This function takes the prepared log vectors attached to each log item, and + * formats the changes into the log vector buffer. The buffer it uses is + * dependent on the current state of the vector in the CIL - the shadow lv is + * guaranteed to be large enough for the current modification, but we will only + * use that if we can't reuse the existing lv. If we can't reuse the existing + * lv, then simple swap it out for the shadow lv. We don't free it - that is + * done lazily either by th enext modification or the freeing of the log item. * * We don't set up region headers during this process; we simply copy the * regions into the flat buffer. We can do this because we still have to do a @@ -170,59 +326,29 @@ xlog_cil_insert_format_items( list_for_each_entry(lidp, &tp->t_items, lid_trans) { struct xfs_log_item *lip = lidp->lid_item; struct xfs_log_vec *lv; - struct xfs_log_vec *old_lv; - int niovecs = 0; - int nbytes = 0; - int buf_size; + struct xfs_log_vec *old_lv = NULL; + struct xfs_log_vec *shadow; bool ordered = false; /* Skip items which aren't dirty in this transaction. */ if (!(lidp->lid_flags & XFS_LID_DIRTY)) continue; - /* get number of vecs and size of data to be stored */ - lip->li_ops->iop_size(lip, &niovecs, &nbytes); - - /* Skip items that do not have any vectors for writing */ - if (!niovecs) - continue; - /* - * Ordered items need to be tracked but we do not wish to write - * them. We need a logvec to track the object, but we do not - * need an iovec or buffer to be allocated for copying data. + * The formatting size information is already attached to + * the shadow lv on the log item. */ - if (niovecs == XFS_LOG_VEC_ORDERED) { + shadow = lip->li_lv_shadow; + if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED) ordered = true; - niovecs = 0; - nbytes = 0; - } - /* - * We 64-bit align the length of each iovec so that the start - * of the next one is naturally aligned. We'll need to - * account for that slack space here. Then round nbytes up - * to 64-bit alignment so that the initial buffer alignment is - * easy to calculate and verify. - */ - nbytes += niovecs * sizeof(uint64_t); - nbytes = round_up(nbytes, sizeof(uint64_t)); - - /* grab the old item if it exists for reservation accounting */ - old_lv = lip->li_lv; - - /* - * The data buffer needs to start 64-bit aligned, so round up - * that space to ensure we can align it appropriately and not - * overrun the buffer. - */ - buf_size = nbytes + - round_up((sizeof(struct xfs_log_vec) + - niovecs * sizeof(struct xfs_log_iovec)), - sizeof(uint64_t)); + /* Skip items that do not have any vectors for writing */ + if (!shadow->lv_niovecs && !ordered) + continue; /* compare to existing item size */ - if (lip->li_lv && buf_size <= lip->li_lv->lv_size) { + old_lv = lip->li_lv; + if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) { /* same or smaller, optimise common overwrite case */ lv = lip->li_lv; lv->lv_next = NULL; @@ -236,32 +362,29 @@ xlog_cil_insert_format_items( */ *diff_iovecs -= lv->lv_niovecs; *diff_len -= lv->lv_bytes; + + /* Ensure the lv is set up according to ->iop_size */ + lv->lv_niovecs = shadow->lv_niovecs; + + /* reset the lv buffer information for new formatting */ + lv->lv_buf_len = 0; + lv->lv_bytes = 0; + lv->lv_buf = (char *)lv + + xlog_cil_iovec_space(lv->lv_niovecs); } else { - /* allocate new data chunk */ - lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); + /* switch to shadow buffer! */ + lv = shadow; lv->lv_item = lip; - lv->lv_size = buf_size; if (ordered) { /* track as an ordered logvec */ ASSERT(lip->li_lv == NULL); - lv->lv_buf_len = XFS_LOG_VEC_ORDERED; goto insert; } - lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; } - /* Ensure the lv is set up according to ->iop_size */ - lv->lv_niovecs = niovecs; - - /* The allocated data region lies beyond the iovec region */ - lv->lv_buf_len = 0; - lv->lv_bytes = 0; - lv->lv_buf = (char *)lv + buf_size - nbytes; ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); - lip->li_ops->iop_format(lip, lv); insert: - ASSERT(lv->lv_buf_len <= nbytes); xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); } } @@ -783,6 +906,13 @@ xfs_log_commit_cil( struct xlog *log = mp->m_log; struct xfs_cil *cil = log->l_cilp; + /* + * Do all necessary memory allocation before we lock the CIL. + * This ensures the allocation does not deadlock with a CIL + * push in memory reclaim (e.g. from kswapd). + */ + xlog_cil_alloc_shadow_bufs(log, tp); + /* lock out background commit */ down_read(&cil->xc_ctx_lock); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 835997843..e8638fd2c 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -43,6 +43,7 @@ #include "xfs_bmap_btree.h" #include "xfs_error.h" #include "xfs_dir2.h" +#include "xfs_rmap_item.h" #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) @@ -1911,6 +1912,8 @@ xlog_recover_reorder_trans( case XFS_LI_QUOTAOFF: case XFS_LI_EFD: case XFS_LI_EFI: + case XFS_LI_RUI: + case XFS_LI_RUD: trace_xfs_log_recover_item_reorder_tail(log, trans, item, pass); list_move_tail(&item->ri_list, &inode_list); @@ -2228,6 +2231,7 @@ xlog_recover_get_buf_lsn( case XFS_ABTC_CRC_MAGIC: case XFS_ABTB_MAGIC: case XFS_ABTC_MAGIC: + case XFS_RMAP_CRC_MAGIC: case XFS_IBT_CRC_MAGIC: case XFS_IBT_MAGIC: { struct xfs_btree_block *btb = blk; @@ -2396,6 +2400,9 @@ xlog_recover_validate_buf_type( case XFS_BMAP_MAGIC: bp->b_ops = &xfs_bmbt_buf_ops; break; + case XFS_RMAP_CRC_MAGIC: + bp->b_ops = &xfs_rmapbt_buf_ops; + break; default: xfs_warn(mp, "Bad btree block magic!"); ASSERT(0); @@ -3414,6 +3421,99 @@ xlog_recover_efd_pass2( return 0; } +/* + * This routine is called to create an in-core extent rmap update + * item from the rui format structure which was logged on disk. + * It allocates an in-core rui, copies the extents from the format + * structure into it, and adds the rui to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_rui_pass2( + struct xlog *log, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_rui_log_item *ruip; + struct xfs_rui_log_format *rui_formatp; + + rui_formatp = item->ri_buf[0].i_addr; + + ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); + error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format); + if (error) { + xfs_rui_item_free(ruip); + return error; + } + atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); + + spin_lock(&log->l_ailp->xa_lock); + /* + * The RUI has two references. One for the RUD and one for RUI to ensure + * it makes it into the AIL. Insert the RUI into the AIL directly and + * drop the RUI reference. Note that xfs_trans_ail_update() drops the + * AIL lock. + */ + xfs_trans_ail_update(log->l_ailp, &ruip->rui_item, lsn); + xfs_rui_release(ruip); + return 0; +} + + +/* + * This routine is called when an RUD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding RUI if it + * was still in the log. To do this it searches the AIL for the RUI with an id + * equal to that in the RUD format structure. If we find it we drop the RUD + * reference, which removes the RUI from the AIL and frees it. + */ +STATIC int +xlog_recover_rud_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_rud_log_format *rud_formatp; + struct xfs_rui_log_item *ruip = NULL; + struct xfs_log_item *lip; + __uint64_t rui_id; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp = log->l_ailp; + + rud_formatp = item->ri_buf[0].i_addr; + ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format)); + rui_id = rud_formatp->rud_rui_id; + + /* + * Search for the RUI with the id in the RUD format structure in the + * AIL. + */ + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + while (lip != NULL) { + if (lip->li_type == XFS_LI_RUI) { + ruip = (struct xfs_rui_log_item *)lip; + if (ruip->rui_format.rui_id == rui_id) { + /* + * Drop the RUD reference to the RUI. This + * removes the RUI from the AIL and frees it. + */ + spin_unlock(&ailp->xa_lock); + xfs_rui_release(ruip); + spin_lock(&ailp->xa_lock); + break; + } + } + lip = xfs_trans_ail_cursor_next(ailp, &cur); + } + + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + + return 0; +} + /* * This routine is called when an inode create format structure is found in a * committed transaction in the log. It's purpose is to initialise the inodes @@ -3639,6 +3739,8 @@ xlog_recover_ra_pass2( case XFS_LI_EFI: case XFS_LI_EFD: case XFS_LI_QUOTAOFF: + case XFS_LI_RUI: + case XFS_LI_RUD: default: break; } @@ -3662,6 +3764,8 @@ xlog_recover_commit_pass1( case XFS_LI_EFD: case XFS_LI_DQUOT: case XFS_LI_ICREATE: + case XFS_LI_RUI: + case XFS_LI_RUD: /* nothing to do in pass 1 */ return 0; default: @@ -3692,6 +3796,10 @@ xlog_recover_commit_pass2( return xlog_recover_efi_pass2(log, item, trans->r_lsn); case XFS_LI_EFD: return xlog_recover_efd_pass2(log, item); + case XFS_LI_RUI: + return xlog_recover_rui_pass2(log, item, trans->r_lsn); + case XFS_LI_RUD: + return xlog_recover_rud_pass2(log, item); case XFS_LI_DQUOT: return xlog_recover_dquot_pass2(log, buffer_list, item, trans->r_lsn); @@ -4164,126 +4272,156 @@ xlog_recover_process_data( return 0; } -/* - * Process an extent free intent item that was recovered from - * the log. We need to free the extents that it describes. - */ +/* Recover the EFI if necessary. */ STATIC int xlog_recover_process_efi( - xfs_mount_t *mp, - xfs_efi_log_item_t *efip) + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) { - xfs_efd_log_item_t *efdp; - xfs_trans_t *tp; - int i; - int error = 0; - xfs_extent_t *extp; - xfs_fsblock_t startblock_fsb; - - ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); + struct xfs_efi_log_item *efip; + int error; /* - * First check the validity of the extents described by the - * EFI. If any are bad, then assume that all are bad and - * just toss the EFI. + * Skip EFIs that we've already processed. */ - for (i = 0; i < efip->efi_format.efi_nextents; i++) { - extp = &(efip->efi_format.efi_extents[i]); - startblock_fsb = XFS_BB_TO_FSB(mp, - XFS_FSB_TO_DADDR(mp, extp->ext_start)); - if ((startblock_fsb == 0) || - (extp->ext_len == 0) || - (startblock_fsb >= mp->m_sb.sb_dblocks) || - (extp->ext_len >= mp->m_sb.sb_agblocks)) { - /* - * This will pull the EFI from the AIL and - * free the memory associated with it. - */ - set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); - xfs_efi_release(efip); - return -EIO; - } - } + efip = container_of(lip, struct xfs_efi_log_item, efi_item); + if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) + return 0; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); - if (error) - return error; - efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); + spin_unlock(&ailp->xa_lock); + error = xfs_efi_recover(mp, efip); + spin_lock(&ailp->xa_lock); - for (i = 0; i < efip->efi_format.efi_nextents; i++) { - extp = &(efip->efi_format.efi_extents[i]); - error = xfs_trans_free_extent(tp, efdp, extp->ext_start, - extp->ext_len); - if (error) - goto abort_error; + return error; +} - } +/* Release the EFI since we're cancelling everything. */ +STATIC void +xlog_recover_cancel_efi( + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_efi_log_item *efip; - set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); - error = xfs_trans_commit(tp); - return error; + efip = container_of(lip, struct xfs_efi_log_item, efi_item); + + spin_unlock(&ailp->xa_lock); + xfs_efi_release(efip); + spin_lock(&ailp->xa_lock); +} + +/* Recover the RUI if necessary. */ +STATIC int +xlog_recover_process_rui( + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_rui_log_item *ruip; + int error; + + /* + * Skip RUIs that we've already processed. + */ + ruip = container_of(lip, struct xfs_rui_log_item, rui_item); + if (test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags)) + return 0; + + spin_unlock(&ailp->xa_lock); + error = xfs_rui_recover(mp, ruip); + spin_lock(&ailp->xa_lock); -abort_error: - xfs_trans_cancel(tp); return error; } +/* Release the RUI since we're cancelling everything. */ +STATIC void +xlog_recover_cancel_rui( + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_rui_log_item *ruip; + + ruip = container_of(lip, struct xfs_rui_log_item, rui_item); + + spin_unlock(&ailp->xa_lock); + xfs_rui_release(ruip); + spin_lock(&ailp->xa_lock); +} + +/* Is this log item a deferred action intent? */ +static inline bool xlog_item_is_intent(struct xfs_log_item *lip) +{ + switch (lip->li_type) { + case XFS_LI_EFI: + case XFS_LI_RUI: + return true; + default: + return false; + } +} + /* - * When this is called, all of the EFIs which did not have - * corresponding EFDs should be in the AIL. What we do now - * is free the extents associated with each one. + * When this is called, all of the log intent items which did not have + * corresponding log done items should be in the AIL. What we do now + * is update the data structures associated with each one. * - * Since we process the EFIs in normal transactions, they - * will be removed at some point after the commit. This prevents - * us from just walking down the list processing each one. - * We'll use a flag in the EFI to skip those that we've already - * processed and use the AIL iteration mechanism's generation - * count to try to speed this up at least a bit. + * Since we process the log intent items in normal transactions, they + * will be removed at some point after the commit. This prevents us + * from just walking down the list processing each one. We'll use a + * flag in the intent item to skip those that we've already processed + * and use the AIL iteration mechanism's generation count to try to + * speed this up at least a bit. * - * When we start, we know that the EFIs are the only things in - * the AIL. As we process them, however, other items are added - * to the AIL. Since everything added to the AIL must come after - * everything already in the AIL, we stop processing as soon as - * we see something other than an EFI in the AIL. + * When we start, we know that the intents are the only things in the + * AIL. As we process them, however, other items are added to the + * AIL. */ STATIC int -xlog_recover_process_efis( +xlog_recover_process_intents( struct xlog *log) { struct xfs_log_item *lip; - struct xfs_efi_log_item *efip; int error = 0; struct xfs_ail_cursor cur; struct xfs_ail *ailp; + xfs_lsn_t last_lsn; ailp = log->l_ailp; spin_lock(&ailp->xa_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); while (lip != NULL) { /* - * We're done when we see something other than an EFI. - * There should be no EFIs left in the AIL now. + * We're done when we see something other than an intent. + * There should be no intents left in the AIL now. */ - if (lip->li_type != XFS_LI_EFI) { + if (!xlog_item_is_intent(lip)) { #ifdef DEBUG for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) - ASSERT(lip->li_type != XFS_LI_EFI); + ASSERT(!xlog_item_is_intent(lip)); #endif break; } /* - * Skip EFIs that we've already processed. + * We should never see a redo item with a LSN higher than + * the last transaction we found in the log at the start + * of recovery. */ - efip = container_of(lip, struct xfs_efi_log_item, efi_item); - if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { - lip = xfs_trans_ail_cursor_next(ailp, &cur); - continue; - } + ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0); - spin_unlock(&ailp->xa_lock); - error = xlog_recover_process_efi(log->l_mp, efip); - spin_lock(&ailp->xa_lock); + switch (lip->li_type) { + case XFS_LI_EFI: + error = xlog_recover_process_efi(log->l_mp, ailp, lip); + break; + case XFS_LI_RUI: + error = xlog_recover_process_rui(log->l_mp, ailp, lip); + break; + } if (error) goto out; lip = xfs_trans_ail_cursor_next(ailp, &cur); @@ -4295,15 +4433,14 @@ out: } /* - * A cancel occurs when the mount has failed and we're bailing out. Release all - * pending EFIs so they don't pin the AIL. + * A cancel occurs when the mount has failed and we're bailing out. + * Release all pending log intent items so they don't pin the AIL. */ STATIC int -xlog_recover_cancel_efis( +xlog_recover_cancel_intents( struct xlog *log) { struct xfs_log_item *lip; - struct xfs_efi_log_item *efip; int error = 0; struct xfs_ail_cursor cur; struct xfs_ail *ailp; @@ -4313,22 +4450,25 @@ xlog_recover_cancel_efis( lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { /* - * We're done when we see something other than an EFI. - * There should be no EFIs left in the AIL now. + * We're done when we see something other than an intent. + * There should be no intents left in the AIL now. */ - if (lip->li_type != XFS_LI_EFI) { + if (!xlog_item_is_intent(lip)) { #ifdef DEBUG for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) - ASSERT(lip->li_type != XFS_LI_EFI); + ASSERT(!xlog_item_is_intent(lip)); #endif break; } - efip = container_of(lip, struct xfs_efi_log_item, efi_item); - - spin_unlock(&ailp->xa_lock); - xfs_efi_release(efip); - spin_lock(&ailp->xa_lock); + switch (lip->li_type) { + case XFS_LI_EFI: + xlog_recover_cancel_efi(log->l_mp, ailp, lip); + break; + case XFS_LI_RUI: + xlog_recover_cancel_rui(log->l_mp, ailp, lip); + break; + } lip = xfs_trans_ail_cursor_next(ailp, &cur); } @@ -5023,6 +5163,7 @@ xlog_do_recover( xfs_warn(mp, "Failed post-recovery per-ag init: %d", error); return error; } + mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); xlog_recover_check_summary(log); @@ -5139,16 +5280,17 @@ xlog_recover_finish( */ if (log->l_flags & XLOG_RECOVERY_NEEDED) { int error; - error = xlog_recover_process_efis(log); + error = xlog_recover_process_intents(log); if (error) { - xfs_alert(log->l_mp, "Failed to recover EFIs"); + xfs_alert(log->l_mp, "Failed to recover intents"); return error; } + /* - * Sync the log to get all the EFIs out of the AIL. + * Sync the log to get all the intents out of the AIL. * This isn't absolutely necessary, but it helps in * case the unlink transactions would have problems - * pushing the EFIs out of the way. + * pushing the intents out of the way. */ xfs_log_force(log->l_mp, XFS_LOG_SYNC); @@ -5173,7 +5315,7 @@ xlog_recover_cancel( int error = 0; if (log->l_flags & XLOG_RECOVERY_NEEDED) - error = xlog_recover_cancel_efis(log); + error = xlog_recover_cancel_intents(log); return error; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index e39b02351..faeead671 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -24,6 +24,7 @@ #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_inode.h" @@ -41,6 +42,7 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_sysfs.h" +#include "xfs_rmap_btree.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); @@ -230,6 +232,8 @@ xfs_initialize_perag( if (maxagi) *maxagi = index; + + mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp); return 0; out_unwind: @@ -272,13 +276,15 @@ xfs_readsb( buf_ops = NULL; /* - * Allocate a (locked) buffer to hold the superblock. - * This will be kept around at all times to optimize - * access to the superblock. + * Allocate a (locked) buffer to hold the superblock. This will be kept + * around at all times to optimize access to the superblock. Therefore, + * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count + * elevated. */ reread: error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0, &bp, buf_ops); + BTOBB(sector_size), XBF_NO_IOACCT, &bp, + buf_ops); if (error) { if (loud) xfs_warn(mp, "SB validate failed with error %d.", error); @@ -677,6 +683,7 @@ xfs_mountfs( xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); xfs_ialloc_compute_maxlevels(mp); + xfs_rmapbt_compute_maxlevels(mp); xfs_set_maxicount(mp); @@ -1214,7 +1221,7 @@ xfs_mod_fdblocks( batch = XFS_FDBLOCKS_BATCH; __percpu_counter_add(&mp->m_fdblocks, delta, batch); - if (__percpu_counter_compare(&mp->m_fdblocks, XFS_ALLOC_SET_ASIDE(mp), + if (__percpu_counter_compare(&mp->m_fdblocks, mp->m_alloc_set_aside, XFS_FDBLOCKS_BATCH) >= 0) { /* we had space! */ return 0; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index c1b798c72..b36676cde 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -116,9 +116,15 @@ typedef struct xfs_mount { uint m_bmap_dmnr[2]; /* min bmap btree records */ uint m_inobt_mxr[2]; /* max inobt btree records */ uint m_inobt_mnr[2]; /* min inobt btree records */ + uint m_rmap_mxr[2]; /* max rmap btree records */ + uint m_rmap_mnr[2]; /* min rmap btree records */ uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ uint m_in_maxlevels; /* max inobt btree levels. */ + uint m_rmap_maxlevels; /* max rmap btree levels */ + xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */ + uint m_alloc_set_aside; /* space we can't use */ + uint m_ag_max_usable; /* max space per AG */ struct radix_tree_root m_perag_tree; /* per-ag accounting info */ spinlock_t m_perag_lock; /* lock for m_perag_tree */ struct mutex m_growlock; /* growfs mutex */ diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 184c44eff..69e2986a3 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -22,6 +22,11 @@ BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \ #structname ") is wrong, expected " #size) +#define XFS_CHECK_OFFSET(structname, member, off) \ + BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \ + "XFS: offsetof(" #structname ", " #member ") is wrong, " \ + "expected " #off) + static inline void __init xfs_check_ondisk_structs(void) { @@ -34,6 +39,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64); XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72); XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176); XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104); @@ -42,11 +49,14 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20); + XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24); XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8); XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4); + XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t, 4); /* dir/attr trees */ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80); @@ -75,27 +85,39 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12); */ + XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen, 0); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen, 2); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval, 3); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valueblk, 0); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40); - XFS_CHECK_STRUCT_SIZE(xfs_attr_shortform_t, 8); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize, 0); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count, 2); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen, 4); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags, 6); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval, 7); XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_unused_t, 6); + XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, freetag, 0); + XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, length, 2); XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino4_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino8_t, 8); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_inou_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3); + XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, namelen, 0); + XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1); + XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3); XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_off_t, 2); /* log structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index d5b756669..0f14b2e4b 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2014 Christoph Hellwig. */ +#include #include "xfs.h" #include "xfs_format.h" #include "xfs_log_format.h" @@ -79,32 +80,6 @@ xfs_fs_get_uuid( return 0; } -static void -xfs_bmbt_to_iomap( - struct xfs_inode *ip, - struct iomap *iomap, - struct xfs_bmbt_irec *imap) -{ - struct xfs_mount *mp = ip->i_mount; - - if (imap->br_startblock == HOLESTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->type = IOMAP_HOLE; - } else if (imap->br_startblock == DELAYSTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->type = IOMAP_DELALLOC; - } else { - iomap->blkno = - XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock); - if (imap->br_state == XFS_EXT_UNWRITTEN) - iomap->type = IOMAP_UNWRITTEN; - else - iomap->type = IOMAP_MAPPED; - } - iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); - iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); -} - /* * Get a layout for the pNFS client. */ diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h index 93f748539..e8339f749 100644 --- a/fs/xfs/xfs_pnfs.h +++ b/fs/xfs/xfs_pnfs.h @@ -1,7 +1,7 @@ #ifndef _XFS_PNFS_H #define _XFS_PNFS_H 1 -#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT) +#ifdef CONFIG_EXPORTFS_BLOCK_OPS int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, struct iomap *iomap, bool write, u32 *device_generation); @@ -15,5 +15,5 @@ xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex) { return 0; } -#endif /* CONFIG_NFSD_PNFS */ +#endif /* CONFIG_EXPORTFS_BLOCK_OPS */ #endif /* _XFS_PNFS_H */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c new file mode 100644 index 000000000..2500f2868 --- /dev/null +++ b/fs/xfs/xfs_rmap_item.c @@ -0,0 +1,536 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_buf_item.h" +#include "xfs_rmap_item.h" +#include "xfs_log.h" +#include "xfs_rmap.h" + + +kmem_zone_t *xfs_rui_zone; +kmem_zone_t *xfs_rud_zone; + +static inline struct xfs_rui_log_item *RUI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_rui_log_item, rui_item); +} + +void +xfs_rui_item_free( + struct xfs_rui_log_item *ruip) +{ + if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS) + kmem_free(ruip); + else + kmem_zone_free(xfs_rui_zone, ruip); +} + +/* + * This returns the number of iovecs needed to log the given rui item. + * We only need 1 iovec for an rui item. It just logs the rui_log_format + * structure. + */ +static inline int +xfs_rui_item_sizeof( + struct xfs_rui_log_item *ruip) +{ + return sizeof(struct xfs_rui_log_format) + + (ruip->rui_format.rui_nextents - 1) * + sizeof(struct xfs_map_extent); +} + +STATIC void +xfs_rui_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += xfs_rui_item_sizeof(RUI_ITEM(lip)); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given rui log item. We use only 1 iovec, and we point that + * at the rui_log_format structure embedded in the rui item. + * It is at this point that we assert that all of the extent + * slots in the rui item have been filled. + */ +STATIC void +xfs_rui_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_rui_log_item *ruip = RUI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + ASSERT(atomic_read(&ruip->rui_next_extent) == + ruip->rui_format.rui_nextents); + + ruip->rui_format.rui_type = XFS_LI_RUI; + ruip->rui_format.rui_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format, + xfs_rui_item_sizeof(ruip)); +} + +/* + * Pinning has no meaning for an rui item, so just return. + */ +STATIC void +xfs_rui_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * The unpin operation is the last place an RUI is manipulated in the log. It is + * either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the RUI transaction has been successfully committed to make it + * this far. Therefore, we expect whoever committed the RUI to either construct + * and commit the RUD or drop the RUD's reference in the event of error. Simply + * drop the log's RUI reference now that the log is done with it. + */ +STATIC void +xfs_rui_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_rui_log_item *ruip = RUI_ITEM(lip); + + xfs_rui_release(ruip); +} + +/* + * RUI items have no locking or pushing. However, since RUIs are pulled from + * the AIL when their corresponding RUDs are committed to disk, their situation + * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller + * will eventually flush the log. This should help in getting the RUI out of + * the AIL. + */ +STATIC uint +xfs_rui_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +/* + * The RUI has been either committed or aborted if the transaction has been + * cancelled. If the transaction was cancelled, an RUD isn't going to be + * constructed and thus we free the RUI here directly. + */ +STATIC void +xfs_rui_item_unlock( + struct xfs_log_item *lip) +{ + if (lip->li_flags & XFS_LI_ABORTED) + xfs_rui_item_free(RUI_ITEM(lip)); +} + +/* + * The RUI is logged only once and cannot be moved in the log, so simply return + * the lsn at which it's been logged. + */ +STATIC xfs_lsn_t +xfs_rui_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + return lsn; +} + +/* + * The RUI dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_rui_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all rui log items. + */ +static const struct xfs_item_ops xfs_rui_item_ops = { + .iop_size = xfs_rui_item_size, + .iop_format = xfs_rui_item_format, + .iop_pin = xfs_rui_item_pin, + .iop_unpin = xfs_rui_item_unpin, + .iop_unlock = xfs_rui_item_unlock, + .iop_committed = xfs_rui_item_committed, + .iop_push = xfs_rui_item_push, + .iop_committing = xfs_rui_item_committing, +}; + +/* + * Allocate and initialize an rui item with the given number of extents. + */ +struct xfs_rui_log_item * +xfs_rui_init( + struct xfs_mount *mp, + uint nextents) + +{ + struct xfs_rui_log_item *ruip; + uint size; + + ASSERT(nextents > 0); + if (nextents > XFS_RUI_MAX_FAST_EXTENTS) { + size = (uint)(sizeof(struct xfs_rui_log_item) + + ((nextents - 1) * sizeof(struct xfs_map_extent))); + ruip = kmem_zalloc(size, KM_SLEEP); + } else { + ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP); + } + + xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); + ruip->rui_format.rui_nextents = nextents; + ruip->rui_format.rui_id = (uintptr_t)(void *)ruip; + atomic_set(&ruip->rui_next_extent, 0); + atomic_set(&ruip->rui_refcount, 2); + + return ruip; +} + +/* + * Copy an RUI format buffer from the given buf, and into the destination + * RUI format structure. The RUI/RUD items were designed not to need any + * special alignment handling. + */ +int +xfs_rui_copy_format( + struct xfs_log_iovec *buf, + struct xfs_rui_log_format *dst_rui_fmt) +{ + struct xfs_rui_log_format *src_rui_fmt; + uint len; + + src_rui_fmt = buf->i_addr; + len = sizeof(struct xfs_rui_log_format) + + (src_rui_fmt->rui_nextents - 1) * + sizeof(struct xfs_map_extent); + + if (buf->i_len != len) + return -EFSCORRUPTED; + + memcpy((char *)dst_rui_fmt, (char *)src_rui_fmt, len); + return 0; +} + +/* + * Freeing the RUI requires that we remove it from the AIL if it has already + * been placed there. However, the RUI may not yet have been placed in the AIL + * when called by xfs_rui_release() from RUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the RUI. + */ +void +xfs_rui_release( + struct xfs_rui_log_item *ruip) +{ + if (atomic_dec_and_test(&ruip->rui_refcount)) { + xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_rui_item_free(ruip); + } +} + +static inline struct xfs_rud_log_item *RUD_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_rud_log_item, rud_item); +} + +STATIC void +xfs_rud_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_rud_log_format); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given rud log item. We use only 1 iovec, and we point that + * at the rud_log_format structure embedded in the rud item. + * It is at this point that we assert that all of the extent + * slots in the rud item have been filled. + */ +STATIC void +xfs_rud_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_rud_log_item *rudp = RUD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + rudp->rud_format.rud_type = XFS_LI_RUD; + rudp->rud_format.rud_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUD_FORMAT, &rudp->rud_format, + sizeof(struct xfs_rud_log_format)); +} + +/* + * Pinning has no meaning for an rud item, so just return. + */ +STATIC void +xfs_rud_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * Since pinning has no meaning for an rud item, unpinning does + * not either. + */ +STATIC void +xfs_rud_item_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +/* + * There isn't much you can do to push on an rud item. It is simply stuck + * waiting for the log to be flushed to disk. + */ +STATIC uint +xfs_rud_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +/* + * The RUD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the RUI and free the + * RUD. + */ +STATIC void +xfs_rud_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_rud_log_item *rudp = RUD_ITEM(lip); + + if (lip->li_flags & XFS_LI_ABORTED) { + xfs_rui_release(rudp->rud_ruip); + kmem_zone_free(xfs_rud_zone, rudp); + } +} + +/* + * When the rud item is committed to disk, all we need to do is delete our + * reference to our partner rui item and then free ourselves. Since we're + * freeing ourselves we must return -1 to keep the transaction code from + * further referencing this item. + */ +STATIC xfs_lsn_t +xfs_rud_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_rud_log_item *rudp = RUD_ITEM(lip); + + /* + * Drop the RUI reference regardless of whether the RUD has been + * aborted. Once the RUD transaction is constructed, it is the sole + * responsibility of the RUD to release the RUI (even if the RUI is + * aborted due to log I/O error). + */ + xfs_rui_release(rudp->rud_ruip); + kmem_zone_free(xfs_rud_zone, rudp); + + return (xfs_lsn_t)-1; +} + +/* + * The RUD dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_rud_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all rud log items. + */ +static const struct xfs_item_ops xfs_rud_item_ops = { + .iop_size = xfs_rud_item_size, + .iop_format = xfs_rud_item_format, + .iop_pin = xfs_rud_item_pin, + .iop_unpin = xfs_rud_item_unpin, + .iop_unlock = xfs_rud_item_unlock, + .iop_committed = xfs_rud_item_committed, + .iop_push = xfs_rud_item_push, + .iop_committing = xfs_rud_item_committing, +}; + +/* + * Allocate and initialize an rud item with the given number of extents. + */ +struct xfs_rud_log_item * +xfs_rud_init( + struct xfs_mount *mp, + struct xfs_rui_log_item *ruip) + +{ + struct xfs_rud_log_item *rudp; + + rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP); + xfs_log_item_init(mp, &rudp->rud_item, XFS_LI_RUD, &xfs_rud_item_ops); + rudp->rud_ruip = ruip; + rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; + + return rudp; +} + +/* + * Process an rmap update intent item that was recovered from the log. + * We need to update the rmapbt. + */ +int +xfs_rui_recover( + struct xfs_mount *mp, + struct xfs_rui_log_item *ruip) +{ + int i; + int error = 0; + struct xfs_map_extent *rmap; + xfs_fsblock_t startblock_fsb; + bool op_ok; + struct xfs_rud_log_item *rudp; + enum xfs_rmap_intent_type type; + int whichfork; + xfs_exntst_t state; + struct xfs_trans *tp; + struct xfs_btree_cur *rcur = NULL; + + ASSERT(!test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags)); + + /* + * First check the validity of the extents described by the + * RUI. If any are bad, then assume that all are bad and + * just toss the RUI. + */ + for (i = 0; i < ruip->rui_format.rui_nextents; i++) { + rmap = &ruip->rui_format.rui_extents[i]; + startblock_fsb = XFS_BB_TO_FSB(mp, + XFS_FSB_TO_DADDR(mp, rmap->me_startblock)); + switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { + case XFS_RMAP_EXTENT_MAP: + case XFS_RMAP_EXTENT_UNMAP: + case XFS_RMAP_EXTENT_CONVERT: + case XFS_RMAP_EXTENT_ALLOC: + case XFS_RMAP_EXTENT_FREE: + op_ok = true; + break; + default: + op_ok = false; + break; + } + if (!op_ok || startblock_fsb == 0 || + rmap->me_len == 0 || + startblock_fsb >= mp->m_sb.sb_dblocks || + rmap->me_len >= mp->m_sb.sb_agblocks || + (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS)) { + /* + * This will pull the RUI from the AIL and + * free the memory associated with it. + */ + set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags); + xfs_rui_release(ruip); + return -EIO; + } + } + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + if (error) + return error; + rudp = xfs_trans_get_rud(tp, ruip); + + for (i = 0; i < ruip->rui_format.rui_nextents; i++) { + rmap = &ruip->rui_format.rui_extents[i]; + state = (rmap->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? + XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + whichfork = (rmap->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { + case XFS_RMAP_EXTENT_MAP: + type = XFS_RMAP_MAP; + break; + case XFS_RMAP_EXTENT_UNMAP: + type = XFS_RMAP_UNMAP; + break; + case XFS_RMAP_EXTENT_CONVERT: + type = XFS_RMAP_CONVERT; + break; + case XFS_RMAP_EXTENT_ALLOC: + type = XFS_RMAP_ALLOC; + break; + case XFS_RMAP_EXTENT_FREE: + type = XFS_RMAP_FREE; + break; + default: + error = -EFSCORRUPTED; + goto abort_error; + } + error = xfs_trans_log_finish_rmap_update(tp, rudp, type, + rmap->me_owner, whichfork, + rmap->me_startoff, rmap->me_startblock, + rmap->me_len, state, &rcur); + if (error) + goto abort_error; + + } + + xfs_rmap_finish_one_cleanup(tp, rcur, error); + set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags); + error = xfs_trans_commit(tp); + return error; + +abort_error: + xfs_rmap_finish_one_cleanup(tp, rcur, error); + xfs_trans_cancel(tp); + return error; +} diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h new file mode 100644 index 000000000..aefcc3a31 --- /dev/null +++ b/fs/xfs/xfs_rmap_item.h @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_RMAP_ITEM_H__ +#define __XFS_RMAP_ITEM_H__ + +/* + * There are (currently) three pairs of rmap btree redo item types: map, unmap, + * and convert. The common abbreviations for these are RUI (rmap update + * intent) and RUD (rmap update done). The redo item type is encoded in the + * flags field of each xfs_map_extent. + * + * *I items should be recorded in the *first* of a series of rolled + * transactions, and the *D items should be recorded in the same transaction + * that records the associated rmapbt updates. Typically, the first + * transaction will record a bmbt update, followed by some number of + * transactions containing rmapbt updates, and finally transactions with any + * bnobt/cntbt updates. + * + * Should the system crash after the commit of the first transaction but + * before the commit of the final transaction in a series, log recovery will + * use the redo information recorded by the intent items to replay the + * (rmapbt/bnobt/cntbt) metadata updates in the non-first transaction. + */ + +/* kernel only RUI/RUD definitions */ + +struct xfs_mount; +struct kmem_zone; + +/* + * Max number of extents in fast allocation path. + */ +#define XFS_RUI_MAX_FAST_EXTENTS 16 + +/* + * Define RUI flag bits. Manipulated by set/clear/test_bit operators. + */ +#define XFS_RUI_RECOVERED 1 + +/* + * This is the "rmap update intent" log item. It is used to log the fact that + * some reverse mappings need to change. It is used in conjunction with the + * "rmap update done" log item described below. + * + * These log items follow the same rules as struct xfs_efi_log_item; see the + * comments about that structure (in xfs_extfree_item.h) for more details. + */ +struct xfs_rui_log_item { + struct xfs_log_item rui_item; + atomic_t rui_refcount; + atomic_t rui_next_extent; + unsigned long rui_flags; /* misc flags */ + struct xfs_rui_log_format rui_format; +}; + +/* + * This is the "rmap update done" log item. It is used to log the fact that + * some rmapbt updates mentioned in an earlier rui item have been performed. + */ +struct xfs_rud_log_item { + struct xfs_log_item rud_item; + struct xfs_rui_log_item *rud_ruip; + struct xfs_rud_log_format rud_format; +}; + +extern struct kmem_zone *xfs_rui_zone; +extern struct kmem_zone *xfs_rud_zone; + +struct xfs_rui_log_item *xfs_rui_init(struct xfs_mount *, uint); +struct xfs_rud_log_item *xfs_rud_init(struct xfs_mount *, + struct xfs_rui_log_item *); +int xfs_rui_copy_format(struct xfs_log_iovec *buf, + struct xfs_rui_log_format *dst_rui_fmt); +void xfs_rui_item_free(struct xfs_rui_log_item *); +void xfs_rui_release(struct xfs_rui_log_item *); +int xfs_rui_recover(struct xfs_mount *mp, struct xfs_rui_log_item *ruip); + +#endif /* __XFS_RMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 3938b37d1..802bcc326 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -23,6 +23,7 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" @@ -769,7 +770,7 @@ xfs_growfs_rt_alloc( xfs_daddr_t d; /* disk block address */ int error; /* error return value */ xfs_fsblock_t firstblock;/* first block allocated in xaction */ - struct xfs_bmap_free flist; /* list of freed blocks */ + struct xfs_defer_ops dfops; /* list of freed blocks */ xfs_fsblock_t fsbno; /* filesystem block for bno */ struct xfs_bmbt_irec map; /* block map output */ int nmap; /* number of block maps */ @@ -794,14 +795,14 @@ xfs_growfs_rt_alloc( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_bmap_init(&flist, &firstblock); + xfs_defer_init(&dfops, &firstblock); /* * Allocate blocks to the bitmap file. */ nmap = 1; error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, XFS_BMAPI_METADATA, &firstblock, - resblks, &map, &nmap, &flist); + resblks, &map, &nmap, &dfops); if (!error && nmap < 1) error = -ENOSPC; if (error) @@ -809,7 +810,7 @@ xfs_growfs_rt_alloc( /* * Free any blocks freed up in the transaction, then commit. */ - error = xfs_bmap_finish(&tp, &flist, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto out_bmap_cancel; error = xfs_trans_commit(tp); @@ -862,7 +863,7 @@ xfs_growfs_rt_alloc( return 0; out_bmap_cancel: - xfs_bmap_cancel(&flist); + xfs_defer_cancel(&dfops); out_trans_cancel: xfs_trans_cancel(tp); return error; diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 76c0a4a9b..355dd9e1c 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -98,8 +98,6 @@ xfs_growfs_rt( /* * From xfs_rtbitmap.c */ -int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t block, int issum, struct xfs_buf **bpp); int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, int val, xfs_rtblock_t *new, int *stat); diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 8686df6c7..6e812fe0f 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -61,6 +61,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) { "bmbt2", XFSSTAT_END_BMBT_V2 }, { "ibt2", XFSSTAT_END_IBT_V2 }, { "fibt2", XFSSTAT_END_FIBT_V2 }, + { "rmapbt", XFSSTAT_END_RMAP_V2 }, /* we print both series of quota information together */ { "qm", XFSSTAT_END_QM }, }; @@ -128,7 +129,6 @@ static int xqm_proc_open(struct inode *inode, struct file *file) } static const struct file_operations xqm_proc_fops = { - .owner = THIS_MODULE, .open = xqm_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 483b0eff1..657865f51 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -197,7 +197,23 @@ struct xfsstats { __uint32_t xs_fibt_2_alloc; __uint32_t xs_fibt_2_free; __uint32_t xs_fibt_2_moves; -#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_FIBT_V2+6) +#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2+15) + __uint32_t xs_rmap_2_lookup; + __uint32_t xs_rmap_2_compare; + __uint32_t xs_rmap_2_insrec; + __uint32_t xs_rmap_2_delrec; + __uint32_t xs_rmap_2_newroot; + __uint32_t xs_rmap_2_killroot; + __uint32_t xs_rmap_2_increment; + __uint32_t xs_rmap_2_decrement; + __uint32_t xs_rmap_2_lshift; + __uint32_t xs_rmap_2_rshift; + __uint32_t xs_rmap_2_split; + __uint32_t xs_rmap_2_join; + __uint32_t xs_rmap_2_alloc; + __uint32_t xs_rmap_2_free; + __uint32_t xs_rmap_2_moves; +#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_RMAP_V2+6) __uint32_t xs_qm_dqreclaims; __uint32_t xs_qm_dqreclaim_misses; __uint32_t xs_qm_dquot_dups; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 11ea5d51d..fd6be45b3 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -46,6 +46,7 @@ #include "xfs_quota.h" #include "xfs_sysfs.h" #include "xfs_ondisk.h" +#include "xfs_rmap_item.h" #include #include @@ -546,7 +547,7 @@ xfs_showargs( return 0; } -__uint64_t +static __uint64_t xfs_max_file_offset( unsigned int blockshift) { @@ -1075,7 +1076,7 @@ xfs_fs_statfs( statp->f_blocks = sbp->sb_dblocks - lsize; spin_unlock(&mp->m_sb_lock); - statp->f_bfree = fdblocks - XFS_ALLOC_SET_ASIDE(mp); + statp->f_bfree = fdblocks - mp->m_alloc_set_aside; statp->f_bavail = statp->f_bfree; fakeinos = statp->f_bfree << sbp->sb_inopblog; @@ -1294,6 +1295,7 @@ xfs_fs_remount( */ xfs_restore_resvblks(mp); xfs_log_work_queue(mp); + xfs_queue_eofblocks(mp); } /* rw -> ro */ @@ -1306,6 +1308,13 @@ xfs_fs_remount( * return it to the same size. */ xfs_save_resvblks(mp); + + /* + * Cancel background eofb scanning so it cannot race with the + * final log force+buftarg wait and deadlock the remount. + */ + cancel_delayed_work_sync(&mp->m_eofblocks_work); + xfs_quiesce_attr(mp); mp->m_flags |= XFS_MOUNT_RDONLY; } @@ -1565,9 +1574,16 @@ xfs_fs_fill_super( } } - if (xfs_sb_version_hassparseinodes(&mp->m_sb)) + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + if (mp->m_sb.sb_rblocks) { + xfs_alert(mp, + "EXPERIMENTAL reverse mapping btree not compatible with realtime device!"); + error = -EINVAL; + goto out_filestream_unmount; + } xfs_alert(mp, - "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!"); + "EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!"); + } error = xfs_mountfs(mp); if (error) @@ -1692,8 +1708,9 @@ xfs_init_zones(void) if (!xfs_log_ticket_zone) goto out_free_ioend_bioset; - xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), - "xfs_bmap_free_item"); + xfs_bmap_free_item_zone = kmem_zone_init( + sizeof(struct xfs_extent_free_item), + "xfs_bmap_free_item"); if (!xfs_bmap_free_item_zone) goto out_destroy_log_ticket_zone; @@ -1760,8 +1777,24 @@ xfs_init_zones(void) if (!xfs_icreate_zone) goto out_destroy_ili_zone; + xfs_rud_zone = kmem_zone_init(sizeof(struct xfs_rud_log_item), + "xfs_rud_item"); + if (!xfs_rud_zone) + goto out_destroy_icreate_zone; + + xfs_rui_zone = kmem_zone_init((sizeof(struct xfs_rui_log_item) + + ((XFS_RUI_MAX_FAST_EXTENTS - 1) * + sizeof(struct xfs_map_extent))), + "xfs_rui_item"); + if (!xfs_rui_zone) + goto out_destroy_rud_zone; + return 0; + out_destroy_rud_zone: + kmem_zone_destroy(xfs_rud_zone); + out_destroy_icreate_zone: + kmem_zone_destroy(xfs_icreate_zone); out_destroy_ili_zone: kmem_zone_destroy(xfs_ili_zone); out_destroy_inode_zone: @@ -1800,6 +1833,8 @@ xfs_destroy_zones(void) * destroy caches. */ rcu_barrier(); + kmem_zone_destroy(xfs_rui_zone); + kmem_zone_destroy(xfs_rud_zone); kmem_zone_destroy(xfs_icreate_zone); kmem_zone_destroy(xfs_ili_zone); kmem_zone_destroy(xfs_inode_zone); @@ -1849,6 +1884,9 @@ init_xfs_fs(void) printk(KERN_INFO XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n"); + xfs_extent_free_init_defer_op(); + xfs_rmap_update_init_defer_op(); + xfs_dir_startup(); error = xfs_init_zones(); diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 2dfb1ce45..529bce9fc 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -61,8 +61,6 @@ struct xfs_mount; struct xfs_buftarg; struct block_device; -extern __uint64_t xfs_max_file_offset(unsigned int); - extern void xfs_flush_inodes(struct xfs_mount *mp); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 08a46c618..58142aeee 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -26,6 +26,7 @@ #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" +#include "xfs_defer.h" #include "xfs_dir2.h" #include "xfs_inode.h" #include "xfs_ialloc.h" @@ -172,7 +173,7 @@ xfs_symlink( struct xfs_inode *ip = NULL; int error = 0; int pathlen; - struct xfs_bmap_free free_list; + struct xfs_defer_ops dfops; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; xfs_fileoff_t first_fsb; @@ -269,7 +270,7 @@ xfs_symlink( * Initialize the bmap freelist prior to calling either * bmapi or the directory create code. */ - xfs_bmap_init(&free_list, &first_block); + xfs_defer_init(&dfops, &first_block); /* * Allocate an inode for the symlink. @@ -313,7 +314,7 @@ xfs_symlink( error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks, XFS_BMAPI_METADATA, &first_block, resblks, - mval, &nmaps, &free_list); + mval, &nmaps, &dfops); if (error) goto out_bmap_cancel; @@ -361,7 +362,7 @@ xfs_symlink( * Create the directory entry for the symlink. */ error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, - &first_block, &free_list, resblks); + &first_block, &dfops, resblks); if (error) goto out_bmap_cancel; xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -376,7 +377,7 @@ xfs_symlink( xfs_trans_set_sync(tp); } - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto out_bmap_cancel; @@ -392,7 +393,7 @@ xfs_symlink( return 0; out_bmap_cancel: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); out_trans_cancel: xfs_trans_cancel(tp); out_release_inode: @@ -426,7 +427,7 @@ xfs_inactive_symlink_rmt( int done; int error; xfs_fsblock_t first_block; - xfs_bmap_free_t free_list; + struct xfs_defer_ops dfops; int i; xfs_mount_t *mp; xfs_bmbt_irec_t mval[XFS_SYMLINK_MAPS]; @@ -465,7 +466,7 @@ xfs_inactive_symlink_rmt( * Find the block(s) so we can inval and unmap them. */ done = 0; - xfs_bmap_init(&free_list, &first_block); + xfs_defer_init(&dfops, &first_block); nmaps = ARRAY_SIZE(mval); error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size), mval, &nmaps, 0); @@ -485,17 +486,17 @@ xfs_inactive_symlink_rmt( xfs_trans_binval(tp, bp); } /* - * Unmap the dead block(s) to the free_list. + * Unmap the dead block(s) to the dfops. */ error = xfs_bunmapi(tp, ip, 0, size, 0, nmaps, - &first_block, &free_list, &done); + &first_block, &dfops, &done); if (error) goto error_bmap_cancel; ASSERT(done); /* * Commit the first transaction. This logs the EFI and the inode. */ - error = xfs_bmap_finish(&tp, &free_list, ip); + error = xfs_defer_finish(&tp, &dfops, ip); if (error) goto error_bmap_cancel; /* @@ -525,7 +526,7 @@ xfs_inactive_symlink_rmt( return 0; error_bmap_cancel: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); error_trans_cancel: xfs_trans_cancel(tp); error_unlock: diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 4c2c55086..79cfd3fc5 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -634,6 +634,9 @@ xfs_error_get_cfg( { struct xfs_error_cfg *cfg; + if (error < 0) + error = -error; + switch (error) { case EIO: cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO]; diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 13a029806..7f17ae6d7 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -22,7 +22,9 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_da_format.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_da_btree.h" diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ea94ee0fe..d303a665d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -38,6 +38,7 @@ struct xlog_recover_item; struct xfs_buf_log_format; struct xfs_inode_log_format; struct xfs_bmbt_irec; +struct xfs_btree_cur; DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -354,6 +355,7 @@ DEFINE_BUF_EVENT(xfs_buf_submit_wait); DEFINE_BUF_EVENT(xfs_buf_bawrite); DEFINE_BUF_EVENT(xfs_buf_lock); DEFINE_BUF_EVENT(xfs_buf_lock_done); +DEFINE_BUF_EVENT(xfs_buf_trylock_fail); DEFINE_BUF_EVENT(xfs_buf_trylock); DEFINE_BUF_EVENT(xfs_buf_unlock); DEFINE_BUF_EVENT(xfs_buf_iowait); @@ -1134,15 +1136,14 @@ TRACE_EVENT(xfs_log_assign_tail_lsn, ) DECLARE_EVENT_CLASS(xfs_file_class, - TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), - TP_ARGS(ip, count, offset, flags), + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), + TP_ARGS(ip, count, offset), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_fsize_t, size) __field(loff_t, offset) __field(size_t, count) - __field(int, flags) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; @@ -1150,25 +1151,25 @@ DECLARE_EVENT_CLASS(xfs_file_class, __entry->size = ip->i_d.di_size; __entry->offset = offset; __entry->count = count; - __entry->flags = flags; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx " - "offset 0x%llx count 0x%zx ioflags %s", + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, - __entry->count, - __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) + __entry->count) ) #define DEFINE_RW_EVENT(name) \ DEFINE_EVENT(xfs_file_class, name, \ - TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ - TP_ARGS(ip, count, offset, flags)) -DEFINE_RW_EVENT(xfs_file_read); + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), \ + TP_ARGS(ip, count, offset)) +DEFINE_RW_EVENT(xfs_file_buffered_read); +DEFINE_RW_EVENT(xfs_file_direct_read); +DEFINE_RW_EVENT(xfs_file_dax_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); +DEFINE_RW_EVENT(xfs_file_dax_write); DEFINE_RW_EVENT(xfs_file_splice_read); DECLARE_EVENT_CLASS(xfs_page_class, @@ -1295,6 +1296,8 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_found); DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct); +DEFINE_IOMAP_EVENT(xfs_iomap_alloc); +DEFINE_IOMAP_EVENT(xfs_iomap_found); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), @@ -2182,6 +2185,379 @@ DEFINE_DISCARD_EVENT(xfs_discard_toosmall); DEFINE_DISCARD_EVENT(xfs_discard_exclude); DEFINE_DISCARD_EVENT(xfs_discard_busy); +/* btree cursor events */ +DECLARE_EVENT_CLASS(xfs_btree_cur_class, + TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp), + TP_ARGS(cur, level, bp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_btnum_t, btnum) + __field(int, level) + __field(int, nlevels) + __field(int, ptr) + __field(xfs_daddr_t, daddr) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->nlevels = cur->bc_nlevels; + __entry->ptr = cur->bc_ptrs[level]; + __entry->daddr = bp ? bp->b_bn : -1; + ), + TP_printk("dev %d:%d btnum %d level %d/%d ptr %d daddr 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->btnum, + __entry->level, + __entry->nlevels, + __entry->ptr, + (unsigned long long)__entry->daddr) +) + +#define DEFINE_BTREE_CUR_EVENT(name) \ +DEFINE_EVENT(xfs_btree_cur_class, name, \ + TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp), \ + TP_ARGS(cur, level, bp)) +DEFINE_BTREE_CUR_EVENT(xfs_btree_updkeys); +DEFINE_BTREE_CUR_EVENT(xfs_btree_overlapped_query_range); + +/* deferred ops */ +struct xfs_defer_pending; +struct xfs_defer_intake; +struct xfs_defer_ops; + +DECLARE_EVENT_CLASS(xfs_defer_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop), + TP_ARGS(mp, dop), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(void *, dop) + __field(bool, committed) + __field(bool, low) + ), + TP_fast_assign( + __entry->dev = mp ? mp->m_super->s_dev : 0; + __entry->dop = dop; + __entry->committed = dop->dop_committed; + __entry->low = dop->dop_low; + ), + TP_printk("dev %d:%d ops %p committed %d low %d\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dop, + __entry->committed, + __entry->low) +) +#define DEFINE_DEFER_EVENT(name) \ +DEFINE_EVENT(xfs_defer_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop), \ + TP_ARGS(mp, dop)) + +DECLARE_EVENT_CLASS(xfs_defer_error_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, int error), + TP_ARGS(mp, dop, error), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(void *, dop) + __field(bool, committed) + __field(bool, low) + __field(int, error) + ), + TP_fast_assign( + __entry->dev = mp ? mp->m_super->s_dev : 0; + __entry->dop = dop; + __entry->committed = dop->dop_committed; + __entry->low = dop->dop_low; + __entry->error = error; + ), + TP_printk("dev %d:%d ops %p committed %d low %d err %d\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dop, + __entry->committed, + __entry->low, + __entry->error) +) +#define DEFINE_DEFER_ERROR_EVENT(name) \ +DEFINE_EVENT(xfs_defer_error_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, int error), \ + TP_ARGS(mp, dop, error)) + +DECLARE_EVENT_CLASS(xfs_defer_pending_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp), + TP_ARGS(mp, dfp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, type) + __field(void *, intent) + __field(bool, committed) + __field(int, nr) + ), + TP_fast_assign( + __entry->dev = mp ? mp->m_super->s_dev : 0; + __entry->type = dfp->dfp_type->type; + __entry->intent = dfp->dfp_intent; + __entry->committed = dfp->dfp_done != NULL; + __entry->nr = dfp->dfp_count; + ), + TP_printk("dev %d:%d optype %d intent %p committed %d nr %d\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->intent, + __entry->committed, + __entry->nr) +) +#define DEFINE_DEFER_PENDING_EVENT(name) \ +DEFINE_EVENT(xfs_defer_pending_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp), \ + TP_ARGS(mp, dfp)) + +DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + int type, xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, type, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, type) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->type = type; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d op %d agno %u agbno %u len %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->agno, + __entry->agbno, + __entry->len) +); +#define DEFINE_PHYS_EXTENT_DEFERRED_EVENT(name) \ +DEFINE_EVENT(xfs_phys_extent_deferred_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + int type, \ + xfs_agblock_t bno, \ + xfs_extlen_t len), \ + TP_ARGS(mp, agno, type, bno, len)) + +DECLARE_EVENT_CLASS(xfs_map_extent_deferred_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + int op, + xfs_agblock_t agbno, + xfs_ino_t ino, + int whichfork, + xfs_fileoff_t offset, + xfs_filblks_t len, + xfs_exntst_t state), + TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, ino) + __field(xfs_agblock_t, agbno) + __field(int, whichfork) + __field(xfs_fileoff_t, l_loff) + __field(xfs_filblks_t, l_len) + __field(xfs_exntst_t, l_state) + __field(int, op) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->ino = ino; + __entry->agbno = agbno; + __entry->whichfork = whichfork; + __entry->l_loff = offset; + __entry->l_len = len; + __entry->l_state = state; + __entry->op = op; + ), + TP_printk("dev %d:%d op %d agno %u agbno %u owner %lld %s offset %llu len %llu state %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->op, + __entry->agno, + __entry->agbno, + __entry->ino, + __entry->whichfork == XFS_ATTR_FORK ? "attr" : "data", + __entry->l_loff, + __entry->l_len, + __entry->l_state) +); +#define DEFINE_MAP_EXTENT_DEFERRED_EVENT(name) \ +DEFINE_EVENT(xfs_map_extent_deferred_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + int op, \ + xfs_agblock_t agbno, \ + xfs_ino_t ino, \ + int whichfork, \ + xfs_fileoff_t offset, \ + xfs_filblks_t len, \ + xfs_exntst_t state), \ + TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state)) + +DEFINE_DEFER_EVENT(xfs_defer_init); +DEFINE_DEFER_EVENT(xfs_defer_cancel); +DEFINE_DEFER_EVENT(xfs_defer_trans_roll); +DEFINE_DEFER_EVENT(xfs_defer_trans_abort); +DEFINE_DEFER_EVENT(xfs_defer_finish); +DEFINE_DEFER_EVENT(xfs_defer_finish_done); + +DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error); +DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error); +DEFINE_DEFER_ERROR_EVENT(xfs_defer_op_finish_error); + +DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_work); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_cancel); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_commit); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_cancel); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); + +#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT +DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer); +DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred); + +/* rmap tracepoints */ +DECLARE_EVENT_CLASS(xfs_rmap_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, + struct xfs_owner_info *oinfo), + TP_ARGS(mp, agno, agbno, len, unwritten, oinfo), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(unsigned long, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->owner = oinfo->oi_owner; + __entry->offset = oinfo->oi_offset; + __entry->flags = oinfo->oi_flags; + if (unwritten) + __entry->flags |= XFS_RMAP_UNWRITTEN; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%lx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +); +#define DEFINE_RMAP_EVENT(name) \ +DEFINE_EVENT(xfs_rmap_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, \ + struct xfs_owner_info *oinfo), \ + TP_ARGS(mp, agno, agbno, len, unwritten, oinfo)) + +/* simple AG-based error/%ip tracepoint class */ +DECLARE_EVENT_CLASS(xfs_ag_error_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error, + unsigned long caller_ip), + TP_ARGS(mp, agno, error, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, error) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->error = error; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno %u error %d caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->error, + (char *)__entry->caller_ip) +); + +#define DEFINE_AG_ERROR_EVENT(name) \ +DEFINE_EVENT(xfs_ag_error_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error, \ + unsigned long caller_ip), \ + TP_ARGS(mp, agno, error, caller_ip)) + +DEFINE_RMAP_EVENT(xfs_rmap_unmap); +DEFINE_RMAP_EVENT(xfs_rmap_unmap_done); +DEFINE_AG_ERROR_EVENT(xfs_rmap_unmap_error); +DEFINE_RMAP_EVENT(xfs_rmap_map); +DEFINE_RMAP_EVENT(xfs_rmap_map_done); +DEFINE_AG_ERROR_EVENT(xfs_rmap_map_error); +DEFINE_RMAP_EVENT(xfs_rmap_convert); +DEFINE_RMAP_EVENT(xfs_rmap_convert_done); +DEFINE_AG_ERROR_EVENT(xfs_rmap_convert_error); +DEFINE_AG_ERROR_EVENT(xfs_rmap_convert_state); + +DECLARE_EVENT_CLASS(xfs_rmapbt_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, + uint64_t owner, uint64_t offset, unsigned int flags), + TP_ARGS(mp, agno, agbno, len, owner, offset, flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->owner = owner; + __entry->offset = offset; + __entry->flags = flags; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +); +#define DEFINE_RMAPBT_EVENT(name) \ +DEFINE_EVENT(xfs_rmapbt_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len, \ + uint64_t owner, uint64_t offset, unsigned int flags), \ + TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) + +#define DEFINE_RMAP_DEFERRED_EVENT DEFINE_MAP_EXTENT_DEFERRED_EVENT +DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_defer); +DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_deferred); + +DEFINE_BUSY_EVENT(xfs_rmapbt_alloc_block); +DEFINE_BUSY_EVENT(xfs_rmapbt_free_block); +DEFINE_RMAPBT_EVENT(xfs_rmap_update); +DEFINE_RMAPBT_EVENT(xfs_rmap_insert); +DEFINE_RMAPBT_EVENT(xfs_rmap_delete); +DEFINE_AG_ERROR_EVENT(xfs_rmap_insert_error); +DEFINE_AG_ERROR_EVENT(xfs_rmap_delete_error); +DEFINE_AG_ERROR_EVENT(xfs_rmap_update_error); +DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result); +DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result); +DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 9a462e892..e2bf86aad 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -33,6 +33,9 @@ struct xfs_trans; struct xfs_trans_res; struct xfs_dquot_acct; struct xfs_busy_extent; +struct xfs_rud_log_item; +struct xfs_rui_log_item; +struct xfs_btree_cur; typedef struct xfs_log_item { struct list_head li_ail; /* AIL pointers */ @@ -52,6 +55,7 @@ typedef struct xfs_log_item { /* delayed logging */ struct list_head li_cil; /* CIL pointers */ struct xfs_log_vec *li_lv; /* active log vector */ + struct xfs_log_vec *li_lv_shadow; /* standby vector */ xfs_lsn_t li_seq; /* CIL commit seq */ } xfs_log_item_t; @@ -209,17 +213,14 @@ void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); -struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); -void xfs_trans_log_efi_extent(xfs_trans_t *, - struct xfs_efi_log_item *, - xfs_fsblock_t, - xfs_extlen_t); -struct xfs_efd_log_item *xfs_trans_get_efd(xfs_trans_t *, + +void xfs_extent_free_init_defer_op(void); +struct xfs_efd_log_item *xfs_trans_get_efd(struct xfs_trans *, struct xfs_efi_log_item *, uint); int xfs_trans_free_extent(struct xfs_trans *, struct xfs_efd_log_item *, xfs_fsblock_t, - xfs_extlen_t); + xfs_extlen_t, struct xfs_owner_info *); int xfs_trans_commit(struct xfs_trans *); int __xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *); int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); @@ -235,4 +236,16 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, extern kmem_zone_t *xfs_trans_zone; extern kmem_zone_t *xfs_log_item_desc_zone; +/* rmap updates */ +enum xfs_rmap_intent_type; + +void xfs_rmap_update_init_defer_op(void); +struct xfs_rud_log_item *xfs_trans_get_rud(struct xfs_trans *tp, + struct xfs_rui_log_item *ruip); +int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp, + struct xfs_rud_log_item *rudp, enum xfs_rmap_intent_type type, + __uint64_t owner, int whichfork, xfs_fileoff_t startoff, + xfs_fsblock_t startblock, xfs_filblks_t blockcount, + xfs_exntst_t state, struct xfs_btree_cur **pcur); + #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index a96ae540e..459ddec13 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -21,66 +21,15 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" +#include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_extfree_item.h" #include "xfs_alloc.h" - -/* - * This routine is called to allocate an "extent free intention" - * log item that will hold nextents worth of extents. The - * caller must use all nextents extents, because we are not - * flexible about this at all. - */ -xfs_efi_log_item_t * -xfs_trans_get_efi(xfs_trans_t *tp, - uint nextents) -{ - xfs_efi_log_item_t *efip; - - ASSERT(tp != NULL); - ASSERT(nextents > 0); - - efip = xfs_efi_init(tp->t_mountp, nextents); - ASSERT(efip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &efip->efi_item); - return efip; -} - -/* - * This routine is called to indicate that the described - * extent is to be logged as needing to be freed. It should - * be called once for each extent to be freed. - */ -void -xfs_trans_log_efi_extent(xfs_trans_t *tp, - xfs_efi_log_item_t *efip, - xfs_fsblock_t start_block, - xfs_extlen_t ext_len) -{ - uint next_extent; - xfs_extent_t *extp; - - tp->t_flags |= XFS_TRANS_DIRTY; - efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; - - /* - * atomic_inc_return gives us the value after the increment; - * we want to use it as an array index so we need to subtract 1 from - * it. - */ - next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; - ASSERT(next_extent < efip->efi_format.efi_nextents); - extp = &(efip->efi_format.efi_extents[next_extent]); - extp->ext_start = start_block; - extp->ext_len = ext_len; -} - +#include "xfs_bmap.h" +#include "xfs_trace.h" /* * This routine is called to allocate an "extent free done" @@ -88,12 +37,12 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp, * caller must use all nextents extents, because we are not * flexible about this at all. */ -xfs_efd_log_item_t * -xfs_trans_get_efd(xfs_trans_t *tp, - xfs_efi_log_item_t *efip, - uint nextents) +struct xfs_efd_log_item * +xfs_trans_get_efd(struct xfs_trans *tp, + struct xfs_efi_log_item *efip, + uint nextents) { - xfs_efd_log_item_t *efdp; + struct xfs_efd_log_item *efdp; ASSERT(tp != NULL); ASSERT(nextents > 0); @@ -118,13 +67,19 @@ xfs_trans_free_extent( struct xfs_trans *tp, struct xfs_efd_log_item *efdp, xfs_fsblock_t start_block, - xfs_extlen_t ext_len) + xfs_extlen_t ext_len, + struct xfs_owner_info *oinfo) { + struct xfs_mount *mp = tp->t_mountp; uint next_extent; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, start_block); struct xfs_extent *extp; int error; - error = xfs_free_extent(tp, start_block, ext_len); + trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); + + error = xfs_free_extent(tp, start_block, ext_len, oinfo); /* * Mark the transaction dirty, even on error. This ensures the @@ -145,3 +100,139 @@ xfs_trans_free_extent( return error; } + +/* Sort bmap items by AG. */ +static int +xfs_extent_free_diff_items( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_mount *mp = priv; + struct xfs_extent_free_item *ra; + struct xfs_extent_free_item *rb; + + ra = container_of(a, struct xfs_extent_free_item, xefi_list); + rb = container_of(b, struct xfs_extent_free_item, xefi_list); + return XFS_FSB_TO_AGNO(mp, ra->xefi_startblock) - + XFS_FSB_TO_AGNO(mp, rb->xefi_startblock); +} + +/* Get an EFI. */ +STATIC void * +xfs_extent_free_create_intent( + struct xfs_trans *tp, + unsigned int count) +{ + struct xfs_efi_log_item *efip; + + ASSERT(tp != NULL); + ASSERT(count > 0); + + efip = xfs_efi_init(tp->t_mountp, count); + ASSERT(efip != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &efip->efi_item); + return efip; +} + +/* Log a free extent to the intent item. */ +STATIC void +xfs_extent_free_log_item( + struct xfs_trans *tp, + void *intent, + struct list_head *item) +{ + struct xfs_efi_log_item *efip = intent; + struct xfs_extent_free_item *free; + uint next_extent; + struct xfs_extent *extp; + + free = container_of(item, struct xfs_extent_free_item, xefi_list); + + tp->t_flags |= XFS_TRANS_DIRTY; + efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + /* + * atomic_inc_return gives us the value after the increment; + * we want to use it as an array index so we need to subtract 1 from + * it. + */ + next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; + ASSERT(next_extent < efip->efi_format.efi_nextents); + extp = &efip->efi_format.efi_extents[next_extent]; + extp->ext_start = free->xefi_startblock; + extp->ext_len = free->xefi_blockcount; +} + +/* Get an EFD so we can process all the free extents. */ +STATIC void * +xfs_extent_free_create_done( + struct xfs_trans *tp, + void *intent, + unsigned int count) +{ + return xfs_trans_get_efd(tp, intent, count); +} + +/* Process a free extent. */ +STATIC int +xfs_extent_free_finish_item( + struct xfs_trans *tp, + struct xfs_defer_ops *dop, + struct list_head *item, + void *done_item, + void **state) +{ + struct xfs_extent_free_item *free; + int error; + + free = container_of(item, struct xfs_extent_free_item, xefi_list); + error = xfs_trans_free_extent(tp, done_item, + free->xefi_startblock, + free->xefi_blockcount, + &free->xefi_oinfo); + kmem_free(free); + return error; +} + +/* Abort all pending EFIs. */ +STATIC void +xfs_extent_free_abort_intent( + void *intent) +{ + xfs_efi_release(intent); +} + +/* Cancel a free extent. */ +STATIC void +xfs_extent_free_cancel_item( + struct list_head *item) +{ + struct xfs_extent_free_item *free; + + free = container_of(item, struct xfs_extent_free_item, xefi_list); + kmem_free(free); +} + +static const struct xfs_defer_op_type xfs_extent_free_defer_type = { + .type = XFS_DEFER_OPS_TYPE_FREE, + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .diff_items = xfs_extent_free_diff_items, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .log_item = xfs_extent_free_log_item, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_extent_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, +}; + +/* Register the deferred op type. */ +void +xfs_extent_free_init_defer_op(void) +{ + xfs_defer_init_op_type(&xfs_extent_free_defer_type); +} diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c new file mode 100644 index 000000000..5a50ef881 --- /dev/null +++ b/fs/xfs/xfs_trans_rmap.c @@ -0,0 +1,271 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_rmap_item.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" + +/* Set the map extent flags for this reverse mapping. */ +static void +xfs_trans_set_rmap_flags( + struct xfs_map_extent *rmap, + enum xfs_rmap_intent_type type, + int whichfork, + xfs_exntst_t state) +{ + rmap->me_flags = 0; + if (state == XFS_EXT_UNWRITTEN) + rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; + if (whichfork == XFS_ATTR_FORK) + rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; + switch (type) { + case XFS_RMAP_MAP: + rmap->me_flags |= XFS_RMAP_EXTENT_MAP; + break; + case XFS_RMAP_UNMAP: + rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP; + break; + case XFS_RMAP_CONVERT: + rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT; + break; + case XFS_RMAP_ALLOC: + rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC; + break; + case XFS_RMAP_FREE: + rmap->me_flags |= XFS_RMAP_EXTENT_FREE; + break; + default: + ASSERT(0); + } +} + +struct xfs_rud_log_item * +xfs_trans_get_rud( + struct xfs_trans *tp, + struct xfs_rui_log_item *ruip) +{ + struct xfs_rud_log_item *rudp; + + rudp = xfs_rud_init(tp->t_mountp, ruip); + xfs_trans_add_item(tp, &rudp->rud_item); + return rudp; +} + +/* + * Finish an rmap update and log it to the RUD. Note that the transaction is + * marked dirty regardless of whether the rmap update succeeds or fails to + * support the RUI/RUD lifecycle rules. + */ +int +xfs_trans_log_finish_rmap_update( + struct xfs_trans *tp, + struct xfs_rud_log_item *rudp, + enum xfs_rmap_intent_type type, + __uint64_t owner, + int whichfork, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state, + struct xfs_btree_cur **pcur) +{ + int error; + + error = xfs_rmap_finish_one(tp, type, owner, whichfork, startoff, + startblock, blockcount, state, pcur); + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the RUI and frees the RUD + * 2.) shuts down the filesystem + */ + tp->t_flags |= XFS_TRANS_DIRTY; + rudp->rud_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + return error; +} + +/* Sort rmap intents by AG. */ +static int +xfs_rmap_update_diff_items( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_mount *mp = priv; + struct xfs_rmap_intent *ra; + struct xfs_rmap_intent *rb; + + ra = container_of(a, struct xfs_rmap_intent, ri_list); + rb = container_of(b, struct xfs_rmap_intent, ri_list); + return XFS_FSB_TO_AGNO(mp, ra->ri_bmap.br_startblock) - + XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock); +} + +/* Get an RUI. */ +STATIC void * +xfs_rmap_update_create_intent( + struct xfs_trans *tp, + unsigned int count) +{ + struct xfs_rui_log_item *ruip; + + ASSERT(tp != NULL); + ASSERT(count > 0); + + ruip = xfs_rui_init(tp->t_mountp, count); + ASSERT(ruip != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &ruip->rui_item); + return ruip; +} + +/* Log rmap updates in the intent item. */ +STATIC void +xfs_rmap_update_log_item( + struct xfs_trans *tp, + void *intent, + struct list_head *item) +{ + struct xfs_rui_log_item *ruip = intent; + struct xfs_rmap_intent *rmap; + uint next_extent; + struct xfs_map_extent *map; + + rmap = container_of(item, struct xfs_rmap_intent, ri_list); + + tp->t_flags |= XFS_TRANS_DIRTY; + ruip->rui_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + /* + * atomic_inc_return gives us the value after the increment; + * we want to use it as an array index so we need to subtract 1 from + * it. + */ + next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1; + ASSERT(next_extent < ruip->rui_format.rui_nextents); + map = &ruip->rui_format.rui_extents[next_extent]; + map->me_owner = rmap->ri_owner; + map->me_startblock = rmap->ri_bmap.br_startblock; + map->me_startoff = rmap->ri_bmap.br_startoff; + map->me_len = rmap->ri_bmap.br_blockcount; + xfs_trans_set_rmap_flags(map, rmap->ri_type, rmap->ri_whichfork, + rmap->ri_bmap.br_state); +} + +/* Get an RUD so we can process all the deferred rmap updates. */ +STATIC void * +xfs_rmap_update_create_done( + struct xfs_trans *tp, + void *intent, + unsigned int count) +{ + return xfs_trans_get_rud(tp, intent); +} + +/* Process a deferred rmap update. */ +STATIC int +xfs_rmap_update_finish_item( + struct xfs_trans *tp, + struct xfs_defer_ops *dop, + struct list_head *item, + void *done_item, + void **state) +{ + struct xfs_rmap_intent *rmap; + int error; + + rmap = container_of(item, struct xfs_rmap_intent, ri_list); + error = xfs_trans_log_finish_rmap_update(tp, done_item, + rmap->ri_type, + rmap->ri_owner, rmap->ri_whichfork, + rmap->ri_bmap.br_startoff, + rmap->ri_bmap.br_startblock, + rmap->ri_bmap.br_blockcount, + rmap->ri_bmap.br_state, + (struct xfs_btree_cur **)state); + kmem_free(rmap); + return error; +} + +/* Clean up after processing deferred rmaps. */ +STATIC void +xfs_rmap_update_finish_cleanup( + struct xfs_trans *tp, + void *state, + int error) +{ + struct xfs_btree_cur *rcur = state; + + xfs_rmap_finish_one_cleanup(tp, rcur, error); +} + +/* Abort all pending RUIs. */ +STATIC void +xfs_rmap_update_abort_intent( + void *intent) +{ + xfs_rui_release(intent); +} + +/* Cancel a deferred rmap update. */ +STATIC void +xfs_rmap_update_cancel_item( + struct list_head *item) +{ + struct xfs_rmap_intent *rmap; + + rmap = container_of(item, struct xfs_rmap_intent, ri_list); + kmem_free(rmap); +} + +static const struct xfs_defer_op_type xfs_rmap_update_defer_type = { + .type = XFS_DEFER_OPS_TYPE_RMAP, + .max_items = XFS_RUI_MAX_FAST_EXTENTS, + .diff_items = xfs_rmap_update_diff_items, + .create_intent = xfs_rmap_update_create_intent, + .abort_intent = xfs_rmap_update_abort_intent, + .log_item = xfs_rmap_update_log_item, + .create_done = xfs_rmap_update_create_done, + .finish_item = xfs_rmap_update_finish_item, + .finish_cleanup = xfs_rmap_update_finish_cleanup, + .cancel_item = xfs_rmap_update_cancel_item, +}; + +/* Register the deferred op type. */ +void +xfs_rmap_update_init_defer_op(void) +{ + xfs_defer_init_op_type(&xfs_rmap_update_defer_type); +} -- cgit v1.2.3-54-g00ecf