XFS fixes for 6.15-rc3

Signed-off-by: Carlos Maiolino <cem@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iJUEABMJAB0WIQSmtYVZ/MfVMGUq1GNcsMJ8RxYuYwUCaADqUgAKCRBcsMJ8RxYu
 Y+7RAX4zY9d/ub0PfJ+/Ml1LyN2deqrKCtjEJHYaMhrFfp+3b92Tn8eBiDGF1VlG
 qNoNcbMBgIMk5SPu1qimUOIo0IZO3bfMz/wJwvEPKxdw6pCB8rRCBfUOZ0j7tIXv
 RP1NX2PIew==
 =0eib
 -----END PGP SIGNATURE-----

Merge tag 'xfs-fixes-6.15-rc3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull XFS fixes from Carlos Maiolino:
 "This mostly includes fixes and documentation for the zoned allocator
  feature merged during the previous merge window, but it also adds a
  sysfs tunable for the zone garbage collector.

 There is also a fix for a regression in the RT device that we'd like
 to address ASAP now that we're getting more users on the RT zoned
 allocator"

* tag 'xfs-fixes-6.15-rc3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  xfs: document zoned rt specifics in admin-guide
  xfs: fix fsmap for internal zoned devices
  xfs: Fix spelling mistake "drity" -> "dirty"
  xfs: compute buffer address correctly in xmbuf_map_backing_mem
  xfs: add tunable threshold parameter for triggering zone GC
  xfs: mark xfs_buf_free as might_sleep()
  xfs: remove the leftover xfs_{set,clear}_li_failed infrastructure
This commit is contained in:
Linus Torvalds 2025-04-17 09:24:56 -07:00
commit 096384deed
13 changed files with 143 additions and 61 deletions

View File

@ -124,6 +124,14 @@ When mounting an XFS filesystem, the following options are accepted.
controls the size of each buffer and so is also relevant to
this case.
lifetime (default) or nolifetime
Enable data placement based on write life time hints provided
by the user. This turns on co-allocation of data of similar
life times when statistically favorable to reduce garbage
collection cost.
These options are only available for zoned rt file systems.
logbsize=value
Set the size of each in-memory log buffer. The size may be
specified in bytes, or in kilobytes with a "k" suffix.
@ -143,6 +151,14 @@ When mounting an XFS filesystem, the following options are accepted.
optional, and the log section can be separate from the data
section or contained within it.
max_open_zones=value
Specify the max number of zones to keep open for writing on a
zoned rt device. Many open zones aid file data separation
but may impact performance on HDDs.
If ``max_open_zones`` is not specified, the value is determined
by the capabilities and the size of the zoned rt device.
noalign
Data allocations will not be aligned at stripe unit
boundaries. This is only relevant to filesystems created
@ -542,3 +558,37 @@ The interesting knobs for XFS workqueues are as follows:
nice Relative priority of scheduling the threads. These are the
same nice levels that can be applied to userspace processes.
============ ===========
Zoned Filesystems
=================
For zoned file systems, the following attribute is exposed in:
/sys/fs/xfs/<dev>/zoned/
max_open_zones (Min: 1 Default: Varies Max: UINTMAX)
This read-only attribute exposes the maximum number of open zones
available for data placement. The value is determined at mount time and
is limited by the capabilities of the backing zoned device, file system
size and the max_open_zones mount option.
Zoned Filesystems
=================
For zoned file systems, the following attributes are exposed in:
/sys/fs/xfs/<dev>/zoned/
max_open_zones (Min: 1 Default: Varies Max: UINTMAX)
This read-only attribute exposes the maximum number of open zones
available for data placement. The value is determined at mount time and
is limited by the capabilities of the backing zoned device, file system
size and the max_open_zones mount option.
zonegc_low_space (Min: 0 Default: 0 Max: 100)
Define the percentage of unused space that GC should keep
available for writing. A high value will reclaim more of the space
occupied by unused blocks, creating a larger buffer against write
bursts at the cost of increased write amplification. Regardless
of this value, garbage collection will always aim to free a minimum
number of blocks to keep max_open_zones open for data placement purposes.

View File

@ -105,6 +105,7 @@ xfs_buf_free(
{
unsigned int size = BBTOB(bp->b_length);
might_sleep();
trace_xfs_buf_free(bp, _RET_IP_);
ASSERT(list_empty(&bp->b_lru));

View File

@ -165,7 +165,7 @@ xmbuf_map_backing_mem(
folio_set_dirty(folio);
folio_unlock(folio);
bp->b_addr = folio_address(folio);
bp->b_addr = folio_address(folio) + offset_in_folio(folio, pos);
return 0;
}

View File

@ -1186,9 +1186,8 @@ xfs_qm_dqflush_done(
if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) &&
(lip->li_lsn == qlip->qli_flush_lsn ||
test_bit(XFS_LI_FAILED, &lip->li_flags))) {
spin_lock(&ailp->ail_lock);
xfs_clear_li_failed(lip);
clear_bit(XFS_LI_FAILED, &lip->li_flags);
if (lip->li_lsn == qlip->qli_flush_lsn) {
/* xfs_ail_update_finish() drops the AIL lock */
tail_lsn = xfs_ail_delete_one(ailp, lip);

View File

@ -876,6 +876,7 @@ xfs_getfsmap_rtdev_rmapbt(
const struct xfs_fsmap *keys,
struct xfs_getfsmap_info *info)
{
struct xfs_fsmap key0 = *keys; /* struct copy */
struct xfs_mount *mp = tp->t_mountp;
struct xfs_rtgroup *rtg = NULL;
struct xfs_btree_cur *bt_cur = NULL;
@ -887,32 +888,46 @@ xfs_getfsmap_rtdev_rmapbt(
int error = 0;
eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks);
if (keys[0].fmr_physical >= eofs)
if (key0.fmr_physical >= eofs)
return 0;
/*
* On zoned filesystems with an internal rt volume, the volume comes
* immediately after the end of the data volume. However, the
* xfs_rtblock_t address space is relative to the start of the data
* device, which means that the first @rtstart fsblocks do not actually
* point anywhere. If a fsmap query comes in with the low key starting
* below @rtstart, report it as "owned by filesystem".
*/
rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart);
if (keys[0].fmr_physical < rtstart_daddr) {
if (xfs_has_zoned(mp) && key0.fmr_physical < rtstart_daddr) {
struct xfs_fsmap_irec frec = {
.owner = XFS_RMAP_OWN_FS,
.len_daddr = rtstart_daddr,
};
/* Adjust the low key if we are continuing from where we left off. */
if (keys[0].fmr_length > 0) {
info->low_daddr = keys[0].fmr_physical + keys[0].fmr_length;
return 0;
/*
* Adjust the start of the query range if we're picking up from
* a previous round, and only emit the record if we haven't
* already gone past.
*/
key0.fmr_physical += key0.fmr_length;
if (key0.fmr_physical < rtstart_daddr) {
error = xfs_getfsmap_helper(tp, info, &frec);
if (error)
return error;
key0.fmr_physical = rtstart_daddr;
}
/* Fabricate an rmap entry for space occupied by the data dev */
error = xfs_getfsmap_helper(tp, info, &frec);
if (error)
return error;
/* Zero the other fields to avoid further adjustments. */
key0.fmr_owner = 0;
key0.fmr_offset = 0;
key0.fmr_length = 0;
}
start_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + keys[0].fmr_physical);
end_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr +
min(eofs - 1, keys[1].fmr_physical));
start_rtb = xfs_daddr_to_rtb(mp, key0.fmr_physical);
end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical));
info->missing_owner = XFS_FMR_OWN_FREE;
/*
@ -920,12 +935,12 @@ xfs_getfsmap_rtdev_rmapbt(
* low to the fsmap low key and max out the high key to the end
* of the rtgroup.
*/
info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
info->low.rm_offset = XFS_BB_TO_FSBT(mp, key0.fmr_offset);
error = xfs_fsmap_owner_to_rmap(&info->low, &key0);
if (error)
return error;
info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length);
xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, key0.fmr_length);
xfs_getfsmap_set_irec_flags(&info->low, &key0);
/* Adjust the low key if we are continuing from where we left off. */
if (info->low.rm_blockcount == 0) {

View File

@ -1089,13 +1089,7 @@ xfs_iflush_abort(
* state. Whilst the inode is in the AIL, it should have a valid buffer
* pointer for push operations to access - it is only safe to remove the
* inode from the buffer once it has been removed from the AIL.
*
* We also clear the failed bit before removing the item from the AIL
* as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer
* references the inode item owns and needs to hold until we've fully
* aborted the inode log item and detached it from the buffer.
*/
clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags);
xfs_trans_ail_delete(&iip->ili_item, 0);
/*

View File

@ -2888,7 +2888,7 @@ xlog_force_and_check_iclog(
*
* 1. the current iclog is active and has no data; the previous iclog
* is in the active or dirty state.
* 2. the current iclog is drity, and the previous iclog is in the
* 2. the current iclog is dirty, and the previous iclog is in the
* active or dirty state.
*
* We may sleep if:

View File

@ -229,6 +229,7 @@ typedef struct xfs_mount {
bool m_finobt_nores; /* no per-AG finobt resv. */
bool m_update_sb; /* sb needs update in mount */
unsigned int m_max_open_zones;
unsigned int m_zonegc_low_space;
/*
* Bitsets of per-fs metadata that have been checked and/or are sick.

View File

@ -718,8 +718,40 @@ max_open_zones_show(
}
XFS_SYSFS_ATTR_RO(max_open_zones);
/*
 * sysfs store handler for /sys/fs/xfs/<dev>/zoned/zonegc_low_space.
 *
 * Parses @buf as an unsigned integer percentage (0-100) and records it
 * in the mount's m_zonegc_low_space field, which tunes how much unused
 * space zone GC keeps available for writing (see xfs_zoned_need_gc).
 * Returns @count on success, or a negative errno on invalid input.
 */
static ssize_t
zonegc_low_space_store(
struct kobject *kobj,
const char *buf,
size_t count)
{
int ret;
unsigned int val;
/* Accept any standard base (base 0: decimal, 0x hex, 0 octal). */
ret = kstrtouint(buf, 0, &val);
if (ret)
return ret;
/* The tunable is a percentage; reject anything above 100. */
if (val > 100)
return -EINVAL;
zoned_to_mp(kobj)->m_zonegc_low_space = val;
return count;
}
/*
 * sysfs show handler for /sys/fs/xfs/<dev>/zoned/zonegc_low_space.
 *
 * Emits the mount's current GC low-space percentage as a decimal
 * followed by a newline.
 */
static ssize_t
zonegc_low_space_show(
struct kobject *kobj,
char *buf)
{
return sysfs_emit(buf, "%u\n",
zoned_to_mp(kobj)->m_zonegc_low_space);
}
XFS_SYSFS_ATTR_RW(zonegc_low_space);
static struct attribute *xfs_zoned_attrs[] = {
ATTR_LIST(max_open_zones),
ATTR_LIST(zonegc_low_space),
NULL,
};
ATTRIBUTE_GROUPS(xfs_zoned);

View File

@ -909,10 +909,9 @@ xfs_trans_ail_delete(
return;
}
/* xfs_ail_update_finish() drops the AIL lock */
xfs_clear_li_failed(lip);
clear_bit(XFS_LI_FAILED, &lip->li_flags);
tail_lsn = xfs_ail_delete_one(ailp, lip);
xfs_ail_update_finish(ailp, tail_lsn);
xfs_ail_update_finish(ailp, tail_lsn); /* drops the AIL lock */
}
int

View File

@ -167,32 +167,4 @@ xfs_trans_ail_copy_lsn(
}
#endif
static inline void
xfs_clear_li_failed(
struct xfs_log_item *lip)
{
struct xfs_buf *bp = lip->li_buf;
ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));
lockdep_assert_held(&lip->li_ailp->ail_lock);
if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) {
lip->li_buf = NULL;
xfs_buf_rele(bp);
}
}
static inline void
xfs_set_li_failed(
struct xfs_log_item *lip,
struct xfs_buf *bp)
{
lockdep_assert_held(&lip->li_ailp->ail_lock);
if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) {
xfs_buf_hold(bp);
lip->li_buf = bp;
}
}
#endif /* __XFS_TRANS_PRIV_H__ */

View File

@ -1201,6 +1201,13 @@ xfs_mount_zones(
xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
iz.available + iz.reclaimable);
/*
* The user may configure GC to free up a percentage of unused blocks.
* By default this is 0. GC will always trigger at the minimum level
* for keeping max_open_zones available for data placement.
*/
mp->m_zonegc_low_space = 0;
error = xfs_zone_gc_mount(mp);
if (error)
goto out_free_zone_info;

View File

@ -162,18 +162,30 @@ struct xfs_zone_gc_data {
/*
* We aim to keep enough zones free in stock to fully use the open zone limit
* for data placement purposes.
* for data placement purposes. Additionally, the m_zonegc_low_space tunable
* can be set to make sure a fraction of the unused blocks are available for
* writing.
*/
bool
xfs_zoned_need_gc(
struct xfs_mount *mp)
{
s64 available, free;
if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
return false;
if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) <
available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);
if (available <
mp->m_groups[XG_TYPE_RTG].blocks *
(mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
return true;
free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);
if (available < mult_frac(free, mp->m_zonegc_low_space, 100))
return true;
return false;
}