mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/
synced 2025-04-19 20:58:31 +09:00
XFS fixes for 6.15-rc3
Signed-off-by: Carlos Maiolino <cem@kernel.org> -----BEGIN PGP SIGNATURE----- iJUEABMJAB0WIQSmtYVZ/MfVMGUq1GNcsMJ8RxYuYwUCaADqUgAKCRBcsMJ8RxYu Y+7RAX4zY9d/ub0PfJ+/Ml1LyN2deqrKCtjEJHYaMhrFfp+3b92Tn8eBiDGF1VlG qNoNcbMBgIMk5SPu1qimUOIo0IZO3bfMz/wJwvEPKxdw6pCB8rRCBfUOZ0j7tIXv RP1NX2PIew== =0eib -----END PGP SIGNATURE----- Merge tag 'xfs-fixes-6.15-rc3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux Pull XFS fixes from Carlos Maiolino: "This mostly includes fixes and documentation for the zoned allocator feature merged during previous merge window, but it also adds a sysfs tunable for the zone garbage collector. There is also a fix for a regression to the RT device that we'd like to fix ASAP now that we're getting more users on the RT zoned allocator" * tag 'xfs-fixes-6.15-rc3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: xfs: document zoned rt specifics in admin-guide xfs: fix fsmap for internal zoned devices xfs: Fix spelling mistake "drity" -> "dirty" xfs: compute buffer address correctly in xmbuf_map_backing_mem xfs: add tunable threshold parameter for triggering zone GC xfs: mark xfs_buf_free as might_sleep() xfs: remove the leftover xfs_{set,clear}_li_failed infrastructure
This commit is contained in:
commit
096384deed
@ -124,6 +124,14 @@ When mounting an XFS filesystem, the following options are accepted.
|
||||
controls the size of each buffer and so is also relevant to
|
||||
this case.
|
||||
|
||||
lifetime (default) or nolifetime
|
||||
Enable data placement based on write lifetime hints provided
|
||||
by the user. This turns on co-allocation of data of similar
|
||||
lifetimes when statistically favorable to reduce garbage
|
||||
collection cost.
|
||||
|
||||
These options are only available for zoned rt file systems.
|
||||
|
||||
logbsize=value
|
||||
Set the size of each in-memory log buffer. The size may be
|
||||
specified in bytes, or in kilobytes with a "k" suffix.
|
||||
@ -143,6 +151,14 @@ When mounting an XFS filesystem, the following options are accepted.
|
||||
optional, and the log section can be separate from the data
|
||||
section or contained within it.
|
||||
|
||||
max_open_zones=value
|
||||
Specify the max number of zones to keep open for writing on a
|
||||
zoned rt device. Many open zones aid file data separation
|
||||
but may impact performance on HDDs.
|
||||
|
||||
If ``max_open_zones`` is not specified, the value is determined
|
||||
by the capabilities and the size of the zoned rt device.
|
||||
|
||||
noalign
|
||||
Data allocations will not be aligned at stripe unit
|
||||
boundaries. This is only relevant to filesystems created
|
||||
@ -542,3 +558,37 @@ The interesting knobs for XFS workqueues are as follows:
|
||||
nice Relative priority of scheduling the threads. These are the
|
||||
same nice levels that can be applied to userspace processes.
|
||||
============ ===========
|
||||
|
||||
Zoned Filesystems
|
||||
=================
|
||||
|
||||
For zoned file systems, the following attribute is exposed in:
|
||||
|
||||
/sys/fs/xfs/<dev>/zoned/
|
||||
|
||||
max_open_zones (Min: 1 Default: Varies Max: UINT_MAX)
|
||||
This read-only attribute exposes the maximum number of open zones
|
||||
available for data placement. The value is determined at mount time and
|
||||
is limited by the capabilities of the backing zoned device, file system
|
||||
size and the max_open_zones mount option.
|
||||
|
||||
Zoned Filesystems
|
||||
=================
|
||||
|
||||
For zoned file systems, the following attributes are exposed in:
|
||||
|
||||
/sys/fs/xfs/<dev>/zoned/
|
||||
|
||||
max_open_zones (Min: 1 Default: Varies Max: UINT_MAX)
|
||||
This read-only attribute exposes the maximum number of open zones
|
||||
available for data placement. The value is determined at mount time and
|
||||
is limited by the capabilities of the backing zoned device, file system
|
||||
size and the max_open_zones mount option.
|
||||
|
||||
zonegc_low_space (Min: 0 Default: 0 Max: 100)
|
||||
Define the percentage of the unused space that GC should keep
|
||||
available for writing. A high value will reclaim more of the space
|
||||
occupied by unused blocks, creating a larger buffer against write
|
||||
bursts at the cost of increased write amplification. Regardless
|
||||
of this value, garbage collection will always aim to free a minimum
|
||||
amount of blocks to keep max_open_zones open for data placement purposes.
|
||||
|
@ -105,6 +105,7 @@ xfs_buf_free(
|
||||
{
|
||||
unsigned int size = BBTOB(bp->b_length);
|
||||
|
||||
might_sleep();
|
||||
trace_xfs_buf_free(bp, _RET_IP_);
|
||||
|
||||
ASSERT(list_empty(&bp->b_lru));
|
||||
|
@ -165,7 +165,7 @@ xmbuf_map_backing_mem(
|
||||
folio_set_dirty(folio);
|
||||
folio_unlock(folio);
|
||||
|
||||
bp->b_addr = folio_address(folio);
|
||||
bp->b_addr = folio_address(folio) + offset_in_folio(folio, pos);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -1186,9 +1186,8 @@ xfs_qm_dqflush_done(
|
||||
if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) &&
|
||||
(lip->li_lsn == qlip->qli_flush_lsn ||
|
||||
test_bit(XFS_LI_FAILED, &lip->li_flags))) {
|
||||
|
||||
spin_lock(&ailp->ail_lock);
|
||||
xfs_clear_li_failed(lip);
|
||||
clear_bit(XFS_LI_FAILED, &lip->li_flags);
|
||||
if (lip->li_lsn == qlip->qli_flush_lsn) {
|
||||
/* xfs_ail_update_finish() drops the AIL lock */
|
||||
tail_lsn = xfs_ail_delete_one(ailp, lip);
|
||||
|
@ -876,6 +876,7 @@ xfs_getfsmap_rtdev_rmapbt(
|
||||
const struct xfs_fsmap *keys,
|
||||
struct xfs_getfsmap_info *info)
|
||||
{
|
||||
struct xfs_fsmap key0 = *keys; /* struct copy */
|
||||
struct xfs_mount *mp = tp->t_mountp;
|
||||
struct xfs_rtgroup *rtg = NULL;
|
||||
struct xfs_btree_cur *bt_cur = NULL;
|
||||
@ -887,32 +888,46 @@ xfs_getfsmap_rtdev_rmapbt(
|
||||
int error = 0;
|
||||
|
||||
eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks);
|
||||
if (keys[0].fmr_physical >= eofs)
|
||||
if (key0.fmr_physical >= eofs)
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* On zoned filesystems with an internal rt volume, the volume comes
|
||||
* immediately after the end of the data volume. However, the
|
||||
* xfs_rtblock_t address space is relative to the start of the data
|
||||
* device, which means that the first @rtstart fsblocks do not actually
|
||||
* point anywhere. If a fsmap query comes in with the low key starting
|
||||
* below @rtstart, report it as "owned by filesystem".
|
||||
*/
|
||||
rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart);
|
||||
if (keys[0].fmr_physical < rtstart_daddr) {
|
||||
if (xfs_has_zoned(mp) && key0.fmr_physical < rtstart_daddr) {
|
||||
struct xfs_fsmap_irec frec = {
|
||||
.owner = XFS_RMAP_OWN_FS,
|
||||
.len_daddr = rtstart_daddr,
|
||||
};
|
||||
|
||||
/* Adjust the low key if we are continuing from where we left off. */
|
||||
if (keys[0].fmr_length > 0) {
|
||||
info->low_daddr = keys[0].fmr_physical + keys[0].fmr_length;
|
||||
return 0;
|
||||
/*
|
||||
* Adjust the start of the query range if we're picking up from
|
||||
* a previous round, and only emit the record if we haven't
|
||||
* already gone past.
|
||||
*/
|
||||
key0.fmr_physical += key0.fmr_length;
|
||||
if (key0.fmr_physical < rtstart_daddr) {
|
||||
error = xfs_getfsmap_helper(tp, info, &frec);
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
key0.fmr_physical = rtstart_daddr;
|
||||
}
|
||||
|
||||
/* Fabricate an rmap entry for space occupied by the data dev */
|
||||
error = xfs_getfsmap_helper(tp, info, &frec);
|
||||
if (error)
|
||||
return error;
|
||||
/* Zero the other fields to avoid further adjustments. */
|
||||
key0.fmr_owner = 0;
|
||||
key0.fmr_offset = 0;
|
||||
key0.fmr_length = 0;
|
||||
}
|
||||
|
||||
start_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + keys[0].fmr_physical);
|
||||
end_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr +
|
||||
min(eofs - 1, keys[1].fmr_physical));
|
||||
|
||||
start_rtb = xfs_daddr_to_rtb(mp, key0.fmr_physical);
|
||||
end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical));
|
||||
info->missing_owner = XFS_FMR_OWN_FREE;
|
||||
|
||||
/*
|
||||
@ -920,12 +935,12 @@ xfs_getfsmap_rtdev_rmapbt(
|
||||
* low to the fsmap low key and max out the high key to the end
|
||||
* of the rtgroup.
|
||||
*/
|
||||
info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
|
||||
error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
|
||||
info->low.rm_offset = XFS_BB_TO_FSBT(mp, key0.fmr_offset);
|
||||
error = xfs_fsmap_owner_to_rmap(&info->low, &key0);
|
||||
if (error)
|
||||
return error;
|
||||
info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length);
|
||||
xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
|
||||
info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, key0.fmr_length);
|
||||
xfs_getfsmap_set_irec_flags(&info->low, &key0);
|
||||
|
||||
/* Adjust the low key if we are continuing from where we left off. */
|
||||
if (info->low.rm_blockcount == 0) {
|
||||
|
@ -1089,13 +1089,7 @@ xfs_iflush_abort(
|
||||
* state. Whilst the inode is in the AIL, it should have a valid buffer
|
||||
* pointer for push operations to access - it is only safe to remove the
|
||||
* inode from the buffer once it has been removed from the AIL.
|
||||
*
|
||||
* We also clear the failed bit before removing the item from the AIL
|
||||
* as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer
|
||||
* references the inode item owns and needs to hold until we've fully
|
||||
* aborted the inode log item and detached it from the buffer.
|
||||
*/
|
||||
clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags);
|
||||
xfs_trans_ail_delete(&iip->ili_item, 0);
|
||||
|
||||
/*
|
||||
|
@ -2888,7 +2888,7 @@ xlog_force_and_check_iclog(
|
||||
*
|
||||
* 1. the current iclog is active and has no data; the previous iclog
|
||||
* is in the active or dirty state.
|
||||
* 2. the current iclog is drity, and the previous iclog is in the
|
||||
* 2. the current iclog is dirty, and the previous iclog is in the
|
||||
* active or dirty state.
|
||||
*
|
||||
* We may sleep if:
|
||||
|
@ -229,6 +229,7 @@ typedef struct xfs_mount {
|
||||
bool m_finobt_nores; /* no per-AG finobt resv. */
|
||||
bool m_update_sb; /* sb needs update in mount */
|
||||
unsigned int m_max_open_zones;
|
||||
unsigned int m_zonegc_low_space;
|
||||
|
||||
/*
|
||||
* Bitsets of per-fs metadata that have been checked and/or are sick.
|
||||
|
@ -718,8 +718,40 @@ max_open_zones_show(
|
||||
}
|
||||
XFS_SYSFS_ATTR_RO(max_open_zones);
|
||||
|
||||
/*
 * sysfs store handler for the zonegc_low_space attribute.
 *
 * Parses @buf as an unsigned int with kstrtouint() (base argument 0) and,
 * if the value is at most 100, writes it to m_zonegc_low_space of the mount
 * looked up from @kobj via zoned_to_mp().
 *
 * Returns @count on success, the kstrtouint() error if parsing fails, or
 * -EINVAL if the parsed value is greater than 100.
 */
static ssize_t
|
||||
zonegc_low_space_store(
|
||||
struct kobject *kobj,
|
||||
const char *buf,
|
||||
size_t count)
|
||||
{
|
||||
int ret;
|
||||
unsigned int val;
|
||||
|
||||
ret = kstrtouint(buf, 0, &val);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/* The tunable is documented as a percentage, so cap it at 100. */
if (val > 100)
|
||||
return -EINVAL;
|
||||
|
||||
zoned_to_mp(kobj)->m_zonegc_low_space = val;
|
||||
|
||||
/* Report the whole input as consumed. */
return count;
|
||||
}
|
||||
|
||||
/*
 * sysfs show handler for the zonegc_low_space attribute.
 *
 * Formats the current m_zonegc_low_space value of the mount looked up from
 * @kobj via zoned_to_mp() into @buf as an unsigned decimal followed by a
 * newline, and returns whatever sysfs_emit() returns.
 */
static ssize_t
|
||||
zonegc_low_space_show(
|
||||
struct kobject *kobj,
|
||||
char *buf)
|
||||
{
|
||||
return sysfs_emit(buf, "%u\n",
|
||||
zoned_to_mp(kobj)->m_zonegc_low_space);
|
||||
}
|
||||
XFS_SYSFS_ATTR_RW(zonegc_low_space);
|
||||
|
||||
/*
 * NULL-terminated list of the zoned attributes: max_open_zones and
 * zonegc_low_space.
 * NOTE(review): presumably these are the files shown under
 * /sys/fs/xfs/<dev>/zoned/ -- confirm against the kobject registration.
 */
static struct attribute *xfs_zoned_attrs[] = {
|
||||
ATTR_LIST(max_open_zones),
|
||||
ATTR_LIST(zonegc_low_space),
|
||||
NULL,
|
||||
};
|
||||
ATTRIBUTE_GROUPS(xfs_zoned);
|
||||
|
@ -909,10 +909,9 @@ xfs_trans_ail_delete(
|
||||
return;
|
||||
}
|
||||
|
||||
/* xfs_ail_update_finish() drops the AIL lock */
|
||||
xfs_clear_li_failed(lip);
|
||||
clear_bit(XFS_LI_FAILED, &lip->li_flags);
|
||||
tail_lsn = xfs_ail_delete_one(ailp, lip);
|
||||
xfs_ail_update_finish(ailp, tail_lsn);
|
||||
xfs_ail_update_finish(ailp, tail_lsn); /* drops the AIL lock */
|
||||
}
|
||||
|
||||
int
|
||||
|
@ -167,32 +167,4 @@ xfs_trans_ail_copy_lsn(
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void
|
||||
xfs_clear_li_failed(
|
||||
struct xfs_log_item *lip)
|
||||
{
|
||||
struct xfs_buf *bp = lip->li_buf;
|
||||
|
||||
ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));
|
||||
lockdep_assert_held(&lip->li_ailp->ail_lock);
|
||||
|
||||
if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) {
|
||||
lip->li_buf = NULL;
|
||||
xfs_buf_rele(bp);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
xfs_set_li_failed(
|
||||
struct xfs_log_item *lip,
|
||||
struct xfs_buf *bp)
|
||||
{
|
||||
lockdep_assert_held(&lip->li_ailp->ail_lock);
|
||||
|
||||
if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) {
|
||||
xfs_buf_hold(bp);
|
||||
lip->li_buf = bp;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* __XFS_TRANS_PRIV_H__ */
|
||||
|
@ -1201,6 +1201,13 @@ xfs_mount_zones(
|
||||
xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
|
||||
iz.available + iz.reclaimable);
|
||||
|
||||
/*
|
||||
* The user may configure GC to free up a percentage of unused blocks.
|
||||
* By default this is 0. GC will always trigger at the minimum level
|
||||
* for keeping max_open_zones available for data placement.
|
||||
*/
|
||||
mp->m_zonegc_low_space = 0;
|
||||
|
||||
error = xfs_zone_gc_mount(mp);
|
||||
if (error)
|
||||
goto out_free_zone_info;
|
||||
|
@ -162,18 +162,30 @@ struct xfs_zone_gc_data {
|
||||
|
||||
/*
|
||||
* We aim to keep enough zones free in stock to fully use the open zone limit
|
||||
* for data placement purposes.
|
||||
* for data placement purposes. Additionally, the m_zonegc_low_space tunable
|
||||
* can be set to make sure a fraction of the unused blocks are available for
|
||||
* writing.
|
||||
*/
|
||||
bool
|
||||
xfs_zoned_need_gc(
|
||||
struct xfs_mount *mp)
|
||||
{
|
||||
s64 available, free;
|
||||
|
||||
if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
|
||||
return false;
|
||||
if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) <
|
||||
|
||||
available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);
|
||||
|
||||
if (available <
|
||||
mp->m_groups[XG_TYPE_RTG].blocks *
|
||||
(mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
|
||||
return true;
|
||||
|
||||
free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);
|
||||
if (available < mult_frac(free, mp->m_zonegc_low_space, 100))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user