Mirror of https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/
io_uring-6.15-20250418
-----BEGIN PGP SIGNATURE-----

iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmgCWuYQHGF4Ym9lQGtl
cm5lbC5kawAKCRD301j7KXHgpp7PD/4rqs3cAj0cRD8A1EkU+UQC0U3hA73uyFzF
ovHmC7bdfm08m+swx1eIwP4Ke2SuHpR/seCQwQoS058Jq5UageK4Lpi/TL9B9gyG
NOauvNUO8bd1DB07r887jwF9xKLugkNmdmEUZqpkMMl364xgLZntAUebt7jzRkgp
xJk2BBgNmDNRebvE6RkP6RPO6GvshzYIDkZmDKO3mXtgkQUO2eG7TeBT9SFlbYo4
UyLsxL9PlZve4fHqJRORMSx9gnfKoO3NC7r2/K3ULHmftZSEy+fvHGYo6i46n6LQ
wzZ/0DJw9jXz7twtKib2LjN1fxAA1rGy8ulaMWvP0rADhc2CMw3YMGmnEeaN0miw
DCewKWsJk14ht07zdZmyTNshd09BPg+WL+6huXsIEVXHBo9LfHcuoWkLUGrqp6d8
JBjoMi42gz5tXGy5uh1wJaEr5Mo8PRND2QWZoRXL4ETQX0V48HrrUY1lHO0KFoZt
rK8zSols85A7h9jYbxUWeHe9tmddGU5Os5AuOB9fC6jJZ106ky2+1E0EpPn3Rkwh
nV9UVdw4yAfdXZwHF9QgRZvvrVtugLk9t800uvcTdvG/e/5gNp7Zbc9tGjBKqRB+
iT7911a2V0BBU5KdF8JpTeRBzwMpo2FQoEC2y/UvUl9Z6PQMCmpPTfjuSGRWS0gN
RB2f6QO5Dw==
=iVgS
-----END PGP SIGNATURE-----

Merge tag 'io_uring-6.15-20250418' of git://git.kernel.dk/linux

Pull io_uring fixes from Jens Axboe:

 - Correctly cap iov_iter->nr_segs for imports of registered buffers,
   both kbuf and normal ones. Three cleanups to make it saner first,
   then two fixes for each of the buffer types. This fixes a performance
   regression where partial buffer usage doesn't trim the tail number of
   segments, leading the block layer to iterate the IOs to check if it
   needs splitting.

 - Two patches tweaking the newly introduced zero-copy rx API, mostly to
   keep the API consistent once we add multiple interface queues per
   ring support in the 6.16 release.

 - zc rx unmapping fix for a dead device

* tag 'io_uring-6.15-20250418' of git://git.kernel.dk/linux:
  io_uring/zcrx: fix late dma unmap for a dead dev
  io_uring/rsrc: ensure segments counts are correct on kbuf buffers
  io_uring/rsrc: send exact nr_segs for fixed buffer
  io_uring/rsrc: refactor io_import_fixed
  io_uring/rsrc: separate kbuf offset adjustments
  io_uring/rsrc: don't skip offset calculation
  io_uring/zcrx: add pp to ifq conversion helper
  io_uring/zcrx: return ifq id to the user
commit b1011b2b45
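For context on the first bullet above, a minimal userspace sketch (hypothetical: liburing assumed installed, file path and sizes invented) of the pattern that exposed the regression -- a large registered buffer of which each I/O uses only a small prefix, so the kernel-side import has to trim the unused trailing bvec segments:

/* Hypothetical illustration, not part of this commit: a partial read into
 * a large registered (fixed) buffer. Kernel-side, io_import_fixed() now
 * caps iter->nr_segs to the segments actually covered by the read length.
 */
#include <liburing.h>
#include <stdlib.h>
#include <stdio.h>
#include <fcntl.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct iovec iov;
	int fd;

	if (io_uring_queue_init(8, &ring, 0))
		return 1;

	/* one big fixed buffer, e.g. 1 MiB backed by many pages */
	iov.iov_len = 1024 * 1024;
	if (posix_memalign(&iov.iov_base, 4096, iov.iov_len))
		return 1;
	if (io_uring_register_buffers(&ring, &iov, 1))
		return 1;

	fd = open("/tmp/testfile", O_RDONLY);	/* made-up path */
	if (fd < 0)
		return 1;

	/* read only 8 KiB of the registered buffer: a "partial" use */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read_fixed(sqe, fd, iov.iov_base, 8192, 0, 0);
	io_uring_submit(&ring);

	if (!io_uring_wait_cqe(&ring, &cqe)) {
		printf("read_fixed returned %d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}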
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -1010,7 +1010,9 @@ struct io_uring_zcrx_ifq_reg {
 	__u64	region_ptr; /* struct io_uring_region_desc * */
 
 	struct io_uring_zcrx_offsets offsets;
-	__u64	__resv[4];
+	__u32	zcrx_id;
+	__u32	__resv2;
+	__u64	__resv[3];
 };
 
 #ifdef __cplusplus
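The new zcrx_id field is copied back to userspace by the registration call, so a ring can name a specific interface queue once multiple queues per ring land in 6.16. A rough, hypothetical sketch of reading it back via the raw register syscall (assumes 6.15 uapi headers; the refill region and rx area setup a real registration needs are elided, so this call as written would fail -- it only shows where zcrx_id appears):

/* Hypothetical sketch only: a real zcrx registration must also provide a
 * filled-in refill ring region (region_ptr) and rx area (area_ptr).
 */
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

static int register_zcrx(int ring_fd, unsigned ifindex, unsigned rxq)
{
	struct io_uring_zcrx_ifq_reg reg;

	memset(&reg, 0, sizeof(reg));
	reg.if_idx = ifindex;
	reg.if_rxq = rxq;
	reg.rq_entries = 4096;
	/* reg.area_ptr = ...; reg.region_ptr = ...;  (setup elided) */

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_ZCRX_IFQ, &reg, 1) < 0)
		return -1;

	/* the kernel fills in offsets and, after this change, zcrx_id */
	printf("registered ifq, zcrx_id=%u\n", reg.zcrx_id);
	return 0;
}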
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1032,10 +1032,33 @@ static int validate_fixed_range(u64 buf_addr, size_t len,
 	return 0;
 }
 
+static int io_import_kbuf(int ddir, struct iov_iter *iter,
+			struct io_mapped_ubuf *imu, size_t len, size_t offset)
+{
+	size_t count = len + offset;
+
+	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
+	iov_iter_advance(iter, offset);
+
+	if (count < imu->len) {
+		const struct bio_vec *bvec = iter->bvec;
+
+		while (len > bvec->bv_len) {
+			len -= bvec->bv_len;
+			bvec++;
+		}
+		iter->nr_segs = 1 + bvec - iter->bvec;
+	}
+	return 0;
+}
+
 static int io_import_fixed(int ddir, struct iov_iter *iter,
 			   struct io_mapped_ubuf *imu,
 			   u64 buf_addr, size_t len)
 {
+	const struct bio_vec *bvec;
+	size_t folio_mask;
+	unsigned nr_segs;
 	size_t offset;
 	int ret;
 
@@ -1047,56 +1070,35 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
 	if (!(imu->dir & (1 << ddir)))
 		return -EFAULT;
 
-	/*
-	 * Might not be a start of buffer, set size appropriately
-	 * and advance us to the beginning.
-	 */
 	offset = buf_addr - imu->ubuf;
-	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
-
-	if (offset) {
-		/*
-		 * Don't use iov_iter_advance() here, as it's really slow for
-		 * using the latter parts of a big fixed buffer - it iterates
-		 * over each segment manually. We can cheat a bit here for user
-		 * registered nodes, because we know that:
-		 *
-		 * 1) it's a BVEC iter, we set it up
-		 * 2) all bvecs are the same in size, except potentially the
-		 *	first and last bvec
-		 *
-		 * So just find our index, and adjust the iterator afterwards.
-		 * If the offset is within the first bvec (or the whole first
-		 * bvec, just use iov_iter_advance(). This makes it easier
-		 * since we can just skip the first segment, which may not
-		 * be folio_size aligned.
-		 */
-		const struct bio_vec *bvec = imu->bvec;
+	if (imu->is_kbuf)
+		return io_import_kbuf(ddir, iter, imu, len, offset);
 
-		/*
-		 * Kernel buffer bvecs, on the other hand, don't necessarily
-		 * have the size property of user registered ones, so we have
-		 * to use the slow iter advance.
-		 */
-		if (offset < bvec->bv_len) {
-			iter->count -= offset;
-			iter->iov_offset = offset;
-		} else if (imu->is_kbuf) {
-			iov_iter_advance(iter, offset);
-		} else {
-			unsigned long seg_skip;
+	/*
+	 * Don't use iov_iter_advance() here, as it's really slow for
+	 * using the latter parts of a big fixed buffer - it iterates
+	 * over each segment manually. We can cheat a bit here for user
+	 * registered nodes, because we know that:
+	 *
+	 * 1) it's a BVEC iter, we set it up
+	 * 2) all bvecs are the same in size, except potentially the
+	 *	first and last bvec
+	 */
+	folio_mask = (1UL << imu->folio_shift) - 1;
+	bvec = imu->bvec;
+	if (offset >= bvec->bv_len) {
+		unsigned long seg_skip;
 
-			/* skip first vec */
-			offset -= bvec->bv_len;
-			seg_skip = 1 + (offset >> imu->folio_shift);
-
-			iter->bvec += seg_skip;
-			iter->nr_segs -= seg_skip;
-			iter->count -= bvec->bv_len + offset;
-			iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
-		}
+		/* skip first vec */
+		offset -= bvec->bv_len;
+		seg_skip = 1 + (offset >> imu->folio_shift);
+		bvec += seg_skip;
+		offset &= folio_mask;
 	}
 
+	nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
+	iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
+	iter->iov_offset = offset;
 	return 0;
 }
 
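To make the new segment-count formula above concrete, a standalone sketch with invented numbers (folio size, offset and length are arbitrary and not taken from the patch):

/* Standalone illustration of
 *   nr_segs = (offset + len + bv_offset + folio_mask) >> folio_shift;
 * i.e. the number of folio-sized bvecs needed to cover [offset, offset+len)
 * starting bv_offset bytes into the first folio, rounded up.
 */
#include <stdio.h>

int main(void)
{
	unsigned folio_shift = 12;			/* 4 KiB folios (assumed) */
	unsigned long folio_mask = (1UL << folio_shift) - 1;
	unsigned long offset = 1024;			/* start inside the first folio */
	unsigned long len = 10000;			/* roughly 2.5 folios of payload */
	unsigned long bv_offset = 0;

	unsigned long nr_segs =
		(offset + len + bv_offset + folio_mask) >> folio_shift;

	/* (1024 + 10000 + 0 + 4095) >> 12 = 15119 >> 12 = 3 segments */
	printf("nr_segs = %lu\n", nr_segs);
	return 0;
}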
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -26,6 +26,11 @@
 #include "zcrx.h"
 #include "rsrc.h"
 
+static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
+{
+	return pp->mp_priv;
+}
+
 #define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
 
 static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
@@ -46,14 +51,21 @@ static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
 
 static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
 {
+	guard(mutex)(&ifq->dma_lock);
+
 	if (area->is_mapped)
 		__io_zcrx_unmap_area(ifq, area, area->nia.num_niovs);
+	area->is_mapped = false;
 }
 
 static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
 {
 	int i;
 
+	guard(mutex)(&ifq->dma_lock);
+	if (area->is_mapped)
+		return 0;
+
 	for (i = 0; i < area->nia.num_niovs; i++) {
 		struct net_iov *niov = &area->nia.niovs[i];
 		dma_addr_t dma;
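The map/unmap paths above now serialize on ifq->dma_lock through the kernel's guard(mutex)() scope guard from <linux/cleanup.h>, which releases the mutex on every return path. As a loose userspace analogue only (not the kernel macro), a cleanup-attribute sketch of the same idea:

/* Userspace analogue of a scope-based mutex guard (GCC/Clang cleanup
 * attribute); the kernel's guard(mutex)() has the same effect of
 * unlocking automatically on every return path.
 */
#include <pthread.h>
#include <stdio.h>

static void unlock_cleanup(pthread_mutex_t **m)
{
	if (*m)
		pthread_mutex_unlock(*m);
}

#define GUARD(lockp) \
	pthread_mutex_t *scope_guard __attribute__((cleanup(unlock_cleanup))) = \
		(pthread_mutex_lock(lockp), (lockp))

static pthread_mutex_t dma_lock = PTHREAD_MUTEX_INITIALIZER;
static int mapped;

static int map_area(void)
{
	GUARD(&dma_lock);		/* held until the function returns */

	if (mapped)
		return 0;		/* unlock happens automatically here */
	mapped = 1;
	printf("mapped\n");
	return 0;			/* ... and here */
}

int main(void)
{
	map_area();
	map_area();
	return 0;
}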
@@ -275,6 +287,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
 	ifq->ctx = ctx;
 	spin_lock_init(&ifq->lock);
 	spin_lock_init(&ifq->rq_lock);
+	mutex_init(&ifq->dma_lock);
 	return ifq;
 }
 
@@ -324,6 +337,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
 		put_device(ifq->dev);
 
 	io_free_rbuf_ring(ifq);
+	mutex_destroy(&ifq->dma_lock);
 	kfree(ifq);
 }
 
@@ -354,7 +368,8 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 		return -EFAULT;
 	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
 		return -EFAULT;
-	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
+	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)) ||
+	    reg.__resv2 || reg.zcrx_id)
 		return -EINVAL;
 	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
 		return -EINVAL;
@@ -394,10 +409,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 		goto err;
 	get_device(ifq->dev);
 
-	ret = io_zcrx_map_area(ifq, ifq->area);
-	if (ret)
-		goto err;
-
 	mp_param.mp_ops = &io_uring_pp_zc_ops;
 	mp_param.mp_priv = ifq;
 	ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param);
@@ -585,7 +596,7 @@ static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
 
 static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
 {
-	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
 
 	/* pp should already be ensuring that */
 	if (unlikely(pp->alloc.count))
@@ -617,7 +628,8 @@ static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
 
 static int io_pp_zc_init(struct page_pool *pp)
 {
-	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
+	int ret;
 
 	if (WARN_ON_ONCE(!ifq))
 		return -EINVAL;
@@ -630,13 +642,17 @@ static int io_pp_zc_init(struct page_pool *pp)
 	if (pp->p.dma_dir != DMA_FROM_DEVICE)
 		return -EOPNOTSUPP;
 
+	ret = io_zcrx_map_area(ifq, ifq->area);
+	if (ret)
+		return ret;
+
 	percpu_ref_get(&ifq->ctx->refs);
 	return 0;
 }
 
 static void io_pp_zc_destroy(struct page_pool *pp)
 {
-	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
 	struct io_zcrx_area *area = ifq->area;
 
 	if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
@@ -665,6 +681,9 @@ static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
 	struct io_zcrx_ifq *ifq = mp_priv;
 
 	io_zcrx_drop_netdev(ifq);
+	if (ifq->area)
+		io_zcrx_unmap_area(ifq, ifq->area);
+
 	p->mp_ops = NULL;
 	p->mp_priv = NULL;
 }
@@ -791,7 +810,7 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 
 	niov = netmem_to_net_iov(frag->netmem);
 	if (niov->pp->mp_ops != &io_uring_pp_zc_ops ||
-	    niov->pp->mp_priv != ifq)
+	    io_pp_to_ifq(niov->pp) != ifq)
 		return -EFAULT;
 
 	if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -38,6 +38,7 @@ struct io_zcrx_ifq {
 	struct net_device		*netdev;
 	netdevice_tracker		netdev_tracker;
 	spinlock_t			lock;
+	struct mutex			dma_lock;
 };
 
 #if defined(CONFIG_IO_URING_ZCRX)