io_uring-6.15-20250418

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmgCWuYQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpp7PD/4rqs3cAj0cRD8A1EkU+UQC0U3hA73uyFzF
 ovHmC7bdfm08m+swx1eIwP4Ke2SuHpR/seCQwQoS058Jq5UageK4Lpi/TL9B9gyG
 NOauvNUO8bd1DB07r887jwF9xKLugkNmdmEUZqpkMMl364xgLZntAUebt7jzRkgp
 xJk2BBgNmDNRebvE6RkP6RPO6GvshzYIDkZmDKO3mXtgkQUO2eG7TeBT9SFlbYo4
 UyLsxL9PlZve4fHqJRORMSx9gnfKoO3NC7r2/K3ULHmftZSEy+fvHGYo6i46n6LQ
 wzZ/0DJw9jXz7twtKib2LjN1fxAA1rGy8ulaMWvP0rADhc2CMw3YMGmnEeaN0miw
 DCewKWsJk14ht07zdZmyTNshd09BPg+WL+6huXsIEVXHBo9LfHcuoWkLUGrqp6d8
 JBjoMi42gz5tXGy5uh1wJaEr5Mo8PRND2QWZoRXL4ETQX0V48HrrUY1lHO0KFoZt
 rK8zSols85A7h9jYbxUWeHe9tmddGU5Os5AuOB9fC6jJZ106ky2+1E0EpPn3Rkwh
 nV9UVdw4yAfdXZwHF9QgRZvvrVtugLk9t800uvcTdvG/e/5gNp7Zbc9tGjBKqRB+
 iT7911a2V0BBU5KdF8JpTeRBzwMpo2FQoEC2y/UvUl9Z6PQMCmpPTfjuSGRWS0gN
 RB2f6QO5Dw==
 =iVgS
 -----END PGP SIGNATURE-----

Merge tag 'io_uring-6.15-20250418' of git://git.kernel.dk/linux

Pull io_uring fixes from Jens Axboe:

 - Correctly cap iov_iter->nr_segs for imports of registered buffers,
   both kbuf and normal ones.

   Three cleanups to make the code saner first, then one fix for each of
   the two buffer types.

   This fixes a performance regression where partial buffer usage didn't
   trim the trailing segments, leaving the block layer to iterate over
   every segment of an IO just to check whether it needs splitting (a
   sketch of the idea follows this list).

 - Two patches tweaking the newly introduced zero-copy rx API, mostly to
   keep the API consistent once support for multiple interface queues per
   ring lands in the 6.16 release (a usage sketch follows the shortlog
   below).

 - Zero-copy rx unmapping fix for a dead device
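
As a concrete illustration of the first fix above: after a partial import,
only the segments the transfer actually touches should be counted. Below is
a minimal, hedged sketch of that idea (illustrative names only, not the
kernel API; assumes 0 < len <= total segment length), mirroring the
while-loop that io_import_kbuf() uses in the diff further down:

	#include <stddef.h>

	struct seg {
		size_t len;
	};

	/* Number of leading segments a transfer of 'len' bytes touches. */
	static size_t segs_needed(const struct seg *segs, size_t nr_segs,
				  size_t len)
	{
		size_t i;

		for (i = 0; i < nr_segs && len > segs[i].len; i++)
			len -= segs[i].len;
		return i + 1;	/* last segment partially (or fully) used */
	}

Capping the segment count this way means a consumer such as the block layer
never has to walk the unused tail of the array when deciding whether an IO
needs splitting.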

* tag 'io_uring-6.15-20250418' of git://git.kernel.dk/linux:
  io_uring/zcrx: fix late dma unmap for a dead dev
  io_uring/rsrc: ensure segments counts are correct on kbuf buffers
  io_uring/rsrc: send exact nr_segs for fixed buffer
  io_uring/rsrc: refactor io_import_fixed
  io_uring/rsrc: separate kbuf offset adjustments
  io_uring/rsrc: don't skip offset calculation
  io_uring/zcrx: add pp to ifq conversion helper
  io_uring/zcrx: return ifq id to the user
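
As a hedged userspace sketch of the "return ifq id to the user" change: on
successful registration the kernel now writes the queue's id back into the
new zcrx_id field shown in the uapi diff below. Area/region setup is elided
here, so this shows only the shape of the call, not a complete registration:

	#include <linux/io_uring.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <stdio.h>

	static int register_zcrx(int ring_fd, struct io_uring_zcrx_ifq_reg *reg)
	{
		int ret = syscall(__NR_io_uring_register, ring_fd,
				  IORING_REGISTER_ZCRX_IFQ, reg, 1);

		if (ret < 0)
			return ret;
		/* The id lets future multi-queue users (planned for 6.16)
		 * refer to a specific interface queue. */
		printf("zcrx ifq registered, id %u\n", reg->zcrx_id);
		return 0;
	}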
commit b1011b2b45
Author: Linus Torvalds
Date:   2025-04-18 09:13:52 -07:00

4 changed files with 79 additions and 55 deletions

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -1010,7 +1010,9 @@ struct io_uring_zcrx_ifq_reg {
 	__u64	region_ptr; /* struct io_uring_region_desc * */
 
 	struct io_uring_zcrx_offsets offsets;
-	__u64	__resv[4];
+	__u32	zcrx_id;
+	__u32	__resv2;
+	__u64	__resv[3];
 };
 
 #ifdef __cplusplus

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c

@@ -1032,10 +1032,33 @@ static int validate_fixed_range(u64 buf_addr, size_t len,
 	return 0;
 }
 
+static int io_import_kbuf(int ddir, struct iov_iter *iter,
+			  struct io_mapped_ubuf *imu, size_t len, size_t offset)
+{
+	size_t count = len + offset;
+
+	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
+	iov_iter_advance(iter, offset);
+
+	if (count < imu->len) {
+		const struct bio_vec *bvec = iter->bvec;
+
+		while (len > bvec->bv_len) {
+			len -= bvec->bv_len;
+			bvec++;
+		}
+		iter->nr_segs = 1 + bvec - iter->bvec;
+	}
+	return 0;
+}
+
 static int io_import_fixed(int ddir, struct iov_iter *iter,
 			   struct io_mapped_ubuf *imu,
 			   u64 buf_addr, size_t len)
 {
+	const struct bio_vec *bvec;
+	size_t folio_mask;
+	unsigned nr_segs;
 	size_t offset;
 	int ret;
@@ -1047,56 +1070,35 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
 	if (!(imu->dir & (1 << ddir)))
 		return -EFAULT;
 
-	/*
-	 * Might not be a start of buffer, set size appropriately
-	 * and advance us to the beginning.
-	 */
 	offset = buf_addr - imu->ubuf;
-	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
 
-	if (offset) {
-		/*
-		 * Don't use iov_iter_advance() here, as it's really slow for
-		 * using the latter parts of a big fixed buffer - it iterates
-		 * over each segment manually. We can cheat a bit here for user
-		 * registered nodes, because we know that:
-		 *
-		 * 1) it's a BVEC iter, we set it up
-		 * 2) all bvecs are the same in size, except potentially the
-		 *    first and last bvec
-		 *
-		 * So just find our index, and adjust the iterator afterwards.
-		 * If the offset is within the first bvec (or the whole first
-		 * bvec, just use iov_iter_advance(). This makes it easier
-		 * since we can just skip the first segment, which may not
-		 * be folio_size aligned.
-		 */
-		const struct bio_vec *bvec = imu->bvec;
+	if (imu->is_kbuf)
+		return io_import_kbuf(ddir, iter, imu, len, offset);
 
-		/*
-		 * Kernel buffer bvecs, on the other hand, don't necessarily
-		 * have the size property of user registered ones, so we have
-		 * to use the slow iter advance.
-		 */
-		if (offset < bvec->bv_len) {
-			iter->count -= offset;
-			iter->iov_offset = offset;
-		} else if (imu->is_kbuf) {
-			iov_iter_advance(iter, offset);
-		} else {
-			unsigned long seg_skip;
+	/*
+	 * Don't use iov_iter_advance() here, as it's really slow for
+	 * using the latter parts of a big fixed buffer - it iterates
+	 * over each segment manually. We can cheat a bit here for user
+	 * registered nodes, because we know that:
+	 *
+	 * 1) it's a BVEC iter, we set it up
+	 * 2) all bvecs are the same in size, except potentially the
+	 *    first and last bvec
+	 */
+	folio_mask = (1UL << imu->folio_shift) - 1;
+	bvec = imu->bvec;
+	if (offset >= bvec->bv_len) {
+		unsigned long seg_skip;
 
-			/* skip first vec */
-			offset -= bvec->bv_len;
-			seg_skip = 1 + (offset >> imu->folio_shift);
-
-			iter->bvec += seg_skip;
-			iter->nr_segs -= seg_skip;
-			iter->count -= bvec->bv_len + offset;
-			iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
-		}
-	}
-
+		/* skip first vec */
+		offset -= bvec->bv_len;
+		seg_skip = 1 + (offset >> imu->folio_shift);
+		bvec += seg_skip;
+		offset &= folio_mask;
+	}
+	nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
+	iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
+	iter->iov_offset = offset;
 	return 0;
 }

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c

@@ -26,6 +26,11 @@
 #include "zcrx.h"
 #include "rsrc.h"
 
+static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
+{
+	return pp->mp_priv;
+}
+
 #define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
 
 static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
@@ -46,14 +51,21 @@ static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
 
 static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
 {
+	guard(mutex)(&ifq->dma_lock);
+
 	if (area->is_mapped)
 		__io_zcrx_unmap_area(ifq, area, area->nia.num_niovs);
+	area->is_mapped = false;
 }
 
 static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
 {
 	int i;
 
+	guard(mutex)(&ifq->dma_lock);
+	if (area->is_mapped)
+		return 0;
+
 	for (i = 0; i < area->nia.num_niovs; i++) {
 		struct net_iov *niov = &area->nia.niovs[i];
 		dma_addr_t dma;
@@ -275,6 +287,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
 	ifq->ctx = ctx;
 	spin_lock_init(&ifq->lock);
 	spin_lock_init(&ifq->rq_lock);
+	mutex_init(&ifq->dma_lock);
 	return ifq;
 }
 
@@ -324,6 +337,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
 		put_device(ifq->dev);
 
 	io_free_rbuf_ring(ifq);
+	mutex_destroy(&ifq->dma_lock);
 	kfree(ifq);
 }
@@ -354,7 +368,8 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 		return -EFAULT;
 	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
 		return -EFAULT;
-	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
+	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)) ||
+	    reg.__resv2 || reg.zcrx_id)
 		return -EINVAL;
 	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
 		return -EINVAL;
@@ -394,10 +409,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 		goto err;
 	get_device(ifq->dev);
 
-	ret = io_zcrx_map_area(ifq, ifq->area);
-	if (ret)
-		goto err;
-
 	mp_param.mp_ops = &io_uring_pp_zc_ops;
 	mp_param.mp_priv = ifq;
 	ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param);
@@ -585,7 +596,7 @@ static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
 
 static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
 {
-	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
 
 	/* pp should already be ensuring that */
 	if (unlikely(pp->alloc.count))
@@ -617,7 +628,8 @@ static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
 
 static int io_pp_zc_init(struct page_pool *pp)
 {
-	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
+	int ret;
 
 	if (WARN_ON_ONCE(!ifq))
 		return -EINVAL;
@@ -630,13 +642,17 @@ static int io_pp_zc_init(struct page_pool *pp)
 	if (pp->p.dma_dir != DMA_FROM_DEVICE)
 		return -EOPNOTSUPP;
 
+	ret = io_zcrx_map_area(ifq, ifq->area);
+	if (ret)
+		return ret;
+
 	percpu_ref_get(&ifq->ctx->refs);
 	return 0;
 }
 
 static void io_pp_zc_destroy(struct page_pool *pp)
 {
-	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
 	struct io_zcrx_area *area = ifq->area;
 
 	if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
@@ -665,6 +681,9 @@ static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
 	struct io_zcrx_ifq *ifq = mp_priv;
 
 	io_zcrx_drop_netdev(ifq);
+	if (ifq->area)
+		io_zcrx_unmap_area(ifq, ifq->area);
+
 	p->mp_ops = NULL;
 	p->mp_priv = NULL;
 }
@@ -791,7 +810,7 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 
 	niov = netmem_to_net_iov(frag->netmem);
 	if (niov->pp->mp_ops != &io_uring_pp_zc_ops ||
-	    niov->pp->mp_priv != ifq)
+	    io_pp_to_ifq(niov->pp) != ifq)
 		return -EFAULT;
 
 	if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))

diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h

@@ -38,6 +38,7 @@ struct io_zcrx_ifq {
 	struct net_device		*netdev;
 	netdevice_tracker		netdev_tracker;
 	spinlock_t			lock;
+	struct mutex			dma_lock;
 };
 
 #if defined(CONFIG_IO_URING_ZCRX)
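
A note on the locking pattern used in the zcrx fix: guard(mutex)() from
<linux/cleanup.h> acquires the mutex and releases it automatically when the
enclosing scope exits, which is why io_zcrx_map_area() and
io_zcrx_unmap_area() above need no explicit unlock. Below is a minimal
sketch of the same pattern against a hypothetical object (not the kernel
code above):

	#include <linux/cleanup.h>
	#include <linux/mutex.h>

	/* Hypothetical object, mirroring how ifq->dma_lock serializes
	 * map/unmap in the zcrx diff above. */
	struct demo_area {
		struct mutex lock;
		bool is_mapped;
	};

	static int demo_map(struct demo_area *a)
	{
		guard(mutex)(&a->lock);	/* dropped on any return path */

		if (a->is_mapped)	/* make mapping idempotent */
			return 0;
		a->is_mapped = true;
		return 0;
	}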