io_uring/zcrx: add interface queue and refill queue

Add a new object called an interface queue (ifq) that represents a net
rx queue that has been configured for zero copy. Each ifq is registered
using a new registration opcode IORING_REGISTER_ZCRX_IFQ.

The refill queue is allocated by the kernel and mapped by userspace through a
region descriptor passed in via region_ptr (the kernel returns the mmap offset
in the descriptor), in a similar fashion to the main SQ/CQ rings. It is used by
userspace to return buffers that it is done with, which can then be re-used by
the netdev.

The main CQ ring is used to notify userspace of received data by using the
upper 16 bytes of a big (32-byte) CQE as a new struct io_uring_zcrx_cqe. Each
entry carries the offset and length of the received data.

For now, each io_uring instance only has a single ifq.
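
As an illustrative sketch (not part of this patch), userspace could decode such a
completion roughly as follows. The big_cqe field and the IORING_ZCRX_AREA_SHIFT/MASK
encoding come from the uapi additions below; that the byte count is reported in
cqe->res is an assumption here.

#include <linux/io_uring.h>

/* Hypothetical sketch: decode a zcrx completion from a 32-byte CQE (CQE32 ring). */
static void handle_zcrx_cqe(const struct io_uring_cqe *cqe)
{
        const struct io_uring_zcrx_cqe *rcqe =
                (const struct io_uring_zcrx_cqe *)cqe->big_cqe;
        __u64 area_id = rcqe->off >> IORING_ZCRX_AREA_SHIFT;
        __u64 offset = rcqe->off & ~IORING_ZCRX_AREA_MASK;
        __s32 len = cqe->res;   /* assumed: length reported in cqe->res */

        /* received data lives at <base of area area_id> + offset, for len bytes */
        (void)area_id; (void)offset; (void)len;
}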

Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: David Wei <dw@davidwei.uk>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20250215000947.789731-2-dw@davidwei.uk
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Author: David Wei, 2025-02-14 16:09:36 -08:00 (committed by Jens Axboe)
parent 5c496ff11d
commit 6f377873cb
10 changed files with 260 additions and 1 deletion

Kconfig

@ -30,3 +30,5 @@ source "lib/Kconfig"
source "lib/Kconfig.debug"
source "Documentation/Kconfig"
source "io_uring/KConfig"

include/linux/io_uring_types.h

@ -40,6 +40,8 @@ enum io_uring_cmd_flags {
        IO_URING_F_TASK_DEAD = (1 << 13),
};

struct io_zcrx_ifq;

struct io_wq_work_node {
        struct io_wq_work_node *next;
};
@ -382,6 +384,8 @@ struct io_ring_ctx {
        struct wait_queue_head poll_wq;
        struct io_restriction restrictions;

        struct io_zcrx_ifq *ifq;

        u32 pers_next;
        struct xarray personalities;
@ -434,6 +438,8 @@ struct io_ring_ctx {
        struct io_mapped_region ring_region;
        /* used for optimised request parameter and wait argument passing */
        struct io_mapped_region param_region;
        /* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */
        struct io_mapped_region zcrx_region;
};
/*

include/uapi/linux/io_uring.h

@ -639,7 +639,8 @@ enum io_uring_register_op {
        /* send MSG_RING without having a ring */
        IORING_REGISTER_SEND_MSG_RING = 31,
        /* 32 reserved for zc rx */
        /* register a netdev hw rx queue for zerocopy */
        IORING_REGISTER_ZCRX_IFQ = 32,
        /* resize CQ ring */
        IORING_REGISTER_RESIZE_RINGS = 33,
@ -956,6 +957,46 @@ enum io_uring_socket_op {
        SOCKET_URING_OP_SETSOCKOPT,
};

/* Zero copy receive refill queue entry */
struct io_uring_zcrx_rqe {
        __u64 off;
        __u32 len;
        __u32 __pad;
};

struct io_uring_zcrx_cqe {
        __u64 off;
        __u64 __pad;
};

/* The bit from which area id is encoded into offsets */
#define IORING_ZCRX_AREA_SHIFT 48
#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1))

struct io_uring_zcrx_offsets {
        __u32 head;
        __u32 tail;
        __u32 rqes;
        __u32 __resv2;
        __u64 __resv[2];
};

/*
 * Argument for IORING_REGISTER_ZCRX_IFQ
 */
struct io_uring_zcrx_ifq_reg {
        __u32 if_idx;
        __u32 if_rxq;
        __u32 rq_entries;
        __u32 flags;

        __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */
        __u64 region_ptr; /* struct io_uring_region_desc * */

        struct io_uring_zcrx_offsets offsets;
        __u64 __resv[4];
};
#ifdef __cplusplus
}
#endif
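
For orientation, a userspace registration could look roughly like the sketch below.
It is not part of this patch: the raw syscall wrapper, the page-size rounding, and
treating the region as kernel-allocated memory whose mmap offset comes back in
rd.mmap_offset are assumptions on top of the structures above, and the area pointed
to by area_ptr (struct io_uring_zcrx_area_reg) is only introduced by a later patch.

/* Hypothetical sketch: register a zcrx ifq and map its refill queue. */
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int register_zcrx(int ring_fd, uint32_t ifindex, uint32_t rxq,
                         uint32_t rq_entries, uint64_t area_ptr,
                         struct io_uring_zcrx_ifq_reg *reg, void **rq_mem,
                         size_t *rq_size)
{
        /* ring_fd must come from io_uring_setup() with DEFER_TASKRUN | CQE32 */
        struct io_uring_region_desc rd;

        memset(reg, 0, sizeof(*reg));
        memset(&rd, 0, sizeof(rd));

        /* room for the kernel's head/tail header followed by the rqe array */
        *rq_size = 4096 + rq_entries * sizeof(struct io_uring_zcrx_rqe);
        *rq_size = (*rq_size + 4095) & ~(size_t)4095;   /* assumes 4K pages */
        rd.size = *rq_size;

        reg->if_idx = ifindex;          /* netdev ifindex */
        reg->if_rxq = rxq;              /* hw rx queue to attach to */
        reg->rq_entries = rq_entries;
        reg->area_ptr = area_ptr;       /* area registration, later patch */
        reg->region_ptr = (uint64_t)(uintptr_t)&rd;

        if (syscall(__NR_io_uring_register, ring_fd,
                    IORING_REGISTER_ZCRX_IFQ, reg, 1))
                return -1;

        /* the kernel wrote back reg->offsets and rd.mmap_offset */
        *rq_mem = mmap(NULL, rd.size, PROT_READ | PROT_WRITE,
                       MAP_SHARED | MAP_POPULATE, ring_fd, rd.mmap_offset);
        return *rq_mem == MAP_FAILED ? -1 : 0;
}

The caller keeps reg around because reg->offsets, filled in by the kernel, is what
locates the refill head, tail and rqe array inside the mapping; that part is shown
after the zcrx.c listing below.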

io_uring/KConfig (new file)

@ -0,0 +1,10 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# io_uring configuration
#
config IO_URING_ZCRX
        def_bool y
        depends on PAGE_POOL
        depends on INET
        depends on NET_RX_BUSY_POLL

io_uring/Makefile

@ -14,6 +14,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
                                epoll.o statx.o timeout.o fdinfo.o \
                                cancel.o waitid.o register.o \
                                truncate.o memmap.o alloc_cache.o
obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o

io_uring/io_uring.c

@ -97,6 +97,7 @@
#include "uring_cmd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"
#include "timeout.h"
#include "poll.h"
@ -2700,6 +2701,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
        mutex_lock(&ctx->uring_lock);
        io_sqe_buffers_unregister(ctx);
        io_sqe_files_unregister(ctx);
        io_unregister_zcrx_ifqs(ctx);
        io_cqring_overflow_kill(ctx);
        io_eventfd_unregister(ctx);
        io_free_alloc_caches(ctx);
@ -2859,6 +2861,11 @@ static __cold void io_ring_exit_work(struct work_struct *work)
                        io_cqring_overflow_kill(ctx);
                        mutex_unlock(&ctx->uring_lock);
                }

                if (ctx->ifq) {
                        mutex_lock(&ctx->uring_lock);
                        io_shutdown_zcrx_ifqs(ctx);
                        mutex_unlock(&ctx->uring_lock);
                }

                if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
                        io_move_task_work_from_local(ctx);

io_uring/memmap.h

@ -2,6 +2,7 @@
#define IO_URING_MEMMAP_H

#define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL
#define IORING_MAP_OFF_ZCRX_REGION 0x30000000ULL

struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);

io_uring/register.c

@ -30,6 +30,7 @@
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
                                IORING_REGISTER_LAST + IORING_OP_LAST)
@ -813,6 +814,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
                        break;
                ret = io_register_clone_buffers(ctx, arg);
                break;
        case IORING_REGISTER_ZCRX_IFQ:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_register_zcrx_ifq(ctx, arg);
                break;
        case IORING_REGISTER_RESIZE_RINGS:
                ret = -EINVAL;
                if (!arg || nr_args != 1)

io_uring/zcrx.c (new file)

@ -0,0 +1,149 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "kbuf.h"
#include "memmap.h"
#include "zcrx.h"

#define IO_RQ_MAX_ENTRIES 32768

static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
                                 struct io_uring_zcrx_ifq_reg *reg,
                                 struct io_uring_region_desc *rd)
{
        size_t off, size;
        void *ptr;
        int ret;

        off = sizeof(struct io_uring);
        size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
        if (size > rd->size)
                return -EINVAL;

        ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd,
                                         IORING_MAP_OFF_ZCRX_REGION);
        if (ret < 0)
                return ret;

        ptr = io_region_get_ptr(&ifq->ctx->zcrx_region);
        ifq->rq_ring = (struct io_uring *)ptr;
        ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
        return 0;
}

static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
{
        io_free_region(ifq->ctx, &ifq->ctx->zcrx_region);
        ifq->rq_ring = NULL;
        ifq->rqes = NULL;
}

static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
{
        struct io_zcrx_ifq *ifq;

        ifq = kzalloc(sizeof(*ifq), GFP_KERNEL);
        if (!ifq)
                return NULL;
        ifq->if_rxq = -1;
        ifq->ctx = ctx;
        return ifq;
}

static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
{
        io_free_rbuf_ring(ifq);
        kfree(ifq);
}

int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
                         struct io_uring_zcrx_ifq_reg __user *arg)
{
        struct io_uring_zcrx_ifq_reg reg;
        struct io_uring_region_desc rd;
        struct io_zcrx_ifq *ifq;
        int ret;

        /*
         * 1. Interface queue allocation.
         * 2. It can observe data destined for sockets of other tasks.
         */
        if (!capable(CAP_NET_ADMIN))
                return -EPERM;

        /* mandatory io_uring features for zc rx */
        if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
              ctx->flags & IORING_SETUP_CQE32))
                return -EINVAL;
        if (ctx->ifq)
                return -EBUSY;
        if (copy_from_user(&reg, arg, sizeof(reg)))
                return -EFAULT;
        if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
                return -EFAULT;
        if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
                return -EINVAL;
        if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
                return -EINVAL;
        if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
                if (!(ctx->flags & IORING_SETUP_CLAMP))
                        return -EINVAL;
                reg.rq_entries = IO_RQ_MAX_ENTRIES;
        }
        reg.rq_entries = roundup_pow_of_two(reg.rq_entries);

        if (!reg.area_ptr)
                return -EFAULT;

        ifq = io_zcrx_ifq_alloc(ctx);
        if (!ifq)
                return -ENOMEM;

        ret = io_allocate_rbuf_ring(ifq, &reg, &rd);
        if (ret)
                goto err;

        ifq->rq_entries = reg.rq_entries;
        ifq->if_rxq = reg.if_rxq;

        reg.offsets.rqes = sizeof(struct io_uring);
        reg.offsets.head = offsetof(struct io_uring, head);
        reg.offsets.tail = offsetof(struct io_uring, tail);

        if (copy_to_user(arg, &reg, sizeof(reg)) ||
            copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd))) {
                ret = -EFAULT;
                goto err;
        }
        ctx->ifq = ifq;
        return 0;
err:
        io_zcrx_ifq_free(ifq);
        return ret;
}

void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
        struct io_zcrx_ifq *ifq = ctx->ifq;

        lockdep_assert_held(&ctx->uring_lock);

        if (!ifq)
                return;

        ctx->ifq = NULL;
        io_zcrx_ifq_free(ifq);
}

void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
{
        lockdep_assert_held(&ctx->uring_lock);
}
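
Tying this together from the userspace side, a hedged sketch of driving the refill
queue is shown below: the head/tail/rqe pointers are derived from the offsets the
kernel wrote back during registration (see the registration sketch above), and a
buffer is returned by filling an rqe and publishing the new tail with a release
store. The zcrx_rq struct, the helper names and the use of GCC/Clang __atomic
builtins are illustrative assumptions, not interfaces provided by this patch.

/* Hypothetical userspace refill-queue helpers (not part of this patch). */
#include <stdint.h>
#include <linux/io_uring.h>

struct zcrx_rq {
        uint32_t *khead;        /* consumer index, advanced by the kernel */
        uint32_t *ktail;        /* producer index, advanced by userspace */
        struct io_uring_zcrx_rqe *rqes;
        uint32_t mask;          /* rq_entries - 1; rq_entries is a power of two */
};

static void zcrx_rq_init(struct zcrx_rq *rq, void *rq_mem,
                         const struct io_uring_zcrx_ifq_reg *reg)
{
        rq->khead = (uint32_t *)((char *)rq_mem + reg->offsets.head);
        rq->ktail = (uint32_t *)((char *)rq_mem + reg->offsets.tail);
        rq->rqes = (struct io_uring_zcrx_rqe *)((char *)rq_mem + reg->offsets.rqes);
        rq->mask = reg->rq_entries - 1;
}

/* Return one buffer: area_off/len describe the chunk within area area_id. */
static int zcrx_rq_refill(struct zcrx_rq *rq, uint64_t area_id,
                          uint64_t area_off, uint32_t len)
{
        uint32_t head = __atomic_load_n(rq->khead, __ATOMIC_ACQUIRE);
        uint32_t tail = *rq->ktail;
        struct io_uring_zcrx_rqe *rqe;

        if (tail - head > rq->mask)     /* refill ring is full */
                return -1;

        rqe = &rq->rqes[tail & rq->mask];
        rqe->off = (area_id << IORING_ZCRX_AREA_SHIFT) | area_off;
        rqe->len = len;
        rqe->__pad = 0;

        /* make the rqe visible before publishing the new tail to the kernel */
        __atomic_store_n(rq->ktail, tail + 1, __ATOMIC_RELEASE);
        return 0;
}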

io_uring/zcrx.h (new file)

@ -0,0 +1,35 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef IOU_ZC_RX_H
#define IOU_ZC_RX_H
#include <linux/io_uring_types.h>

struct io_zcrx_ifq {
        struct io_ring_ctx *ctx;
        struct io_uring *rq_ring;
        struct io_uring_zcrx_rqe *rqes;
        u32 rq_entries;
        u32 if_rxq;
};

#if defined(CONFIG_IO_URING_ZCRX)
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
                         struct io_uring_zcrx_ifq_reg __user *arg);
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx);
#else
static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
                                       struct io_uring_zcrx_ifq_reg __user *arg)
{
        return -EOPNOTSUPP;
}
static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
}
static inline void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
{
}
#endif

#endif