mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/
synced 2025-04-19 20:58:31 +09:00
io_uring/zcrx: add interface queue and refill queue
Add a new object called an interface queue (ifq) that represents a net rx queue that has been configured for zero copy. Each ifq is registered using a new registration opcode IORING_REGISTER_ZCRX_IFQ.

The refill queue is allocated by the kernel and mapped by userspace using a new offset IORING_OFF_RQ_RING, in a similar fashion to the main SQ/CQ. It is used by userspace to return buffers that it is done with, which will then be re-used by the netdev again.

The main CQ ring is used to notify userspace of received data by using the upper 16 bytes of a big CQE as a new struct io_uring_zcrx_cqe. Each entry contains the offset + len to the data.

For now, each io_uring instance only has a single ifq.

Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: David Wei <dw@davidwei.uk>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20250215000947.789731-2-dw@davidwei.uk
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
parent
5c496ff11d
commit
6f377873cb
2
Kconfig
2
Kconfig
@ -30,3 +30,5 @@ source "lib/Kconfig"
|
||||
source "lib/Kconfig.debug"
|
||||
|
||||
source "Documentation/Kconfig"
|
||||
|
||||
source "io_uring/KConfig"
|
||||
|
@ -40,6 +40,8 @@ enum io_uring_cmd_flags {
|
||||
IO_URING_F_TASK_DEAD = (1 << 13),
|
||||
};
|
||||
|
||||
struct io_zcrx_ifq;
|
||||
|
||||
/* Singly-linked list node: holds only the pointer to the following node. */
struct io_wq_work_node {
	struct io_wq_work_node	*next;
};
|
||||
@ -382,6 +384,8 @@ struct io_ring_ctx {
|
||||
struct wait_queue_head poll_wq;
|
||||
struct io_restriction restrictions;
|
||||
|
||||
struct io_zcrx_ifq *ifq;
|
||||
|
||||
u32 pers_next;
|
||||
struct xarray personalities;
|
||||
|
||||
@ -434,6 +438,8 @@ struct io_ring_ctx {
|
||||
struct io_mapped_region ring_region;
|
||||
/* used for optimised request parameter and wait argument passing */
|
||||
struct io_mapped_region param_region;
|
||||
/* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */
|
||||
struct io_mapped_region zcrx_region;
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -639,7 +639,8 @@ enum io_uring_register_op {
|
||||
/* send MSG_RING without having a ring */
|
||||
IORING_REGISTER_SEND_MSG_RING = 31,
|
||||
|
||||
/* 32 reserved for zc rx */
|
||||
/* register a netdev hw rx queue for zerocopy */
|
||||
IORING_REGISTER_ZCRX_IFQ = 32,
|
||||
|
||||
/* resize CQ ring */
|
||||
IORING_REGISTER_RESIZE_RINGS = 33,
|
||||
@ -956,6 +957,46 @@ enum io_uring_socket_op {
|
||||
SOCKET_URING_OP_SETSOCKOPT,
|
||||
};
|
||||
|
||||
/* Zero copy receive refill queue entry */
|
||||
struct io_uring_zcrx_rqe {
|
||||
__u64 off;
|
||||
__u32 len;
|
||||
__u32 __pad;
|
||||
};
|
||||
|
||||
struct io_uring_zcrx_cqe {
|
||||
__u64 off;
|
||||
__u64 __pad;
|
||||
};
|
||||
|
||||
/* The bit from which area id is encoded into offsets */
#define IORING_ZCRX_AREA_SHIFT	48
/* Mask selecting bits IORING_ZCRX_AREA_SHIFT and above of a 64-bit offset */
#define IORING_ZCRX_AREA_MASK	(~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1))
|
||||
|
||||
struct io_uring_zcrx_offsets {
|
||||
__u32 head;
|
||||
__u32 tail;
|
||||
__u32 rqes;
|
||||
__u32 __resv2;
|
||||
__u64 __resv[2];
|
||||
};
|
||||
|
||||
/*
|
||||
* Argument for IORING_REGISTER_ZCRX_IFQ
|
||||
*/
|
||||
struct io_uring_zcrx_ifq_reg {
|
||||
__u32 if_idx;
|
||||
__u32 if_rxq;
|
||||
__u32 rq_entries;
|
||||
__u32 flags;
|
||||
|
||||
__u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */
|
||||
__u64 region_ptr; /* struct io_uring_region_desc * */
|
||||
|
||||
struct io_uring_zcrx_offsets offsets;
|
||||
__u64 __resv[4];
|
||||
};
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
10
io_uring/KConfig
Normal file
10
io_uring/KConfig
Normal file
@ -0,0 +1,10 @@
|
||||
# SPDX-License-Identifier: GPL-2.0-only
|
||||
#
|
||||
# io_uring configuration
|
||||
#
|
||||
|
||||
# Zero copy receive support. There is no prompt: the symbol defaults to y
# whenever every dependency below is satisfied.
config IO_URING_ZCRX
	def_bool y
	depends on PAGE_POOL
	depends on INET
	depends on NET_RX_BUSY_POLL
|
@ -14,6 +14,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
|
||||
epoll.o statx.o timeout.o fdinfo.o \
|
||||
cancel.o waitid.o register.o \
|
||||
truncate.o memmap.o alloc_cache.o
|
||||
obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
|
||||
obj-$(CONFIG_IO_WQ) += io-wq.o
|
||||
obj-$(CONFIG_FUTEX) += futex.o
|
||||
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
|
||||
|
@ -97,6 +97,7 @@
|
||||
#include "uring_cmd.h"
|
||||
#include "msg_ring.h"
|
||||
#include "memmap.h"
|
||||
#include "zcrx.h"
|
||||
|
||||
#include "timeout.h"
|
||||
#include "poll.h"
|
||||
@ -2700,6 +2701,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
io_sqe_buffers_unregister(ctx);
|
||||
io_sqe_files_unregister(ctx);
|
||||
io_unregister_zcrx_ifqs(ctx);
|
||||
io_cqring_overflow_kill(ctx);
|
||||
io_eventfd_unregister(ctx);
|
||||
io_free_alloc_caches(ctx);
|
||||
@ -2859,6 +2861,11 @@ static __cold void io_ring_exit_work(struct work_struct *work)
|
||||
io_cqring_overflow_kill(ctx);
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
}
|
||||
if (ctx->ifq) {
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
io_shutdown_zcrx_ifqs(ctx);
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
}
|
||||
|
||||
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
|
||||
io_move_task_work_from_local(ctx);
|
||||
|
@ -2,6 +2,7 @@
|
||||
#define IO_URING_MEMMAP_H
|
||||
|
||||
#define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL
|
||||
/* Offset passed to io_create_region_mmap_safe() for the zcrx refill queue region */
#define IORING_MAP_OFF_ZCRX_REGION 0x30000000ULL
|
||||
|
||||
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
|
||||
|
||||
|
@ -30,6 +30,7 @@
|
||||
#include "eventfd.h"
|
||||
#include "msg_ring.h"
|
||||
#include "memmap.h"
|
||||
#include "zcrx.h"
|
||||
|
||||
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
|
||||
IORING_REGISTER_LAST + IORING_OP_LAST)
|
||||
@ -813,6 +814,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
|
||||
break;
|
||||
ret = io_register_clone_buffers(ctx, arg);
|
||||
break;
|
||||
case IORING_REGISTER_ZCRX_IFQ:
|
||||
ret = -EINVAL;
|
||||
if (!arg || nr_args != 1)
|
||||
break;
|
||||
ret = io_register_zcrx_ifq(ctx, arg);
|
||||
break;
|
||||
case IORING_REGISTER_RESIZE_RINGS:
|
||||
ret = -EINVAL;
|
||||
if (!arg || nr_args != 1)
|
||||
|
149
io_uring/zcrx.c
Normal file
149
io_uring/zcrx.c
Normal file
@ -0,0 +1,149 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
#include "io_uring.h"
|
||||
#include "kbuf.h"
|
||||
#include "memmap.h"
|
||||
#include "zcrx.h"
|
||||
|
||||
/*
 * Largest refill queue size accepted at registration. Bigger requests are
 * rejected, or clamped to this value when the ring has IORING_SETUP_CLAMP.
 */
#define IO_RQ_MAX_ENTRIES 32768
|
||||
|
||||
static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
|
||||
struct io_uring_zcrx_ifq_reg *reg,
|
||||
struct io_uring_region_desc *rd)
|
||||
{
|
||||
size_t off, size;
|
||||
void *ptr;
|
||||
int ret;
|
||||
|
||||
off = sizeof(struct io_uring);
|
||||
size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
|
||||
if (size > rd->size)
|
||||
return -EINVAL;
|
||||
|
||||
ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd,
|
||||
IORING_MAP_OFF_ZCRX_REGION);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
ptr = io_region_get_ptr(&ifq->ctx->zcrx_region);
|
||||
ifq->rq_ring = (struct io_uring *)ptr;
|
||||
ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
|
||||
{
|
||||
io_free_region(ifq->ctx, &ifq->ctx->zcrx_region);
|
||||
ifq->rq_ring = NULL;
|
||||
ifq->rqes = NULL;
|
||||
}
|
||||
|
||||
static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_zcrx_ifq *ifq;
|
||||
|
||||
ifq = kzalloc(sizeof(*ifq), GFP_KERNEL);
|
||||
if (!ifq)
|
||||
return NULL;
|
||||
|
||||
ifq->if_rxq = -1;
|
||||
ifq->ctx = ctx;
|
||||
return ifq;
|
||||
}
|
||||
|
||||
/* Tear down an ifq: release its refill ring first, then the ifq itself. */
static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
{
	io_free_rbuf_ring(ifq);

	kfree(ifq);
}
|
||||
|
||||
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
|
||||
struct io_uring_zcrx_ifq_reg __user *arg)
|
||||
{
|
||||
struct io_uring_zcrx_ifq_reg reg;
|
||||
struct io_uring_region_desc rd;
|
||||
struct io_zcrx_ifq *ifq;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* 1. Interface queue allocation.
|
||||
* 2. It can observe data destined for sockets of other tasks.
|
||||
*/
|
||||
if (!capable(CAP_NET_ADMIN))
|
||||
return -EPERM;
|
||||
|
||||
/* mandatory io_uring features for zc rx */
|
||||
if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
|
||||
ctx->flags & IORING_SETUP_CQE32))
|
||||
return -EINVAL;
|
||||
if (ctx->ifq)
|
||||
return -EBUSY;
|
||||
if (copy_from_user(®, arg, sizeof(reg)))
|
||||
return -EFAULT;
|
||||
if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
|
||||
return -EFAULT;
|
||||
if (memchr_inv(®.__resv, 0, sizeof(reg.__resv)))
|
||||
return -EINVAL;
|
||||
if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
|
||||
return -EINVAL;
|
||||
if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
|
||||
if (!(ctx->flags & IORING_SETUP_CLAMP))
|
||||
return -EINVAL;
|
||||
reg.rq_entries = IO_RQ_MAX_ENTRIES;
|
||||
}
|
||||
reg.rq_entries = roundup_pow_of_two(reg.rq_entries);
|
||||
|
||||
if (!reg.area_ptr)
|
||||
return -EFAULT;
|
||||
|
||||
ifq = io_zcrx_ifq_alloc(ctx);
|
||||
if (!ifq)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = io_allocate_rbuf_ring(ifq, ®, &rd);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
ifq->rq_entries = reg.rq_entries;
|
||||
ifq->if_rxq = reg.if_rxq;
|
||||
|
||||
reg.offsets.rqes = sizeof(struct io_uring);
|
||||
reg.offsets.head = offsetof(struct io_uring, head);
|
||||
reg.offsets.tail = offsetof(struct io_uring, tail);
|
||||
|
||||
if (copy_to_user(arg, ®, sizeof(reg)) ||
|
||||
copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd))) {
|
||||
ret = -EFAULT;
|
||||
goto err;
|
||||
}
|
||||
|
||||
ctx->ifq = ifq;
|
||||
return 0;
|
||||
err:
|
||||
io_zcrx_ifq_free(ifq);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_zcrx_ifq *ifq = ctx->ifq;
|
||||
|
||||
lockdep_assert_held(&ctx->uring_lock);
|
||||
|
||||
if (!ifq)
|
||||
return;
|
||||
|
||||
ctx->ifq = NULL;
|
||||
io_zcrx_ifq_free(ifq);
|
||||
}
|
||||
|
||||
void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
|
||||
{
|
||||
lockdep_assert_held(&ctx->uring_lock);
|
||||
}
|
35
io_uring/zcrx.h
Normal file
35
io_uring/zcrx.h
Normal file
@ -0,0 +1,35 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef IOU_ZC_RX_H
|
||||
#define IOU_ZC_RX_H
|
||||
|
||||
#include <linux/io_uring_types.h>
|
||||
|
||||
struct io_zcrx_ifq {
|
||||
struct io_ring_ctx *ctx;
|
||||
struct io_uring *rq_ring;
|
||||
struct io_uring_zcrx_rqe *rqes;
|
||||
u32 rq_entries;
|
||||
|
||||
u32 if_rxq;
|
||||
};
|
||||
|
||||
#if defined(CONFIG_IO_URING_ZCRX)
|
||||
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
|
||||
struct io_uring_zcrx_ifq_reg __user *arg);
|
||||
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
|
||||
void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx);
|
||||
#else
|
||||
static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
|
||||
struct io_uring_zcrx_ifq_reg __user *arg)
|
||||
{
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
|
||||
{
|
||||
}
|
||||
static inline void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
Loading…
x
Reference in New Issue
Block a user