for-6.15/io_uring-reg-vec-20250327

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmflYcAQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpmvJD/4tKQlr0yRhln/JzPiONS41mUAuNRI4MdqJ
 ykpQkMx3NcQANbNyOxI0PV5I7y1Jdlg/UP9gy11BrIaBk4Kqoluc6iAzgr5q9pBC
 8pXhPIe80R/q/LOKEz9n5gqOMPNyUtd7IaBayJPBJre/YZXQu+49IL2Uyy3hss8d
 neqAbWErd2FoUfTY14XB3ImLM6a76Z6CjE3pJYvVDM5uRBuH0IGqehJJuNpsViBf
 M9XAW/HZt8ISsVt1tJbCQVWx4b63L/omHI8u5K2M0isTPV+QPk1O2Vgkn7dBrDeT
 JvThWrM1uE++DYGcQ3DXHfb3gBIFEjTrNb2nddstyEU2ZaEXUkuOV2O0b7WPuphj
 zp0oFaLl/ivHT8NoJzzZzK24zt99Qz43GWUaFCQeR0R8oTix/M1q0unguER45Iv7
 Po/b3h6+RAi+87KOlM5WWo05ScswS8AwcSUsP5xMR5BjjD+GQYO5PmVVyo8w0rid
 8F9U9DpN2CTA5YVjI+ax1cxWMOfmAXPK5ONjzZpyJoWb0THgj97esEwc2un7SBi7
 TJJz7Gc9/xOqfRKaPDoH9t8+b6ruWHMqCYDw6exSAUKeDxQ+7z0zNMudHkuR5VrX
 x+Taaj95ONLVNZYz0mbFcvmJC0UBOqkE94omXk7TU2Cn7SBzAW//XDep6CPpX/sa
 LcmOK4UXdg==
 =vOm1
 -----END PGP SIGNATURE-----

Merge tag 'for-6.15/io_uring-reg-vec-20250327' of git://git.kernel.dk/linux

Pull more io_uring updates from Jens Axboe:
 "Final separate updates for io_uring.

  This started out as a series of cleanups and improvements for registered
  buffers, but as the last series of the io_uring changes for 6.15, it also
  collected a few fixes for the other branches on top:

   - Add support for vectored fixed/registered buffers.

     Previously only single segments were supported for these commands;
     now vectored variants are supported as well. This series includes
     networking and file read/write support (a userspace usage sketch
     follows the quoted message below).

   - Small series unifying return codes across multi and single shot.

   - Small series cleaning up registered buffer importing.

   - Adding support for vectored registered buffers for uring_cmd.

   - Fix for io-wq handling of command reissue.

   - Various little fixes and tweaks"
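
As a usage illustration for the vectored fixed-buffer support above, here is a
minimal userspace sketch of a two-segment read via the new
IORING_OP_READV_FIXED opcode. It assumes liburing only for ring setup and a
6.15-era uapi header that defines the opcode; the SQE is filled by hand rather
than through any liburing prep helper, and error handling is trimmed.

/*
 * Sketch only (not part of this series): vectored read into a registered
 * buffer. Every iovec segment must point inside the registered region
 * selected by sqe->buf_index.
 */
#include <liburing.h>
#include <stdlib.h>
#include <string.h>

static int readv_fixed_demo(int fd)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct iovec reg, vec[2];
	char *buf;
	int ret;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return -1;

	/* Register one buffer; the vectored segments below live inside it. */
	buf = malloc(8192);
	reg.iov_base = buf;
	reg.iov_len = 8192;
	if (io_uring_register_buffers(&ring, &reg, 1) < 0)
		return -1;

	vec[0].iov_base = buf;
	vec[0].iov_len = 4096;
	vec[1].iov_base = buf + 4096;
	vec[1].iov_len = 4096;

	/* Fill the SQE by hand: addr is the iovec array, len the segment
	 * count, buf_index selects the registered buffer. */
	sqe = io_uring_get_sqe(&ring);
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_READV_FIXED;
	sqe->fd = fd;
	sqe->addr = (unsigned long)vec;
	sqe->len = 2;
	sqe->buf_index = 0;

	if (io_uring_submit(&ring) < 0)
		return -1;
	io_uring_wait_cqe(&ring, &cqe);
	ret = cqe->res;
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return ret;
}

IORING_OP_WRITEV_FIXED follows the same pattern for writes, and the networking
side of the series lets IORING_OP_SENDMSG_ZC accept IORING_RECVSEND_FIXED_BUF
with a vectored msghdr in the same spirit.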

* tag 'for-6.15/io_uring-reg-vec-20250327' of git://git.kernel.dk/linux: (25 commits)
  io_uring/net: fix io_req_post_cqe abuse by send bundle
  io_uring/net: use REQ_F_IMPORT_BUFFER for send_zc
  io_uring: move min_events sanitisation
  io_uring: rename "min" arg in io_iopoll_check()
  io_uring: open code __io_post_aux_cqe()
  io_uring: defer iowq cqe overflow via task_work
  io_uring: fix retry handling off iowq
  io_uring/net: only import send_zc buffer once
  io_uring/cmd: introduce io_uring_cmd_import_fixed_vec
  io_uring/cmd: add iovec cache for commands
  io_uring/cmd: don't expose entire cmd async data
  io_uring: rename the data cmd cache
  io_uring: rely on io_prep_reg_vec for iovec placement
  io_uring: introduce io_prep_reg_iovec()
  io_uring: unify STOP_MULTISHOT with IOU_OK
  io_uring: return -EAGAIN to continue multishot
  io_uring: cap cached iovec/bvec size
  io_uring/net: implement vectored reg bufs for zctx
  io_uring/net: convert to struct iou_vec
  io_uring/net: pull vec alloc out of msghdr import
  ...
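
The "io_uring/cmd: introduce io_uring_cmd_import_fixed_vec" commit above
exports a helper for ->uring_cmd() drivers. The fragment below is a
hypothetical handler showing how it might be called; "struct my_cmd" and its
fields are illustrative only and not part of this series.

/*
 * Hypothetical driver fragment: resolve a user iovec against the command's
 * registered buffer (selected by sqe->buf_index) into an iov_iter.
 */
#include <linux/io_uring/cmd.h>
#include <linux/kernel.h>
#include <linux/uio.h>

struct my_cmd {			/* assumed driver-private SQE command layout */
	__u64 uvec;		/* user pointer to a struct iovec array */
	__u32 nr_segs;		/* number of iovec entries */
	__u32 __pad;
};

static int my_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
{
	const struct my_cmd *cmd = io_uring_sqe_cmd(ioucmd->sqe);
	struct iov_iter iter;
	int ret;

	/* Builds a bvec iterator over the registered buffer's pages. */
	ret = io_uring_cmd_import_fixed_vec(ioucmd, u64_to_user_ptr(cmd->uvec),
					    cmd->nr_segs, ITER_DEST, &iter,
					    issue_flags);
	if (ret)
		return ret;

	/*
	 * Map @iter (e.g. into a bio/sg list) and kick off the I/O here;
	 * complete later with io_uring_cmd_done().
	 */
	return -EIOCBQUEUED;
}

ITER_DEST is used here for a device-to-memory transfer; a write path would
pass ITER_SOURCE instead.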
Merge commit eff5f16bfd by Linus Torvalds, 2025-03-28 15:07:04 -07:00.
16 changed files with 569 additions and 213 deletions.


@ -43,6 +43,11 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter,
struct io_uring_cmd *ioucmd,
unsigned int issue_flags);
int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
const struct iovec __user *uvec,
size_t uvec_segs,
int ddir, struct iov_iter *iter,
unsigned issue_flags);
/*
* Completes the request, i.e. posts an io_uring CQE and deallocates @ioucmd
@ -76,6 +81,14 @@ io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
{
return -EOPNOTSUPP;
}
static inline int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
const struct iovec __user *uvec,
size_t uvec_segs,
int ddir, struct iov_iter *iter,
unsigned issue_flags)
{
return -EOPNOTSUPP;
}
static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
u64 ret2, unsigned issue_flags)
{


@ -110,6 +110,14 @@ struct io_uring_task {
} ____cacheline_aligned_in_smp;
};
struct iou_vec {
union {
struct iovec *iovec;
struct bio_vec *bvec;
};
unsigned nr; /* number of struct iovec it can hold */
};
struct io_uring {
u32 head;
u32 tail;
@ -310,7 +318,7 @@ struct io_ring_ctx {
struct io_alloc_cache apoll_cache;
struct io_alloc_cache netmsg_cache;
struct io_alloc_cache rw_cache;
struct io_alloc_cache uring_cache;
struct io_alloc_cache cmd_cache;
/*
* Any cancelable uring_cmd is added to this list in
@ -482,6 +490,7 @@ enum {
REQ_F_SKIP_LINK_CQES_BIT,
REQ_F_SINGLE_POLL_BIT,
REQ_F_DOUBLE_POLL_BIT,
REQ_F_MULTISHOT_BIT,
REQ_F_APOLL_MULTISHOT_BIT,
REQ_F_CLEAR_POLLIN_BIT,
/* keep async read/write and isreg together and in order */
@ -494,6 +503,7 @@ enum {
REQ_F_BUFFERS_COMMIT_BIT,
REQ_F_BUF_NODE_BIT,
REQ_F_HAS_METADATA_BIT,
REQ_F_IMPORT_BUFFER_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@ -558,6 +568,8 @@ enum {
REQ_F_SINGLE_POLL = IO_REQ_FLAG(REQ_F_SINGLE_POLL_BIT),
/* double poll may active */
REQ_F_DOUBLE_POLL = IO_REQ_FLAG(REQ_F_DOUBLE_POLL_BIT),
/* request posts multiple completions, should be set at prep time */
REQ_F_MULTISHOT = IO_REQ_FLAG(REQ_F_MULTISHOT_BIT),
/* fast poll multishot mode */
REQ_F_APOLL_MULTISHOT = IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT),
/* recvmsg special flag, clear EPOLLIN */
@ -576,6 +588,11 @@ enum {
REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT),
/* request has read/write metadata assigned */
REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT),
/*
* For vectored fixed buffers, resolve iovec to registered buffers.
* For SEND_ZC, whether to import buffers (i.e. the first issue).
*/
REQ_F_IMPORT_BUFFER = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT),
};
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw);


@ -281,6 +281,8 @@ enum io_uring_op {
IORING_OP_LISTEN,
IORING_OP_RECV_ZC,
IORING_OP_EPOLL_WAIT,
IORING_OP_READV_FIXED,
IORING_OP_WRITEV_FIXED,
/* this goes last, obviously */
IORING_OP_LAST,


@ -16,15 +16,6 @@ bool io_alloc_cache_init(struct io_alloc_cache *cache,
void *io_cache_alloc_new(struct io_alloc_cache *cache, gfp_t gfp);
static inline void io_alloc_cache_kasan(struct iovec **iov, int *nr)
{
if (IS_ENABLED(CONFIG_KASAN)) {
kfree(*iov);
*iov = NULL;
*nr = 0;
}
}
static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
void *entry)
{


@ -289,7 +289,7 @@ static void io_free_alloc_caches(struct io_ring_ctx *ctx)
io_alloc_cache_free(&ctx->apoll_cache, kfree);
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
io_alloc_cache_free(&ctx->uring_cache, kfree);
io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free);
io_alloc_cache_free(&ctx->msg_cache, kfree);
io_futex_cache_free(ctx);
io_rsrc_cache_free(ctx);
@ -334,8 +334,9 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct io_async_rw),
offsetof(struct io_async_rw, clear));
ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct io_uring_cmd_data), 0);
ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct io_async_cmd),
sizeof(struct io_async_cmd));
spin_lock_init(&ctx->msg_lock);
ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct io_kiocb), 0);
@ -833,24 +834,14 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
return false;
}
static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res,
u32 cflags)
{
bool filled;
filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
if (!filled)
filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
return filled;
}
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
bool filled;
io_cq_lock(ctx);
filled = __io_post_aux_cqe(ctx, user_data, res, cflags);
filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
if (!filled)
filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
io_cq_unlock_post(ctx);
return filled;
}
@ -891,6 +882,7 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags)
static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
bool completed = true;
/*
* All execution paths but io-wq use the deferred completions by
@ -903,19 +895,21 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
* Handle special CQ sync cases via task_work. DEFER_TASKRUN requires
* the submitter task context, IOPOLL protects with uring_lock.
*/
if (ctx->lockless_cq) {
if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) {
defer_complete:
req->io_task_work.func = io_req_task_complete;
io_req_task_work_add(req);
return;
}
io_cq_lock(ctx);
if (!(req->flags & REQ_F_CQE_SKIP)) {
if (!io_fill_cqe_req(ctx, req))
io_req_cqe_overflow(req);
}
if (!(req->flags & REQ_F_CQE_SKIP))
completed = io_fill_cqe_req(ctx, req);
io_cq_unlock_post(ctx);
if (!completed)
goto defer_complete;
/*
* We don't free the request here because we know it's called from
* io-wq only, which holds a reference, so it cannot be the last put.
@ -1511,11 +1505,13 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
mutex_unlock(&ctx->uring_lock);
}
static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
{
unsigned int nr_events = 0;
unsigned long check_cq;
min_events = min(min_events, ctx->cq_entries);
lockdep_assert_held(&ctx->uring_lock);
if (!io_allowed_run_tw(ctx))
@ -1557,7 +1553,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
io_task_work_pending(ctx)) {
u32 tail = ctx->cached_cq_tail;
(void) io_run_local_work_locked(ctx, min);
(void) io_run_local_work_locked(ctx, min_events);
if (task_work_pending(current) ||
wq_list_empty(&ctx->iopoll_list)) {
@ -1570,7 +1566,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
wq_list_empty(&ctx->iopoll_list))
break;
}
ret = io_do_iopoll(ctx, !min);
ret = io_do_iopoll(ctx, !min_events);
if (unlikely(ret < 0))
return ret;
@ -1580,7 +1576,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
break;
nr_events += ret;
} while (nr_events < min);
} while (nr_events < min_events);
return 0;
}
@ -1791,10 +1787,7 @@ int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw)
ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]);
WARN_ON_ONCE(ret == IOU_OK);
if (ret == IOU_ISSUE_SKIP_COMPLETE)
ret = 0;
WARN_ON_ONCE(ret == IOU_ISSUE_SKIP_COMPLETE);
return ret;
}
@ -1847,7 +1840,7 @@ fail:
* Don't allow any multishot execution from io-wq. It's more restrictive
* than necessary and also cleaner.
*/
if (req->flags & REQ_F_APOLL_MULTISHOT) {
if (req->flags & (REQ_F_MULTISHOT|REQ_F_APOLL_MULTISHOT)) {
err = -EBADFD;
if (!io_file_can_poll(req))
goto fail;
@ -1858,7 +1851,7 @@ fail:
goto fail;
return;
} else {
req->flags &= ~REQ_F_APOLL_MULTISHOT;
req->flags &= ~(REQ_F_APOLL_MULTISHOT|REQ_F_MULTISHOT);
}
}
@ -2549,6 +2542,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
ktime_t start_time;
int ret;
min_events = min_t(int, min_events, ctx->cq_entries);
if (!io_allowed_run_tw(ctx))
return -EEXIST;
if (io_local_work_pending(ctx))
@ -3435,22 +3430,16 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
mutex_lock(&ctx->uring_lock);
iopoll_locked:
ret2 = io_validate_ext_arg(ctx, flags, argp, argsz);
if (likely(!ret2)) {
min_complete = min(min_complete,
ctx->cq_entries);
if (likely(!ret2))
ret2 = io_iopoll_check(ctx, min_complete);
}
mutex_unlock(&ctx->uring_lock);
} else {
struct ext_arg ext_arg = { .argsz = argsz };
ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg);
if (likely(!ret2)) {
min_complete = min(min_complete,
ctx->cq_entries);
if (likely(!ret2))
ret2 = io_cqring_wait(ctx, min_complete, flags,
&ext_arg);
}
}
if (!ret) {


@ -19,22 +19,25 @@
#endif
enum {
IOU_OK = 0,
IOU_OK = 0, /* deprecated, use IOU_COMPLETE */
IOU_COMPLETE = 0,
IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED,
/*
* The request has more work to do and should be retried. io_uring will
* attempt to wait on the file for eligible opcodes, but otherwise
* it'll be handed to iowq for blocking execution. It works for normal
* requests as well as for the multi shot mode.
*/
IOU_RETRY = -EAGAIN,
/*
* Requeue the task_work to restart operations on this request. The
* actual value isn't important, should just be not an otherwise
* valid error code, yet less than -MAX_ERRNO and valid internally.
*/
IOU_REQUEUE = -3072,
/*
* Intended only when both IO_URING_F_MULTISHOT is passed
* to indicate to the poll runner that multishot should be
* removed and the result is set on req->cqe.res.
*/
IOU_STOP_MULTISHOT = -ECANCELED,
};
struct io_wait_queue {


@ -136,11 +136,8 @@ static bool io_net_retry(struct socket *sock, int flags)
static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
{
if (kmsg->free_iov) {
kfree(kmsg->free_iov);
kmsg->free_iov_nr = 0;
kmsg->free_iov = NULL;
}
if (kmsg->vec.iovec)
io_vec_free(&kmsg->vec);
}
static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
@ -154,7 +151,10 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
}
/* Let normal cleanup path reap it if we fail adding to the cache */
io_alloc_cache_kasan(&hdr->free_iov, &hdr->free_iov_nr);
io_alloc_cache_vec_kasan(&hdr->vec);
if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP)
io_vec_free(&hdr->vec);
if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) {
req->async_data = NULL;
req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP);
@ -171,7 +171,7 @@ static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
return NULL;
/* If the async data was cached, we might have an iov cached inside. */
if (hdr->free_iov)
if (hdr->vec.iovec)
req->flags |= REQ_F_NEED_CLEANUP;
return hdr;
}
@ -182,10 +182,7 @@ static void io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg
{
if (iov) {
req->flags |= REQ_F_NEED_CLEANUP;
kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs;
if (kmsg->free_iov)
kfree(kmsg->free_iov);
kmsg->free_iov = iov;
io_vec_reset_iovec(&kmsg->vec, iov, kmsg->msg.msg_iter.nr_segs);
}
}
@ -208,9 +205,9 @@ static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg
struct iovec *iov;
int ret, nr_segs;
if (iomsg->free_iov) {
nr_segs = iomsg->free_iov_nr;
iov = iomsg->free_iov;
if (iomsg->vec.iovec) {
nr_segs = iomsg->vec.nr;
iov = iomsg->vec.iovec;
} else {
nr_segs = 1;
iov = &iomsg->fast_iov;
@ -253,12 +250,8 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req,
return -EFAULT;
sr->len = tmp_iov.iov_len;
}
return 0;
}
return io_net_import_vec(req, iomsg, (struct iovec __user *)uiov,
msg->msg_iovlen, ddir);
return 0;
}
static int io_copy_msghdr_from_user(struct user_msghdr *msg,
@ -287,6 +280,24 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
struct user_msghdr __user *umsg = sr->umsg;
int ret;
iomsg->msg.msg_name = &iomsg->addr;
iomsg->msg.msg_iter.nr_segs = 0;
if (io_is_compat(req->ctx)) {
struct compat_msghdr cmsg;
ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ddir, save_addr);
if (ret)
return ret;
memset(msg, 0, sizeof(*msg));
msg->msg_namelen = cmsg.msg_namelen;
msg->msg_controllen = cmsg.msg_controllen;
msg->msg_iov = compat_ptr(cmsg.msg_iov);
msg->msg_iovlen = cmsg.msg_iovlen;
return 0;
}
ret = io_copy_msghdr_from_user(msg, umsg);
if (unlikely(ret))
return ret;
@ -310,10 +321,8 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
return -EFAULT;
sr->len = tmp_iov.iov_len;
}
return 0;
}
return io_net_import_vec(req, iomsg, msg->msg_iov, msg->msg_iovlen, ddir);
return 0;
}
static int io_sendmsg_copy_hdr(struct io_kiocb *req,
@ -323,19 +332,13 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req,
struct user_msghdr msg;
int ret;
iomsg->msg.msg_name = &iomsg->addr;
iomsg->msg.msg_iter.nr_segs = 0;
if (io_is_compat(req->ctx)) {
struct compat_msghdr cmsg;
ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE,
NULL);
sr->msg_control = iomsg->msg.msg_control_user;
return ret;
}
ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE, NULL);
if (unlikely(ret))
return ret;
if (!(req->flags & REQ_F_BUFFER_SELECT))
ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen,
ITER_SOURCE);
/* save msg_control as sys_sendmsg() overwrites it */
sr->msg_control = iomsg->msg.msg_control_user;
return ret;
@ -395,6 +398,27 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe
return io_sendmsg_copy_hdr(req, kmsg);
}
static int io_sendmsg_zc_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct io_async_msghdr *kmsg = req->async_data;
struct user_msghdr msg;
int ret;
if (!(sr->flags & IORING_RECVSEND_FIXED_BUF))
return io_sendmsg_setup(req, sqe);
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL);
if (unlikely(ret))
return ret;
sr->msg_control = kmsg->msg.msg_control_user;
kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen;
return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov, msg.msg_iovlen);
}
#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE)
int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@ -424,6 +448,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->msg_flags |= MSG_WAITALL;
sr->buf_group = req->buf_index;
req->buf_list = NULL;
req->flags |= REQ_F_MULTISHOT;
}
if (io_is_compat(req->ctx))
@ -461,7 +486,7 @@ static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
if (iter_is_ubuf(&kmsg->msg.msg_iter))
return 1;
iov = kmsg->free_iov;
iov = kmsg->vec.iovec;
if (!iov)
iov = &kmsg->fast_iov;
@ -577,9 +602,9 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
.nr_iovs = 1,
};
if (kmsg->free_iov) {
arg.nr_iovs = kmsg->free_iov_nr;
arg.iovs = kmsg->free_iov;
if (kmsg->vec.iovec) {
arg.nr_iovs = kmsg->vec.nr;
arg.iovs = kmsg->vec.iovec;
arg.mode = KBUF_MODE_FREE;
}
@ -592,9 +617,9 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
if (unlikely(ret < 0))
return ret;
if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
kmsg->free_iov_nr = ret;
kmsg->free_iov = arg.iovs;
if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
kmsg->vec.nr = ret;
kmsg->vec.iovec = arg.iovs;
req->flags |= REQ_F_NEED_CLEANUP;
}
sr->len = arg.out_len;
@ -709,23 +734,16 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
struct user_msghdr msg;
int ret;
iomsg->msg.msg_name = &iomsg->addr;
iomsg->msg.msg_iter.nr_segs = 0;
if (io_is_compat(req->ctx)) {
struct compat_msghdr cmsg;
ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST,
&iomsg->uaddr);
memset(&msg, 0, sizeof(msg));
msg.msg_namelen = cmsg.msg_namelen;
msg.msg_controllen = cmsg.msg_controllen;
} else {
ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
}
ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
if (unlikely(ret))
return ret;
if (!(req->flags & REQ_F_BUFFER_SELECT)) {
ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen,
ITER_DEST);
if (unlikely(ret))
return ret;
}
return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
msg.msg_controllen);
}
@ -863,8 +881,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
*/
if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE;
*ret = IOU_RETRY;
io_mshot_prep_retry(req, kmsg);
/* Known not-empty or unknown state, retry */
if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
@ -872,23 +889,16 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
return false;
/* mshot retries exceeded, force a requeue */
sr->nr_multishot_loops = 0;
mshot_retry_ret = IOU_REQUEUE;
if (issue_flags & IO_URING_F_MULTISHOT)
*ret = IOU_REQUEUE;
}
if (issue_flags & IO_URING_F_MULTISHOT)
*ret = mshot_retry_ret;
else
*ret = -EAGAIN;
return true;
}
/* Finish the request / stop multishot. */
finish:
io_req_set_res(req, *ret, cflags);
if (issue_flags & IO_URING_F_MULTISHOT)
*ret = IOU_STOP_MULTISHOT;
else
*ret = IOU_OK;
*ret = IOU_COMPLETE;
io_req_msg_cleanup(req, issue_flags);
return true;
}
@ -1035,16 +1045,15 @@ retry_multishot:
if (ret < min_ret) {
if (ret == -EAGAIN && force_nonblock) {
if (issue_flags & IO_URING_F_MULTISHOT) {
if (issue_flags & IO_URING_F_MULTISHOT)
io_kbuf_recycle(req, issue_flags);
return IOU_ISSUE_SKIP_COMPLETE;
}
return -EAGAIN;
return IOU_RETRY;
}
if (ret > 0 && io_net_retry(sock, flags)) {
sr->done_io += ret;
req->flags |= REQ_F_BL_NO_RECYCLE;
return -EAGAIN;
return IOU_RETRY;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
@ -1085,9 +1094,9 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
.mode = KBUF_MODE_EXPAND,
};
if (kmsg->free_iov) {
arg.nr_iovs = kmsg->free_iov_nr;
arg.iovs = kmsg->free_iov;
if (kmsg->vec.iovec) {
arg.nr_iovs = kmsg->vec.nr;
arg.iovs = kmsg->vec.iovec;
arg.mode |= KBUF_MODE_FREE;
}
@ -1106,9 +1115,9 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
}
iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
arg.out_len);
if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
kmsg->free_iov_nr = ret;
kmsg->free_iov = arg.iovs;
if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
kmsg->vec.nr = ret;
kmsg->vec.iovec = arg.iovs;
req->flags |= REQ_F_NEED_CLEANUP;
}
} else {
@ -1172,12 +1181,10 @@ retry_multishot:
ret = sock_recvmsg(sock, &kmsg->msg, flags);
if (ret < min_ret) {
if (ret == -EAGAIN && force_nonblock) {
if (issue_flags & IO_URING_F_MULTISHOT) {
if (issue_flags & IO_URING_F_MULTISHOT)
io_kbuf_recycle(req, issue_flags);
return IOU_ISSUE_SKIP_COMPLETE;
}
return -EAGAIN;
return IOU_RETRY;
}
if (ret > 0 && io_net_retry(sock, flags)) {
sr->len -= ret;
@ -1260,9 +1267,7 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
if (len && zc->len == 0) {
io_req_set_res(req, 0, 0);
if (issue_flags & IO_URING_F_MULTISHOT)
return IOU_STOP_MULTISHOT;
return IOU_OK;
return IOU_COMPLETE;
}
if (unlikely(ret <= 0) && ret != -EAGAIN) {
if (ret == -ERESTARTSYS)
@ -1272,15 +1277,9 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
req_set_fail(req);
io_req_set_res(req, ret, 0);
if (issue_flags & IO_URING_F_MULTISHOT)
return IOU_STOP_MULTISHOT;
return IOU_OK;
return IOU_COMPLETE;
}
if (issue_flags & IO_URING_F_MULTISHOT)
return IOU_ISSUE_SKIP_COMPLETE;
return -EAGAIN;
return IOU_RETRY;
}
void io_send_zc_cleanup(struct io_kiocb *req)
@ -1339,8 +1338,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->opcode != IORING_OP_SEND_ZC) {
if (unlikely(sqe->addr2 || sqe->file_index))
return -EINVAL;
if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF))
return -EINVAL;
}
zc->len = READ_ONCE(sqe->len);
@ -1354,9 +1351,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(!io_msg_alloc_async(req)))
return -ENOMEM;
if (req->opcode != IORING_OP_SENDMSG_ZC)
if (req->opcode == IORING_OP_SEND_ZC) {
req->flags |= REQ_F_IMPORT_BUFFER;
return io_send_setup(req, sqe);
return io_sendmsg_setup(req, sqe);
}
return io_sendmsg_zc_setup(req, sqe);
}
static int io_sg_from_iter_iovec(struct sk_buff *skb,
@ -1454,7 +1453,8 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
(zc->flags & IORING_RECVSEND_POLL_FIRST))
return -EAGAIN;
if (!zc->done_io) {
if (req->flags & REQ_F_IMPORT_BUFFER) {
req->flags &= ~REQ_F_IMPORT_BUFFER;
ret = io_send_zc_import(req, issue_flags);
if (unlikely(ret))
return ret;
@ -1513,6 +1513,20 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
unsigned flags;
int ret, min_ret = 0;
kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
if (req->flags & REQ_F_IMPORT_BUFFER) {
unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
int ret;
ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, req,
&kmsg->vec, uvec_segs, issue_flags);
if (unlikely(ret))
return ret;
kmsg->msg.sg_from_iter = io_sg_from_iter;
req->flags &= ~REQ_F_IMPORT_BUFFER;
}
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
@ -1531,7 +1545,6 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
kmsg->msg.msg_control_user = sr->msg_control;
kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
if (unlikely(ret < min_ret)) {
@ -1646,16 +1659,9 @@ retry:
put_unused_fd(fd);
ret = PTR_ERR(file);
if (ret == -EAGAIN && force_nonblock &&
!(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) {
/*
* if it's multishot and polled, we don't need to
* return EAGAIN to arm the poll infra since it
* has already been done
*/
if (issue_flags & IO_URING_F_MULTISHOT)
return IOU_ISSUE_SKIP_COMPLETE;
return ret;
}
!(accept->iou_flags & IORING_ACCEPT_DONTWAIT))
return IOU_RETRY;
if (ret == -ERESTARTSYS)
ret = -EINTR;
} else if (!fixed) {
@ -1674,17 +1680,13 @@ retry:
io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1)
goto retry;
if (issue_flags & IO_URING_F_MULTISHOT)
return IOU_ISSUE_SKIP_COMPLETE;
return -EAGAIN;
return IOU_RETRY;
}
io_req_set_res(req, ret, cflags);
if (ret < 0)
req_set_fail(req);
if (!(issue_flags & IO_URING_F_MULTISHOT))
return IOU_OK;
return IOU_STOP_MULTISHOT;
return IOU_COMPLETE;
}
int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@ -1876,8 +1878,7 @@ void io_netmsg_cache_free(const void *entry)
{
struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
if (kmsg->free_iov)
io_netmsg_iovec_free(kmsg);
io_vec_free(&kmsg->vec);
kfree(kmsg);
}
#endif


@ -2,12 +2,12 @@
#include <linux/net.h>
#include <linux/uio.h>
#include <linux/io_uring_types.h>
struct io_async_msghdr {
#if defined(CONFIG_NET)
struct iovec *free_iov;
/* points to an allocated iov, if NULL we use fast_iov instead */
int free_iov_nr;
struct iou_vec vec;
struct_group(clear,
int namelen;
struct iovec fast_iov;


@ -416,7 +416,7 @@ const struct io_issue_def io_issue_defs[] = {
.plug = 1,
.iopoll = 1,
.iopoll_queue = 1,
.async_size = sizeof(struct io_uring_cmd_data),
.async_size = sizeof(struct io_async_cmd),
.prep = io_uring_cmd_prep,
.issue = io_uring_cmd,
},
@ -540,6 +540,35 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_READV_FIXED] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
.vectored = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_readv_fixed,
.issue = io_read,
},
[IORING_OP_WRITEV_FIXED] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
.vectored = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_writev_fixed,
.issue = io_write,
},
};
const struct io_cold_def io_cold_defs[] = {
@ -726,6 +755,7 @@ const struct io_cold_def io_cold_defs[] = {
},
[IORING_OP_URING_CMD] = {
.name = "URING_CMD",
.cleanup = io_uring_cmd_cleanup,
},
[IORING_OP_SEND_ZC] = {
.name = "SEND_ZC",
@ -775,6 +805,16 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_EPOLL_WAIT] = {
.name = "EPOLL_WAIT",
},
[IORING_OP_READV_FIXED] = {
.name = "READV_FIXED",
.cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
},
[IORING_OP_WRITEV_FIXED] = {
.name = "WRITEV_FIXED",
.cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
},
};
const char *io_uring_get_opcode(u8 opcode)


@ -289,11 +289,12 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw)
}
} else {
int ret = io_poll_issue(req, tw);
if (ret == IOU_STOP_MULTISHOT)
if (ret == IOU_COMPLETE)
return IOU_POLL_REMOVE_POLL_USE_RES;
else if (ret == IOU_REQUEUE)
return IOU_POLL_REQUEUE;
if (ret < 0)
if (ret != IOU_RETRY && ret < 0)
return ret;
}


@ -1262,3 +1262,166 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
fput(file);
return ret;
}
void io_vec_free(struct iou_vec *iv)
{
if (!iv->iovec)
return;
kfree(iv->iovec);
iv->iovec = NULL;
iv->nr = 0;
}
int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
{
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct iovec *iov;
iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp);
if (!iov)
return -ENOMEM;
io_vec_free(iv);
iv->iovec = iov;
iv->nr = nr_entries;
return 0;
}
static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
struct io_mapped_ubuf *imu,
struct iovec *iovec, unsigned nr_iovs,
struct iou_vec *vec)
{
unsigned long folio_size = 1 << imu->folio_shift;
unsigned long folio_mask = folio_size - 1;
u64 folio_addr = imu->ubuf & ~folio_mask;
struct bio_vec *res_bvec = vec->bvec;
size_t total_len = 0;
unsigned bvec_idx = 0;
unsigned iov_idx;
for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
size_t iov_len = iovec[iov_idx].iov_len;
u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base;
struct bio_vec *src_bvec;
size_t offset;
u64 buf_end;
if (unlikely(check_add_overflow(buf_addr, (u64)iov_len, &buf_end)))
return -EFAULT;
if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
return -EFAULT;
if (unlikely(!iov_len))
return -EFAULT;
if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
return -EOVERFLOW;
/* by using folio address it also accounts for bvec offset */
offset = buf_addr - folio_addr;
src_bvec = imu->bvec + (offset >> imu->folio_shift);
offset &= folio_mask;
for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
size_t seg_size = min_t(size_t, iov_len,
folio_size - offset);
bvec_set_page(&res_bvec[bvec_idx],
src_bvec->bv_page, seg_size, offset);
iov_len -= seg_size;
}
}
if (total_len > MAX_RW_COUNT)
return -EINVAL;
iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
return 0;
}
static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
struct io_mapped_ubuf *imu)
{
unsigned shift = imu->folio_shift;
size_t max_segs = 0;
unsigned i;
for (i = 0; i < nr_iovs; i++)
max_segs += (iov[i].iov_len >> shift) + 2;
return max_segs;
}
int io_import_reg_vec(int ddir, struct iov_iter *iter,
struct io_kiocb *req, struct iou_vec *vec,
unsigned nr_iovs, unsigned issue_flags)
{
struct io_rsrc_node *node;
struct io_mapped_ubuf *imu;
unsigned iovec_off;
struct iovec *iov;
unsigned nr_segs;
node = io_find_buf_node(req, issue_flags);
if (!node)
return -EFAULT;
imu = node->buf;
if (imu->is_kbuf)
return -EOPNOTSUPP;
if (!(imu->dir & (1 << ddir)))
return -EFAULT;
iovec_off = vec->nr - nr_iovs;
iov = vec->iovec + iovec_off;
nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu);
if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
size_t bvec_bytes;
bvec_bytes = nr_segs * sizeof(struct bio_vec);
nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov);
nr_segs += nr_iovs;
}
if (nr_segs > vec->nr) {
struct iou_vec tmp_vec = {};
int ret;
ret = io_vec_realloc(&tmp_vec, nr_segs);
if (ret)
return ret;
iovec_off = tmp_vec.nr - nr_iovs;
memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
io_vec_free(vec);
*vec = tmp_vec;
iov = vec->iovec + iovec_off;
req->flags |= REQ_F_NEED_CLEANUP;
}
return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
}
int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
const struct iovec __user *uvec, size_t uvec_segs)
{
struct iovec *iov;
int iovec_off, ret;
void *res;
if (uvec_segs > iv->nr) {
ret = io_vec_realloc(iv, uvec_segs);
if (ret)
return ret;
req->flags |= REQ_F_NEED_CLEANUP;
}
/* pad iovec to the right */
iovec_off = iv->nr - uvec_segs;
iov = iv->iovec + iovec_off;
res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
io_is_compat(req->ctx));
if (IS_ERR(res))
return PTR_ERR(res);
req->flags |= REQ_F_IMPORT_BUFFER;
return 0;
}


@ -5,6 +5,8 @@
#include <linux/io_uring_types.h>
#include <linux/lockdep.h>
#define IO_VEC_CACHE_SOFT_CAP 256
enum {
IORING_RSRC_FILE = 0,
IORING_RSRC_BUFFER = 1,
@ -61,6 +63,11 @@ struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
u64 buf_addr, size_t len, int ddir,
unsigned issue_flags);
int io_import_reg_vec(int ddir, struct iov_iter *iter,
struct io_kiocb *req, struct iou_vec *vec,
unsigned nr_iovs, unsigned issue_flags);
int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
const struct iovec __user *uvec, size_t uvec_segs);
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg);
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
@ -145,4 +152,21 @@ static inline void __io_unaccount_mem(struct user_struct *user,
atomic_long_sub(nr_pages, &user->locked_vm);
}
void io_vec_free(struct iou_vec *iv);
int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries);
static inline void io_vec_reset_iovec(struct iou_vec *iv,
struct iovec *iovec, unsigned nr)
{
io_vec_free(iv);
iv->iovec = iovec;
iv->nr = nr;
}
static inline void io_alloc_cache_vec_kasan(struct iou_vec *iv)
{
if (IS_ENABLED(CONFIG_KASAN))
io_vec_free(iv);
}
#endif


@ -87,9 +87,9 @@ static int io_import_vec(int ddir, struct io_kiocb *req,
int ret, nr_segs;
struct iovec *iov;
if (io->free_iovec) {
nr_segs = io->free_iov_nr;
iov = io->free_iovec;
if (io->vec.iovec) {
nr_segs = io->vec.nr;
iov = io->vec.iovec;
} else {
nr_segs = 1;
iov = &io->fast_iov;
@ -101,9 +101,7 @@ static int io_import_vec(int ddir, struct io_kiocb *req,
return ret;
if (iov) {
req->flags |= REQ_F_NEED_CLEANUP;
io->free_iov_nr = io->iter.nr_segs;
kfree(io->free_iovec);
io->free_iovec = iov;
io_vec_reset_iovec(&io->vec, iov, io->iter.nr_segs);
}
return 0;
}
@ -151,7 +149,10 @@ static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(issue_flags & IO_URING_F_UNLOCKED))
return;
io_alloc_cache_kasan(&rw->free_iovec, &rw->free_iov_nr);
io_alloc_cache_vec_kasan(&rw->vec);
if (rw->vec.nr > IO_VEC_CACHE_SOFT_CAP)
io_vec_free(&rw->vec);
if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) {
req->async_data = NULL;
req->flags &= ~REQ_F_ASYNC_DATA;
@ -201,7 +202,7 @@ static int io_rw_alloc_async(struct io_kiocb *req)
rw = io_uring_alloc_async_data(&ctx->rw_cache, req);
if (!rw)
return -ENOMEM;
if (rw->free_iovec)
if (rw->vec.iovec)
req->flags |= REQ_F_NEED_CLEANUP;
rw->bytes_done = 0;
return 0;
@ -383,6 +384,53 @@ int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return __io_prep_rw(req, sqe, ITER_SOURCE);
}
static int io_rw_import_reg_vec(struct io_kiocb *req,
struct io_async_rw *io,
int ddir, unsigned int issue_flags)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
unsigned uvec_segs = rw->len;
int ret;
ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec,
uvec_segs, issue_flags);
if (unlikely(ret))
return ret;
iov_iter_save_state(&io->iter, &io->iter_state);
req->flags &= ~REQ_F_IMPORT_BUFFER;
return 0;
}
static int io_rw_prep_reg_vec(struct io_kiocb *req)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
struct io_async_rw *io = req->async_data;
const struct iovec __user *uvec;
uvec = u64_to_user_ptr(rw->addr);
return io_prep_reg_iovec(req, &io->vec, uvec, rw->len);
}
int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
int ret;
ret = __io_prep_rw(req, sqe, ITER_DEST);
if (unlikely(ret))
return ret;
return io_rw_prep_reg_vec(req);
}
int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
int ret;
ret = __io_prep_rw(req, sqe, ITER_SOURCE);
if (unlikely(ret))
return ret;
return io_rw_prep_reg_vec(req);
}
/*
* Multishot read is prepared just like a normal read/write request, only
* difference is that we set the MULTISHOT flag.
@ -856,7 +904,11 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
ssize_t ret;
loff_t *ppos;
if (io_do_buffer_select(req)) {
if (req->flags & REQ_F_IMPORT_BUFFER) {
ret = io_rw_import_reg_vec(req, io, ITER_DEST, issue_flags);
if (unlikely(ret))
return ret;
} else if (io_do_buffer_select(req)) {
ret = io_import_rw_buffer(ITER_DEST, req, io, issue_flags);
if (unlikely(ret < 0))
return ret;
@ -995,9 +1047,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
*/
if (io_kbuf_recycle(req, issue_flags))
rw->len = 0;
if (issue_flags & IO_URING_F_MULTISHOT)
return IOU_ISSUE_SKIP_COMPLETE;
return -EAGAIN;
return IOU_RETRY;
} else if (ret <= 0) {
io_kbuf_recycle(req, issue_flags);
if (ret < 0)
@ -1015,16 +1065,15 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
rw->len = 0; /* similarly to above, reset len to 0 */
if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
if (issue_flags & IO_URING_F_MULTISHOT) {
if (issue_flags & IO_URING_F_MULTISHOT)
/*
* Force retry, as we might have more data to
* be read and otherwise it won't get retried
* until (if ever) another poll is triggered.
*/
io_poll_multishot_retry(req);
return IOU_ISSUE_SKIP_COMPLETE;
}
return -EAGAIN;
return IOU_RETRY;
}
}
@ -1034,9 +1083,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
*/
io_req_set_res(req, ret, cflags);
io_req_rw_cleanup(req, issue_flags);
if (issue_flags & IO_URING_F_MULTISHOT)
return IOU_STOP_MULTISHOT;
return IOU_OK;
return IOU_COMPLETE;
}
static bool io_kiocb_start_write(struct io_kiocb *req, struct kiocb *kiocb)
@ -1067,6 +1114,12 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
ssize_t ret, ret2;
loff_t *ppos;
if (req->flags & REQ_F_IMPORT_BUFFER) {
ret = io_rw_import_reg_vec(req, io, ITER_SOURCE, issue_flags);
if (unlikely(ret))
return ret;
}
ret = io_rw_init_file(req, FMODE_WRITE, WRITE);
if (unlikely(ret))
return ret;
@ -1326,7 +1379,6 @@ void io_rw_cache_free(const void *entry)
{
struct io_async_rw *rw = (struct io_async_rw *) entry;
if (rw->free_iovec)
kfree(rw->free_iovec);
io_vec_free(&rw->vec);
kfree(rw);
}


@ -9,13 +9,13 @@ struct io_meta_state {
};
struct io_async_rw {
struct iou_vec vec;
size_t bytes_done;
struct iovec *free_iovec;
struct_group(clear,
struct iov_iter iter;
struct iov_iter_state iter_state;
struct iovec fast_iov;
int free_iov_nr;
/*
* wpq is for buffered io, while meta fields are used with
* direct io
@ -32,6 +32,8 @@ struct io_async_rw {
int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe);


@ -16,10 +16,19 @@
#include "rsrc.h"
#include "uring_cmd.h"
void io_cmd_cache_free(const void *entry)
{
struct io_async_cmd *ac = (struct io_async_cmd *)entry;
io_vec_free(&ac->vec);
kfree(ac);
}
static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
struct io_uring_cmd_data *cache = req->async_data;
struct io_async_cmd *ac = req->async_data;
struct io_uring_cmd_data *cache = &ac->data;
if (cache->op_data) {
kfree(cache->op_data);
@ -28,13 +37,23 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
if (issue_flags & IO_URING_F_UNLOCKED)
return;
if (io_alloc_cache_put(&req->ctx->uring_cache, cache)) {
io_alloc_cache_vec_kasan(&ac->vec);
if (ac->vec.nr > IO_VEC_CACHE_SOFT_CAP)
io_vec_free(&ac->vec);
if (io_alloc_cache_put(&req->ctx->cmd_cache, cache)) {
ioucmd->sqe = NULL;
req->async_data = NULL;
req->flags &= ~REQ_F_ASYNC_DATA;
req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP);
}
}
void io_uring_cmd_cleanup(struct io_kiocb *req)
{
io_req_uring_cleanup(req, 0);
}
bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
struct io_uring_task *tctx, bool cancel_all)
{
@ -169,12 +188,15 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
struct io_uring_cmd_data *cache;
struct io_async_cmd *ac;
cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req);
if (!cache)
/* see io_uring_cmd_get_async_data() */
BUILD_BUG_ON(offsetof(struct io_async_cmd, data) != 0);
ac = io_uring_alloc_async_data(&req->ctx->cmd_cache, req);
if (!ac)
return -ENOMEM;
cache->op_data = NULL;
ac->data.op_data = NULL;
/*
* Unconditionally cache the SQE for now - this is only needed for
@ -183,8 +205,8 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req,
* that it doesn't read in per-op data, play it safe and ensure that
* any SQE data is stable beyond prep. This can later get relaxed.
*/
memcpy(cache->sqes, sqe, uring_sqe_size(req->ctx));
ioucmd->sqe = cache->sqes;
memcpy(ac->data.sqes, sqe, uring_sqe_size(req->ctx));
ioucmd->sqe = ac->data.sqes;
return 0;
}
@ -255,6 +277,25 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
}
EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed);
int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
const struct iovec __user *uvec,
size_t uvec_segs,
int ddir, struct iov_iter *iter,
unsigned issue_flags)
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
struct io_async_cmd *ac = req->async_data;
int ret;
ret = io_prep_reg_iovec(req, &ac->vec, uvec, uvec_segs);
if (ret)
return ret;
return io_import_reg_vec(ddir, iter, req, &ac->vec, uvec_segs,
issue_flags);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed_vec);
void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);


@ -1,7 +1,24 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/io_uring/cmd.h>
#include <linux/io_uring_types.h>
struct io_async_cmd {
struct io_uring_cmd_data data;
struct iou_vec vec;
};
int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags);
int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
void io_uring_cmd_cleanup(struct io_kiocb *req);
bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
struct io_uring_task *tctx, bool cancel_all);
void io_cmd_cache_free(const void *entry);
int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
const struct iovec __user *uvec,
size_t uvec_segs,
int ddir, struct iov_iter *iter,
unsigned issue_flags);