Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions Documentation/block/ublk.rst
Original file line number Diff line number Diff line change
Expand Up @@ -382,17 +382,17 @@ Zero copy
---------

ublk zero copy relies on io_uring's fixed kernel buffer, which provides
two APIs: `io_buffer_register_bvec()` and `io_buffer_unregister_bvec`.
two APIs: `io_buffer_register_request()` and `io_buffer_unregister()`.

ublk adds IO command of `UBLK_IO_REGISTER_IO_BUF` to call
`io_buffer_register_bvec()` for ublk server to register client request
`io_buffer_register_request()` for ublk server to register client request
buffer into io_uring buffer table, then ublk server can submit io_uring
IOs with the registered buffer index. IO command of `UBLK_IO_UNREGISTER_IO_BUF`
calls `io_buffer_unregister_bvec()` to unregister the buffer, which is
guaranteed to be live between calling `io_buffer_register_bvec()` and
`io_buffer_unregister_bvec()`. Any io_uring operation which supports this
kind of kernel buffer will grab one reference of the buffer until the
operation is completed.
calls `io_buffer_unregister()` to unregister the buffer, which is guaranteed
to be live between calling `io_buffer_register_request()` and
`io_buffer_unregister()`. Any io_uring operation which supports this kind of
kernel buffer will grab one reference of the buffer until the operation is
completed.

ublk server implementing zero copy or user copy has to be CAP_SYS_ADMIN and
be trusted, because it is ublk server's responsibility to make sure IO buffer
Expand Down
22 changes: 11 additions & 11 deletions drivers/block/ublk_drv.c
Original file line number Diff line number Diff line change
Expand Up @@ -1699,8 +1699,8 @@ ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req,
{
int ret;

ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
io->buf.auto_reg.index, issue_flags);
ret = io_buffer_register_request(cmd, req, ublk_io_release,
io->buf.auto_reg.index, issue_flags);
if (ret) {
if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
ublk_auto_buf_reg_fallback(ubq, req->tag);
Expand Down Expand Up @@ -1906,7 +1906,7 @@ static noinline void ublk_batch_dispatch_fail(struct ublk_queue *ubq,
ublk_io_unlock(io);

if (index != -1)
io_buffer_unregister_bvec(data->cmd, index,
io_buffer_unregister(data->cmd, index,
data->issue_flags);
}

Expand Down Expand Up @@ -3194,8 +3194,8 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
if (!req)
return -EINVAL;

ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
issue_flags);
ret = io_buffer_register_request(cmd, req, ublk_io_release, index,
issue_flags);
if (ret) {
ublk_put_req_ref(io, req);
return ret;
Expand Down Expand Up @@ -3226,8 +3226,8 @@ ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
if (!ublk_dev_support_zero_copy(ub) || !blk_rq_has_data(req))
return -EINVAL;

ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
issue_flags);
ret = io_buffer_register_request(cmd, req, ublk_io_release, index,
issue_flags);
if (ret)
return ret;

Expand All @@ -3242,7 +3242,7 @@ static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
return -EINVAL;

return io_buffer_unregister_bvec(cmd, index, issue_flags);
return io_buffer_unregister(cmd, index, issue_flags);
}

static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
Expand Down Expand Up @@ -3383,7 +3383,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
goto out;

/*
* io_buffer_unregister_bvec() doesn't access the ubq or io,
* io_buffer_unregister() doesn't access the ubq or io,
* so no need to validate the q_id, tag, or task
*/
if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
Expand Down Expand Up @@ -3450,7 +3450,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
req = ublk_fill_io_cmd(io, cmd);
ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
if (buf_idx != UBLK_INVALID_BUF_IDX)
io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
io_buffer_unregister(cmd, buf_idx, issue_flags);
compl = ublk_need_complete_req(ub, io);

if (req_op(req) == REQ_OP_ZONE_APPEND)
Expand Down Expand Up @@ -3787,7 +3787,7 @@ static int ublk_batch_commit_io(struct ublk_queue *ubq,
}

if (buf_idx != UBLK_INVALID_BUF_IDX)
io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags);
io_buffer_unregister(data->cmd, buf_idx, data->issue_flags);
if (req_op(req) == REQ_OP_ZONE_APPEND)
req->__sector = ublk_batch_zone_lba(uc, elem);
if (compl)
Expand Down
38 changes: 32 additions & 6 deletions include/linux/io_uring/cmd.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,15 @@ struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd,
bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
struct io_br_sel *sel, unsigned int issue_flags);

/*
 * Kernel buffer registration helpers available to uring_cmd users.
 *
 * io_buffer_register_request() takes a block request @rq and a buffer table
 * slot @index; io_buffer_register_bvec() takes a caller-supplied bio_vec
 * array (@bvs, @nr_bvecs) plus an opaque @priv and a direction @dir instead
 * of a request. io_buffer_unregister() takes only the slot @index.
 *
 * All three return an int status; the stubs in the #else branch below return
 * -EOPNOTSUPP.
 *
 * NOTE(review): @release is presumably invoked once the registered buffer is
 * no longer referenced, and @dir presumably is a mask of IO_BUF_DEST /
 * IO_BUF_SOURCE - neither is visible from this header, confirm against the
 * implementation.
 */
int io_buffer_register_request(struct io_uring_cmd *cmd, struct request *rq,
void (*release)(void *), unsigned int index,
unsigned int issue_flags);
int io_buffer_register_bvec(struct io_uring_cmd *cmd, const struct bio_vec *bvs,
unsigned int nr_bvecs, void (*release)(void *),
void *priv, u8 dir, unsigned int index,
unsigned int issue_flags);
int io_buffer_unregister(struct io_uring_cmd *cmd, unsigned int index,
unsigned int issue_flags);
#else
static inline int
io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
Expand Down Expand Up @@ -133,6 +142,29 @@ static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
{
return true;
}
/*
 * Stub used in the #else branch of this header: request buffer registration
 * is unavailable, so every call fails with -EOPNOTSUPP and no argument is
 * examined.
 */
static inline int
io_buffer_register_request(struct io_uring_cmd *cmd, struct request *rq,
			   void (*release)(void *), unsigned int index,
			   unsigned int issue_flags)
{
	return -EOPNOTSUPP;
}
/*
 * Stub used in the #else branch of this header: bio_vec buffer registration
 * is unavailable, so every call fails with -EOPNOTSUPP and no argument is
 * examined.
 */
static inline int
io_buffer_register_bvec(struct io_uring_cmd *cmd, const struct bio_vec *bvs,
			unsigned int nr_bvecs, void (*release)(void *),
			void *priv, u8 dir, unsigned int index,
			unsigned int issue_flags)
{
	return -EOPNOTSUPP;
}
/*
 * Stub used in the #else branch of this header: there is no buffer table to
 * remove an entry from, so every call fails with -EOPNOTSUPP.
 */
static inline int
io_buffer_unregister(struct io_uring_cmd *cmd, unsigned int index,
		     unsigned int issue_flags)
{
	return -EOPNOTSUPP;
}
#endif

static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req)
Expand Down Expand Up @@ -182,10 +214,4 @@ static inline void io_uring_cmd_done32(struct io_uring_cmd *ioucmd, s32 ret,
return __io_uring_cmd_done(ioucmd, ret, res2, issue_flags, true);
}

int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
void (*release)(void *), unsigned int index,
unsigned int issue_flags);
int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
unsigned int issue_flags);

#endif /* _LINUX_IO_URING_CMD_H */
7 changes: 5 additions & 2 deletions include/linux/io_uring_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ enum io_uring_cmd_flags {
IO_URING_F_COMPAT = (1 << 12),
};

/*
 * Buffer direction flags: one bit per ITER_* direction value, so both
 * directions can be OR'ed into a single mask. io_uring_init() build-asserts
 * that the combined mask fits in a u8.
 */
enum {
IO_BUF_DEST = 1 << ITER_DEST,
IO_BUF_SOURCE = 1 << ITER_SOURCE,
};

struct iou_loop_params;

struct io_wq_work_node {
Expand Down Expand Up @@ -149,8 +154,6 @@ struct io_uring_task {

struct { /* task_work */
struct mpscq task_list;
/* BIT(0) guards adding tw only once */
unsigned long tw_pending;
struct callback_head task_work;
} ____cacheline_aligned_in_smp;
};
Expand Down
18 changes: 16 additions & 2 deletions io_uring/epoll.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,24 @@ int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_epoll *ie = io_kiocb_to_cmd(req, struct io_epoll);
int ret;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
struct epoll_key key;
int ret;

CLASS(fd, f)(ie->epfd);
if (fd_empty(f))
return -EBADF;

CLASS(fd, tf)(ie->fd);
if (fd_empty(tf))
return -EBADF;
/* disallow adding an epoll context to another epoll context */
if (ie->op == EPOLL_CTL_ADD && is_file_epoll(fd_file(tf)))
return -EINVAL;

ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
key.file = fd_file(tf);
key.fd = ie->fd;
ret = do_epoll_ctl_file(fd_file(f), ie->op, &key, &ie->event, force_nonblock);
if (force_nonblock && ret == -EAGAIN)
return -EAGAIN;

Expand Down
2 changes: 1 addition & 1 deletion io_uring/io_uring.c
Original file line number Diff line number Diff line change
Expand Up @@ -3245,7 +3245,7 @@ static int __init io_uring_init(void)
io_uring_optable_init();

/* imu->dir is u8 */
BUILD_BUG_ON((IO_IMU_DEST | IO_IMU_SOURCE) > U8_MAX);
BUILD_BUG_ON((IO_BUF_DEST | IO_BUF_SOURCE) > U8_MAX);

/*
* Allow user copy in the per-command field, which starts after the
Expand Down
2 changes: 1 addition & 1 deletion io_uring/memmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
nr_pages = end - start;
if (WARN_ON_ONCE(!nr_pages))
return ERR_PTR(-EINVAL);
if (WARN_ON_ONCE(nr_pages > INT_MAX))
if (nr_pages > INT_MAX / sizeof(struct page *))
return ERR_PTR(-EOVERFLOW);

pages = kvmalloc_objs(struct page *, nr_pages, GFP_KERNEL_ACCOUNT);
Expand Down
9 changes: 9 additions & 0 deletions io_uring/mpscq.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,4 +122,13 @@ static inline struct llist_node *mpscq_pop(struct mpscq *q,
return NULL;
}

/*
 * Returns true if the most recent mpscq_pop() that returned a node also
 * emptied the queue. Consumer must be serialized.
 *
 * @q:    queue that was popped from
 * @head: compared against the address of @q's embedded stub node; the result
 *        is true only when the two are equal.
 *        NOTE(review): presumably this is the head value handed back by the
 *        preceding mpscq_pop() call - its full signature is not visible
 *        here, confirm against the caller.
 *
 * Only pointers are compared; nothing is dereferenced or modified.
 */
static inline bool mpscq_pop_emptied(struct mpscq *q, struct llist_node *head)
{
return head == &q->stub;
}

#endif /* IOU_MPSCQ_H */
8 changes: 4 additions & 4 deletions io_uring/nop.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
nop->fd = READ_ONCE(sqe->fd);
else
nop->fd = -1;
if (nop->flags & IORING_NOP_FIXED_FILE)
req->flags |= REQ_F_FIXED_FILE;
if (nop->flags & IORING_NOP_FIXED_BUFFER)
req->buf_index = READ_ONCE(sqe->buf_index);
if (nop->flags & IORING_NOP_CQE32) {
Expand All @@ -59,12 +61,10 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags)
int ret = nop->result;

if (nop->flags & IORING_NOP_FILE) {
if (nop->flags & IORING_NOP_FIXED_FILE) {
if (req->flags & REQ_F_FIXED_FILE)
req->file = io_file_get_fixed(req, nop->fd, issue_flags);
req->flags |= REQ_F_FIXED_FILE;
} else {
else
req->file = io_file_get_normal(req, nop->fd);
}
if (!req->file) {
ret = -EBADF;
goto done;
Expand Down
1 change: 1 addition & 0 deletions io_uring/opdef.c
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,7 @@ const struct io_issue_def io_issue_defs[] = {
#endif
},
[IORING_OP_RECV_ZC] = {
.audit_skip = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
Expand Down
31 changes: 21 additions & 10 deletions io_uring/register.c
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
unsigned i, tail, old_head;
struct io_uring_params *p = &config.p;
struct io_rings_layout *rl = &config.layout;
u32 *o_sq_array, *n_sq_array = NULL;
int ret;

memset(&config, 0, sizeof(config));
Expand Down Expand Up @@ -589,6 +590,9 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
ctx->rings = NULL;
o.sq_sqes = ctx->sq_sqes;
ctx->sq_sqes = NULL;
o_sq_array = ctx->sq_array;
if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
n_sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset);

/*
* Now copy SQ and CQ entries, if any. If either of the destination
Expand All @@ -599,20 +603,27 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
if (tail - old_head > p->sq_entries)
goto overflow;
for (i = old_head; i < tail; i++) {
unsigned index, dst_mask, src_mask;
unsigned int dst, src;
size_t sq_size;

index = i;
dst = i & (p->sq_entries - 1);
src = i & (ctx->sq_entries - 1);
if (n_sq_array) {
src = READ_ONCE(o_sq_array[src]);
if (unlikely(src >= ctx->sq_entries)) {
WRITE_ONCE(n_sq_array[dst], UINT_MAX);
continue;
}
WRITE_ONCE(n_sq_array[dst], dst);
}

sq_size = sizeof(struct io_uring_sqe);
src_mask = ctx->sq_entries - 1;
dst_mask = p->sq_entries - 1;
if (ctx->flags & IORING_SETUP_SQE128) {
index <<= 1;
dst <<= 1;
src <<= 1;
sq_size <<= 1;
src_mask = (ctx->sq_entries << 1) - 1;
dst_mask = (p->sq_entries << 1) - 1;
}
memcpy(&n.sq_sqes[index & dst_mask], &o.sq_sqes[index & src_mask], sq_size);
memcpy(&n.sq_sqes[dst], &o.sq_sqes[src], sq_size);
}
WRITE_ONCE(n.rings->sq.head, old_head);
WRITE_ONCE(n.rings->sq.tail, tail);
Expand Down Expand Up @@ -655,8 +666,8 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

/* all done, store old pointers and assign new ones */
if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
ctx->sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset);
if (n_sq_array)
ctx->sq_array = n_sq_array;

ctx->sq_entries = p->sq_entries;
ctx->cq_entries = p->cq_entries;
Expand Down
Loading
Loading