diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index 0413dcd9ef69d..28300fee22bfc 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -382,17 +382,17 @@ Zero copy --------- ublk zero copy relies on io_uring's fixed kernel buffer, which provides -two APIs: `io_buffer_register_bvec()` and `io_buffer_unregister_bvec`. +two APIs: `io_buffer_register_request()` and `io_buffer_unregister()`. ublk adds IO command of `UBLK_IO_REGISTER_IO_BUF` to call -`io_buffer_register_bvec()` for ublk server to register client request +`io_buffer_register_request()` for ublk server to register client request buffer into io_uring buffer table, then ublk server can submit io_uring IOs with the registered buffer index. IO command of `UBLK_IO_UNREGISTER_IO_BUF` -calls `io_buffer_unregister_bvec()` to unregister the buffer, which is -guaranteed to be live between calling `io_buffer_register_bvec()` and -`io_buffer_unregister_bvec()`. Any io_uring operation which supports this -kind of kernel buffer will grab one reference of the buffer until the -operation is completed. +calls `io_buffer_unregister()` to unregister the buffer, which is guaranteed +to be live between calling `io_buffer_register_request()` and +`io_buffer_unregister()`. Any io_uring operation which supports this kind of +kernel buffer will grab one reference of the buffer until the operation is +completed. 
ublk server implementing zero copy or user copy has to be CAP_SYS_ADMIN and be trusted, because it is ublk server's responsibility to make sure IO buffer diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 4f6d9e6521878..4036eb6be0560 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1699,8 +1699,8 @@ ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req, { int ret; - ret = io_buffer_register_bvec(cmd, req, ublk_io_release, - io->buf.auto_reg.index, issue_flags); + ret = io_buffer_register_request(cmd, req, ublk_io_release, + io->buf.auto_reg.index, issue_flags); if (ret) { if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) { ublk_auto_buf_reg_fallback(ubq, req->tag); @@ -1906,7 +1906,7 @@ static noinline void ublk_batch_dispatch_fail(struct ublk_queue *ubq, ublk_io_unlock(io); if (index != -1) - io_buffer_unregister_bvec(data->cmd, index, + io_buffer_unregister(data->cmd, index, data->issue_flags); } @@ -3194,8 +3194,8 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd, if (!req) return -EINVAL; - ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, - issue_flags); + ret = io_buffer_register_request(cmd, req, ublk_io_release, index, + issue_flags); if (ret) { ublk_put_req_ref(io, req); return ret; @@ -3226,8 +3226,8 @@ ublk_daemon_register_io_buf(struct io_uring_cmd *cmd, if (!ublk_dev_support_zero_copy(ub) || !blk_rq_has_data(req)) return -EINVAL; - ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, - issue_flags); + ret = io_buffer_register_request(cmd, req, ublk_io_release, index, + issue_flags); if (ret) return ret; @@ -3242,7 +3242,7 @@ static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY)) return -EINVAL; - return io_buffer_unregister_bvec(cmd, index, issue_flags); + return io_buffer_unregister(cmd, index, issue_flags); } static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr) @@ 
-3383,7 +3383,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, goto out; /* - * io_buffer_unregister_bvec() doesn't access the ubq or io, + * io_buffer_unregister() doesn't access the ubq or io, * so no need to validate the q_id, tag, or task */ if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF) @@ -3450,7 +3450,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, req = ublk_fill_io_cmd(io, cmd); ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx); if (buf_idx != UBLK_INVALID_BUF_IDX) - io_buffer_unregister_bvec(cmd, buf_idx, issue_flags); + io_buffer_unregister(cmd, buf_idx, issue_flags); compl = ublk_need_complete_req(ub, io); if (req_op(req) == REQ_OP_ZONE_APPEND) @@ -3787,7 +3787,7 @@ static int ublk_batch_commit_io(struct ublk_queue *ubq, } if (buf_idx != UBLK_INVALID_BUF_IDX) - io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags); + io_buffer_unregister(data->cmd, buf_idx, data->issue_flags); if (req_op(req) == REQ_OP_ZONE_APPEND) req->__sector = ublk_batch_zone_lba(uc, elem); if (compl) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 331dcbefe72f1..42801f0b6456e 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -91,6 +91,15 @@ struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, struct io_br_sel *sel, unsigned int issue_flags); +int io_buffer_register_request(struct io_uring_cmd *cmd, struct request *rq, + void (*release)(void *), unsigned int index, + unsigned int issue_flags); +int io_buffer_register_bvec(struct io_uring_cmd *cmd, const struct bio_vec *bvs, + unsigned int nr_bvecs, void (*release)(void *), + void *priv, u8 dir, unsigned int index, + unsigned int issue_flags); +int io_buffer_unregister(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags); #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, @@ -133,6 
+142,29 @@ static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, { return true; } +static inline int io_buffer_register_request(struct io_uring_cmd *cmd, + struct request *rq, + void (*release)(void *), + unsigned int index, + unsigned int issue_flags) +{ + return -EOPNOTSUPP; +} +static inline int io_buffer_register_bvec(struct io_uring_cmd *cmd, + const struct bio_vec *bvs, + unsigned int nr_bvecs, + void (*release)(void *), void *priv, + u8 dir, unsigned int index, + unsigned int issue_flags) +{ + return -EOPNOTSUPP; +} +static inline int io_buffer_unregister(struct io_uring_cmd *cmd, + unsigned int index, + unsigned int issue_flags) +{ + return -EOPNOTSUPP; +} #endif static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req) @@ -182,10 +214,4 @@ static inline void io_uring_cmd_done32(struct io_uring_cmd *ioucmd, s32 ret, return __io_uring_cmd_done(ioucmd, ret, res2, issue_flags, true); } -int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, - void (*release)(void *), unsigned int index, - unsigned int issue_flags); -int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, - unsigned int issue_flags); - #endif /* _LINUX_IO_URING_CMD_H */ diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 6415a3353ee0e..db42a548c7a53 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -44,6 +44,11 @@ enum io_uring_cmd_flags { IO_URING_F_COMPAT = (1 << 12), }; +enum { + IO_BUF_DEST = 1 << ITER_DEST, + IO_BUF_SOURCE = 1 << ITER_SOURCE, +}; + struct iou_loop_params; struct io_wq_work_node { @@ -149,8 +154,6 @@ struct io_uring_task { struct { /* task_work */ struct mpscq task_list; - /* BIT(0) guards adding tw only once */ - unsigned long tw_pending; struct callback_head task_work; } ____cacheline_aligned_in_smp; }; diff --git a/io_uring/epoll.c b/io_uring/epoll.c index 8d4610246ba0a..eecd748cad018 100644 --- a/io_uring/epoll.c +++ 
b/io_uring/epoll.c @@ -51,10 +51,24 @@ int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) { struct io_epoll *ie = io_kiocb_to_cmd(req, struct io_epoll); - int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + struct epoll_key key; + int ret; + + CLASS(fd, f)(ie->epfd); + if (fd_empty(f)) + return -EBADF; + + CLASS(fd, tf)(ie->fd); + if (fd_empty(tf)) + return -EBADF; + /* disallow adding an epoll context to another epoll context */ + if (ie->op == EPOLL_CTL_ADD && is_file_epoll(fd_file(tf))) + return -EINVAL; - ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); + key.file = fd_file(tf); + key.fd = ie->fd; + ret = do_epoll_ctl_file(fd_file(f), ie->op, &key, &ie->event, force_nonblock); if (force_nonblock && ret == -EAGAIN) return -EAGAIN; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 1ea2fca34a36f..ba685b6052edb 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3245,7 +3245,7 @@ static int __init io_uring_init(void) io_uring_optable_init(); /* imu->dir is u8 */ - BUILD_BUG_ON((IO_IMU_DEST | IO_IMU_SOURCE) > U8_MAX); + BUILD_BUG_ON((IO_BUF_DEST | IO_BUF_SOURCE) > U8_MAX); /* * Allow user copy in the per-command field, which starts after the diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 4f9b439319c46..da1f6c5d07f8a 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -53,7 +53,7 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) nr_pages = end - start; if (WARN_ON_ONCE(!nr_pages)) return ERR_PTR(-EINVAL); - if (WARN_ON_ONCE(nr_pages > INT_MAX)) + if (nr_pages > INT_MAX / sizeof(struct page *)) return ERR_PTR(-EOVERFLOW); pages = kvmalloc_objs(struct page *, nr_pages, GFP_KERNEL_ACCOUNT); diff --git a/io_uring/mpscq.h b/io_uring/mpscq.h index c801384c6a0aa..f910526766fd8 100644 --- a/io_uring/mpscq.h +++ b/io_uring/mpscq.h @@ -122,4 +122,13 @@ static inline struct 
llist_node *mpscq_pop(struct mpscq *q, return NULL; } +/* + * Returns true if the most recent mpscq_pop() that returned a node also + * emptied the queue. Consumer must be serialized. + */ +static inline bool mpscq_pop_emptied(struct mpscq *q, struct llist_node *head) +{ + return head == &q->stub; +} + #endif /* IOU_MPSCQ_H */ diff --git a/io_uring/nop.c b/io_uring/nop.c index 91ae0b2e7e556..60ab19604b36f 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -40,6 +40,8 @@ int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) nop->fd = READ_ONCE(sqe->fd); else nop->fd = -1; + if (nop->flags & IORING_NOP_FIXED_FILE) + req->flags |= REQ_F_FIXED_FILE; if (nop->flags & IORING_NOP_FIXED_BUFFER) req->buf_index = READ_ONCE(sqe->buf_index); if (nop->flags & IORING_NOP_CQE32) { @@ -59,12 +61,10 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags) int ret = nop->result; if (nop->flags & IORING_NOP_FILE) { - if (nop->flags & IORING_NOP_FIXED_FILE) { + if (req->flags & REQ_F_FIXED_FILE) req->file = io_file_get_fixed(req, nop->fd, issue_flags); - req->flags |= REQ_F_FIXED_FILE; - } else { + else req->file = io_file_get_normal(req, nop->fd); - } if (!req->file) { ret = -EBADF; goto done; diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 88a45c7d897f2..4e58eb1344eaa 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -520,6 +520,7 @@ const struct io_issue_def io_issue_defs[] = { #endif }, [IORING_OP_RECV_ZC] = { + .audit_skip = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, diff --git a/io_uring/register.c b/io_uring/register.c index dce5e2f9cf770..02bc103bcc9d5 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -503,6 +503,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) unsigned i, tail, old_head; struct io_uring_params *p = &config.p; struct io_rings_layout *rl = &config.layout; + u32 *o_sq_array, *n_sq_array = NULL; int ret; memset(&config, 0, sizeof(config)); @@ -589,6 +590,9 @@ static 
int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) ctx->rings = NULL; o.sq_sqes = ctx->sq_sqes; ctx->sq_sqes = NULL; + o_sq_array = ctx->sq_array; + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) + n_sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset); /* * Now copy SQ and CQ entries, if any. If either of the destination @@ -599,20 +603,27 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) if (tail - old_head > p->sq_entries) goto overflow; for (i = old_head; i < tail; i++) { - unsigned index, dst_mask, src_mask; + unsigned int dst, src; size_t sq_size; - index = i; + dst = i & (p->sq_entries - 1); + src = i & (ctx->sq_entries - 1); + if (n_sq_array) { + src = READ_ONCE(o_sq_array[src]); + if (unlikely(src >= ctx->sq_entries)) { + WRITE_ONCE(n_sq_array[dst], UINT_MAX); + continue; + } + WRITE_ONCE(n_sq_array[dst], dst); + } + sq_size = sizeof(struct io_uring_sqe); - src_mask = ctx->sq_entries - 1; - dst_mask = p->sq_entries - 1; if (ctx->flags & IORING_SETUP_SQE128) { - index <<= 1; + dst <<= 1; + src <<= 1; sq_size <<= 1; - src_mask = (ctx->sq_entries << 1) - 1; - dst_mask = (p->sq_entries << 1) - 1; } - memcpy(&n.sq_sqes[index & dst_mask], &o.sq_sqes[index & src_mask], sq_size); + memcpy(&n.sq_sqes[dst], &o.sq_sqes[src], sq_size); } WRITE_ONCE(n.rings->sq.head, old_head); WRITE_ONCE(n.rings->sq.tail, tail); @@ -655,8 +666,8 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow)); /* all done, store old pointers and assign new ones */ - if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) - ctx->sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset); + if (n_sq_array) + ctx->sq_array = n_sq_array; ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 8d0f2ee24e0c2..f3f01e0c8102f 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -912,7 +912,7 @@ static 
struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, imu->release = io_release_ubuf; imu->priv = imu; imu->flags = 0; - imu->dir = IO_IMU_DEST | IO_IMU_SOURCE; + imu->dir = IO_BUF_DEST | IO_BUF_SOURCE; if (coalesced) imu->folio_shift = data.folio_shift; refcount_set(&imu->refs, 1); @@ -1015,71 +1015,124 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } -int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, - void (*release)(void *), unsigned int index, - unsigned int issue_flags) +static struct io_mapped_ubuf *io_kernel_buffer_init(struct io_ring_ctx *ctx, + unsigned int nr_bvecs, + unsigned int total_bytes, + u8 dir, + void (*release)(void *), + void *priv, + unsigned int index) { - struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; struct io_rsrc_data *data = &ctx->buf_table; - struct req_iterator rq_iter; struct io_mapped_ubuf *imu; struct io_rsrc_node *node; - struct bio_vec bv; - unsigned int nr_bvecs = 0; - int ret = 0; - io_ring_submit_lock(ctx, issue_flags); - if (index >= data->nr) { - ret = -EINVAL; - goto unlock; - } + if (index >= data->nr) + return ERR_PTR(-EINVAL); index = array_index_nospec(index, data->nr); - if (data->nodes[index]) { - ret = -EBUSY; - goto unlock; - } + if (data->nodes[index]) + return ERR_PTR(-EBUSY); node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); - if (!node) { - ret = -ENOMEM; - goto unlock; - } + if (!node) + return ERR_PTR(-ENOMEM); - /* - * blk_rq_nr_phys_segments() may overestimate the number of bvecs - * but avoids needing to iterate over the bvecs - */ - imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq)); + imu = io_alloc_imu(ctx, nr_bvecs); if (!imu) { io_cache_free(&ctx->node_cache, node); - ret = -ENOMEM; - goto unlock; + return ERR_PTR(-ENOMEM); } imu->ubuf = 0; - imu->len = blk_rq_bytes(rq); + imu->len = total_bytes; imu->folio_shift = PAGE_SHIFT; + imu->nr_bvecs = nr_bvecs; refcount_set(&imu->refs, 1); imu->release = release; - 
imu->priv = rq; + imu->priv = priv; + imu->dir = dir; imu->flags = IO_REGBUF_F_KBUF; - imu->dir = 1 << rq_data_dir(rq); + node->buf = imu; + data->nodes[index] = node; + + return imu; +} + +int io_buffer_register_request(struct io_uring_cmd *cmd, struct request *rq, + void (*release)(void *), unsigned int index, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct req_iterator rq_iter; + struct io_mapped_ubuf *imu; + struct bio_vec bv; + /* + * blk_rq_nr_phys_segments() may overestimate the number of bvecs + * but avoids needing to iterate over the bvecs + */ + unsigned int nr_bvecs = blk_rq_nr_phys_segments(rq); + unsigned int total_bytes = blk_rq_bytes(rq); + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + + imu = io_kernel_buffer_init(ctx, nr_bvecs, total_bytes, + 1 << rq_data_dir(rq), release, rq, index); + if (IS_ERR(imu)) { + ret = PTR_ERR(imu); + goto unlock; + } + + nr_bvecs = 0; rq_for_each_bvec(bv, rq, rq_iter) imu->bvec[nr_bvecs++] = bv; imu->nr_bvecs = nr_bvecs; - node->buf = imu; - data->nodes[index] = node; +unlock: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} +EXPORT_SYMBOL_GPL(io_buffer_register_request); + +/* + * bvs is copied internally. caller may free it on return. 
+ */ +int io_buffer_register_bvec(struct io_uring_cmd *cmd, const struct bio_vec *bvs, + unsigned int nr_bvecs, void (*release)(void *), + void *priv, u8 dir, unsigned int index, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct io_mapped_ubuf *imu; + struct bio_vec *bvec; + unsigned int i, total_bytes = 0; + int ret = 0; + + for (i = 0; i < nr_bvecs; i++) + total_bytes += bvs[i].bv_len; + + io_ring_submit_lock(ctx, issue_flags); + imu = io_kernel_buffer_init(ctx, nr_bvecs, total_bytes, dir, release, + priv, index); + if (IS_ERR(imu)) { + ret = PTR_ERR(imu); + goto unlock; + } + + bvec = imu->bvec; + for (i = 0; i < nr_bvecs; i++) + bvec[i] = bvs[i]; + unlock: io_ring_submit_unlock(ctx, issue_flags); return ret; } EXPORT_SYMBOL_GPL(io_buffer_register_bvec); -int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, - unsigned int issue_flags) +int io_buffer_unregister(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags) { struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; struct io_rsrc_data *data = &ctx->buf_table; @@ -1109,7 +1162,7 @@ int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, io_ring_submit_unlock(ctx, issue_flags); return ret; } -EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec); +EXPORT_SYMBOL_GPL(io_buffer_unregister); static int validate_fixed_range(u64 buf_addr, size_t len, const struct io_mapped_ubuf *imu) diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 98ae8ef51009d..e503b02aa61a3 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -23,11 +23,6 @@ struct io_rsrc_node { }; }; -enum { - IO_IMU_DEST = 1 << ITER_DEST, - IO_IMU_SOURCE = 1 << ITER_SOURCE, -}; - enum { IO_REGBUF_F_KBUF = 1, }; diff --git a/io_uring/rw.c b/io_uring/rw.c index 0c48346452797..63b6519e498cd 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -601,15 +601,15 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) { struct io_rw *rw = container_of(kiocb, 
struct io_rw, kiocb); struct io_kiocb *req = cmd_to_io_kiocb(rw); + int final_res = io_fixup_rw_res(req, res); if (kiocb->ki_flags & IOCB_WRITE) io_req_end_write(req); - if (unlikely(res != req->cqe.res)) { - if (res == -EAGAIN && io_rw_should_reissue(req)) - req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; - else - req->cqe.res = res; - } + + if (res == -EAGAIN && io_rw_should_reissue(req)) + req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; + else if (unlikely(final_res != req->cqe.res)) + req->cqe.res = final_res; /* order with io_iopoll_complete() checking ->iopoll_completed */ smp_store_release(&req->iopoll_completed, 1); diff --git a/io_uring/tw.c b/io_uring/tw.c index e74372233f40b..a4c872870d81c 100644 --- a/io_uring/tw.c +++ b/io_uring/tw.c @@ -34,10 +34,6 @@ void io_tctx_fallback_work(struct work_struct *work) fallback_work); unsigned int count = 0; - /* see tctx_task_work() - a set bit must always have a run coming */ - clear_bit(0, &tctx->tw_pending); - smp_mb__after_atomic(); - /* * Run the entries directly. We're in PF_KTHRED context, hence * io_should_terminate_tw() is true and they will be marked as @@ -55,7 +51,7 @@ static void io_fallback_tw(struct io_uring_task *tctx) * the queued work) stay around until the drain has run. */ get_task_struct(tctx->task); - if (!queue_work(system_unbound_wq, &tctx->fallback_work)) + if (!queue_work(system_dfl_wq, &tctx->fallback_work)) put_task_struct(tctx->task); } @@ -101,6 +97,13 @@ void tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, io_poll_task_func, io_req_rw_complete, (struct io_tw_req){req}, ts); (*count)++; + /* + * Break if most recent pop emptied the queue. This helps + * bound task_work run, and also protects the regular + * task_work addition. 
+ */ + if (mpscq_pop_emptied(&tctx->task_list, tctx->task_head)) + break; if (unlikely(need_resched())) { ctx_flush_and_put(ctx, ts); ctx = NULL; @@ -127,8 +130,6 @@ void tctx_task_work(struct callback_head *cb) unsigned int count = 0; tctx = container_of(cb, struct io_uring_task, task_work); - clear_bit(0, &tctx->tw_pending); - smp_mb__after_atomic(); tctx_task_work_run(tctx, UINT_MAX, &count); } @@ -206,7 +207,7 @@ void io_req_normal_work_add(struct io_kiocb *req) struct io_uring_task *tctx = req->tctx; struct io_ring_ctx *ctx = req->ctx; - /* task_work already pending, we're done */ + /* tw run already pending, nothing else to do */ if (!mpscq_push(&tctx->task_list, &req->io_task_work.node)) return; @@ -223,10 +224,6 @@ void io_req_normal_work_add(struct io_kiocb *req) return; } - /* task_work must only be added once */ - if (test_and_set_bit(0, &tctx->tw_pending)) - return; - if (likely(!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method))) return;