// SPDX-License-Identifier: GPL-2.0 /* * FUSE: Filesystem in Userspace * Copyright (c) 2023-2024 DataDirect Networks. */ #include "fuse_i.h" #include "dev_uring_i.h" #include "fuse_dev_i.h" #include #include static bool __read_mostly enable_uring; module_param(enable_uring, bool, 0644); MODULE_PARM_DESC(enable_uring, "Enable userspace communication through io-uring"); #define FUSE_URING_IOV_SEGS 2 /* header and payload */ bool fuse_uring_enabled(void) { return enable_uring; } struct fuse_uring_pdu { struct fuse_ring_ent *ent; }; static const struct fuse_iqueue_ops fuse_io_uring_ops; static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd, struct fuse_ring_ent *ring_ent) { struct fuse_uring_pdu *pdu = io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); pdu->ent = ring_ent; } static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) { struct fuse_uring_pdu *pdu = io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); return pdu->ent; } static void fuse_uring_flush_bg(struct fuse_ring_queue *queue) { struct fuse_ring *ring = queue->ring; struct fuse_conn *fc = ring->fc; lockdep_assert_held(&queue->lock); lockdep_assert_held(&fc->bg_lock); /* * Allow one bg request per queue, ignoring global fc limits. * This prevents a single queue from consuming all resources and * eliminates the need for remote queue wake-ups when global * limits are met but this queue has no more waiting requests. */ while ((fc->active_background < fc->max_background || !queue->active_background) && (!list_empty(&queue->fuse_req_bg_queue))) { struct fuse_req *req; req = list_first_entry(&queue->fuse_req_bg_queue, struct fuse_req, list); fc->active_background++; queue->active_background++; list_move_tail(&req->list, &queue->fuse_req_queue); } } static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, int error) { struct fuse_ring_queue *queue = ent->queue; struct fuse_ring *ring = queue->ring; struct fuse_conn *fc = ring->fc; lockdep_assert_not_held(&queue->lock); spin_lock(&queue->lock); ent->fuse_req = NULL; if (test_bit(FR_BACKGROUND, &req->flags)) { queue->active_background--; spin_lock(&fc->bg_lock); fuse_uring_flush_bg(queue); spin_unlock(&fc->bg_lock); } spin_unlock(&queue->lock); if (error) req->out.h.error = error; clear_bit(FR_SENT, &req->flags); fuse_request_end(req); } /* Abort all list queued request on the given ring queue */ static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue) { struct fuse_req *req; LIST_HEAD(req_list); spin_lock(&queue->lock); list_for_each_entry(req, &queue->fuse_req_queue, list) clear_bit(FR_PENDING, &req->flags); list_splice_init(&queue->fuse_req_queue, &req_list); spin_unlock(&queue->lock); /* must not hold queue lock to avoid order issues with fi->lock */ fuse_dev_end_requests(&req_list); } void fuse_uring_abort_end_requests(struct fuse_ring *ring) { int qid; struct fuse_ring_queue *queue; struct fuse_conn *fc = ring->fc; for (qid = 0; qid < ring->nr_queues; qid++) { queue = READ_ONCE(ring->queues[qid]); if (!queue) continue; queue->stopped = true; WARN_ON_ONCE(ring->fc->max_background != UINT_MAX); spin_lock(&queue->lock); spin_lock(&fc->bg_lock); fuse_uring_flush_bg(queue); spin_unlock(&fc->bg_lock); spin_unlock(&queue->lock); fuse_uring_abort_end_queue_requests(queue); } } void fuse_uring_destruct(struct fuse_conn *fc) { struct fuse_ring *ring = fc->ring; int qid; if (!ring) return; for (qid = 0; qid < ring->nr_queues; qid++) { struct fuse_ring_queue *queue = ring->queues[qid]; struct fuse_ring_ent *ent, *next; if (!queue) continue; WARN_ON(!list_empty(&queue->ent_avail_queue)); WARN_ON(!list_empty(&queue->ent_w_req_queue)); WARN_ON(!list_empty(&queue->ent_commit_queue)); WARN_ON(!list_empty(&queue->ent_in_userspace)); list_for_each_entry_safe(ent, next, &queue->ent_released, list) { list_del_init(&ent->list); kfree(ent); } kfree(queue->fpq.processing); kfree(queue); ring->queues[qid] = NULL; } kfree(ring->queues); kfree(ring); fc->ring = NULL; } /* * Basic ring setup for this connection based on the provided configuration */ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) { struct fuse_ring *ring; size_t nr_queues = num_possible_cpus(); struct fuse_ring *res = NULL; size_t max_payload_size; ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT); if (!ring) return NULL; ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *), GFP_KERNEL_ACCOUNT); if (!ring->queues) goto out_err; max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write); max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE); spin_lock(&fc->lock); if (fc->ring) { /* race, another thread created the ring in the meantime */ spin_unlock(&fc->lock); res = fc->ring; goto out_err; } init_waitqueue_head(&ring->stop_waitq); fc->ring = ring; ring->nr_queues = nr_queues; ring->fc = fc; ring->max_payload_sz = max_payload_size; atomic_set(&ring->queue_refs, 0); spin_unlock(&fc->lock); return ring; out_err: kfree(ring->queues); kfree(ring); return res; } static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, int qid) { struct fuse_conn *fc = ring->fc; struct fuse_ring_queue *queue; struct list_head *pq; queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT); if (!queue) return NULL; pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); if (!pq) { kfree(queue); return NULL; } queue->qid = qid; queue->ring = ring; spin_lock_init(&queue->lock); INIT_LIST_HEAD(&queue->ent_avail_queue); INIT_LIST_HEAD(&queue->ent_commit_queue); INIT_LIST_HEAD(&queue->ent_w_req_queue); INIT_LIST_HEAD(&queue->ent_in_userspace); INIT_LIST_HEAD(&queue->fuse_req_queue); INIT_LIST_HEAD(&queue->fuse_req_bg_queue); INIT_LIST_HEAD(&queue->ent_released); queue->fpq.processing = pq; fuse_pqueue_init(&queue->fpq); spin_lock(&fc->lock); if (ring->queues[qid]) { spin_unlock(&fc->lock); kfree(queue->fpq.processing); kfree(queue); return ring->queues[qid]; } /* * write_once and lock as the caller mostly doesn't take the lock at all */ WRITE_ONCE(ring->queues[qid], queue); spin_unlock(&fc->lock); return queue; } static void fuse_uring_stop_fuse_req_end(struct fuse_req *req) { clear_bit(FR_SENT, &req->flags); req->out.h.error = -ECONNABORTED; fuse_request_end(req); } /* * Release a request/entry on connection tear down */ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) { struct fuse_req *req; struct io_uring_cmd *cmd; struct fuse_ring_queue *queue = ent->queue; spin_lock(&queue->lock); cmd = ent->cmd; ent->cmd = NULL; req = ent->fuse_req; ent->fuse_req = NULL; if (req) { /* remove entry from queue->fpq->processing */ list_del_init(&req->list); } /* * The entry must not be freed immediately, due to access of direct * pointer access of entries through IO_URING_F_CANCEL - there is a risk * of race between daemon termination (which triggers IO_URING_F_CANCEL * and accesses entries without checking the list state first */ list_move(&ent->list, &queue->ent_released); ent->state = FRRS_RELEASED; spin_unlock(&queue->lock); if (cmd) io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED); if (req) fuse_uring_stop_fuse_req_end(req); } static void fuse_uring_stop_list_entries(struct list_head *head, struct fuse_ring_queue *queue, enum fuse_ring_req_state exp_state) { struct fuse_ring *ring = queue->ring; struct fuse_ring_ent *ent, *next; ssize_t queue_refs = SSIZE_MAX; LIST_HEAD(to_teardown); spin_lock(&queue->lock); list_for_each_entry_safe(ent, next, head, list) { if (ent->state != exp_state) { pr_warn("entry teardown qid=%d state=%d expected=%d", queue->qid, ent->state, exp_state); continue; } ent->state = FRRS_TEARDOWN; list_move(&ent->list, &to_teardown); } spin_unlock(&queue->lock); /* no queue lock to avoid lock order issues */ list_for_each_entry_safe(ent, next, &to_teardown, list) { fuse_uring_entry_teardown(ent); queue_refs = atomic_dec_return(&ring->queue_refs); WARN_ON_ONCE(queue_refs < 0); } } static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue) { fuse_uring_stop_list_entries(&queue->ent_in_userspace, queue, FRRS_USERSPACE); fuse_uring_stop_list_entries(&queue->ent_avail_queue, queue, FRRS_AVAILABLE); } /* * Log state debug info */ static void fuse_uring_log_ent_state(struct fuse_ring *ring) { int qid; struct fuse_ring_ent *ent; for (qid = 0; qid < ring->nr_queues; qid++) { struct fuse_ring_queue *queue = ring->queues[qid]; if (!queue) continue; spin_lock(&queue->lock); /* * Log entries from the intermediate queue, the other queues * should be empty */ list_for_each_entry(ent, &queue->ent_w_req_queue, list) { pr_info(" ent-req-queue ring=%p qid=%d ent=%p state=%d\n", ring, qid, ent, ent->state); } list_for_each_entry(ent, &queue->ent_commit_queue, list) { pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n", ring, qid, ent, ent->state); } spin_unlock(&queue->lock); } ring->stop_debug_log = 1; } static void fuse_uring_async_stop_queues(struct work_struct *work) { int qid; struct fuse_ring *ring = container_of(work, struct fuse_ring, async_teardown_work.work); /* XXX code dup */ for (qid = 0; qid < ring->nr_queues; qid++) { struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); if (!queue) continue; fuse_uring_teardown_entries(queue); } /* * Some ring entries might be in the middle of IO operations, * i.e. in process to get handled by file_operations::uring_cmd * or on the way to userspace - we could handle that with conditions in * run time code, but easier/cleaner to have an async tear down handler * If there are still queue references left */ if (atomic_read(&ring->queue_refs) > 0) { if (time_after(jiffies, ring->teardown_time + FUSE_URING_TEARDOWN_TIMEOUT)) fuse_uring_log_ent_state(ring); schedule_delayed_work(&ring->async_teardown_work, FUSE_URING_TEARDOWN_INTERVAL); } else { wake_up_all(&ring->stop_waitq); } } /* * Stop the ring queues */ void fuse_uring_stop_queues(struct fuse_ring *ring) { int qid; for (qid = 0; qid < ring->nr_queues; qid++) { struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); if (!queue) continue; fuse_uring_teardown_entries(queue); } if (atomic_read(&ring->queue_refs) > 0) { ring->teardown_time = jiffies; INIT_DELAYED_WORK(&ring->async_teardown_work, fuse_uring_async_stop_queues); schedule_delayed_work(&ring->async_teardown_work, FUSE_URING_TEARDOWN_INTERVAL); } else { wake_up_all(&ring->stop_waitq); } } /* * Handle IO_URING_F_CANCEL, typically should come on daemon termination. * * Releasing the last entry should trigger fuse_dev_release() if * the daemon was terminated */ static void fuse_uring_cancel(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); struct fuse_ring_queue *queue; bool need_cmd_done = false; /* * direct access on ent - it must not be destructed as long as * IO_URING_F_CANCEL might come up */ queue = ent->queue; spin_lock(&queue->lock); if (ent->state == FRRS_AVAILABLE) { ent->state = FRRS_USERSPACE; list_move(&ent->list, &queue->ent_in_userspace); need_cmd_done = true; ent->cmd = NULL; } spin_unlock(&queue->lock); if (need_cmd_done) { /* no queue lock to avoid lock order issues */ io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags); } } static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags, struct fuse_ring_ent *ring_ent) { uring_cmd_set_ring_ent(cmd, ring_ent); io_uring_cmd_mark_cancelable(cmd, issue_flags); } /* * Checks for errors and stores it into the request */ static int fuse_uring_out_header_has_err(struct fuse_out_header *oh, struct fuse_req *req, struct fuse_conn *fc) { int err; err = -EINVAL; if (oh->unique == 0) { /* Not supported through io-uring yet */ pr_warn_once("notify through fuse-io-uring not supported\n"); goto err; } if (oh->error <= -ERESTARTSYS || oh->error > 0) goto err; if (oh->error) { err = oh->error; goto err; } err = -ENOENT; if ((oh->unique & ~FUSE_INT_REQ_BIT) != req->in.h.unique) { pr_warn_ratelimited("unique mismatch, expected: %llu got %llu\n", req->in.h.unique, oh->unique & ~FUSE_INT_REQ_BIT); goto err; } /* * Is it an interrupt reply ID? * XXX: Not supported through fuse-io-uring yet, it should not even * find the request - should not happen. */ WARN_ON_ONCE(oh->unique & FUSE_INT_REQ_BIT); err = 0; err: return err; } static int fuse_uring_copy_from_ring(struct fuse_ring *ring, struct fuse_req *req, struct fuse_ring_ent *ent) { struct fuse_copy_state cs; struct fuse_args *args = req->args; struct iov_iter iter; int err; struct fuse_uring_ent_in_out ring_in_out; err = copy_from_user(&ring_in_out, &ent->headers->ring_ent_in_out, sizeof(ring_in_out)); if (err) return -EFAULT; err = import_ubuf(ITER_SOURCE, ent->payload, ring->max_payload_sz, &iter); if (err) return err; fuse_copy_init(&cs, 0, &iter); cs.is_uring = 1; cs.req = req; return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); } /* * Copy data from the req to the ring buffer */ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, struct fuse_ring_ent *ent) { struct fuse_copy_state cs; struct fuse_args *args = req->args; struct fuse_in_arg *in_args = args->in_args; int num_args = args->in_numargs; int err; struct iov_iter iter; struct fuse_uring_ent_in_out ent_in_out = { .flags = 0, .commit_id = req->in.h.unique, }; err = import_ubuf(ITER_DEST, ent->payload, ring->max_payload_sz, &iter); if (err) { pr_info_ratelimited("fuse: Import of user buffer failed\n"); return err; } fuse_copy_init(&cs, 1, &iter); cs.is_uring = 1; cs.req = req; if (num_args > 0) { /* * Expectation is that the first argument is the per op header. * Some op code have that as zero size. */ if (args->in_args[0].size > 0) { err = copy_to_user(&ent->headers->op_in, in_args->value, in_args->size); if (err) { pr_info_ratelimited( "Copying the header failed.\n"); return -EFAULT; } } in_args++; num_args--; } /* copy the payload */ err = fuse_copy_args(&cs, num_args, args->in_pages, (struct fuse_arg *)in_args, 0); if (err) { pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); return err; } ent_in_out.payload_sz = cs.ring.copied_sz; err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out, sizeof(ent_in_out)); return err ? -EFAULT : 0; } static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, struct fuse_req *req) { struct fuse_ring_queue *queue = ent->queue; struct fuse_ring *ring = queue->ring; int err; err = -EIO; if (WARN_ON(ent->state != FRRS_FUSE_REQ)) { pr_err("qid=%d ring-req=%p invalid state %d on send\n", queue->qid, ent, ent->state); return err; } err = -EINVAL; if (WARN_ON(req->in.h.unique == 0)) return err; /* copy the request */ err = fuse_uring_args_to_ring(ring, req, ent); if (unlikely(err)) { pr_info_ratelimited("Copy to ring failed: %d\n", err); return err; } /* copy fuse_in_header */ err = copy_to_user(&ent->headers->in_out, &req->in.h, sizeof(req->in.h)); if (err) { err = -EFAULT; return err; } return 0; } static int fuse_uring_prepare_send(struct fuse_ring_ent *ent, struct fuse_req *req) { int err; err = fuse_uring_copy_to_ring(ent, req); if (!err) set_bit(FR_SENT, &req->flags); else fuse_uring_req_end(ent, req, err); return err; } /* * Write data to the ring buffer and send the request to userspace, * userspace will read it * This is comparable with classical read(/dev/fuse) */ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent, struct fuse_req *req, unsigned int issue_flags) { struct fuse_ring_queue *queue = ent->queue; int err; struct io_uring_cmd *cmd; err = fuse_uring_prepare_send(ent, req); if (err) return err; spin_lock(&queue->lock); cmd = ent->cmd; ent->cmd = NULL; ent->state = FRRS_USERSPACE; list_move(&ent->list, &queue->ent_in_userspace); spin_unlock(&queue->lock); io_uring_cmd_done(cmd, 0, 0, issue_flags); return 0; } /* * Make a ring entry available for fuse_req assignment */ static void fuse_uring_ent_avail(struct fuse_ring_ent *ent, struct fuse_ring_queue *queue) { WARN_ON_ONCE(!ent->cmd); list_move(&ent->list, &queue->ent_avail_queue); ent->state = FRRS_AVAILABLE; } /* Used to find the request on SQE commit */ static void fuse_uring_add_to_pq(struct fuse_ring_ent *ent, struct fuse_req *req) { struct fuse_ring_queue *queue = ent->queue; struct fuse_pqueue *fpq = &queue->fpq; unsigned int hash; req->ring_entry = ent; hash = fuse_req_hash(req->in.h.unique); list_move_tail(&req->list, &fpq->processing[hash]); } /* * Assign a fuse queue entry to the given entry */ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, struct fuse_req *req) { struct fuse_ring_queue *queue = ent->queue; struct fuse_conn *fc = req->fm->fc; struct fuse_iqueue *fiq = &fc->iq; lockdep_assert_held(&queue->lock); if (WARN_ON_ONCE(ent->state != FRRS_AVAILABLE && ent->state != FRRS_COMMIT)) { pr_warn("%s qid=%d state=%d\n", __func__, ent->queue->qid, ent->state); } spin_lock(&fiq->lock); clear_bit(FR_PENDING, &req->flags); spin_unlock(&fiq->lock); ent->fuse_req = req; ent->state = FRRS_FUSE_REQ; list_move(&ent->list, &queue->ent_w_req_queue); fuse_uring_add_to_pq(ent, req); } /* Fetch the next fuse request if available */ static struct fuse_req *fuse_uring_ent_assign_req(struct fuse_ring_ent *ent) __must_hold(&queue->lock) { struct fuse_req *req; struct fuse_ring_queue *queue = ent->queue; struct list_head *req_queue = &queue->fuse_req_queue; lockdep_assert_held(&queue->lock); /* get and assign the next entry while it is still holding the lock */ req = list_first_entry_or_null(req_queue, struct fuse_req, list); if (req) fuse_uring_add_req_to_ring_ent(ent, req); return req; } /* * Read data from the ring buffer, which user space has written to * This is comparible with handling of classical write(/dev/fuse). * Also make the ring request available again for new fuse requests. */ static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req, unsigned int issue_flags) { struct fuse_ring *ring = ent->queue->ring; struct fuse_conn *fc = ring->fc; ssize_t err = 0; err = copy_from_user(&req->out.h, &ent->headers->in_out, sizeof(req->out.h)); if (err) { req->out.h.error = -EFAULT; goto out; } err = fuse_uring_out_header_has_err(&req->out.h, req, fc); if (err) { /* req->out.h.error already set */ goto out; } err = fuse_uring_copy_from_ring(ring, req, ent); out: fuse_uring_req_end(ent, req, err); } /* * Get the next fuse req and send it */ static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ent, struct fuse_ring_queue *queue, unsigned int issue_flags) { int err; struct fuse_req *req; retry: spin_lock(&queue->lock); fuse_uring_ent_avail(ent, queue); req = fuse_uring_ent_assign_req(ent); spin_unlock(&queue->lock); if (req) { err = fuse_uring_send_next_to_ring(ent, req, issue_flags); if (err) goto retry; } } static int fuse_ring_ent_set_commit(struct fuse_ring_ent *ent) { struct fuse_ring_queue *queue = ent->queue; lockdep_assert_held(&queue->lock); if (WARN_ON_ONCE(ent->state != FRRS_USERSPACE)) return -EIO; ent->state = FRRS_COMMIT; list_move(&ent->list, &queue->ent_commit_queue); return 0; } /* FUSE_URING_CMD_COMMIT_AND_FETCH handler */ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, struct fuse_conn *fc) { const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); struct fuse_ring_ent *ent; int err; struct fuse_ring *ring = fc->ring; struct fuse_ring_queue *queue; uint64_t commit_id = READ_ONCE(cmd_req->commit_id); unsigned int qid = READ_ONCE(cmd_req->qid); struct fuse_pqueue *fpq; struct fuse_req *req; err = -ENOTCONN; if (!ring) return err; if (qid >= ring->nr_queues) return -EINVAL; queue = ring->queues[qid]; if (!queue) return err; fpq = &queue->fpq; if (!READ_ONCE(fc->connected) || READ_ONCE(queue->stopped)) return err; spin_lock(&queue->lock); /* Find a request based on the unique ID of the fuse request * This should get revised, as it needs a hash calculation and list * search. And full struct fuse_pqueue is needed (memory overhead). * As well as the link from req to ring_ent. */ req = fuse_request_find(fpq, commit_id); err = -ENOENT; if (!req) { pr_info("qid=%d commit_id %llu not found\n", queue->qid, commit_id); spin_unlock(&queue->lock); return err; } list_del_init(&req->list); ent = req->ring_entry; req->ring_entry = NULL; err = fuse_ring_ent_set_commit(ent); if (err != 0) { pr_info_ratelimited("qid=%d commit_id %llu state %d", queue->qid, commit_id, ent->state); spin_unlock(&queue->lock); req->out.h.error = err; clear_bit(FR_SENT, &req->flags); fuse_request_end(req); return err; } ent->cmd = cmd; spin_unlock(&queue->lock); /* without the queue lock, as other locks are taken */ fuse_uring_prepare_cancel(cmd, issue_flags, ent); fuse_uring_commit(ent, req, issue_flags); /* * Fetching the next request is absolutely required as queued * fuse requests would otherwise not get processed - committing * and fetching is done in one step vs legacy fuse, which has separated * read (fetch request) and write (commit result). */ fuse_uring_next_fuse_req(ent, queue, issue_flags); return 0; } static bool is_ring_ready(struct fuse_ring *ring, int current_qid) { int qid; struct fuse_ring_queue *queue; bool ready = true; for (qid = 0; qid < ring->nr_queues && ready; qid++) { if (current_qid == qid) continue; queue = ring->queues[qid]; if (!queue) { ready = false; break; } spin_lock(&queue->lock); if (list_empty(&queue->ent_avail_queue)) ready = false; spin_unlock(&queue->lock); } return ready; } /* * fuse_uring_req_fetch command handling */ static void fuse_uring_do_register(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, unsigned int issue_flags) { struct fuse_ring_queue *queue = ent->queue; struct fuse_ring *ring = queue->ring; struct fuse_conn *fc = ring->fc; struct fuse_iqueue *fiq = &fc->iq; fuse_uring_prepare_cancel(cmd, issue_flags, ent); spin_lock(&queue->lock); ent->cmd = cmd; fuse_uring_ent_avail(ent, queue); spin_unlock(&queue->lock); if (!ring->ready) { bool ready = is_ring_ready(ring, queue->qid); if (ready) { WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); WRITE_ONCE(ring->ready, true); wake_up_all(&fc->blocked_waitq); } } } /* * sqe->addr is a ptr to an iovec array, iov[0] has the headers, iov[1] * the payload */ static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe, struct iovec iov[FUSE_URING_IOV_SEGS]) { struct iovec __user *uiov = u64_to_user_ptr(READ_ONCE(sqe->addr)); struct iov_iter iter; ssize_t ret; if (sqe->len != FUSE_URING_IOV_SEGS) return -EINVAL; /* * Direction for buffer access will actually be READ and WRITE, * using write for the import should include READ access as well. */ ret = import_iovec(WRITE, uiov, FUSE_URING_IOV_SEGS, FUSE_URING_IOV_SEGS, &iov, &iter); if (ret < 0) return ret; return 0; } static struct fuse_ring_ent * fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, struct fuse_ring_queue *queue) { struct fuse_ring *ring = queue->ring; struct fuse_ring_ent *ent; size_t payload_size; struct iovec iov[FUSE_URING_IOV_SEGS]; int err; err = fuse_uring_get_iovec_from_sqe(cmd->sqe, iov); if (err) { pr_info_ratelimited("Failed to get iovec from sqe, err=%d\n", err); return ERR_PTR(err); } err = -EINVAL; if (iov[0].iov_len < sizeof(struct fuse_uring_req_header)) { pr_info_ratelimited("Invalid header len %zu\n", iov[0].iov_len); return ERR_PTR(err); } payload_size = iov[1].iov_len; if (payload_size < ring->max_payload_sz) { pr_info_ratelimited("Invalid req payload len %zu\n", payload_size); return ERR_PTR(err); } err = -ENOMEM; ent = kzalloc(sizeof(*ent), GFP_KERNEL_ACCOUNT); if (!ent) return ERR_PTR(err); INIT_LIST_HEAD(&ent->list); ent->queue = queue; ent->headers = iov[0].iov_base; ent->payload = iov[1].iov_base; atomic_inc(&ring->queue_refs); return ent; } /* * Register header and payload buffer with the kernel and puts the * entry as "ready to get fuse requests" on the queue */ static int fuse_uring_register(struct io_uring_cmd *cmd, unsigned int issue_flags, struct fuse_conn *fc) { const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); struct fuse_ring *ring = fc->ring; struct fuse_ring_queue *queue; struct fuse_ring_ent *ent; int err; unsigned int qid = READ_ONCE(cmd_req->qid); err = -ENOMEM; if (!ring) { ring = fuse_uring_create(fc); if (!ring) return err; } if (qid >= ring->nr_queues) { pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid); return -EINVAL; } queue = ring->queues[qid]; if (!queue) { queue = fuse_uring_create_queue(ring, qid); if (!queue) return err; } /* * The created queue above does not need to be destructed in * case of entry errors below, will be done at ring destruction time. */ ent = fuse_uring_create_ring_ent(cmd, queue); if (IS_ERR(ent)) return PTR_ERR(ent); fuse_uring_do_register(ent, cmd, issue_flags); return 0; } /* * Entry function from io_uring to handle the given passthrough command * (op code IORING_OP_URING_CMD) */ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct fuse_dev *fud; struct fuse_conn *fc; u32 cmd_op = cmd->cmd_op; int err; if ((unlikely(issue_flags & IO_URING_F_CANCEL))) { fuse_uring_cancel(cmd, issue_flags); return 0; } /* This extra SQE size holds struct fuse_uring_cmd_req */ if (!(issue_flags & IO_URING_F_SQE128)) return -EINVAL; fud = fuse_get_dev(cmd->file); if (!fud) { pr_info_ratelimited("No fuse device found\n"); return -ENOTCONN; } fc = fud->fc; /* Once a connection has io-uring enabled on it, it can't be disabled */ if (!enable_uring && !fc->io_uring) { pr_info_ratelimited("fuse-io-uring is disabled\n"); return -EOPNOTSUPP; } if (fc->aborted) return -ECONNABORTED; if (!fc->connected) return -ENOTCONN; /* * fuse_uring_register() needs the ring to be initialized, * we need to know the max payload size */ if (!fc->initialized) return -EAGAIN; switch (cmd_op) { case FUSE_IO_URING_CMD_REGISTER: err = fuse_uring_register(cmd, issue_flags, fc); if (err) { pr_info_once("FUSE_IO_URING_CMD_REGISTER failed err=%d\n", err); fc->io_uring = 0; wake_up_all(&fc->blocked_waitq); return err; } break; case FUSE_IO_URING_CMD_COMMIT_AND_FETCH: err = fuse_uring_commit_fetch(cmd, issue_flags, fc); if (err) { pr_info_once("FUSE_IO_URING_COMMIT_AND_FETCH failed err=%d\n", err); return err; } break; default: return -EINVAL; } return -EIOCBQUEUED; } static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, ssize_t ret, unsigned int issue_flags) { struct fuse_ring_queue *queue = ent->queue; spin_lock(&queue->lock); ent->state = FRRS_USERSPACE; list_move(&ent->list, &queue->ent_in_userspace); ent->cmd = NULL; spin_unlock(&queue->lock); io_uring_cmd_done(cmd, ret, 0, issue_flags); } /* * This prepares and sends the ring request in fuse-uring task context. * User buffers are not mapped yet - the application does not have permission * to write to it - this has to be executed in ring task context. */ static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); struct fuse_ring_queue *queue = ent->queue; int err; if (!(issue_flags & IO_URING_F_TASK_DEAD)) { err = fuse_uring_prepare_send(ent, ent->fuse_req); if (err) { fuse_uring_next_fuse_req(ent, queue, issue_flags); return; } } else { err = -ECANCELED; } fuse_uring_send(ent, cmd, err, issue_flags); } static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) { unsigned int qid; struct fuse_ring_queue *queue; qid = task_cpu(current); if (WARN_ONCE(qid >= ring->nr_queues, "Core number (%u) exceeds nr queues (%zu)\n", qid, ring->nr_queues)) qid = 0; queue = ring->queues[qid]; WARN_ONCE(!queue, "Missing queue for qid %d\n", qid); return queue; } static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent) { struct io_uring_cmd *cmd = ent->cmd; uring_cmd_set_ring_ent(cmd, ent); io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); } /* queue a fuse request and send it if a ring entry is available */ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) { struct fuse_conn *fc = req->fm->fc; struct fuse_ring *ring = fc->ring; struct fuse_ring_queue *queue; struct fuse_ring_ent *ent = NULL; int err; err = -EINVAL; queue = fuse_uring_task_to_queue(ring); if (!queue) goto err; if (req->in.h.opcode != FUSE_NOTIFY_REPLY) req->in.h.unique = fuse_get_unique(fiq); spin_lock(&queue->lock); err = -ENOTCONN; if (unlikely(queue->stopped)) goto err_unlock; ent = list_first_entry_or_null(&queue->ent_avail_queue, struct fuse_ring_ent, list); if (ent) fuse_uring_add_req_to_ring_ent(ent, req); else list_add_tail(&req->list, &queue->fuse_req_queue); spin_unlock(&queue->lock); if (ent) fuse_uring_dispatch_ent(ent); return; err_unlock: spin_unlock(&queue->lock); err: req->out.h.error = err; clear_bit(FR_PENDING, &req->flags); fuse_request_end(req); } bool fuse_uring_queue_bq_req(struct fuse_req *req) { struct fuse_conn *fc = req->fm->fc; struct fuse_ring *ring = fc->ring; struct fuse_ring_queue *queue; struct fuse_ring_ent *ent = NULL; queue = fuse_uring_task_to_queue(ring); if (!queue) return false; spin_lock(&queue->lock); if (unlikely(queue->stopped)) { spin_unlock(&queue->lock); return false; } list_add_tail(&req->list, &queue->fuse_req_bg_queue); ent = list_first_entry_or_null(&queue->ent_avail_queue, struct fuse_ring_ent, list); spin_lock(&fc->bg_lock); fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; fuse_uring_flush_bg(queue); spin_unlock(&fc->bg_lock); /* * Due to bg_queue flush limits there might be other bg requests * in the queue that need to be handled first. Or no further req * might be available. */ req = list_first_entry_or_null(&queue->fuse_req_queue, struct fuse_req, list); if (ent && req) { fuse_uring_add_req_to_ring_ent(ent, req); spin_unlock(&queue->lock); fuse_uring_dispatch_ent(ent); } else { spin_unlock(&queue->lock); } return true; } static const struct fuse_iqueue_ops fuse_io_uring_ops = { /* should be send over io-uring as enhancement */ .send_forget = fuse_dev_queue_forget, /* * could be send over io-uring, but interrupts should be rare, * no need to make the code complex */ .send_interrupt = fuse_dev_queue_interrupt, .send_req = fuse_uring_queue_fuse_req, };