// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * Copyright (c) 2010 David Chinner. * Copyright (c) 2011 Christoph Hellwig. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_shared.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_alloc.h" #include "xfs_extent_busy.h" #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_rtgroup.h" struct xfs_extent_busy_tree { spinlock_t eb_lock; struct rb_root eb_tree; unsigned int eb_gen; wait_queue_head_t eb_wait; }; static void xfs_extent_busy_insert_list( struct xfs_group *xg, xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags, struct list_head *busy_list) { struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; struct xfs_extent_busy *new; struct xfs_extent_busy *busyp; struct rb_node **rbp; struct rb_node *parent = NULL; new = kzalloc(sizeof(struct xfs_extent_busy), GFP_KERNEL | __GFP_NOFAIL); new->group = xfs_group_hold(xg); new->bno = bno; new->length = len; INIT_LIST_HEAD(&new->list); new->flags = flags; /* trace before insert to be able to see failed inserts */ trace_xfs_extent_busy(xg, bno, len); spin_lock(&eb->eb_lock); rbp = &eb->eb_tree.rb_node; while (*rbp) { parent = *rbp; busyp = rb_entry(parent, struct xfs_extent_busy, rb_node); if (new->bno < busyp->bno) { rbp = &(*rbp)->rb_left; ASSERT(new->bno + new->length <= busyp->bno); } else if (new->bno > busyp->bno) { rbp = &(*rbp)->rb_right; ASSERT(bno >= busyp->bno + busyp->length); } else { ASSERT(0); } } rb_link_node(&new->rb_node, parent, rbp); rb_insert_color(&new->rb_node, &eb->eb_tree); /* always process discard lists in fifo order */ list_add_tail(&new->list, busy_list); spin_unlock(&eb->eb_lock); } void xfs_extent_busy_insert( struct xfs_trans *tp, struct xfs_group *xg, xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags) { xfs_extent_busy_insert_list(xg, bno, len, flags, &tp->t_busy); } void xfs_extent_busy_insert_discard( struct xfs_group *xg, xfs_agblock_t bno, xfs_extlen_t len, struct list_head *busy_list) { xfs_extent_busy_insert_list(xg, bno, len, XFS_EXTENT_BUSY_DISCARDED, busy_list); } /* * Search for a busy extent within the range of the extent we are about to * allocate. You need to be holding the busy extent tree lock when calling * xfs_extent_busy_search(). This function returns 0 for no overlapping busy * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact * match. This is done so that a non-zero return indicates an overlap that * will require a synchronous transaction, but it can still be * used to distinguish between a partial or exact match. */ int xfs_extent_busy_search( struct xfs_group *xg, xfs_agblock_t bno, xfs_extlen_t len) { struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; struct rb_node *rbp; struct xfs_extent_busy *busyp; int match = 0; /* find closest start bno overlap */ spin_lock(&eb->eb_lock); rbp = eb->eb_tree.rb_node; while (rbp) { busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); if (bno < busyp->bno) { /* may overlap, but exact start block is lower */ if (bno + len > busyp->bno) match = -1; rbp = rbp->rb_left; } else if (bno > busyp->bno) { /* may overlap, but exact start block is higher */ if (bno < busyp->bno + busyp->length) match = -1; rbp = rbp->rb_right; } else { /* bno matches busyp, length determines exact match */ match = (busyp->length == len) ? 1 : -1; break; } } spin_unlock(&eb->eb_lock); return match; } /* * The found free extent [fbno, fend] overlaps part or all of the given busy * extent. If the overlap covers the beginning, the end, or all of the busy * extent, the overlapping portion can be made unbusy and used for the * allocation. We can't split a busy extent because we can't modify a * transaction/CIL context busy list, but we can update an entry's block * number or length. * * Returns true if the extent can safely be reused, or false if the search * needs to be restarted. */ STATIC bool xfs_extent_busy_update_extent( struct xfs_group *xg, struct xfs_extent_busy *busyp, xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata) __releases(&eb->eb_lock) __acquires(&eb->eb_lock) { struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; xfs_agblock_t fend = fbno + flen; xfs_agblock_t bbno = busyp->bno; xfs_agblock_t bend = bbno + busyp->length; /* * This extent is currently being discarded. Give the thread * performing the discard a chance to mark the extent unbusy * and retry. */ if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) { spin_unlock(&eb->eb_lock); delay(1); spin_lock(&eb->eb_lock); return false; } /* * If there is a busy extent overlapping a user allocation, we have * no choice but to force the log and retry the search. * * Fortunately this does not happen during normal operation, but * only if the filesystem is very low on space and has to dip into * the AGFL for normal allocations. */ if (userdata) goto out_force_log; if (bbno < fbno && bend > fend) { /* * Case 1: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +---------+ * fbno fend */ /* * We would have to split the busy extent to be able to track * it correct, which we cannot do because we would have to * modify the list of busy extents attached to the transaction * or CIL context, which is immutable. * * Force out the log to clear the busy extent and retry the * search. */ goto out_force_log; } else if (bbno >= fbno && bend <= fend) { /* * Case 2: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +-----------------+ * fbno fend * * Case 3: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +--------------------------+ * fbno fend * * Case 4: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +--------------------------+ * fbno fend * * Case 5: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +-----------------------------------+ * fbno fend * */ /* * The busy extent is fully covered by the extent we are * allocating, and can simply be removed from the rbtree. * However we cannot remove it from the immutable list * tracking busy extents in the transaction or CIL context, * so set the length to zero to mark it invalid. * * We also need to restart the busy extent search from the * tree root, because erasing the node can rearrange the * tree topology. */ rb_erase(&busyp->rb_node, &eb->eb_tree); busyp->length = 0; return false; } else if (fend < bend) { /* * Case 6: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +---------+ * fbno fend * * Case 7: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +------------------+ * fbno fend * */ busyp->bno = fend; busyp->length = bend - fend; } else if (bbno < fbno) { /* * Case 8: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +-------------+ * fbno fend * * Case 9: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +----------------------+ * fbno fend */ busyp->length = fbno - busyp->bno; } else { ASSERT(0); } trace_xfs_extent_busy_reuse(xg, fbno, flen); return true; out_force_log: spin_unlock(&eb->eb_lock); xfs_log_force(xg->xg_mount, XFS_LOG_SYNC); trace_xfs_extent_busy_force(xg, fbno, flen); spin_lock(&eb->eb_lock); return false; } /* * For a given extent [fbno, flen], make sure we can reuse it safely. */ void xfs_extent_busy_reuse( struct xfs_group *xg, xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata) { struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; struct rb_node *rbp; ASSERT(flen > 0); spin_lock(&eb->eb_lock); restart: rbp = eb->eb_tree.rb_node; while (rbp) { struct xfs_extent_busy *busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); xfs_agblock_t bbno = busyp->bno; xfs_agblock_t bend = bbno + busyp->length; if (fbno + flen <= bbno) { rbp = rbp->rb_left; continue; } else if (fbno >= bend) { rbp = rbp->rb_right; continue; } if (!xfs_extent_busy_update_extent(xg, busyp, fbno, flen, userdata)) goto restart; } spin_unlock(&eb->eb_lock); } /* * For a given extent [fbno, flen], search the busy extent list to find a * subset of the extent that is not busy. If *rlen is smaller than * args->minlen no suitable extent could be found, and the higher level * code needs to force out the log and retry the allocation. * * Return the current busy generation for the group if the extent is busy. This * value can be used to wait for at least one of the currently busy extents * to be cleared. Note that the busy list is not guaranteed to be empty after * the gen is woken. The state of a specific extent must always be confirmed * with another call to xfs_extent_busy_trim() before it can be used. */ bool xfs_extent_busy_trim( struct xfs_group *xg, xfs_extlen_t minlen, xfs_extlen_t maxlen, xfs_agblock_t *bno, xfs_extlen_t *len, unsigned *busy_gen) { struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; xfs_agblock_t fbno; xfs_extlen_t flen; struct rb_node *rbp; bool ret = false; ASSERT(*len > 0); spin_lock(&eb->eb_lock); fbno = *bno; flen = *len; rbp = eb->eb_tree.rb_node; while (rbp && flen >= minlen) { struct xfs_extent_busy *busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); xfs_agblock_t fend = fbno + flen; xfs_agblock_t bbno = busyp->bno; xfs_agblock_t bend = bbno + busyp->length; if (fend <= bbno) { rbp = rbp->rb_left; continue; } else if (fbno >= bend) { rbp = rbp->rb_right; continue; } if (bbno <= fbno) { /* start overlap */ /* * Case 1: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +---------+ * fbno fend * * Case 2: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +-------------+ * fbno fend * * Case 3: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +-------------+ * fbno fend * * Case 4: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +-----------------+ * fbno fend * * No unbusy region in extent, return failure. */ if (fend <= bend) goto fail; /* * Case 5: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +----------------------+ * fbno fend * * Case 6: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +--------------------------+ * fbno fend * * Needs to be trimmed to: * +-------+ * fbno fend */ fbno = bend; } else if (bend >= fend) { /* end overlap */ /* * Case 7: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +------------------+ * fbno fend * * Case 8: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +--------------------------+ * fbno fend * * Needs to be trimmed to: * +-------+ * fbno fend */ fend = bbno; } else { /* middle overlap */ /* * Case 9: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +-----------------------------------+ * fbno fend * * Can be trimmed to: * +-------+ OR +-------+ * fbno fend fbno fend * * Backward allocation leads to significant * fragmentation of directories, which degrades * directory performance, therefore we always want to * choose the option that produces forward allocation * patterns. * Preferring the lower bno extent will make the next * request use "fend" as the start of the next * allocation; if the segment is no longer busy at * that point, we'll get a contiguous allocation, but * even if it is still busy, we will get a forward * allocation. * We try to avoid choosing the segment at "bend", * because that can lead to the next allocation * taking the segment at "fbno", which would be a * backward allocation. We only use the segment at * "fbno" if it is much larger than the current * requested size, because in that case there's a * good chance subsequent allocations will be * contiguous. */ if (bbno - fbno >= maxlen) { /* left candidate fits perfect */ fend = bbno; } else if (fend - bend >= maxlen * 4) { /* right candidate has enough free space */ fbno = bend; } else if (bbno - fbno >= minlen) { /* left candidate fits minimum requirement */ fend = bbno; } else { goto fail; } } flen = fend - fbno; } out: if (fbno != *bno || flen != *len) { trace_xfs_extent_busy_trim(xg, *bno, *len, fbno, flen); *bno = fbno; *len = flen; *busy_gen = eb->eb_gen; ret = true; } spin_unlock(&eb->eb_lock); return ret; fail: /* * Return a zero extent length as failure indications. All callers * re-check if the trimmed extent satisfies the minlen requirement. */ flen = 0; goto out; } static bool xfs_extent_busy_clear_one( struct xfs_extent_busy *busyp, bool do_discard) { struct xfs_extent_busy_tree *eb = busyp->group->xg_busy_extents; if (busyp->length) { if (do_discard && !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) { busyp->flags = XFS_EXTENT_BUSY_DISCARDED; return false; } trace_xfs_extent_busy_clear(busyp->group, busyp->bno, busyp->length); rb_erase(&busyp->rb_node, &eb->eb_tree); } list_del_init(&busyp->list); xfs_group_put(busyp->group); kfree(busyp); return true; } /* * Remove all extents on the passed in list from the busy extents tree. * If do_discard is set skip extents that need to be discarded, and mark * these as undergoing a discard operation instead. */ void xfs_extent_busy_clear( struct list_head *list, bool do_discard) { struct xfs_extent_busy *busyp, *next; busyp = list_first_entry_or_null(list, typeof(*busyp), list); if (!busyp) return; do { struct xfs_group *xg = xfs_group_hold(busyp->group); struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; bool wakeup = false; spin_lock(&eb->eb_lock); do { next = list_next_entry(busyp, list); if (xfs_extent_busy_clear_one(busyp, do_discard)) wakeup = true; busyp = next; } while (!list_entry_is_head(busyp, list, list) && busyp->group == xg); if (wakeup) { eb->eb_gen++; wake_up_all(&eb->eb_wait); } spin_unlock(&eb->eb_lock); xfs_group_put(xg); } while (!list_entry_is_head(busyp, list, list)); } /* * Flush out all busy extents for this group. * * If the current transaction is holding busy extents, the caller may not want * to wait for committed busy extents to resolve. If we are being told just to * try a flush or progress has been made since we last skipped a busy extent, * return immediately to allow the caller to try again. * * If we are freeing extents, we might actually be holding the only free extents * in the transaction busy list and the log force won't resolve that situation. * In this case, we must return -EAGAIN to avoid a deadlock by informing the * caller it needs to commit the busy extents it holds before retrying the * extent free operation. */ int xfs_extent_busy_flush( struct xfs_trans *tp, struct xfs_group *xg, unsigned busy_gen, uint32_t alloc_flags) { struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; DEFINE_WAIT (wait); int error; error = xfs_log_force(tp->t_mountp, XFS_LOG_SYNC); if (error) return error; /* Avoid deadlocks on uncommitted busy extents. */ if (!list_empty(&tp->t_busy)) { if (alloc_flags & XFS_ALLOC_FLAG_TRYFLUSH) return 0; if (busy_gen != READ_ONCE(eb->eb_gen)) return 0; if (alloc_flags & XFS_ALLOC_FLAG_FREEING) return -EAGAIN; } /* Wait for committed busy extents to resolve. */ do { prepare_to_wait(&eb->eb_wait, &wait, TASK_KILLABLE); if (busy_gen != READ_ONCE(eb->eb_gen)) break; schedule(); } while (1); finish_wait(&eb->eb_wait, &wait); return 0; } static void xfs_extent_busy_wait_group( struct xfs_group *xg) { DEFINE_WAIT (wait); struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; do { prepare_to_wait(&eb->eb_wait, &wait, TASK_KILLABLE); if (RB_EMPTY_ROOT(&eb->eb_tree)) break; schedule(); } while (1); finish_wait(&eb->eb_wait, &wait); } void xfs_extent_busy_wait_all( struct xfs_mount *mp) { struct xfs_perag *pag = NULL; struct xfs_rtgroup *rtg = NULL; while ((pag = xfs_perag_next(mp, pag))) xfs_extent_busy_wait_group(pag_group(pag)); if (xfs_has_rtgroups(mp)) while ((rtg = xfs_rtgroup_next(mp, rtg))) xfs_extent_busy_wait_group(rtg_group(rtg)); } /* * Callback for list_sort to sort busy extents by the group they reside in. */ int xfs_extent_busy_ag_cmp( void *priv, const struct list_head *l1, const struct list_head *l2) { struct xfs_extent_busy *b1 = container_of(l1, struct xfs_extent_busy, list); struct xfs_extent_busy *b2 = container_of(l2, struct xfs_extent_busy, list); s32 diff; diff = b1->group->xg_gno - b2->group->xg_gno; if (!diff) diff = b1->bno - b2->bno; return diff; } /* Are there any busy extents in this group? */ bool xfs_extent_busy_list_empty( struct xfs_group *xg, unsigned *busy_gen) { struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; bool res; spin_lock(&eb->eb_lock); res = RB_EMPTY_ROOT(&eb->eb_tree); *busy_gen = READ_ONCE(eb->eb_gen); spin_unlock(&eb->eb_lock); return res; } struct xfs_extent_busy_tree * xfs_extent_busy_alloc(void) { struct xfs_extent_busy_tree *eb; eb = kzalloc(sizeof(*eb), GFP_KERNEL); if (!eb) return NULL; spin_lock_init(&eb->eb_lock); init_waitqueue_head(&eb->eb_wait); eb->eb_tree = RB_ROOT; return eb; }