// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_ag.h"
#include "xfs_error.h"
#include "xfs_bit.h"
#include "xfs_icache.h"
#include "scrub/scrub.h"
#include "scrub/iscan.h"
#include "scrub/common.h"
#include "scrub/trace.h"

/*
 * Live File Scan
 * ==============
 *
 * Live file scans walk every inode in a live filesystem.  This is more or
 * less like a regular iwalk, except that when we're advancing the scan cursor,
 * we must ensure that inodes cannot be added or deleted anywhere between the
 * old cursor value and the new cursor value.  If we're advancing the cursor
 * by one inode, the caller must hold that inode; if we're finding the next
 * inode to scan, we must grab the AGI and hold it until we've updated the
 * scan cursor.
 *
 * Callers are expected to use this code to scan all files in the filesystem to
 * construct a new metadata index of some kind.  The scan races against other
 * live updates, which means there must be a provision to update the new index
 * when updates are made to inodes that have already been scanned.  The iscan
 * lock can be used in live update hook code to stop the scan and protect this
 * data structure.
 *
 * To keep the new index up to date with other metadata updates being made to
 * the live filesystem, it is assumed that the caller will add hooks as needed
 * to be notified when a metadata update occurs.  The inode scanner must tell
 * the hook code when an inode has been visited with xchk_iscan_mark_visited.
 * Hook functions can use xchk_iscan_want_live_update to decide if the
 * scanner's observations must be updated.
*/ /* * If the inobt record @rec covers @iscan->skip_ino, mark the inode free so * that the scan ignores that inode. */ STATIC void xchk_iscan_mask_skipino( struct xchk_iscan *iscan, struct xfs_perag *pag, struct xfs_inobt_rec_incore *rec, xfs_agino_t lastrecino) { struct xfs_scrub *sc = iscan->sc; struct xfs_mount *mp = sc->mp; xfs_agnumber_t skip_agno = XFS_INO_TO_AGNO(mp, iscan->skip_ino); xfs_agnumber_t skip_agino = XFS_INO_TO_AGINO(mp, iscan->skip_ino); if (pag_agno(pag) != skip_agno) return; if (skip_agino < rec->ir_startino) return; if (skip_agino > lastrecino) return; rec->ir_free |= xfs_inobt_maskn(skip_agino - rec->ir_startino, 1); } /* * Set *cursor to the next allocated inode after whatever it's set to now. * If there are no more inodes in this AG, cursor is set to NULLAGINO. */ STATIC int xchk_iscan_find_next( struct xchk_iscan *iscan, struct xfs_buf *agi_bp, struct xfs_perag *pag, xfs_inofree_t *allocmaskp, xfs_agino_t *cursor, uint8_t *nr_inodesp) { struct xfs_scrub *sc = iscan->sc; struct xfs_inobt_rec_incore rec; struct xfs_btree_cur *cur; struct xfs_mount *mp = sc->mp; struct xfs_trans *tp = sc->tp; xfs_agnumber_t agno = pag_agno(pag); xfs_agino_t lastino = NULLAGINO; xfs_agino_t first, last; xfs_agino_t agino = *cursor; int has_rec; int error; /* If the cursor is beyond the end of this AG, move to the next one. */ xfs_agino_range(mp, agno, &first, &last); if (agino > last) { *cursor = NULLAGINO; return 0; } /* * Look up the inode chunk for the current cursor position. If there * is no chunk here, we want the next one. */ cur = xfs_inobt_init_cursor(pag, tp, agi_bp); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_rec); if (!error && !has_rec) error = xfs_btree_increment(cur, 0, &has_rec); for (; !error; error = xfs_btree_increment(cur, 0, &has_rec)) { xfs_inofree_t allocmask; /* * If we've run out of inobt records in this AG, move the * cursor on to the next AG and exit. The caller can try * again with the next AG. 
*/ if (!has_rec) { *cursor = NULLAGINO; break; } error = xfs_inobt_get_rec(cur, &rec, &has_rec); if (error) break; if (!has_rec) { error = -EFSCORRUPTED; break; } /* Make sure that we always move forward. */ if (lastino != NULLAGINO && XFS_IS_CORRUPT(mp, lastino >= rec.ir_startino)) { error = -EFSCORRUPTED; break; } lastino = rec.ir_startino + XFS_INODES_PER_CHUNK - 1; /* * If this record only covers inodes that come before the * cursor, advance to the next record. */ if (rec.ir_startino + XFS_INODES_PER_CHUNK <= agino) continue; if (iscan->skip_ino) xchk_iscan_mask_skipino(iscan, pag, &rec, lastino); /* * If the incoming lookup put us in the middle of an inobt * record, mark it and the previous inodes "free" so that the * search for allocated inodes will start at the cursor. * We don't care about ir_freecount here. */ if (agino >= rec.ir_startino) rec.ir_free |= xfs_inobt_maskn(0, agino + 1 - rec.ir_startino); /* * If there are allocated inodes in this chunk, find them * and update the scan cursor. */ allocmask = ~rec.ir_free; if (hweight64(allocmask) > 0) { int next = xfs_lowbit64(allocmask); ASSERT(next >= 0); *cursor = rec.ir_startino + next; *allocmaskp = allocmask >> next; *nr_inodesp = XFS_INODES_PER_CHUNK - next; break; } } xfs_btree_del_cursor(cur, error); return error; } /* * Advance both the scan and the visited cursors. * * The inumber address space for a given filesystem is sparse, which means that * the scan cursor can jump a long ways in a single iter() call. There are no * inodes in these sparse areas, so we must move the visited cursor forward at * the same time so that the scan user can receive live updates for inodes that * may get created once we release the AGI buffer. 
*/ static inline void xchk_iscan_move_cursor( struct xchk_iscan *iscan, xfs_agnumber_t agno, xfs_agino_t agino) { struct xfs_scrub *sc = iscan->sc; struct xfs_mount *mp = sc->mp; xfs_ino_t cursor, visited; BUILD_BUG_ON(XFS_MAXINUMBER == NULLFSINO); /* * Special-case ino == 0 here so that we never set visited_ino to * NULLFSINO when wrapping around EOFS, for that will let through all * live updates. */ cursor = XFS_AGINO_TO_INO(mp, agno, agino); if (cursor == 0) visited = XFS_MAXINUMBER; else visited = cursor - 1; mutex_lock(&iscan->lock); iscan->cursor_ino = cursor; iscan->__visited_ino = visited; trace_xchk_iscan_move_cursor(iscan); mutex_unlock(&iscan->lock); } /* * Prepare to return agno/agino to the iscan caller by moving the lastino * cursor to the previous inode. Do this while we still hold the AGI so that * no other threads can create or delete inodes in this AG. */ static inline void xchk_iscan_finish( struct xchk_iscan *iscan) { mutex_lock(&iscan->lock); iscan->cursor_ino = NULLFSINO; /* All live updates will be applied from now on */ iscan->__visited_ino = NULLFSINO; mutex_unlock(&iscan->lock); } /* Mark an inode scan finished before we actually scan anything. */ void xchk_iscan_finish_early( struct xchk_iscan *iscan) { ASSERT(iscan->cursor_ino == iscan->scan_start_ino); ASSERT(iscan->__visited_ino == iscan->scan_start_ino); xchk_iscan_finish(iscan); } /* * Grab the AGI to advance the inode scan. Returns 0 if *agi_bpp is now set, * -ECANCELED if the live scan aborted, -EBUSY if the AGI could not be grabbed, * or the usual negative errno. 
*/
STATIC int
xchk_iscan_read_agi(
	struct xchk_iscan	*iscan,
	struct xfs_perag	*pag,
	struct xfs_buf		**agi_bpp)
{
	struct xfs_scrub	*sc = iscan->sc;
	unsigned long		relax;
	int			ret;

	/* If this scan does not need to trylock the AGI, read it normally. */
	if (!xchk_iscan_agi_needs_trylock(iscan))
		return xfs_ialloc_read_agi(pag, sc->tp, 0, agi_bpp);

	/*
	 * Otherwise, trylock the AGI in a loop, sleeping for iget_retry_delay
	 * milliseconds between attempts.
	 */
	relax = msecs_to_jiffies(iscan->iget_retry_delay);
	do {
		ret = xfs_ialloc_read_agi(pag, sc->tp, XFS_IALLOC_FLAG_TRYLOCK,
				agi_bpp);
		/* Anything other than -EAGAIN, including success, is final. */
		if (ret != -EAGAIN)
			return ret;

		/* Give up if no timeout was configured or the deadline passed. */
		if (!iscan->iget_timeout ||
		    time_is_before_jiffies(iscan->__iget_deadline))
			return -EBUSY;

		trace_xchk_iscan_agi_retry_wait(iscan);
	} while (!schedule_timeout_killable(relax) &&
		 !xchk_iscan_aborted(iscan));

	/* The sleep returned nonzero or the scan was aborted. */
	return -ECANCELED;
}

/*
 * Advance ino to the next inode that the inobt thinks is allocated, being
 * careful to jump to the next AG if we've reached the right end of this AG's
 * inode btree.  Advancing ino effectively means that we've pushed the inode
 * scan forward, so set the iscan cursor to (ino - 1) so that our live update
 * predicates will track inode allocations in that part of the inode number
 * key space once we release the AGI buffer.
 *
 * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes,
 * -ECANCELED if the live scan aborted, or the usual negative errno.
*/
STATIC int
xchk_iscan_advance(
	struct xchk_iscan	*iscan,
	struct xfs_perag	**pagp,
	struct xfs_buf		**agi_bpp,
	xfs_inofree_t		*allocmaskp,
	uint8_t			*nr_inodesp)
{
	struct xfs_scrub	*sc = iscan->sc;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_buf		*agi_bp;
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;
	xfs_agino_t		agino;
	int			ret;

	ASSERT(iscan->cursor_ino >= iscan->__visited_ino);

	do {
		if (xchk_iscan_aborted(iscan))
			return -ECANCELED;

		/* Get the perag for the AG that the scan cursor points into. */
		agno = XFS_INO_TO_AGNO(mp, iscan->cursor_ino);
		pag = xfs_perag_get(mp, agno);
		if (!pag)
			return -ECANCELED;

		ret = xchk_iscan_read_agi(iscan, pag, &agi_bp);
		if (ret)
			goto out_pag;

		/*
		 * Ask the inobt for the next allocated inode after the cursor.
		 * agino comes back as NULLAGINO if this AG has no more inodes.
		 */
		agino = XFS_INO_TO_AGINO(mp, iscan->cursor_ino);
		ret = xchk_iscan_find_next(iscan, agi_bp, pag, allocmaskp,
				&agino, nr_inodesp);
		if (ret)
			goto out_buf;

		if (agino != NULLAGINO) {
			/*
			 * Found the next inode in this AG, so return it along
			 * with the AGI buffer and the perag structure to
			 * ensure it cannot go away.
			 */
			xchk_iscan_move_cursor(iscan, agno, agino);
			*agi_bpp = agi_bp;
			*pagp = pag;
			return 1;
		}

		/*
		 * Did not find any more inodes in this AG, move on to the next
		 * AG.  The AG number wraps around to zero after the last AG.
		 * The cursors are moved before the AGI buffer is released.
		 */
		agno = (agno + 1) % mp->m_sb.sb_agcount;
		xchk_iscan_move_cursor(iscan, agno, 0);
		xfs_trans_brelse(sc->tp, agi_bp);
		xfs_perag_put(pag);

		trace_xchk_iscan_advance_ag(iscan);
	} while (iscan->cursor_ino != iscan->scan_start_ino);

	/* The cursor came back around to where the scan started; we're done. */
	xchk_iscan_finish(iscan);
	return 0;

out_buf:
	xfs_trans_brelse(sc->tp, agi_bp);
out_pag:
	xfs_perag_put(pag);
	return ret;
}

/*
 * Grabbing the inode failed, so we need to back up the scan and ask the caller
 * to try to _advance the scan again.  Returns -EBUSY if we've run out of retry
 * opportunities, -ECANCELED if the process has a fatal signal pending, or
 * -EAGAIN if we should try again.
*/
STATIC int
xchk_iscan_iget_retry(
	struct xchk_iscan	*iscan,
	bool			wait)
{
	ASSERT(iscan->cursor_ino == iscan->__visited_ino + 1);

	/* No retries if no timeout was configured or the deadline passed. */
	if (!iscan->iget_timeout ||
	    time_is_before_jiffies(iscan->__iget_deadline))
		return -EBUSY;

	if (wait) {
		unsigned long	relax;

		/*
		 * Sleep for a period of time to let the rest of the system
		 * catch up.  If we return early, someone sent a kill signal to
		 * the calling process.
		 */
		relax = msecs_to_jiffies(iscan->iget_retry_delay);
		trace_xchk_iscan_iget_retry_wait(iscan);

		if (schedule_timeout_killable(relax) ||
		    xchk_iscan_aborted(iscan))
			return -ECANCELED;
	}

	/* Back the scan cursor up by one inode before asking for a retry. */
	iscan->cursor_ino--;
	return -EAGAIN;
}

/*
 * For an inode scan, we hold the AGI and want to try to grab a batch of
 * inodes.  Holding the AGI prevents inodegc from clearing freed inodes,
 * so we must use noretry here.  For every inode after the first one in the
 * batch, we don't want to wait, so we use noretry there too.  Finally, use
 * dontcache to avoid polluting the cache.
 */
#define ISCAN_IGET_FLAGS	(XFS_IGET_NORETRY | XFS_IGET_DONTCACHE)

/*
 * Grab an inode as part of an inode scan.  While scanning this inode, the
 * caller must ensure that no other threads can modify the inode until a call
 * to xchk_iscan_visit succeeds.
 *
 * The AGI buffer @agi_bp and the perag reference @pag are released on every
 * return path.  @allocmask has bit 0 set for the inode at the scan cursor and
 * @nr_inodes is the number of bits of @allocmask to examine.
 *
 * Returns the number of incore inodes grabbed; -EAGAIN if the caller should
 * call xchk_iscan_advance again; -EBUSY if we couldn't grab an inode;
 * -ECANCELED if there's a fatal signal pending; or some other negative errno.
 */
STATIC int
xchk_iscan_iget(
	struct xchk_iscan	*iscan,
	struct xfs_perag	*pag,
	struct xfs_buf		*agi_bp,
	xfs_inofree_t		allocmask,
	uint8_t			nr_inodes)
{
	struct xfs_scrub	*sc = iscan->sc;
	struct xfs_mount	*mp = sc->mp;
	xfs_ino_t		ino = iscan->cursor_ino;
	unsigned int		idx = 0;
	unsigned int		i;
	int			error;

	ASSERT(iscan->__inodes[0] == NULL);

	/* Fill the first slot in the inode array. */
	error = xfs_iget(sc->mp, sc->tp, ino, ISCAN_IGET_FLAGS, 0,
			&iscan->__inodes[idx]);

	trace_xchk_iscan_iget(iscan, error);

	if (error == -ENOENT || error == -EAGAIN) {
		xfs_trans_brelse(sc->tp, agi_bp);
		xfs_perag_put(pag);

		/*
		 * It's possible that this inode has lost all of its links but
		 * hasn't yet been inactivated.  If we don't have a transaction
		 * or it's not writable, flush the inodegc workers and wait.
		 * If we have a non-empty transaction, we must not block on
		 * inodegc, which allocates its own transactions.
		 */
		if (sc->tp && !(sc->tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
			xfs_inodegc_push(mp);
		else
			xfs_inodegc_flush(mp);
		return xchk_iscan_iget_retry(iscan, true);
	}

	if (error == -EINVAL) {
		xfs_trans_brelse(sc->tp, agi_bp);
		xfs_perag_put(pag);

		/*
		 * We thought the inode was allocated, but the inode btree
		 * lookup failed, which means that it was freed since the last
		 * time we advanced the cursor.  Back up and try again.  This
		 * should never happen since we still hold the AGI buffer from
		 * the inobt check, but we need to be careful about infinite
		 * loops.
		 */
		return xchk_iscan_iget_retry(iscan, false);
	}

	/* Any other error ends the scan attempt; release and pass it up. */
	if (error) {
		xfs_trans_brelse(sc->tp, agi_bp);
		xfs_perag_put(pag);
		return error;
	}
	/* The first inode is in hand; step past it in the allocation mask. */
	idx++;
	ino++;
	allocmask >>= 1;

	/*
	 * Now that we've filled the first slot in __inodes, try to fill the
	 * rest of the batch with consecutively ordered inodes to reduce the
	 * number of _iter calls.  Make a bitmap of unallocated inodes from the
	 * zeroes in the inuse bitmap; these inodes will not be scanned, but
	 * the _want_live_update predicate will pass through all live updates.
	 *
	 * If we can't iget an allocated inode, stop and return what we have.
	 */
	mutex_lock(&iscan->lock);
	/* ino was already incremented, so this is the first inode we grabbed. */
	iscan->__batch_ino = ino - 1;
	iscan->__skipped_inomask = 0;
	mutex_unlock(&iscan->lock);

	for (i = 1; i < nr_inodes; i++, ino++, allocmask >>= 1) {
		if (!(allocmask & 1)) {
			ASSERT(!(iscan->__skipped_inomask & (1ULL << i)));

			/*
			 * Unallocated inode: record it as skipped (bit i is
			 * relative to __batch_ino) and move the cursor past it.
			 */
			mutex_lock(&iscan->lock);
			iscan->cursor_ino = ino;
			iscan->__skipped_inomask |= (1ULL << i);
			mutex_unlock(&iscan->lock);
			continue;
		}

		ASSERT(iscan->__inodes[idx] == NULL);

		error = xfs_iget(sc->mp, sc->tp, ino, ISCAN_IGET_FLAGS, 0,
				&iscan->__inodes[idx]);
		if (error)
			break;

		mutex_lock(&iscan->lock);
		iscan->cursor_ino = ino;
		mutex_unlock(&iscan->lock);
		idx++;
	}

	trace_xchk_iscan_iget_batch(sc->mp, iscan, nr_inodes, idx);
	xfs_trans_brelse(sc->tp, agi_bp);
	xfs_perag_put(pag);
	/* idx is at least 1 here because the first iget succeeded. */
	return idx;
}

/*
 * Advance the visit cursor to reflect skipped inodes beyond whatever we
 * scanned, then reset the batch tracking state.  Nothing is advanced if no
 * batch is being tracked (__batch_ino is NULLFSINO).
 */
STATIC void
xchk_iscan_finish_batch(
	struct xchk_iscan	*iscan)
{
	xfs_ino_t		highest_skipped;

	mutex_lock(&iscan->lock);

	if (iscan->__batch_ino != NULLFSINO) {
		/* The visited cursor only ever moves forward here. */
		highest_skipped = iscan->__batch_ino +
					xfs_highbit64(iscan->__skipped_inomask);
		iscan->__visited_ino = max(iscan->__visited_ino,
					   highest_skipped);

		trace_xchk_iscan_skip(iscan);
	}

	iscan->__batch_ino = NULLFSINO;
	iscan->__skipped_inomask = 0;

	mutex_unlock(&iscan->lock);
}

/*
 * Advance the inode scan cursor to the next allocated inode and return up to
 * 64 consecutive allocated inodes starting with the cursor position.
*/ STATIC int xchk_iscan_iter_batch( struct xchk_iscan *iscan) { struct xfs_scrub *sc = iscan->sc; int ret; xchk_iscan_finish_batch(iscan); if (iscan->iget_timeout) iscan->__iget_deadline = jiffies + msecs_to_jiffies(iscan->iget_timeout); do { struct xfs_buf *agi_bp = NULL; struct xfs_perag *pag = NULL; xfs_inofree_t allocmask = 0; uint8_t nr_inodes = 0; ret = xchk_iscan_advance(iscan, &pag, &agi_bp, &allocmask, &nr_inodes); if (ret != 1) return ret; if (xchk_iscan_aborted(iscan)) { xfs_trans_brelse(sc->tp, agi_bp); xfs_perag_put(pag); ret = -ECANCELED; break; } ret = xchk_iscan_iget(iscan, pag, agi_bp, allocmask, nr_inodes); } while (ret == -EAGAIN); return ret; } /* * Advance the inode scan cursor to the next allocated inode and return the * incore inode structure associated with it. * * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes, * -ECANCELED if the live scan aborted, -EBUSY if the incore inode could not be * grabbed, or the usual negative errno. * * If the function returns -EBUSY and the caller can handle skipping an inode, * it may call this function again to continue the scan with the next allocated * inode. */ int xchk_iscan_iter( struct xchk_iscan *iscan, struct xfs_inode **ipp) { unsigned int i; int error; /* Find a cached inode, or go get another batch. */ for (i = 0; i < XFS_INODES_PER_CHUNK; i++) { if (iscan->__inodes[i]) goto foundit; } error = xchk_iscan_iter_batch(iscan); if (error <= 0) return error; ASSERT(iscan->__inodes[0] != NULL); i = 0; foundit: /* Give the caller our reference. */ *ipp = iscan->__inodes[i]; iscan->__inodes[i] = NULL; return 1; } /* Clean up an xfs_iscan_iter call by dropping any inodes that we still hold. 
*/ void xchk_iscan_iter_finish( struct xchk_iscan *iscan) { struct xfs_scrub *sc = iscan->sc; unsigned int i; for (i = 0; i < XFS_INODES_PER_CHUNK; i++) { if (iscan->__inodes[i]) { xchk_irele(sc, iscan->__inodes[i]); iscan->__inodes[i] = NULL; } } } /* Mark this inode scan finished and release resources. */ void xchk_iscan_teardown( struct xchk_iscan *iscan) { xchk_iscan_iter_finish(iscan); xchk_iscan_finish(iscan); mutex_destroy(&iscan->lock); } /* Pick an AG from which to start a scan. */ static inline xfs_ino_t xchk_iscan_rotor( struct xfs_mount *mp) { static atomic_t agi_rotor; unsigned int r = atomic_inc_return(&agi_rotor) - 1; /* * Rotoring *backwards* through the AGs, so we add one here before * subtracting from the agcount to arrive at an AG number. */ r = (r % mp->m_sb.sb_agcount) + 1; return XFS_AGINO_TO_INO(mp, mp->m_sb.sb_agcount - r, 0); } /* * Set ourselves up to start an inode scan. If the @iget_timeout and * @iget_retry_delay parameters are set, the scan will try to iget each inode * for @iget_timeout milliseconds. If an iget call indicates that the inode is * waiting to be inactivated, the CPU will relax for @iget_retry_delay * milliseconds after pushing the inactivation workers. */ void xchk_iscan_start( struct xfs_scrub *sc, unsigned int iget_timeout, unsigned int iget_retry_delay, struct xchk_iscan *iscan) { xfs_ino_t start_ino; start_ino = xchk_iscan_rotor(sc->mp); iscan->__batch_ino = NULLFSINO; iscan->__skipped_inomask = 0; iscan->sc = sc; clear_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); iscan->iget_timeout = iget_timeout; iscan->iget_retry_delay = iget_retry_delay; iscan->__visited_ino = start_ino; iscan->cursor_ino = start_ino; iscan->scan_start_ino = start_ino; mutex_init(&iscan->lock); memset(iscan->__inodes, 0, sizeof(iscan->__inodes)); trace_xchk_iscan_start(iscan, start_ino); } /* * Mark this inode as having been visited. Callers must hold a sufficiently * exclusive lock on the inode to prevent concurrent modifications. 
*/
void
xchk_iscan_mark_visited(
	struct xchk_iscan	*iscan,
	struct xfs_inode	*ip)
{
	/* The visited cursor is only updated under the iscan lock. */
	mutex_lock(&iscan->lock);
	iscan->__visited_ino = ip->i_ino;
	trace_xchk_iscan_visit(iscan);
	mutex_unlock(&iscan->lock);
}

/*
 * Was @ino passed over because the inobt said it was unallocated when the
 * current batch was loaded?  If so, it is newly allocated and will not be
 * scanned, so all live updates to this inode must be passed to the caller to
 * maintain scan correctness.
 */
static inline bool
xchk_iscan_skipped(
	const struct xchk_iscan	*iscan,
	xfs_ino_t		ino)
{
	const xfs_ino_t		batch_start = iscan->__batch_ino;

	/* No batch is loaded, or @ino lies outside the batch window. */
	if (batch_start == NULLFSINO ||
	    ino < batch_start ||
	    ino >= batch_start + XFS_INODES_PER_CHUNK)
		return false;

	/* Bit N of the skipped mask describes inode (batch_start + N). */
	return (iscan->__skipped_inomask >> (ino - batch_start)) & 1;
}

/*
 * Decide if the caller needs to apply a live update for @ino.  This is true
 * if the scanner thread has visited this inode and the scan hasn't been
 * aborted due to errors.  Callers must hold a sufficiently exclusive lock on
 * the inode to prevent scanners from reading any inode metadata.
 */
bool
xchk_iscan_want_live_update(
	struct xchk_iscan	*iscan,
	xfs_ino_t		ino)
{
	bool			wanted;

	if (xchk_iscan_aborted(iscan))
		return false;

	mutex_lock(&iscan->lock);

	trace_xchk_iscan_want_live_update(iscan, ino);

	if (iscan->__visited_ino == NULLFSINO) {
		/* Scan is finished, caller should receive all updates. */
		wanted = true;
	} else if (iscan->scan_start_ino == iscan->__visited_ino) {
		/*
		 * No inodes have been visited yet, so the visited cursor
		 * points at the start of the scan range.  The caller should
		 * not receive any updates.
		 */
		wanted = false;
	} else if (xchk_iscan_skipped(iscan, ino)) {
		/*
		 * This inode was not allocated at the time of the iscan batch.
		 * The caller should receive all updates.
		 */
		wanted = true;
	} else if (iscan->scan_start_ino <= iscan->__visited_ino) {
		/*
		 * The visited cursor hasn't yet wrapped around the end of the
		 * FS.  If @ino is inside the starred range, the caller should
		 * receive updates:
		 *
		 * 0 ------------ S ************ V ------------ EOFS
		 */
		wanted = ino >= iscan->scan_start_ino &&
			 ino <= iscan->__visited_ino;
	} else {
		/*
		 * The visited cursor wrapped around the end of the FS.  If
		 * @ino is inside the starred range, the caller should receive
		 * updates:
		 *
		 * 0 ************ V ------------ S ************ EOFS
		 */
		wanted = ino >= iscan->scan_start_ino ||
			 ino <= iscan->__visited_ino;
	}

	mutex_unlock(&iscan->lock);
	return wanted;
}