// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"

/*
 * Implement Garbage Collection (GC) of partially used zones.
 *
 * To support the purely sequential writes in each zone, zoned XFS needs to be
 * able to move data remaining in a zone out of it to reset the zone to prepare
 * for writing to it again.
 *
 * This is done by the GC thread implemented in this file.  To support that, a
 * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
 * write the garbage collected data into.
 *
 * Whenever the available space is below the chosen threshold, the GC thread
 * looks for potential non-empty but not fully used zones that are worth
 * reclaiming.  Once found, the rmap for the victim zone is queried, and after
 * a bit of sorting to reduce fragmentation, the still live extents are read
 * into memory and written to the GC target zone, and the bmap btree of the
 * files is updated to point to the new location.  To avoid taking the IOLOCK
 * and MMAPLOCK for the entire GC process and thus affecting the latency of
 * user reads and writes to the files, the GC writes are speculative and the
 * I/O completion checks that no other writes happened for the affected regions
 * before remapping.
 *
 * Once a zone does not contain any valid data, be that through GC or user
 * block removal, it is queued for a zone reset.  The reset operation
 * carefully ensures that the RT device cache is flushed and all transactions
 * referencing the rmap have been committed to disk.
 */

/*
 * Size of each GC scratch pad.  This is also the upper bound for each
 * GC I/O, which helps to keep latency down.
 */
#define XFS_GC_CHUNK_SIZE	SZ_1M

/*
 * Scratchpad data to read GCed data into.
 *
 * The offset member tracks where the next allocation starts, and freed tracks
 * the amount of space that is not used anymore.
 */
#define XFS_ZONE_GC_NR_SCRATCH	2
struct xfs_zone_scratch {
	struct folio		*folio;
	unsigned int		offset;
	unsigned int		freed;
};

/*
 * Chunk that is read and written for each GC operation.
 *
 * Note that for writes to actual zoned devices, the chunk can be split when
 * reaching the hardware limit.
 */
struct xfs_gc_bio {
	struct xfs_zone_gc_data	*data;

	/*
	 * Entry into the reading/writing/resetting list.  Only accessed from
	 * the GC thread, so no locking needed.
	 */
	struct list_head	entry;

	/*
	 * State of this gc_bio.  Done means the current I/O completed.
	 * Set from the bio end I/O handler, read from the GC thread.
	 */
	enum {
		XFS_GC_BIO_NEW,
		XFS_GC_BIO_DONE,
	} state;

	/*
	 * Pointer to the inode and byte range in the inode that this
	 * GC chunk is operating on.
	 */
	struct xfs_inode	*ip;
	loff_t			offset;
	unsigned int		len;

	/*
	 * Existing startblock (in the zone to be freed) and newly assigned
	 * daddr in the zone GCed into.
	 */
	xfs_fsblock_t		old_startblock;
	xfs_daddr_t		new_daddr;
	struct xfs_zone_scratch	*scratch;

	/*
	 * Are we writing to a sequential write required zone?
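	 * If so, the GC write is submitted as a zone append and the final
	 * on-disk location is only known once the I/O has completed.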
	 */
	bool			is_seq;

	/* Open Zone being written to */
	struct xfs_open_zone	*oz;

	/* Bio used for reads and writes, including the bvec used by it */
	struct bio_vec		bv;
	struct bio		bio;	/* must be last */
};

#define XFS_ZONE_GC_RECS	1024

/* iterator, needs to be reinitialized for each victim zone */
struct xfs_zone_gc_iter {
	struct xfs_rtgroup	*victim_rtg;
	unsigned int		rec_count;
	unsigned int		rec_idx;
	xfs_agblock_t		next_startblock;
	struct xfs_rmap_irec	*recs;
};

/*
 * Per-mount GC state.
 */
struct xfs_zone_gc_data {
	struct xfs_mount	*mp;

	/* bioset used to allocate the gc_bios */
	struct bio_set		bio_set;

	/*
	 * Scratchpads used, and index to indicate which one is currently in
	 * use.
	 */
	struct xfs_zone_scratch	scratch[XFS_ZONE_GC_NR_SCRATCH];
	unsigned int		scratch_idx;

	/*
	 * List of bios currently being read, written and reset.
	 * These lists are only accessed by the GC thread itself, and must only
	 * be processed in order.
	 */
	struct list_head	reading;
	struct list_head	writing;
	struct list_head	resetting;

	/*
	 * Iterator for the victim zone.
	 */
	struct xfs_zone_gc_iter	iter;
};

/*
 * We aim to keep enough zones free in stock to fully use the open zone limit
 * for data placement purposes.
 */
bool
xfs_zoned_need_gc(
	struct xfs_mount	*mp)
{
	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
		return false;
	if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) <
	    mp->m_groups[XG_TYPE_RTG].blocks *
	    (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
		return true;
	return false;
}

static struct xfs_zone_gc_data *
xfs_zone_gc_data_alloc(
	struct xfs_mount	*mp)
{
	struct xfs_zone_gc_data	*data;
	int			i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return NULL;
	data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
				  GFP_KERNEL);
	if (!data->iter.recs)
		goto out_free_data;

	/*
	 * We actually only need a single bio_vec.  It would be nice to have
	 * a flag that only allocates the inline bvecs and not the separate
	 * bvec pool.
	 */
	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
			BIOSET_NEED_BVECS))
		goto out_free_recs;
	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
		data->scratch[i].folio =
			folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
		if (!data->scratch[i].folio)
			goto out_free_scratch;
	}
	INIT_LIST_HEAD(&data->reading);
	INIT_LIST_HEAD(&data->writing);
	INIT_LIST_HEAD(&data->resetting);
	data->mp = mp;
	return data;

out_free_scratch:
	while (--i >= 0)
		folio_put(data->scratch[i].folio);
	bioset_exit(&data->bio_set);
out_free_recs:
	kfree(data->iter.recs);
out_free_data:
	kfree(data);
	return NULL;
}

static void
xfs_zone_gc_data_free(
	struct xfs_zone_gc_data	*data)
{
	int			i;

	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
		folio_put(data->scratch[i].folio);
	bioset_exit(&data->bio_set);
	kfree(data->iter.recs);
	kfree(data);
}

static void
xfs_zone_gc_iter_init(
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rtgroup	*victim_rtg)
{
	iter->next_startblock = 0;
	iter->rec_count = 0;
	iter->rec_idx = 0;
	iter->victim_rtg = victim_rtg;
}

/*
 * Query the rmap of the victim zone to gather the records to evacuate.
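 *
 * xfs_zone_gc_query_cb() stashes each record into the iterator and terminates
 * the range query once XFS_ZONE_GC_RECS records have been collected, noting
 * where the next query has to restart.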
 */
static int
xfs_zone_gc_query_cb(
	struct xfs_btree_cur	*cur,
	const struct xfs_rmap_irec *irec,
	void			*private)
{
	struct xfs_zone_gc_iter	*iter = private;

	ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
	ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
	ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));

	iter->recs[iter->rec_count] = *irec;
	if (++iter->rec_count == XFS_ZONE_GC_RECS) {
		iter->next_startblock =
			irec->rm_startblock + irec->rm_blockcount;
		return 1;
	}
	return 0;
}

#define cmp_int(l, r)		((l > r) - (l < r))

static int
xfs_zone_gc_rmap_rec_cmp(
	const void		*a,
	const void		*b)
{
	const struct xfs_rmap_irec *reca = a;
	const struct xfs_rmap_irec *recb = b;
	int			diff;

	diff = cmp_int(reca->rm_owner, recb->rm_owner);
	if (diff)
		return diff;
	return cmp_int(reca->rm_offset, recb->rm_offset);
}

static int
xfs_zone_gc_query(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter)
{
	struct xfs_rtgroup	*rtg = iter->victim_rtg;
	struct xfs_rmap_irec	ri_low = { };
	struct xfs_rmap_irec	ri_high;
	struct xfs_btree_cur	*cur;
	struct xfs_trans	*tp;
	int			error;

	ASSERT(iter->next_startblock <= rtg_blocks(rtg));
	if (iter->next_startblock == rtg_blocks(rtg))
		goto done;

	ASSERT(iter->next_startblock < rtg_blocks(rtg));
	ri_low.rm_startblock = iter->next_startblock;
	memset(&ri_high, 0xFF, sizeof(ri_high));

	iter->rec_idx = 0;
	iter->rec_count = 0;

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		return error;

	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
	error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
			xfs_zone_gc_query_cb, iter);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
	xfs_btree_del_cursor(cur, error < 0 ? error : 0);
	xfs_trans_cancel(tp);

	if (error < 0)
		return error;

	/*
	 * Sort the rmap records by inode number and increasing offset to
	 * defragment the mappings.
	 *
	 * This could be further enhanced by an even bigger look ahead window,
	 * but that's better left until we have better detection of changes to
	 * inode mapping to avoid the potential of GCing already dead data.
	 */
	sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
			xfs_zone_gc_rmap_rec_cmp, NULL);

	if (error == 0) {
		/*
		 * We finished iterating through the zone.
		 */
		iter->next_startblock = rtg_blocks(rtg);
		if (iter->rec_count == 0)
			goto done;
	}

	return 0;
done:
	xfs_rtgroup_rele(iter->victim_rtg);
	iter->victim_rtg = NULL;
	return 0;
}

static bool
xfs_zone_gc_iter_next(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rmap_irec	*chunk_rec,
	struct xfs_inode	**ipp)
{
	struct xfs_rmap_irec	*irec;
	int			error;

	if (!iter->victim_rtg)
		return false;

retry:
	if (iter->rec_idx == iter->rec_count) {
		error = xfs_zone_gc_query(mp, iter);
		if (error)
			goto fail;
		if (!iter->victim_rtg)
			return false;
	}

	irec = &iter->recs[iter->rec_idx];
	error = xfs_iget(mp, NULL, irec->rm_owner,
			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
	if (error) {
		/*
		 * If the inode was already deleted, skip over it.
		 */
		if (error == -ENOENT) {
			iter->rec_idx++;
			goto retry;
		}

		goto fail;
	}

	if (!S_ISREG(VFS_I(*ipp)->i_mode) ||
	    !XFS_IS_REALTIME_INODE(*ipp)) {
		iter->rec_idx++;
		xfs_irele(*ipp);
		goto retry;
	}

	*chunk_rec = *irec;
	return true;

fail:
	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	return false;
}

static void
xfs_zone_gc_iter_advance(
	struct xfs_zone_gc_iter	*iter,
	xfs_extlen_t		count_fsb)
{
	struct xfs_rmap_irec	*irec = &iter->recs[iter->rec_idx];

	irec->rm_offset += count_fsb;
	irec->rm_startblock += count_fsb;
	irec->rm_blockcount -= count_fsb;
	if (!irec->rm_blockcount)
		iter->rec_idx++;
}

static struct xfs_rtgroup *
xfs_zone_gc_pick_victim_from(
	struct xfs_mount	*mp,
	uint32_t		bucket)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		victim_used = U32_MAX;
	struct xfs_rtgroup	*victim_rtg = NULL;
	uint32_t		bit;

	if (!zi->zi_used_bucket_entries[bucket])
		return NULL;

	for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
			mp->m_sb.sb_rgcount) {
		struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);

		if (!rtg)
			continue;

		/* skip zones that are just waiting for a reset */
		if (rtg_rmap(rtg)->i_used_blocks == 0 ||
		    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
			xfs_rtgroup_rele(rtg);
			continue;
		}

		if (victim_rtg)
			xfs_rtgroup_rele(victim_rtg);
		victim_rtg = rtg;
		victim_used = rtg_rmap(rtg)->i_used_blocks;

		/*
		 * Any zone that is less than 1 percent used is fair game for
		 * instant reclaim.  All of these zones are in the last
		 * bucket, so avoid the expensive division for the zones
		 * in the other buckets.
		 */
		if (bucket == 0 &&
		    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
			break;
	}

	return victim_rtg;
}

/*
 * Iterate through all zones marked as reclaimable and find a candidate to
 * reclaim.
 */
static bool
xfs_zone_gc_select_victim(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_rtgroup	*victim_rtg = NULL;
	unsigned int		bucket;

	if (xfs_is_shutdown(mp))
		return false;

	if (iter->victim_rtg)
		return true;

	/*
	 * Don't start new work if we are asked to stop or park.
	 */
	if (kthread_should_stop() || kthread_should_park())
		return false;

	if (!xfs_zoned_need_gc(mp))
		return false;

	spin_lock(&zi->zi_used_buckets_lock);
	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
		if (victim_rtg)
			break;
	}
	spin_unlock(&zi->zi_used_buckets_lock);

	if (!victim_rtg)
		return false;

	trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
	xfs_zone_gc_iter_init(iter, victim_rtg);
	return true;
}

static struct xfs_open_zone *
xfs_zone_gc_steal_open(
	struct xfs_zone_info	*zi)
{
	struct xfs_open_zone	*oz, *found = NULL;

	spin_lock(&zi->zi_open_zones_lock);
	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
		if (!found ||
		    oz->oz_write_pointer < found->oz_write_pointer)
			found = oz;
	}

	if (found) {
		found->oz_is_gc = true;
		list_del_init(&found->oz_entry);
		zi->zi_nr_open_zones--;
	}

	spin_unlock(&zi->zi_open_zones_lock);
	return found;
}

static struct xfs_open_zone *
xfs_zone_gc_select_target(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*oz = zi->zi_open_gc_zone;

	/*
	 * We need to wait for pending writes to finish.
	 */
	if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
		return NULL;

	ASSERT(zi->zi_nr_open_zones <=
		mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
	oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (oz)
		trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	spin_lock(&zi->zi_open_zones_lock);
	zi->zi_open_gc_zone = oz;
	spin_unlock(&zi->zi_open_zones_lock);
	return oz;
}

/*
 * Ensure we have a valid open zone to write the GC data to.
 *
 * If the current target zone has space keep writing to it, else first wait for
 * all pending writes and then pick a new one.
 */
static struct xfs_open_zone *
xfs_zone_gc_ensure_target(
	struct xfs_mount	*mp)
{
	struct xfs_open_zone	*oz = mp->m_zone_info->zi_open_gc_zone;

	if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
		return xfs_zone_gc_select_target(mp);
	return oz;
}

static unsigned int
xfs_zone_gc_scratch_available(
	struct xfs_zone_gc_data	*data)
{
	return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
}

static bool
xfs_zone_gc_space_available(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(data->mp);
	if (!oz)
		return false;
	return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) &&
		xfs_zone_gc_scratch_available(data);
}

static void
xfs_zone_gc_end_io(
	struct bio		*bio)
{
	struct xfs_gc_bio	*chunk =
		container_of(bio, struct xfs_gc_bio, bio);
	struct xfs_zone_gc_data	*data = chunk->data;

	WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
	wake_up_process(data->mp->m_zone_info->zi_gc_thread);
}

static struct xfs_open_zone *
xfs_zone_gc_alloc_blocks(
	struct xfs_zone_gc_data	*data,
	xfs_extlen_t		*count_fsb,
	xfs_daddr_t		*daddr,
	bool			*is_seq)
{
	struct xfs_mount	*mp = data->mp;
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(mp);
	if (!oz)
		return NULL;

	*count_fsb = min(*count_fsb,
		XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));

	/*
	 * Directly allocate GC blocks from the reserved pool.
	 *
	 * If we'd take them from the normal pool we could be stealing blocks
	 * from a regular writer, which would then have to wait for GC and
	 * deadlock.
	 */
	spin_lock(&mp->m_sb_lock);
	*count_fsb = min(*count_fsb,
			rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer);
	*count_fsb = min3(*count_fsb,
			mp->m_free[XC_FREE_RTEXTENTS].res_avail,
			mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
	mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
	mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
	spin_unlock(&mp->m_sb_lock);

	if (!*count_fsb)
		return NULL;

	*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
	if (!*is_seq)
		*daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer);
	oz->oz_write_pointer += *count_fsb;
	atomic_inc(&oz->oz_ref);
	return oz;
}

static bool
xfs_zone_gc_start_chunk(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	struct xfs_open_zone	*oz;
	struct xfs_rmap_irec	irec;
	struct xfs_gc_bio	*chunk;
	struct xfs_inode	*ip;
	struct bio		*bio;
	xfs_daddr_t		daddr;
	bool			is_seq;

	if (xfs_is_shutdown(mp))
		return false;

	if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
		return false;
	oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
			&is_seq);
	if (!oz) {
		xfs_irele(ip);
		return false;
	}

	bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);

	chunk = container_of(bio, struct xfs_gc_bio, bio);
	chunk->ip = ip;
	chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
	chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
	chunk->old_startblock =
		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
	chunk->new_daddr = daddr;
	chunk->is_seq = is_seq;
	chunk->scratch = &data->scratch[data->scratch_idx];
	chunk->data = data;
	chunk->oz = oz;

	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
	bio->bi_end_io = xfs_zone_gc_end_io;
	bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
			chunk->scratch->offset);
	chunk->scratch->offset += chunk->len;
	if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
		data->scratch_idx =
			(data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
	}
	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&chunk->entry, &data->reading);
	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);

	submit_bio(bio);
	return true;
}

static void
xfs_zone_gc_free_chunk(
	struct xfs_gc_bio	*chunk)
{
	list_del(&chunk->entry);
	xfs_open_zone_put(chunk->oz);
	xfs_irele(chunk->ip);
	bio_put(&chunk->bio);
}

static void
xfs_zone_gc_submit_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	if (chunk->is_seq) {
		chunk->bio.bi_opf &= ~REQ_OP_WRITE;
		chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
	}
	chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
	chunk->bio.bi_end_io = xfs_zone_gc_end_io;
	submit_bio(&chunk->bio);
}

static struct xfs_gc_bio *
xfs_zone_gc_split_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	struct queue_limits	*lim =
		&bdev_get_queue(chunk->bio.bi_bdev)->limits;
	struct xfs_gc_bio	*split_chunk;
	int			split_sectors;
	unsigned int		split_len;
	struct bio		*split;
	unsigned int		nsegs;

	if (!chunk->is_seq)
		return NULL;

	split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
			lim->max_zone_append_sectors << SECTOR_SHIFT);
	if (!split_sectors)
		return NULL;

	/* ensure the split chunk is still block size aligned */
	split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
			data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
	split_len = split_sectors << SECTOR_SHIFT;

	split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
	split_chunk = container_of(split, struct xfs_gc_bio, bio);
	split_chunk->data = data;
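
	/*
	 * The split chunk holds its own references on the inode and on the
	 * open zone it is writing into; both are dropped again in
	 * xfs_zone_gc_free_chunk().
	 */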
	ihold(VFS_I(chunk->ip));
	split_chunk->ip = chunk->ip;
	split_chunk->is_seq = chunk->is_seq;
	split_chunk->scratch = chunk->scratch;
	split_chunk->offset = chunk->offset;
	split_chunk->len = split_len;
	split_chunk->old_startblock = chunk->old_startblock;
	split_chunk->new_daddr = chunk->new_daddr;
	split_chunk->oz = chunk->oz;
	atomic_inc(&chunk->oz->oz_ref);

	chunk->offset += split_len;
	chunk->len -= split_len;
	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);

	/* add right before the original chunk */
	WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&split_chunk->entry, &chunk->entry);
	return split_chunk;
}

static void
xfs_zone_gc_write_chunk(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_zone_gc_data	*data = chunk->data;
	struct xfs_mount	*mp = chunk->ip->i_mount;
	unsigned int		folio_offset = chunk->bio.bi_io_vec->bv_offset;
	struct xfs_gc_bio	*split_chunk;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_move_tail(&chunk->entry, &data->writing);

	bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
	bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
			folio_offset);

	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
		xfs_zone_gc_submit_write(data, split_chunk);
	xfs_zone_gc_submit_write(data, chunk);
}

static void
xfs_zone_gc_finish_chunk(
	struct xfs_gc_bio	*chunk)
{
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	struct xfs_inode	*ip = chunk->ip;
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	chunk->scratch->freed += chunk->len;
	if (chunk->scratch->freed == chunk->scratch->offset) {
		chunk->scratch->offset = 0;
		chunk->scratch->freed = 0;
	}

	/*
	 * Cycle through the iolock and wait for direct I/O and layouts to
	 * ensure no one is reading from the old mapping before it goes away.
	 *
	 * Note that xfs_zoned_end_io() below checks that no other writer raced
	 * with us to update the mapping by checking that the old startblock
	 * didn't change.
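	 *
	 * If such a race is detected, the remapping is skipped and the
	 * speculatively written copy of the data is not used.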
	 */
	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
	if (!error)
		inode_dio_wait(VFS_I(ip));
	xfs_iunlock(ip, iolock);
	if (error)
		goto free;

	if (chunk->is_seq)
		chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
	error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
			chunk->new_daddr, chunk->oz, chunk->old_startblock);
free:
	if (error)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	xfs_zone_gc_free_chunk(chunk);
}

static void
xfs_zone_gc_finish_reset(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_rtgroup	*rtg = chunk->bio.bi_private;
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;

	if (chunk->bio.bi_status) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		goto out;
	}

	xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
	atomic_inc(&zi->zi_nr_free_zones);

	xfs_zoned_add_available(mp, rtg_blocks(rtg));

	wake_up_all(&zi->zi_zone_wait);
out:
	list_del(&chunk->entry);
	bio_put(&chunk->bio);
}

static bool
xfs_zone_gc_prepare_reset(
	struct bio		*bio,
	struct xfs_rtgroup	*rtg)
{
	trace_xfs_zone_reset(rtg);

	ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
	bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
	if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
		if (!bdev_max_discard_sectors(bio->bi_bdev))
			return false;
		bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
		bio->bi_iter.bi_size =
			XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
	}

	return true;
}

int
xfs_zone_gc_reset_sync(
	struct xfs_rtgroup	*rtg)
{
	int			error = 0;
	struct bio		bio;

	bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
			REQ_OP_ZONE_RESET);
	if (xfs_zone_gc_prepare_reset(&bio, rtg))
		error = submit_bio_wait(&bio);
	bio_uninit(&bio);

	return error;
}

static void
xfs_zone_gc_reset_zones(
	struct xfs_zone_gc_data	*data,
	struct xfs_group	*reset_list)
{
	struct xfs_group	*next = reset_list;

	if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
		xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
		return;
	}

	do {
		struct xfs_rtgroup	*rtg = to_rtg(next);
		struct xfs_gc_bio	*chunk;
		struct bio		*bio;

		xfs_log_force_inode(rtg_rmap(rtg));

		next = rtg_group(rtg)->xg_next_reset;
		rtg_group(rtg)->xg_next_reset = NULL;

		bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
				0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
		bio->bi_private = rtg;
		bio->bi_end_io = xfs_zone_gc_end_io;

		chunk = container_of(bio, struct xfs_gc_bio, bio);
		chunk->data = data;
		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
		list_add_tail(&chunk->entry, &data->resetting);

		/*
		 * Also use the bio to drive the state machine when neither
		 * zone reset nor discard is supported to keep things simple.
		 */
		if (xfs_zone_gc_prepare_reset(bio, rtg))
			submit_bio(bio);
		else
			bio_endio(bio);
	} while (next);
}

/*
 * Handle the work to read and write data for GC and to reset the zones,
 * including handling all completions.
 *
 * Note that the order of the chunks is preserved so that we don't undo the
 * optimal order established by xfs_zone_gc_query().
 */
static bool
xfs_zone_gc_handle_work(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_info	*zi = data->mp->m_zone_info;
	struct xfs_gc_bio	*chunk, *next;
	struct xfs_group	*reset_list;
	struct blk_plug		plug;

	spin_lock(&zi->zi_reset_list_lock);
	reset_list = zi->zi_reset_list;
	zi->zi_reset_list = NULL;
	spin_unlock(&zi->zi_reset_list_lock);

	if (!xfs_zone_gc_select_victim(data) ||
	    !xfs_zone_gc_space_available(data)) {
		if (list_empty(&data->reading) &&
		    list_empty(&data->writing) &&
		    list_empty(&data->resetting) &&
		    !reset_list)
			return false;
	}

	__set_current_state(TASK_RUNNING);
	try_to_freeze();

	if (reset_list)
		xfs_zone_gc_reset_zones(data, reset_list);

	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		xfs_zone_gc_finish_reset(chunk);
	}

	list_for_each_entry_safe(chunk, next, &data->writing, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		xfs_zone_gc_finish_chunk(chunk);
	}

	blk_start_plug(&plug);
	list_for_each_entry_safe(chunk, next, &data->reading, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		xfs_zone_gc_write_chunk(chunk);
	}
	blk_finish_plug(&plug);

	blk_start_plug(&plug);
	while (xfs_zone_gc_start_chunk(data))
		;
	blk_finish_plug(&plug);
	return true;
}

/*
 * Note that the current GC algorithm would break reflinks and thus duplicate
 * data that was shared by multiple owners before.  Because of that reflinks
 * are currently not supported on zoned file systems and can't be created or
 * mounted.
 */
static int
xfs_zoned_gcd(
	void			*private)
{
	struct xfs_zone_gc_data	*data = private;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	unsigned int		nofs_flag;

	nofs_flag = memalloc_nofs_save();
	set_freezable();

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
		xfs_set_zonegc_running(mp);
		if (xfs_zone_gc_handle_work(data))
			continue;

		if (list_empty(&data->reading) &&
		    list_empty(&data->writing) &&
		    list_empty(&data->resetting) &&
		    !zi->zi_reset_list) {
			xfs_clear_zonegc_running(mp);
			xfs_zoned_resv_wake_all(mp);

			if (kthread_should_stop()) {
				__set_current_state(TASK_RUNNING);
				break;
			}

			if (kthread_should_park()) {
				__set_current_state(TASK_RUNNING);
				kthread_parkme();
				continue;
			}
		}

		schedule();
	}
	xfs_clear_zonegc_running(mp);

	if (data->iter.victim_rtg)
		xfs_rtgroup_rele(data->iter.victim_rtg);

	memalloc_nofs_restore(nofs_flag);
	xfs_zone_gc_data_free(data);
	return 0;
}

void
xfs_zone_gc_start(
	struct xfs_mount	*mp)
{
	if (xfs_has_zoned(mp))
		kthread_unpark(mp->m_zone_info->zi_gc_thread);
}

void
xfs_zone_gc_stop(
	struct xfs_mount	*mp)
{
	if (xfs_has_zoned(mp))
		kthread_park(mp->m_zone_info->zi_gc_thread);
}

int
xfs_zone_gc_mount(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_zone_gc_data	*data;
	struct xfs_open_zone	*oz;
	int			error;

	/*
	 * If there are no free zones available for GC, pick the open zone with
	 * the least used space to GC into.  This should only happen after an
	 * unclean shutdown near ENOSPC while GC was ongoing.
	 *
	 * We also need to do this for the first gc zone allocation if we
	 * unmounted while at the open limit.
	 */
	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
	    zi->zi_nr_open_zones == mp->m_max_open_zones)
		oz = xfs_zone_gc_steal_open(zi);
	else
		oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (!oz) {
		xfs_warn(mp, "unable to allocate a zone for gc");
		error = -EIO;
		goto out;
	}

	trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	zi->zi_open_gc_zone = oz;

	data = xfs_zone_gc_data_alloc(mp);
	if (!data) {
		error = -ENOMEM;
		goto out_put_gc_zone;
	}

	mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
			"xfs-zone-gc/%s", mp->m_super->s_id);
	if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
		xfs_warn(mp, "unable to create zone gc thread");
		error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
		goto out_free_gc_data;
	}

	/* xfs_zone_gc_start will unpark for rw mounts */
	kthread_park(mp->m_zone_info->zi_gc_thread);
	return 0;

out_free_gc_data:
	kfree(data);
out_put_gc_zone:
	xfs_open_zone_put(zi->zi_open_gc_zone);
out:
	return error;
}

void
xfs_zone_gc_unmount(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;

	kthread_stop(zi->zi_gc_thread);
	if (zi->zi_open_gc_zone)
		xfs_open_zone_put(zi->zi_open_gc_zone);
}