/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Red Hat
 */

#ifndef VDO_BLOCK_MAP_H
#define VDO_BLOCK_MAP_H

#include <linux/list.h>

#include "numeric.h"

#include "admin-state.h"
#include "completion.h"
#include "encodings.h"
#include "int-map.h"
#include "statistics.h"
#include "types.h"
#include "vio.h"
#include "wait-queue.h"

/*
 * The block map is responsible for tracking all the logical to physical mappings of a VDO. It
 * consists of a collection of 60 radix trees gradually allocated as logical addresses are used.
 * Each tree is assigned to a logical zone such that it is easy to compute which zone must handle
 * each logical address. Each logical zone also has a dedicated portion of the leaf page cache.
 *
 * Each logical zone has a single dedicated queue and thread for performing all updates to the
 * radix trees assigned to that zone. The concurrency guarantees of this single-threaded model
 * allow the code to omit more fine-grained locking for the block map structures.
 *
 * Load operations must be performed on the admin thread. Normal operations, such as reading and
 * updating mappings, must be performed on the appropriate logical zone thread. Save operations
 * must be launched from the same admin thread as the original load operation.
 */

enum {
	BLOCK_MAP_VIO_POOL_SIZE = 64,
};

/*
 * Generation counter for page references.
 */
typedef u32 vdo_page_generation;

extern const struct block_map_entry UNMAPPED_BLOCK_MAP_ENTRY;

/* The VDO Page Cache abstraction. */
struct vdo_page_cache {
	/* the VDO which owns this cache */
	struct vdo *vdo;
	/* number of pages in cache */
	page_count_t page_count;
	/* number of pages to write in the current batch */
	page_count_t pages_in_batch;
	/* Whether the VDO is doing a read-only rebuild */
	bool rebuilding;

	/* array of page information entries */
	struct page_info *infos;
	/* raw memory for pages */
	char *pages;
	/* cache last found page info */
	struct page_info *last_found;
	/* map of page number to info */
	struct int_map *page_map;
	/* main LRU list (all infos) */
	struct list_head lru_list;
	/* free page list (oldest first) */
	struct list_head free_list;
	/* outgoing page list */
	struct list_head outgoing_list;
	/* number of read I/O operations pending */
	page_count_t outstanding_reads;
	/* number of write I/O operations pending */
	page_count_t outstanding_writes;
	/* number of pages covered by the current flush */
	page_count_t pages_in_flush;
	/* number of pages waiting to be included in the next flush */
	page_count_t pages_to_flush;
	/* number of discards in progress */
	unsigned int discard_count;
	/* how many VPCs are waiting for a free page */
	unsigned int waiter_count;
	/* queue of waiters who want a free page */
	struct vdo_wait_queue free_waiters;
	/*
	 * Statistics are only updated on the logical zone thread, but are accessed from other
	 * threads.
	 */
	struct block_map_statistics stats;
	/* counter for pressure reports */
	u32 pressure_report;
	/* the block map zone to which this cache belongs */
	struct block_map_zone *zone;
};
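/*
 * Illustrative sketch only (not part of the VDO API): the cache keeps one page_info per slot in
 * 'infos' and, assuming one VDO_BLOCK_SIZE buffer per slot backing 'pages', its memory footprint
 * can be roughly estimated as shown below. The helper name is hypothetical; it exists only to
 * make the sizing relationship concrete.
 *
 *	static inline size_t example_page_cache_footprint(page_count_t page_count)
 *	{
 *		return page_count * (sizeof(struct page_info) + VDO_BLOCK_SIZE);
 *	}
 */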
/*
 * The state of a page buffer. If the page buffer is free, no particular page is bound to it;
 * otherwise the page buffer is bound to a particular page whose absolute pbn is in the pbn field.
 * If the page is resident or dirty, the page data is stable and may be accessed. Otherwise the
 * page is in flight (incoming or outgoing) and its data should not be accessed.
 *
 * @note Update the static data in get_page_state_name() if you change this enumeration.
 */
enum vdo_page_buffer_state {
	/* this page buffer is not being used */
	PS_FREE,
	/* this page is being read from store */
	PS_INCOMING,
	/* attempt to load this page failed */
	PS_FAILED,
	/* this page is valid and un-modified */
	PS_RESIDENT,
	/* this page is valid and modified */
	PS_DIRTY,
	/* this page is being written and should not be used */
	PS_OUTGOING,
	/* not a state */
	PAGE_STATE_COUNT,
} __packed;

/*
 * The write status of a page
 */
enum vdo_page_write_status {
	WRITE_STATUS_NORMAL,
	WRITE_STATUS_DISCARD,
	WRITE_STATUS_DEFERRED,
} __packed;

/* Per-page-slot information. */
struct page_info {
	/* Preallocated page struct vio */
	struct vio *vio;
	/* back-link for references */
	struct vdo_page_cache *cache;
	/* the pbn of the page */
	physical_block_number_t pbn;
	/* page is busy (temporarily locked) */
	u16 busy;
	/* the write status of the page */
	enum vdo_page_write_status write_status;
	/* page state */
	enum vdo_page_buffer_state state;
	/* queue of completions awaiting this item */
	struct vdo_wait_queue waiting;
	/* state linked list entry */
	struct list_head state_entry;
	/* LRU entry */
	struct list_head lru_entry;
	/*
	 * The earliest recovery journal block containing uncommitted updates to the block map page
	 * associated with this page_info. A reference (lock) is held on that block to prevent it
	 * from being reaped. When this value changes, the reference on the old value must be
	 * released and a reference on the new value must be acquired.
	 */
	sequence_number_t recovery_lock;
};

/*
 * A completion awaiting a specific page. Also a live reference into the page once completed, until
 * freed.
 */
struct vdo_page_completion {
	/* The generic completion */
	struct vdo_completion completion;
	/* The cache involved */
	struct vdo_page_cache *cache;
	/* The waiter for the pending list */
	struct vdo_waiter waiter;
	/* The absolute physical block number of the page on disk */
	physical_block_number_t pbn;
	/* Whether the page may be modified */
	bool writable;
	/* Whether the page is available */
	bool ready;
	/* The info structure for the page, only valid when ready */
	struct page_info *info;
};
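/*
 * Illustrative usage sketch (hypothetical helper names, not part of the VDO API): per the
 * threading comment at the top of this file, a caller on the appropriate logical zone thread
 * embeds a struct vdo_page_completion (here 'vpc'), requests a page with vdo_get_page() (declared
 * below), reads it through vdo_get_cached_page() once its callback runs, and then drops the live
 * reference with vdo_release_page_completion().
 *
 *	static void example_page_ready(struct vdo_completion *completion)
 *	{
 *		struct block_map_page *page;
 *		int result = vdo_get_cached_page(completion, &page);
 *
 *		if (result == VDO_SUCCESS) {
 *			// Read (or, for a writable lookup, update) entries in *page here.
 *		}
 *		vdo_release_page_completion(completion);
 *	}
 *
 *	// Launch a writable lookup; 'zone', 'pbn', 'parent', and the hypothetical
 *	// 'example_handle_error' action come from the caller's context:
 *	vdo_get_page(&vpc, zone, pbn, true, parent,
 *		     example_page_ready, example_handle_error, false);
 *
 * A writable caller that modified the page would presumably also schedule it for writing via
 * vdo_request_page_write(); the exact dirtying protocol is not captured by this sketch.
 */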
struct forest;

struct tree_page {
	struct vdo_waiter waiter;

	/* Dirty list entry */
	struct list_head entry;

	/* If dirty, the tree zone flush generation in which it was last dirtied. */
	u8 generation;

	/* Whether this page is an interior tree page being written out. */
	bool writing;

	/* If writing, the tree zone flush generation of the copy being written. */
	u8 writing_generation;

	/*
	 * Sequence number of the earliest recovery journal block containing uncommitted updates to
	 * this page
	 */
	sequence_number_t recovery_lock;

	/* The value of recovery_lock when this page last started writing */
	sequence_number_t writing_recovery_lock;

	char page_buffer[VDO_BLOCK_SIZE];
};

enum block_map_page_type {
	VDO_TREE_PAGE,
	VDO_CACHE_PAGE,
};

typedef struct list_head dirty_era_t[2];

struct dirty_lists {
	/* The number of periods after which an element will be expired */
	block_count_t maximum_age;
	/* The oldest period which has unexpired elements */
	sequence_number_t oldest_period;
	/* One more than the current period */
	sequence_number_t next_period;
	/* The offset in the array of lists of the oldest period */
	block_count_t offset;
	/* Expired pages */
	dirty_era_t expired;
	/* The lists of dirty pages */
	dirty_era_t eras[];
};

struct block_map_zone {
	zone_count_t zone_number;
	thread_id_t thread_id;
	struct admin_state state;
	struct block_map *block_map;
	/* Dirty pages, by era */
	struct dirty_lists *dirty_lists;
	struct vdo_page_cache page_cache;
	data_vio_count_t active_lookups;
	struct int_map *loading_pages;
	struct vio_pool *vio_pool;
	/* The tree page which has issued or will be issuing a flush */
	struct tree_page *flusher;
	struct vdo_wait_queue flush_waiters;
	/* The generation after the most recent flush */
	u8 generation;
	u8 oldest_generation;
	/* The counts of dirty pages in each generation */
	u32 dirty_page_counts[256];
};

struct block_map {
	struct vdo *vdo;
	struct action_manager *action_manager;
	/* The absolute PBN of the first root of the tree part of the block map */
	physical_block_number_t root_origin;
	block_count_t root_count;

	/* The era point we are currently distributing to the zones */
	sequence_number_t current_era_point;
	/* The next era point */
	sequence_number_t pending_era_point;

	/* The number of entries in the block map */
	block_count_t entry_count;
	nonce_t nonce;
	struct recovery_journal *journal;

	/* The trees for finding block map pages */
	struct forest *forest;
	/* The expanded trees awaiting growth */
	struct forest *next_forest;
	/* The number of entries after growth */
	block_count_t next_entry_count;

	zone_count_t zone_count;
	struct block_map_zone zones[];
};
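/*
 * Illustrative sketch (hypothetical caller, not part of the VDO API): growing the block map uses
 * the prepare/grow/abandon functions declared below, which stage the expanded trees in
 * 'next_forest' before they replace 'forest'. A plausible sequence, ignoring the admin-state and
 * threading requirements a real caller must also satisfy, is:
 *
 *	int result = vdo_prepare_to_grow_block_map(map, new_logical_blocks);
 *
 *	if (result != VDO_SUCCESS)
 *		return result;
 *
 *	if (still_want_to_grow)
 *		vdo_grow_block_map(map, parent);	// async; 'parent' is notified on completion
 *	else
 *		vdo_abandon_block_map_growth(map);	// discard the prepared expansion
 *
 * 'new_logical_blocks', 'still_want_to_grow', and 'parent' stand in for the caller's own state.
 */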
/**
 * typedef vdo_entry_callback_fn - A function to be called for each allocated PBN when traversing
 *                                 the forest.
 * @pbn: A PBN of a tree node.
 * @completion: The parent completion of the traversal.
 *
 * Return: VDO_SUCCESS or an error.
 */
typedef int (*vdo_entry_callback_fn)(physical_block_number_t pbn,
				     struct vdo_completion *completion);

static inline struct vdo_page_completion *as_vdo_page_completion(struct vdo_completion *completion)
{
	vdo_assert_completion_type(completion, VDO_PAGE_COMPLETION);
	return container_of(completion, struct vdo_page_completion, completion);
}

void vdo_release_page_completion(struct vdo_completion *completion);

void vdo_get_page(struct vdo_page_completion *page_completion, struct block_map_zone *zone,
		  physical_block_number_t pbn, bool writable, void *parent,
		  vdo_action_fn callback, vdo_action_fn error_handler, bool requeue);

void vdo_request_page_write(struct vdo_completion *completion);

int __must_check vdo_get_cached_page(struct vdo_completion *completion,
				     struct block_map_page **page_ptr);

int __must_check vdo_invalidate_page_cache(struct vdo_page_cache *cache);

static inline struct block_map_page * __must_check
vdo_as_block_map_page(struct tree_page *tree_page)
{
	return (struct block_map_page *) tree_page->page_buffer;
}

bool vdo_copy_valid_page(char *buffer, nonce_t nonce,
			 physical_block_number_t pbn,
			 struct block_map_page *page);

void vdo_find_block_map_slot(struct data_vio *data_vio);

physical_block_number_t vdo_find_block_map_page_pbn(struct block_map *map,
						    page_number_t page_number);

void vdo_write_tree_page(struct tree_page *page, struct block_map_zone *zone);

void vdo_traverse_forest(struct block_map *map, vdo_entry_callback_fn callback,
			 struct vdo_completion *completion);

int __must_check vdo_decode_block_map(struct block_map_state_2_0 state,
				      block_count_t logical_blocks, struct vdo *vdo,
				      struct recovery_journal *journal, nonce_t nonce,
				      page_count_t cache_size, block_count_t maximum_age,
				      struct block_map **map_ptr);

void vdo_drain_block_map(struct block_map *map, const struct admin_state_code *operation,
			 struct vdo_completion *parent);

void vdo_resume_block_map(struct block_map *map, struct vdo_completion *parent);

int __must_check vdo_prepare_to_grow_block_map(struct block_map *map,
					       block_count_t new_logical_blocks);

void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent);

void vdo_abandon_block_map_growth(struct block_map *map);

void vdo_free_block_map(struct block_map *map);

struct block_map_state_2_0 __must_check vdo_record_block_map(const struct block_map *map);

void vdo_initialize_block_map_from_journal(struct block_map *map,
					   struct recovery_journal *journal);

zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio);

void vdo_advance_block_map_era(struct block_map *map, sequence_number_t recovery_block_number);

void vdo_update_block_map_page(struct block_map_page *page, struct data_vio *data_vio,
			       physical_block_number_t pbn,
			       enum block_mapping_state mapping_state,
			       sequence_number_t *recovery_lock);

void vdo_get_mapped_block(struct data_vio *data_vio);

void vdo_put_mapped_block(struct data_vio *data_vio);

struct block_map_statistics __must_check vdo_get_block_map_statistics(struct block_map *map);
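/*
 * Illustrative sketch (hypothetical names, not part of the VDO API): a minimal
 * vdo_entry_callback_fn that could be handed to vdo_traverse_forest(). Per the typedef above, it
 * is invoked once per allocated tree node PBN and returns VDO_SUCCESS or an error.
 *
 *	static int example_visit_tree_page(physical_block_number_t pbn,
 *					   struct vdo_completion *completion)
 *	{
 *		// 'completion' is the parent completion of the traversal; a real callback
 *		// would typically record or validate 'pbn' using state reachable from it.
 *		return VDO_SUCCESS;
 *	}
 *
 *	vdo_traverse_forest(map, example_visit_tree_page, completion);
 */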
/**
 * vdo_convert_maximum_age() - Convert the maximum age to reflect the new recovery journal format.
 * @age: The configured maximum age.
 *
 * Return: The converted age.
 *
 * In the old recovery journal format, each journal block held 311 entries, and every write bio
 * made two entries. The old maximum age was half the usable journal length. In the new format,
 * each block holds only 217 entries, but each bio only makes one entry. We convert the configured
 * age so that the number of writes in a block map era is the same in the old and new formats. This
 * keeps the bound on the amount of work required to recover the block map from the recovery
 * journal the same across the format change. It also keeps the amortization of block map page
 * writes to write bios the same.
 */
static inline block_count_t vdo_convert_maximum_age(block_count_t age)
{
	return DIV_ROUND_UP(age * RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK,
			    2 * RECOVERY_JOURNAL_ENTRIES_PER_BLOCK);
}

#endif /* VDO_BLOCK_MAP_H */