// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include "cmd.h"

enum {
	CQ_OK = 0,
	CQ_EMPTY = -1,
	CQ_POLL_ERR = -2
};

static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id)
{
	int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
	void *query_cap = NULL, *cap;
	int ret;

	query_cap = kzalloc(query_sz, GFP_KERNEL);
	if (!query_cap)
		return -ENOMEM;

	ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap,
					    MLX5_CAP_GENERAL_2);
	if (ret)
		goto out;

	cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability);
	if (!MLX5_GET(cmd_hca_cap_2, cap, migratable))
		ret = -EOPNOTSUPP;

out:
	kfree(query_cap);
	return ret;
}

static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
				  u16 *vhca_id);
static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);

int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * In case PRE_COPY is used, saving_migf is exposed while the device is
	 * running. Make sure to run only once there is no active save command.
	 * Running both in parallel might end up failing the save command once
	 * it tries to turn on 'tracking' on a suspended device.
	 */
	if (migf) {
		err = wait_for_completion_interruptible(&migf->save_comp);
		if (err)
			return err;
	}

	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);

	err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
	if (migf)
		complete(&migf->save_comp);

	return err;
}

int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
	u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
	MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(resume_vhca_in, in, op_mod, op_mod);

	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
}

int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
					  size_t *state_size, u64 *total_size,
					  u8 query_flags)
{
	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
	bool inc = query_flags & MLX5VF_QUERY_INC;
	int ret;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * In case PRE_COPY is used, saving_migf is exposed while the device is
	 * running. Make sure to run only once there is no active save command.
	 * Running both in parallel might end up failing the incremental query
	 * command on an un-tracked vhca.
*/ if (inc) { ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp); if (ret) return ret; /* Upon cleanup, ignore previous pre_copy error state */ if (mvdev->saving_migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR && !(query_flags & MLX5VF_QUERY_CLEANUP)) { /* * In case we had a PRE_COPY error, only query full * image for final image */ if (!(query_flags & MLX5VF_QUERY_FINAL)) { *state_size = 0; complete(&mvdev->saving_migf->save_comp); return 0; } query_flags &= ~MLX5VF_QUERY_INC; } /* Block incremental query which is state-dependent */ if (mvdev->saving_migf->state == MLX5_MIGF_STATE_ERROR) { complete(&mvdev->saving_migf->save_comp); return -ENODEV; } } MLX5_SET(query_vhca_migration_state_in, in, opcode, MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE); MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0); MLX5_SET(query_vhca_migration_state_in, in, incremental, query_flags & MLX5VF_QUERY_INC); MLX5_SET(query_vhca_migration_state_in, in, chunk, mvdev->chunk_mode); ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in, out); if (inc) complete(&mvdev->saving_migf->save_comp); if (ret) return ret; *state_size = MLX5_GET(query_vhca_migration_state_out, out, required_umem_size); if (total_size) *total_size = mvdev->chunk_mode ? MLX5_GET64(query_vhca_migration_state_out, out, remaining_total_size) : *state_size; return 0; } static void set_tracker_change_event(struct mlx5vf_pci_core_device *mvdev) { mvdev->tracker.object_changed = true; complete(&mvdev->tracker_comp); } static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev) { /* Mark the tracker under an error and wake it up if it's running */ mvdev->tracker.is_err = true; complete(&mvdev->tracker_comp); } static int mlx5fv_vf_event(struct notifier_block *nb, unsigned long event, void *data) { struct mlx5vf_pci_core_device *mvdev = container_of(nb, struct mlx5vf_pci_core_device, nb); switch (event) { case MLX5_PF_NOTIFY_ENABLE_VF: mutex_lock(&mvdev->state_mutex); mvdev->mdev_detach = false; mlx5vf_state_mutex_unlock(mvdev); break; case MLX5_PF_NOTIFY_DISABLE_VF: mlx5vf_cmd_close_migratable(mvdev); mutex_lock(&mvdev->state_mutex); mvdev->mdev_detach = true; mlx5vf_state_mutex_unlock(mvdev); break; default: break; } return 0; } void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev) { if (!mvdev->migrate_cap) return; /* Must be done outside the lock to let it progress */ set_tracker_error(mvdev); mutex_lock(&mvdev->state_mutex); mlx5vf_disable_fds(mvdev, NULL); _mlx5vf_free_page_tracker_resources(mvdev); mlx5vf_state_mutex_unlock(mvdev); } void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev) { if (!mvdev->migrate_cap) return; mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id, &mvdev->nb); destroy_workqueue(mvdev->cb_wq); } void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, const struct vfio_migration_ops *mig_ops, const struct vfio_log_ops *log_ops) { struct pci_dev *pdev = mvdev->core_device.pdev; int ret; if (!pdev->is_virtfn) return; mvdev->mdev = mlx5_vf_get_core_dev(pdev); if (!mvdev->mdev) return; if (!MLX5_CAP_GEN(mvdev->mdev, migration)) goto end; if (!(MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) && MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))) goto end; mvdev->vf_id = pci_iov_vf_id(pdev); if (mvdev->vf_id < 0) goto end; ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1); if (ret) goto end; if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id 
+ 1, &mvdev->vhca_id)) goto end; mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0); if (!mvdev->cb_wq) goto end; mutex_init(&mvdev->state_mutex); spin_lock_init(&mvdev->reset_lock); mvdev->nb.notifier_call = mlx5fv_vf_event; ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id, &mvdev->nb); if (ret) { destroy_workqueue(mvdev->cb_wq); goto end; } mvdev->migrate_cap = 1; mvdev->core_device.vdev.migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P | VFIO_MIGRATION_PRE_COPY; mvdev->core_device.vdev.mig_ops = mig_ops; init_completion(&mvdev->tracker_comp); if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization)) mvdev->core_device.vdev.log_ops = log_ops; if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks)) mvdev->chunk_mode = 1; end: mlx5_vf_put_core_dev(mvdev->mdev); } static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id, u16 *vhca_id) { u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {}; int out_size; void *out; int ret; out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out); out = kzalloc(out_size, GFP_KERNEL); if (!out) return -ENOMEM; MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); MLX5_SET(query_hca_cap_in, in, other_function, 1); MLX5_SET(query_hca_cap_in, in, function_id, function_id); MLX5_SET(query_hca_cap_in, in, op_mod, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 | HCA_CAP_OPMOD_GET_CUR); ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out); if (ret) goto err_exec; *vhca_id = MLX5_GET(query_hca_cap_out, out, capability.cmd_hca_cap.vhca_id); err_exec: kfree(out); return ret; } static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, struct mlx5_vhca_data_buffer *buf, struct mlx5_vhca_recv_buf *recv_buf, u32 *mkey) { size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) : recv_buf->npages; int err = 0, inlen; __be64 *mtt; void *mkc; u32 *in; inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + sizeof(*mtt) * round_up(npages, 2); in = kvzalloc(inlen, GFP_KERNEL); if (!in) return -ENOMEM; MLX5_SET(create_mkey_in, in, translations_octword_actual_size, DIV_ROUND_UP(npages, 2)); mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); if (buf) { struct sg_dma_page_iter dma_iter; for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0) *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); } else { int i; for (i = 0; i < npages; i++) *mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]); } mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); MLX5_SET(mkc, mkc, lr, 1); MLX5_SET(mkc, mkc, lw, 1); MLX5_SET(mkc, mkc, rr, 1); MLX5_SET(mkc, mkc, rw, 1); MLX5_SET(mkc, mkc, pd, pdn); MLX5_SET(mkc, mkc, bsf_octword_size, 0); MLX5_SET(mkc, mkc, qpn, 0xffffff); MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2)); MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE); err = mlx5_core_create_mkey(mdev, mkey, in, inlen); kvfree(in); return err; } static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) { struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev; struct mlx5_core_dev *mdev = mvdev->mdev; int ret; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; if (buf->dmaed || !buf->allocated_length) return -EINVAL; ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); if (ret) return ret; ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey); if (ret) goto err; buf->dmaed = true; return 0; err: dma_unmap_sgtable(mdev->device, 
&buf->table.sgt, buf->dma_dir, 0); return ret; } void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) { struct mlx5_vf_migration_file *migf = buf->migf; struct sg_page_iter sg_iter; lockdep_assert_held(&migf->mvdev->state_mutex); WARN_ON(migf->mvdev->mdev_detach); if (buf->dmaed) { mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey); dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt, buf->dma_dir, 0); } /* Undo alloc_pages_bulk_array() */ for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) __free_page(sg_page_iter_page(&sg_iter)); sg_free_append_table(&buf->table); kfree(buf); } static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, unsigned int npages) { unsigned int to_alloc = npages; struct page **page_list; unsigned long filled; unsigned int to_fill; int ret; int i; to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list)); page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT); if (!page_list) return -ENOMEM; do { filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, page_list); if (!filled) { ret = -ENOMEM; goto err; } to_alloc -= filled; ret = sg_alloc_append_table_from_pages( &buf->table, page_list, filled, 0, filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, GFP_KERNEL_ACCOUNT); if (ret) goto err_append; buf->allocated_length += filled * PAGE_SIZE; /* clean input for another bulk allocation */ memset(page_list, 0, filled * sizeof(*page_list)); to_fill = min_t(unsigned int, to_alloc, PAGE_SIZE / sizeof(*page_list)); } while (to_alloc > 0); kvfree(page_list); return 0; err_append: for (i = filled - 1; i >= 0; i--) __free_page(page_list[i]); err: kvfree(page_list); return ret; } struct mlx5_vhca_data_buffer * mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, size_t length, enum dma_data_direction dma_dir) { struct mlx5_vhca_data_buffer *buf; int ret; buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); if (!buf) return ERR_PTR(-ENOMEM); buf->dma_dir = dma_dir; buf->migf = migf; if (length) { ret = mlx5vf_add_migration_pages(buf, DIV_ROUND_UP_ULL(length, PAGE_SIZE)); if (ret) goto end; if (dma_dir != DMA_NONE) { ret = mlx5vf_dma_data_buffer(buf); if (ret) goto end; } } return buf; end: mlx5vf_free_data_buffer(buf); return ERR_PTR(ret); } void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf) { spin_lock_irq(&buf->migf->list_lock); buf->stop_copy_chunk_num = 0; list_add_tail(&buf->buf_elm, &buf->migf->avail_list); spin_unlock_irq(&buf->migf->list_lock); } struct mlx5_vhca_data_buffer * mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, size_t length, enum dma_data_direction dma_dir) { struct mlx5_vhca_data_buffer *buf, *temp_buf; struct list_head free_list; lockdep_assert_held(&migf->mvdev->state_mutex); if (migf->mvdev->mdev_detach) return ERR_PTR(-ENOTCONN); INIT_LIST_HEAD(&free_list); spin_lock_irq(&migf->list_lock); list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) { if (buf->dma_dir == dma_dir) { list_del_init(&buf->buf_elm); if (buf->allocated_length >= length) { spin_unlock_irq(&migf->list_lock); goto found; } /* * Prevent holding redundant buffers. Put in a free * list and call at the end not under the spin lock * (&migf->list_lock) to mlx5vf_free_data_buffer which * might sleep. 
*/ list_add(&buf->buf_elm, &free_list); } } spin_unlock_irq(&migf->list_lock); buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir); found: while ((temp_buf = list_first_entry_or_null(&free_list, struct mlx5_vhca_data_buffer, buf_elm))) { list_del(&temp_buf->buf_elm); mlx5vf_free_data_buffer(temp_buf); } return buf; } static void mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf, struct mlx5vf_async_data *async_data) { kvfree(async_data->out); complete(&migf->save_comp); fput(migf->filp); } void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) { struct mlx5vf_async_data *async_data = container_of(_work, struct mlx5vf_async_data, work); struct mlx5_vf_migration_file *migf = container_of(async_data, struct mlx5_vf_migration_file, async_data); mutex_lock(&migf->lock); if (async_data->status) { mlx5vf_put_data_buffer(async_data->buf); if (async_data->header_buf) mlx5vf_put_data_buffer(async_data->header_buf); if (!async_data->stop_copy_chunk && async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR) migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR; else migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } mutex_unlock(&migf->lock); mlx5vf_save_callback_complete(migf, async_data); } static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, size_t image_size, bool initial_pre_copy) { struct mlx5_vf_migration_file *migf = header_buf->migf; struct mlx5_vf_migration_header header = {}; unsigned long flags; struct page *page; u8 *to_buff; header.record_size = cpu_to_le64(image_size); header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY); header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA); page = mlx5vf_get_migration_page(header_buf, 0); if (!page) return -EINVAL; to_buff = kmap_local_page(page); memcpy(to_buff, &header, sizeof(header)); kunmap_local(to_buff); header_buf->length = sizeof(header); header_buf->start_pos = header_buf->migf->max_pos; migf->max_pos += header_buf->length; spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&header_buf->buf_elm, &migf->buf_list); spin_unlock_irqrestore(&migf->list_lock, flags); if (initial_pre_copy) migf->pre_copy_initial_bytes += sizeof(header); return 0; } static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) { struct mlx5vf_async_data *async_data = container_of(context, struct mlx5vf_async_data, cb_work); struct mlx5_vf_migration_file *migf = container_of(async_data, struct mlx5_vf_migration_file, async_data); if (!status) { size_t next_required_umem_size = 0; bool stop_copy_last_chunk; size_t image_size; unsigned long flags; bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY && !async_data->stop_copy_chunk; image_size = MLX5_GET(save_vhca_state_out, async_data->out, actual_image_size); if (async_data->buf->stop_copy_chunk_num) next_required_umem_size = MLX5_GET(save_vhca_state_out, async_data->out, next_required_umem_size); stop_copy_last_chunk = async_data->stop_copy_chunk && !next_required_umem_size; if (async_data->header_buf) { status = add_buf_header(async_data->header_buf, image_size, initial_pre_copy); if (status) goto err; } async_data->buf->length = image_size; async_data->buf->start_pos = migf->max_pos; migf->max_pos += async_data->buf->length; spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); if (async_data->buf->stop_copy_chunk_num) { migf->num_ready_chunks++; if (next_required_umem_size && migf->num_ready_chunks >= MAX_NUM_CHUNKS) { /* Delay the next SAVE till one chunk be 
consumed */ migf->next_required_umem_size = next_required_umem_size; next_required_umem_size = 0; } } spin_unlock_irqrestore(&migf->list_lock, flags); if (initial_pre_copy) { migf->pre_copy_initial_bytes += image_size; migf->state = MLX5_MIGF_STATE_PRE_COPY; } if (stop_copy_last_chunk) migf->state = MLX5_MIGF_STATE_COMPLETE; wake_up_interruptible(&migf->poll_wait); if (next_required_umem_size) mlx5vf_mig_file_set_save_work(migf, /* Picking up the next chunk num */ (async_data->buf->stop_copy_chunk_num % MAX_NUM_CHUNKS) + 1, next_required_umem_size); mlx5vf_save_callback_complete(migf, async_data); return; } err: /* The error flow can't run from an interrupt context */ if (status == -EREMOTEIO) { status = MLX5_GET(save_vhca_state_out, async_data->out, status); /* Failed in FW, print cmd out failure details */ mlx5_cmd_out_err(migf->mvdev->mdev, MLX5_CMD_OP_SAVE_VHCA_STATE, 0, async_data->out); } async_data->status = status; queue_work(migf->mvdev->cb_wq, &async_data->work); } int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf, struct mlx5_vhca_data_buffer *buf, bool inc, bool track) { u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; struct mlx5_vhca_data_buffer *header_buf = NULL; struct mlx5vf_async_data *async_data; bool pre_copy_cleanup = false; int err; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; err = wait_for_completion_interruptible(&migf->save_comp); if (err) return err; if ((migf->state == MLX5_MIGF_STATE_PRE_COPY || migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) && !track && !inc) pre_copy_cleanup = true; if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) /* * In case we had a PRE_COPY error, SAVE is triggered only for * the final image, read device full image. */ inc = false; MLX5_SET(save_vhca_state_in, in, opcode, MLX5_CMD_OP_SAVE_VHCA_STATE); MLX5_SET(save_vhca_state_in, in, op_mod, 0); MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey); MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); MLX5_SET(save_vhca_state_in, in, incremental, inc); MLX5_SET(save_vhca_state_in, in, set_track, track); async_data = &migf->async_data; async_data->buf = buf; async_data->stop_copy_chunk = (!track && !pre_copy_cleanup); async_data->out = kvzalloc(out_size, GFP_KERNEL); if (!async_data->out) { err = -ENOMEM; goto err_out; } if (async_data->stop_copy_chunk) { u8 header_idx = buf->stop_copy_chunk_num ? 
buf->stop_copy_chunk_num - 1 : 0; header_buf = migf->buf_header[header_idx]; migf->buf_header[header_idx] = NULL; } if (!header_buf) { header_buf = mlx5vf_get_data_buffer(migf, sizeof(struct mlx5_vf_migration_header), DMA_NONE); if (IS_ERR(header_buf)) { err = PTR_ERR(header_buf); goto err_free; } } if (async_data->stop_copy_chunk) migf->state = MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK; async_data->header_buf = header_buf; get_file(migf->filp); err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), async_data->out, out_size, mlx5vf_save_callback, &async_data->cb_work); if (err) goto err_exec; return 0; err_exec: if (header_buf) mlx5vf_put_data_buffer(header_buf); fput(migf->filp); err_free: kvfree(async_data->out); err_out: complete(&migf->save_comp); return err; } int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf, struct mlx5_vhca_data_buffer *buf) { u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; int err; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; if (!buf->dmaed) { err = mlx5vf_dma_data_buffer(buf); if (err) return err; } MLX5_SET(load_vhca_state_in, in, opcode, MLX5_CMD_OP_LOAD_VHCA_STATE); MLX5_SET(load_vhca_state_in, in, op_mod, 0); MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey); MLX5_SET(load_vhca_state_in, in, size, buf->length); return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out); } int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf) { int err; lockdep_assert_held(&migf->mvdev->state_mutex); if (migf->mvdev->mdev_detach) return -ENOTCONN; err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn); return err; } void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) { lockdep_assert_held(&migf->mvdev->state_mutex); if (migf->mvdev->mdev_detach) return; mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn); } void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) { struct mlx5_vhca_data_buffer *entry; int i; lockdep_assert_held(&migf->mvdev->state_mutex); WARN_ON(migf->mvdev->mdev_detach); for (i = 0; i < MAX_NUM_CHUNKS; i++) { if (migf->buf[i]) { mlx5vf_free_data_buffer(migf->buf[i]); migf->buf[i] = NULL; } if (migf->buf_header[i]) { mlx5vf_free_data_buffer(migf->buf_header[i]); migf->buf_header[i] = NULL; } } list_splice(&migf->avail_list, &migf->buf_list); while ((entry = list_first_entry_or_null(&migf->buf_list, struct mlx5_vhca_data_buffer, buf_elm))) { list_del(&entry->buf_elm); mlx5vf_free_data_buffer(entry); } mlx5vf_cmd_dealloc_pd(migf); } static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev, struct mlx5vf_pci_core_device *mvdev, struct rb_root_cached *ranges, u32 nnodes) { int max_num_range = MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range); struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker; int record_size = MLX5_ST_SZ_BYTES(page_track_range); u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; struct interval_tree_node *node = NULL; u64 total_ranges_len = 0; u32 num_ranges = nnodes; u8 log_addr_space_size; void *range_list_ptr; void *obj_context; void *cmd_hdr; int inlen; void *in; int err; int i; if (num_ranges > max_num_range) { vfio_combine_iova_ranges(ranges, nnodes, max_num_range); num_ranges = max_num_range; } inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) + record_size * num_ranges; in = kzalloc(inlen, GFP_KERNEL); if (!in) return -ENOMEM; cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, 
in, general_obj_in_cmd_hdr); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK); obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context); MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id); MLX5_SET(page_track, obj_context, track_type, 1); MLX5_SET(page_track, obj_context, log_page_size, ilog2(tracker->host_qp->tracked_page_size)); MLX5_SET(page_track, obj_context, log_msg_size, ilog2(tracker->host_qp->max_msg_size)); MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn); MLX5_SET(page_track, obj_context, num_ranges, num_ranges); range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range); node = interval_tree_iter_first(ranges, 0, ULONG_MAX); for (i = 0; i < num_ranges; i++) { void *addr_range_i_base = range_list_ptr + record_size * i; unsigned long length = node->last - node->start + 1; MLX5_SET64(page_track_range, addr_range_i_base, start_address, node->start); MLX5_SET64(page_track_range, addr_range_i_base, length, length); total_ranges_len += length; node = interval_tree_iter_next(node, 0, ULONG_MAX); } WARN_ON(node); log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len)); if (log_addr_space_size < (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) || log_addr_space_size > (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) { err = -EOPNOTSUPP; goto out; } MLX5_SET(page_track, obj_context, log_addr_space_size, log_addr_space_size); err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); if (err) goto out; tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); out: kfree(in); return err; } static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev, u32 tracker_id) { u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK); MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id); return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); } static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev, u32 tracker_id, unsigned long iova, unsigned long length, u32 tracker_state) { u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {}; u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; void *obj_context; void *cmd_hdr; cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id); obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context); MLX5_SET64(page_track, obj_context, modify_field_select, 0x3); MLX5_SET64(page_track, obj_context, range_start_address, iova); MLX5_SET64(page_track, obj_context, length, length); MLX5_SET(page_track, obj_context, state, tracker_state); return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); } static int mlx5vf_cmd_query_tracker(struct mlx5_core_dev *mdev, struct mlx5_vhca_page_tracker *tracker) { u32 out[MLX5_ST_SZ_DW(query_page_track_obj_out)] = {}; u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; void *obj_context; void *cmd_hdr; int err; cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, 
MLX5_CMD_OP_QUERY_GENERAL_OBJECT); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker->id); err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); if (err) return err; obj_context = MLX5_ADDR_OF(query_page_track_obj_out, out, obj_context); tracker->status = MLX5_GET(page_track, obj_context, state); return 0; } static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev, struct mlx5_vhca_cq_buf *buf, int nent, int cqe_size) { struct mlx5_frag_buf *frag_buf = &buf->frag_buf; u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0); u8 log_wq_sz = ilog2(cqe_size); int err; err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf, mdev->priv.numa_node); if (err) return err; mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc); buf->cqe_size = cqe_size; buf->nent = nent; return 0; } static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf) { struct mlx5_cqe64 *cqe64; void *cqe; int i; for (i = 0; i < buf->nent; i++) { cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i); cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64; cqe64->op_own = MLX5_CQE_INVALID << 4; } } static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev, struct mlx5_vhca_cq *cq) { mlx5_core_destroy_cq(mdev, &cq->mcq); mlx5_frag_buf_free(mdev, &cq->buf.frag_buf); mlx5_db_free(mdev, &cq->db); } static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type) { if (type != MLX5_EVENT_TYPE_CQ_ERROR) return; set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device, tracker.cq.mcq)); } static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type, void *data) { struct mlx5_vhca_page_tracker *tracker = mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb); struct mlx5vf_pci_core_device *mvdev = container_of( tracker, struct mlx5vf_pci_core_device, tracker); struct mlx5_eqe_obj_change *object; struct mlx5_eqe *eqe = data; u8 event_type = (u8)type; u8 queue_type; u32 obj_id; int qp_num; switch (event_type) { case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: queue_type = eqe->data.qp_srq.type; if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP) break; qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; if (qp_num != tracker->host_qp->qpn && qp_num != tracker->fw_qp->qpn) break; set_tracker_error(mvdev); break; case MLX5_EVENT_TYPE_OBJECT_CHANGE: object = &eqe->data.obj_change; obj_id = be32_to_cpu(object->obj_id); if (obj_id == tracker->id) set_tracker_change_event(mvdev); break; default: break; } return NOTIFY_OK; } static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe) { struct mlx5vf_pci_core_device *mvdev = container_of(mcq, struct mlx5vf_pci_core_device, tracker.cq.mcq); complete(&mvdev->tracker_comp); } static int mlx5vf_create_cq(struct mlx5_core_dev *mdev, struct mlx5_vhca_page_tracker *tracker, size_t ncqe) { int cqe_size = cache_line_size() == 128 ? 
128 : 64; u32 out[MLX5_ST_SZ_DW(create_cq_out)]; struct mlx5_vhca_cq *cq; int inlen, err, eqn; void *cqc, *in; __be64 *pas; int vector; cq = &tracker->cq; ncqe = roundup_pow_of_two(ncqe); err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node); if (err) return err; cq->ncqe = ncqe; cq->mcq.set_ci_db = cq->db.db; cq->mcq.arm_db = cq->db.db + 1; cq->mcq.cqe_sz = cqe_size; err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size); if (err) goto err_db_free; init_cq_frag_buf(&cq->buf); inlen = MLX5_ST_SZ_BYTES(create_cq_in) + MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * cq->buf.frag_buf.npages; in = kvzalloc(inlen, GFP_KERNEL); if (!in) { err = -ENOMEM; goto err_buff; } vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev); err = mlx5_comp_eqn_get(mdev, vector, &eqn); if (err) goto err_vec; cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe)); MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); MLX5_SET(cqc, cqc, uar_page, tracker->uar->index); MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma); pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas); mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas); cq->mcq.comp = mlx5vf_cq_complete; cq->mcq.event = mlx5vf_cq_event; err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out)); if (err) goto err_vec; mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map, cq->mcq.cons_index); kvfree(in); return 0; err_vec: kvfree(in); err_buff: mlx5_frag_buf_free(mdev, &cq->buf.frag_buf); err_db_free: mlx5_db_free(mdev, &cq->db); return err; } static struct mlx5_vhca_qp * mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev, struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr) { u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; struct mlx5_vhca_qp *qp; u8 log_rq_stride; u8 log_rq_sz; void *qpc; int inlen; void *in; int err; qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT); if (!qp) return ERR_PTR(-ENOMEM); err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node); if (err) goto err_free; if (max_recv_wr) { qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr); log_rq_stride = ilog2(MLX5_SEND_WQE_DS); log_rq_sz = ilog2(qp->rq.wqe_cnt); err = mlx5_frag_buf_alloc_node(mdev, wq_get_byte_sz(log_rq_sz, log_rq_stride), &qp->buf, mdev->priv.numa_node); if (err) goto err_db_free; mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc); } qp->rq.db = &qp->db.db[MLX5_RCV_DBR]; inlen = MLX5_ST_SZ_BYTES(create_qp_in) + MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * qp->buf.npages; in = kvzalloc(inlen, GFP_KERNEL); if (!in) { err = -ENOMEM; goto err_in; } qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC); MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); MLX5_SET(qpc, qpc, pd, tracker->pdn); MLX5_SET(qpc, qpc, uar_page, tracker->uar->index); MLX5_SET(qpc, qpc, log_page_size, qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev)); if (MLX5_CAP_GEN(mdev, cqe_version) == 1) MLX5_SET(qpc, qpc, user_index, 0xFFFFFF); MLX5_SET(qpc, qpc, no_sq, 1); if (max_recv_wr) { MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn); MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4); MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz); MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ); MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma); mlx5_fill_page_frag_array(&qp->buf, (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas)); } else { MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ); } MLX5_SET(create_qp_in, 
in, opcode, MLX5_CMD_OP_CREATE_QP); err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); kvfree(in); if (err) goto err_in; qp->qpn = MLX5_GET(create_qp_out, out, qpn); return qp; err_in: if (max_recv_wr) mlx5_frag_buf_free(mdev, &qp->buf); err_db_free: mlx5_db_free(mdev, &qp->db); err_free: kfree(qp); return ERR_PTR(err); } static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp) { struct mlx5_wqe_data_seg *data; unsigned int ix; WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt); ix = qp->rq.pc & (qp->rq.wqe_cnt - 1); data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix); data->byte_count = cpu_to_be32(qp->max_msg_size); data->lkey = cpu_to_be32(qp->recv_buf.mkey); data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset); qp->rq.pc++; /* Make sure that descriptors are written before doorbell record. */ dma_wmb(); *qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff); } static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev, struct mlx5_vhca_qp *qp, u32 remote_qpn, bool host_qp) { u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {}; u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {}; u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {}; void *qpc; int ret; /* Init */ qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc); MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1); MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED); MLX5_SET(qpc, qpc, rre, 1); MLX5_SET(qpc, qpc, rwe, 1); MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP); MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn); ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in); if (ret) return ret; if (host_qp) { struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; int i; for (i = 0; i < qp->rq.wqe_cnt; i++) { mlx5vf_post_recv(qp); recv_buf->next_rq_offset += qp->max_msg_size; } } /* RTR */ qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc); MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn); MLX5_SET(qpc, qpc, mtu, IB_MTU_4096); MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg)); MLX5_SET(qpc, qpc, remote_qpn, remote_qpn); MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1); MLX5_SET(qpc, qpc, primary_address_path.fl, 1); MLX5_SET(qpc, qpc, min_rnr_nak, 1); MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP); MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn); ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in); if (ret || host_qp) return ret; /* RTS */ qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc); MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn); MLX5_SET(qpc, qpc, retry_count, 7); MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */ MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */ MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP); MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn); return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in); } static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev, struct mlx5_vhca_qp *qp) { u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); mlx5_cmd_exec_in(mdev, destroy_qp, in); mlx5_frag_buf_free(mdev, &qp->buf); mlx5_db_free(mdev, &qp->db); kfree(qp); } static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf) { int i; /* Undo alloc_pages_bulk_array() */ for (i = 0; i < recv_buf->npages; i++) __free_page(recv_buf->page_list[i]); kvfree(recv_buf->page_list); } static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, unsigned int npages) { unsigned int filled = 0, done = 0; int i; recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list), 
GFP_KERNEL_ACCOUNT); if (!recv_buf->page_list) return -ENOMEM; for (;;) { filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, npages - done, recv_buf->page_list + done); if (!filled) goto err; done += filled; if (done == npages) break; } recv_buf->npages = npages; return 0; err: for (i = 0; i < npages; i++) { if (recv_buf->page_list[i]) __free_page(recv_buf->page_list[i]); } kvfree(recv_buf->page_list); return -ENOMEM; } static int register_dma_recv_pages(struct mlx5_core_dev *mdev, struct mlx5_vhca_recv_buf *recv_buf) { int i, j; recv_buf->dma_addrs = kvcalloc(recv_buf->npages, sizeof(*recv_buf->dma_addrs), GFP_KERNEL_ACCOUNT); if (!recv_buf->dma_addrs) return -ENOMEM; for (i = 0; i < recv_buf->npages; i++) { recv_buf->dma_addrs[i] = dma_map_page(mdev->device, recv_buf->page_list[i], 0, PAGE_SIZE, DMA_FROM_DEVICE); if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i])) goto error; } return 0; error: for (j = 0; j < i; j++) dma_unmap_single(mdev->device, recv_buf->dma_addrs[j], PAGE_SIZE, DMA_FROM_DEVICE); kvfree(recv_buf->dma_addrs); return -ENOMEM; } static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev, struct mlx5_vhca_recv_buf *recv_buf) { int i; for (i = 0; i < recv_buf->npages; i++) dma_unmap_single(mdev->device, recv_buf->dma_addrs[i], PAGE_SIZE, DMA_FROM_DEVICE); kvfree(recv_buf->dma_addrs); } static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev, struct mlx5_vhca_qp *qp) { struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; mlx5_core_destroy_mkey(mdev, recv_buf->mkey); unregister_dma_recv_pages(mdev, recv_buf); free_recv_pages(&qp->recv_buf); } static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev, struct mlx5_vhca_qp *qp, u32 pdn, u64 rq_size) { unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE); struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; int err; err = alloc_recv_pages(recv_buf, npages); if (err < 0) return err; err = register_dma_recv_pages(mdev, recv_buf); if (err) goto end; err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey); if (err) goto err_create_mkey; return 0; err_create_mkey: unregister_dma_recv_pages(mdev, recv_buf); end: free_recv_pages(recv_buf); return err; } static void _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev) { struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker; struct mlx5_core_dev *mdev = mvdev->mdev; lockdep_assert_held(&mvdev->state_mutex); if (!mvdev->log_active) return; WARN_ON(mvdev->mdev_detach); mlx5_eq_notifier_unregister(mdev, &tracker->nb); mlx5vf_cmd_destroy_tracker(mdev, tracker->id); mlx5vf_destroy_qp(mdev, tracker->fw_qp); mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp); mlx5vf_destroy_qp(mdev, tracker->host_qp); mlx5vf_destroy_cq(mdev, &tracker->cq); mlx5_core_dealloc_pd(mdev, tracker->pdn); mlx5_put_uars_page(mdev, tracker->uar); mvdev->log_active = false; } int mlx5vf_stop_page_tracker(struct vfio_device *vdev) { struct mlx5vf_pci_core_device *mvdev = container_of( vdev, struct mlx5vf_pci_core_device, core_device.vdev); mutex_lock(&mvdev->state_mutex); if (!mvdev->log_active) goto end; _mlx5vf_free_page_tracker_resources(mvdev); mvdev->log_active = false; end: mlx5vf_state_mutex_unlock(mvdev); return 0; } int mlx5vf_start_page_tracker(struct vfio_device *vdev, struct rb_root_cached *ranges, u32 nnodes, u64 *page_size) { struct mlx5vf_pci_core_device *mvdev = container_of( vdev, struct mlx5vf_pci_core_device, core_device.vdev); struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker; u8 log_tracked_page = ilog2(*page_size); 
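	/*
	 * Note: *page_size is treated as a hint; log_tracked_page is clamped
	 * below to the device's pg_track_log_min/max_page_size capabilities,
	 * and the page size actually used is reported back to the caller
	 * through *page_size once the tracker is up.
	 */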
struct mlx5_vhca_qp *host_qp; struct mlx5_vhca_qp *fw_qp; struct mlx5_core_dev *mdev; u32 log_max_msg_size; u32 max_msg_size; u64 rq_size = SZ_2M; u32 max_recv_wr; int err; mutex_lock(&mvdev->state_mutex); if (mvdev->mdev_detach) { err = -ENOTCONN; goto end; } if (mvdev->log_active) { err = -EINVAL; goto end; } mdev = mvdev->mdev; log_max_msg_size = MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_msg_size); max_msg_size = (1ULL << log_max_msg_size); /* The RQ must hold at least 4 WQEs/messages for successful QP creation */ if (rq_size < 4 * max_msg_size) rq_size = 4 * max_msg_size; memset(tracker, 0, sizeof(*tracker)); tracker->uar = mlx5_get_uars_page(mdev); if (IS_ERR(tracker->uar)) { err = PTR_ERR(tracker->uar); goto end; } err = mlx5_core_alloc_pd(mdev, &tracker->pdn); if (err) goto err_uar; max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size); err = mlx5vf_create_cq(mdev, tracker, max_recv_wr); if (err) goto err_dealloc_pd; host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr); if (IS_ERR(host_qp)) { err = PTR_ERR(host_qp); goto err_cq; } host_qp->max_msg_size = max_msg_size; if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_page_size)) { log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_page_size); } else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_page_size)) { log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_page_size); } host_qp->tracked_page_size = (1ULL << log_tracked_page); err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn, rq_size); if (err) goto err_host_qp; fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0); if (IS_ERR(fw_qp)) { err = PTR_ERR(fw_qp); goto err_recv_resources; } err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true); if (err) goto err_activate; err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false); if (err) goto err_activate; tracker->host_qp = host_qp; tracker->fw_qp = fw_qp; err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes); if (err) goto err_activate; MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY); mlx5_eq_notifier_register(mdev, &tracker->nb); *page_size = host_qp->tracked_page_size; mvdev->log_active = true; mlx5vf_state_mutex_unlock(mvdev); return 0; err_activate: mlx5vf_destroy_qp(mdev, fw_qp); err_recv_resources: mlx5vf_free_qp_recv_resources(mdev, host_qp); err_host_qp: mlx5vf_destroy_qp(mdev, host_qp); err_cq: mlx5vf_destroy_cq(mdev, &tracker->cq); err_dealloc_pd: mlx5_core_dealloc_pd(mdev, tracker->pdn); err_uar: mlx5_put_uars_page(mdev, tracker->uar); end: mlx5vf_state_mutex_unlock(mvdev); return err; } static void set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp, struct iova_bitmap *dirty) { u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry); u32 nent = size / entry_size; u32 nent_in_page; u32 nent_to_set; struct page *page; u32 page_offset; u32 page_index; u32 buf_offset; void *kaddr; u64 addr; u64 *buf; int i; buf_offset = index * qp->max_msg_size; if (WARN_ON(buf_offset + size >= qp->recv_buf.npages * PAGE_SIZE || (nent > qp->max_msg_size / entry_size))) return; do { page_index = buf_offset / PAGE_SIZE; page_offset = buf_offset % PAGE_SIZE; nent_in_page = (PAGE_SIZE - page_offset) / entry_size; page = qp->recv_buf.page_list[page_index]; kaddr = kmap_local_page(page); buf = kaddr + page_offset; nent_to_set = min(nent, nent_in_page); for (i = 0; i < nent_to_set; i++) { addr = MLX5_GET(page_track_report_entry, buf + i, dirty_address_low); addr |= (u64)MLX5_GET(page_track_report_entry, 
					      buf + i, dirty_address_high) << 32;
			iova_bitmap_set(dirty, addr, qp->tracked_page_size);
		}
		kunmap_local(kaddr);
		buf_offset += (nent_to_set * entry_size);
		nent -= nent_to_set;
	} while (nent);
}

static void mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
			  struct iova_bitmap *dirty, int *tracker_status)
{
	u32 size;
	int ix;

	qp->rq.cc++;
	*tracker_status = be32_to_cpu(cqe->immediate) >> 28;
	size = be32_to_cpu(cqe->byte_cnt);
	ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);

	/* zero length CQE, no data */
	WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
	if (size)
		set_report_output(size, ix, qp, dirty);

	qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
	mlx5vf_post_recv(qp);
}

static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
{
	return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
}

static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
{
	void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
	struct mlx5_cqe64 *cqe64;

	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;

	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
		return cqe64;
	} else {
		return NULL;
	}
}

static int
mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
		   struct iova_bitmap *dirty, int *tracker_status)
{
	struct mlx5_cqe64 *cqe;
	u8 opcode;

	cqe = get_sw_cqe(cq, cq->mcq.cons_index);
	if (!cqe)
		return CQ_EMPTY;

	++cq->mcq.cons_index;
	/*
	 * Make sure we read CQ entry contents after we've checked the
	 * ownership bit.
	 */
	rmb();
	opcode = get_cqe_opcode(cqe);
	switch (opcode) {
	case MLX5_CQE_RESP_SEND_IMM:
		mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
		return CQ_OK;
	default:
		return CQ_POLL_ERR;
	}
}

int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
				  unsigned long length,
				  struct iova_bitmap *dirty)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	struct mlx5_vhca_cq *cq = &tracker->cq;
	struct mlx5_core_dev *mdev;
	int poll_err, err;

	mutex_lock(&mvdev->state_mutex);
	if (!mvdev->log_active) {
		err = -EINVAL;
		goto end;
	}

	if (mvdev->mdev_detach) {
		err = -ENOTCONN;
		goto end;
	}

	if (tracker->is_err) {
		err = -EIO;
		goto end;
	}

	mdev = mvdev->mdev;
	err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
					MLX5_PAGE_TRACK_STATE_REPORTING);
	if (err)
		goto end;

	tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
	while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
	       !tracker->is_err) {
		poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
					      &tracker->status);
		if (poll_err == CQ_EMPTY) {
			mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
				    cq->mcq.cons_index);
			poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
						      dirty, &tracker->status);
			if (poll_err == CQ_EMPTY) {
				wait_for_completion(&mvdev->tracker_comp);
				if (tracker->object_changed) {
					tracker->object_changed = false;
					err = mlx5vf_cmd_query_tracker(mdev, tracker);
					if (err)
						goto end;
				}
				continue;
			}
		}
		if (poll_err == CQ_POLL_ERR) {
			err = -EIO;
			goto end;
		}
		mlx5_cq_set_ci(&cq->mcq);
	}

	if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
		tracker->is_err = true;

	if (tracker->is_err)
		err = -EIO;
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}
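
/*
 * The PRE_COPY serialization used by mlx5vf_cmd_suspend_vhca(),
 * mlx5vf_cmd_query_vhca_migration_state() and mlx5vf_cmd_save_vhca_state()
 * follows one pattern: a caller "takes" migf->save_comp with
 * wait_for_completion_interruptible() before issuing a command that must not
 * overlap an in-flight SAVE, and "returns" it with complete() when done (the
 * asynchronous SAVE path returns it from mlx5vf_save_callback_complete()).
 *
 * Below is a minimal, self-contained sketch of that pattern, kept out of the
 * build and not part of this driver; the names my_dev and my_issue_cmd() are
 * made up for illustration only.
 */
#if 0
#include <linux/completion.h>

struct my_dev {
	struct completion cmd_slot;	/* completed when no command is in flight */
};

static int my_issue_cmd(struct my_dev *dev);	/* hypothetical command execution */

static int my_serialized_cmd(struct my_dev *dev)
{
	int err;

	/* Block (interruptibly) until no conflicting command is in flight. */
	err = wait_for_completion_interruptible(&dev->cmd_slot);
	if (err)
		return err;

	err = my_issue_cmd(dev);

	/* Hand the slot back so the next caller (or async callback) can run. */
	complete(&dev->cmd_slot);
	return err;
}
#endif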