Mirror of https://github.com/NVIDIA/open-gpu-kernel-modules.git
Commit: 570.86.15
@@ -62,10 +62,9 @@ static NV_STATUS migrate_vma_page_copy_address(struct page *page,
        *gpu_addr = uvm_gpu_address_copy(owning_gpu, uvm_gpu_page_to_phys_address(owning_gpu, page));
    }
    else if (owning_gpu && can_copy_from) {
        uvm_gpu_identity_mapping_t *gpu_peer_mappings = uvm_gpu_get_peer_mapping(copying_gpu, owning_gpu->id);
        uvm_gpu_phys_address_t phys_addr = uvm_gpu_page_to_phys_address(owning_gpu, page);

        *gpu_addr = uvm_gpu_address_virtual(gpu_peer_mappings->base + phys_addr.address);
        *gpu_addr = uvm_gpu_peer_copy_address(owning_gpu, phys_addr.address, copying_gpu);
    }
    else {
        NV_STATUS status = uvm_parent_gpu_map_cpu_page(copying_gpu->parent, page, &state->dma.addrs[page_index]);
@@ -399,6 +398,38 @@ static NV_STATUS migrate_vma_populate_anon_pages(struct vm_area_struct *vma,
    return status;
}

static NV_STATUS zero_non_failed_pages_in_mask(uvm_push_t *push,
                                               const unsigned long *pfns,
                                               unsigned long *page_mask,
                                               unsigned long mask_size,
                                               migrate_vma_state_t *state)
{
    unsigned long i;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_processor_id_t dst_id = uvm_migrate_args->dst_id;
    uvm_gpu_t *zeroing_gpu = uvm_push_get_gpu(push);

    for_each_set_bit(i, page_mask, mask_size) {
        struct page *page;
        uvm_gpu_address_t dst_address;
        NV_STATUS status;

        if (test_bit(i, state->allocation_failed_mask))
            continue;

        page = migrate_pfn_to_page(pfns[i]);
        status = migrate_vma_page_copy_address(page, i, dst_id, zeroing_gpu, state, &dst_address);
        if (status != NV_OK)
            return status;

        uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
        uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
        zeroing_gpu->parent->ce_hal->memset_8(push, dst_address, 0, PAGE_SIZE);
    }

    return NV_OK;
}

static NV_STATUS migrate_vma_copy_pages_from(struct vm_area_struct *vma,
                                             const unsigned long *src,
                                             unsigned long *dst,
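zero_non_failed_pages_in_mask() walks the bits set in page_mask and skips any index whose destination allocation failed, as recorded in allocation_failed_mask. Below is a minimal standalone C sketch of that two-bitmap skip pattern; the helper names bit_is_set() and zero_one_page() are illustrative stand-ins, not driver symbols.

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

// Return true if bit i is set in the bitmap.
static bool bit_is_set(const unsigned long *mask, unsigned long i)
{
    return (mask[i / BITS_PER_LONG] >> (i % BITS_PER_LONG)) & 1UL;
}

// Stand-in for the GPU memset of one destination page.
static void zero_one_page(unsigned long i)
{
    printf("zero page %lu\n", i);
}

// Zero every page selected by page_mask whose dst allocation did not fail.
static void zero_non_failed(const unsigned long *page_mask,
                            const unsigned long *failed_mask,
                            unsigned long mask_size)
{
    unsigned long i;

    for (i = 0; i < mask_size; i++) {
        if (!bit_is_set(page_mask, i))
            continue;   // page not part of this migration batch
        if (bit_is_set(failed_mask, i))
            continue;   // no dst page was allocated; nothing to zero
        zero_one_page(i);
    }
}

int main(void)
{
    unsigned long page_mask[1]   = { 0xFUL };   // pages 0-3 selected
    unsigned long failed_mask[1] = { 0x2UL };   // page 1 failed to allocate
    zero_non_failed(page_mask, failed_mask, 4); // zeroes pages 0, 2 and 3
    return 0;
}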
@@ -411,36 +442,82 @@ static NV_STATUS migrate_vma_copy_pages_from(struct vm_area_struct *vma,
    uvm_push_t push;
    unsigned long i;
    uvm_gpu_t *copying_gpu = NULL;
    uvm_gpu_t *src_gpu = UVM_ID_IS_GPU(src_id) ? uvm_gpu_get(src_id) : NULL;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_processor_id_t dst_id = uvm_migrate_args->dst_id;
    unsigned long *page_mask = state->processors[uvm_id_value(src_id)].page_mask;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;
    uvm_tracker_t zero_tracker = UVM_TRACKER_INIT();

    UVM_ASSERT(!bitmap_empty(page_mask, state->num_pages));

    // Pre-allocate the dst pages and mark the ones that failed
    for_each_set_bit(i, page_mask, state->num_pages) {
        uvm_gpu_address_t src_address;
        uvm_gpu_address_t dst_address;
        struct page *src_page = migrate_pfn_to_page(src[i]);
        struct page *dst_page;

        UVM_ASSERT(src[i] & MIGRATE_PFN_VALID);
        UVM_ASSERT(src_page);

        dst_page = migrate_vma_alloc_page(state);
        struct page *dst_page = migrate_vma_alloc_page(state);
        if (!dst_page) {
            __set_bit(i, state->allocation_failed_mask);
            continue;
        }

        lock_page(dst_page);
        dst[i] = migrate_pfn(page_to_pfn(dst_page));
    }

    // Zero destination pages in case of NVLINK copy that can hit STO or XC,
    // or in case of injected unresolved NVLINK error.
    // TODO: Bug 4922701: [uvm] Re-evaluate STO handling for ATS migrations
    //       This can be removed if the false-positive rate of STO
    //       fast-path is low enough to prefer failing the copy when an STO
    //       fast-path error is detected.
    if (UVM_ID_IS_GPU(src_id) &&
        UVM_ID_IS_GPU(dst_id) &&
        ((src_gpu->nvlink_status.enabled &&
          (uvm_parent_gpu_peer_link_type(src_gpu->parent, uvm_gpu_get(dst_id)->parent) >= UVM_GPU_LINK_NVLINK_5)) ||
         uvm_gpu_get_injected_nvlink_error(src_gpu) == NV_WARN_MORE_PROCESSING_REQUIRED)) {
        uvm_gpu_t *dst_gpu = uvm_gpu_get(dst_id);
        uvm_push_t zero_push;

        status = migrate_vma_zero_begin_push(va_space, dst_id, dst_gpu, start, outer - 1, &zero_push);
        if (status != NV_OK)
            return status;

        status = zero_non_failed_pages_in_mask(&zero_push, dst, page_mask, state->num_pages, state);

        uvm_push_end(&zero_push);

        if (status == NV_OK)
            status = uvm_tracker_add_push_safe(&zero_tracker, &zero_push);

        if (status != NV_OK)
            return status;
    }

    for_each_set_bit(i, page_mask, state->num_pages) {
        uvm_gpu_address_t src_address;
        uvm_gpu_address_t dst_address;
        struct page *src_page = migrate_pfn_to_page(src[i]);
        struct page *dst_page = migrate_pfn_to_page(dst[i]);

        if (test_bit(i, state->allocation_failed_mask))
            continue;

        UVM_ASSERT(src[i] & MIGRATE_PFN_VALID);
        UVM_ASSERT(src_page);
        UVM_ASSERT(dst[i] & MIGRATE_PFN_VALID);
        UVM_ASSERT(dst_page);

        if (!copying_gpu) {
            status = migrate_vma_copy_begin_push(va_space, dst_id, src_id, start, outer - 1, &push);
            if (status != NV_OK) {
                __free_page(dst_page);
                return status;
            }
            if (status != NV_OK)
                break;

            copying_gpu = uvm_push_get_gpu(&push);
            if (src_gpu)
                UVM_ASSERT(src_gpu == copying_gpu);

            // The zero tracker will be empty if zeroing is not necessary
            uvm_push_acquire_tracker(&push, &zero_tracker);
            uvm_tracker_deinit(&zero_tracker);
        }
        else {
            uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
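The pre-zero pass above runs only when the copy could be silently corrupted: a GPU-to-GPU transfer whose peer link is NVLINK 5 or newer, or a copy with an injected unresolved NVLINK error. A standalone C sketch of that decision follows; the type and function names (link_type_t, needs_pre_zero) are illustrative, not driver symbols.

#include <stdbool.h>
#include <stdio.h>

// Illustrative stand-in for the driver's peer link type enumeration.
typedef enum {
    LINK_PCIE = 0,
    LINK_NVLINK_4,
    LINK_NVLINK_5,
} link_type_t;

// Zero the destination ahead of the copy only when an STO/XC-prone NVLINK
// transfer (or an injected unresolved error) could corrupt it silently.
static bool needs_pre_zero(bool src_is_gpu,
                           bool dst_is_gpu,
                           bool src_nvlink_enabled,
                           link_type_t peer_link,
                           bool injected_unresolved_error)
{
    if (!src_is_gpu || !dst_is_gpu)
        return false;
    if (src_nvlink_enabled && peer_link >= LINK_NVLINK_5)
        return true;
    return injected_unresolved_error;
}

int main(void)
{
    // GPU->GPU over NVLINK 5: pre-zero the destination pages.
    printf("%d\n", needs_pre_zero(true, true, true, LINK_NVLINK_5, false));
    // CPU->GPU: the copy cannot hit STO, so skip the extra zeroing pass.
    printf("%d\n", needs_pre_zero(false, true, true, LINK_NVLINK_5, false));
    return 0;
}

In the diff itself the ordering is handled by zero_tracker: the zeroing push is tracked, and the first copy push acquires that tracker, so the copies never overtake the memsets.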
@@ -452,18 +529,12 @@ static NV_STATUS migrate_vma_copy_pages_from(struct vm_area_struct *vma,
        if (status == NV_OK)
            status = migrate_vma_page_copy_address(dst_page, i, dst_id, copying_gpu, state, &dst_address);

        if (status != NV_OK) {
            __free_page(dst_page);
        if (status != NV_OK)
            break;
        }

        lock_page(dst_page);

        // We'll push one membar later for all copies in this loop
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
        copying_gpu->parent->ce_hal->memcopy(&push, dst_address, src_address, PAGE_SIZE);

        dst[i] = migrate_pfn(page_to_pfn(dst_page));
    }

    // TODO: Bug 1766424: If the destination is a GPU and the copy was done by
@@ -523,6 +594,7 @@ static void migrate_vma_alloc_and_copy(struct migrate_vma *args, migrate_vma_sta
    unsigned long start = args->start;
    unsigned long outer = args->end;
    NV_STATUS tracker_status;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;

    uvm_tracker_init(&state->tracker);
@@ -542,6 +614,40 @@ static void migrate_vma_alloc_and_copy(struct migrate_vma *args, migrate_vma_sta
    if (state->status == NV_OK)
        state->status = tracker_status;

    // Check if the copy might have been impacted by NVLINK errors.
    if (state->status == NV_OK) {
        uvm_processor_id_t src_id;

        for_each_id_in_mask(src_id, &state->src_processors) {
            NV_STATUS status;

            // Skip CPU source, even if for some reason the operation went over
            // NVLINK, it'd be a read and hit poison.
            if (UVM_ID_IS_CPU(src_id))
                continue;

            UVM_ASSERT(UVM_ID_IS_GPU(src_id));
            status = uvm_gpu_check_nvlink_error_no_rm(uvm_gpu_get(src_id));

            // Set state->status to the first error if there's an NVLINK error.
            // Do not report NV_WARN_MORE_PROCESSING_REQUIRED. The call to the
            // uvm_migrate_vma_copy_pages above zeroed the destination.
            // Thus in case of real STO error zeroed pages will be mapped.
            if (state->status == NV_OK && status != NV_WARN_MORE_PROCESSING_REQUIRED)
                state->status = status;

            // Record unresolved GPU errors if the caller can use the information
            if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
                if (uvm_migrate_args->gpus_to_check_for_nvlink_errors)
                    uvm_processor_mask_set(uvm_migrate_args->gpus_to_check_for_nvlink_errors, src_id);

                // fail the copy if requested by the caller
                if (uvm_migrate_args->fail_on_unresolved_sto_errors && state->status == NV_OK)
                    state->status = NV_ERR_BUSY_RETRY;
            }
        }
    }

    // Mark all pages as not migrating if we're failing
    if (state->status != NV_OK)
        migrate_vma_cleanup_pages(args->dst, state->num_pages);
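After the copies complete, each GPU source is checked for NVLINK errors and the result is folded into the migration status: a confirmed error fails the migration outright, while an unresolved error (NV_WARN_MORE_PROCESSING_REQUIRED) is recorded for the caller and only fails the copy when fail_on_unresolved_sto_errors is set. A standalone C sketch of that policy is below; the enum values and function names are illustrative, not driver symbols.

#include <stdbool.h>
#include <stdio.h>

// Illustrative status codes mirroring the ones used above.
typedef enum {
    STATUS_OK = 0,
    STATUS_WARN_MORE_PROCESSING_REQUIRED,  // unresolved (possible) NVLINK error
    STATUS_ERR_NVLINK,                     // confirmed NVLINK error
    STATUS_ERR_BUSY_RETRY,                 // caller asked to fail on unresolved errors
} status_t;

// Fold the NVLINK check result for one source GPU into the overall migration
// status, optionally recording the GPU for the caller to re-check later.
static status_t fold_nvlink_check(status_t overall,
                                  status_t gpu_check,
                                  bool fail_on_unresolved,
                                  bool *record_gpu_for_caller)
{
    *record_gpu_for_caller = false;

    // A confirmed error becomes the first reported failure.
    if (overall == STATUS_OK && gpu_check != STATUS_OK &&
        gpu_check != STATUS_WARN_MORE_PROCESSING_REQUIRED)
        overall = gpu_check;

    // An unresolved error is remembered, and fails the copy only on request.
    if (gpu_check == STATUS_WARN_MORE_PROCESSING_REQUIRED) {
        *record_gpu_for_caller = true;
        if (fail_on_unresolved && overall == STATUS_OK)
            overall = STATUS_ERR_BUSY_RETRY;
    }

    return overall;
}

int main(void)
{
    bool record;
    status_t s;

    s = fold_nvlink_check(STATUS_OK, STATUS_WARN_MORE_PROCESSING_REQUIRED, false, &record);
    printf("status=%d record=%d\n", s, record);  // status=0 record=1
    s = fold_nvlink_check(STATUS_OK, STATUS_ERR_NVLINK, false, &record);
    printf("status=%d record=%d\n", s, record);  // status=2 record=0
    return 0;
}

Because the destination pages were pre-zeroed before the copy, a real STO error leaves zero-filled pages mapped rather than silently corrupted data, which is why the warning itself is not propagated as a failure by default.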
@@ -870,6 +976,14 @@ static NV_STATUS migrate_pageable_vma(struct vm_area_struct *vma,
    if (va_space->test.skip_migrate_vma)
        return NV_WARN_NOTHING_TO_DO;

    // This isn't the right path for a UVM-owned vma. In most cases the callers
    // will take the correct (managed) path, but we can get here if invoked on a
    // disabled vma (see uvm_disable_vma()) that has no VA range but still has a
    // vma. This could cause locking issues if the caller has the VA space
    // locked and we invoke a UVM fault handler, so avoid it entirely.
    if (uvm_file_is_nvidia_uvm(vma->vm_file))
        return NV_ERR_INVALID_ADDRESS;

    // TODO: Bug 2419180: support file-backed pages in migrate_vma, when
    //       support for it is added to the Linux kernel
    if (!vma_is_anonymous(vma))
@@ -1002,9 +1116,12 @@ NV_STATUS uvm_migrate_pageable(uvm_migrate_args_t *uvm_migrate_args)
        return NV_ERR_INVALID_ARGUMENT;
    }
    else {
        uvm_gpu_t *gpu = uvm_gpu_get(dst_id);

        // Incoming dst_node_id is only valid if dst_id belongs to the CPU. Use
        // dst_node_id as the GPU node id if dst_id doesn't belong to the CPU.
        uvm_migrate_args->dst_node_id = uvm_gpu_numa_node(uvm_gpu_get(dst_id));
        UVM_ASSERT(gpu->mem_info.numa.enabled);
        uvm_migrate_args->dst_node_id = uvm_gpu_numa_node(gpu);
    }

    state = kmem_cache_alloc(g_uvm_migrate_vma_state_cache, NV_UVM_GFP_FLAGS);