570.86.15

Bernhard Stoeckner
2025-01-27 19:36:56 +01:00
parent 9d0b0414a5
commit 54d69484da
1166 changed files with 318863 additions and 182687 deletions


@@ -41,6 +41,7 @@
#include "uvm_common.h"
#include "nv_uvm_interface.h"
#include "nv-kthread-q.h"
#include <linux/mmzone.h>
static bool processor_mask_array_test(const uvm_processor_mask_t *mask,
uvm_processor_id_t mask_id,
@@ -145,11 +146,11 @@ static bool va_space_check_processors_masks(uvm_va_space_t *va_space)
UVM_ASSERT(processor_mask_array_test(va_space->can_copy_from, processor, UVM_ID_CPU));
UVM_ASSERT(processor_mask_array_test(va_space->can_copy_from, UVM_ID_CPU, processor));
// NVLINK
UVM_ASSERT(!processor_mask_array_test(va_space->has_nvlink, processor, processor));
// NVLINK/C2C
UVM_ASSERT(!processor_mask_array_test(va_space->has_fast_link, processor, processor));
if (check_can_copy_from) {
UVM_ASSERT(uvm_processor_mask_subset(&va_space->has_nvlink[uvm_id_value(processor)],
UVM_ASSERT(uvm_processor_mask_subset(&va_space->has_fast_link[uvm_id_value(processor)],
&va_space->can_copy_from[uvm_id_value(processor)]));
}
@@ -293,6 +294,22 @@ fail:
return status;
}
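// Drop the parent GPU from the EGM (Extended GPU Memory) state of its
// closest CPU NUMA node once the last of its GPUs leaves this VA space.
// Counterpart of va_space_parent_gpu_register() below.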
static void va_space_parent_gpu_unregister(uvm_va_space_t *va_space, uvm_parent_gpu_t *parent)
{
uvm_egm_numa_node_info_t *node_info;
if (!uvm_va_space_single_gpu_in_parent(va_space, parent) ||
!parent->egm.enabled ||
parent->closest_cpu_numa_node == NUMA_NO_NODE)
return;
node_info = uvm_va_space_get_egm_numa_node_info(va_space, parent->closest_cpu_numa_node);
uvm_parent_processor_mask_clear(&node_info->parent_gpus, parent->id);
// Clear local EGM routing
node_info->routing_table[uvm_parent_id_gpu_index(parent->id)] = NULL;
}
// This function does *not* release the GPU, nor the GPU's PCIE peer pairings.
// Those are returned so the caller can do it after dropping the VA space lock.
static void unregister_gpu(uvm_va_space_t *va_space,
@@ -340,6 +357,8 @@ static void unregister_gpu(uvm_va_space_t *va_space,
}
}
va_space_parent_gpu_unregister(va_space, gpu->parent);
if (gpu->parent->isr.replayable_faults.handling) {
UVM_ASSERT(uvm_processor_mask_test(&va_space->faultable_processors, gpu->id));
uvm_processor_mask_clear(&va_space->faultable_processors, gpu->id);
@@ -365,9 +384,9 @@ static void unregister_gpu(uvm_va_space_t *va_space,
processor_mask_array_clear(va_space->can_copy_from, UVM_ID_CPU, gpu->id);
UVM_ASSERT(processor_mask_array_empty(va_space->can_copy_from, gpu->id));
processor_mask_array_clear(va_space->has_nvlink, gpu->id, UVM_ID_CPU);
processor_mask_array_clear(va_space->has_nvlink, UVM_ID_CPU, gpu->id);
UVM_ASSERT(processor_mask_array_empty(va_space->has_nvlink, gpu->id));
processor_mask_array_clear(va_space->has_fast_link, gpu->id, UVM_ID_CPU);
processor_mask_array_clear(va_space->has_fast_link, UVM_ID_CPU, gpu->id);
UVM_ASSERT(processor_mask_array_empty(va_space->has_fast_link, gpu->id));
processor_mask_array_clear(va_space->has_native_atomics, gpu->id, gpu->id);
processor_mask_array_clear(va_space->has_native_atomics, gpu->id, UVM_ID_CPU);
@@ -395,6 +414,7 @@ static void unregister_gpu(uvm_va_space_t *va_space,
va_space->gpu_unregister_dma_buffer[uvm_id_gpu_index(gpu->id)],
&va_space->gpu_unregister_dma_buffer[uvm_id_gpu_index(gpu->id)]->tracker);
}
va_space_check_processors_masks(va_space);
}
@@ -698,6 +718,32 @@ bool uvm_va_space_can_read_duplicate(uvm_va_space_t *va_space, uvm_gpu_t *changi
return count == 0;
}
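// Add the parent GPU to the EGM state of its closest CPU NUMA node when
// its first GPU is registered: cache the node's physical address range
// and install the local EGM routing entry.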
static void va_space_parent_gpu_register(uvm_va_space_t *va_space, uvm_parent_gpu_t *parent)
{
uvm_egm_numa_node_info_t *node_info;
if (!uvm_va_space_single_gpu_in_parent(va_space, parent) ||
!parent->egm.enabled ||
parent->closest_cpu_numa_node == NUMA_NO_NODE)
return;
node_info = uvm_va_space_get_egm_numa_node_info(va_space, parent->closest_cpu_numa_node);
if (!node_info->node_start) {
node_info->node_start = node_start_pfn(parent->closest_cpu_numa_node) << PAGE_SHIFT;
node_info->node_end = node_end_pfn(parent->closest_cpu_numa_node) << PAGE_SHIFT;
}
uvm_parent_processor_mask_set(&node_info->parent_gpus, parent->id);
// Set up local EGM routing.
// This is done here because local EGM routing does not need any peers.
// So, even if there are no peers to this GPU, local EGM accesses should
// still be possible.
node_info->routing_table[uvm_parent_id_gpu_index(parent->id)] = parent;
}
// Note that the "VA space" in the function name refers to a UVM per-process
// VA space. (This is different from a per-GPU VA space.)
NV_STATUS uvm_va_space_register_gpu(uvm_va_space_t *va_space,
@@ -795,10 +841,9 @@ NV_STATUS uvm_va_space_register_gpu(uvm_va_space_t *va_space,
// All GPUs have native atomics on their own memory
processor_mask_array_set(va_space->has_native_atomics, gpu->id, gpu->id);
// TODO: Bug 3252572: Support the new link type UVM_GPU_LINK_C2C
if (gpu->parent->system_bus.link >= UVM_GPU_LINK_NVLINK_1) {
processor_mask_array_set(va_space->has_nvlink, gpu->id, UVM_ID_CPU);
processor_mask_array_set(va_space->has_nvlink, UVM_ID_CPU, gpu->id);
processor_mask_array_set(va_space->has_fast_link, gpu->id, UVM_ID_CPU);
processor_mask_array_set(va_space->has_fast_link, UVM_ID_CPU, gpu->id);
}
if (uvm_parent_gpu_is_coherent(gpu->parent)) {
@@ -846,6 +891,8 @@ NV_STATUS uvm_va_space_register_gpu(uvm_va_space_t *va_space,
}
}
va_space_parent_gpu_register(va_space, gpu->parent);
status = register_gpu_peers(va_space, gpu);
if (status != NV_OK)
goto cleanup;
@@ -1023,6 +1070,30 @@ NV_STATUS uvm_va_space_unregister_gpu(uvm_va_space_t *va_space, const NvProcesso
return NV_OK;
}
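// Tear down gpu1's EGM routes into the NUMA nodes attached to gpu0's
// parent, rerouting gpu1 through another parent GPU on the same node when
// one remains, and clearing the route otherwise.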
static void disable_egm_peers(uvm_va_space_t *va_space, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
uvm_egm_numa_node_info_t *node_info;
int nid;
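// Only reroute when gpu0 is the last GPU registered under its parent;
// while a sibling remains, the node's routing presumably stays valid.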
for_each_egm_numa_node_info_for_gpu(node_info, va_space, gpu0->parent, nid) {
if (uvm_va_space_single_gpu_in_parent(va_space, gpu0->parent)) {
uvm_parent_processor_mask_t proc_mask;
uvm_parent_gpu_id_t peer_parent_id;
uvm_parent_processor_mask_copy(&proc_mask, &node_info->parent_gpus);
uvm_parent_processor_mask_clear(&proc_mask, gpu0->parent->id);
peer_parent_id = uvm_parent_processor_mask_find_first_gpu_id(&proc_mask);
if (!UVM_PARENT_ID_IS_VALID(peer_parent_id)) {
node_info->routing_table[uvm_parent_id_gpu_index(gpu1->parent->id)] = NULL;
}
else {
uvm_parent_gpu_t *peer_parent_gpu = uvm_parent_gpu_get(peer_parent_id);
node_info->routing_table[uvm_parent_id_gpu_index(gpu1->parent->id)] = peer_parent_gpu;
}
}
}
}
// This does *not* release the global GPU peer entry
static void disable_peers(uvm_va_space_t *va_space,
uvm_gpu_t *gpu0,
@@ -1044,14 +1115,17 @@ static void disable_peers(uvm_va_space_t *va_space,
uvm_for_each_va_range(va_range, va_space)
uvm_va_range_disable_peer(va_range, gpu0, gpu1, deferred_free_list);
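// EGM routing is directional, so tear it down for both orderings.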
disable_egm_peers(va_space, gpu0, gpu1);
disable_egm_peers(va_space, gpu1, gpu0);
processor_mask_array_clear(va_space->can_access, gpu0->id, gpu1->id);
processor_mask_array_clear(va_space->can_access, gpu1->id, gpu0->id);
processor_mask_array_clear(va_space->accessible_from, gpu0->id, gpu1->id);
processor_mask_array_clear(va_space->accessible_from, gpu1->id, gpu0->id);
processor_mask_array_clear(va_space->can_copy_from, gpu0->id, gpu1->id);
processor_mask_array_clear(va_space->can_copy_from, gpu1->id, gpu0->id);
processor_mask_array_clear(va_space->has_nvlink, gpu0->id, gpu1->id);
processor_mask_array_clear(va_space->has_nvlink, gpu1->id, gpu0->id);
processor_mask_array_clear(va_space->has_fast_link, gpu0->id, gpu1->id);
processor_mask_array_clear(va_space->has_fast_link, gpu1->id, gpu0->id);
processor_mask_array_clear(va_space->has_native_atomics, gpu0->id, gpu1->id);
processor_mask_array_clear(va_space->has_native_atomics, gpu1->id, gpu0->id);
@@ -1060,6 +1134,24 @@ static void disable_peers(uvm_va_space_t *va_space,
va_space_check_processors_masks(va_space);
}
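// Install remote EGM routes: for every CPU NUMA node attached to gpu0's
// parent, let gpu1 reach that node's memory through gpu0's parent, unless
// gpu1 already has a route or sits on the same node itself.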
static void enable_egm_peers(uvm_va_space_t *va_space, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
if (gpu0->parent->egm.enabled) {
uvm_egm_numa_node_info_t *node_info;
int nid;
for_each_egm_numa_node_info_for_gpu(node_info, va_space, gpu0->parent, nid) {
// Set up remote EGM routing.
// Note that we only set up this routing if gpu1 is not attached to the
// same NUMA node. Otherwise, we want accesses from gpu1 to this CPU NUMA
// node to use gpu1's own local EGM routing.
if (!node_info->routing_table[uvm_parent_id_gpu_index(gpu1->parent->id)] &&
!uvm_parent_processor_mask_test(&node_info->parent_gpus, gpu1->parent->id))
node_info->routing_table[uvm_parent_id_gpu_index(gpu1->parent->id)] = gpu0->parent;
}
}
}
static NV_STATUS enable_peers(uvm_va_space_t *va_space, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
NV_STATUS status = NV_OK;
@@ -1112,13 +1204,16 @@ static NV_STATUS enable_peers(uvm_va_space_t *va_space, uvm_gpu_t *gpu0, uvm_gpu
processor_mask_array_set(va_space->has_native_atomics, gpu1->id, gpu0->id);
}
else if (uvm_parent_gpu_peer_link_type(gpu0->parent, gpu1->parent) >= UVM_GPU_LINK_NVLINK_1) {
processor_mask_array_set(va_space->has_nvlink, gpu0->id, gpu1->id);
processor_mask_array_set(va_space->has_nvlink, gpu1->id, gpu0->id);
processor_mask_array_set(va_space->has_fast_link, gpu0->id, gpu1->id);
processor_mask_array_set(va_space->has_fast_link, gpu1->id, gpu0->id);
processor_mask_array_set(va_space->has_native_atomics, gpu0->id, gpu1->id);
processor_mask_array_set(va_space->has_native_atomics, gpu1->id, gpu0->id);
}
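// Set up EGM routing in both directions now that the peering is
// established.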
enable_egm_peers(va_space, gpu0, gpu1);
enable_egm_peers(va_space, gpu1, gpu0);
UVM_ASSERT(va_space_check_processors_masks(va_space));
__set_bit(pair_index, va_space->enabled_peers);
@@ -1168,14 +1263,35 @@ static NV_STATUS retain_pcie_peers_from_uuids(uvm_va_space_t *va_space,
static bool uvm_va_space_pcie_peer_enabled(uvm_va_space_t *va_space, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
return !processor_mask_array_test(va_space->has_nvlink, gpu0->id, gpu1->id) &&
return !processor_mask_array_test(va_space->has_fast_link, gpu0->id, gpu1->id) &&
!uvm_gpus_are_smc_peers(gpu0, gpu1) &&
uvm_va_space_peer_enabled(va_space, gpu0, gpu1);
}
static bool uvm_va_space_nvlink_peer_enabled(uvm_va_space_t *va_space, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
return processor_mask_array_test(va_space->has_nvlink, gpu0->id, gpu1->id);
return processor_mask_array_test(va_space->has_fast_link, gpu0->id, gpu1->id);
}
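// Advance *nid to the next possible NUMA node whose EGM info includes the
// given parent GPU and return that node's info; returns NULL and sets
// *nid to NUMA_NO_NODE when no such node remains. This appears to be the
// helper behind the for_each_egm_numa_node_info_for_gpu() macro used
// above.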
uvm_egm_numa_node_info_t *uvm_va_space_get_next_egm_numa_node_info_for_gpu(uvm_va_space_t *va_space,
uvm_parent_gpu_t *parent_gpu,
int *nid)
{
int _nid;
UVM_ASSERT(nid);
uvm_assert_rwsem_locked(&va_space->lock);
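// next_node() walks node_possible_map starting just past *nid.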
for (_nid = next_node(*nid, node_possible_map); _nid != MAX_NUMNODES; _nid = next_node(_nid, node_possible_map)) {
uvm_egm_numa_node_info_t *node_info = uvm_va_space_get_egm_numa_node_info(va_space, _nid);
if (uvm_parent_processor_mask_test(&node_info->parent_gpus, parent_gpu->id)) {
*nid = _nid;
return node_info;
}
}
*nid = NUMA_NO_NODE;
return NULL;
}
static void free_gpu_va_space(nv_kref_t *nv_kref)
@@ -1725,6 +1841,17 @@ NV_STATUS uvm_va_space_unregister_gpu_va_space(uvm_va_space_t *va_space, const N
return status;
}
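// Returns true if exactly one of the parent's GPUs is registered in this
// VA space, i.e. the parent's sub-processor mask has a population count
// of one.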
bool uvm_va_space_single_gpu_in_parent(uvm_va_space_t *va_space, uvm_parent_gpu_t *parent_gpu)
{
uvm_sub_processor_mask_t sub_processors;
uvm_assert_rwsem_locked(&va_space->lock);
UVM_ASSERT(!uvm_processor_mask_empty(&va_space->registered_gpus));
sub_processors = uvm_sub_processor_mask_from_processor_mask(&va_space->registered_gpus, parent_gpu->id);
return uvm_sub_processor_mask_get_count(&sub_processors) == 1;
}
bool uvm_va_space_peer_enabled(uvm_va_space_t *va_space, const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
{
UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpus, gpu0->id));
@@ -1762,7 +1889,7 @@ uvm_processor_id_t uvm_processor_mask_find_closest_id(uvm_va_space_t *va_space,
}
}
if (uvm_processor_mask_and(mask, candidates, &va_space->has_nvlink[uvm_id_value(src)])) {
if (uvm_processor_mask_and(mask, candidates, &va_space->has_fast_link[uvm_id_value(src)])) {
// Direct peers, prioritizing GPU peers over CPU
closest_id = uvm_processor_mask_find_first_gpu_id(mask);
if (UVM_ID_IS_INVALID(closest_id))
@@ -2369,6 +2496,7 @@ static vm_fault_t uvm_va_space_cpu_fault(uvm_va_space_t *va_space,
}
service_context->cpu_fault.wakeup_time_stamp = 0;
service_context->num_retries = 0;
// There are up to three mm_structs to worry about, and they might all be
// different:
@@ -2392,26 +2520,47 @@ static vm_fault_t uvm_va_space_cpu_fault(uvm_va_space_t *va_space,
do {
bool do_sleep = false;
// NV_WARN_MORE_PROCESSING_REQUIRED can be returned by either the
// thrashing mitigation or the NVLINK error check. The bits in
// gpus_to_check_for_nvlink_errors select between the two.
if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
NvU64 now = NV_GETTIME();
if (now < service_context->cpu_fault.wakeup_time_stamp)
do_sleep = true;
if (uvm_processor_mask_empty(&service_context->gpus_to_check_for_nvlink_errors)) {
NvU64 now = NV_GETTIME();
if (now < service_context->cpu_fault.wakeup_time_stamp)
do_sleep = true;
if (do_sleep)
uvm_tools_record_throttling_start(va_space, fault_addr, UVM_ID_CPU);
if (do_sleep)
uvm_tools_record_throttling_start(va_space, fault_addr, UVM_ID_CPU);
// Drop the VA space lock while we sleep
uvm_va_space_up_read(va_space);
// Drop the VA space lock while we sleep
uvm_va_space_up_read(va_space);
// usleep_range is preferred because msleep has a 20ms granularity
// and udelay uses a busy-wait loop. usleep_range uses
// high-resolution timers and, by adding a range, the Linux
// scheduler may coalesce our wakeup with others, thus saving some
// interrupts.
if (do_sleep) {
unsigned long nap_us = (service_context->cpu_fault.wakeup_time_stamp - now) / 1000;
// usleep_range is preferred because msleep has a 20ms
// granularity and udelay uses a busy-wait loop. usleep_range
// uses high-resolution timers and, by adding a range, the
// Linux scheduler may coalesce our wakeup with others, thus
// saving some interrupts.
if (do_sleep) {
unsigned long nap_us = (service_context->cpu_fault.wakeup_time_stamp - now) / 1000;
usleep_range(nap_us, nap_us + nap_us / 2);
usleep_range(nap_us, nap_us + nap_us / 2);
}
}
else {
// Drop the VA space lock while we check RM for nvlink errors
uvm_va_space_up_read(va_space);
// Record unlock of the mm lock without actually releasing it
// to allow calling RM. This matches the ECC error checking
// below.
uvm_record_unlock_mmap_lock_read(vma->vm_mm);
status = uvm_global_gpu_check_nvlink_error(&service_context->gpus_to_check_for_nvlink_errors);
uvm_record_lock_mmap_lock_read(vma->vm_mm);
uvm_va_space_down_read(va_space);
if (status != NV_OK)
break;
}
}