570.86.15

2026-02-23 00:13:59 +00:00 · 2025-01-27 19:36:56 +01:00
parent 9d0b0414a5
commit 54d69484da
1166 changed files with 318863 additions and 182687 deletions
--- a/kernel-open/nvidia-uvm/uvm_global.c
+++ b/kernel-open/nvidia-uvm/uvm_global.c
@@ -54,6 +54,9 @@ static NV_STATUS uvm_register_callbacks(void)
    g_exported_uvm_ops.stopDevice  = NULL;
    g_exported_uvm_ops.isrTopHalf  = uvm_isr_top_half_entry;

+    g_exported_uvm_ops.drainP2P = uvm_suspend_and_drainP2P_entry;
+    g_exported_uvm_ops.resumeP2P = uvm_resumeP2P_entry;
+
    // Register the UVM callbacks with the main GPU driver:
    status = uvm_rm_locked_call(nvUvmInterfaceRegisterUvmCallbacks(&g_exported_uvm_ops));
    if (status != NV_OK)
@@ -100,12 +103,6 @@ NV_STATUS uvm_global_init(void)
        goto error;
    }

-    status = errno_to_nv_status(nv_kthread_q_init(&g_uvm_global.deferred_release_q, "UVM deferred release queue"));
-    if (status != NV_OK) {
-        UVM_DBG_PRINT("nv_kthread_q_init() failed: %s\n", nvstatusToString(status));
-        goto error;
-    }
-
    status = uvm_procfs_init();
    if (status != NV_OK) {
        UVM_ERR_PRINT("uvm_procfs_init() failed: %s\n", nvstatusToString(status));
@@ -194,15 +191,22 @@ NV_STATUS uvm_global_init(void)
        goto error;
    }

-    // This sets up the ISR (interrupt service routine), by hooking into RM's top-half ISR callback. As soon as this
-    // call completes, GPU interrupts will start arriving, so it's important to be prepared to receive interrupts before
-    // this point:
+    // This sets up the ISR (interrupt service routine), by hooking into RM's
+    // top-half ISR callback. As soon as this call completes, GPU interrupts
+    // will start arriving, so it's important to be prepared to receive
+    // interrupts before this point.
    status = uvm_register_callbacks();
    if (status != NV_OK) {
        UVM_ERR_PRINT("uvm_register_callbacks failed: %s\n", nvstatusToString(status));
        goto error;
    }

+    status = errno_to_nv_status(nv_kthread_q_init(&g_uvm_global.deferred_release_q, "UVM deferred release queue"));
+    if (status != NV_OK) {
+        UVM_DBG_PRINT("nv_kthread_q_init() failed: %s\n", nvstatusToString(status));
+        goto error;
+    }
+
    return NV_OK;

 error:
@@ -214,9 +218,7 @@ void uvm_global_exit(void)
 {
    uvm_assert_mutex_unlocked(&g_uvm_global.global_lock);

-    // Guarantee completion of any release callbacks scheduled after the flush
-    // in uvm_resume().
-    nv_kthread_q_flush(&g_uvm_global.deferred_release_q);
+    nv_kthread_q_stop(&g_uvm_global.deferred_release_q);

    uvm_unregister_callbacks();
    uvm_service_block_context_exit();
@@ -237,7 +239,6 @@ void uvm_global_exit(void)

    uvm_procfs_exit();

-    nv_kthread_q_stop(&g_uvm_global.deferred_release_q);
    nv_kthread_q_stop(&g_uvm_global.global_q);

    uvm_assert_mutex_unlocked(&g_uvm_global.va_spaces.lock);
@@ -472,3 +473,68 @@ NV_STATUS uvm_global_gpu_check_ecc_error(uvm_processor_mask_t *gpus)

    return NV_OK;
 }
+
+static NV_STATUS suspend_and_drainP2P(const NvProcessorUuid *parent_uuid)
+{
+    NV_STATUS status = NV_OK;
+    uvm_parent_gpu_t *parent_gpu;
+
+    uvm_mutex_lock(&g_uvm_global.global_lock);
+
+    // NVLINK STO recovery is not supported in combination with MIG
+    parent_gpu = uvm_parent_gpu_get_by_uuid(parent_uuid);
+    if (!parent_gpu || parent_gpu->smc.enabled) {
+        status = NV_ERR_INVALID_DEVICE;
+        goto unlock;
+    }
+
+    status = uvm_channel_manager_suspend_p2p(parent_gpu->gpus[0]->channel_manager);
+
+unlock:
+    uvm_mutex_unlock(&g_uvm_global.global_lock);
+    return status;
+}
+
+static NV_STATUS resumeP2P(const NvProcessorUuid *parent_uuid)
+{
+    NV_STATUS status = NV_OK;
+    uvm_parent_gpu_t *parent_gpu;
+
+    uvm_mutex_lock(&g_uvm_global.global_lock);
+
+    // NVLINK STO recovery is not supported in combination with MIG
+    parent_gpu = uvm_parent_gpu_get_by_uuid(parent_uuid);
+    if (!parent_gpu || parent_gpu->smc.enabled) {
+        status = NV_ERR_INVALID_DEVICE;
+        goto unlock;
+    }
+
+    uvm_channel_manager_resume_p2p(parent_gpu->gpus[0]->channel_manager);
+
+unlock:
+    uvm_mutex_unlock(&g_uvm_global.global_lock);
+    return status;
+}
+
+NV_STATUS uvm_suspend_and_drainP2P_entry(const NvProcessorUuid *uuid)
+{
+    UVM_ENTRY_RET(suspend_and_drainP2P(uuid));
+}
+
+NV_STATUS uvm_resumeP2P_entry(const NvProcessorUuid *uuid)
+{
+    UVM_ENTRY_RET(resumeP2P(uuid));
+}
+
+NV_STATUS uvm_global_gpu_check_nvlink_error(uvm_processor_mask_t *gpus)
+{
+    uvm_gpu_t *gpu;
+
+    for_each_gpu_in_mask(gpu, gpus) {
+        NV_STATUS status = uvm_gpu_check_nvlink_error(gpu);
+        if (status != NV_OK)
+            return status;
+    }
+
+    return NV_OK;
+}