535.216.01

2026-02-24 08:53:57 +00:00 · 2024-10-22 17:35:00 +02:00
parent c588c3877f
commit 60d85c464b
92 changed files with 1189 additions and 1226 deletions
--- a/kernel-open/nvidia-uvm/nv-kthread-q-selftest.c
+++ b/kernel-open/nvidia-uvm/nv-kthread-q-selftest.c
@@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2016 NVIDIA Corporation
+    Copyright (c) 2016-2024 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@@ -81,7 +81,7 @@
 #define NUM_Q_ITEMS_IN_MULTITHREAD_TEST (NUM_TEST_Q_ITEMS * NUM_TEST_KTHREADS)

 // This exists in order to have a function to place a breakpoint on:
-void on_nvq_assert(void)
+static void on_nvq_assert(void)
 {
    (void)NULL;
 }
--- a/kernel-open/nvidia-uvm/nv-kthread-q.c
+++ b/kernel-open/nvidia-uvm/nv-kthread-q.c
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2016 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2016-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@@ -176,7 +176,7 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),
 {

    unsigned i, j;
-    const static unsigned attempts = 3;
+    static const unsigned attempts = 3;
    struct task_struct *thread[3];

    for (i = 0;; i++) {
--- a/kernel-open/nvidia-uvm/uvm_channel.c
+++ b/kernel-open/nvidia-uvm/uvm_channel.c
@@ -722,7 +722,17 @@ static void internal_channel_submit_work_wlc(uvm_push_t *push)

    // Wait for the WLC/LCIC to be primed. This means that PUT == GET + 2
    // and a WLC doorbell ring is enough to start work.
-    UVM_SPIN_WHILE(!uvm_gpu_tracking_semaphore_is_completed(&lcic_channel->tracking_sem), &spin);
+    UVM_SPIN_WHILE(!uvm_gpu_tracking_semaphore_is_completed(&lcic_channel->tracking_sem), &spin) {
+        NV_STATUS status = uvm_channel_check_errors(lcic_channel);
+        if (status != NV_OK) {
+            UVM_ASSERT(uvm_global_get_status() != NV_OK);
+
+            // If there's a global fatal error we can't communicate with the GPU
+            // and the below launch sequence doesn't work.
+            UVM_ERR_PRINT_NV_STATUS("Failed to wait for LCIC channel (%s) completion.", status, lcic_channel->name);
+            return;
+        }
+    }

    // Executing WLC adds an extra job to LCIC
    ++lcic_channel->tracking_sem.queued_value;
@@ -3250,7 +3260,17 @@ static void channel_manager_stop_wlc(uvm_channel_manager_t *manager)

        // Wait for the WLC/LCIC to be primed. This means that PUT == GET + 2
        // and a WLC doorbell ring is enough to start work.
-        UVM_SPIN_WHILE(!uvm_gpu_tracking_semaphore_is_completed(&channel->tracking_sem), &spin);
+        UVM_SPIN_WHILE(!uvm_gpu_tracking_semaphore_is_completed(&channel->tracking_sem), &spin) {
+            status = uvm_channel_check_errors(channel);
+            if (status != NV_OK) {
+                UVM_ERR_PRINT_NV_STATUS("Failed to wait for LCIC channel (%s) completion", status, channel->name);
+                break;
+            }
+        }
+
+        // Continue on error and attempt to stop WLC below. This can lead to
+        // channel destruction with mismatched GET and PUT pointers. RM will
+        // print errors if that's the case, but channel destruction succeeds.
    }

    status = uvm_push_begin(manager, UVM_CHANNEL_TYPE_SEC2, &push, "Stop WLC channels");
--- a/kernel-open/nvidia-uvm/uvm_common.h
+++ b/kernel-open/nvidia-uvm/uvm_common.h
@@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2013-2021 NVIDIA Corporation
+    Copyright (c) 2013-2024 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@@ -434,7 +434,9 @@ static void uvm_get_unaddressable_range(NvU32 num_va_bits, NvU64 *first, NvU64 *
    UVM_ASSERT(first);
    UVM_ASSERT(outer);

-    if (uvm_platform_uses_canonical_form_address()) {
+    // Maxwell GPUs (num_va_bits == 40b) do not support canonical form address
+    // even when plugged into platforms using it.
+    if (uvm_platform_uses_canonical_form_address() && num_va_bits > 40) {
        *first = 1ULL << (num_va_bits - 1);
        *outer = (NvU64)((NvS64)(1ULL << 63) >> (64 - num_va_bits));
    }
--- a/kernel-open/nvidia-uvm/uvm_gpu_access_counters.c
+++ b/kernel-open/nvidia-uvm/uvm_gpu_access_counters.c
@@ -680,7 +680,10 @@ static void access_counter_buffer_flush_locked(uvm_gpu_t *gpu, uvm_gpu_buffer_fl

    while (get != put) {
        // Wait until valid bit is set
-        UVM_SPIN_WHILE(!gpu->parent->access_counter_buffer_hal->entry_is_valid(gpu->parent, get), &spin);
+        UVM_SPIN_WHILE(!gpu->parent->access_counter_buffer_hal->entry_is_valid(gpu->parent, get), &spin) {
+            if (uvm_global_get_status() != NV_OK)
+                goto done;
+        }

        gpu->parent->access_counter_buffer_hal->entry_clear_valid(gpu->parent, get);
        ++get;
@@ -688,6 +691,7 @@ static void access_counter_buffer_flush_locked(uvm_gpu_t *gpu, uvm_gpu_buffer_fl
            get = 0;
    }

+done:
    write_get(gpu->parent, get);
 }

@@ -813,12 +817,18 @@ static NvU32 fetch_access_counter_buffer_entries(uvm_gpu_t *gpu,
           (fetch_mode == NOTIFICATION_FETCH_MODE_ALL || notification_index < access_counters->max_batch_size)) {
        uvm_access_counter_buffer_entry_t *current_entry = &notification_cache[notification_index];

-        // We cannot just wait for the last entry (the one pointed by put) to become valid, we have to do it
-        // individually since entries can be written out of order
+        // We cannot just wait for the last entry (the one pointed to by put)
+        // to become valid; each entry must be checked individually because
+        // entries can be written out of order.
        UVM_SPIN_WHILE(!gpu->parent->access_counter_buffer_hal->entry_is_valid(gpu->parent, get), &spin) {
            // We have some entry to work on. Let's do the rest later.
            if (fetch_mode != NOTIFICATION_FETCH_MODE_ALL && notification_index > 0)
                goto done;
+
+            // There's no entry to work on and something has gone wrong. Ignore
+            // the rest.
+            if (uvm_global_get_status() != NV_OK)
+               goto done;
        }

        // Prevent later accesses being moved above the read of the valid bit
--- a/kernel-open/nvidia-uvm/uvm_gpu_replayable_faults.c
+++ b/kernel-open/nvidia-uvm/uvm_gpu_replayable_faults.c
@@ -624,7 +624,15 @@ static NV_STATUS fault_buffer_flush_locked(uvm_gpu_t *gpu,

    while (get != put) {
        // Wait until valid bit is set
-        UVM_SPIN_WHILE(!parent_gpu->fault_buffer_hal->entry_is_valid(parent_gpu, get), &spin);
+        UVM_SPIN_WHILE(!parent_gpu->fault_buffer_hal->entry_is_valid(parent_gpu, get), &spin) {
+            // Channels might be idle (e.g. in teardown) so check for errors
+            // actively. In that case the gpu pointer is valid.
+            NV_STATUS status = gpu ? uvm_channel_manager_check_errors(gpu->channel_manager) : uvm_global_get_status();
+            if (status != NV_OK) {
+                write_get(parent_gpu, get);
+                return status;
+            }
+        }

        fault_buffer_skip_replayable_entry(parent_gpu, get);
        ++get;
@@ -857,6 +865,10 @@ static NV_STATUS fetch_fault_buffer_entries(uvm_gpu_t *gpu,
            // We have some entry to work on. Let's do the rest later.
            if (fetch_mode == FAULT_FETCH_MODE_BATCH_READY && fault_index > 0)
                goto done;
+            
+            status = uvm_global_get_status();
+            if (status != NV_OK)
+                goto done;
        }

        // Prevent later accesses being moved above the read of the valid bit