570.158.01

2026-02-02 06:29:47 +00:00 · 2025-06-17 11:52:54 -07:00
parent d5cb404571
commit 443ace971f
36 changed files with 302 additions and 112 deletions
--- a/kernel-open/Kbuild
+++ b/kernel-open/Kbuild
@@ -79,7 +79,7 @@ ccflags-y += -I$(src)/common/inc
 ccflags-y += -I$(src)
 ccflags-y += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-format-extra-args
 ccflags-y += -D__KERNEL__ -DMODULE -DNVRM
-ccflags-y += -DNV_VERSION_STRING=\"570.153.02\"
+ccflags-y += -DNV_VERSION_STRING=\"570.158.01\"

 ifneq ($(SYSSRCHOST1X),)
 ccflags-y += -I$(SYSSRCHOST1X)
--- a/kernel-open/conftest.sh
+++ b/kernel-open/conftest.sh
@@ -6602,22 +6602,22 @@ compile_test() {
            compile_check_conftest "$CODE" "NV_DRM_DRIVER_HAS_DUMB_DESTROY" "" "types"
        ;;

-        memory_failure_has_trapno_arg)
+        memory_failure_queue_has_trapno_arg)
            #
-            # Check if memory_failure() has trapno parameter.
+            # Check if memory_failure_queue() has trapno parameter.
            #
            # Removed by commit 83b57531c58f ("mm/memory_failure: Remove
            # unused trapno from memory_failure") in v4.16.
            #
            CODE="
            #include <linux/mm.h>
-            void conftest_memory_failure_has_trapno_arg(unsigned long pfn,
+            void conftest_memory_failure_queue_has_trapno_arg(unsigned long pfn,
                                                        int trapno,
                                                        int flags) {
-                (void) memory_failure(pfn, trapno, flags);
+                memory_failure_queue(pfn, trapno, flags);
            }"

-            compile_check_conftest "$CODE" "NV_MEMORY_FAILURE_HAS_TRAPNO_ARG" "" "types"
+            compile_check_conftest "$CODE" "NV_MEMORY_FAILURE_QUEUE_HAS_TRAPNO_ARG" "" "types"
        ;;

        memory_failure_mf_sw_simulated_defined)
@@ -7571,7 +7571,7 @@ compile_test() {
            CODE="
            #include <linux/mmzone.h>
            int conftest_page_pgmap(void) {
-                return page_pgmap(NULL);
+                return page_pgmap();
            }"

            compile_check_conftest "$CODE" "NV_PAGE_PGMAP_PRESENT" "" "functions"
--- a/kernel-open/nvidia-uvm/uvm_ats_faults.c
+++ b/kernel-open/nvidia-uvm/uvm_ats_faults.c
@@ -767,6 +767,20 @@ NV_STATUS uvm_ats_service_access_counters(uvm_gpu_va_space_t *gpu_va_space,
                         &ats_context->access_counters.accessed_mask,
                         &ats_context->prefetch_state.residency_mask);

+    // Pretend that pages that are already resident at the destination GPU were
+    // migrated now. This makes sure that the access counter is cleared even if
+    // the accessed pages were already resident on the target.
+    // TODO: Bug 5296998: [uvm][ats] Not clearing stale access counter
+    //                     notifications can lead to missed migrations
+    // The same problem of stale notification exists for migration to other
+    // locations than local vidmem. However, stale notifications to data
+    // migrated to another remote location are identical to those triggered
+    // by accessing memory that cannot or should not be migrated.
+    if (uvm_id_equal(ats_context->residency_id, gpu_va_space->gpu->id)) {
+        uvm_page_mask_copy(&ats_context->access_counters.migrated_mask,
+                           &ats_context->prefetch_state.residency_mask);
+    }
+
    for_each_va_block_subregion_in_mask(subregion, &ats_context->access_counters.accessed_mask, region) {
        NV_STATUS status;
        NvU64 start = base + (subregion.first * PAGE_SIZE);
@@ -779,7 +793,7 @@ NV_STATUS uvm_ats_service_access_counters(uvm_gpu_va_space_t *gpu_va_space,

        status = service_ats_requests(gpu_va_space, vma, start, length, access_type, service_type, ats_context);

-        // clear access counters if pages were migrated or migration needs to
+        // Clear access counters if pages were migrated or migration needs to
        // be retried
        if (status == NV_OK || status == NV_ERR_BUSY_RETRY)
            uvm_page_mask_region_fill(migrated_mask, subregion);
--- a/kernel-open/nvidia/nv-caps.c
+++ b/kernel-open/nvidia/nv-caps.c
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2019-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@@ -582,6 +582,9 @@ err:
 void NV_API_CALL nv_cap_close_fd(int fd)
 {
 #if NV_FILESYSTEM_ACCESS_AVAILABLE
+    struct file *file;
+    NvBool is_nv_cap_fd;
+
    if (fd == -1)
    {
        return;
@@ -600,6 +603,30 @@ void NV_API_CALL nv_cap_close_fd(int fd)
        return;
    }

+    file = fget(fd);
+    if (file == NULL)
+    {
+        task_unlock(current);
+        return;
+    }
+
+    /* Make sure the fd belongs to the nv-cap-drv */
+    is_nv_cap_fd = (file->f_op == &g_nv_cap_drv_fops);
+
+    fput(file);
+
+    /*
+     * In some cases, we may be in shutdown path and execute
+     * in context of unrelated process. In that case we should
+     * not access any 'current' state, but instead let kernel
+     * clean up capability files on its own.
+     */
+    if (!is_nv_cap_fd)
+    {
+        task_unlock(current);
+        return;
+    }
+
 /*
 * From v4.17-rc1 (to v5.10.8) kernels have stopped exporting sys_close(fd)
 * and started exporting __close_fd, as of this commit:
--- a/kernel-open/nvidia/nvidia.Kbuild
+++ b/kernel-open/nvidia/nvidia.Kbuild
@@ -257,7 +257,7 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += add_memory_driver_managed_has_mhp_flags_arg
 NV_CONFTEST_TYPE_COMPILE_TESTS += num_registered_fb
 NV_CONFTEST_TYPE_COMPILE_TESTS += pci_driver_has_driver_managed_dma
 NV_CONFTEST_TYPE_COMPILE_TESTS += vm_area_struct_has_const_vm_flags
-NV_CONFTEST_TYPE_COMPILE_TESTS += memory_failure_has_trapno_arg
+NV_CONFTEST_TYPE_COMPILE_TESTS += memory_failure_queue_has_trapno_arg
 NV_CONFTEST_TYPE_COMPILE_TESTS += foll_longterm_present
 NV_CONFTEST_TYPE_COMPILE_TESTS += bus_type_has_iommu_ops
 NV_CONFTEST_TYPE_COMPILE_TESTS += class_create_has_no_owner_arg
--- a/kernel-open/nvidia/os-interface.c
+++ b/kernel-open/nvidia/os-interface.c
@@ -2596,7 +2596,6 @@ NV_STATUS NV_API_CALL os_offline_page_at_address
 {
 #if defined(CONFIG_MEMORY_FAILURE)
    int flags = 0;
-    int ret;
    NvU64 pfn;
    struct page *page = NV_GET_PAGE_STRUCT(address);

@@ -2621,22 +2620,18 @@ NV_STATUS NV_API_CALL os_offline_page_at_address
    flags |= MF_SW_SIMULATED;
 #endif

-#ifdef NV_MEMORY_FAILURE_HAS_TRAPNO_ARG
-    ret = memory_failure(pfn, 0, flags);
-#else
-    ret = memory_failure(pfn, flags);
-#endif
+    nv_printf(NV_DBG_INFO, "NVRM: offlining page at address: 0x%llx pfn: 0x%llx\n",
+              address, pfn);

-    if (ret != 0)
-    {
-        nv_printf(NV_DBG_ERRORS, "NVRM: page offlining failed. address: 0x%llx pfn: 0x%llx ret: %d\n",
-                  address, pfn, ret);
-        return NV_ERR_OPERATING_SYSTEM;
-    }
+#ifdef NV_MEMORY_FAILURE_QUEUE_HAS_TRAPNO_ARG
+    memory_failure_queue(pfn, 0, flags);
+#else
+    memory_failure_queue(pfn, flags);
+#endif

    return NV_OK;
 #else // !defined(CONFIG_MEMORY_FAILURE)
-    nv_printf(NV_DBG_ERRORS, "NVRM: memory_failure() not supported by kernel. page offlining failed. address: 0x%llx\n",
+    nv_printf(NV_DBG_ERRORS, "NVRM: memory_failure_queue() not supported by kernel. page offlining failed. address: 0x%llx\n",
              address);
    return NV_ERR_NOT_SUPPORTED;
 #endif