570.124.04

Bernhard Stoeckner
2025-02-27 17:32:23 +01:00
parent 81fe4fb417
commit 129479b1b7
141 changed files with 102245 additions and 100070 deletions

View File

@@ -86,7 +86,7 @@ EXTRA_CFLAGS += -I$(src)/common/inc
EXTRA_CFLAGS += -I$(src)
EXTRA_CFLAGS += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-format-extra-args
EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"570.86.16\"
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"570.124.04\"
ifneq ($(SYSSRCHOST1X),)
EXTRA_CFLAGS += -I$(SYSSRCHOST1X)

View File

@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2014-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2014-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -756,6 +756,8 @@ typedef struct UvmGpuFbInfo_tag
NvBool bStaticBar1Enabled; // Static BAR1 mode is enabled
NvU64 staticBar1StartOffset; // The start offset of the static mapping
NvU64 staticBar1Size; // The size of the static mapping
NvU32 heapStart; // The start offset of heap in KB, helpful for MIG
// systems
} UvmGpuFbInfo;
typedef struct UvmGpuEccInfo_tag
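
The new heapStart field is reported in kilobytes; later in this commit UVM turns it into a byte offset (gpu->mem_info.phys_start = (NvU64)fb_info.heapStart * 1024). A minimal standalone sketch of that conversion, using an invented value rather than real RM output:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative only: mirrors the heapStart (KB) -> phys_start (bytes)
     * conversion used by this commit; the value below is made up. */
    int main(void)
    {
        uint32_t heapStartKb = 4u * 1024 * 1024;              /* 4 GiB, expressed in KB */
        uint64_t phys_start  = (uint64_t)heapStartKb * 1024;  /* byte offset of the heap */

        printf("heap starts at 0x%llx\n", (unsigned long long)phys_start);
        return 0;
    }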

View File

@@ -6307,6 +6307,32 @@ compile_test() {
compile_check_conftest "$CODE" "NV_NUM_REGISTERED_FB_PRESENT" "" "types"
;;
acpi_video_register_backlight)
#
# Determine if acpi_video_register_backlight() function is present
#
# acpi_video_register_backlight was added by commit 3dbc80a3e4c55c
# (ACPI: video: Make backlight class device registration a separate
# step (v2)) for v6.0 (2022-09-02).
# Note: the include directive for <linux/types.h> in this conftest is
# necessary in order to support kernels between commit 0b9f7d93ca61
# ("ACPI / i915: ignore firmware requests backlight change") for
# v3.16 (2014-07-07) and commit 3bd6bce369f5 ("ACPI / video: Port
# to new backlight interface selection API") for v4.2 (2015-07-16).
# Kernels within this range use the 'bool' type and the related
# 'false' value in <acpi/video.h> without first including the
# definitions of that type and value.
#
CODE="
#include <linux/types.h>
#include <acpi/video.h>
void conftest_acpi_video_register_backlight(void) {
acpi_video_register_backlight(0);
}"
compile_check_conftest "$CODE" "NV_ACPI_VIDEO_REGISTER_BACKLIGHT" "" "functions"
;;
acpi_video_backlight_use_native)
#
# Determine if acpi_video_backlight_use_native() function is present
@@ -6690,13 +6716,18 @@ compile_test() {
#
# Determine whether drm_client_setup is present.
#
# Added by commit d07fdf922592 ("drm/fbdev-ttm:
# Convert to client-setup") in v6.13.
# Added by commit d07fdf922592 ("drm/fbdev-ttm: Convert to
# client-setup") in v6.13 in drm/drm_client_setup.h, but then moved
# to drm/clients/drm_client_setup.h by commit b86711c6d6e2
# ("drm/client: Move public client header to clients/ subdirectory")
# in linux-next.
#
CODE="
#include <drm/drm_fb_helper.h>
#if defined(NV_DRM_DRM_CLIENT_SETUP_H_PRESENT)
#include <drm/drm_client_setup.h>
#elif defined(NV_DRM_CLIENTS_DRM_CLIENT_SETUP_H_PRESENT)
#include <drm/clients/drm_client_setup.h>
#endif
void conftest_drm_client_setup(void) {
drm_client_setup();
@@ -7509,6 +7540,31 @@ compile_test() {
compile_check_conftest "$CODE" "NV_MODULE_IMPORT_NS_TAKES_CONSTANT" "" "generic"
;;
drm_driver_has_date)
#
# Determine if the 'drm_driver' structure has a 'date' field.
#
# Removed by commit cb2e1c2136f7 ("drm: remove driver date from
# struct drm_driver and all drivers") in linux-next, expected in
# v6.14.
#
CODE="
#if defined(NV_DRM_DRMP_H_PRESENT)
#include <drm/drmP.h>
#endif
#if defined(NV_DRM_DRM_DRV_H_PRESENT)
#include <drm/drm_drv.h>
#endif
int conftest_drm_driver_has_date(void) {
return offsetof(struct drm_driver, date);
}"
compile_check_conftest "$CODE" "NV_DRM_DRIVER_HAS_DATE" "" "types"
;;
# When adding a new conftest entry, please use the correct format for
# specifying the relevant upstream Linux kernel commit. Please
# avoid specifying -rc kernels, and only use SHAs that actually exist

View File

@@ -31,6 +31,7 @@ NV_HEADER_PRESENCE_TESTS = \
drm/drm_mode_config.h \
drm/drm_modeset_lock.h \
drm/drm_property.h \
drm/clients/drm_client_setup.h \
dt-bindings/interconnect/tegra_icc_id.h \
generated/autoconf.h \
generated/compile.h \

View File

@@ -65,9 +65,13 @@
#if defined(NV_DRM_CLIENT_SETUP_PRESENT) && \
(defined(NV_DRM_APERTURE_REMOVE_CONFLICTING_PCI_FRAMEBUFFERS_PRESENT) || \
defined(NV_APERTURE_REMOVE_CONFLICTING_PCI_DEVICES_PRESENT))
// XXX remove dependency on DRM_TTM_HELPER by implementing nvidia-drm's own
// .fbdev_probe callback that uses NVKMS kapi
#if IS_ENABLED(CONFIG_DRM_TTM_HELPER)
#define NV_DRM_FBDEV_AVAILABLE
#define NV_DRM_CLIENT_AVAILABLE
#endif
#endif
/*
* We can support color management if either drm_helper_crtc_enable_color_mgmt()

View File

@@ -78,6 +78,8 @@
#if defined(NV_DRM_DRM_CLIENT_SETUP_H_PRESENT)
#include <drm/drm_client_setup.h>
#elif defined(NV_DRM_CLIENTS_DRM_CLIENT_SETUP_H_PRESENT)
#include <drm/clients/drm_client_setup.h>
#endif
#if defined(NV_DRM_DRM_FBDEV_TTM_H_PRESENT)
@@ -1915,14 +1917,18 @@ static struct drm_driver nv_drm_driver = {
.name = "nvidia-drm",
.desc = "NVIDIA DRM driver",
#if defined(NV_DRM_DRIVER_HAS_DATE)
.date = "20160202",
#endif
#if defined(NV_DRM_DRIVER_HAS_DEVICE_LIST)
.device_list = LIST_HEAD_INIT(nv_drm_driver.device_list),
#elif defined(NV_DRM_DRIVER_HAS_LEGACY_DEV_LIST)
.legacy_dev_list = LIST_HEAD_INIT(nv_drm_driver.legacy_dev_list),
#endif
#if defined(DRM_FBDEV_TTM_DRIVER_OPS)
// XXX implement nvidia-drm's own .fbdev_probe callback that uses NVKMS kapi directly
#if defined(NV_DRM_FBDEV_AVAILABLE) && defined(DRM_FBDEV_TTM_DRIVER_OPS)
DRM_FBDEV_TTM_DRIVER_OPS,
#endif
};

View File

@@ -143,4 +143,5 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += drm_color_lut
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_property_blob_put
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_driver_has_gem_prime_mmap
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_output_poll_changed
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_driver_has_date
NV_CONFTEST_TYPE_COMPILE_TESTS += file_operations_fop_unsigned_offset_present

View File

@@ -1050,6 +1050,11 @@ nvkms_register_backlight(NvU32 gpu_id, NvU32 display_id, void *drv_priv,
#if defined(NV_ACPI_VIDEO_BACKLIGHT_USE_NATIVE)
if (!acpi_video_backlight_use_native()) {
#if defined(NV_ACPI_VIDEO_REGISTER_BACKLIGHT)
nvkms_log(NVKMS_LOG_LEVEL_INFO, NVKMS_LOG_PREFIX,
"ACPI reported no NVIDIA native backlight available; attempting to use ACPI backlight.");
acpi_video_register_backlight();
#endif
return NULL;
}
#endif

View File

@@ -102,4 +102,5 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += list_is_first
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_real_ts64
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_raw_ts64
NV_CONFTEST_FUNCTION_COMPILE_TESTS += acpi_video_backlight_use_native
NV_CONFTEST_FUNCTION_COMPILE_TESTS += acpi_video_register_backlight
NV_CONFTEST_FUNCTION_COMPILE_TESTS += kernel_read_has_pointer_pos_arg

View File

@@ -29,6 +29,7 @@
#include <linux/nodemask.h>
#include <linux/mempolicy.h>
#include <linux/mmu_notifier.h>
#include <linux/topology.h>
#if UVM_HMM_RANGE_FAULT_SUPPORTED()
#include <linux/hmm.h>
@@ -291,6 +292,27 @@ static const struct mmu_interval_notifier_ops uvm_ats_notifier_ops =
#endif
static bool resident_policy_match(struct vm_area_struct *vma, int dst_nid, int src_nid)
{
#if defined(NV_MEMPOLICY_HAS_UNIFIED_NODES)
struct mempolicy *vma_policy = vma_policy(vma);
// TODO: Bug 4981209: When migrations between CPU numa nodes are supported,
// add (dst_nid != closest_cpu_numa_node) to allow migrations between CPU
// NUMA nodes when destination is the closest_cpu_numa_node.
if (vma_policy &&
node_isset(src_nid, vma_policy->nodes) &&
node_isset(dst_nid, vma_policy->nodes) &&
!cpumask_empty(cpumask_of_node(src_nid)) &&
!cpumask_empty(cpumask_of_node(dst_nid))) {
return true;
}
#endif
return false;
}
static NV_STATUS ats_compute_residency_mask(uvm_gpu_va_space_t *gpu_va_space,
struct vm_area_struct *vma,
NvU64 base,
@@ -370,9 +392,23 @@ static NV_STATUS ats_compute_residency_mask(uvm_gpu_va_space_t *gpu_va_space,
if (pfn & HMM_PFN_VALID) {
struct page *page = hmm_pfn_to_page(pfn);
int resident_node = page_to_nid(page);
if (page_to_nid(page) == ats_context->residency_node)
// Set the residency_mask if:
// - The page is already resident at the intended destination.
// or
// - If both the source and destination nodes are CPU nodes and
// source node is already in the list of preferred nodes for
// the vma. On multi-CPU NUMA node architectures, this avoids
// unnecessary migrations between CPU nodes. Since the
// specific ats_context->residency_node selected by
// ats_batch_select_residency() is just a guess among the list
// of preferred nodes, paying the cost of migration across the
// CPU preferred nodes in this case can't be justified.
if ((resident_node == ats_context->residency_node) ||
resident_policy_match(vma, ats_context->residency_node, resident_node)) {
uvm_page_mask_set(residency_mask, page_index);
}
ats_context->prefetch_state.first_touch = false;
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2024 NVIDIA Corporation
Copyright (c) 2015-2025 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -35,6 +35,7 @@
#include "uvm_mmu.h"
#include "uvm_perf_heuristics.h"
#include "uvm_pmm_sysmem.h"
#include "uvm_pmm_gpu.h"
#include "uvm_migrate.h"
#include "uvm_gpu_access_counters.h"
#include "uvm_va_space_mm.h"
@@ -90,6 +91,8 @@ NV_STATUS uvm_global_init(void)
uvm_spin_lock_irqsave_init(&g_uvm_global.gpu_table_lock, UVM_LOCK_ORDER_LEAF);
uvm_mutex_init(&g_uvm_global.va_spaces.lock, UVM_LOCK_ORDER_VA_SPACES_LIST);
INIT_LIST_HEAD(&g_uvm_global.va_spaces.list);
uvm_mutex_init(&g_uvm_global.devmem_ranges.lock, UVM_LOCK_ORDER_LEAF);
INIT_LIST_HEAD(&g_uvm_global.devmem_ranges.list);
status = uvm_kvmalloc_init();
if (status != NV_OK) {
@@ -231,6 +234,7 @@ void uvm_global_exit(void)
uvm_va_policy_exit();
uvm_mem_global_exit();
uvm_pmm_sysmem_exit();
uvm_pmm_devmem_exit();
uvm_gpu_exit();
uvm_processor_mask_cache_exit();

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2024 NVIDIA Corporation
Copyright (c) 2015-2025 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -157,6 +157,12 @@ struct uvm_global_struct
// This field is set once during global initialization (uvm_global_init),
// and can be read afterwards without acquiring any locks.
bool conf_computing_enabled;
// List of all devmem ranges allocated in the system
struct {
uvm_mutex_t lock;
struct list_head list;
} devmem_ranges;
};
// Initialize global uvm state

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2024 NVIDIA Corporation
Copyright (c) 2015-2025 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -109,8 +109,10 @@ static void fill_parent_gpu_info(uvm_parent_gpu_t *parent_gpu, const UvmGpuInfo
// nvswitch is routed via physical pages, where the upper 13-bits of the
// 47-bit address space holds the routing information for each peer.
// Currently, this is limited to a 16GB framebuffer window size.
if (parent_gpu->nvswitch_info.is_nvswitch_connected)
if (parent_gpu->nvswitch_info.is_nvswitch_connected) {
parent_gpu->nvswitch_info.fabric_memory_window_start = gpu_info->nvswitchMemoryWindowStart;
parent_gpu->nvswitch_info.egm_fabric_memory_window_start = gpu_info->nvswitchEgmMemoryWindowStart;
}
uvm_uuid_string(uuid_buffer, &parent_gpu->uuid);
snprintf(parent_gpu->name,
@@ -244,6 +246,7 @@ static NV_STATUS get_gpu_fb_info(uvm_gpu_t *gpu)
if (!fb_info.bZeroFb) {
gpu->mem_info.size = ((NvU64)fb_info.heapSize + fb_info.reservedHeapSize) * 1024;
gpu->mem_info.max_allocatable_address = fb_info.maxAllocatableAddress;
gpu->mem_info.phys_start = (NvU64)fb_info.heapStart * 1024;
}
gpu->mem_info.max_vidmem_page_size = fb_info.maxVidmemPageSize;
@@ -568,6 +571,9 @@ static void gpu_info_print_common(uvm_gpu_t *gpu, struct seq_file *s)
UVM_SEQ_OR_DBG_PRINT(s, "big_page_size %u\n", gpu->big_page.internal_size);
UVM_SEQ_OR_DBG_PRINT(s, "rm_va_base 0x%llx\n", gpu->parent->rm_va_base);
UVM_SEQ_OR_DBG_PRINT(s, "rm_va_size 0x%llx\n", gpu->parent->rm_va_size);
UVM_SEQ_OR_DBG_PRINT(s, "vidmem_start %llu (%llu MBs)\n",
gpu->mem_info.phys_start,
gpu->mem_info.phys_start / (1024 * 1024));
UVM_SEQ_OR_DBG_PRINT(s, "vidmem_size %llu (%llu MBs)\n",
gpu->mem_info.size,
gpu->mem_info.size / (1024 * 1024));
@@ -1361,6 +1367,7 @@ static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu,
const UvmGpuPlatformInfo *gpu_platform_info)
{
NV_STATUS status;
UvmGpuFbInfo fb_info = {0};
status = uvm_rm_locked_call(nvUvmInterfaceDeviceCreate(uvm_global_session_handle(),
gpu_info,
@@ -1384,8 +1391,15 @@ static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu,
parent_gpu->egm.local_peer_id = gpu_info->egmPeerId;
parent_gpu->egm.base_address = gpu_info->egmBaseAddr;
status = uvm_rm_locked_call(nvUvmInterfaceGetFbInfo(parent_gpu->rm_device, &fb_info));
if (status != NV_OK)
return status;
parent_gpu->sli_enabled = (gpu_info->subdeviceCount > 1);
if (!fb_info.bZeroFb)
parent_gpu->max_allocatable_address = fb_info.maxAllocatableAddress;
parent_gpu->virt_mode = gpu_info->virtMode;
if (parent_gpu->virt_mode == UVM_VIRT_MODE_LEGACY) {
UVM_ERR_PRINT("Failed to init GPU %s. UVM is not supported in legacy virtualization mode\n",
@@ -1419,6 +1433,14 @@ static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu,
uvm_mmu_init_gpu_chunk_sizes(parent_gpu);
status = uvm_pmm_devmem_init(parent_gpu);
if (status != NV_OK) {
UVM_ERR_PRINT("failed to intialize device private memory: %s, GPU %s\n",
nvstatusToString(status),
uvm_parent_gpu_name(parent_gpu));
return status;
}
status = uvm_ats_add_gpu(parent_gpu);
if (status != NV_OK) {
UVM_ERR_PRINT("uvm_ats_add_gpu failed: %s, GPU %s\n",
@@ -1667,6 +1689,7 @@ static void deinit_parent_gpu(uvm_parent_gpu_t *parent_gpu)
deinit_parent_procfs_files(parent_gpu);
uvm_pmm_devmem_deinit(parent_gpu);
uvm_ats_remove_gpu(parent_gpu);
UVM_ASSERT(atomic64_read(&parent_gpu->mapped_cpu_pages_size) == 0);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2024 NVIDIA Corporation
Copyright (c) 2015-2025 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -696,6 +696,11 @@ struct uvm_gpu_struct
// ZeroFB testing mode, this will be 0.
NvU64 size;
// Physical start of heap, for SMC enabled GPUs, this is useful to
// partition PMM, it is used by HMM to figure out the right translation
// between HMM ranges and PMM offsets.
NvU64 phys_start;
// Max (inclusive) physical address of this GPU's memory that the driver
// can allocate through PMM (PMA).
NvU64 max_allocatable_address;
@@ -1015,6 +1020,13 @@ struct uvm_parent_gpu_struct
// Do not read this field directly, use uvm_gpu_device_handle instead.
uvmGpuDeviceHandle rm_device;
// Total amount of physical memory available on the parent GPU.
NvU64 max_allocatable_address;
#if UVM_IS_CONFIG_HMM()
uvm_pmm_gpu_devmem_t *devmem;
#endif
// The physical address range addressable by the GPU
//
// The GPU has its NV_PFB_XV_UPPER_ADDR register set by RM to
@@ -1288,6 +1300,10 @@ struct uvm_parent_gpu_struct
// 47-bit fabric memory physical offset that peer gpus need to access
// to read a peer's memory
NvU64 fabric_memory_window_start;
// 47-bit fabric memory physical offset that peer gpus need to access
// to read remote EGM memory.
NvU64 egm_fabric_memory_window_start;
} nvswitch_info;
struct

View File

@@ -321,13 +321,17 @@ void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_
{
uvm_range_tree_node_t *node;
uvm_va_block_t *va_block;
struct range range = gpu->pmm.devmem.pagemap.range;
unsigned long devmem_start;
unsigned long devmem_end;
unsigned long pfn;
bool retry;
if (!uvm_hmm_is_enabled(va_space))
return;
devmem_start = gpu->parent->devmem->pagemap.range.start + gpu->mem_info.phys_start;
devmem_end = devmem_start + gpu->mem_info.size;
if (mm)
uvm_assert_mmap_lock_locked(mm);
uvm_assert_rwsem_locked_write(&va_space->lock);
@@ -341,7 +345,7 @@ void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_
do {
retry = false;
for (pfn = __phys_to_pfn(range.start); pfn <= __phys_to_pfn(range.end); pfn++) {
for (pfn = __phys_to_pfn(devmem_start); pfn <= __phys_to_pfn(devmem_end); pfn++) {
struct page *page = pfn_to_page(pfn);
UVM_ASSERT(is_device_private_page(page));
@@ -349,7 +353,7 @@ void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_
// This check is racy because nothing stops the page being freed and
// even reused. That doesn't matter though - worst case the
// migration fails, we retry and find the va_space doesn't match.
if (page->zone_device_data == va_space)
if (uvm_pmm_devmem_page_to_va_space(page) == va_space)
if (uvm_hmm_pmm_gpu_evict_pfn(pfn) != NV_OK)
retry = true;
}
@@ -1713,7 +1717,7 @@ static void gpu_chunk_remove(uvm_va_block_t *va_block,
uvm_gpu_chunk_t *gpu_chunk;
uvm_gpu_id_t id;
id = uvm_pmm_devmem_page_to_gpu_id(page);
id = uvm_gpu_chunk_get_gpu(uvm_pmm_devmem_page_to_chunk(page))->id;
gpu_state = uvm_va_block_gpu_state_get(va_block, id);
UVM_ASSERT(gpu_state);
@@ -1743,7 +1747,7 @@ static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block,
uvm_gpu_id_t id;
NV_STATUS status;
id = uvm_pmm_devmem_page_to_gpu_id(page);
id = uvm_gpu_chunk_get_gpu(uvm_pmm_devmem_page_to_chunk(page))->id;
gpu_state = uvm_va_block_gpu_state_get(va_block, id);
// It's possible that this is a fresh va_block we're trying to add an
@@ -1765,7 +1769,7 @@ static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block,
gpu_chunk = uvm_pmm_devmem_page_to_chunk(page);
UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
UVM_ASSERT(gpu_chunk->is_referenced);
UVM_ASSERT(page->zone_device_data == va_block->hmm.va_space);
UVM_ASSERT(uvm_pmm_devmem_page_to_va_space(page) == va_block->hmm.va_space);
if (gpu_state->chunks[page_index] == gpu_chunk)
return NV_OK;
@@ -1992,7 +1996,7 @@ static void fill_dst_pfn(uvm_va_block_t *va_block,
hmm_mark_gpu_chunk_referenced(va_block, gpu, gpu_chunk);
UVM_ASSERT(!page_count(dpage));
zone_device_page_init(dpage);
dpage->zone_device_data = va_block->hmm.va_space;
dpage->zone_device_data = gpu_chunk;
dst_pfns[page_index] = migrate_pfn(pfn);
}
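
With the pagemap now owned by the parent GPU, the scan above bounds its PFN walk to this GPU's slice of the range: the pagemap start plus the heap offset, for mem_info.size bytes. A standalone sketch of that bound computation, assuming 4 KiB pages and using invented addresses:

    #include <stdint.h>
    #include <stdio.h>

    #define SKETCH_PAGE_SHIFT 12  /* assumed page size; the kernel value is arch-specific */

    int main(void)
    {
        /* Invented stand-ins for pagemap.range.start, mem_info.phys_start and size. */
        uint64_t pagemap_start = 0x600000000000ull;
        uint64_t phys_start    = 0x100000000ull;    /* heapStart * 1024 */
        uint64_t size          = 0x200000000ull;    /* 8 GiB of vidmem */

        uint64_t devmem_start = pagemap_start + phys_start;
        uint64_t devmem_end   = devmem_start + size;

        printf("scan PFNs 0x%llx..0x%llx\n",
               (unsigned long long)(devmem_start >> SKETCH_PAGE_SHIFT),
               (unsigned long long)(devmem_end >> SKETCH_PAGE_SHIFT));
        return 0;
    }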

View File

@@ -130,27 +130,12 @@ static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
NV_STATUS status = NV_OK;
NV_STATUS tracker_status;
uvm_prot_t prot = UVM_PROT_READ_WRITE_ATOMIC;
// Get the mask of unmapped pages because it will change after the
// first map operation
uvm_va_block_unmapped_pages_get(va_block, region, &va_block_context->caller_page_mask);
if (uvm_va_block_is_hmm(va_block) && !UVM_ID_IS_CPU(dest_id)) {
// Do not map pages that are already resident on the CPU. This is in
// order to avoid breaking system-wide atomic operations on HMM. HMM's
// implementation of system-side atomic operations involves restricting
// mappings to one processor (CPU or a GPU) at a time. If we were to
// grant a GPU a mapping to system memory, this gets into trouble
// because, on the CPU side, Linux can silently upgrade PTE permissions
// (move from read-only, to read-write, without any MMU notifiers
// firing), thus breaking the model by allowing simultaneous read-write
// access from two separate processors. To avoid that, just don't map
// such pages at all, when migrating.
uvm_page_mask_andnot(&va_block_context->caller_page_mask,
&va_block_context->caller_page_mask,
uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE));
}
// Only map those pages that are not mapped anywhere else (likely due
// to a first touch or a migration). We pass
// UvmEventMapRemoteCauseInvalid since the destination processor of a
@@ -166,6 +151,31 @@ static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
if (status != NV_OK)
goto out;
if (uvm_va_block_is_hmm(va_block) && UVM_ID_IS_CPU(dest_id)) {
uvm_processor_id_t id;
// Do not atomically map pages that are resident on the CPU. This is in
// order to avoid breaking system-wide atomic operations on HMM. HMM's
// implementation of system-side atomic operations involves restricting
// mappings to one processor (CPU or a GPU) at a time. If we were to
// grant a GPU a mapping to system memory, this gets into trouble
// because, on the CPU side, Linux can silently upgrade PTE permissions
// (move from read-only, to read-write, without any MMU notifiers
// firing), thus breaking the model by allowing simultaneous read-write
// access from two separate processors. To avoid that, don't remote map
// such pages atomically, after migrating.
// Also note that HMM sets CPU mapping for resident pages so the mask
// of pages to be mapped needs to be recomputed without including the
// CPU mapping.
prot = UVM_PROT_READ_WRITE;
uvm_page_mask_region_fill(&va_block_context->caller_page_mask, region);
for_each_gpu_id_in_mask(id, &va_block->mapped) {
uvm_page_mask_andnot(&va_block_context->caller_page_mask,
&va_block_context->caller_page_mask,
uvm_va_block_map_mask_get(va_block, id));
}
}
// Add mappings for AccessedBy processors
//
// No mappings within this call will operate on dest_id, so we don't
@@ -176,7 +186,7 @@ static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
dest_id,
region,
&va_block_context->caller_page_mask,
UVM_PROT_READ_WRITE_ATOMIC,
prot,
NULL);
out:
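
The recomputed mask above is plain set algebra: fill the whole region, then clear every page that some GPU still maps, and add the accessed-by mappings for what remains at READ_WRITE instead of READ_WRITE_ATOMIC. A tiny standalone illustration of the andnot accumulation over an invented 8-page region:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Bit i == page i of the region; all masks below are invented. */
        uint8_t caller_page_mask = 0xFF;       /* region_fill: pages 0..7 */
        uint8_t gpu_map_masks[]  = { 0x0A,     /* one GPU maps pages 1 and 3 */
                                     0x28 };   /* another GPU maps pages 3 and 5 */

        /* andnot accumulation: drop every page some GPU already maps. */
        for (unsigned i = 0; i < sizeof(gpu_map_masks) / sizeof(gpu_map_masks[0]); i++)
            caller_page_mask &= (uint8_t)~gpu_map_masks[i];

        printf("pages to map at RW: 0x%02x\n", caller_page_mask);  /* 0xd5: pages 0,2,4,6,7 */
        return 0;
    }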

View File

@@ -1409,11 +1409,13 @@ static bool thrashing_processors_have_fast_access_to(uvm_va_space_t *va_space,
uvm_processor_mask_set(fast_to, to);
}
else {
// Include registered SMC peers and the processor 'to'.
// Include all SMC peers and the processor 'to'.
// This includes SMC peers that are not registered.
// Since unregistered peers cannot be in page_thrashing->processors,
// the value of their respective bits in "fast_to" doesn't matter.
uvm_processor_mask_range_fill(fast_to,
uvm_gpu_id_from_sub_processor(uvm_parent_gpu_id_from_gpu_id(to), 0),
UVM_PARENT_ID_MAX_SUB_PROCESSORS);
uvm_processor_mask_and(fast_to, fast_to, &va_space->registered_gpu_va_spaces);
}
return uvm_processor_mask_subset(&page_thrashing->processors, fast_to);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2024 NVIDIA Corporation
Copyright (c) 2015-2025 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -3030,69 +3030,23 @@ NvU32 uvm_pmm_gpu_phys_to_virt(uvm_pmm_gpu_t *pmm, NvU64 phys_addr, NvU64 region
#if UVM_IS_CONFIG_HMM()
static uvm_pmm_gpu_t *devmem_page_to_pmm(struct page *page)
{
return container_of(page->pgmap, uvm_pmm_gpu_t, devmem.pagemap);
}
static uvm_gpu_chunk_t *devmem_page_to_chunk_locked(struct page *page)
{
uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
NvU64 chunk_addr = ((NvU64)page_to_pfn(page) << PAGE_SHIFT) - pmm->devmem.pagemap.range.start;
size_t index = chunk_addr / UVM_CHUNK_SIZE_MAX;
uvm_gpu_chunk_t *root_chunk;
uvm_gpu_chunk_t *chunk;
uvm_gpu_chunk_t *parent;
uvm_chunk_size_t chunk_size;
UVM_ASSERT(index < pmm->root_chunks.count);
root_chunk = &pmm->root_chunks.array[index].chunk;
UVM_ASSERT(root_chunk->address == UVM_ALIGN_DOWN(chunk_addr, UVM_CHUNK_SIZE_MAX));
// Find the uvm_gpu_chunk_t that corresponds to the device private struct
// page's PFN. The loop is only 0, 1, or 2 iterations.
for (chunk = root_chunk;
uvm_gpu_chunk_get_size(chunk) != page_size(page);
chunk = parent->suballoc->subchunks[index]) {
parent = chunk;
UVM_ASSERT(parent->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
UVM_ASSERT(parent->suballoc);
chunk_size = uvm_gpu_chunk_get_size(parent->suballoc->subchunks[0]);
index = (size_t)uvm_div_pow2_64(chunk_addr - parent->address, chunk_size);
UVM_ASSERT(index < num_subchunks(parent));
}
UVM_ASSERT(chunk->address = chunk_addr);
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
UVM_ASSERT(chunk->is_referenced);
return chunk;
}
uvm_gpu_chunk_t *uvm_pmm_devmem_page_to_chunk(struct page *page)
{
uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
uvm_gpu_chunk_t *chunk;
UVM_ASSERT(is_device_private_page(page));
uvm_spin_lock(&pmm->list_lock);
chunk = devmem_page_to_chunk_locked(page);
uvm_spin_unlock(&pmm->list_lock);
return chunk;
return page->zone_device_data;
}
uvm_gpu_id_t uvm_pmm_devmem_page_to_gpu_id(struct page *page)
uvm_va_space_t *uvm_pmm_devmem_page_to_va_space(struct page *page)
{
uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
uvm_gpu_chunk_t *gpu_chunk = uvm_pmm_devmem_page_to_chunk(page);
UVM_ASSERT(is_device_private_page(page));
// uvm_hmm_unregister_gpu() needs to do a racy check here so
// page->zone_device_data might be NULL.
if (!gpu_chunk || !gpu_chunk->va_block)
return NULL;
return gpu->id;
return gpu_chunk->va_block->hmm.va_space;
}
// Check there are no orphan pages. This should be only called as part of
@@ -3104,12 +3058,17 @@ static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
{
size_t i;
bool ret = true;
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
unsigned long devmem_start;
unsigned long devmem_end;
unsigned long pfn;
struct range range = pmm->devmem.pagemap.range;
if (!pmm->initialized || !uvm_hmm_is_enabled_system_wide())
return ret;
devmem_start = gpu->parent->devmem->pagemap.range.start + gpu->mem_info.phys_start;
devmem_end = devmem_start + gpu->mem_info.size;
// Scan all the root chunks looking for subchunks which are still
// referenced.
for (i = 0; i < pmm->root_chunks.count; i++) {
@@ -3121,7 +3080,7 @@ static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
root_chunk_unlock(pmm, root_chunk);
}
for (pfn = __phys_to_pfn(range.start); pfn <= __phys_to_pfn(range.end); pfn++) {
for (pfn = __phys_to_pfn(devmem_start); pfn <= __phys_to_pfn(devmem_end); pfn++) {
struct page *page = pfn_to_page(pfn);
if (!is_device_private_page(page)) {
@@ -3140,9 +3099,8 @@ static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
static void devmem_page_free(struct page *page)
{
uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
uvm_gpu_chunk_t *chunk;
uvm_gpu_chunk_t *chunk = uvm_pmm_devmem_page_to_chunk(page);
uvm_gpu_t *gpu = uvm_gpu_chunk_get_gpu(chunk);
page->zone_device_data = NULL;
@@ -3150,23 +3108,22 @@ static void devmem_page_free(struct page *page)
// we may be in an interrupt context where we can't do that. Instead,
// do a lazy free. Note that we have to use a "normal" spin lock because
// the UVM context is not available.
spin_lock(&pmm->list_lock.lock);
spin_lock(&gpu->pmm.list_lock.lock);
chunk = devmem_page_to_chunk_locked(page);
UVM_ASSERT(chunk->is_referenced);
chunk->is_referenced = false;
list_add_tail(&chunk->list, &pmm->root_chunks.va_block_lazy_free);
list_add_tail(&chunk->list, &gpu->pmm.root_chunks.va_block_lazy_free);
spin_unlock(&pmm->list_lock.lock);
spin_unlock(&gpu->pmm.list_lock.lock);
nv_kthread_q_schedule_q_item(&gpu->parent->lazy_free_q,
&pmm->root_chunks.va_block_lazy_free_q_item);
&gpu->pmm.root_chunks.va_block_lazy_free_q_item);
}
// This is called by HMM when the CPU faults on a ZONE_DEVICE private entry.
static vm_fault_t devmem_fault(struct vm_fault *vmf)
{
uvm_va_space_t *va_space = vmf->page->zone_device_data;
uvm_va_space_t *va_space = uvm_pmm_devmem_page_to_va_space(vmf->page);
if (!va_space)
return VM_FAULT_SIGBUS;
@@ -3185,26 +3142,46 @@ static const struct dev_pagemap_ops uvm_pmm_devmem_ops =
.migrate_to_ram = devmem_fault_entry,
};
static NV_STATUS devmem_init(uvm_pmm_gpu_t *pmm)
// Allocating and initialising device private pages takes a significant amount
// of time on very large systems. So rather than do that every time a GPU is
// registered we do it once and keep track of the range when the GPU is
// unregistered for later reuse.
//
// This function tries to find an existing range of device private pages and,
// if one is available, removes it from the list and returns it for reuse.
static uvm_pmm_gpu_devmem_t *devmem_reuse_pagemap(unsigned long size)
{
unsigned long size = pmm->root_chunks.count * UVM_CHUNK_SIZE_MAX;
uvm_pmm_gpu_devmem_t *devmem = &pmm->devmem;
uvm_pmm_gpu_devmem_t *devmem;
list_for_each_entry(devmem, &g_uvm_global.devmem_ranges.list, list_node) {
if (devmem->size == size) {
list_del(&devmem->list_node);
return devmem;
}
}
return NULL;
}
static uvm_pmm_gpu_devmem_t *devmem_alloc_pagemap(unsigned long size)
{
uvm_pmm_gpu_devmem_t *devmem;
struct resource *res;
void *ptr;
NV_STATUS status;
if (!uvm_hmm_is_enabled_system_wide()) {
devmem->pagemap.owner = NULL;
return NV_OK;
}
res = request_free_mem_region(&iomem_resource, size, "nvidia-uvm-hmm");
if (IS_ERR(res)) {
UVM_ERR_PRINT("request_free_mem_region() err %ld\n", PTR_ERR(res));
status = errno_to_nv_status(PTR_ERR(res));
goto err;
return NULL;
}
devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
if (!devmem)
goto err;
devmem->size = size;
devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
devmem->pagemap.range.start = res->start;
devmem->pagemap.range.end = res->end;
@@ -3217,43 +3194,77 @@ static NV_STATUS devmem_init(uvm_pmm_gpu_t *pmm)
if (IS_ERR(ptr)) {
UVM_ERR_PRINT("memremap_pages() err %ld\n", PTR_ERR(ptr));
status = errno_to_nv_status(PTR_ERR(ptr));
goto err_release;
goto err_free;
}
return NV_OK;
return devmem;
err_free:
kfree(devmem);
err_release:
release_mem_region(res->start, resource_size(res));
err:
devmem->pagemap.owner = NULL;
return status;
release_mem_region(res->start, resource_size(res));
return NULL;
}
static void devmem_deinit(uvm_pmm_gpu_t *pmm)
NV_STATUS uvm_pmm_devmem_init(uvm_parent_gpu_t *gpu)
{
uvm_pmm_gpu_devmem_t *devmem = &pmm->devmem;
// Create a DEVICE_PRIVATE page for every GPU page available on the parent.
unsigned long size = gpu->max_allocatable_address;
if (!devmem->pagemap.owner)
if (!uvm_hmm_is_enabled_system_wide()) {
gpu->devmem = NULL;
return NV_OK;
}
gpu->devmem = devmem_reuse_pagemap(size);
if (!gpu->devmem)
gpu->devmem = devmem_alloc_pagemap(size);
if (!gpu->devmem)
return NV_ERR_NO_MEMORY;
return NV_OK;
}
void uvm_pmm_devmem_deinit(uvm_parent_gpu_t *gpu)
{
if (!gpu->devmem)
return;
memunmap_pages(&devmem->pagemap);
release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
list_add_tail(&gpu->devmem->list_node, &g_uvm_global.devmem_ranges.list);
gpu->devmem = NULL;
}
void uvm_pmm_devmem_exit(void)
{
uvm_pmm_gpu_devmem_t *devmem, *devmem_next;
list_for_each_entry_safe(devmem, devmem_next, &g_uvm_global.devmem_ranges.list, list_node) {
list_del(&devmem->list_node);
memunmap_pages(&devmem->pagemap);
release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
kfree(devmem);
}
}
unsigned long uvm_pmm_gpu_devmem_get_pfn(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
{
return (pmm->devmem.pagemap.range.start + chunk->address) >> PAGE_SHIFT;
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
unsigned long devmem_start = gpu->parent->devmem->pagemap.range.start;
return (devmem_start + chunk->address) >> PAGE_SHIFT;
}
#endif // UVM_IS_CONFIG_HMM()
#if !UVM_IS_CONFIG_HMM()
static NV_STATUS devmem_init(uvm_pmm_gpu_t *pmm)
NV_STATUS uvm_pmm_devmem_init(uvm_parent_gpu_t *gpu)
{
return NV_OK;
}
static void devmem_deinit(uvm_pmm_gpu_t *pmm)
void uvm_pmm_devmem_deinit(uvm_parent_gpu_t *gpu)
{
}
@@ -3469,10 +3480,6 @@ NV_STATUS uvm_pmm_gpu_init(uvm_pmm_gpu_t *pmm)
}
}
status = devmem_init(pmm);
if (status != NV_OK)
goto cleanup;
return NV_OK;
cleanup:
uvm_pmm_gpu_deinit(pmm);
@@ -3543,8 +3550,6 @@ void uvm_pmm_gpu_deinit(uvm_pmm_gpu_t *pmm)
deinit_caches(pmm);
devmem_deinit(pmm);
pmm->initialized = false;
}
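
Taken together, the pagemap changes amount to a size-keyed free list: uvm_pmm_devmem_deinit() parks the memremapped range on g_uvm_global.devmem_ranges instead of tearing it down, uvm_pmm_devmem_init() checks that list before paying for memremap_pages() again, and only uvm_pmm_devmem_exit() releases everything. A standalone sketch of the same caching pattern with a toy structure; every name below is invented, not driver API:

    #include <stdio.h>
    #include <stdlib.h>

    /* Toy stand-in for uvm_pmm_gpu_devmem_t: an expensive resource keyed by size. */
    struct range_cache_entry {
        unsigned long size;
        struct range_cache_entry *next;
    };

    static struct range_cache_entry *g_cache;  /* stand-in for the global devmem list */

    static struct range_cache_entry *range_get(unsigned long size)
    {
        /* First try to reuse a parked entry of the same size... */
        for (struct range_cache_entry **p = &g_cache; *p; p = &(*p)->next) {
            if ((*p)->size == size) {
                struct range_cache_entry *e = *p;
                *p = e->next;
                return e;
            }
        }

        /* ...otherwise create a fresh one (the expensive path). */
        struct range_cache_entry *e = calloc(1, sizeof(*e));
        if (e)
            e->size = size;
        return e;
    }

    static void range_put(struct range_cache_entry *e)
    {
        /* Park for later reuse instead of freeing. */
        e->next = g_cache;
        g_cache = e;
    }

    int main(void)
    {
        struct range_cache_entry *a = range_get(1ul << 20);
        range_put(a);                                         /* "GPU unregistered" */
        struct range_cache_entry *b = range_get(1ul << 20);   /* reuses 'a' */
        printf("reused: %s\n", a == b ? "yes" : "no");
        free(b);
        return 0;
    }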

View File

@@ -192,22 +192,41 @@ typedef struct uvm_pmm_gpu_chunk_suballoc_struct uvm_pmm_gpu_chunk_suballoc_t;
#if UVM_IS_CONFIG_HMM()
typedef struct uvm_pmm_gpu_struct uvm_pmm_gpu_t;
typedef struct
{
// For g_uvm_global.devmem_ranges
struct list_head list_node;
// Size that was requested when creating this region. This may be less than
// the size actually allocated by the kernel due to alignment constraints.
// Figuring out the required alignment at compile time is difficult due to
// unexported macros, so just use the requested size as the search key.
unsigned long size;
struct dev_pagemap pagemap;
} uvm_pmm_gpu_devmem_t;
typedef struct uvm_pmm_gpu_struct uvm_pmm_gpu_t;
// Return the GPU chunk for a given device private struct page.
uvm_gpu_chunk_t *uvm_pmm_devmem_page_to_chunk(struct page *page);
// Return the va_space for a given device private struct page.
uvm_va_space_t *uvm_pmm_devmem_page_to_va_space(struct page *page);
// Return the GPU id for a given device private struct page.
uvm_gpu_id_t uvm_pmm_devmem_page_to_gpu_id(struct page *page);
// Return the PFN of the device private struct page for the given GPU chunk.
unsigned long uvm_pmm_gpu_devmem_get_pfn(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
// Free unused ZONE_DEVICE pages.
void uvm_pmm_devmem_exit(void);
#else
static inline void uvm_pmm_devmem_exit(void)
{
}
#endif
#if defined(CONFIG_PCI_P2PDMA) && defined(NV_STRUCT_PAGE_HAS_ZONE_DEVICE_DATA)
@@ -349,10 +368,6 @@ typedef struct uvm_pmm_gpu_struct
nv_kthread_q_item_t va_block_lazy_free_q_item;
} root_chunks;
#if UVM_IS_CONFIG_HMM()
uvm_pmm_gpu_devmem_t devmem;
#endif
// Lock protecting PMA allocation, freeing and eviction
uvm_rw_semaphore_t pma_lock;
@@ -604,6 +619,10 @@ static uvm_chunk_size_t uvm_chunk_find_prev_size(uvm_chunk_sizes_mask_t chunk_si
// retained, and it's up to the caller to release them.
NvU32 uvm_pmm_gpu_phys_to_virt(uvm_pmm_gpu_t *pmm, NvU64 phys_addr, NvU64 region_size, uvm_reverse_map_t *out_mappings);
// Allocate and initialise struct page data in the kernel to support HMM.
NV_STATUS uvm_pmm_devmem_init(uvm_parent_gpu_t *gpu);
void uvm_pmm_devmem_deinit(uvm_parent_gpu_t *parent_gpu);
// Iterates over every size in the input mask from smallest to largest
#define for_each_chunk_size(__size, __chunk_sizes) \
for ((__size) = (__chunk_sizes) ? uvm_chunk_find_first_size(__chunk_sizes) : \

View File

@@ -2839,10 +2839,14 @@ static bool block_check_egm_peer(uvm_va_space_t *va_space, uvm_gpu_t *gpu, int n
remote_node_info = uvm_va_space_get_egm_numa_node_info(va_space, nid);
UVM_ASSERT(!uvm_parent_processor_mask_empty(&remote_node_info->parent_gpus));
for_each_parent_gpu_in_mask(parent_gpu, &remote_node_info->parent_gpus) {
UVM_ASSERT(parent_gpu->egm.enabled);
NvU64 page_addr = phys_addr.address;
if (phys_addr.address + parent_gpu->egm.base_address >= remote_node_info->node_start &&
phys_addr.address + parent_gpu->egm.base_address < remote_node_info->node_end &&
UVM_ASSERT(parent_gpu->egm.enabled);
page_addr += parent_gpu->egm.base_address;
if (parent_gpu->nvswitch_info.is_nvswitch_connected && gpu->parent != parent_gpu)
page_addr -= parent_gpu->nvswitch_info.egm_fabric_memory_window_start;
if (page_addr >= remote_node_info->node_start && page_addr < remote_node_info->node_end &&
remote_node_info->routing_table[uvm_parent_id_gpu_index(gpu->parent->id)] == parent_gpu) {
return true;
}
@@ -3229,8 +3233,15 @@ static uvm_gpu_phys_address_t block_phys_page_address(uvm_va_block_t *block,
if (routing_gpu) {
struct page *page = uvm_cpu_chunk_get_cpu_page(block, chunk, block_page.page_index);
phys_addr = page_to_phys(page);
aperture = uvm_gpu_egm_peer_aperture(gpu->parent, routing_gpu);
// Remote EGM routing is based on both the EGM base address and EGM
// fabric memory window.
if (routing_gpu->nvswitch_info.is_nvswitch_connected && routing_gpu != gpu->parent)
phys_addr += routing_gpu->nvswitch_info.egm_fabric_memory_window_start;
uvm_page_mask_set(&accessing_gpu_state->egm_pages, block_page.page_index);
return uvm_gpu_phys_address(aperture, phys_addr - routing_gpu->egm.base_address);
}
@@ -13575,6 +13586,9 @@ NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params,
struct page *page = block_page_get(block, block_page);
phys_addr = page_to_phys(page) - egm_routing_gpu->egm.base_address;
if (egm_routing_gpu->nvswitch_info.is_nvswitch_connected && egm_routing_gpu != gpu->parent)
phys_addr += egm_routing_gpu->nvswitch_info.egm_fabric_memory_window_start;
params->is_egm_mapping[count] = true;
}
}
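
The EGM hunks above apply two offsets: the routing GPU's egm.base_address, plus (for a peer reached through nvswitch) its egm_fabric_memory_window_start. A worked standalone version of that address arithmetic, with invented base and window values:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Invented example values; the real ones come from the routing parent GPU. */
    #define EGM_BASE_ADDRESS  0x080000000000ull  /* egm.base_address */
    #define EGM_FABRIC_WINDOW 0x400000000000ull  /* egm_fabric_memory_window_start */

    /* System physical address of a CPU page -> address used within the EGM aperture. */
    static uint64_t egm_peer_address(uint64_t sys_phys, bool remote_over_nvswitch)
    {
        uint64_t addr = sys_phys;

        if (remote_over_nvswitch)
            addr += EGM_FABRIC_WINDOW;   /* route through the peer's fabric window */

        return addr - EGM_BASE_ADDRESS;  /* make it relative to the EGM base */
    }

    int main(void)
    {
        uint64_t sys_phys = 0x080123456000ull;

        printf("local  EGM address: 0x%llx\n",
               (unsigned long long)egm_peer_address(sys_phys, false));
        printf("remote EGM address: 0x%llx\n",
               (unsigned long long)egm_peer_address(sys_phys, true));
        return 0;
    }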

View File

@@ -612,6 +612,42 @@ nv_dma_buf_unmap_pfns(
}
}
static NvU32
nv_dma_buf_get_sg_count (
struct device *dev,
nv_dma_buf_file_private_t *priv,
NvU32 *max_seg_size
)
{
NvU32 dma_max_seg_size, i;
NvU32 nents = 0;
dma_max_seg_size = NV_ALIGN_DOWN(dma_get_max_seg_size(dev), PAGE_SIZE);
if (dma_max_seg_size < PAGE_SIZE)
{
return 0;
}
// Calculate nents needed to allocate sg_table
for (i = 0; i < priv->num_objects; i++)
{
NvU32 range_count = priv->handles[i].memArea.numRanges;
NvU32 index;
for (index = 0; index < range_count; index++)
{
NvU64 length = priv->handles[i].memArea.pRanges[index].size;
NvU64 count = length + dma_max_seg_size - 1;
do_div(count, dma_max_seg_size);
nents += count;
}
}
*max_seg_size = dma_max_seg_size;
return nents;
}
static struct sg_table*
nv_dma_buf_map_pages (
struct device *dev,
@@ -620,15 +656,11 @@ nv_dma_buf_map_pages (
{
struct sg_table *sgt = NULL;
struct scatterlist *sg;
NvU32 nents = 0;
NvU32 i;
NvU32 dma_max_seg_size = 0;
NvU32 i, nents;
int rc;
// Calculate nents needed to allocate sg_table
for (i = 0; i < priv->num_objects; i++)
{
nents += priv->handles[i].memArea.numRanges;
}
nents = nv_dma_buf_get_sg_count(dev, priv, &dma_max_seg_size);
NV_KZALLOC(sgt, sizeof(struct sg_table));
if (sgt == NULL)
@@ -650,20 +682,30 @@ nv_dma_buf_map_pages (
NvU32 index = 0;
for (index = 0; index < range_count; index++)
{
NvU64 addr = priv->handles[i].memArea.pRanges[index].start;
NvU64 len = priv->handles[i].memArea.pRanges[index].size;
struct page *page = NV_GET_PAGE_STRUCT(addr);
NvU64 dma_addr = priv->handles[i].memArea.pRanges[index].start;
NvU64 dma_len = priv->handles[i].memArea.pRanges[index].size;
if ((page == NULL) || (sg == NULL))
// Split each range into dma_max_seg_size chunks
while(dma_len != 0)
{
goto free_table;
}
NvU32 sg_len = NV_MIN(dma_len, dma_max_seg_size);
struct page *page = NV_GET_PAGE_STRUCT(dma_addr);
sg_set_page(sg, page, len, NV_GET_OFFSET_IN_PAGE(addr));
sg = sg_next(sg);
if ((page == NULL) || (sg == NULL))
{
goto free_table;
}
sg_set_page(sg, page, sg_len, NV_GET_OFFSET_IN_PAGE(dma_addr));
dma_addr += sg_len;
dma_len -= sg_len;
sg = sg_next(sg);
}
}
}
WARN_ON(sg != NULL);
// DMA map the sg_table
rc = dma_map_sg(dev, sgt->sgl, sgt->orig_nents, DMA_BIDIRECTIONAL);
if (rc <= 0)
@@ -693,36 +735,16 @@ nv_dma_buf_map_pfns (
struct sg_table *sgt = NULL;
struct scatterlist *sg;
nv_dma_device_t peer_dma_dev = {{ 0 }};
NvU32 dma_max_seg_size;
NvU32 nents = 0;
NvU32 dma_max_seg_size = 0;
NvU32 mapped_nents = 0;
NvU32 i = 0;
NvU32 nents;
int rc = 0;
peer_dma_dev.dev = dev;
peer_dma_dev.addressable_range.limit = (NvU64)dev->dma_mask;
dma_max_seg_size = NV_ALIGN_DOWN(dma_get_max_seg_size(dev), PAGE_SIZE);
if (dma_max_seg_size < PAGE_SIZE)
{
return NULL;
}
// Calculate nents needed to allocate sg_table
for (i = 0; i < priv->num_objects; i++)
{
NvU32 range_count = priv->handles[i].memArea.numRanges;
NvU32 index;
for (index = 0; index < range_count; index++)
{
NvU64 length = priv->handles[i].memArea.pRanges[index].size;
NvU64 count = length + dma_max_seg_size - 1;
do_div(count, dma_max_seg_size);
nents += count;
}
}
nents = nv_dma_buf_get_sg_count(dev, priv, &dma_max_seg_size);
NV_KZALLOC(sgt, sizeof(struct sg_table));
if (sgt == NULL)
@@ -777,6 +799,9 @@ nv_dma_buf_map_pfns (
}
}
}
WARN_ON(sg != NULL);
sgt->nents = mapped_nents;
WARN_ON(sgt->nents != sgt->orig_nents);
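
nv_dma_buf_get_sg_count() is a per-range ceiling division: a range of size bytes contributes ceil(size / dma_max_seg_size) scatterlist entries, which is why the later per-segment splitting cannot run out of entries. A standalone illustration with made-up range sizes:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Invented ranges; dma_max_seg_size is page-aligned as in the driver. */
        uint64_t ranges[]         = { 0x100000, 0x3000, 0x10000 };  /* 1 MiB, 12 KiB, 64 KiB */
        uint64_t dma_max_seg_size = 0x10000;                        /* 64 KiB */
        uint32_t nents            = 0;

        for (unsigned i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
            nents += (uint32_t)((ranges[i] + dma_max_seg_size - 1) / dma_max_seg_size);

        printf("sg_table needs %u entries\n", nents);  /* 16 + 1 + 1 = 18 */
        return 0;
    }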

View File

@@ -445,7 +445,9 @@ static int nvidia_mmap_sysmem(
}
else
{
vma->vm_page_prot = nv_adjust_pgprot(vma->vm_page_prot, 0);
if (at->flags.unencrypted)
vma->vm_page_prot = nv_adjust_pgprot(vma->vm_page_prot, 0);
ret = vm_insert_page(vma, start,
NV_GET_PAGE_STRUCT(at->page_table[j]->phys_addr));
}

View File

@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2019-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -308,6 +308,15 @@ static NvU32 find_gpu_numa_nodes_in_srat(nv_linux_state_t *nvl)
struct acpi_srat_generic_affinity *gi;
NvU32 numa_node = NUMA_NO_NODE;
if (NV_PCI_DEVFN(nvl->pci_dev) != 0)
{
nv_printf(NV_DBG_ERRORS, "NVRM: Failing to parse SRAT GI for %04x:%02x:%02x.%x "
"since non-zero device function is not supported.\n",
NV_PCI_DOMAIN_NUMBER(nvl->pci_dev), NV_PCI_BUS_NUMBER(nvl->pci_dev),
NV_PCI_SLOT_NUMBER(nvl->pci_dev), PCI_FUNC(nvl->pci_dev->devfn));
return 0;
}
if (acpi_get_table(ACPI_SIG_SRAT, 0, &table_header)) {
nv_printf(NV_DBG_INFO, "NVRM: Failed to parse the SRAT table.\n");
return 0;
@@ -331,9 +340,14 @@ static NvU32 find_gpu_numa_nodes_in_srat(nv_linux_state_t *nvl)
(((unsigned long)subtable_header) + subtable_header_length < table_end)) {
if (subtable_header->type == ACPI_SRAT_TYPE_GENERIC_AFFINITY) {
NvU8 busAtByte2, busAtByte3;
gi = (struct acpi_srat_generic_affinity *) subtable_header;
busAtByte2 = gi->device_handle[2];
busAtByte3 = gi->device_handle[3];
// Device and function should be zero, as enforced by the check above
gi_dbdf = *((NvU16 *)(&gi->device_handle[0])) << 16 |
*((NvU16 *)(&gi->device_handle[2]));
(busAtByte2 != 0 ? busAtByte2 : busAtByte3) << 8;
if (gi_dbdf == dev_dbdf) {
numa_node = pxm_to_node(gi->proximity_domain);
@@ -347,6 +361,31 @@ static NvU32 find_gpu_numa_nodes_in_srat(nv_linux_state_t *nvl)
pxm_count = 0;
goto exit;
}
nv_printf(NV_DBG_INFO,
"NVRM: matching SRAT GI entry: 0x%x 0x%x 0x%x 0x%x PXM: %d\n",
gi->device_handle[3],
gi->device_handle[2],
gi->device_handle[1],
gi->device_handle[0],
gi->proximity_domain);
if ((busAtByte2) == 0 &&
(busAtByte3) != 0)
{
/*
* TODO: Remove this WAR once Hypervisor stack is updated
* to fix this bug and after all CSPs have moved to using
* the updated Hypervisor stack with fix.
*/
nv_printf(NV_DBG_WARNINGS,
"NVRM: PCIe bus value picked from byte 3 offset in SRAT GI entry: 0x%x 0x%x 0x%x 0x%x PXM: %d\n"
"NVRM: Hypervisor stack is old and not following ACPI spec defined offset.\n"
"NVRM: Please consider upgrading the Hypervisor stack as this workaround will be removed in future release.\n",
gi->device_handle[3],
gi->device_handle[2],
gi->device_handle[1],
gi->device_handle[0],
gi->proximity_domain);
}
}
}
@@ -792,7 +831,10 @@ next_bar:
NV_ATOMIC_SET(nvl->numa_info.status, NV_IOCTL_NUMA_STATUS_DISABLED);
nvl->numa_info.node_id = NUMA_NO_NODE;
nv_init_coherent_link_info(nv);
if (pci_devid_is_self_hosted(pci_dev->device))
{
nv_init_coherent_link_info(nv);
}
#if defined(NVCPU_PPC64LE)
// Use HW NUMA support as a proxy for ATS support. This is true in the only
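
The SRAT parsing above builds a domain/bus/device/function key from the generic affinity (GI) device handle: bytes 0-1 hold the PCI segment, byte 2 the bus (with a workaround for hypervisors that wrongly put the bus in byte 3), and device/function must be zero. A standalone sketch of that computation with an invented device handle, assuming the same little-endian layout the driver reads:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Invented SRAT GI device handle: segment 0x0000 in bytes 0-1, bus 0x41 in byte 2. */
        uint8_t  device_handle[16] = { 0x00, 0x00, 0x41, 0x00 };

        uint16_t segment    = (uint16_t)(device_handle[0] | (device_handle[1] << 8));
        uint8_t  busAtByte2 = device_handle[2];
        uint8_t  busAtByte3 = device_handle[3];

        /* Device and function are required to be zero, so the key is segment:bus:00.0. */
        uint32_t gi_dbdf = ((uint32_t)segment << 16) |
                           ((uint32_t)(busAtByte2 != 0 ? busAtByte2 : busAtByte3) << 8);

        printf("GI entry DBDF: %08x\n", gi_dbdf);  /* 00004100 -> 0000:41:00.0 */
        return 0;
    }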

View File

@@ -1630,17 +1630,25 @@ static void nv_init_mapping_revocation(nv_linux_state_t *nvl,
nv_linux_file_private_t *nvlfp,
struct inode *inode)
{
down(&nvl->mmap_lock);
/* Set up struct address_space for use with unmap_mapping_range() */
address_space_init_once(&nvlfp->mapping);
nvlfp->mapping.host = inode;
nvlfp->mapping.a_ops = inode->i_mapping->a_ops;
file->f_mapping = &nvlfp->mapping;
}
/* Add nvlfp to list of open files in nvl for mapping revocation */
/* Adds nvlfp to list of open files for mapping revocation */
static void nv_add_open_file(nv_linux_state_t *nvl,
nv_linux_file_private_t *nvlfp)
{
nvlfp->nvptr = nvl;
/*
* nvl->open_files and other mapping revocation members in nv_linux_state_t
* are protected by nvl->mmap_lock instead of nvl->ldata_lock.
*/
down(&nvl->mmap_lock);
list_add(&nvlfp->entry, &nvl->open_files);
up(&nvl->mmap_lock);
}
@@ -1690,11 +1698,12 @@ static void nvidia_open_deferred(void *nvlfp_raw)
*/
down(&nvl->ldata_lock);
rc = nv_open_device_for_nvlfp(NV_STATE_PTR(nvl), nvlfp->sp, nvlfp);
up(&nvl->ldata_lock);
/* Set nvptr only upon success (where nvl->usage_count is incremented) */
/* Only add open file tracking where nvl->usage_count is incremented */
if (rc == 0)
nvlfp->nvptr = nvl;
nv_add_open_file(nvl, nvlfp);
up(&nvl->ldata_lock);
complete_all(&nvlfp->open_complete);
}
@@ -1813,6 +1822,7 @@ nvidia_open(
}
nv = NV_STATE_PTR(nvl);
nv_init_mapping_revocation(nvl, file, nvlfp, inode);
if (nv_try_lock_foreground_open(file, nvl) == 0)
{
@@ -1823,11 +1833,11 @@ nvidia_open(
rc = nv_open_device_for_nvlfp(nv, nvlfp->sp, nvlfp);
up(&nvl->ldata_lock);
/* Set nvptr only upon success (where nvl->usage_count is incremented) */
/* Only add open file tracking where nvl->usage_count is incremented */
if (rc == 0)
nvlfp->nvptr = nvl;
nv_add_open_file(nvl, nvlfp);
up(&nvl->ldata_lock);
complete_all(&nvlfp->open_complete);
}
@@ -1882,10 +1892,6 @@ failed:
NV_SET_FILE_PRIVATE(file, NULL);
}
}
else
{
nv_init_mapping_revocation(nvl, file, nvlfp, inode);
}
return rc;
}

View File

@@ -1672,7 +1672,7 @@ NV_STATUS NV_API_CALL os_alloc_pages_node
NV_STATUS status = NV_ERR_NOT_SUPPORTED;
#if defined(__GFP_THISNODE) && defined(GFP_HIGHUSER_MOVABLE) && \
defined(__GFP_COMP) && defined(__GFP_NORETRY) && defined(__GFP_NOWARN)
defined(__GFP_COMP) && defined(__GFP_NOWARN)
gfp_t gfp_mask;
struct page *alloc_addr;
unsigned int order = get_order(size);
@@ -1689,13 +1689,11 @@ NV_STATUS NV_API_CALL os_alloc_pages_node
* pages, which is needed in order to use
* vm_insert_page API.
*
* 4. __GFP_NORETRY: Used to avoid the Linux kernel OOM killer.
*
* 5. __GFP_NOWARN: Used to avoid a WARN_ON in the slowpath if
* 4. __GFP_NOWARN: Used to avoid a WARN_ON in the slowpath if
* the requested order is too large (just fail
* instead).
*
* 6. (Optional) __GFP_RECLAIM: Used to allow/forbid reclaim.
* 5. (Optional) __GFP_RECLAIM: Used to allow/forbid reclaim.
* This is part of GFP_USER and consequently
* GFP_HIGHUSER_MOVABLE.
*
@@ -1709,7 +1707,30 @@ NV_STATUS NV_API_CALL os_alloc_pages_node
*/
gfp_mask = __GFP_THISNODE | GFP_HIGHUSER_MOVABLE | __GFP_COMP |
__GFP_NORETRY | __GFP_NOWARN;
__GFP_NOWARN;
#if defined(__GFP_RETRY_MAYFAIL)
/*
* __GFP_RETRY_MAYFAIL : Used to avoid the Linux kernel OOM killer.
* To help PMA on paths where UVM might be
* in memory over subscription. This gives UVM
* a chance to free memory before invoking any
* action from the OOM killer.
* Freeing non-essential memory will also benefit
* the system as a whole.
*/
gfp_mask |= __GFP_RETRY_MAYFAIL;
#elif defined(__GFP_NORETRY)
/*
* __GFP_NORETRY : Use __GFP_NORETRY on older kernels where
* __GFP_RETRY_MAYFAIL is not present.
*/
gfp_mask |= __GFP_NORETRY;
#endif
#if defined(__GFP_RECLAIM)
if (flag & NV_ALLOC_PAGES_NODE_SKIP_RECLAIM)