570.124.04

Bernhard Stoeckner
2025-02-27 17:32:23 +01:00
parent 81fe4fb417
commit 129479b1b7
141 changed files with 102245 additions and 100070 deletions

View File

@@ -29,6 +29,7 @@
#include <linux/nodemask.h>
#include <linux/mempolicy.h>
#include <linux/mmu_notifier.h>
#include <linux/topology.h>
#if UVM_HMM_RANGE_FAULT_SUPPORTED()
#include <linux/hmm.h>
@@ -291,6 +292,27 @@ static const struct mmu_interval_notifier_ops uvm_ats_notifier_ops =
#endif
static bool resident_policy_match(struct vm_area_struct *vma, int dst_nid, int src_nid)
{
#if defined(NV_MEMPOLICY_HAS_UNIFIED_NODES)
struct mempolicy *vma_policy = vma_policy(vma);
// TODO: Bug 4981209: When migrations between CPU numa nodes are supported,
// add (dst_nid != closest_cpu_numa_node) to allow migrations between CPU
// NUMA nodes when destination is the closest_cpu_numa_node.
if (vma_policy &&
node_isset(src_nid, vma_policy->nodes) &&
node_isset(dst_nid, vma_policy->nodes) &&
!cpumask_empty(cpumask_of_node(src_nid)) &&
!cpumask_empty(cpumask_of_node(dst_nid))) {
return true;
}
#endif
return false;
}
static NV_STATUS ats_compute_residency_mask(uvm_gpu_va_space_t *gpu_va_space,
struct vm_area_struct *vma,
NvU64 base,
@@ -370,9 +392,23 @@ static NV_STATUS ats_compute_residency_mask(uvm_gpu_va_space_t *gpu_va_space,
if (pfn & HMM_PFN_VALID) {
struct page *page = hmm_pfn_to_page(pfn);
int resident_node = page_to_nid(page);
if (page_to_nid(page) == ats_context->residency_node)
// Set the residency_mask if:
// - The page is already resident at the intended destination.
// or
// - Both the source and destination nodes are CPU nodes and the
// source node is already in the list of preferred nodes for
// the vma. On multi-CPU NUMA node architectures, this avoids
// unnecessary migrations between CPU nodes. Since the
// specific ats_context->residency_node selected by
// ats_batch_select_residency() is just a guess among the list
// of preferred nodes, paying the cost of migration across the
// CPU preferred nodes in this case can't be justified.
if ((resident_node == ats_context->residency_node) ||
resident_policy_match(vma, ats_context->residency_node, resident_node)) {
uvm_page_mask_set(residency_mask, page_index);
}
ats_context->prefetch_state.first_touch = false;
}
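As a rough standalone sketch of the decision above (a simplified userspace model: the nodemask and policy are reduced to plain bitmasks, and all node IDs are made up), a page counts as resident if it already sits on the chosen destination node, or if both the source and destination are CPU NUMA nodes listed in the VMA's preferred nodemask:

#include <stdbool.h>
#include <stdio.h>

// Simplified stand-in for the kernel's nodemask/mempolicy; bit i set means node i.
typedef unsigned long nodemask_t;

static bool node_in_mask(nodemask_t mask, int nid)
{
    return (mask >> nid) & 1UL;
}

// Mirrors resident_policy_match(): both nodes preferred by the VMA policy and
// both backed by CPUs (the driver checks cpumask_of_node() for the latter).
static bool policy_match(nodemask_t preferred, nodemask_t cpu_nodes, int dst, int src)
{
    return node_in_mask(preferred, src) && node_in_mask(preferred, dst) &&
           node_in_mask(cpu_nodes, src) && node_in_mask(cpu_nodes, dst);
}

// Mirrors the residency_mask condition in ats_compute_residency_mask().
static bool counts_as_resident(int resident_node, int residency_node,
                               nodemask_t preferred, nodemask_t cpu_nodes)
{
    return resident_node == residency_node ||
           policy_match(preferred, cpu_nodes, residency_node, resident_node);
}

int main(void)
{
    nodemask_t preferred = 0x3;  // VMA policy prefers nodes 0 and 1
    nodemask_t cpu_nodes = 0x3;  // nodes 0 and 1 both have CPUs
    // Page resident on node 1 while the guessed destination is node 0:
    // no migration between the two preferred CPU nodes.
    printf("%d\n", counts_as_resident(1, 0, preferred, cpu_nodes));  // prints 1
    return 0;
}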

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2024 NVIDIA Corporation
Copyright (c) 2015-2025 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -35,6 +35,7 @@
#include "uvm_mmu.h"
#include "uvm_perf_heuristics.h"
#include "uvm_pmm_sysmem.h"
#include "uvm_pmm_gpu.h"
#include "uvm_migrate.h"
#include "uvm_gpu_access_counters.h"
#include "uvm_va_space_mm.h"
@@ -90,6 +91,8 @@ NV_STATUS uvm_global_init(void)
uvm_spin_lock_irqsave_init(&g_uvm_global.gpu_table_lock, UVM_LOCK_ORDER_LEAF);
uvm_mutex_init(&g_uvm_global.va_spaces.lock, UVM_LOCK_ORDER_VA_SPACES_LIST);
INIT_LIST_HEAD(&g_uvm_global.va_spaces.list);
uvm_mutex_init(&g_uvm_global.devmem_ranges.lock, UVM_LOCK_ORDER_LEAF);
INIT_LIST_HEAD(&g_uvm_global.devmem_ranges.list);
status = uvm_kvmalloc_init();
if (status != NV_OK) {
@@ -231,6 +234,7 @@ void uvm_global_exit(void)
uvm_va_policy_exit();
uvm_mem_global_exit();
uvm_pmm_sysmem_exit();
uvm_pmm_devmem_exit();
uvm_gpu_exit();
uvm_processor_mask_cache_exit();

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2024 NVIDIA Corporation
Copyright (c) 2015-2025 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -157,6 +157,12 @@ struct uvm_global_struct
// This field is set once during global initialization (uvm_global_init),
// and can be read afterwards without acquiring any locks.
bool conf_computing_enabled;
// List of all devmem ranges allocated on this GPU
struct {
uvm_mutex_t lock;
struct list_head list;
} devmem_ranges;
};
// Initialize global uvm state

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2024 NVIDIA Corporation
Copyright (c) 2015-2025 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -109,8 +109,10 @@ static void fill_parent_gpu_info(uvm_parent_gpu_t *parent_gpu, const UvmGpuInfo
// nvswitch is routed via physical pages, where the upper 13 bits of the
// 47-bit address space hold the routing information for each peer.
// Currently, this is limited to a 16GB framebuffer window size.
if (parent_gpu->nvswitch_info.is_nvswitch_connected)
if (parent_gpu->nvswitch_info.is_nvswitch_connected) {
parent_gpu->nvswitch_info.fabric_memory_window_start = gpu_info->nvswitchMemoryWindowStart;
parent_gpu->nvswitch_info.egm_fabric_memory_window_start = gpu_info->nvswitchEgmMemoryWindowStart;
}
uvm_uuid_string(uuid_buffer, &parent_gpu->uuid);
snprintf(parent_gpu->name,
@@ -244,6 +246,7 @@ static NV_STATUS get_gpu_fb_info(uvm_gpu_t *gpu)
if (!fb_info.bZeroFb) {
gpu->mem_info.size = ((NvU64)fb_info.heapSize + fb_info.reservedHeapSize) * 1024;
gpu->mem_info.max_allocatable_address = fb_info.maxAllocatableAddress;
gpu->mem_info.phys_start = (NvU64)fb_info.heapStart * 1024;
}
gpu->mem_info.max_vidmem_page_size = fb_info.maxVidmemPageSize;
@@ -568,6 +571,9 @@ static void gpu_info_print_common(uvm_gpu_t *gpu, struct seq_file *s)
UVM_SEQ_OR_DBG_PRINT(s, "big_page_size %u\n", gpu->big_page.internal_size);
UVM_SEQ_OR_DBG_PRINT(s, "rm_va_base 0x%llx\n", gpu->parent->rm_va_base);
UVM_SEQ_OR_DBG_PRINT(s, "rm_va_size 0x%llx\n", gpu->parent->rm_va_size);
UVM_SEQ_OR_DBG_PRINT(s, "vidmem_start %llu (%llu MBs)\n",
gpu->mem_info.phys_start,
gpu->mem_info.phys_start / (1024 * 1024));
UVM_SEQ_OR_DBG_PRINT(s, "vidmem_size %llu (%llu MBs)\n",
gpu->mem_info.size,
gpu->mem_info.size / (1024 * 1024));
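The heap fields reported by RM in UvmGpuFbInfo appear to be in kilobytes, given the *1024 scaling above; a trivial standalone illustration of the conversions feeding the vidmem_start/vidmem_size lines (the heap values are made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    // Hypothetical RM-reported values, in KB.
    uint64_t heap_size_kb     = 16ull * 1024 * 1024;  // 16 GB heap
    uint64_t reserved_heap_kb = 512ull * 1024;        // 512 MB reserved heap
    uint64_t heap_start_kb    = 0;

    uint64_t size       = (heap_size_kb + reserved_heap_kb) * 1024;  // bytes
    uint64_t phys_start = heap_start_kb * 1024;                      // bytes

    printf("vidmem_start %llu (%llu MBs)\n",
           (unsigned long long)phys_start,
           (unsigned long long)(phys_start / (1024 * 1024)));
    printf("vidmem_size %llu (%llu MBs)\n",
           (unsigned long long)size,
           (unsigned long long)(size / (1024 * 1024)));
    return 0;
}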
@@ -1361,6 +1367,7 @@ static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu,
const UvmGpuPlatformInfo *gpu_platform_info)
{
NV_STATUS status;
UvmGpuFbInfo fb_info = {0};
status = uvm_rm_locked_call(nvUvmInterfaceDeviceCreate(uvm_global_session_handle(),
gpu_info,
@@ -1384,8 +1391,15 @@ static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu,
parent_gpu->egm.local_peer_id = gpu_info->egmPeerId;
parent_gpu->egm.base_address = gpu_info->egmBaseAddr;
status = uvm_rm_locked_call(nvUvmInterfaceGetFbInfo(parent_gpu->rm_device, &fb_info));
if (status != NV_OK)
return status;
parent_gpu->sli_enabled = (gpu_info->subdeviceCount > 1);
if (!fb_info.bZeroFb)
parent_gpu->max_allocatable_address = fb_info.maxAllocatableAddress;
parent_gpu->virt_mode = gpu_info->virtMode;
if (parent_gpu->virt_mode == UVM_VIRT_MODE_LEGACY) {
UVM_ERR_PRINT("Failed to init GPU %s. UVM is not supported in legacy virtualization mode\n",
@@ -1419,6 +1433,14 @@ static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu,
uvm_mmu_init_gpu_chunk_sizes(parent_gpu);
status = uvm_pmm_devmem_init(parent_gpu);
if (status != NV_OK) {
UVM_ERR_PRINT("failed to intialize device private memory: %s, GPU %s\n",
nvstatusToString(status),
uvm_parent_gpu_name(parent_gpu));
return status;
}
status = uvm_ats_add_gpu(parent_gpu);
if (status != NV_OK) {
UVM_ERR_PRINT("uvm_ats_add_gpu failed: %s, GPU %s\n",
@@ -1667,6 +1689,7 @@ static void deinit_parent_gpu(uvm_parent_gpu_t *parent_gpu)
deinit_parent_procfs_files(parent_gpu);
uvm_pmm_devmem_deinit(parent_gpu);
uvm_ats_remove_gpu(parent_gpu);
UVM_ASSERT(atomic64_read(&parent_gpu->mapped_cpu_pages_size) == 0);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2024 NVIDIA Corporation
Copyright (c) 2015-2025 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -696,6 +696,11 @@ struct uvm_gpu_struct
// ZeroFB testing mode, this will be 0.
NvU64 size;
// Physical start of the heap. For SMC-enabled GPUs this is useful to
// partition PMM; it is used by HMM to figure out the right translation
// between HMM ranges and PMM offsets.
NvU64 phys_start;
// Max (inclusive) physical address of this GPU's memory that the driver
// can allocate through PMM (PMA).
NvU64 max_allocatable_address;
@@ -1015,6 +1020,13 @@ struct uvm_parent_gpu_struct
// Do not read this field directly, use uvm_gpu_device_handle instead.
uvmGpuDeviceHandle rm_device;
// Total amount of physical memory available on the parent GPU.
NvU64 max_allocatable_address;
#if UVM_IS_CONFIG_HMM()
uvm_pmm_gpu_devmem_t *devmem;
#endif
// The physical address range addressable by the GPU
//
// The GPU has its NV_PFB_XV_UPPER_ADDR register set by RM to
@@ -1288,6 +1300,10 @@ struct uvm_parent_gpu_struct
// 47-bit fabric memory physical offset that peer gpus need to access
// to read a peer's memory
NvU64 fabric_memory_window_start;
// 47-bit fabric memory physical offset that peer gpus need to access
// to read remote EGM memory.
NvU64 egm_fabric_memory_window_start;
} nvswitch_info;
struct
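A tiny arithmetic sketch of the fabric window layout behind these fields (the window start and the framebuffer offset below are made up): with the upper 13 of 47 address bits carrying routing information, each peer is left a 2^34-byte (16 GB) window, and a peer's memory is addressed at its fabric_memory_window_start plus the local framebuffer offset.

#include <stdint.h>
#include <stdio.h>

// Upper 13 bits of the 47-bit fabric address select the peer, leaving
// 47 - 13 = 34 bits (16 GB) of framebuffer window per peer.
#define FABRIC_WINDOW_BITS (47 - 13)

int main(void)
{
    uint64_t window_size = 1ull << FABRIC_WINDOW_BITS;

    // Hypothetical window start for one peer (a multiple of the window size)
    // and an offset into that peer's framebuffer.
    uint64_t fabric_memory_window_start = 5ull << FABRIC_WINDOW_BITS;
    uint64_t fb_offset = 32ull << 20;  // 32 MB

    printf("window size: %llu GB\n", (unsigned long long)(window_size >> 30));
    printf("peer physical address: 0x%llx\n",
           (unsigned long long)(fabric_memory_window_start + fb_offset));
    return 0;
}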

View File

@@ -321,13 +321,17 @@ void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_
{
uvm_range_tree_node_t *node;
uvm_va_block_t *va_block;
struct range range = gpu->pmm.devmem.pagemap.range;
unsigned long devmem_start;
unsigned long devmem_end;
unsigned long pfn;
bool retry;
if (!uvm_hmm_is_enabled(va_space))
return;
devmem_start = gpu->parent->devmem->pagemap.range.start + gpu->mem_info.phys_start;
devmem_end = devmem_start + gpu->mem_info.size;
if (mm)
uvm_assert_mmap_lock_locked(mm);
uvm_assert_rwsem_locked_write(&va_space->lock);
@@ -341,7 +345,7 @@ void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_
do {
retry = false;
for (pfn = __phys_to_pfn(range.start); pfn <= __phys_to_pfn(range.end); pfn++) {
for (pfn = __phys_to_pfn(devmem_start); pfn <= __phys_to_pfn(devmem_end); pfn++) {
struct page *page = pfn_to_page(pfn);
UVM_ASSERT(is_device_private_page(page));
@@ -349,7 +353,7 @@ void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_
// This check is racy because nothing stops the page being freed and
// even reused. That doesn't matter though - worst case the
// migration fails, we retry and find the va_space doesn't match.
if (page->zone_device_data == va_space)
if (uvm_pmm_devmem_page_to_va_space(page) == va_space)
if (uvm_hmm_pmm_gpu_evict_pfn(pfn) != NV_OK)
retry = true;
}
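A userspace sketch of how the PFN window scanned above is now derived per GPU partition (PAGE_SHIFT and all addresses are assumptions): the device private pages for this GPU start at the parent's pagemap range start plus the GPU's vidmem phys_start, and span mem_info.size bytes.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12  // assumption: 4K pages

int main(void)
{
    // Hypothetical stand-ins for gpu->parent->devmem->pagemap.range.start
    // and the (possibly SMC-partitioned) GPU's mem_info fields.
    uint64_t pagemap_range_start = 0x80000000000ull;
    uint64_t phys_start          = 0;            // partition offset into vidmem
    uint64_t size                = 4ull << 30;   // 4 GB of vidmem

    uint64_t devmem_start = pagemap_range_start + phys_start;
    uint64_t devmem_end   = devmem_start + size;

    // Same PFN walk as uvm_hmm_unregister_gpu(), minus the per-page eviction.
    printf("scanning pfns 0x%llx..0x%llx\n",
           (unsigned long long)(devmem_start >> PAGE_SHIFT),
           (unsigned long long)(devmem_end >> PAGE_SHIFT));
    return 0;
}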
@@ -1713,7 +1717,7 @@ static void gpu_chunk_remove(uvm_va_block_t *va_block,
uvm_gpu_chunk_t *gpu_chunk;
uvm_gpu_id_t id;
id = uvm_pmm_devmem_page_to_gpu_id(page);
id = uvm_gpu_chunk_get_gpu(uvm_pmm_devmem_page_to_chunk(page))->id;
gpu_state = uvm_va_block_gpu_state_get(va_block, id);
UVM_ASSERT(gpu_state);
@@ -1743,7 +1747,7 @@ static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block,
uvm_gpu_id_t id;
NV_STATUS status;
id = uvm_pmm_devmem_page_to_gpu_id(page);
id = uvm_gpu_chunk_get_gpu(uvm_pmm_devmem_page_to_chunk(page))->id;
gpu_state = uvm_va_block_gpu_state_get(va_block, id);
// It's possible that this is a fresh va_block we're trying to add an
@@ -1765,7 +1769,7 @@ static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block,
gpu_chunk = uvm_pmm_devmem_page_to_chunk(page);
UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
UVM_ASSERT(gpu_chunk->is_referenced);
UVM_ASSERT(page->zone_device_data == va_block->hmm.va_space);
UVM_ASSERT(uvm_pmm_devmem_page_to_va_space(page) == va_block->hmm.va_space);
if (gpu_state->chunks[page_index] == gpu_chunk)
return NV_OK;
@@ -1992,7 +1996,7 @@ static void fill_dst_pfn(uvm_va_block_t *va_block,
hmm_mark_gpu_chunk_referenced(va_block, gpu, gpu_chunk);
UVM_ASSERT(!page_count(dpage));
zone_device_page_init(dpage);
dpage->zone_device_data = va_block->hmm.va_space;
dpage->zone_device_data = gpu_chunk;
dst_pfns[page_index] = migrate_pfn(pfn);
}
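With this change a device private page's zone_device_data holds its uvm_gpu_chunk_t rather than the va_space, and the va_space is recovered through the chunk's va_block. A hedged sketch of that lookup chain, with the driver structures reduced to just the fields involved (not the real definitions):

#include <stdio.h>

// Simplified stand-ins for the driver types; only the fields used here.
struct va_space { int id; };
struct va_block { struct { struct va_space *va_space; } hmm; };
struct gpu_chunk { struct va_block *va_block; };
struct page { void *zone_device_data; };

// Mirrors uvm_pmm_devmem_page_to_chunk(): the chunk is stored directly.
static struct gpu_chunk *page_to_chunk(struct page *p)
{
    return p->zone_device_data;
}

// Mirrors uvm_pmm_devmem_page_to_va_space(): may return NULL during the racy
// check in uvm_hmm_unregister_gpu(), when the page is being freed or reused.
static struct va_space *page_to_va_space(struct page *p)
{
    struct gpu_chunk *chunk = page_to_chunk(p);
    if (!chunk || !chunk->va_block)
        return NULL;
    return chunk->va_block->hmm.va_space;
}

int main(void)
{
    struct va_space vas = { .id = 1 };
    struct va_block blk = { .hmm = { .va_space = &vas } };
    struct gpu_chunk chunk = { .va_block = &blk };
    struct page dpage = { .zone_device_data = &chunk };
    printf("va_space id %d\n", page_to_va_space(&dpage)->id);
    return 0;
}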

View File

@@ -130,27 +130,12 @@ static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
NV_STATUS status = NV_OK;
NV_STATUS tracker_status;
uvm_prot_t prot = UVM_PROT_READ_WRITE_ATOMIC;
// Get the mask of unmapped pages because it will change after the
// first map operation
uvm_va_block_unmapped_pages_get(va_block, region, &va_block_context->caller_page_mask);
if (uvm_va_block_is_hmm(va_block) && !UVM_ID_IS_CPU(dest_id)) {
// Do not map pages that are already resident on the CPU. This is in
// order to avoid breaking system-wide atomic operations on HMM. HMM's
// implementation of system-side atomic operations involves restricting
// mappings to one processor (CPU or a GPU) at a time. If we were to
// grant a GPU a mapping to system memory, this gets into trouble
// because, on the CPU side, Linux can silently upgrade PTE permissions
// (move from read-only, to read-write, without any MMU notifiers
// firing), thus breaking the model by allowing simultaneous read-write
// access from two separate processors. To avoid that, just don't map
// such pages at all, when migrating.
uvm_page_mask_andnot(&va_block_context->caller_page_mask,
&va_block_context->caller_page_mask,
uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE));
}
// Only map those pages that are not mapped anywhere else (likely due
// to a first touch or a migration). We pass
// UvmEventMapRemoteCauseInvalid since the destination processor of a
@@ -166,6 +151,31 @@ static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
if (status != NV_OK)
goto out;
if (uvm_va_block_is_hmm(va_block) && UVM_ID_IS_CPU(dest_id)) {
uvm_processor_id_t id;
// Do not atomically map pages that are resident on the CPU. This is in
// order to avoid breaking system-wide atomic operations on HMM. HMM's
// implementation of system-side atomic operations involves restricting
// mappings to one processor (CPU or a GPU) at a time. If we were to
// grant a GPU a mapping to system memory, this gets into trouble
// because, on the CPU side, Linux can silently upgrade PTE permissions
// (move from read-only, to read-write, without any MMU notifiers
// firing), thus breaking the model by allowing simultaneous read-write
// access from two separate processors. To avoid that, don't remote map
// such pages atomically, after migrating.
// Also note that HMM sets CPU mapping for resident pages so the mask
// of pages to be mapped needs to be recomputed without including the
// CPU mapping.
prot = UVM_PROT_READ_WRITE;
uvm_page_mask_region_fill(&va_block_context->caller_page_mask, region);
for_each_gpu_id_in_mask(id, &va_block->mapped) {
uvm_page_mask_andnot(&va_block_context->caller_page_mask,
&va_block_context->caller_page_mask,
uvm_va_block_map_mask_get(va_block, id));
}
}
// Add mappings for AccessedBy processors
//
// No mappings within this call will operate on dest_id, so we don't
@@ -176,7 +186,7 @@ static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
dest_id,
region,
&va_block_context->caller_page_mask,
UVM_PROT_READ_WRITE_ATOMIC,
prot,
NULL);
out:
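A minimal bitmask model of the recomputation above (page masks reduced to an 8-bit word; the masks are made up): when migrating an HMM block to the CPU, start from every page in the region, clear any page some GPU still maps, and map the remainder at READ_WRITE rather than READ_WRITE_ATOMIC.

#include <stdint.h>
#include <stdio.h>

#define NUM_GPUS 2

int main(void)
{
    // Hypothetical 8-page region; bit i set means page i is in the mask.
    uint8_t region_mask = 0xff;
    // Hypothetical per-GPU map masks: GPU0 maps pages 0-1, GPU1 maps page 7.
    uint8_t gpu_map_mask[NUM_GPUS] = { 0x03, 0x80 };

    // Mirrors the loop over va_block->mapped: drop pages any GPU still maps.
    uint8_t caller_page_mask = region_mask;
    for (int i = 0; i < NUM_GPUS; i++)
        caller_page_mask &= (uint8_t)~gpu_map_mask[i];

    // These pages get CPU-side mappings at RW, not RW-atomic, so a later
    // silent CPU PTE upgrade cannot break the single-writer atomic model.
    printf("map at READ_WRITE: 0x%02x\n", caller_page_mask);  // prints 0x7c
    return 0;
}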

View File

@@ -1409,11 +1409,13 @@ static bool thrashing_processors_have_fast_access_to(uvm_va_space_t *va_space,
uvm_processor_mask_set(fast_to, to);
}
else {
// Include registered SMC peers and the processor 'to'.
// Include all SMC peers and the processor 'to'.
// This includes SMC peers that are not registered.
// Since not-registered peers cannot be in page_thrashing->processors,
// the value of their respective bits in "fast_to" doesn't matter.
uvm_processor_mask_range_fill(fast_to,
uvm_gpu_id_from_sub_processor(uvm_parent_gpu_id_from_gpu_id(to), 0),
UVM_PARENT_ID_MAX_SUB_PROCESSORS);
uvm_processor_mask_and(fast_to, fast_to, &va_space->registered_gpu_va_spaces);
}
return uvm_processor_mask_subset(&page_thrashing->processors, fast_to);
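A small bitmask model of the widened check (processor masks reduced to a 64-bit word; IDs and the sub-processor count are made up): fast_to is filled for every sub-processor slot of to's parent GPU, including SMC peers that never registered a GPU VA space; since page_thrashing->processors can only contain registered processors, those extra bits can never flip the subset test.

#include <stdint.h>
#include <stdio.h>

#define MAX_SUB_PROCESSORS 8  // assumption, stands in for UVM_PARENT_ID_MAX_SUB_PROCESSORS

// Set 'count' consecutive bits starting at 'first', like uvm_processor_mask_range_fill().
static uint64_t mask_range_fill(unsigned first, unsigned count)
{
    return ((1ull << count) - 1) << first;
}

int main(void)
{
    // Hypothetical layout: the parent GPU of 'to' owns processor IDs 8..15.
    uint64_t fast_to = mask_range_fill(8, MAX_SUB_PROCESSORS);

    // Thrashing processors are always registered ones, e.g. IDs 8 and 9 here,
    // so the bits for unregistered SMC peers (10..15) never affect the result.
    uint64_t thrashing = (1ull << 8) | (1ull << 9);

    printf("subset: %d\n", (thrashing & ~fast_to) == 0);  // prints 1
    return 0;
}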

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2024 NVIDIA Corporation
Copyright (c) 2015-2025 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -3030,69 +3030,23 @@ NvU32 uvm_pmm_gpu_phys_to_virt(uvm_pmm_gpu_t *pmm, NvU64 phys_addr, NvU64 region
#if UVM_IS_CONFIG_HMM()
static uvm_pmm_gpu_t *devmem_page_to_pmm(struct page *page)
{
return container_of(page->pgmap, uvm_pmm_gpu_t, devmem.pagemap);
}
static uvm_gpu_chunk_t *devmem_page_to_chunk_locked(struct page *page)
{
uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
NvU64 chunk_addr = ((NvU64)page_to_pfn(page) << PAGE_SHIFT) - pmm->devmem.pagemap.range.start;
size_t index = chunk_addr / UVM_CHUNK_SIZE_MAX;
uvm_gpu_chunk_t *root_chunk;
uvm_gpu_chunk_t *chunk;
uvm_gpu_chunk_t *parent;
uvm_chunk_size_t chunk_size;
UVM_ASSERT(index < pmm->root_chunks.count);
root_chunk = &pmm->root_chunks.array[index].chunk;
UVM_ASSERT(root_chunk->address == UVM_ALIGN_DOWN(chunk_addr, UVM_CHUNK_SIZE_MAX));
// Find the uvm_gpu_chunk_t that corresponds to the device private struct
// page's PFN. The loop is only 0, 1, or 2 iterations.
for (chunk = root_chunk;
uvm_gpu_chunk_get_size(chunk) != page_size(page);
chunk = parent->suballoc->subchunks[index]) {
parent = chunk;
UVM_ASSERT(parent->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
UVM_ASSERT(parent->suballoc);
chunk_size = uvm_gpu_chunk_get_size(parent->suballoc->subchunks[0]);
index = (size_t)uvm_div_pow2_64(chunk_addr - parent->address, chunk_size);
UVM_ASSERT(index < num_subchunks(parent));
}
UVM_ASSERT(chunk->address = chunk_addr);
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
UVM_ASSERT(chunk->is_referenced);
return chunk;
}
uvm_gpu_chunk_t *uvm_pmm_devmem_page_to_chunk(struct page *page)
{
uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
uvm_gpu_chunk_t *chunk;
UVM_ASSERT(is_device_private_page(page));
uvm_spin_lock(&pmm->list_lock);
chunk = devmem_page_to_chunk_locked(page);
uvm_spin_unlock(&pmm->list_lock);
return chunk;
return page->zone_device_data;
}
uvm_gpu_id_t uvm_pmm_devmem_page_to_gpu_id(struct page *page)
uvm_va_space_t *uvm_pmm_devmem_page_to_va_space(struct page *page)
{
uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
uvm_gpu_chunk_t *gpu_chunk = uvm_pmm_devmem_page_to_chunk(page);
UVM_ASSERT(is_device_private_page(page));
// uvm_hmm_unregister_gpu() needs to do a racy check here so
// page->zone_device_data might be NULL.
if (!gpu_chunk || !gpu_chunk->va_block)
return NULL;
return gpu->id;
return gpu_chunk->va_block->hmm.va_space;
}
// Check there are no orphan pages. This should be only called as part of
@@ -3104,12 +3058,17 @@ static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
{
size_t i;
bool ret = true;
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
unsigned long devmem_start;
unsigned long devmem_end;
unsigned long pfn;
struct range range = pmm->devmem.pagemap.range;
if (!pmm->initialized || !uvm_hmm_is_enabled_system_wide())
return ret;
devmem_start = gpu->parent->devmem->pagemap.range.start + gpu->mem_info.phys_start;
devmem_end = devmem_start + gpu->mem_info.size;
// Scan all the root chunks looking for subchunks which are still
// referenced.
for (i = 0; i < pmm->root_chunks.count; i++) {
@@ -3121,7 +3080,7 @@ static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
root_chunk_unlock(pmm, root_chunk);
}
for (pfn = __phys_to_pfn(range.start); pfn <= __phys_to_pfn(range.end); pfn++) {
for (pfn = __phys_to_pfn(devmem_start); pfn <= __phys_to_pfn(devmem_end); pfn++) {
struct page *page = pfn_to_page(pfn);
if (!is_device_private_page(page)) {
@@ -3140,9 +3099,8 @@ static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
static void devmem_page_free(struct page *page)
{
uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
uvm_gpu_chunk_t *chunk;
uvm_gpu_chunk_t *chunk = uvm_pmm_devmem_page_to_chunk(page);
uvm_gpu_t *gpu = uvm_gpu_chunk_get_gpu(chunk);
page->zone_device_data = NULL;
@@ -3150,23 +3108,22 @@ static void devmem_page_free(struct page *page)
// we may be in an interrupt context where we can't do that. Instead,
// do a lazy free. Note that we have to use a "normal" spin lock because
// the UVM context is not available.
spin_lock(&pmm->list_lock.lock);
spin_lock(&gpu->pmm.list_lock.lock);
chunk = devmem_page_to_chunk_locked(page);
UVM_ASSERT(chunk->is_referenced);
chunk->is_referenced = false;
list_add_tail(&chunk->list, &pmm->root_chunks.va_block_lazy_free);
list_add_tail(&chunk->list, &gpu->pmm.root_chunks.va_block_lazy_free);
spin_unlock(&pmm->list_lock.lock);
spin_unlock(&gpu->pmm.list_lock.lock);
nv_kthread_q_schedule_q_item(&gpu->parent->lazy_free_q,
&pmm->root_chunks.va_block_lazy_free_q_item);
&gpu->pmm.root_chunks.va_block_lazy_free_q_item);
}
// This is called by HMM when the CPU faults on a ZONE_DEVICE private entry.
static vm_fault_t devmem_fault(struct vm_fault *vmf)
{
uvm_va_space_t *va_space = vmf->page->zone_device_data;
uvm_va_space_t *va_space = uvm_pmm_devmem_page_to_va_space(vmf->page);
if (!va_space)
return VM_FAULT_SIGBUS;
@@ -3185,26 +3142,46 @@ static const struct dev_pagemap_ops uvm_pmm_devmem_ops =
.migrate_to_ram = devmem_fault_entry,
};
static NV_STATUS devmem_init(uvm_pmm_gpu_t *pmm)
// Allocating and initialising device private pages takes a significant amount
// of time on very large systems. So rather than doing that every time a GPU is
// registered, we do it once and keep track of the range when the GPU is
// unregistered for later reuse.
//
// This function tries to find an existing range of device private pages and,
// if one is available, removes it from the list and returns it for reuse.
static uvm_pmm_gpu_devmem_t *devmem_reuse_pagemap(unsigned long size)
{
unsigned long size = pmm->root_chunks.count * UVM_CHUNK_SIZE_MAX;
uvm_pmm_gpu_devmem_t *devmem = &pmm->devmem;
uvm_pmm_gpu_devmem_t *devmem;
list_for_each_entry(devmem, &g_uvm_global.devmem_ranges.list, list_node) {
if (devmem->size == size) {
list_del(&devmem->list_node);
return devmem;
}
}
return NULL;
}
static uvm_pmm_gpu_devmem_t *devmem_alloc_pagemap(unsigned long size)
{
uvm_pmm_gpu_devmem_t *devmem;
struct resource *res;
void *ptr;
NV_STATUS status;
if (!uvm_hmm_is_enabled_system_wide()) {
devmem->pagemap.owner = NULL;
return NV_OK;
}
res = request_free_mem_region(&iomem_resource, size, "nvidia-uvm-hmm");
if (IS_ERR(res)) {
UVM_ERR_PRINT("request_free_mem_region() err %ld\n", PTR_ERR(res));
status = errno_to_nv_status(PTR_ERR(res));
goto err;
return NULL;
}
devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
if (!devmem)
goto err;
devmem->size = size;
devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
devmem->pagemap.range.start = res->start;
devmem->pagemap.range.end = res->end;
@@ -3217,43 +3194,77 @@ static NV_STATUS devmem_init(uvm_pmm_gpu_t *pmm)
if (IS_ERR(ptr)) {
UVM_ERR_PRINT("memremap_pages() err %ld\n", PTR_ERR(ptr));
status = errno_to_nv_status(PTR_ERR(ptr));
goto err_release;
goto err_free;
}
return NV_OK;
return devmem;
err_free:
kfree(devmem);
err_release:
release_mem_region(res->start, resource_size(res));
err:
devmem->pagemap.owner = NULL;
return status;
release_mem_region(res->start, resource_size(res));
return NULL;
}
static void devmem_deinit(uvm_pmm_gpu_t *pmm)
NV_STATUS uvm_pmm_devmem_init(uvm_parent_gpu_t *gpu)
{
uvm_pmm_gpu_devmem_t *devmem = &pmm->devmem;
// Create a DEVICE_PRIVATE page for every GPU page available on the parent.
unsigned long size = gpu->max_allocatable_address;
if (!devmem->pagemap.owner)
if (!uvm_hmm_is_enabled_system_wide()) {
gpu->devmem = NULL;
return NV_OK;
}
gpu->devmem = devmem_reuse_pagemap(size);
if (!gpu->devmem)
gpu->devmem = devmem_alloc_pagemap(size);
if (!gpu->devmem)
return NV_ERR_NO_MEMORY;
return NV_OK;
}
void uvm_pmm_devmem_deinit(uvm_parent_gpu_t *gpu)
{
if (!gpu->devmem)
return;
memunmap_pages(&devmem->pagemap);
release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
list_add_tail(&gpu->devmem->list_node, &g_uvm_global.devmem_ranges.list);
gpu->devmem = NULL;
}
void uvm_pmm_devmem_exit(void)
{
uvm_pmm_gpu_devmem_t *devmem, *devmem_next;
list_for_each_entry_safe(devmem, devmem_next, &g_uvm_global.devmem_ranges.list, list_node) {
list_del(&devmem->list_node);
memunmap_pages(&devmem->pagemap);
release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
kfree(devmem);
}
}
unsigned long uvm_pmm_gpu_devmem_get_pfn(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
{
return (pmm->devmem.pagemap.range.start + chunk->address) >> PAGE_SHIFT;
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
unsigned long devmem_start = gpu->parent->devmem->pagemap.range.start;
return (devmem_start + chunk->address) >> PAGE_SHIFT;
}
#endif // UVM_IS_CONFIG_HMM()
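A userspace sketch of the PFN translation in uvm_pmm_gpu_devmem_get_pfn() (PAGE_SHIFT and the range start are assumptions): because the parent now backs its whole allocatable range with device private pages, a chunk's PFN is just its vidmem address offset into the pagemap range.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12  // assumption: 4K pages

// Mirrors uvm_pmm_gpu_devmem_get_pfn(): the range start comes from the parent
// GPU's pagemap, chunk_address is the chunk's vidmem physical address.
static uint64_t chunk_to_devmem_pfn(uint64_t devmem_range_start, uint64_t chunk_address)
{
    return (devmem_range_start + chunk_address) >> PAGE_SHIFT;
}

int main(void)
{
    uint64_t range_start = 0x80000000000ull;  // hypothetical pagemap start
    uint64_t chunk_addr  = 2ull << 20;        // chunk at vidmem offset 2 MB
    printf("pfn 0x%llx\n",
           (unsigned long long)chunk_to_devmem_pfn(range_start, chunk_addr));
    return 0;
}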
#if !UVM_IS_CONFIG_HMM()
static NV_STATUS devmem_init(uvm_pmm_gpu_t *pmm)
NV_STATUS uvm_pmm_devmem_init(uvm_parent_gpu_t *gpu)
{
return NV_OK;
}
static void devmem_deinit(uvm_pmm_gpu_t *pmm)
void uvm_pmm_devmem_deinit(uvm_parent_gpu_t *gpu)
{
}
@@ -3469,10 +3480,6 @@ NV_STATUS uvm_pmm_gpu_init(uvm_pmm_gpu_t *pmm)
}
}
status = devmem_init(pmm);
if (status != NV_OK)
goto cleanup;
return NV_OK;
cleanup:
uvm_pmm_gpu_deinit(pmm);
@@ -3543,8 +3550,6 @@ void uvm_pmm_gpu_deinit(uvm_pmm_gpu_t *pmm)
deinit_caches(pmm);
devmem_deinit(pmm);
pmm->initialized = false;
}
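A userspace model of the reuse scheme above (a plain singly linked list and calloc stand in for the kernel list, request_free_mem_region() and memremap_pages(); devmem_get/devmem_put are hypothetical names): ranges released at GPU teardown are cached on a global list keyed by requested size, so the expensive struct page initialisation happens at most once per size.

#include <stdio.h>
#include <stdlib.h>

struct devmem_range {
    struct devmem_range *next;
    unsigned long size;  // requested size, used as the search key
};

static struct devmem_range *g_free_list;

static struct devmem_range *devmem_get(unsigned long size)
{
    // First try to reuse a cached range with the same requested size.
    for (struct devmem_range **p = &g_free_list; *p; p = &(*p)->next) {
        if ((*p)->size == size) {
            struct devmem_range *r = *p;
            *p = r->next;
            return r;
        }
    }

    // Otherwise create a new one (stands in for devmem_alloc_pagemap()).
    struct devmem_range *r = calloc(1, sizeof(*r));
    if (r)
        r->size = size;
    return r;
}

static void devmem_put(struct devmem_range *r)
{
    // GPU removal: keep the range around instead of tearing it down.
    r->next = g_free_list;
    g_free_list = r;
}

int main(void)
{
    struct devmem_range *a = devmem_get(1ul << 30);
    devmem_put(a);
    struct devmem_range *b = devmem_get(1ul << 30);  // reuses 'a'
    printf("reused: %d\n", a == b);                  // prints 1
    free(b);
    return 0;
}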

View File

@@ -192,22 +192,41 @@ typedef struct uvm_pmm_gpu_chunk_suballoc_struct uvm_pmm_gpu_chunk_suballoc_t;
#if UVM_IS_CONFIG_HMM()
typedef struct uvm_pmm_gpu_struct uvm_pmm_gpu_t;
typedef struct
{
// For g_uvm_global.devmem_ranges
struct list_head list_node;
// Size that was requested when creating this region. This may be less than
// the size actually allocated by the kernel due to alignment constraints.
// Figuring out the required alignment at compile time is difficult due to
// unexported macros, so just use the requested size as the search key.
unsigned long size;
struct dev_pagemap pagemap;
} uvm_pmm_gpu_devmem_t;
typedef struct uvm_pmm_gpu_struct uvm_pmm_gpu_t;
// Return the GPU chunk for a given device private struct page.
uvm_gpu_chunk_t *uvm_pmm_devmem_page_to_chunk(struct page *page);
// Return the va_space for a given device private struct page.
uvm_va_space_t *uvm_pmm_devmem_page_to_va_space(struct page *page);
// Return the GPU id for a given device private struct page.
uvm_gpu_id_t uvm_pmm_devmem_page_to_gpu_id(struct page *page);
// Return the PFN of the device private struct page for the given GPU chunk.
unsigned long uvm_pmm_gpu_devmem_get_pfn(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
// Free unused ZONE_DEVICE pages.
void uvm_pmm_devmem_exit(void);
#else
static inline void uvm_pmm_devmem_exit(void)
{
}
#endif
#if defined(CONFIG_PCI_P2PDMA) && defined(NV_STRUCT_PAGE_HAS_ZONE_DEVICE_DATA)
@@ -349,10 +368,6 @@ typedef struct uvm_pmm_gpu_struct
nv_kthread_q_item_t va_block_lazy_free_q_item;
} root_chunks;
#if UVM_IS_CONFIG_HMM()
uvm_pmm_gpu_devmem_t devmem;
#endif
// Lock protecting PMA allocation, freeing and eviction
uvm_rw_semaphore_t pma_lock;
@@ -604,6 +619,10 @@ static uvm_chunk_size_t uvm_chunk_find_prev_size(uvm_chunk_sizes_mask_t chunk_si
// retained, and it's up to the caller to release them.
NvU32 uvm_pmm_gpu_phys_to_virt(uvm_pmm_gpu_t *pmm, NvU64 phys_addr, NvU64 region_size, uvm_reverse_map_t *out_mappings);
// Allocate and initialise struct page data in the kernel to support HMM.
NV_STATUS uvm_pmm_devmem_init(uvm_parent_gpu_t *gpu);
void uvm_pmm_devmem_deinit(uvm_parent_gpu_t *parent_gpu);
// Iterates over every size in the input mask from smallest to largest
#define for_each_chunk_size(__size, __chunk_sizes) \
for ((__size) = (__chunk_sizes) ? uvm_chunk_find_first_size(__chunk_sizes) : \

View File

@@ -2839,10 +2839,14 @@ static bool block_check_egm_peer(uvm_va_space_t *va_space, uvm_gpu_t *gpu, int n
remote_node_info = uvm_va_space_get_egm_numa_node_info(va_space, nid);
UVM_ASSERT(!uvm_parent_processor_mask_empty(&remote_node_info->parent_gpus));
for_each_parent_gpu_in_mask(parent_gpu, &remote_node_info->parent_gpus) {
UVM_ASSERT(parent_gpu->egm.enabled);
NvU64 page_addr = phys_addr.address;
if (phys_addr.address + parent_gpu->egm.base_address >= remote_node_info->node_start &&
phys_addr.address + parent_gpu->egm.base_address < remote_node_info->node_end &&
UVM_ASSERT(parent_gpu->egm.enabled);
page_addr += parent_gpu->egm.base_address;
if (parent_gpu->nvswitch_info.is_nvswitch_connected && gpu->parent != parent_gpu)
page_addr -= parent_gpu->nvswitch_info.egm_fabric_memory_window_start;
if (page_addr >= remote_node_info->node_start && page_addr < remote_node_info->node_end &&
remote_node_info->routing_table[uvm_parent_id_gpu_index(gpu->parent->id)] == parent_gpu) {
return true;
}
@@ -3229,8 +3233,15 @@ static uvm_gpu_phys_address_t block_phys_page_address(uvm_va_block_t *block,
if (routing_gpu) {
struct page *page = uvm_cpu_chunk_get_cpu_page(block, chunk, block_page.page_index);
phys_addr = page_to_phys(page);
aperture = uvm_gpu_egm_peer_aperture(gpu->parent, routing_gpu);
// Remote EGM routing is based on both the EGM base address and EGM
// fabric memory window.
if (routing_gpu->nvswitch_info.is_nvswitch_connected && routing_gpu != gpu->parent)
phys_addr += routing_gpu->nvswitch_info.egm_fabric_memory_window_start;
uvm_page_mask_set(&accessing_gpu_state->egm_pages, block_page.page_index);
return uvm_gpu_phys_address(aperture, phys_addr - routing_gpu->egm.base_address);
}
@@ -13575,6 +13586,9 @@ NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params,
struct page *page = block_page_get(block, block_page);
phys_addr = page_to_phys(page) - egm_routing_gpu->egm.base_address;
if (egm_routing_gpu->nvswitch_info.is_nvswitch_connected && egm_routing_gpu != gpu->parent)
phys_addr += egm_routing_gpu->nvswitch_info.egm_fabric_memory_window_start;
params->is_egm_mapping[count] = true;
}
}
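A sketch of the EGM address adjustment above (userspace arithmetic; the base, window and node addresses are all made up): a remote EGM physical address is offset by the routing GPU's EGM base, and when the routing GPU is reached over NVSwitch the peer must also add that GPU's egm_fabric_memory_window_start, so the range check in block_check_egm_peer() undoes both steps before comparing against the NUMA node's [node_start, node_end) window. The helper names below are illustrative, not the driver's.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

// Hypothetical values; the real ones come from RM and the routing parent GPU.
#define EGM_BASE_ADDRESS        0x100000000000ull
#define EGM_FABRIC_WINDOW_START 0x020000000000ull

// Mirrors the composition in block_phys_page_address(): GPU-visible EGM
// address for a CPU physical page routed through a (possibly remote) parent.
static uint64_t egm_peer_address(uint64_t cpu_phys, bool nvswitch_remote)
{
    uint64_t addr = cpu_phys;
    if (nvswitch_remote)
        addr += EGM_FABRIC_WINDOW_START;
    return addr - EGM_BASE_ADDRESS;
}

// Mirrors block_check_egm_peer(): undo the composition before comparing
// against the NUMA node's [node_start, node_end) physical range.
static bool egm_addr_in_node(uint64_t egm_addr, bool nvswitch_remote,
                             uint64_t node_start, uint64_t node_end)
{
    uint64_t page_addr = egm_addr + EGM_BASE_ADDRESS;
    if (nvswitch_remote)
        page_addr -= EGM_FABRIC_WINDOW_START;
    return page_addr >= node_start && page_addr < node_end;
}

int main(void)
{
    uint64_t node_start = 0x100000000000ull, node_end = 0x100080000000ull;
    uint64_t cpu_phys = 0x100040000000ull;  // a page inside the node above

    uint64_t egm = egm_peer_address(cpu_phys, true);
    printf("egm addr 0x%llx, back in node: %d\n",
           (unsigned long long)egm,
           egm_addr_in_node(egm, true, node_start, node_end));  // prints 1
    return 0;
}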