diff --git a/README.md b/README.md index 706e80db5..3cac64804 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # NVIDIA Linux Open GPU Kernel Module Source This is the source release of the NVIDIA Linux open GPU kernel modules, -version 570.181. +version 570.190. ## How to Build @@ -17,7 +17,7 @@ as root: Note that the kernel modules built here must be used with GSP firmware and user-space NVIDIA GPU driver components from a corresponding -570.181 driver release. This can be achieved by installing +570.190 driver release. This can be achieved by installing the NVIDIA GPU driver from the .run file using the `--no-kernel-modules` option. E.g., @@ -185,7 +185,7 @@ table below). For details on feature support and limitations, see the NVIDIA GPU driver end user README here: -https://us.download.nvidia.com/XFree86/Linux-x86_64/570.181/README/kernel_open.html +https://us.download.nvidia.com/XFree86/Linux-x86_64/570.190/README/kernel_open.html For vGPU support, please refer to the README.vgpu packaged in the vGPU Host Package for more details. @@ -970,6 +970,9 @@ Subsystem Device ID. | NVIDIA RTX PRO 4500 Blackwell | 2C31 103C 2051 | | NVIDIA RTX PRO 4500 Blackwell | 2C31 10DE 2051 | | NVIDIA RTX PRO 4500 Blackwell | 2C31 17AA 2051 | +| NVIDIA RTX PRO 4000 Blackwell SFF Edition | 2C33 1028 2053 | +| NVIDIA RTX PRO 4000 Blackwell SFF Edition | 2C33 103C 2053 | +| NVIDIA RTX PRO 4000 Blackwell SFF Edition | 2C33 17AA 2053 | | NVIDIA RTX PRO 4000 Blackwell | 2C34 1028 2052 | | NVIDIA RTX PRO 4000 Blackwell | 2C34 103C 2052 | | NVIDIA RTX PRO 4000 Blackwell | 2C34 10DE 2052 | @@ -982,6 +985,9 @@ Subsystem Device ID. | NVIDIA GeForce RTX 5060 | 2D05 | | NVIDIA GeForce RTX 5070 Laptop GPU | 2D18 | | NVIDIA GeForce RTX 5060 Laptop GPU | 2D19 | +| NVIDIA RTX PRO 2000 Blackwell | 2D30 1028 2054 | +| NVIDIA RTX PRO 2000 Blackwell | 2D30 103C 2054 | +| NVIDIA RTX PRO 2000 Blackwell | 2D30 17AA 2054 | | NVIDIA RTX PRO 2000 Blackwell Generation Laptop GPU | 2D39 | | NVIDIA GeForce RTX 5070 Laptop GPU | 2D58 | | NVIDIA GeForce RTX 5060 Laptop GPU | 2D59 | diff --git a/kernel-open/Kbuild b/kernel-open/Kbuild index 2919972c7..00c027771 100644 --- a/kernel-open/Kbuild +++ b/kernel-open/Kbuild @@ -79,7 +79,7 @@ ccflags-y += -I$(src)/common/inc ccflags-y += -I$(src) ccflags-y += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-format-extra-args ccflags-y += -D__KERNEL__ -DMODULE -DNVRM -ccflags-y += -DNV_VERSION_STRING=\"570.181\" +ccflags-y += -DNV_VERSION_STRING=\"570.190\" ifneq ($(SYSSRCHOST1X),) ccflags-y += -I$(SYSSRCHOST1X) diff --git a/kernel-open/conftest.sh b/kernel-open/conftest.sh index 98faac1b4..5dd4a4ff6 100755 --- a/kernel-open/conftest.sh +++ b/kernel-open/conftest.sh @@ -4071,6 +4071,43 @@ compile_test() { fi ;; + drm_fb_create_takes_format_info) + # + # Determine if a `struct drm_format_info *` is passed into + # the .fb_create callback. If so, it will have 4 arguments. + # This parameter was added in commit 81112eaac559 ("drm: + # Pass the format info to .fb_create") in linux-next + # (2025-07-16) + CODE=" + #include + #include + + static const struct drm_mode_config_funcs funcs; + void conftest_drm_fb_create_takes_format_info(void) { + funcs.fb_create(NULL, NULL, NULL, NULL); + }" + + compile_check_conftest "$CODE" "NV_DRM_FB_CREATE_TAKES_FORMAT_INFO" "" "types" + ;; + + drm_fill_fb_struct_takes_format_info) + # + # Determine if a `struct drm_format_info *` is passed into + # drm_helper_mode_fill_fb_struct(). If so, it will have 4 arguments. 
+ # This parameter was added in commit a34cc7bf1034 ("drm: + # Allow the caller to pass in the format info to + # drm_helper_mode_fill_fb_struct()") in linux-next + # (2025-07-16) + CODE=" + #include + + void conftest_drm_fill_fb_struct_takes_format_info(void) { + drm_helper_mode_fill_fb_struct(NULL, NULL, NULL, NULL); + }" + + compile_check_conftest "$CODE" "NV_DRM_FILL_FB_STRUCT_TAKES_FORMAT_INFO" "" "types" + ;; + drm_connector_funcs_have_mode_in_name) # # Determine if _mode_ is present in connector function names. We diff --git a/kernel-open/nvidia-drm/nvidia-drm-drv.c b/kernel-open/nvidia-drm/nvidia-drm-drv.c index 89087d7b3..e0df73201 100644 --- a/kernel-open/nvidia-drm/nvidia-drm-drv.c +++ b/kernel-open/nvidia-drm/nvidia-drm-drv.c @@ -209,11 +209,14 @@ static void nv_drm_output_poll_changed(struct drm_device *dev) static struct drm_framebuffer *nv_drm_framebuffer_create( struct drm_device *dev, struct drm_file *file, - #if defined(NV_DRM_HELPER_MODE_FILL_FB_STRUCT_HAS_CONST_MODE_CMD_ARG) +#if defined(NV_DRM_FB_CREATE_TAKES_FORMAT_INFO) + const struct drm_format_info *info, +#endif +#if defined(NV_DRM_HELPER_MODE_FILL_FB_STRUCT_HAS_CONST_MODE_CMD_ARG) const struct drm_mode_fb_cmd2 *cmd - #else +#else struct drm_mode_fb_cmd2 *cmd - #endif +#endif ) { struct drm_mode_fb_cmd2 local_cmd; @@ -224,11 +227,14 @@ static struct drm_framebuffer *nv_drm_framebuffer_create( fb = nv_drm_internal_framebuffer_create( dev, file, +#if defined(NV_DRM_FB_CREATE_TAKES_FORMAT_INFO) + info, +#endif &local_cmd); - #if !defined(NV_DRM_HELPER_MODE_FILL_FB_STRUCT_HAS_CONST_MODE_CMD_ARG) +#if !defined(NV_DRM_HELPER_MODE_FILL_FB_STRUCT_HAS_CONST_MODE_CMD_ARG) *cmd = local_cmd; - #endif +#endif return fb; } @@ -2046,13 +2052,13 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info) #endif nvKms->framebufferConsoleDisabled(nv_dev->pDevice); } - #if defined(NV_DRM_CLIENT_AVAILABLE) +#if defined(NV_DRM_CLIENT_AVAILABLE) drm_client_setup(dev, NULL); - #elif defined(NV_DRM_FBDEV_TTM_AVAILABLE) +#elif defined(NV_DRM_FBDEV_TTM_AVAILABLE) drm_fbdev_ttm_setup(dev, 32); - #elif defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) +#elif defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) drm_fbdev_generic_setup(dev, 32); - #endif +#endif } #endif /* defined(NV_DRM_FBDEV_AVAILABLE) */ diff --git a/kernel-open/nvidia-drm/nvidia-drm-fb.c b/kernel-open/nvidia-drm/nvidia-drm-fb.c index e88dc948d..4dc41054f 100644 --- a/kernel-open/nvidia-drm/nvidia-drm-fb.c +++ b/kernel-open/nvidia-drm/nvidia-drm-fb.c @@ -220,6 +220,9 @@ fail: struct drm_framebuffer *nv_drm_internal_framebuffer_create( struct drm_device *dev, struct drm_file *file, +#if defined(NV_DRM_FB_CREATE_TAKES_FORMAT_INFO) + const struct drm_format_info *info, +#endif struct drm_mode_fb_cmd2 *cmd) { struct nv_drm_device *nv_dev = to_nv_device(dev); @@ -273,6 +276,9 @@ struct drm_framebuffer *nv_drm_internal_framebuffer_create( dev, #endif &nv_fb->base, + #if defined(NV_DRM_FB_CREATE_TAKES_FORMAT_INFO) + info, + #endif cmd); /* diff --git a/kernel-open/nvidia-drm/nvidia-drm-fb.h b/kernel-open/nvidia-drm/nvidia-drm-fb.h index 40445665e..b2d8b8664 100644 --- a/kernel-open/nvidia-drm/nvidia-drm-fb.h +++ b/kernel-open/nvidia-drm/nvidia-drm-fb.h @@ -84,6 +84,9 @@ static inline void nv_fb_set_gem_obj( struct drm_framebuffer *nv_drm_internal_framebuffer_create( struct drm_device *dev, struct drm_file *file, +#if defined(NV_DRM_FB_CREATE_TAKES_FORMAT_INFO) + const struct drm_format_info *info, +#endif struct drm_mode_fb_cmd2 *cmd); #endif /* NV_DRM_ATOMIC_MODESET_AVAILABLE */ diff --git 
a/kernel-open/nvidia-drm/nvidia-drm-modeset.c b/kernel-open/nvidia-drm/nvidia-drm-modeset.c index 6ed769615..adaee1148 100644 --- a/kernel-open/nvidia-drm/nvidia-drm-modeset.c +++ b/kernel-open/nvidia-drm/nvidia-drm-modeset.c @@ -703,6 +703,13 @@ int nv_drm_atomic_commit(struct drm_device *dev, #else drm_atomic_helper_swap_state(dev, state); #endif + /* + * Used to update legacy modeset state pointers to support UAPIs not updated + * by the core atomic modeset infrastructure. + * + * Example: /sys/class/drm//enabled + */ + drm_atomic_helper_update_legacy_modeset_state(dev, state); /* * nv_drm_atomic_commit_internal() must not return failure after diff --git a/kernel-open/nvidia-drm/nvidia-drm-sources.mk b/kernel-open/nvidia-drm/nvidia-drm-sources.mk index c5969d4f0..4dd98e58a 100644 --- a/kernel-open/nvidia-drm/nvidia-drm-sources.mk +++ b/kernel-open/nvidia-drm/nvidia-drm-sources.mk @@ -148,3 +148,4 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += drm_output_poll_changed NV_CONFTEST_TYPE_COMPILE_TESTS += drm_driver_has_date NV_CONFTEST_TYPE_COMPILE_TESTS += file_operations_fop_unsigned_offset_present NV_CONFTEST_TYPE_COMPILE_TESTS += drm_connector_helper_funcs_mode_valid_has_const_mode_arg +NV_CONFTEST_TYPE_COMPILE_TESTS += drm_fb_create_takes_format_info diff --git a/kernel-open/nvidia-uvm/uvm_hmm.c b/kernel-open/nvidia-uvm/uvm_hmm.c index 6f904aa6f..cd77a73b5 100644 --- a/kernel-open/nvidia-uvm/uvm_hmm.c +++ b/kernel-open/nvidia-uvm/uvm_hmm.c @@ -91,10 +91,6 @@ static __always_inline bool nv_PageSwapCache(struct page *page) #endif } -static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block, - uvm_page_index_t page_index, - struct page *page); - typedef struct { uvm_processor_id_t processor_id; @@ -269,6 +265,7 @@ void uvm_hmm_va_space_initialize(uvm_va_space_t *va_space) uvm_range_tree_init(&hmm_va_space->blocks); uvm_mutex_init(&hmm_va_space->blocks_lock, UVM_LOCK_ORDER_LEAF); + atomic64_set(&hmm_va_space->allocated_page_count, 0); return; } @@ -348,14 +345,20 @@ void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_ for (pfn = __phys_to_pfn(devmem_start); pfn <= __phys_to_pfn(devmem_end); pfn++) { struct page *page = pfn_to_page(pfn); + // No need to keep scanning if no HMM pages are allocated for this + // va_space. + if (!atomic64_read(&va_space->hmm.allocated_page_count)) + break; + UVM_ASSERT(is_device_private_page(page)); // This check is racy because nothing stops the page being freed and // even reused. That doesn't matter though - worst case the // migration fails, we retry and find the va_space doesn't match. 
- if (uvm_pmm_devmem_page_to_va_space(page) == va_space) + if (uvm_pmm_devmem_page_to_va_space(page) == va_space) { if (uvm_hmm_pmm_gpu_evict_pfn(pfn) != NV_OK) retry = true; + } } } while (retry); @@ -945,7 +948,6 @@ static NV_STATUS split_block_if_needed(uvm_va_block_t *va_block, uvm_va_space_t *va_space; struct mm_struct *mm; struct vm_area_struct *vma; - uvm_va_block_region_t region; NvU64 addr, from, to; uvm_va_block_t *new; NV_STATUS status = NV_OK; @@ -987,7 +989,6 @@ static NV_STATUS split_block_if_needed(uvm_va_block_t *va_block, from = max(addr, (NvU64)vma->vm_start); to = min(va_block->end, (NvU64)vma->vm_end - 1); - region = uvm_va_block_region_from_start_end(va_block, from, to); if (!uvm_hmm_vma_is_valid(vma, from, false)) continue; @@ -1344,6 +1345,7 @@ void uvm_hmm_block_add_eviction_mappings(uvm_va_space_t *va_space, uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); uvm_va_policy_node_t *node; uvm_va_block_region_t region; + const uvm_va_policy_t *policy; uvm_processor_mask_t *map_processors = &block_context->hmm.map_processors_eviction; uvm_processor_id_t id; NV_STATUS tracker_status; @@ -1355,8 +1357,8 @@ void uvm_hmm_block_add_eviction_mappings(uvm_va_space_t *va_space, uvm_mutex_lock(&va_block->lock); - uvm_for_each_va_policy_node_in(node, va_block, va_block->start, va_block->end) { - for_each_id_in_mask(id, &node->policy.accessed_by) { + uvm_for_each_va_policy_in(policy, va_block, va_block->start, va_block->end, node, region) { + for_each_id_in_mask(id, &policy->accessed_by) { status = hmm_set_accessed_by_start_end_locked(va_block, block_context, id, @@ -1371,7 +1373,7 @@ void uvm_hmm_block_add_eviction_mappings(uvm_va_space_t *va_space, // Exclude the processors that have been already mapped due to // AccessedBy. - uvm_processor_mask_andnot(map_processors, &va_block->evicted_gpus, &node->policy.accessed_by); + uvm_processor_mask_andnot(map_processors, &va_block->evicted_gpus, &policy->accessed_by); for_each_gpu_id_in_mask(id, map_processors) { uvm_gpu_t *gpu = uvm_gpu_get(id); @@ -1604,7 +1606,7 @@ static NV_STATUS hmm_va_block_cpu_page_populate(uvm_va_block_t *va_block, status = uvm_va_block_map_cpu_chunk_on_gpus(va_block, chunk); if (status != NV_OK) { - uvm_cpu_chunk_remove_from_block(va_block, page_to_nid(page), page_index); + uvm_cpu_chunk_remove_from_block(va_block, chunk, page_to_nid(page), page_index); uvm_cpu_chunk_free(chunk); } @@ -1623,7 +1625,7 @@ static void hmm_va_block_cpu_unpopulate_chunk(uvm_va_block_t *va_block, !uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index)); UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == PAGE_SIZE); - uvm_cpu_chunk_remove_from_block(va_block, chunk_nid, page_index); + uvm_cpu_chunk_remove_from_block(va_block, chunk, chunk_nid, page_index); uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk); uvm_cpu_chunk_free(chunk); } @@ -1648,14 +1650,45 @@ static void hmm_va_block_cpu_page_unpopulate(uvm_va_block_t *va_block, uvm_page_ } } -static bool hmm_va_block_cpu_page_is_same(uvm_va_block_t *va_block, - uvm_page_index_t page_index, - struct page *page) +// Insert the given sysmem page. +// Note that we might have a driver allocated sysmem page for staged GPU to GPU +// copies and that Linux may independently have allocated a page. +// If so, we have to free the driver page and use the one from Linux. 
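+// Newly mirrored pages are recorded in populated_page_mask (when non-NULL) so
+// that hmm_block_cpu_fault_locked() can remote map any pages that end up not
+// migrating.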
+static NV_STATUS hmm_va_block_cpu_page_insert_or_replace(uvm_va_block_t *va_block, + uvm_page_index_t page_index, + struct page *page, + uvm_page_mask_t *populated_page_mask) { - struct page *old_page = uvm_va_block_get_cpu_page(va_block, page_index); + NV_STATUS status; - UVM_ASSERT(uvm_cpu_chunk_is_hmm(uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(page), page_index))); - return old_page == page; + if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) { + uvm_cpu_chunk_t *cpu_chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(page), page_index); + + // Check to see if the CPU chunk already refers to the given page. + if (cpu_chunk && + uvm_cpu_chunk_is_hmm(cpu_chunk) && + uvm_cpu_chunk_get_cpu_page(va_block, cpu_chunk, page_index) == page) { + + UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU)); + UVM_ASSERT(uvm_va_block_cpu_is_page_resident_on(va_block, page_to_nid(page), page_index)); + + return NV_OK; + } + + // A driver allocated CPU chunk could have a different NUMA node ID. + hmm_va_block_cpu_page_unpopulate(va_block, page_index, NULL); + } + + status = hmm_va_block_cpu_page_populate(va_block, page_index, page); + if (status != NV_OK) + return status; + + // Record that we populated this page. hmm_block_cpu_fault_locked() + // uses this to ensure pages that don't migrate get remote mapped. + if (populated_page_mask) + uvm_page_mask_set(populated_page_mask, page_index); + + return NV_OK; } // uvm_va_block_service_copy() and uvm_va_block_service_finish() expect the @@ -1709,6 +1742,67 @@ static void cpu_mapping_clear(uvm_va_block_t *va_block, uvm_page_index_t page_in uvm_processor_mask_clear(&va_block->mapped, UVM_ID_CPU); } +static void gpu_chunk_free(uvm_va_block_t *va_block, + uvm_va_block_retry_t *va_block_retry, + uvm_va_block_gpu_state_t *gpu_state, + uvm_page_index_t page_index) +{ + uvm_gpu_chunk_t *gpu_chunk = gpu_state->chunks[page_index]; + + if (gpu_chunk->state != UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED || gpu_chunk->is_referenced) + return; + + UVM_ASSERT(gpu_chunk->va_block == va_block); + UVM_ASSERT(gpu_chunk->va_block_page_index == page_index); + + uvm_mmu_chunk_unmap(gpu_chunk, &va_block->tracker); + gpu_state->chunks[page_index] = NULL; + if (va_block_retry) { + list_move_tail(&gpu_chunk->list, &va_block_retry->free_chunks); + } + else { + list_del_init(&gpu_chunk->list); + uvm_pmm_gpu_free(&uvm_gpu_chunk_get_gpu(gpu_chunk)->pmm, gpu_chunk, NULL); + } +} + +static void gpu_chunk_free_region(uvm_va_block_t *va_block, + uvm_va_block_retry_t *va_block_retry, + uvm_gpu_id_t gpu_id, + uvm_va_block_region_t region, + const uvm_page_mask_t *page_mask) +{ + uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id); + uvm_page_index_t page_index; + + for_each_va_block_page_in_region_mask(page_index, page_mask, region) + gpu_chunk_free(va_block, va_block_retry, gpu_state, page_index); +} + +static void gpu_chunk_free_preallocated(uvm_va_block_t *va_block, + uvm_va_block_retry_t *va_block_retry) +{ + uvm_gpu_chunk_t *gpu_chunk, *next_chunk; + + list_for_each_entry_safe(gpu_chunk, next_chunk, &va_block_retry->used_chunks, list) { + uvm_gpu_t *gpu = uvm_gpu_chunk_get_gpu(gpu_chunk); + uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); + uvm_page_index_t page_index = gpu_chunk->va_block_page_index; + + UVM_ASSERT(gpu_state); + + UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED); + UVM_ASSERT(gpu_chunk->va_block == va_block); + 
UVM_ASSERT(!gpu_chunk->is_referenced); + + uvm_mmu_chunk_unmap(gpu_chunk, &va_block->tracker); + gpu_state->chunks[page_index] = NULL; + + list_del_init(&gpu_chunk->list); + uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL); + } +} + static void gpu_chunk_remove(uvm_va_block_t *va_block, uvm_page_index_t page_index, struct page *page) @@ -1717,20 +1811,23 @@ static void gpu_chunk_remove(uvm_va_block_t *va_block, uvm_gpu_chunk_t *gpu_chunk; uvm_gpu_id_t id; - id = uvm_gpu_chunk_get_gpu(uvm_pmm_devmem_page_to_chunk(page))->id; + gpu_chunk = uvm_pmm_devmem_page_to_chunk(page); + id = uvm_gpu_chunk_get_gpu(gpu_chunk)->id; gpu_state = uvm_va_block_gpu_state_get(va_block, id); UVM_ASSERT(gpu_state); - gpu_chunk = gpu_state->chunks[page_index]; - if (!gpu_chunk) { + if (!gpu_state->chunks[page_index]) { // If we didn't find a chunk it's because the page was unmapped for // mremap and no fault has established a new mapping. UVM_ASSERT(!uvm_page_mask_test(&gpu_state->resident, page_index)); return; } - UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED); + UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED || + gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED); + UVM_ASSERT(gpu_chunk->va_block == va_block); UVM_ASSERT(gpu_chunk->is_referenced); + UVM_ASSERT(gpu_chunk == gpu_state->chunks[page_index]); uvm_page_mask_clear(&gpu_state->resident, page_index); @@ -1739,42 +1836,42 @@ static void gpu_chunk_remove(uvm_va_block_t *va_block, } static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block, + uvm_va_block_retry_t *va_block_retry, uvm_page_index_t page_index, struct page *page) { uvm_va_block_gpu_state_t *gpu_state; uvm_gpu_chunk_t *gpu_chunk; - uvm_gpu_id_t id; + uvm_gpu_t *gpu; NV_STATUS status; - id = uvm_gpu_chunk_get_gpu(uvm_pmm_devmem_page_to_chunk(page))->id; - gpu_state = uvm_va_block_gpu_state_get(va_block, id); + gpu_chunk = uvm_pmm_devmem_page_to_chunk(page); + gpu = uvm_gpu_chunk_get_gpu(gpu_chunk); + gpu_state = uvm_va_block_gpu_state_get_alloc(va_block, gpu); - // It's possible that this is a fresh va_block we're trying to add an - // existing gpu_chunk to. This occurs for example when a GPU faults on a - // virtual address that has been remapped with mremap(). - if (!gpu_state) { - status = uvm_va_block_gpu_state_alloc(va_block); - if (status != NV_OK) - return status; - gpu_state = uvm_va_block_gpu_state_get(va_block, id); - } - - UVM_ASSERT(gpu_state); + if (!gpu_state) + return NV_ERR_NO_MEMORY; // Note that a mremap() might be to a CPU virtual address that is nolonger // aligned with a larger GPU chunk size. We would need to allocate a new // aligned GPU chunk and copy from old to new. // TODO: Bug 3368756: add support for large GPU pages. - gpu_chunk = uvm_pmm_devmem_page_to_chunk(page); UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED); UVM_ASSERT(gpu_chunk->is_referenced); UVM_ASSERT(uvm_pmm_devmem_page_to_va_space(page) == va_block->hmm.va_space); - if (gpu_state->chunks[page_index] == gpu_chunk) + if (gpu_state->chunks[page_index] == gpu_chunk) { + UVM_ASSERT(gpu_chunk->va_block == va_block); + UVM_ASSERT(gpu_chunk->va_block_page_index == page_index); return NV_OK; + } - UVM_ASSERT(!gpu_state->chunks[page_index]); + if (gpu_state->chunks[page_index]) { + // In the mremap() case, if we pre-allocated a new GPU chunk for the + // destination of a potential migration but we need to free it because + // we are replacing it with the old chunk from the mremap() source. 
+ gpu_chunk_free(va_block, va_block_retry, gpu_state, page_index); + } // In some configurations such as SR-IOV heavy, the chunk cannot be // referenced using its physical address. Create a virtual mapping. @@ -1782,7 +1879,7 @@ static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block, if (status != NV_OK) return status; - uvm_processor_mask_set(&va_block->resident, id); + uvm_processor_mask_set(&va_block->resident, gpu->id); uvm_page_mask_set(&gpu_state->resident, page_index); // It is safe to modify the page index field without holding any PMM locks @@ -1817,34 +1914,50 @@ static NV_STATUS sync_page_and_chunk_state(uvm_va_block_t *va_block, // Wait for the GPU to finish. migrate_vma_finalize() will release the // migrated source pages (or non migrating destination pages), so GPU - // opererations must be finished by then. + // opererations must be finished by then. Also, we unmap the source or + // destination so DMAs must be complete before DMA unmapping. status = uvm_tracker_wait(&va_block->tracker); for_each_va_block_page_in_region(page_index, region) { - struct page *page; + struct page *src_page; + struct page *dst_page; if (uvm_page_mask_test(same_devmem_page_mask, page_index)) continue; - // If a page migrated, clean up the source page. - // Otherwise, clean up the destination page. - if (uvm_page_mask_test(migrated_pages, page_index)) - page = migrate_pfn_to_page(src_pfns[page_index]); - else - page = migrate_pfn_to_page(dst_pfns[page_index]); - - if (!page) - continue; - - if (is_device_private_page(page)) { - gpu_chunk_remove(va_block, page_index, page); + // If the source page migrated, we have to remove our pointers to it + // because migrate_vma_finalize() will release the reference. + // TODO: Bug 3660922: Need to handle read duplication at some point. + src_page = migrate_pfn_to_page(src_pfns[page_index]); + if (src_page && uvm_page_mask_test(migrated_pages, page_index)) { + if (is_device_private_page(src_page)) + gpu_chunk_remove(va_block, page_index, src_page); + else + hmm_va_block_cpu_page_unpopulate(va_block, page_index, src_page); } - else { - // If the source page is a system memory page, - // migrate_vma_finalize() will release the reference so we should - // clear our pointer to it. - // TODO: Bug 3660922: Need to handle read duplication at some point. - hmm_va_block_cpu_page_unpopulate(va_block, page_index, page); + + dst_page = migrate_pfn_to_page(dst_pfns[page_index]); + if (dst_page) { + if (is_device_private_page(dst_page)) { + uvm_gpu_chunk_t *gpu_chunk = uvm_pmm_devmem_page_to_chunk(dst_page); + + UVM_ASSERT(gpu_chunk); + UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED || + gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED); + UVM_ASSERT(gpu_chunk->is_referenced); + + // If a page migrated to the GPU, we have to unpin the + // gpu_chunk. Otherwise, clear pointers to temporary pinned + // pages that aren't migrating. + if (uvm_page_mask_test(migrated_pages, page_index)) + uvm_pmm_gpu_unpin_allocated(&uvm_gpu_chunk_get_gpu(gpu_chunk)->pmm, gpu_chunk, va_block); + else + gpu_chunk_remove(va_block, page_index, dst_page); + } + else if (!uvm_page_mask_test(migrated_pages, page_index)) { + // Clear pointer to sysmem page that will be released. + hmm_va_block_cpu_page_unpopulate(va_block, page_index, dst_page); + } } } @@ -1853,7 +1966,6 @@ static NV_STATUS sync_page_and_chunk_state(uvm_va_block_t *va_block, // Update va_block state to reflect that the page isn't migrating. 
static void clean_up_non_migrating_page(uvm_va_block_t *va_block, - const unsigned long *src_pfns, unsigned long *dst_pfns, uvm_page_index_t page_index) { @@ -1879,10 +1991,9 @@ static void clean_up_non_migrating_page(uvm_va_block_t *va_block, } static void clean_up_non_migrating_pages(uvm_va_block_t *va_block, - const unsigned long *src_pfns, unsigned long *dst_pfns, uvm_va_block_region_t region, - uvm_page_mask_t *page_mask) + const uvm_page_mask_t *page_mask) { uvm_page_index_t page_index; NV_STATUS status; @@ -1891,23 +2002,47 @@ static void clean_up_non_migrating_pages(uvm_va_block_t *va_block, UVM_ASSERT(status == NV_OK); for_each_va_block_page_in_region_mask(page_index, page_mask, region) { - clean_up_non_migrating_page(va_block, src_pfns, dst_pfns, page_index); + clean_up_non_migrating_page(va_block, dst_pfns, page_index); } } // CPU page fault handling. -// Fill in the dst_pfns[page_index] entry given that there is an allocated -// CPU page. -static void lock_block_cpu_page(uvm_va_block_t *va_block, - uvm_page_index_t page_index, - struct page *src_page, - unsigned long *dst_pfns, - uvm_page_mask_t *same_devmem_page_mask) +// Fill in the dst_pfns[page_index] entry with a CPU page. +// The src_pfns[page_index] page, if present, is page locked. +static NV_STATUS alloc_page_on_cpu(uvm_va_block_t *va_block, + uvm_va_block_retry_t *va_block_retry, + uvm_page_index_t page_index, + const unsigned long *src_pfns, + unsigned long *dst_pfns, + uvm_page_mask_t *same_devmem_page_mask, + uvm_va_block_context_t *block_context) { - uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_any_chunk_for_page(va_block, page_index); - uvm_va_block_region_t chunk_region; + struct page *src_page; struct page *dst_page; + uvm_cpu_chunk_t *chunk; + uvm_va_block_region_t chunk_region; + + if (!uvm_page_mask_test(&va_block->cpu.allocated, page_index)) { + NV_STATUS status; + + UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) || + !uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index)); + + status = uvm_va_block_populate_page_cpu(va_block, page_index, block_context); + if (status != NV_OK) + return status; + } + + // This is the page that will be copied to system memory. + src_page = migrate_pfn_to_page(src_pfns[page_index]); + + // mremap may have caused us to lose the gpu_chunk associated with + // this va_block/page_index so make sure we have the correct chunk. + if (src_page && is_device_private_page(src_page)) + gpu_chunk_add(va_block, va_block_retry, page_index, src_page); + + chunk = uvm_cpu_chunk_get_any_chunk_for_page(va_block, page_index); UVM_ASSERT(chunk); UVM_ASSERT(chunk->page); @@ -1923,61 +2058,63 @@ static void lock_block_cpu_page(uvm_va_block_t *va_block, // remote mapped system memory page. It could also be a driver allocated // page for GPU-to-GPU staged copies (i.e., not a resident copy and owned // by the driver). - if (is_device_private_page(src_page)) { - // Since the page isn't mirrored, it was allocated by alloc_pages() + if (!src_page || is_device_private_page(src_page)) { + UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) || + !uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index)); + + // If the page isn't mirrored, it was allocated by alloc_pages() // and UVM owns the reference. We leave the reference count unchanged // and mark the page pointer as mirrored since UVM is transferring // ownership to Linux and we don't want UVM to double free the page in // hmm_va_block_cpu_page_unpopulate() or block_kill(). 
If the page // does not migrate, it will be freed though. - UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) || - !uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index)); - UVM_ASSERT(chunk->type == UVM_CPU_CHUNK_TYPE_PHYSICAL); - UVM_ASSERT(page_ref_count(dst_page) == 1); - uvm_cpu_chunk_make_hmm(chunk); + if (chunk->type == UVM_CPU_CHUNK_TYPE_PHYSICAL) { + UVM_ASSERT(page_ref_count(dst_page) == 1); + uvm_cpu_chunk_make_hmm(chunk); + } + + lock_page(dst_page); + dst_pfns[page_index] = migrate_pfn(page_to_pfn(dst_page)); } else { + if (src_page != dst_page) { + // This must be a driver allocated staging page that doesn't match + // the page that migrate_vma_setup() locked. + hmm_va_block_cpu_unpopulate_chunk(va_block, chunk, page_to_nid(dst_page), page_index); + hmm_va_block_cpu_page_populate(va_block, page_index, src_page); + } + + UVM_ASSERT(uvm_cpu_chunk_is_hmm(chunk)); UVM_ASSERT(same_devmem_page_mask); - UVM_ASSERT(src_page == dst_page); uvm_page_mask_set(same_devmem_page_mask, page_index); // The call to migrate_vma_setup() will have inserted a migration PTE // so the CPU has no access. cpu_mapping_clear(va_block, page_index); - return; } - lock_page(dst_page); - dst_pfns[page_index] = migrate_pfn(page_to_pfn(dst_page)); -} - -static void hmm_mark_gpu_chunk_referenced(uvm_va_block_t *va_block, - uvm_gpu_t *gpu, - uvm_gpu_chunk_t *gpu_chunk) -{ - // Tell PMM to expect a callback from Linux to free the page since the - // device private struct page reference count will determine when the - // GPU chunk is free. - UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED); - list_del_init(&gpu_chunk->list); - uvm_pmm_gpu_unpin_referenced(&gpu->pmm, gpu_chunk, va_block); + return NV_OK; } static void fill_dst_pfn(uvm_va_block_t *va_block, + uvm_va_block_gpu_state_t *gpu_state, uvm_gpu_t *gpu, const unsigned long *src_pfns, unsigned long *dst_pfns, uvm_page_index_t page_index, + const uvm_page_mask_t *page_mask, uvm_page_mask_t *same_devmem_page_mask) { unsigned long src_pfn = src_pfns[page_index]; - uvm_gpu_chunk_t *gpu_chunk; + uvm_gpu_chunk_t *gpu_chunk = gpu_state->chunks[page_index]; unsigned long pfn; struct page *dpage; - gpu_chunk = uvm_va_block_lookup_gpu_chunk(va_block, gpu, uvm_va_block_cpu_page_address(va_block, page_index)); UVM_ASSERT(gpu_chunk); + UVM_ASSERT(uvm_gpu_chunk_is_user(gpu_chunk)); UVM_ASSERT(gpu_chunk->log2_size == PAGE_SHIFT); + UVM_ASSERT(gpu_chunk->va_block == va_block); + pfn = uvm_pmm_gpu_devmem_get_pfn(&gpu->pmm, gpu_chunk); // If the same GPU page is both source and destination, migrate_vma_pages() @@ -1985,6 +2122,8 @@ static void fill_dst_pfn(uvm_va_block_t *va_block, // mark it as not migrating but we keep track of this so we don't confuse // it with a page that migrate_vma_pages() actually does not migrate. 
if ((src_pfn & MIGRATE_PFN_VALID) && (src_pfn >> MIGRATE_PFN_SHIFT) == pfn) { + UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED); + UVM_ASSERT(gpu_chunk->is_referenced); uvm_page_mask_set(same_devmem_page_mask, page_index); return; } @@ -1993,90 +2132,32 @@ static void fill_dst_pfn(uvm_va_block_t *va_block, UVM_ASSERT(is_device_private_page(dpage)); UVM_ASSERT(page_pgmap(dpage)->owner == &g_uvm_global); - hmm_mark_gpu_chunk_referenced(va_block, gpu, gpu_chunk); - UVM_ASSERT(!page_count(dpage)); - zone_device_page_init(dpage); - dpage->zone_device_data = gpu_chunk; + if (gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) { + UVM_ASSERT(!gpu_chunk->is_referenced); + gpu_chunk->is_referenced = true; + + // Remove the GPU chunk from the retry->used_chunks list. + list_del_init(&gpu_chunk->list); + + UVM_ASSERT(!page_count(dpage)); + UVM_ASSERT(!dpage->zone_device_data); + zone_device_page_init(dpage); + dpage->zone_device_data = gpu_chunk; + atomic64_inc(&va_block->hmm.va_space->hmm.allocated_page_count); + } + else { + UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED); + UVM_ASSERT(gpu_chunk->is_referenced); + UVM_ASSERT(uvm_pmm_devmem_page_to_chunk(dpage) == gpu_chunk); + UVM_ASSERT(page_count(dpage) == 1); + } dst_pfns[page_index] = migrate_pfn(pfn); } -static void fill_dst_pfns(uvm_va_block_t *va_block, - const unsigned long *src_pfns, - unsigned long *dst_pfns, - uvm_va_block_region_t region, - uvm_page_mask_t *page_mask, - uvm_page_mask_t *same_devmem_page_mask, - uvm_processor_id_t dest_id) -{ - uvm_gpu_t *gpu = uvm_gpu_get(dest_id); - uvm_page_index_t page_index; - - uvm_page_mask_zero(same_devmem_page_mask); - - for_each_va_block_page_in_region_mask(page_index, page_mask, region) { - if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE)) - continue; - - fill_dst_pfn(va_block, - gpu, - src_pfns, - dst_pfns, - page_index, - same_devmem_page_mask); - } -} - -static NV_STATUS alloc_page_on_cpu(uvm_va_block_t *va_block, - uvm_page_index_t page_index, - const unsigned long *src_pfns, - unsigned long *dst_pfns, - uvm_page_mask_t *same_devmem_page_mask, - uvm_va_block_context_t *block_context) -{ - NV_STATUS status; - struct page *src_page; - struct page *dst_page; - - // This is the page that will be copied to system memory. - src_page = migrate_pfn_to_page(src_pfns[page_index]); - - if (src_page) { - // mremap may have caused us to lose the gpu_chunk associated with - // this va_block/page_index so make sure we have the correct chunk. - if (is_device_private_page(src_page)) - gpu_chunk_add(va_block, page_index, src_page); - - if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) { - lock_block_cpu_page(va_block, page_index, src_page, dst_pfns, same_devmem_page_mask); - return NV_OK; - } - } - - UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) || - !uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index)); - - status = uvm_va_block_populate_page_cpu(va_block, page_index, block_context); - if (status != NV_OK) - return status; - - // TODO: Bug 3368756: add support for transparent huge pages - // Support for large CPU pages means the page_index may need fixing - dst_page = migrate_pfn_to_page(block_context->hmm.dst_pfns[page_index]); - - // Note that we don't call get_page(dst_page) since alloc_page_vma() - // returns with a page reference count of one and we are passing - // ownership to Linux. 
Also, uvm_va_block_cpu_page_populate() recorded - // the page as "mirrored" so that migrate_vma_finalize() and - // hmm_va_block_cpu_page_unpopulate() don't double free the page. - lock_page(dst_page); - dst_pfns[page_index] = migrate_pfn(page_to_pfn(dst_page)); - - return NV_OK; -} - // Allocates pages on the CPU to handle migration due to a page fault static NV_STATUS fault_alloc_on_cpu(uvm_va_block_t *va_block, + uvm_va_block_retry_t *va_block_retry, const unsigned long *src_pfns, unsigned long *dst_pfns, uvm_va_block_region_t region, @@ -2109,7 +2190,13 @@ static NV_STATUS fault_alloc_on_cpu(uvm_va_block_t *va_block, goto clr_mask; } - status = alloc_page_on_cpu(va_block, page_index, src_pfns, dst_pfns, same_devmem_page_mask, service_context->block_context); + status = alloc_page_on_cpu(va_block, + va_block_retry, + page_index, + src_pfns, + dst_pfns, + same_devmem_page_mask, + service_context->block_context); if (status != NV_OK) { // Ignore errors if the page is only for prefetching. if (service_context && @@ -2126,7 +2213,7 @@ static NV_STATUS fault_alloc_on_cpu(uvm_va_block_t *va_block, } if (status != NV_OK) - clean_up_non_migrating_pages(va_block, src_pfns, dst_pfns, region, page_mask); + clean_up_non_migrating_pages(va_block, dst_pfns, region, page_mask); else if (uvm_page_mask_empty(page_mask)) return NV_WARN_MORE_PROCESSING_REQUIRED; @@ -2135,6 +2222,7 @@ static NV_STATUS fault_alloc_on_cpu(uvm_va_block_t *va_block, // Allocates pages on the CPU for explicit migration calls. static NV_STATUS migrate_alloc_on_cpu(uvm_va_block_t *va_block, + uvm_va_block_retry_t *va_block_retry, const unsigned long *src_pfns, unsigned long *dst_pfns, uvm_va_block_region_t region, @@ -2157,11 +2245,18 @@ static NV_STATUS migrate_alloc_on_cpu(uvm_va_block_t *va_block, continue; } - status = alloc_page_on_cpu(va_block, page_index, src_pfns, dst_pfns, same_devmem_page_mask, block_context); + status = alloc_page_on_cpu(va_block, va_block_retry, page_index, src_pfns, dst_pfns, same_devmem_page_mask, block_context); + if (status != NV_OK) { + // Try to migrate other pages if we can't allocate this one. + if (status != NV_ERR_NO_MEMORY) + break; + + uvm_page_mask_clear(page_mask, page_index); + } } if (status != NV_OK) - clean_up_non_migrating_pages(va_block, src_pfns, dst_pfns, region, page_mask); + clean_up_non_migrating_pages(va_block, dst_pfns, region, page_mask); else if (uvm_page_mask_empty(page_mask)) return NV_WARN_MORE_PROCESSING_REQUIRED; @@ -2194,6 +2289,7 @@ static NV_STATUS uvm_hmm_devmem_fault_alloc_and_copy(uvm_hmm_devmem_fault_contex uvm_page_mask_copy(page_mask, &service_context->per_processor_masks[UVM_ID_CPU_VALUE].new_residency); status = fault_alloc_on_cpu(va_block, + va_block_retry, src_pfns, dst_pfns, service_context->region, @@ -2208,7 +2304,7 @@ static NV_STATUS uvm_hmm_devmem_fault_alloc_and_copy(uvm_hmm_devmem_fault_contex // location yet. 
status = uvm_va_block_service_copy(processor_id, UVM_ID_CPU, va_block, va_block_retry, service_context); if (status != NV_OK) - clean_up_non_migrating_pages(va_block, src_pfns, dst_pfns, service_context->region, page_mask); + clean_up_non_migrating_pages(va_block, dst_pfns, service_context->region, page_mask); return status; } @@ -2217,10 +2313,8 @@ static NV_STATUS uvm_hmm_devmem_fault_finalize_and_map(uvm_hmm_devmem_fault_cont { uvm_processor_id_t processor_id; uvm_service_block_context_t *service_context; - uvm_perf_prefetch_hint_t *prefetch_hint; - uvm_va_block_retry_t *va_block_retry; const unsigned long *src_pfns; - unsigned long *dst_pfns; + const unsigned long *dst_pfns; uvm_page_mask_t *page_mask; uvm_va_block_t *va_block; uvm_va_block_region_t region; @@ -2229,9 +2323,7 @@ static NV_STATUS uvm_hmm_devmem_fault_finalize_and_map(uvm_hmm_devmem_fault_cont processor_id = devmem_fault_context->processor_id; service_context = devmem_fault_context->service_context; - prefetch_hint = &service_context->prefetch_hint; va_block = devmem_fault_context->va_block; - va_block_retry = devmem_fault_context->va_block_retry; src_pfns = service_context->block_context->hmm.src_pfns; dst_pfns = service_context->block_context->hmm.dst_pfns; region = service_context->region; @@ -2270,6 +2362,7 @@ static NV_STATUS uvm_hmm_devmem_fault_finalize_and_map(uvm_hmm_devmem_fault_cont } static NV_STATUS populate_region(uvm_va_block_t *va_block, + uvm_va_block_retry_t *va_block_retry, unsigned long *pfns, uvm_va_block_region_t region, uvm_page_mask_t *populated_page_mask) @@ -2277,12 +2370,6 @@ static NV_STATUS populate_region(uvm_va_block_t *va_block, uvm_page_index_t page_index; NV_STATUS status; - // Make sure GPU state is allocated or else the GPU DMA mappings to - // system memory won't be saved. - status = uvm_va_block_gpu_state_alloc(va_block); - if (status != NV_OK) - return status; - for_each_va_block_page_in_region(page_index, region) { struct page *page; @@ -2316,30 +2403,18 @@ static NV_STATUS populate_region(uvm_va_block_t *va_block, // not release the device private struct page reference. Since // hmm_range_fault() did find a device private PTE, we can // re-establish the GPU chunk pointer. - status = gpu_chunk_add(va_block, page_index, page); + status = gpu_chunk_add(va_block, va_block_retry, page_index, page); if (status != NV_OK) return status; continue; } - // If a CPU chunk is already allocated, check to see it matches what - // hmm_range_fault() found. - if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) { - UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, page)); - } - else { - status = hmm_va_block_cpu_page_populate(va_block, page_index, page); - if (status != NV_OK) - return status; + status = hmm_va_block_cpu_page_insert_or_replace(va_block, page_index, page, populated_page_mask); + if (status != NV_OK) + return status; - // Record that we populated this page. hmm_block_cpu_fault_locked() - // uses this to ensure pages that don't migrate get remote mapped. - if (populated_page_mask) - uvm_page_mask_set(populated_page_mask, page_index); - } - - // Since we have a stable snapshot of the CPU pages, we can - // update the residency and protection information. + // Since we have a stable snapshot of the CPU pages, we can update the + // residency and mapping information. 
uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(page), page_index); cpu_mapping_set(va_block, pfns[page_index] & HMM_PFN_WRITE, page_index); @@ -2367,6 +2442,7 @@ static bool hmm_range_fault_retry(uvm_va_block_t *va_block) // Make the region be resident on the CPU by calling hmm_range_fault() to fault // in CPU pages. static NV_STATUS hmm_make_resident_cpu(uvm_va_block_t *va_block, + uvm_va_block_retry_t *va_block_retry, struct vm_area_struct *vma, unsigned long *hmm_pfns, uvm_va_block_region_t region, @@ -2414,6 +2490,7 @@ static NV_STATUS hmm_make_resident_cpu(uvm_va_block_t *va_block, return NV_WARN_MORE_PROCESSING_REQUIRED; return populate_region(va_block, + va_block_retry, hmm_pfns, region, populated_page_mask); @@ -2548,27 +2625,15 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id, for_each_va_block_page_in_region(page_index, region) { struct page *page = pages[page_index]; - if (!page) { + if (!page || hmm_va_block_cpu_page_insert_or_replace(va_block, page_index, page, NULL) != NV_OK) { // Record that one of the pages isn't exclusive but keep converting // the others. status = NV_WARN_MORE_PROCESSING_REQUIRED; continue; } - // If a CPU chunk is already allocated, check to see it matches what - // make_device_exclusive_range() found. - if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) { - UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, page)); - UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU)); - UVM_ASSERT(uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index)); - } - else { - NV_STATUS s = hmm_va_block_cpu_page_populate(va_block, page_index, page); - - if (s == NV_OK) - uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(page), page_index); - } - + // Since we have a stable snapshot of the CPU pages, we can update the + // mapping information. 
cpu_mapping_clear(va_block, page_index); } @@ -2629,6 +2694,7 @@ static NV_STATUS hmm_block_cpu_fault_locked(uvm_processor_id_t processor_id, .va_block = va_block, .va_block_retry = va_block_retry, .service_context = service_context, + .same_devmem_page_mask = {} }; // Normally the source page will be a device private page that is being @@ -2655,6 +2721,7 @@ static NV_STATUS hmm_block_cpu_fault_locked(uvm_processor_id_t processor_id, } status = hmm_make_resident_cpu(va_block, + va_block_retry, service_context->block_context->hmm.vma, service_context->block_context->hmm.src_pfns, region, @@ -2724,20 +2791,27 @@ static NV_STATUS hmm_block_cpu_fault_locked(uvm_processor_id_t processor_id, return status; } -static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block, - struct vm_area_struct *vma, - const unsigned long *src_pfns, - unsigned long *dst_pfns, - uvm_va_block_region_t region, - uvm_page_mask_t *page_mask, - uvm_processor_id_t dest_id, - uvm_service_block_context_t *service_context) +static NV_STATUS dmamap_src_sysmem_and_fill_dst(uvm_va_block_t *va_block, + uvm_va_block_retry_t *va_block_retry, + const unsigned long *src_pfns, + unsigned long *dst_pfns, + uvm_va_block_region_t region, + uvm_page_mask_t *page_mask, + uvm_page_mask_t *same_devmem_page_mask, + uvm_processor_id_t dest_id, + uvm_service_block_context_t *service_context) { + uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, dest_id); + uvm_gpu_t *gpu = uvm_gpu_get(dest_id); uvm_page_index_t page_index; NV_STATUS status = NV_OK; + UVM_ASSERT(gpu_state); + UVM_ASSERT(gpu); UVM_ASSERT(service_context); + uvm_page_mask_zero(same_devmem_page_mask); + for_each_va_block_page_in_region_mask(page_index, page_mask, region) { struct page *src_page; @@ -2752,46 +2826,28 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block, src_page = migrate_pfn_to_page(src_pfns[page_index]); if (src_page) { if (is_device_private_page(src_page)) { - status = gpu_chunk_add(va_block, page_index, src_page); + status = gpu_chunk_add(va_block, va_block_retry, page_index, src_page); if (status != NV_OK) - break; - continue; + goto clr_mask; + + goto fill_dst; } if (nv_PageSwapCache(src_page)) { // TODO: Bug 4050579: Remove this when swap cached pages can be // migrated. + gpu_chunk_free_region(va_block, va_block_retry, dest_id, region, page_mask); status = NV_WARN_MISMATCHED_TARGET; break; } - // If the page is already allocated, it is most likely a mirrored - // page. Check to be sure it matches what we have recorded. The - // page shouldn't be a staging page from a GPU to GPU migration - // or a remote mapped atomic sysmem page because migrate_vma_setup() - // found a normal page and non-mirrored pages are only known - // privately to the UVM driver. - if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) { - UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, src_page)); - UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU)); - UVM_ASSERT(uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index)); - } - else { - status = hmm_va_block_cpu_page_populate(va_block, page_index, src_page); - if (status != NV_OK) - goto clr_mask; + status = hmm_va_block_cpu_page_insert_or_replace(va_block, page_index, src_page, NULL); + if (status != NV_OK) + goto clr_mask; - // Since there is a CPU resident page, there shouldn't be one - // anywhere else. TODO: Bug 3660922: Need to handle read - // duplication at some point. 
- UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block, - service_context->block_context, - page_index)); - - // migrate_vma_setup() was able to isolate and lock the page; - // therefore, it is CPU resident and not mapped. - uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(src_page), page_index); - } + // Since we have a stable snapshot of the CPU pages, we can update + // the residency information. + uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(src_page), page_index); // The call to migrate_vma_setup() will have inserted a migration // PTE so the CPU has no access. @@ -2810,33 +2866,43 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block, } } + fill_dst: + fill_dst_pfn(va_block, + gpu_state, + gpu, + src_pfns, + dst_pfns, + page_index, + page_mask, + same_devmem_page_mask); + continue; clr_mask: + // Free the pre-allocated GPU chunk for non-migrating pages. + gpu_chunk_free(va_block, va_block_retry, gpu_state, page_index); + // TODO: Bug 3900774: clean up murky mess of mask clearing. uvm_page_mask_clear(page_mask, page_index); if (service_context) clear_service_context_masks(service_context, dest_id, page_index); } - if (uvm_page_mask_empty(page_mask)) - status = NV_WARN_MORE_PROCESSING_REQUIRED; + gpu_chunk_free_preallocated(va_block, va_block_retry); - if (status != NV_OK) - clean_up_non_migrating_pages(va_block, src_pfns, dst_pfns, region, page_mask); + if (status == NV_OK && uvm_page_mask_empty(page_mask)) + status = NV_WARN_MORE_PROCESSING_REQUIRED; return status; } -static NV_STATUS uvm_hmm_gpu_fault_alloc_and_copy(struct vm_area_struct *vma, - uvm_hmm_gpu_fault_event_t *uvm_hmm_gpu_fault_event) +static NV_STATUS uvm_hmm_gpu_fault_alloc_and_copy(uvm_hmm_gpu_fault_event_t *uvm_hmm_gpu_fault_event) { uvm_processor_id_t processor_id; uvm_processor_id_t new_residency; uvm_va_block_t *va_block; uvm_va_block_retry_t *va_block_retry; uvm_service_block_context_t *service_context; - uvm_perf_prefetch_hint_t *prefetch_hint; const unsigned long *src_pfns; unsigned long *dst_pfns; uvm_va_block_region_t region; @@ -2849,7 +2915,6 @@ static NV_STATUS uvm_hmm_gpu_fault_alloc_and_copy(struct vm_area_struct *vma, va_block_retry = uvm_hmm_gpu_fault_event->va_block_retry; service_context = uvm_hmm_gpu_fault_event->service_context; region = service_context->region; - prefetch_hint = &service_context->prefetch_hint; src_pfns = service_context->block_context->hmm.src_pfns; dst_pfns = service_context->block_context->hmm.dst_pfns; @@ -2860,14 +2925,15 @@ static NV_STATUS uvm_hmm_gpu_fault_alloc_and_copy(struct vm_area_struct *vma, uvm_page_mask_copy(page_mask, &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency); - status = dmamap_src_sysmem_pages(va_block, - vma, - src_pfns, - dst_pfns, - region, - page_mask, - new_residency, - service_context); + status = dmamap_src_sysmem_and_fill_dst(va_block, + va_block_retry, + src_pfns, + dst_pfns, + region, + page_mask, + &uvm_hmm_gpu_fault_event->same_devmem_page_mask, + new_residency, + service_context); if (status != NV_OK) return status; @@ -2875,17 +2941,7 @@ static NV_STATUS uvm_hmm_gpu_fault_alloc_and_copy(struct vm_area_struct *vma, // new location yet. status = uvm_va_block_service_copy(processor_id, new_residency, va_block, va_block_retry, service_context); if (status != NV_OK) - return status; - - // Record the destination PFNs of device private struct pages now that - // uvm_va_block_service_copy() has populated the GPU destination pages. 
- fill_dst_pfns(va_block, - src_pfns, - dst_pfns, - region, - page_mask, - &uvm_hmm_gpu_fault_event->same_devmem_page_mask, - new_residency); + clean_up_non_migrating_pages(va_block, dst_pfns, region, page_mask); return status; } @@ -2895,10 +2951,9 @@ static NV_STATUS uvm_hmm_gpu_fault_finalize_and_map(uvm_hmm_gpu_fault_event_t *u uvm_processor_id_t processor_id; uvm_processor_id_t new_residency; uvm_va_block_t *va_block; - uvm_va_block_retry_t *va_block_retry; uvm_service_block_context_t *service_context; const unsigned long *src_pfns; - unsigned long *dst_pfns; + const unsigned long *dst_pfns; uvm_va_block_region_t region; uvm_page_index_t page_index; uvm_page_mask_t *page_mask; @@ -2907,7 +2962,6 @@ static NV_STATUS uvm_hmm_gpu_fault_finalize_and_map(uvm_hmm_gpu_fault_event_t *u processor_id = uvm_hmm_gpu_fault_event->processor_id; new_residency = uvm_hmm_gpu_fault_event->new_residency; va_block = uvm_hmm_gpu_fault_event->va_block; - va_block_retry = uvm_hmm_gpu_fault_event->va_block_retry; service_context = uvm_hmm_gpu_fault_event->service_context; src_pfns = service_context->block_context->hmm.src_pfns; dst_pfns = service_context->block_context->hmm.dst_pfns; @@ -2958,6 +3012,8 @@ NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id, uvm_va_block_region_t region = service_context->region; uvm_hmm_gpu_fault_event_t uvm_hmm_gpu_fault_event; struct migrate_vma *args = &service_context->block_context->hmm.migrate_vma_args; + const uvm_page_mask_t *new_residency_mask = + &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency; int ret; NV_STATUS status = NV_ERR_INVALID_ADDRESS; @@ -2971,8 +3027,66 @@ NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id, UVM_ASSERT(vma); // If the desired destination is the CPU, try to fault in CPU pages. - if (UVM_ID_IS_CPU(new_residency)) + if (UVM_ID_IS_CPU(new_residency)) { + if (va_block_retry && !list_empty(&va_block_retry->used_chunks)) + gpu_chunk_free_preallocated(va_block, va_block_retry); + return hmm_block_cpu_fault_locked(processor_id, va_block, va_block_retry, service_context); + } + + UVM_ASSERT(va_block_retry); + + // The overall process here is to migrate pages from the CPU or GPUs to the + // faulting GPU. This is only safe because we hold the va_block lock across + // the calls to migrate_vma_pages(), uvm_hmm_gpu_fault_alloc_and_copy(), + // uvm_hmm_gpu_fault_finalize_and_map(), and migrate_vma_finalize(). + // If the va_block lock were to be dropped, eviction callbacks from RM, + // migration callbacks from CPU faults, or invalidation callbacks from + // Linux could change the va_block state which would require careful + // revalidation of the state. Also, pages are page locked which leads to + // inefficiency or potential deadlocks. + + // We pre-allocate the destination GPU pages because otherwise, + // migrate_vma_setup() could page lock the source pages and then try to + // allocate destination pages with block_alloc_gpu_chunk() which might + // unlock the va_block lock and try to evict the source page and fail. + // Note that by preallocating, we introduce 3 states instead of 2 for + // GPU chunks: + // UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED, !is_referenced + // UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED, is_referenced + // UVM_PMM_GPU_CHUNK_STATE_ALLOCATED, is_referenced + // The first state is when uvm_va_block_populate_pages_gpu() returns and + // we should call uvm_pmm_gpu_free() if the chunk isn't needed. 
+ // The second state is after the source pages are pinned and we know which + // chunks will be used for DMA and passed to migrate_vma_pages() dst_pfns[]. + // The third state is when migrate_vma_pages() commits to the migration and + // the GPU chunk will be marked resident. + // The is_referenced flag is just for sanity checking so it is clear when + // ownership for freeing the chunk changes from the driver to Linux's + // page_free() callback. The TEMP_PINNED/is_referenced state could be + // replaced with ALLOCATED/is_referenced but ALLOCATED implies the chunk + // could be evicted (except we hold the va_block lock) and seems safer + // to leave it in the pinned state until we are about to call + // migrate_vma_finalize(). + // Also note that we have to free any pre-allocated pages because otherwise + // they would be marked ALLOCATED in uvm_va_block_retry_deinit() and + // we can't free them in uvm_va_block_retry_deinit() because the va_block + // lock might not be held and freeing the GPU chunk requires unmapping and + // clearing the gpu_state->chunks[] entry. + // Also note that the new_residency and new_residency_mask can change each + // time uvm_va_block_populate_pages_gpu() returns + // NV_ERR_MORE_PROCESSING_REQUIRED (based on thrashing and other reasons) + // so there might be pre-allocated chunks not in region. + status = uvm_va_block_populate_pages_gpu(va_block, + va_block_retry, + new_residency, + region, + new_residency_mask); + if (status != NV_OK) { + if (status != NV_ERR_MORE_PROCESSING_REQUIRED) + gpu_chunk_free_preallocated(va_block, va_block_retry); + return status; + } uvm_hmm_gpu_fault_event.processor_id = processor_id; uvm_hmm_gpu_fault_event.new_residency = new_residency; @@ -2992,21 +3106,7 @@ NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id, ret = migrate_vma_setup_locked(args, va_block); UVM_ASSERT(!ret); - // The overall process here is to migrate pages from the CPU or GPUs to the - // faulting GPU. - // This is safe because we hold the va_block lock across the calls to - // uvm_hmm_gpu_fault_alloc_and_copy(), migrate_vma_pages(), - // uvm_hmm_gpu_fault_finalize_and_map(), and migrate_vma_finalize(). - // If uvm_hmm_gpu_fault_alloc_and_copy() needs to drop the va_block - // lock, a sequence number is used to tell if an invalidate() callback - // occurred while not holding the lock. If the sequence number changes, - // all the locks need to be dropped (mm, va_space, va_block) and the whole - // uvm_va_block_service_locked() called again. Otherwise, there were no - // conflicting invalidate callbacks and our snapshots of the CPU page - // tables are accurate and can be used to DMA pages and update GPU page - // tables. TODO: Bug 3901904: there might be better ways of handling no - // page being migrated. - status = uvm_hmm_gpu_fault_alloc_and_copy(vma, &uvm_hmm_gpu_fault_event); + status = uvm_hmm_gpu_fault_alloc_and_copy(&uvm_hmm_gpu_fault_event); if (status == NV_WARN_MORE_PROCESSING_REQUIRED) { migrate_vma_finalize(args); @@ -3015,12 +3115,16 @@ NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id, // We do know that none of the pages in the region are zero pages // since migrate_vma_setup() would have reported that information. // Try to make it resident in system memory and retry the migration. + // TODO: Bug 3901904: there might be better ways of handling no page + // being migrated. 
status = hmm_make_resident_cpu(va_block, + va_block_retry, service_context->block_context->hmm.vma, service_context->block_context->hmm.src_pfns, region, service_context->access_type, NULL); + return NV_WARN_MORE_PROCESSING_REQUIRED; } @@ -3037,8 +3141,7 @@ NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id, return status; } -static NV_STATUS uvm_hmm_migrate_alloc_and_copy(struct vm_area_struct *vma, - uvm_hmm_migrate_event_t *uvm_hmm_migrate_event) +static NV_STATUS uvm_hmm_migrate_alloc_and_copy(uvm_hmm_migrate_event_t *uvm_hmm_migrate_event) { uvm_va_block_t *va_block; uvm_va_block_retry_t *va_block_retry; @@ -3065,6 +3168,7 @@ static NV_STATUS uvm_hmm_migrate_alloc_and_copy(struct vm_area_struct *vma, if (UVM_ID_IS_CPU(dest_id)) { status = migrate_alloc_on_cpu(va_block, + va_block_retry, src_pfns, dst_pfns, region, @@ -3073,14 +3177,15 @@ static NV_STATUS uvm_hmm_migrate_alloc_and_copy(struct vm_area_struct *vma, service_context->block_context); } else { - status = dmamap_src_sysmem_pages(va_block, - vma, - src_pfns, - dst_pfns, - region, - page_mask, - dest_id, - service_context); + status = dmamap_src_sysmem_and_fill_dst(va_block, + va_block_retry, + src_pfns, + dst_pfns, + region, + page_mask, + &uvm_hmm_migrate_event->same_devmem_page_mask, + dest_id, + service_context); } if (status != NV_OK) @@ -3095,20 +3200,7 @@ static NV_STATUS uvm_hmm_migrate_alloc_and_copy(struct vm_area_struct *vma, NULL, uvm_hmm_migrate_event->cause); if (status != NV_OK) - return status; - - if (!UVM_ID_IS_CPU(dest_id)) { - // Record the destination PFNs of device private struct pages now that - // uvm_va_block_make_resident_copy() has populated the GPU destination - // pages. - fill_dst_pfns(va_block, - src_pfns, - dst_pfns, - region, - page_mask, - &uvm_hmm_migrate_event->same_devmem_page_mask, - dest_id); - } + clean_up_non_migrating_pages(va_block, dst_pfns, region, page_mask); return status; } @@ -3116,17 +3208,15 @@ static NV_STATUS uvm_hmm_migrate_alloc_and_copy(struct vm_area_struct *vma, static NV_STATUS uvm_hmm_migrate_finalize(uvm_hmm_migrate_event_t *uvm_hmm_migrate_event) { uvm_va_block_t *va_block; - uvm_va_block_retry_t *va_block_retry; uvm_va_block_context_t *va_block_context; uvm_va_block_region_t region; uvm_processor_id_t dest_id; uvm_page_index_t page_index; uvm_page_mask_t *page_mask; const unsigned long *src_pfns; - unsigned long *dst_pfns; + const unsigned long *dst_pfns; va_block = uvm_hmm_migrate_event->va_block; - va_block_retry = uvm_hmm_migrate_event->va_block_retry; va_block_context = uvm_hmm_migrate_event->service_context->block_context; region = uvm_hmm_migrate_event->region; dest_id = uvm_hmm_migrate_event->dest_id; @@ -3189,6 +3279,45 @@ NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block, uvm_assert_mutex_locked(&va_block->hmm.migrate_lock); uvm_assert_mutex_locked(&va_block->lock); + // Save some time and effort if we can't migrate to a GPU. + if (UVM_ID_IS_GPU(dest_id) && uvm_hmm_must_use_sysmem(va_block, vma)) { + return hmm_make_resident_cpu(va_block, + va_block_retry, + vma, + va_block_context->hmm.src_pfns, + region, + NULL, + NULL); + } + + // The overall process here is to migrate pages from the CPU or GPUs to the + // destination processor. Note that block_migrate_add_mappings() handles + // updating GPU mappings after the migration. 
+ // This is only safe because we hold the va_block lock across the calls to
+ // uvm_hmm_migrate_alloc_and_copy(), migrate_vma_pages(),
+ // uvm_hmm_migrate_finalize(), migrate_vma_finalize() and
+ // block_migrate_add_mappings().
+ // If the va_block lock were to be dropped, eviction callbacks from RM,
+ // migration callbacks from CPU faults, or invalidation callbacks from
+ // Linux could change the va_block state which would require careful
+ // revalidation of the state. Also, pages are page locked which leads to
+ // inefficiency or potential deadlocks.
+ // While the lock is held, our snapshots of the CPU page tables remain
+ // accurate and can be used to DMA pages and update GPU page tables.
+
+ // We pre-allocate the destination GPU pages because otherwise,
+ // migrate_vma_setup() could page lock the source pages and then try to
+ // allocate destination pages with block_alloc_gpu_chunk() which might
+ // unlock the va_block lock and try to evict the source page and fail.
+ if (UVM_ID_IS_GPU(dest_id)) {
+ status = uvm_va_block_populate_pages_gpu(va_block, va_block_retry, dest_id, region, NULL);
+ if (status != NV_OK) {
+ if (status != NV_ERR_MORE_PROCESSING_REQUIRED)
+ gpu_chunk_free_preallocated(va_block, va_block_retry);
+ return status;
+ }
+ }
+
 start = uvm_va_block_region_start(va_block, region);
 end = uvm_va_block_region_end(va_block, region);
 UVM_ASSERT(vma->vm_start <= start && end < vma->vm_end);
@@ -3214,30 +3343,20 @@ NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block,
 // VMAs so if UvmMigrate() tries to migrate such a region, -EINVAL will
 // be returned and we will only try to make the pages be CPU resident.
 ret = migrate_vma_setup_locked(args, va_block);
- if (ret)
+ if (ret) {
+ if (va_block_retry && !list_empty(&va_block_retry->used_chunks))
+ gpu_chunk_free_preallocated(va_block, va_block_retry);
+
 return hmm_make_resident_cpu(va_block,
+ va_block_retry,
 vma,
 va_block_context->hmm.src_pfns,
 region,
 NULL,
 NULL);
+ }

- // The overall process here is to migrate pages from the CPU or GPUs to the
- // destination processor. Note that block_migrate_add_mappings() handles
- // updating GPU mappings after the migration.
- // This is safe because we hold the va_block lock across the calls to
- // uvm_hmm_migrate_alloc_and_copy(), migrate_vma_pages(),
- // uvm_hmm_migrate_finalize(), migrate_vma_finalize() and
- // block_migrate_add_mappings().
- // If uvm_hmm_migrate_alloc_and_copy() needs to drop the va_block
- // lock, a sequence number is used to tell if an invalidate() callback
- // occurred while not holding the lock. If the sequence number changes,
- // all the locks need to be dropped (mm, va_space, va_block) and the whole
- // uvm_hmm_va_block_migrate_locked() called again. Otherwise, there were no
- // conflicting invalidate callbacks and our snapshots of the CPU page
- // tables are accurate and can be used to DMA pages and update GPU page
- // tables.
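The pre-allocation comment above boils down to: grab every destination resource while it is still safe to do so, and if a later step fails, hand back the pre-allocations that were never consumed. A hedged, self-contained sketch of that shape (the helper names are stand-ins, not UVM or Linux APIs):

/* Sketch of "pre-allocate destinations, free them if a later step fails". */
#include <stdio.h>
#include <stdlib.h>

#define NPAGES 8

static int setup_migration(void)
{
    /* Stand-in for a setup step that can fail; pretend it always does. */
    return -1;
}

static void free_preallocated(void *dst[], int n)
{
    for (int i = 0; i < n; i++) {
        free(dst[i]);
        dst[i] = NULL;
    }
}

static int migrate_region(void)
{
    void *dst[NPAGES] = { 0 };

    /* Pre-allocate every destination page before any source page is locked,
     * so a failed allocation cannot force us to drop locks mid-migration. */
    for (int i = 0; i < NPAGES; i++) {
        dst[i] = malloc(4096);
        if (!dst[i]) {
            free_preallocated(dst, i);
            return -1;
        }
    }

    /* If the next step fails, nothing has consumed the pre-allocations yet,
     * so they must be handed back here rather than leaked. */
    if (setup_migration() != 0) {
        free_preallocated(dst, NPAGES);
        return -1;
    }

    /* ... copy, commit, finalize ... */
    free_preallocated(dst, NPAGES);
    return 0;
}

int main(void)
{
    printf("migrate_region() -> %d\n", migrate_region());
    return 0;
}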
- status = uvm_hmm_migrate_alloc_and_copy(vma, &uvm_hmm_migrate_event); + status = uvm_hmm_migrate_alloc_and_copy(&uvm_hmm_migrate_event); if (status == NV_WARN_MORE_PROCESSING_REQUIRED) { uvm_processor_id_t id; uvm_page_mask_t *page_mask; @@ -3260,6 +3379,7 @@ NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block, } return hmm_make_resident_cpu(va_block, + va_block_retry, vma, va_block_context->hmm.src_pfns, region, @@ -3350,6 +3470,17 @@ NV_STATUS uvm_hmm_va_block_evict_chunk_prep(uvm_va_block_t *va_block, if (ret) return errno_to_nv_status(ret); + if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE)) + return NV_WARN_MORE_PROCESSING_REQUIRED; + + if (UVM_IS_DEBUG()) { + struct page *src_page = migrate_pfn_to_page(src_pfns[page_index]); + + UVM_ASSERT(is_device_private_page(src_page)); + UVM_ASSERT(page_pgmap(src_page)->owner == &g_uvm_global); + UVM_ASSERT(uvm_pmm_devmem_page_to_chunk(src_page) == gpu_chunk); + } + return NV_OK; } @@ -3374,6 +3505,7 @@ static NV_STATUS hmm_va_block_evict_chunks(uvm_va_block_t *va_block, .region = region, .dest_id = UVM_ID_CPU, .cause = cause, + .same_devmem_page_mask = {}, }; uvm_page_mask_t *page_mask = &uvm_hmm_migrate_event.page_mask; const uvm_va_policy_t *policy; @@ -3390,7 +3522,7 @@ static NV_STATUS hmm_va_block_evict_chunks(uvm_va_block_t *va_block, // Note that there is no VMA available when evicting HMM pages. va_block_context->hmm.vma = NULL; - uvm_page_mask_copy(page_mask, pages_to_evict); + uvm_page_mask_init_from_region(page_mask, region, pages_to_evict); uvm_for_each_va_policy_in(policy, va_block, start, end, node, region) { npages = uvm_va_block_region_num_pages(region); @@ -3401,9 +3533,16 @@ static NV_STATUS hmm_va_block_evict_chunks(uvm_va_block_t *va_block, // Pages resident on the GPU should not have a resident page in system // memory. // TODO: Bug 3660922: Need to handle read duplication at some point. - UVM_ASSERT(uvm_page_mask_region_empty(cpu_resident_mask, region)); + UVM_ASSERT(!uvm_page_mask_intersects(cpu_resident_mask, page_mask)); - status = migrate_alloc_on_cpu(va_block, src_pfns, dst_pfns, region, page_mask, NULL, va_block_context); + status = migrate_alloc_on_cpu(va_block, + NULL, + src_pfns, + dst_pfns, + region, + page_mask, + &uvm_hmm_migrate_event.same_devmem_page_mask, + va_block_context); if (status != NV_OK) goto err; @@ -3429,6 +3568,13 @@ static NV_STATUS hmm_va_block_evict_chunks(uvm_va_block_t *va_block, migrate_device_finalize(src_pfns + region.first, dst_pfns + region.first, npages); } + // TODO: Bug 5167764: Evictions can't handle partial migrations. + uvm_page_mask_init_from_region(&va_block_context->scratch_page_mask, region, pages_to_evict); + if (uvm_page_mask_andnot(&va_block_context->scratch_page_mask, + &va_block_context->scratch_page_mask, + page_mask)) + return NV_WARN_MORE_PROCESSING_REQUIRED; + return NV_OK; err: @@ -3663,12 +3809,12 @@ NV_STATUS uvm_hmm_va_block_update_residency_info(uvm_va_block_t *va_block, // Update the va_block CPU state based on the snapshot. // Note that we have to adjust the pfns address since it will be indexed // by region.first. 
- status = populate_region(va_block, &pfn - region.first, region, NULL); + status = populate_region(va_block, NULL, &pfn - region.first, region, NULL); uvm_mutex_unlock(&va_block->lock); uvm_hmm_migrate_finish(va_block); - return NV_OK; + return status; } NV_STATUS uvm_test_split_invalidate_delay(UVM_TEST_SPLIT_INVALIDATE_DELAY_PARAMS *params, struct file *filp) diff --git a/kernel-open/nvidia-uvm/uvm_hmm.h b/kernel-open/nvidia-uvm/uvm_hmm.h index 9e20b973d..cf9ddfd05 100644 --- a/kernel-open/nvidia-uvm/uvm_hmm.h +++ b/kernel-open/nvidia-uvm/uvm_hmm.h @@ -37,6 +37,7 @@ typedef struct // This stores pointers to uvm_va_block_t for HMM blocks. uvm_range_tree_t blocks; uvm_mutex_t blocks_lock; + atomic64_t allocated_page_count; } uvm_hmm_va_space_t; #if UVM_IS_CONFIG_HMM() diff --git a/kernel-open/nvidia-uvm/uvm_pmm_gpu.c b/kernel-open/nvidia-uvm/uvm_pmm_gpu.c index cf626ad5c..aaa12c727 100644 --- a/kernel-open/nvidia-uvm/uvm_pmm_gpu.c +++ b/kernel-open/nvidia-uvm/uvm_pmm_gpu.c @@ -402,7 +402,10 @@ static void chunk_pin(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk) { uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk); - uvm_assert_spinlock_locked(&pmm->list_lock); + // The PMM list_lock must be held, but calling uvm_assert_spinlock_locked() + // is not possible here due to the absence of the UVM context pointer in + // the interrupt context when called from devmem_page_free(). + UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED); chunk->state = UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED; @@ -415,8 +418,9 @@ static void chunk_pin(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk) // The passed-in subchunk is not the root chunk so the root chunk has to be // split. - UVM_ASSERT_MSG(chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT, "chunk state %s\n", - uvm_pmm_gpu_chunk_state_string(chunk->state)); + UVM_ASSERT_MSG(chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT, + "chunk state %s\n", + uvm_pmm_gpu_chunk_state_string(chunk->state)); chunk->suballoc->pinned_leaf_chunks++; } @@ -429,7 +433,6 @@ static void chunk_unpin(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_pmm_gpu_ uvm_assert_spinlock_locked(&pmm->list_lock); UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED); - UVM_ASSERT(chunk->va_block == NULL); UVM_ASSERT(chunk_is_root_chunk_pinned(pmm, chunk)); UVM_ASSERT(new_state != UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED); @@ -444,8 +447,9 @@ static void chunk_unpin(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_pmm_gpu_ // The passed-in subchunk is not the root chunk so the root chunk has to be // split. 
- UVM_ASSERT_MSG(chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT, "chunk state %s\n", - uvm_pmm_gpu_chunk_state_string(chunk->state)); + UVM_ASSERT_MSG(chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT, + "chunk state %s\n", + uvm_pmm_gpu_chunk_state_string(chunk->state)); UVM_ASSERT(chunk->suballoc->pinned_leaf_chunks != 0); chunk->suballoc->pinned_leaf_chunks--; @@ -597,8 +601,6 @@ NV_STATUS uvm_pmm_gpu_alloc_kernel(uvm_pmm_gpu_t *pmm, return status; for (i = 0; i < num_chunks; ++i) { - UVM_ASSERT(chunks[i]->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED); - uvm_spin_lock(&pmm->list_lock); chunk_unpin(pmm, chunks[i], UVM_PMM_GPU_CHUNK_STATE_ALLOCATED); chunks[i]->is_referenced = false; @@ -644,45 +646,29 @@ static void chunk_update_lists_locked(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk list_del_init(&chunk->list); } -static void gpu_unpin_temp(uvm_pmm_gpu_t *pmm, - uvm_gpu_chunk_t *chunk, - uvm_va_block_t *va_block, - bool is_referenced) +void uvm_pmm_gpu_unpin_allocated(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block) { UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED); UVM_ASSERT(uvm_gpu_chunk_is_user(chunk)); - - INIT_LIST_HEAD(&chunk->list); + UVM_ASSERT(list_empty(&chunk->list)); + UVM_ASSERT(va_block); + UVM_ASSERT(chunk->va_block == va_block); + UVM_ASSERT(chunk->va_block_page_index < uvm_va_block_num_cpu_pages(va_block)); uvm_spin_lock(&pmm->list_lock); - UVM_ASSERT(!chunk->va_block); - UVM_ASSERT(va_block); - UVM_ASSERT(chunk->va_block_page_index < uvm_va_block_num_cpu_pages(va_block)); - chunk_unpin(pmm, chunk, UVM_PMM_GPU_CHUNK_STATE_ALLOCATED); - chunk->is_referenced = is_referenced; - chunk->va_block = va_block; chunk_update_lists_locked(pmm, chunk); uvm_spin_unlock(&pmm->list_lock); } -void uvm_pmm_gpu_unpin_allocated(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block) -{ - gpu_unpin_temp(pmm, chunk, va_block, false); -} - -void uvm_pmm_gpu_unpin_referenced(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block) -{ - gpu_unpin_temp(pmm, chunk, va_block, true); -} - void uvm_pmm_gpu_free(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_tracker_t *tracker) { NV_STATUS status; - if (!chunk) + // Referenced chunks are freed by Linux when the reference is released. 
+ if (!chunk || chunk->is_referenced) return; UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED || @@ -748,6 +734,10 @@ static bool assert_chunk_mergeable(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk) size_t i; UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT); + UVM_ASSERT_MSG(chunk->suballoc->allocated == num_subchunks(chunk), + "%u != %u\n", + chunk->suballoc->allocated, + num_subchunks(chunk)); UVM_ASSERT(first_child->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED || first_child->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED); @@ -766,14 +756,6 @@ static bool assert_chunk_mergeable(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk) } } - if (first_child->state == UVM_PMM_GPU_CHUNK_STATE_FREE) { - UVM_ASSERT(chunk->suballoc->allocated == 0); - } - else { - UVM_ASSERT_MSG(chunk->suballoc->allocated == num_subchunks(chunk), "%u != %u\n", - chunk->suballoc->allocated, num_subchunks(chunk)); - } - return true; } @@ -812,6 +794,7 @@ static void merge_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk) else if (child_state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) { UVM_ASSERT(root_chunk->chunk.suballoc->pinned_leaf_chunks >= num_sub); root_chunk->chunk.suballoc->pinned_leaf_chunks += 1 - num_sub; + chunk->va_block = subchunk->va_block; } chunk->state = child_state; @@ -835,7 +818,7 @@ static void merge_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk) UVM_ASSERT(list_empty(&subchunk->list)); if ((child_state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) && uvm_gpu_chunk_is_user(subchunk)) - UVM_ASSERT(subchunk->va_block != NULL); + UVM_ASSERT(subchunk->va_block); kmem_cache_free(CHUNK_CACHE, subchunk); } @@ -1202,7 +1185,7 @@ void uvm_pmm_gpu_mark_chunk_evicted(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk) UVM_ASSERT(chunk_is_in_eviction(pmm, chunk)); UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED); - UVM_ASSERT(chunk->va_block != NULL); + UVM_ASSERT(chunk->va_block); chunk->va_block = NULL; chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK; @@ -1259,11 +1242,13 @@ static NV_STATUS find_and_retain_va_block_to_evict(uvm_pmm_gpu_t *pmm, uvm_gpu_c uvm_spin_lock(&pmm->list_lock); - // All free chunks should have been pinned already by pin_free_chunks_func(). + // All free chunks should have been pinned already by + // pin_free_chunks_func(). 
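One of the assertions tightened in assert_chunk_mergeable() above is that a split chunk may only be merged when every leaf is accounted for: the parent's allocated count must equal the number of subchunks, and the children must all be in a compatible, non-free state. A toy version of that check, with invented types and limits:

/* Toy merge-eligibility check; types and sizes are invented for the example. */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

enum state { ST_TEMP_PINNED, ST_ALLOCATED, ST_FREE };

struct parent {
    unsigned allocated;        /* how many children are currently handed out */
    unsigned num_children;
    enum state child_state[16];
};

static bool can_merge(const struct parent *p)
{
    /* Every child must be handed out ... */
    if (p->allocated != p->num_children)
        return false;

    /* ... and they must all share the same, mergeable state. */
    for (unsigned i = 0; i < p->num_children; i++) {
        if (p->child_state[i] != p->child_state[0])
            return false;
        if (p->child_state[i] == ST_FREE)
            return false;
    }
    return true;
}

int main(void)
{
    struct parent p = { .allocated = 4, .num_children = 4,
                        .child_state = { ST_ALLOCATED, ST_ALLOCATED,
                                         ST_ALLOCATED, ST_ALLOCATED } };
    assert(can_merge(&p));

    p.allocated = 3;           /* one child missing: not mergeable */
    assert(!can_merge(&p));

    printf("merge checks passed\n");
    return 0;
}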
UVM_ASSERT_MSG(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED || chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED || chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT, - "state %s\n", uvm_pmm_gpu_chunk_state_string(chunk->state)); + "state %s\n", + uvm_pmm_gpu_chunk_state_string(chunk->state)); if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) { UVM_ASSERT(chunk->va_block); @@ -1750,8 +1735,10 @@ static NV_STATUS alloc_chunk_with_splits(uvm_pmm_gpu_t *pmm, UVM_ASSERT(chunk->parent->suballoc); UVM_ASSERT(uvm_gpu_chunk_get_size(chunk->parent) == uvm_chunk_find_next_size(chunk_sizes, cur_size)); UVM_ASSERT(chunk->parent->type == type); - UVM_ASSERT_MSG(chunk->parent->suballoc->allocated <= num_subchunks(chunk->parent), "allocated %u num %u\n", - chunk->parent->suballoc->allocated, num_subchunks(chunk->parent)); + UVM_ASSERT_MSG(chunk->parent->suballoc->allocated <= num_subchunks(chunk->parent), + "allocated %u num %u\n", + chunk->parent->suballoc->allocated, + num_subchunks(chunk->parent)); } if (cur_size == chunk_size) { @@ -1856,9 +1843,9 @@ static void init_root_chunk(uvm_pmm_gpu_t *pmm, uvm_pmm_gpu_chunk_state_string(chunk->state), uvm_gpu_name(gpu)); - UVM_ASSERT(chunk->parent == NULL); - UVM_ASSERT(chunk->suballoc == NULL); - UVM_ASSERT(chunk->va_block == NULL); + UVM_ASSERT(!chunk->parent); + UVM_ASSERT(!chunk->suballoc); + UVM_ASSERT(!chunk->va_block); UVM_ASSERT(chunk->va_block_page_index == PAGES_PER_UVM_VA_BLOCK); UVM_ASSERT(list_empty(&chunk->list)); UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_MAX); @@ -2116,6 +2103,9 @@ NV_STATUS split_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk) subchunk->va_block_page_index = chunk->va_block_page_index + (i * subchunk_size) / PAGE_SIZE; subchunk->is_referenced = chunk->is_referenced; } + else if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) { + subchunk->va_block = chunk->va_block; + } } // We're splitting an allocated or pinned chunk in-place. @@ -2141,6 +2131,10 @@ NV_STATUS split_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk) // accounting for the root chunk itself so add the 1 back. if (chunk_is_root_chunk(chunk)) root_chunk->chunk.suballoc->pinned_leaf_chunks += 1; + + chunk->va_block = NULL; + chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK; + chunk->is_referenced = false; } chunk->state = UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT; @@ -2222,16 +2216,16 @@ static void chunk_free_locked(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk) if (root_chunk->chunk.in_eviction) { // A root chunk with pinned subchunks would never be picked for eviction - // so this one has to be in the allocated state. Pin it and let the - // evicting thread pick it up. - UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED); - UVM_ASSERT(chunk->va_block != NULL); - UVM_ASSERT(chunk->va_block_page_index != PAGES_PER_UVM_VA_BLOCK); - UVM_ASSERT(list_empty(&chunk->list)); - chunk->va_block = NULL; - chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK; - chunk->is_zero = false; - chunk_pin(pmm, chunk); + // but HMM evictions will end up here so leave the chunk pinned (or pin + // it) and let the eviction thread pick it up. 
+ if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) { + UVM_ASSERT(chunk->va_block); + UVM_ASSERT(list_empty(&chunk->list)); + chunk->va_block = NULL; + chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK; + chunk->is_zero = false; + chunk_pin(pmm, chunk); + } return; } @@ -2245,17 +2239,15 @@ static void chunk_free_locked(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk) } } - if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) { - chunk_unpin(pmm, chunk, UVM_PMM_GPU_CHUNK_STATE_FREE); - } - else { - chunk->state = UVM_PMM_GPU_CHUNK_STATE_FREE; - chunk->va_block = NULL; - } - + chunk->va_block = NULL; chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK; chunk->is_zero = false; + if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) + chunk_unpin(pmm, chunk, UVM_PMM_GPU_CHUNK_STATE_FREE); + else + chunk->state = UVM_PMM_GPU_CHUNK_STATE_FREE; + chunk_update_lists_locked(pmm, chunk); } @@ -2369,8 +2361,8 @@ static void free_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk) try_free = is_root; } else { - // Freeing a chunk can only fail if it requires merging. Take the PMM lock - // and free it with merges supported. + // Freeing a chunk can only fail if it requires merging. Take the PMM + // lock and free it with merges supported. uvm_mutex_lock(&pmm->lock); free_chunk_with_merges(pmm, chunk); uvm_mutex_unlock(&pmm->lock); @@ -3088,6 +3080,11 @@ static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm) break; } + if (page->zone_device_data) { + ret = false; + break; + } + if (page_count(page)) { ret = false; break; @@ -3102,6 +3099,14 @@ static void devmem_page_free(struct page *page) uvm_gpu_chunk_t *chunk = uvm_pmm_devmem_page_to_chunk(page); uvm_gpu_t *gpu = uvm_gpu_chunk_get_gpu(chunk); + if (chunk->va_block) { + uvm_va_space_t *va_space = chunk->va_block->hmm.va_space; + + UVM_ASSERT(va_space); + atomic64_dec(&va_space->hmm.allocated_page_count); + UVM_ASSERT(atomic64_read(&va_space->hmm.allocated_page_count) >= 0); + } + page->zone_device_data = NULL; // We should be calling free_chunk() except that it acquires a mutex and @@ -3111,7 +3116,20 @@ static void devmem_page_free(struct page *page) spin_lock(&gpu->pmm.list_lock.lock); UVM_ASSERT(chunk->is_referenced); + + chunk->va_block = NULL; + chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK; chunk->is_referenced = false; + + if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) { + list_del_init(&chunk->list); + chunk_pin(&gpu->pmm, chunk); + } + else { + UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED); + UVM_ASSERT(list_empty(&chunk->list)); + } + list_add_tail(&chunk->list, &gpu->pmm.root_chunks.va_block_lazy_free); spin_unlock(&gpu->pmm.list_lock.lock); @@ -3362,6 +3380,7 @@ static void process_lazy_free(uvm_pmm_gpu_t *pmm) // is empty. 
while (!list_empty(&pmm->root_chunks.va_block_lazy_free)) { chunk = list_first_entry(&pmm->root_chunks.va_block_lazy_free, uvm_gpu_chunk_t, list); + UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED); list_del_init(&chunk->list); uvm_spin_unlock(&pmm->list_lock); @@ -3414,6 +3433,7 @@ NV_STATUS uvm_pmm_gpu_init(uvm_pmm_gpu_t *pmm) for (i = 0; i < UVM_PMM_GPU_MEMORY_TYPE_COUNT; i++) { pmm->chunk_sizes[i] = 0; + // Add the common root chunk size to all memory types pmm->chunk_sizes[i] |= UVM_CHUNK_SIZE_MAX; for (j = 0; j < ARRAY_SIZE(chunk_size_init); j++) @@ -3421,7 +3441,9 @@ NV_STATUS uvm_pmm_gpu_init(uvm_pmm_gpu_t *pmm) UVM_ASSERT(pmm->chunk_sizes[i] < UVM_CHUNK_SIZE_INVALID); UVM_ASSERT_MSG(hweight_long(pmm->chunk_sizes[i]) <= UVM_MAX_CHUNK_SIZES, - "chunk sizes %lu, max chunk sizes %u\n", hweight_long(pmm->chunk_sizes[i]), UVM_MAX_CHUNK_SIZES); + "chunk sizes %lu, max chunk sizes %u\n", + hweight_long(pmm->chunk_sizes[i]), + UVM_MAX_CHUNK_SIZES); } status = init_caches(pmm); @@ -3515,9 +3537,9 @@ void uvm_pmm_gpu_deinit(uvm_pmm_gpu_t *pmm) gpu = uvm_pmm_to_gpu(pmm); - UVM_ASSERT(uvm_pmm_gpu_check_orphan_pages(pmm)); nv_kthread_q_flush(&gpu->parent->lazy_free_q); UVM_ASSERT(list_empty(&pmm->root_chunks.va_block_lazy_free)); + UVM_ASSERT(uvm_pmm_gpu_check_orphan_pages(pmm)); release_free_root_chunks(pmm); if (gpu->mem_info.size != 0 && gpu_supports_pma_eviction(gpu)) diff --git a/kernel-open/nvidia-uvm/uvm_pmm_gpu.h b/kernel-open/nvidia-uvm/uvm_pmm_gpu.h index 142b2c5f5..86d1a4f47 100644 --- a/kernel-open/nvidia-uvm/uvm_pmm_gpu.h +++ b/kernel-open/nvidia-uvm/uvm_pmm_gpu.h @@ -271,6 +271,11 @@ struct uvm_gpu_chunk_struct // This flag indicates an allocated user chunk is referenced by a device // private struct page PTE and therefore expects a page_free() callback. + // The flag is only for sanity checking since uvm_pmm_gpu_free() + // shouldn't be called if Linux has a device private reference to this + // chunk and devmem_page_free() should only be called from the Linux + // callback if a reference was created. + // See uvm_hmm_va_block_service_locked() and fill_dst_pfn() for details. // // This field is always false in kernel chunks. bool is_referenced : 1; @@ -300,6 +305,9 @@ struct uvm_gpu_chunk_struct // The VA block using the chunk, if any. // User chunks that are not backed by a VA block are considered to be // temporarily pinned and cannot be evicted. + // Note that the chunk state is normally UVM_PMM_GPU_CHUNK_STATE_ALLOCATED + // but can also be UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED if an HMM va_block + // and device private struct page have a pointer to this chunk. // // This field is always NULL in kernel chunks. uvm_va_block_t *va_block; @@ -437,17 +445,16 @@ struct page *uvm_gpu_chunk_to_page(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk); // Allocates num_chunks chunks of size chunk_size in caller-supplied array // (chunks). // -// Returned chunks are in the TEMP_PINNED state, requiring a call to either -// uvm_pmm_gpu_unpin_allocated, uvm_pmm_gpu_unpin_referenced, or -// uvm_pmm_gpu_free. If a tracker is passed in, all -// the pending operations on the allocated chunks will be added to it +// Returned chunks are in the TEMP_PINNED state, requiring a call to +// uvm_pmm_gpu_unpin_allocated or uvm_pmm_gpu_free. If a tracker is passed in, +// all the pending operations on the allocated chunks will be added to it // guaranteeing that all the entries come from the same GPU as the PMM. 
// Otherwise, when tracker is NULL, all the pending operations will be // synchronized before returning to the caller. // // Each of the allocated chunks list nodes (uvm_gpu_chunk_t::list) can be used -// by the caller until the chunk is unpinned (uvm_pmm_gpu_unpin_allocated, -// uvm_pmm_gpu_unpin_referenced) or freed (uvm_pmm_gpu_free). If used, the list +// by the caller until the chunk is unpinned (uvm_pmm_gpu_unpin_allocated) +// or freed (uvm_pmm_gpu_free). If used, the list // node has to be returned to a valid state before calling either of the APIs. // // In case of an error, the chunks array is guaranteed to be cleared. @@ -480,12 +487,6 @@ NV_STATUS uvm_pmm_gpu_alloc_kernel(uvm_pmm_gpu_t *pmm, // Can only be used on user memory. void uvm_pmm_gpu_unpin_allocated(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block); -// Unpin a temporarily pinned chunk, set its reverse map to a VA block, and -// mark it as referenced. -// -// Can only be used on user memory. -void uvm_pmm_gpu_unpin_referenced(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block); - // Free a user or kernel chunk. Temporarily pinned chunks are unpinned. // // The tracker is optional and a NULL tracker indicates that no new operation diff --git a/kernel-open/nvidia-uvm/uvm_va_block.c b/kernel-open/nvidia-uvm/uvm_va_block.c index 50c9707cb..d76137582 100644 --- a/kernel-open/nvidia-uvm/uvm_va_block.c +++ b/kernel-open/nvidia-uvm/uvm_va_block.c @@ -426,11 +426,13 @@ static uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page_resident(uvm_va_block_t return chunk; } -void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index) +void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block, + uvm_cpu_chunk_t *chunk, + int nid, + uvm_page_index_t page_index) { uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(va_block, nid); uvm_cpu_chunk_storage_mixed_t *mixed; - uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, nid, page_index); uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(va_block, chunk, page_index); size_t slot_index; uvm_cpu_chunk_t **chunks; @@ -765,7 +767,7 @@ static bool block_check_cpu_chunks(uvm_va_block_t *block) int nid; uvm_page_mask_t *temp_resident_mask; - temp_resident_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS | __GFP_ZERO); + temp_resident_mask = nv_kmem_cache_zalloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS); for_each_possible_uvm_node(nid) { uvm_cpu_chunk_t *chunk; @@ -827,16 +829,16 @@ void uvm_va_block_retry_deinit(uvm_va_block_retry_t *retry, uvm_va_block_t *va_b uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL); } + // HMM should have already moved allocated GPU chunks to the referenced + // state or freed them. + if (uvm_va_block_is_hmm(va_block)) + UVM_ASSERT(list_empty(&retry->used_chunks)); + // Unpin all the used chunks now that we are done list_for_each_entry_safe(gpu_chunk, next_chunk, &retry->used_chunks, list) { list_del_init(&gpu_chunk->list); gpu = uvm_gpu_chunk_get_gpu(gpu_chunk); - // HMM should have already moved allocated blocks to the referenced - // state so any left over were not migrated and should be freed. 
- if (uvm_va_block_is_hmm(va_block)) - uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL); - else - uvm_pmm_gpu_unpin_allocated(&gpu->pmm, gpu_chunk, va_block); + uvm_pmm_gpu_unpin_allocated(&gpu->pmm, gpu_chunk, va_block); } } @@ -1158,6 +1160,8 @@ static size_t block_gpu_chunk_index(uvm_va_block_t *block, UVM_ASSERT(gpu_state->chunks); chunk = gpu_state->chunks[index]; if (chunk) { + UVM_ASSERT(uvm_gpu_chunk_is_user(chunk)); + UVM_ASSERT(uvm_id_equal(uvm_gpu_id_from_index(chunk->gpu_index), gpu->id)); UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == size); UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED); UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_FREE); @@ -1385,10 +1389,7 @@ error: return status; } -// Retrieves the gpu_state for the given GPU. The returned pointer is -// internally managed and will be allocated (and freed) automatically, -// rather than by the caller. -static uvm_va_block_gpu_state_t *block_gpu_state_get_alloc(uvm_va_block_t *block, uvm_gpu_t *gpu) +uvm_va_block_gpu_state_t *uvm_va_block_gpu_state_get_alloc(uvm_va_block_t *block, uvm_gpu_t *gpu) { NV_STATUS status; uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); @@ -1420,22 +1421,6 @@ error: return NULL; } -NV_STATUS uvm_va_block_gpu_state_alloc(uvm_va_block_t *va_block) -{ - uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); - uvm_gpu_id_t gpu_id; - - UVM_ASSERT(uvm_va_block_is_hmm(va_block)); - uvm_assert_mutex_locked(&va_block->lock); - - for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpus) { - if (!block_gpu_state_get_alloc(va_block, uvm_gpu_get(gpu_id))) - return NV_ERR_NO_MEMORY; - } - - return NV_OK; -} - void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *block, uvm_cpu_chunk_t *chunk) { @@ -1490,7 +1475,7 @@ void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_regio uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], chunk_region); uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], chunk_region); uvm_va_block_cpu_clear_resident_region(va_block, nid, chunk_region); - uvm_cpu_chunk_remove_from_block(va_block, nid, page_index); + uvm_cpu_chunk_remove_from_block(va_block, chunk, nid, page_index); uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk); uvm_cpu_chunk_free(chunk); } @@ -1586,26 +1571,6 @@ static NV_STATUS block_alloc_cpu_chunk(uvm_va_block_t *block, return status; } -// Same as block_alloc_cpu_chunk() but allocate a chunk suitable for use as -// a HMM destination page. The main difference is UVM does not own the reference -// on the struct page backing these chunks. -static NV_STATUS block_alloc_hmm_cpu_chunk(uvm_va_block_t *block, - uvm_chunk_sizes_mask_t cpu_allocation_sizes, - uvm_cpu_chunk_alloc_flags_t flags, - int nid, - uvm_cpu_chunk_t **chunk) -{ - NV_STATUS status; - - UVM_ASSERT(uvm_va_block_is_hmm(block)); - - status = block_alloc_cpu_chunk(block, cpu_allocation_sizes, flags, nid, chunk); - if (status == NV_OK) - (*chunk)->type = UVM_CPU_CHUNK_TYPE_HMM; - - return status; -} - // Find the largest allocation size we can use for the given page_index in the // given block. Returns the mask of possible sizes and region covered by the // largest. Callers may also elect to use a smaller size. 
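The devmem_page_free()/process_lazy_free() hunks earlier in this patch follow a common deferred-free pattern: the free callback runs in a context where only a spinlock may be taken, so the chunk is simply re-pinned and parked on a list, and a worker later drains the list and performs the heavyweight free. A compact userspace sketch of that queue, with a pthread mutex standing in for the spinlock and purely illustrative names:

/* Deferred-free queue: cheap enqueue from a restricted context,
 * expensive free done later by a worker. Illustrative only. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct chunk {
    int id;
    struct chunk *next;
};

static pthread_mutex_t lazy_lock = PTHREAD_MUTEX_INITIALIZER;
static struct chunk *lazy_free_head;

/* Called from the "cannot sleep, cannot take mutexes" path. */
static void page_free_callback(struct chunk *c)
{
    pthread_mutex_lock(&lazy_lock);
    c->next = lazy_free_head;
    lazy_free_head = c;
    pthread_mutex_unlock(&lazy_lock);
}

/* Called from a worker where blocking is allowed. */
static void process_lazy_free(void)
{
    for (;;) {
        pthread_mutex_lock(&lazy_lock);
        struct chunk *c = lazy_free_head;
        if (c)
            lazy_free_head = c->next;
        pthread_mutex_unlock(&lazy_lock);

        if (!c)
            break;

        /* The real free (unmap, merge, etc.) happens outside the lock. */
        printf("freeing chunk %d\n", c->id);
        free(c);
    }
}

int main(void)
{
    for (int i = 0; i < 3; i++) {
        struct chunk *c = malloc(sizeof(*c));
        c->id = i;
        page_free_callback(c);
    }
    process_lazy_free();
    return 0;
}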
@@ -1837,7 +1802,7 @@ static NV_STATUS block_add_cpu_chunk(uvm_va_block_t *block, status = uvm_va_block_map_cpu_chunk_on_gpus(block, chunk); if (status != NV_OK) { - uvm_cpu_chunk_remove_from_block(block, uvm_cpu_chunk_get_numa_node(chunk), page_index); + uvm_cpu_chunk_remove_from_block(block, chunk, uvm_cpu_chunk_get_numa_node(chunk), page_index); goto out; } } @@ -1859,10 +1824,9 @@ out: // is required for IOMMU support. Skipped on GPUs without access to CPU memory. // e.g., this happens when the Confidential Computing Feature is enabled. static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block, - uvm_page_mask_t *populate_page_mask, + const uvm_page_mask_t *populate_page_mask, uvm_va_block_region_t populate_region, - uvm_va_block_context_t *block_context, - bool staged) + uvm_va_block_context_t *block_context) { NV_STATUS status = NV_OK; uvm_cpu_chunk_t *chunk; @@ -1956,13 +1920,7 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block, if (!uvm_page_mask_region_full(resident_mask, region)) chunk_alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO; - // Management of a page used for a staged migration is never handed off - // to the kernel and is really just a driver managed page. Therefore - // don't allocate a HMM chunk in this case. - if (uvm_va_block_is_hmm(block) && !staged) - status = block_alloc_hmm_cpu_chunk(block, allocation_sizes, chunk_alloc_flags, preferred_nid, &chunk); - else - status = block_alloc_cpu_chunk(block, allocation_sizes, chunk_alloc_flags, preferred_nid, &chunk); + status = block_alloc_cpu_chunk(block, allocation_sizes, chunk_alloc_flags, preferred_nid, &chunk); if (status == NV_WARN_MORE_PROCESSING_REQUIRED) { alloc_flags &= ~UVM_CPU_CHUNK_ALLOC_FLAGS_STRICT; @@ -1973,7 +1931,8 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block, return status; } - // A smaller chunk than the maximum size may have been allocated, update the region accordingly. + // A smaller chunk than the maximum size may have been allocated, + // update the region accordingly. region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index); status = block_add_cpu_chunk(block, node_pages_mask, chunk, region); if (status != NV_OK) @@ -1981,50 +1940,14 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block, // Skip iterating over all pages covered by the allocated chunk. page_index = region.outer - 1; - -#if UVM_IS_CONFIG_HMM() - if (uvm_va_block_is_hmm(block) && block_context) - block_context->hmm.dst_pfns[page_index] = migrate_pfn(page_to_pfn(chunk->page)); -#endif } return NV_OK; } -// Note this clears the block_context caller_page_mask. 
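The block_populate_pages_cpu() hunk above retries the chunk allocation after clearing the STRICT flag when the preferred NUMA node cannot satisfy it. The same retry-with-relaxed-flags shape in a standalone form; the flag names and allocator below are stand-ins, not the UVM CPU chunk allocator:

/* Retry an allocation with progressively relaxed constraints. Stand-in names. */
#include <stdio.h>
#include <stdlib.h>

#define ALLOC_FLAG_STRICT_NODE 0x1   /* must come from the preferred node */
#define ALLOC_FLAG_ZERO        0x2

static void *alloc_chunk(unsigned flags, int preferred_node)
{
    (void)preferred_node;
    /* Pretend the preferred node is out of memory. */
    if (flags & ALLOC_FLAG_STRICT_NODE)
        return NULL;
    return calloc(1, 4096);
}

static void *alloc_chunk_with_fallback(unsigned flags, int preferred_node)
{
    void *chunk = alloc_chunk(flags, preferred_node);

    /* First failure: relax the placement constraint and try again. */
    if (!chunk && (flags & ALLOC_FLAG_STRICT_NODE))
        chunk = alloc_chunk(flags & ~ALLOC_FLAG_STRICT_NODE, preferred_node);

    return chunk;
}

int main(void)
{
    void *chunk = alloc_chunk_with_fallback(ALLOC_FLAG_STRICT_NODE | ALLOC_FLAG_ZERO, 0);
    printf("allocation %s\n", chunk ? "succeeded after fallback" : "failed");
    free(chunk);
    return 0;
}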
NV_STATUS uvm_va_block_populate_page_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_va_block_context_t *block_context) { - uvm_page_mask_t *page_mask = &block_context->caller_page_mask; - NV_STATUS status = NV_OK; - - uvm_page_mask_zero(page_mask); - uvm_page_mask_set(page_mask, page_index); - - if (uvm_va_block_is_hmm(va_block)) { - const uvm_va_policy_t *policy; - uvm_va_block_region_t region; - uvm_va_policy_node_t *node; - - uvm_for_each_va_policy_in(policy, va_block, va_block->start, va_block->end, node, region) { - status = block_populate_pages_cpu(va_block, - page_mask, - region, - block_context, - false); - - if (status != NV_OK) - break; - } - } - else { - status = block_populate_pages_cpu(va_block, - page_mask, - uvm_va_block_region_from_block(va_block), - block_context, - false); - } - - return status; + return block_populate_pages_cpu(va_block, NULL, uvm_va_block_region_for_page(page_index), block_context); } // Try allocating a chunk. If eviction was required, @@ -2413,7 +2336,7 @@ static uvm_page_mask_t *block_resident_mask_get_alloc(uvm_va_block_t *block, uvm if (UVM_ID_IS_CPU(processor)) return uvm_va_block_resident_mask_get(block, processor, nid); - gpu_state = block_gpu_state_get_alloc(block, uvm_gpu_get(processor)); + gpu_state = uvm_va_block_gpu_state_get_alloc(block, uvm_gpu_get(processor)); if (!gpu_state) return NULL; @@ -2453,9 +2376,15 @@ void uvm_va_block_unmapped_pages_get(uvm_va_block_t *va_block, return; } + uvm_page_mask_zero(out_mask); uvm_page_mask_region_fill(out_mask, region); - for_each_id_in_mask(id, &va_block->mapped) { + // UVM-HMM doesn't always know when CPU pages are mapped or not since there + // is no notification when CPU page tables are upgraded. If the page is + // resident, assume the CPU has some mapping. + uvm_page_mask_andnot(out_mask, out_mask, uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE)); + + for_each_gpu_id_in_mask(id, &va_block->mapped) { uvm_page_mask_andnot(out_mask, out_mask, uvm_va_block_map_mask_get(va_block, id)); } } @@ -2951,7 +2880,7 @@ static NV_STATUS block_populate_gpu_chunk(uvm_va_block_t *block, size_t chunk_index, uvm_va_block_region_t chunk_region) { - uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu); + uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get_alloc(block, gpu); uvm_gpu_chunk_t *chunk = NULL; uvm_chunk_size_t chunk_size = uvm_va_block_region_size(chunk_region); uvm_va_block_test_t *block_test = uvm_va_block_get_test(block); @@ -3005,8 +2934,10 @@ static NV_STATUS block_populate_gpu_chunk(uvm_va_block_t *block, } // Record the used chunk so that it can be unpinned at the end of the whole - // operation. + // operation. HMM chunks are unpinned after a successful migration. block_retry_add_used_chunk(retry, chunk); + + chunk->va_block = block; gpu_state->chunks[chunk_index] = chunk; return NV_OK; @@ -3023,12 +2954,13 @@ chunk_free: } // Populate all chunks which cover the given region and page mask. 
-static NV_STATUS block_populate_pages_gpu(uvm_va_block_t *block, +NV_STATUS uvm_va_block_populate_pages_gpu(uvm_va_block_t *block, uvm_va_block_retry_t *retry, - uvm_gpu_t *gpu, + uvm_gpu_id_t gpu_id, uvm_va_block_region_t region, const uvm_page_mask_t *populate_mask) { + uvm_gpu_t *gpu = uvm_gpu_get(gpu_id); uvm_va_block_region_t chunk_region, check_region; size_t chunk_index; uvm_page_index_t page_index; @@ -3105,7 +3037,7 @@ static NV_STATUS block_populate_pages(uvm_va_block_t *block, if (!tmp_processor_mask) return NV_ERR_NO_MEMORY; - status = block_populate_pages_gpu(block, retry, uvm_gpu_get(dest_id), region, populate_page_mask); + status = uvm_va_block_populate_pages_gpu(block, retry, dest_id, region, populate_page_mask); if (status != NV_OK) { uvm_processor_mask_cache_free(tmp_processor_mask); return status; @@ -3150,7 +3082,7 @@ static NV_STATUS block_populate_pages(uvm_va_block_t *block, } uvm_memcg_context_start(&memcg_context, block_context->mm); - status = block_populate_pages_cpu(block, cpu_populate_mask, region, block_context, UVM_ID_IS_GPU(dest_id)); + status = block_populate_pages_cpu(block, cpu_populate_mask, region, block_context); uvm_memcg_context_end(&memcg_context); return status; } @@ -4180,7 +4112,7 @@ static NV_STATUS block_copy_resident_pages_between(uvm_va_block_t *block, // Ensure that there is GPU state that can be used for CPU-to-CPU copies if (UVM_ID_IS_CPU(dst_id) && uvm_id_equal(src_id, dst_id)) { - uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, copying_gpu); + uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get_alloc(block, copying_gpu); if (!gpu_state) { status = NV_ERR_NO_MEMORY; break; @@ -4841,6 +4773,7 @@ static void block_cleanup_temp_pinned_gpu_chunks(uvm_va_block_t *va_block, uvm_g // block_populate_pages above. Release them since the copy // failed and they won't be mapped to userspace. if (chunk && chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) { + list_del_init(&chunk->list); uvm_mmu_chunk_unmap(chunk, &va_block->tracker); uvm_pmm_gpu_free(&gpu->pmm, chunk, &va_block->tracker); gpu_state->chunks[i] = NULL; @@ -4935,7 +4868,8 @@ NV_STATUS uvm_va_block_make_resident_copy(uvm_va_block_t *va_block, prefetch_page_mask, UVM_VA_BLOCK_TRANSFER_MODE_MOVE); - if (status != NV_OK) { + // HMM does its own clean up. + if (status != NV_OK && !uvm_va_block_is_hmm(va_block)) { if (UVM_ID_IS_GPU(dest_id)) block_cleanup_temp_pinned_gpu_chunks(va_block, dest_id); @@ -7891,7 +7825,7 @@ static NV_STATUS block_pre_populate_pde1_gpu(uvm_va_block_t *block, gpu = gpu_va_space->gpu; big_page_size = gpu_va_space->page_tables.big_page_size; - gpu_state = block_gpu_state_get_alloc(block, gpu); + gpu_state = uvm_va_block_gpu_state_get_alloc(block, gpu); if (!gpu_state) return NV_ERR_NO_MEMORY; @@ -8604,12 +8538,12 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block, gpu = uvm_gpu_get(id); - // Although this GPU UUID is registered in the VA space, it might not have a - // GPU VA space registered. + // Although this GPU UUID is registered in the VA space, it might not + // have a GPU VA space registered. 
if (!uvm_gpu_va_space_get(va_space, gpu)) return NV_OK; - gpu_state = block_gpu_state_get_alloc(va_block, gpu); + gpu_state = uvm_va_block_gpu_state_get_alloc(va_block, gpu); if (!gpu_state) return NV_ERR_NO_MEMORY; @@ -9608,7 +9542,7 @@ static void block_kill(uvm_va_block_t *block) if (!uvm_va_block_is_hmm(block)) uvm_cpu_chunk_mark_dirty(chunk, 0); - uvm_cpu_chunk_remove_from_block(block, nid, page_index); + uvm_cpu_chunk_remove_from_block(block, chunk, nid, page_index); uvm_cpu_chunk_free(chunk); } @@ -9672,13 +9606,12 @@ void uvm_va_block_kill(uvm_va_block_t *va_block) static void block_gpu_release_region(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_va_block_gpu_state_t *gpu_state, - uvm_page_mask_t *page_mask, uvm_va_block_region_t region) { uvm_page_index_t page_index; uvm_gpu_t *gpu = uvm_gpu_get(gpu_id); - for_each_va_block_page_in_region_mask(page_index, page_mask, region) { + for_each_va_block_page_in_region(page_index, region) { size_t chunk_index = block_gpu_chunk_index(va_block, gpu, page_index, NULL); uvm_gpu_chunk_t *gpu_chunk = gpu_state->chunks[chunk_index]; @@ -9723,7 +9656,7 @@ void uvm_va_block_munmap_region(uvm_va_block_t *va_block, uvm_processor_mask_clear(&va_block->evicted_gpus, gpu_id); if (gpu_state->chunks) { - block_gpu_release_region(va_block, gpu_id, gpu_state, NULL, region); + block_gpu_release_region(va_block, gpu_id, gpu_state, region); // TODO: bug 3660922: Need to update the read duplicated pages mask // when read duplication is supported for HMM. @@ -10294,7 +10227,7 @@ static NV_STATUS block_split_preallocate_no_retry(uvm_va_block_t *existing, uvm_ if (status != NV_OK) goto error; - if (!block_gpu_state_get_alloc(new, gpu)) { + if (!uvm_va_block_gpu_state_get_alloc(new, gpu)) { status = NV_ERR_NO_MEMORY; goto error; } @@ -10468,7 +10401,7 @@ static void block_split_cpu(uvm_va_block_t *existing, uvm_va_block_t *new) uvm_page_index_t new_chunk_page_index; NV_STATUS status; - uvm_cpu_chunk_remove_from_block(existing, nid, page_index); + uvm_cpu_chunk_remove_from_block(existing, chunk, nid, page_index); // The chunk has to be adjusted for the new block before inserting it. new_chunk_page_index = page_index - split_page_index; @@ -13067,7 +13000,7 @@ out: static NV_STATUS block_gpu_force_4k_ptes(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu) { - uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu); + uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get_alloc(block, gpu); uvm_push_t push; NV_STATUS status; diff --git a/kernel-open/nvidia-uvm/uvm_va_block.h b/kernel-open/nvidia-uvm/uvm_va_block.h index 180e2114a..5d53bcc6b 100644 --- a/kernel-open/nvidia-uvm/uvm_va_block.h +++ b/kernel-open/nvidia-uvm/uvm_va_block.h @@ -1339,9 +1339,11 @@ NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id, uvm_va_block_t *va_block, uvm_service_block_context_t *service_context); -// Allocate GPU state for the given va_block and registered GPUs. +// Returns the gpu_state for the given GPU. The returned pointer is +// internally managed and will be allocated (and freed) automatically, +// rather than by the caller. Returns NULL if there is no memory. // Locking: The block lock must be held. -NV_STATUS uvm_va_block_gpu_state_alloc(uvm_va_block_t *va_block); +uvm_va_block_gpu_state_t *uvm_va_block_gpu_state_get_alloc(uvm_va_block_t *va_block, uvm_gpu_t *gpu); // Release any GPU or policy data associated with the given region in response // to munmap(). 
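The uvm_va_block_unmapped_pages_get() hunk earlier in this patch now computes the result as: fill the region, subtract pages the CPU has resident (assumed mapped, since HMM gets no notification of CPU mapping upgrades), then subtract every GPU's map mask. With a 64-bit word standing in for uvm_page_mask_t, the arithmetic looks like this (a sketch under that assumption, not the UVM types):

/* Bitmask sketch of the unmapped-pages computation. */
#include <inttypes.h>
#include <stdio.h>

#define NUM_GPUS 2

/* Fill bits [first, outer) of a 64-page mask. Valid for 0 <= first < outer < 64. */
static uint64_t region_fill(unsigned first, unsigned outer)
{
    return ((UINT64_C(1) << outer) - 1) & ~((UINT64_C(1) << first) - 1);
}

int main(void)
{
    uint64_t cpu_resident = UINT64_C(0x0000000000ff0000);   /* treated as mapped */
    uint64_t gpu_mapped[NUM_GPUS] = { UINT64_C(0x00000000000000f0),
                                      UINT64_C(0x000000000000f000) };

    uint64_t unmapped = region_fill(0, 32);     /* start from the whole region     */
    unmapped &= ~cpu_resident;                  /* drop CPU-resident pages         */
    for (int i = 0; i < NUM_GPUS; i++)
        unmapped &= ~gpu_mapped[i];             /* drop each GPU's mapped pages    */

    printf("unmapped pages mask: 0x%016" PRIx64 "\n", unmapped);
    return 0;
}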
@@ -2113,10 +2115,13 @@ bool uvm_va_block_cpu_is_region_resident_on(uvm_va_block_t *va_block, int nid, u // Locking: The va_block lock must be held. NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index); -// Remove a CPU chunk at the given page_index from the va_block. +// Remove the given CPU chunk at the given page_index from the va_block. // nid cannot be NUMA_NO_NODE. // Locking: The va_block lock must be held. -void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index); +void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block, + uvm_cpu_chunk_t *chunk, + int nid, + uvm_page_index_t page_index); // Return the CPU chunk at the given page_index on the given NUMA node from the // va_block. nid cannot be NUMA_NO_NODE. @@ -2249,6 +2254,13 @@ NV_STATUS uvm_va_block_populate_page_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_va_block_context_t *block_context); +// Populate all GPU chunks which cover the given region and page mask. +NV_STATUS uvm_va_block_populate_pages_gpu(uvm_va_block_t *block, + uvm_va_block_retry_t *retry, + uvm_gpu_id_t gpu_id, + uvm_va_block_region_t region, + const uvm_page_mask_t *populate_mask); + // A helper macro for handling allocation-retry // // The macro takes a VA block, uvm_va_block_retry_t struct and a function call diff --git a/kernel-open/nvidia/nv.c b/kernel-open/nvidia/nv.c index 07f842a1a..568560540 100644 --- a/kernel-open/nvidia/nv.c +++ b/kernel-open/nvidia/nv.c @@ -2497,8 +2497,13 @@ nvidia_ioctl( NV_CTL_DEVICE_ONLY(nv); - if (num_arg_gpus == 0 || nvlfp->num_attached_gpus != 0 || - arg_size % sizeof(NvU32) != 0) + if ((num_arg_gpus == 0) || (arg_size % sizeof(NvU32) != 0)) + { + status = -EINVAL; + goto done; + } + + if (nvlfp->num_attached_gpus != 0) { status = -EINVAL; goto done; @@ -2527,6 +2532,7 @@ nvidia_ioctl( if (nvlfp->attached_gpus[i] != 0) nvidia_dev_put(nvlfp->attached_gpus[i], sp); } + NV_KFREE(nvlfp->attached_gpus, arg_size); nvlfp->num_attached_gpus = 0; diff --git a/src/common/displayport/inc/dp_connectorimpl.h b/src/common/displayport/inc/dp_connectorimpl.h index 2219dcc30..b4cfd9410 100644 --- a/src/common/displayport/inc/dp_connectorimpl.h +++ b/src/common/displayport/inc/dp_connectorimpl.h @@ -260,6 +260,12 @@ namespace DisplayPort // Flag to check if the system is UEFI. bool bIsUefiSystem; + // + // Flag to ensure we take into account that + // Displayport++ supports HDMI as well. + // + bool bHDMIOnDPPlusPlus; + bool bSkipResetLinkStateDuringPlug; // Flag to check if LT should be skipped. diff --git a/src/common/displayport/inc/dp_regkeydatabase.h b/src/common/displayport/inc/dp_regkeydatabase.h index 88641362b..f74609810 100644 --- a/src/common/displayport/inc/dp_regkeydatabase.h +++ b/src/common/displayport/inc/dp_regkeydatabase.h @@ -110,6 +110,8 @@ #define NV_DP_REGKEY_SKIP_SETTING_LINK_STATE_DURING_UNPLUG "DP_SKIP_SETTING_LINK_STATE_DURING_UNPLUG" +// This regkey ensures DPLib takes into account Displayport++ supports HDMI. +#define NV_DP_REGKEY_HDMI_ON_DP_PLUS_PLUS "HDMI_ON_DP_PLUS_PLUS" // Data Base used to store all the regkey values. // The actual data base is declared statically in dp_evoadapter.cpp. 
@@ -154,6 +156,7 @@ struct DP_REGKEY_DATABASE bool bEnableLowerBppCheckForDsc; bool bSkipSettingLinkStateDuringUnplug; bool bEnableDevId; + bool bHDMIOnDPPlusPlus; }; extern struct DP_REGKEY_DATABASE dpRegkeyDatabase; diff --git a/src/common/displayport/src/dp_configcaps2x.cpp b/src/common/displayport/src/dp_configcaps2x.cpp index 2f83c8920..4ad3b35ad 100644 --- a/src/common/displayport/src/dp_configcaps2x.cpp +++ b/src/common/displayport/src/dp_configcaps2x.cpp @@ -151,6 +151,16 @@ void DPCDHALImpl2x::parseAndReadCaps() DPCDHALImpl::parseAndReadCaps(); + // reset DP tunneling UHBR caps + caps2x.dpInTunnelingCaps.bUHBR_10GSupported = NV_FALSE; + caps2x.dpInTunnelingCaps.bUHBR_13_5GSupported = NV_FALSE; + caps2x.dpInTunnelingCaps.bUHBR_20GSupported = NV_FALSE; + + // reset CableCaps + caps2x.cableCaps.bUHBR_10GSupported = NV_TRUE; + caps2x.cableCaps.bUHBR_13_5GSupported = NV_TRUE; + caps2x.cableCaps.bUHBR_20GSupported = NV_TRUE; + // 02206h if (AuxRetry::ack == bus.read(NV_DPCD14_EXTENDED_MAIN_LINK_CHANNEL_CODING, &buffer[0], 1)) { diff --git a/src/common/displayport/src/dp_connectorimpl.cpp b/src/common/displayport/src/dp_connectorimpl.cpp index 106ed709f..c36916afe 100644 --- a/src/common/displayport/src/dp_connectorimpl.cpp +++ b/src/common/displayport/src/dp_connectorimpl.cpp @@ -199,6 +199,7 @@ void ConnectorImpl::applyRegkeyOverrides(const DP_REGKEY_DATABASE& dpRegkeyDatab this->bEnableLowerBppCheckForDsc = dpRegkeyDatabase.bEnableLowerBppCheckForDsc; this->bSkipSettingLinkStateDuringUnplug = dpRegkeyDatabase.bSkipSettingLinkStateDuringUnplug; this->bEnableDevId = dpRegkeyDatabase.bEnableDevId; + this->bHDMIOnDPPlusPlus = dpRegkeyDatabase.bHDMIOnDPPlusPlus; } void ConnectorImpl::setPolicyModesetOrderMitigation(bool enabled) @@ -412,7 +413,16 @@ void ConnectorImpl::processNewDevice(const DiscoveryManager::Device & device, { case DISPLAY_PORT: case DISPLAY_PORT_PLUSPLUS: // DP port that supports DP and TMDS - connector = connectorDisplayPort; + if (bHDMIOnDPPlusPlus && + existingDev && + existingDev->connectorType == connectorHDMI) + { + connector = connectorHDMI; + } + else + { + connector = connectorDisplayPort; + } break; case ANALOG_VGA: diff --git a/src/common/displayport/src/dp_evoadapter.cpp b/src/common/displayport/src/dp_evoadapter.cpp index 1076e9359..6d759f933 100644 --- a/src/common/displayport/src/dp_evoadapter.cpp +++ b/src/common/displayport/src/dp_evoadapter.cpp @@ -108,7 +108,8 @@ const struct {NV_DP_REGKEY_FORCE_HEAD_SHUTDOWN, &dpRegkeyDatabase.bForceHeadShutdown, DP_REG_VAL_BOOL}, {NV_DP_REGKEY_ENABLE_LOWER_BPP_CHECK_FOR_DSC, &dpRegkeyDatabase.bEnableLowerBppCheckForDsc, DP_REG_VAL_BOOL}, {NV_DP_REGKEY_SKIP_SETTING_LINK_STATE_DURING_UNPLUG, &dpRegkeyDatabase.bSkipSettingLinkStateDuringUnplug, DP_REG_VAL_BOOL}, - {NV_DP_REGKEY_EXPOSE_DSC_DEVID_WAR, &dpRegkeyDatabase.bEnableDevId, DP_REG_VAL_BOOL} + {NV_DP_REGKEY_EXPOSE_DSC_DEVID_WAR, &dpRegkeyDatabase.bEnableDevId, DP_REG_VAL_BOOL}, + {NV_DP_REGKEY_HDMI_ON_DP_PLUS_PLUS, &dpRegkeyDatabase.bHDMIOnDPPlusPlus, DP_REG_VAL_BOOL} }; EvoMainLink::EvoMainLink(EvoInterface * provider, Timer * timer) : diff --git a/src/common/inc/nvBldVer.h b/src/common/inc/nvBldVer.h index 028624600..9c4a387a0 100644 --- a/src/common/inc/nvBldVer.h +++ b/src/common/inc/nvBldVer.h @@ -43,18 +43,18 @@ #endif #if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS) -#define NV_BUILD_BRANCH_VERSION "rel/gpu_drv/r570/r570_00-540" -#define NV_BUILD_CHANGELIST_NUM (36324750) +#define NV_BUILD_BRANCH_VERSION "rel/gpu_drv/r570/r570_00-575" +#define 
NV_BUILD_CHANGELIST_NUM (36467544) #define NV_BUILD_TYPE "Official" -#define NV_BUILD_NAME "rel/gpu_drv/r570/r570_00-540" -#define NV_LAST_OFFICIAL_CHANGELIST_NUM (36324750) +#define NV_BUILD_NAME "rel/gpu_drv/r570/r570_00-575" +#define NV_LAST_OFFICIAL_CHANGELIST_NUM (36467544) #else /* Windows builds */ -#define NV_BUILD_BRANCH_VERSION "r570_00-536" -#define NV_BUILD_CHANGELIST_NUM (36324750) +#define NV_BUILD_BRANCH_VERSION "r570_00-569" +#define NV_BUILD_CHANGELIST_NUM (36467544) #define NV_BUILD_TYPE "Official" -#define NV_BUILD_NAME "573.65" -#define NV_LAST_OFFICIAL_CHANGELIST_NUM (36324750) +#define NV_BUILD_NAME "573.73" +#define NV_LAST_OFFICIAL_CHANGELIST_NUM (36467544) #define NV_BUILD_BRANCH_BASE_VERSION R570 #endif // End buildmeister python edited section diff --git a/src/common/inc/nvUnixVersion.h b/src/common/inc/nvUnixVersion.h index 089c722f5..cd6960628 100644 --- a/src/common/inc/nvUnixVersion.h +++ b/src/common/inc/nvUnixVersion.h @@ -4,7 +4,7 @@ #if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS) || defined(NV_VMWARE) || defined(NV_QNX) || defined(NV_INTEGRITY) || \ (defined(RMCFG_FEATURE_PLATFORM_GSP) && RMCFG_FEATURE_PLATFORM_GSP == 1) -#define NV_VERSION_STRING "570.181" +#define NV_VERSION_STRING "570.190" #else diff --git a/src/common/sdk/nvidia/inc/ctrl/ctrl208f/ctrl208ffb.h b/src/common/sdk/nvidia/inc/ctrl/ctrl208f/ctrl208ffb.h index 41825448b..b528cd4db 100644 --- a/src/common/sdk/nvidia/inc/ctrl/ctrl208f/ctrl208ffb.h +++ b/src/common/sdk/nvidia/inc/ctrl/ctrl208f/ctrl208ffb.h @@ -724,4 +724,25 @@ typedef struct NV208F_CTRL_FB_CONVERT_CHANNEL_PARAMS { #define NV208F_CTRL_FB_CHANNEL_CONVERSION_TYPE_LOGICAL_TO_PHYSICAL (0x00000000U) #define NV208F_CTRL_FB_CHANNEL_CONVERSION_TYPE_PHYSICAL_TO_LOGICAL (0x00000001U) + +/* + * NV208F_CTRL_CMD_FB_SET_ROW_REMAP_FAILURE_FLAG + * + * This command sets the status of row remap failure flag to the passed value. 
+ * + * value + * The value to set for row remap failure flag + * + * Possible status values returned are: + * NV_OK + * NV_ERR_NOT_SUPPORTED + */ +#define NV208F_CTRL_CMD_FB_SET_ROW_REMAP_FAILURE_FLAG (0x208f051cU) /* finn: Evaluated from "(FINN_NV20_SUBDEVICE_DIAG_FB_INTERFACE_ID << 8) | NV208F_CTRL_FB_SET_ROW_REMAP_FAILURE_FLAG_PARAMS_MESSAGE_ID" */ + +#define NV208F_CTRL_FB_SET_ROW_REMAP_FAILURE_FLAG_PARAMS_MESSAGE_ID (0x1cU) + +typedef struct NV208F_CTRL_FB_SET_ROW_REMAP_FAILURE_FLAG_PARAMS { + NvBool value; +} NV208F_CTRL_FB_SET_ROW_REMAP_FAILURE_FLAG_PARAMS; + /* _ctrl208ffb_h_ */ diff --git a/src/common/uproc/os/libos-v3.1.0/lib/liblogdecode.c b/src/common/uproc/os/libos-v3.1.0/lib/liblogdecode.c index 75e37ba86..11d17e416 100644 --- a/src/common/uproc/os/libos-v3.1.0/lib/liblogdecode.c +++ b/src/common/uproc/os/libos-v3.1.0/lib/liblogdecode.c @@ -1505,25 +1505,25 @@ NvBool isLibosPreserveLogBufferFull(LIBOS_LOG_DECODE *pLogDecode, NvU32 gpuInsta { NvU64 i = (NvU32)(pLogDecode->numLogBuffers); NvU32 tag = LIBOS_LOG_NVLOG_BUFFER_TAG(pLogDecode->sourceName, i * 2); - NVLOG_BUFFER_HANDLE handle = 0; - NV_STATUS status = nvlogGetBufferHandleFromTag(tag, &handle); - if (status != NV_OK) + // + // Cannot use nvlogGetBufferHandleFromTag here since in multi GPU case, + // we can have multiple buffers with exact same tag, only differentiable + // from gpuInstance + // + for (i = 0; i < NVLOG_MAX_BUFFERS; i++) { - return NV_FALSE; - } - - NVLOG_BUFFER *pNvLogBuffer = NvLogLogger.pBuffers[handle]; - if (pNvLogBuffer == NULL) - { - return NV_FALSE; - } - - if (FLD_TEST_DRF(LOG_BUFFER, _FLAGS, _PRESERVE, _YES, pNvLogBuffer->flags) && - DRF_VAL(LOG, _BUFFER_FLAGS, _GPU_INSTANCE, pNvLogBuffer->flags) == gpuInstance && - (pNvLogBuffer->pos >= pNvLogBuffer->size - NV_OFFSETOF(LIBOS_LOG_NVLOG_BUFFER, data) - sizeof(NvU64))) - { - return NV_TRUE; + if (NvLogLogger.pBuffers[i] != NULL) + { + NVLOG_BUFFER *pNvLogBuffer = NvLogLogger.pBuffers[i]; + if ((pNvLogBuffer->tag == tag) && + (DRF_VAL(LOG, _BUFFER_FLAGS, _GPU_INSTANCE, pNvLogBuffer->flags) == gpuInstance) && + FLD_TEST_DRF(LOG_BUFFER, _FLAGS, _PRESERVE, _YES, pNvLogBuffer->flags) && + (pNvLogBuffer->pos >= pNvLogBuffer->size - NV_OFFSETOF(LIBOS_LOG_NVLOG_BUFFER, data) - sizeof(NvU64))) + { + return NV_TRUE; + } + } } return NV_FALSE; @@ -1531,19 +1531,27 @@ NvBool isLibosPreserveLogBufferFull(LIBOS_LOG_DECODE *pLogDecode, NvU32 gpuInsta static NvBool findPreservedNvlogBuffer(NvU32 tag, NvU32 gpuInstance, NVLOG_BUFFER_HANDLE *pHandle) { - NVLOG_BUFFER_HANDLE handle = 0; - NV_STATUS status = nvlogGetBufferHandleFromTag(tag, &handle); + NvU64 i; - if (status != NV_OK) - return NV_FALSE; - - NVLOG_BUFFER *pNvLogBuffer = NvLogLogger.pBuffers[handle]; - if (FLD_TEST_DRF(LOG_BUFFER, _FLAGS, _PRESERVE, _YES, pNvLogBuffer->flags) && - DRF_VAL(LOG, _BUFFER_FLAGS, _GPU_INSTANCE, pNvLogBuffer->flags) == gpuInstance && - (pNvLogBuffer->pos < pNvLogBuffer->size - NV_OFFSETOF(LIBOS_LOG_NVLOG_BUFFER, data) - sizeof(NvU64))) + // + // Cannot use nvlogGetBufferHandleFromTag here since in multi GPU case, + // we can have multiple buffers with exact same tag, only differentiable + // from gpuInstance + // + for (i = 0; i < NVLOG_MAX_BUFFERS; i++) { - *pHandle = handle; - return NV_TRUE; + if (NvLogLogger.pBuffers[i] != NULL) + { + NVLOG_BUFFER *pNvLogBuffer = NvLogLogger.pBuffers[i]; + if ((pNvLogBuffer->tag == tag) && + (DRF_VAL(LOG, _BUFFER_FLAGS, _GPU_INSTANCE, pNvLogBuffer->flags) == gpuInstance) && + FLD_TEST_DRF(LOG_BUFFER, _FLAGS, _PRESERVE, _YES, 
pNvLogBuffer->flags) && + (pNvLogBuffer->pos < pNvLogBuffer->size - NV_OFFSETOF(LIBOS_LOG_NVLOG_BUFFER, data) - sizeof(NvU64))) + { + *pHandle = i; + return NV_TRUE; + } + } } return NV_FALSE; diff --git a/src/nvidia/arch/nvalloc/unix/include/osapi.h b/src/nvidia/arch/nvalloc/unix/include/osapi.h index 83029cb4a..0411e9bf5 100644 --- a/src/nvidia/arch/nvalloc/unix/include/osapi.h +++ b/src/nvidia/arch/nvalloc/unix/include/osapi.h @@ -176,6 +176,8 @@ void RmUpdateGc6ConsoleRefCount (nv_state_t *, NvBool); NvBool rm_get_uefi_console_status (nv_state_t *); NvU64 rm_get_uefi_console_size (nv_state_t *, NvU64 *); +void rm_check_s0ix_regkey_and_platform_support(void); + RM_API *RmUnixRmApiPrologue (nv_state_t *, THREAD_STATE_NODE *, NvU32 module); void RmUnixRmApiEpilogue (nv_state_t *, THREAD_STATE_NODE *); diff --git a/src/nvidia/arch/nvalloc/unix/src/dynamic-power.c b/src/nvidia/arch/nvalloc/unix/src/dynamic-power.c index d7929ba73..80ff9b9fa 100644 --- a/src/nvidia/arch/nvalloc/unix/src/dynamic-power.c +++ b/src/nvidia/arch/nvalloc/unix/src/dynamic-power.c @@ -747,6 +747,16 @@ rmReadAndParseDynamicPowerRegkey } #undef NV_PMC_BOOT_42_CHIP_ID_GA102 +void rm_check_s0ix_regkey_and_platform_support(void) +{ + OBJSYS *pSys = SYS_GET_INSTANCE(); + NvU32 data; + NvBool status = ((nv_platform_supports_s0ix()) && + ((osReadRegistryDword(NULL, NV_REG_ENABLE_S0IX_POWER_MANAGEMENT, &data) == NV_OK) && (data == 1))); + + pSys->setProperty(pSys, PDB_PROP_SYS_SUPPORTS_S0IX, status); +} + /*! * @brief Initialize state related to dynamic power management. * Called once per GPU during driver initialization. diff --git a/src/nvidia/arch/nvalloc/unix/src/osinit.c b/src/nvidia/arch/nvalloc/unix/src/osinit.c index b9437d37a..453d64d2c 100644 --- a/src/nvidia/arch/nvalloc/unix/src/osinit.c +++ b/src/nvidia/arch/nvalloc/unix/src/osinit.c @@ -272,6 +272,8 @@ NV_STATUS osRmInitRm(void) return status; } + rm_check_s0ix_regkey_and_platform_support(); + // Setup any ThreadState defaults threadStateInitSetupFlags(THREAD_STATE_SETUP_FLAGS_ENABLED | THREAD_STATE_SETUP_FLAGS_TIMEOUT_ENABLED | diff --git a/src/nvidia/generated/g_engines_pb.c b/src/nvidia/generated/g_engines_pb.c index ea90f57a6..ff2fc5b94 100644 --- a/src/nvidia/generated/g_engines_pb.c +++ b/src/nvidia/generated/g_engines_pb.c @@ -368,6 +368,18 @@ const PRB_FIELD_DESC prb_fields_nvdebug_eng_kgsp_rpcinfo[] = { PRB_MAYBE_FIELD_NAME("data1") PRB_MAYBE_FIELD_DEFAULT(0) }, + { + 6, + { + PRB_OPTIONAL, + PRB_UINT32, + 0, + }, + 0, + 0, + PRB_MAYBE_FIELD_NAME("sequence") + PRB_MAYBE_FIELD_DEFAULT(0) + }, }; // Message descriptors @@ -403,7 +415,7 @@ const PRB_MSG_DESC prb_messages_nvdebug_eng[] = { PRB_MAYBE_MESSAGE_NAME("NvDebug.Eng.Mc.PciBarInfo") }, { - 5, + 6, prb_fields_nvdebug_eng_kgsp_rpcinfo, PRB_MAYBE_MESSAGE_NAME("NvDebug.Eng.KGsp.RpcInfo") }, diff --git a/src/nvidia/generated/g_engines_pb.h b/src/nvidia/generated/g_engines_pb.h index 0fb4a58f3..c5a5b4818 100644 --- a/src/nvidia/generated/g_engines_pb.h +++ b/src/nvidia/generated/g_engines_pb.h @@ -21,10 +21,10 @@ extern const PRB_MSG_DESC prb_messages_nvdebug_eng[]; #define NVDEBUG_ENG_MC_LEN 66 #define NVDEBUG_ENG_GPU_LEN 62 #define NVDEBUG_ENG_NVD_LEN 30 -#define NVDEBUG_ENG_KGSP_LEN 88 +#define NVDEBUG_ENG_KGSP_LEN 100 #define NVDEBUG_ENG_MC_RMDATA_LEN 6 #define NVDEBUG_ENG_MC_PCIBARINFO_LEN 22 -#define NVDEBUG_ENG_KGSP_RPCINFO_LEN 40 +#define NVDEBUG_ENG_KGSP_RPCINFO_LEN 46 extern const PRB_FIELD_DESC prb_fields_nvdebug_eng_mc[]; @@ -85,8 +85,8 @@ extern const PRB_FIELD_DESC 
prb_fields_nvdebug_eng_kgsp[]; #define NVDEBUG_ENG_KGSP_EVENT_HISTORY (&prb_fields_nvdebug_eng_kgsp[1]) // 'KGsp' field lengths -#define NVDEBUG_ENG_KGSP_RPC_HISTORY_LEN 43 -#define NVDEBUG_ENG_KGSP_EVENT_HISTORY_LEN 43 +#define NVDEBUG_ENG_KGSP_RPC_HISTORY_LEN 49 +#define NVDEBUG_ENG_KGSP_EVENT_HISTORY_LEN 49 extern const PRB_FIELD_DESC prb_fields_nvdebug_eng_mc_rmdata[]; @@ -114,6 +114,7 @@ extern const PRB_FIELD_DESC prb_fields_nvdebug_eng_kgsp_rpcinfo[]; #define NVDEBUG_ENG_KGSP_RPCINFO_TS_END (&prb_fields_nvdebug_eng_kgsp_rpcinfo[2]) #define NVDEBUG_ENG_KGSP_RPCINFO_DATA0 (&prb_fields_nvdebug_eng_kgsp_rpcinfo[3]) #define NVDEBUG_ENG_KGSP_RPCINFO_DATA1 (&prb_fields_nvdebug_eng_kgsp_rpcinfo[4]) +#define NVDEBUG_ENG_KGSP_RPCINFO_SEQUENCE (&prb_fields_nvdebug_eng_kgsp_rpcinfo[5]) // 'RpcInfo' field lengths #define NVDEBUG_ENG_KGSP_RPCINFO_FUNCTION_LEN 5 @@ -121,6 +122,7 @@ extern const PRB_FIELD_DESC prb_fields_nvdebug_eng_kgsp_rpcinfo[]; #define NVDEBUG_ENG_KGSP_RPCINFO_TS_END_LEN 10 #define NVDEBUG_ENG_KGSP_RPCINFO_DATA0_LEN 5 #define NVDEBUG_ENG_KGSP_RPCINFO_DATA1_LEN 5 +#define NVDEBUG_ENG_KGSP_RPCINFO_SEQUENCE_LEN 5 extern const PRB_SERVICE_DESC prb_services_nvdebug_eng[]; diff --git a/src/nvidia/generated/g_intr_nvoc.h b/src/nvidia/generated/g_intr_nvoc.h index f17204db1..1963c7c35 100644 --- a/src/nvidia/generated/g_intr_nvoc.h +++ b/src/nvidia/generated/g_intr_nvoc.h @@ -1054,23 +1054,23 @@ static inline NV_STATUS intrRestoreIntrRegValue(OBJGPU *pGpu, struct Intr *pIntr #define intrRestoreIntrRegValue_HAL(pGpu, pIntr, arg3, arg4, arg5) intrRestoreIntrRegValue(pGpu, pIntr, arg3, arg4, arg5) -static inline NV_STATUS intrTriggerCpuDoorbellForVF_46f6a7(OBJGPU *pGpu, struct Intr *pIntr, NvU32 gfid) { +static inline NV_STATUS intrTriggerCpuDoorbellForVF_46f6a7(OBJGPU *pGpu, struct Intr *pIntr, NvU32 gfid, NvBool bRearmIntr) { return NV_ERR_NOT_SUPPORTED; } -NV_STATUS intrTriggerCpuDoorbellForVF_TU102(OBJGPU *pGpu, struct Intr *pIntr, NvU32 gfid); +NV_STATUS intrTriggerCpuDoorbellForVF_TU102(OBJGPU *pGpu, struct Intr *pIntr, NvU32 gfid, NvBool bRearmIntr); #ifdef __nvoc_intr_h_disabled -static inline NV_STATUS intrTriggerCpuDoorbellForVF(OBJGPU *pGpu, struct Intr *pIntr, NvU32 gfid) { +static inline NV_STATUS intrTriggerCpuDoorbellForVF(OBJGPU *pGpu, struct Intr *pIntr, NvU32 gfid, NvBool bRearmIntr) { NV_ASSERT_FAILED_PRECOMP("Intr was disabled!"); return NV_ERR_NOT_SUPPORTED; } #else //__nvoc_intr_h_disabled -#define intrTriggerCpuDoorbellForVF(pGpu, pIntr, gfid) intrTriggerCpuDoorbellForVF_46f6a7(pGpu, pIntr, gfid) +#define intrTriggerCpuDoorbellForVF(pGpu, pIntr, gfid, bRearmIntr) intrTriggerCpuDoorbellForVF_46f6a7(pGpu, pIntr, gfid, bRearmIntr) #endif //__nvoc_intr_h_disabled -#define intrTriggerCpuDoorbellForVF_HAL(pGpu, pIntr, gfid) intrTriggerCpuDoorbellForVF(pGpu, pIntr, gfid) +#define intrTriggerCpuDoorbellForVF_HAL(pGpu, pIntr, gfid, bRearmIntr) intrTriggerCpuDoorbellForVF(pGpu, pIntr, gfid, bRearmIntr) void intrRetriggerTopLevel_TU102(OBJGPU *pGpu, struct Intr *pIntr); diff --git a/src/nvidia/generated/g_kernel_bif_nvoc.c b/src/nvidia/generated/g_kernel_bif_nvoc.c index 5b02ee041..b0ace17e8 100644 --- a/src/nvidia/generated/g_kernel_bif_nvoc.c +++ b/src/nvidia/generated/g_kernel_bif_nvoc.c @@ -895,18 +895,25 @@ static void __nvoc_init_funcTable_KernelBif_1(KernelBif *pThis, RmHalspecOwner * pThis->__kbifCacheMnocSupport__ = &kbifCacheMnocSupport_b3696a; } - // kbifCacheVFInfo -- halified (3 hals) body - if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 
0x1f)) & 0xc0000000UL) )) /* ChipHal: GB100 | GB102 */ + // kbifCacheVFInfo -- halified (4 hals) body + if (( ((rmVariantHal_HalVarIdx >> 5) == 0UL) && ((1UL << (rmVariantHal_HalVarIdx & 0x1f)) & 0x00000001UL) )) /* RmVariantHal: VF */ { - pThis->__kbifCacheVFInfo__ = &kbifCacheVFInfo_GB100; - } - else if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x01f0ffe0UL) )) /* ChipHal: TU102 | TU104 | TU106 | TU116 | TU117 | GA100 | GA102 | GA103 | GA104 | GA106 | GA107 | AD102 | AD103 | AD104 | AD106 | AD107 */ - { - pThis->__kbifCacheVFInfo__ = &kbifCacheVFInfo_TU102; + pThis->__kbifCacheVFInfo__ = &kbifCacheVFInfo_b3696a; } else { - pThis->__kbifCacheVFInfo__ = &kbifCacheVFInfo_GH100; + if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0xc0000000UL) )) /* ChipHal: GB100 | GB102 */ + { + pThis->__kbifCacheVFInfo__ = &kbifCacheVFInfo_GB100; + } + else if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x01f0ffe0UL) )) /* ChipHal: TU102 | TU104 | TU106 | TU116 | TU117 | GA100 | GA102 | GA103 | GA104 | GA106 | GA107 | AD102 | AD103 | AD104 | AD106 | AD107 */ + { + pThis->__kbifCacheVFInfo__ = &kbifCacheVFInfo_TU102; + } + else + { + pThis->__kbifCacheVFInfo__ = &kbifCacheVFInfo_GH100; + } } // kbifRestoreBar0 -- halified (3 hals) body @@ -1113,7 +1120,7 @@ static void __nvoc_init_funcTable_KernelBif_1(KernelBif *pThis, RmHalspecOwner * { pThis->__kbifDoSecondaryBusHotReset__ = &kbifDoSecondaryBusHotReset_GH100; } -} // End __nvoc_init_funcTable_KernelBif_1 with approximately 159 basic block(s). +} // End __nvoc_init_funcTable_KernelBif_1 with approximately 160 basic block(s). // Initialize vtable(s) for 75 virtual method(s). diff --git a/src/nvidia/generated/g_kernel_bif_nvoc.h b/src/nvidia/generated/g_kernel_bif_nvoc.h index 6f6c4aa6f..0bf3ddde9 100644 --- a/src/nvidia/generated/g_kernel_bif_nvoc.h +++ b/src/nvidia/generated/g_kernel_bif_nvoc.h @@ -225,7 +225,7 @@ struct KernelBif { void (*__kbifCacheFlrSupport__)(struct OBJGPU *, struct KernelBif * /*this*/); // halified (3 hals) body void (*__kbifCache64bBar0Support__)(struct OBJGPU *, struct KernelBif * /*this*/); // halified (4 hals) body void (*__kbifCacheMnocSupport__)(struct OBJGPU *, struct KernelBif * /*this*/); // halified (2 hals) body - void (*__kbifCacheVFInfo__)(struct OBJGPU *, struct KernelBif * /*this*/); // halified (3 hals) body + void (*__kbifCacheVFInfo__)(struct OBJGPU *, struct KernelBif * /*this*/); // halified (4 hals) body void (*__kbifRestoreBar0__)(struct OBJGPU *, struct KernelBif * /*this*/, void *, NvU32 *); // halified (3 hals) body NvBool (*__kbifAnyBarsAreValid__)(struct OBJGPU *, struct KernelBif * /*this*/); // halified (2 hals) body NV_STATUS (*__kbifRestoreBarsAndCommand__)(struct OBJGPU *, struct KernelBif * /*this*/); // halified (3 hals) body @@ -1302,6 +1302,10 @@ static inline void kbifCacheMnocSupport_b3696a(struct OBJGPU *pGpu, struct Kerne void kbifCacheMnocSupport_GB100(struct OBJGPU *pGpu, struct KernelBif *pKernelBif); +static inline void kbifCacheVFInfo_b3696a(struct OBJGPU *pGpu, struct KernelBif *pKernelBif) { + return; +} + void kbifCacheVFInfo_TU102(struct OBJGPU *pGpu, struct KernelBif *pKernelBif); void kbifCacheVFInfo_GH100(struct OBJGPU *pGpu, struct KernelBif *pKernelBif); diff --git a/src/nvidia/generated/g_nv_name_released.h b/src/nvidia/generated/g_nv_name_released.h index 793a6c1e8..047e88aae 100644 --- a/src/nvidia/generated/g_nv_name_released.h +++ 
b/src/nvidia/generated/g_nv_name_released.h @@ -5416,6 +5416,7 @@ static const CHIPS_RELEASED sChipsReleased[] = { { 0x2941, 0x21ca, 0x10de, "NVIDIA GB200" }, { 0x2B85, 0x0000, 0x0000, "NVIDIA GeForce RTX 5090" }, { 0x2B87, 0x0000, 0x0000, "NVIDIA GeForce RTX 5090 D" }, + { 0x2B8C, 0x530c, 0x17aa, "NVIDIA GeForce RTX 5090 D v2" }, { 0x2BB1, 0x204b, 0x1028, "NVIDIA RTX PRO 6000 Blackwell Workstation Edition" }, { 0x2BB1, 0x204b, 0x103c, "NVIDIA RTX PRO 6000 Blackwell Workstation Edition" }, { 0x2BB1, 0x204b, 0x10de, "NVIDIA RTX PRO 6000 Blackwell Workstation Edition" }, @@ -5436,6 +5437,9 @@ static const CHIPS_RELEASED sChipsReleased[] = { { 0x2C31, 0x2051, 0x103c, "NVIDIA RTX PRO 4500 Blackwell" }, { 0x2C31, 0x2051, 0x10de, "NVIDIA RTX PRO 4500 Blackwell" }, { 0x2C31, 0x2051, 0x17aa, "NVIDIA RTX PRO 4500 Blackwell" }, + { 0x2C33, 0x2053, 0x1028, "NVIDIA RTX PRO 4000 Blackwell SFF Edition" }, + { 0x2C33, 0x2053, 0x103c, "NVIDIA RTX PRO 4000 Blackwell SFF Edition" }, + { 0x2C33, 0x2053, 0x17aa, "NVIDIA RTX PRO 4000 Blackwell SFF Edition" }, { 0x2C34, 0x2052, 0x1028, "NVIDIA RTX PRO 4000 Blackwell" }, { 0x2C34, 0x2052, 0x103c, "NVIDIA RTX PRO 4000 Blackwell" }, { 0x2C34, 0x2052, 0x10de, "NVIDIA RTX PRO 4000 Blackwell" }, @@ -5448,6 +5452,9 @@ static const CHIPS_RELEASED sChipsReleased[] = { { 0x2D05, 0x0000, 0x0000, "NVIDIA GeForce RTX 5060" }, { 0x2D18, 0x0000, 0x0000, "NVIDIA GeForce RTX 5070 Laptop GPU" }, { 0x2D19, 0x0000, 0x0000, "NVIDIA GeForce RTX 5060 Laptop GPU" }, + { 0x2D30, 0x2054, 0x1028, "NVIDIA RTX PRO 2000 Blackwell" }, + { 0x2D30, 0x2054, 0x103c, "NVIDIA RTX PRO 2000 Blackwell" }, + { 0x2D30, 0x2054, 0x17aa, "NVIDIA RTX PRO 2000 Blackwell" }, { 0x2D39, 0x0000, 0x0000, "NVIDIA RTX PRO 2000 Blackwell Generation Laptop GPU" }, { 0x2D58, 0x0000, 0x0000, "NVIDIA GeForce RTX 5070 Laptop GPU" }, { 0x2D59, 0x0000, 0x0000, "NVIDIA GeForce RTX 5060 Laptop GPU" }, diff --git a/src/nvidia/generated/g_nvdebug_pb.h b/src/nvidia/generated/g_nvdebug_pb.h index 789d1e86b..c00f2cf7f 100644 --- a/src/nvidia/generated/g_nvdebug_pb.h +++ b/src/nvidia/generated/g_nvdebug_pb.h @@ -42,8 +42,8 @@ extern const PRB_MSG_DESC prb_messages_nvdebug[]; // Message maximum lengths // Does not include repeated fields, strings and byte arrays. 
#define NVDEBUG_SYSTEMINFO_LEN 354 -#define NVDEBUG_GPUINFO_LEN 262 -#define NVDEBUG_NVDUMP_LEN 1613 +#define NVDEBUG_GPUINFO_LEN 274 +#define NVDEBUG_NVDUMP_LEN 1625 #define NVDEBUG_SYSTEMINFO_NORTHBRIDGEINFO_LEN 12 #define NVDEBUG_SYSTEMINFO_SOCINFO_LEN 12 #define NVDEBUG_SYSTEMINFO_CPUINFO_LEN 24 @@ -101,7 +101,7 @@ extern const PRB_FIELD_DESC prb_fields_nvdebug_gpuinfo[]; #define NVDEBUG_GPUINFO_ENG_GPU_LEN 65 #define NVDEBUG_GPUINFO_ENG_MC_LEN 69 #define NVDEBUG_GPUINFO_ENG_NVD_LEN 33 -#define NVDEBUG_GPUINFO_ENG_KGSP_LEN 91 +#define NVDEBUG_GPUINFO_ENG_KGSP_LEN 103 extern const PRB_FIELD_DESC prb_fields_nvdebug_nvdump[]; @@ -115,7 +115,7 @@ extern const PRB_FIELD_DESC prb_fields_nvdebug_nvdump[]; // 'NvDump' field lengths #define NVDEBUG_NVDUMP_SYSTEM_INFO_LEN 357 #define NVDEBUG_NVDUMP_DCL_MSG_LEN 619 -#define NVDEBUG_NVDUMP_GPU_INFO_LEN 265 +#define NVDEBUG_NVDUMP_GPU_INFO_LEN 277 #define NVDEBUG_NVDUMP_EXCEPTION_ADDRESS_LEN 10 #define NVDEBUG_NVDUMP_SYSTEM_INFO_GSPRM_LEN 357 diff --git a/src/nvidia/generated/g_rpc_hal.h b/src/nvidia/generated/g_rpc_hal.h index ad6d2db74..023fb1158 100644 --- a/src/nvidia/generated/g_rpc_hal.h +++ b/src/nvidia/generated/g_rpc_hal.h @@ -16,8 +16,8 @@ typedef NV_STATUS RpcConstruct(POBJGPU, POBJRPC); typedef void RpcDestroy(POBJGPU, POBJRPC); -typedef NV_STATUS RpcSendMessage(POBJGPU, POBJRPC); -typedef NV_STATUS RpcRecvPoll(POBJGPU, POBJRPC, NvU32); +typedef NV_STATUS RpcSendMessage(POBJGPU, POBJRPC, NvU32 *); +typedef NV_STATUS RpcRecvPoll(POBJGPU, POBJRPC, NvU32, NvU32); // @@ -42,10 +42,10 @@ typedef struct RPC_OBJ_IFACES { (_pRpc)->obj.__rpcConstruct__(_pGpu, _pRpc) #define rpcDestroy(_pGpu, _pRpc) \ (_pRpc)->obj.__rpcDestroy__(_pGpu, _pRpc) -#define rpcSendMessage(_pGpu, _pRpc) \ - (_pRpc)->obj.__rpcSendMessage__(_pGpu, _pRpc) -#define rpcRecvPoll(_pGpu, _pRpc, _arg0) \ - (_pRpc)->obj.__rpcRecvPoll__(_pGpu, _pRpc, _arg0) +#define rpcSendMessage(_pGpu, _pRpc, _pArg0) \ + (_pRpc)->obj.__rpcSendMessage__(_pGpu, _pRpc, _pArg0) +#define rpcRecvPoll(_pGpu, _pRpc, _arg0, _arg1) \ + (_pRpc)->obj.__rpcRecvPoll__(_pGpu, _pRpc, _arg0, _arg1) // diff --git a/src/nvidia/generated/g_system_nvoc.c b/src/nvidia/generated/g_system_nvoc.c index fcde01e0c..a0bbfdd4a 100644 --- a/src/nvidia/generated/g_system_nvoc.c +++ b/src/nvidia/generated/g_system_nvoc.c @@ -103,6 +103,7 @@ void __nvoc_init_dataField_OBJSYS(OBJSYS *pThis) { pThis->clientListDeferredFreeLimit = 0; pThis->setProperty(pThis, PDB_PROP_SYS_RECOVERY_REBOOT_REQUIRED, NV_FALSE); + pThis->setProperty(pThis, PDB_PROP_SYS_SUPPORTS_S0IX, (0)); } NV_STATUS __nvoc_ctor_Object(Object* ); diff --git a/src/nvidia/generated/g_system_nvoc.h b/src/nvidia/generated/g_system_nvoc.h index f5122783c..4611b1e3a 100644 --- a/src/nvidia/generated/g_system_nvoc.h +++ b/src/nvidia/generated/g_system_nvoc.h @@ -418,7 +418,7 @@ struct OBJSYS { struct OBJTRACEABLE *__nvoc_pbase_OBJTRACEABLE; // traceable super struct OBJSYS *__nvoc_pbase_OBJSYS; // sys - // 34 PDB properties + // 35 PDB properties NvBool PDB_PROP_SYS_SBIOS_NVIF_POWERMIZER_LIMIT; NvBool PDB_PROP_SYS_MXM_THERMAL_CONTROL_PRESENT; NvBool PDB_PROP_SYS_POWER_BATTERY; @@ -453,6 +453,7 @@ struct OBJSYS { NvBool PDB_PROP_SYS_ENABLE_FORCE_SHARED_LOCK; NvBool PDB_PROP_SYS_DESTRUCTING; NvBool PDB_PROP_SYS_RECOVERY_REBOOT_REQUIRED; + NvBool PDB_PROP_SYS_SUPPORTS_S0IX; // Data members NvU32 apiLockMask; @@ -547,6 +548,8 @@ extern const struct NVOC_CLASS_DEF __nvoc_class_def_OBJSYS; #define PDB_PROP_SYS_VALIDATE_CLIENT_HANDLE_STRICT_BASE_NAME 
PDB_PROP_SYS_VALIDATE_CLIENT_HANDLE_STRICT #define PDB_PROP_SYS_DESTRUCTING_BASE_CAST #define PDB_PROP_SYS_DESTRUCTING_BASE_NAME PDB_PROP_SYS_DESTRUCTING +#define PDB_PROP_SYS_SUPPORTS_S0IX_BASE_CAST +#define PDB_PROP_SYS_SUPPORTS_S0IX_BASE_NAME PDB_PROP_SYS_SUPPORTS_S0IX #define PDB_PROP_SYS_VALIDATE_KERNEL_BUFFERS_BASE_CAST #define PDB_PROP_SYS_VALIDATE_KERNEL_BUFFERS_BASE_NAME PDB_PROP_SYS_VALIDATE_KERNEL_BUFFERS #define PDB_PROP_SYS_PRIMARY_VBIOS_STATE_SAVED_BASE_CAST diff --git a/src/nvidia/inc/kernel/gpu/gsp/gsp_static_config.h b/src/nvidia/inc/kernel/gpu/gsp/gsp_static_config.h index 48a927a1e..3f457d5c4 100644 --- a/src/nvidia/inc/kernel/gpu/gsp/gsp_static_config.h +++ b/src/nvidia/inc/kernel/gpu/gsp/gsp_static_config.h @@ -218,6 +218,7 @@ typedef struct GspSystemInfo NvBool bRouteDispIntrsToCPU; NvU64 hostPageSize; NvBool bGspNocatEnabled; + NvBool bS0ixSupport; NvU16 virtualConfigBits; } GspSystemInfo; diff --git a/src/nvidia/inc/kernel/gpu/rpc/objrpc.h b/src/nvidia/inc/kernel/gpu/rpc/objrpc.h index ad3f368fd..4f6d57ec7 100644 --- a/src/nvidia/inc/kernel/gpu/rpc/objrpc.h +++ b/src/nvidia/inc/kernel/gpu/rpc/objrpc.h @@ -58,6 +58,7 @@ TYPEDEF_BITVECTOR(MC_ENGINE_BITVECTOR); typedef struct RpcHistoryEntry { NvU32 function; + NvU32 sequence; NvU64 data[2]; NvU64 ts_start; NvU64 ts_end; @@ -89,6 +90,9 @@ struct OBJRPC{ NvU32 rpcHistoryCurrent; RpcHistoryEntry rpcEventHistory[RPC_HISTORY_DEPTH]; NvU32 rpcEventHistoryCurrent; + + /* sequence number for RPC */ + NvU32 sequence; NvU32 timeoutCount; NvBool bQuietPrints; diff --git a/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c b/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c index fe9d5d669..0d73a79e7 100644 --- a/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c +++ b/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c @@ -129,9 +129,9 @@ static void _kgspFreeRpcInfrastructure(OBJGPU *, KernelGsp *); static NV_STATUS _kgspConstructRpcObject(OBJGPU *, KernelGsp *, MESSAGE_QUEUE_INFO *, OBJRPC **); -static NV_STATUS _kgspRpcSendMessage(OBJGPU *, OBJRPC *); -static NV_STATUS _kgspRpcRecvPoll(OBJGPU *, OBJRPC *, NvU32); -static NV_STATUS _kgspRpcDrainEvents(OBJGPU *, KernelGsp *, NvU32, KernelGspRpcEventHandlerContext); +static NV_STATUS _kgspRpcSendMessage(OBJGPU *, OBJRPC *, NvU32 *); +static NV_STATUS _kgspRpcRecvPoll(OBJGPU *, OBJRPC *, NvU32, NvU32); +static NV_STATUS _kgspRpcDrainEvents(OBJGPU *, KernelGsp *, NvU32, NvU32, KernelGspRpcEventHandlerContext); static void _kgspRpcIncrementTimeoutCountAndRateLimitPrints(OBJGPU *, OBJRPC *); static NV_STATUS _kgspAllocSimAccessBuffer(OBJGPU *pGpu, KernelGsp *pKernelGsp); @@ -312,12 +312,14 @@ _kgspAddRpcHistoryEntry ) { NvU32 func = RPC_HDR->function; + NvU32 sequence = RPC_HDR->sequence; NvU32 entry; entry = *pCurrent = (*pCurrent + 1) % RPC_HISTORY_DEPTH; portMemSet(&pHistory[entry], 0, sizeof(pHistory[0])); pHistory[entry].function = func; + pHistory[entry].sequence = sequence; pHistory[entry].ts_start = osGetTimestamp(); _kgspGetActiveRpcDebugData(pRpc, func, @@ -367,7 +369,8 @@ static NV_STATUS _kgspRpcSendMessage ( OBJGPU *pGpu, - OBJRPC *pRpc + OBJRPC *pRpc, + NvU32 *pSequence ) { NV_STATUS nvStatus; @@ -376,6 +379,11 @@ _kgspRpcSendMessage NV_ASSERT(rmGpuGroupLockIsOwner(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE, &gpuMaskUnused)); + if (pSequence) + vgpu_rpc_message_header_v->sequence = *pSequence = pRpc->sequence++; + else + vgpu_rpc_message_header_v->sequence = 0; + NV_CHECK_OK_OR_RETURN(LEVEL_SILENT, _kgspRpcSanityCheck(pGpu, pKernelGsp, pRpc)); nvStatus = GspMsgQueueSendCommand(pRpc->pMessageQueueInfo, 
pGpu); @@ -1582,8 +1590,8 @@ _kgspProcessRpcEvent // eventually comes in as an unexpected event. The error handling // for the timeout should have already happened. // - NV_PRINTF(LEVEL_ERROR, "Unexpected RPC event from GPU%d: 0x%x (%s)\n", - gpuGetInstance(pGpu), event, _getRpcName(event)); + NV_PRINTF(LEVEL_ERROR, "Unexpected RPC event from GPU%d: 0x%x (%s), sequence: %u\n", + gpuGetInstance(pGpu), event, _getRpcName(event), pMsgHdr->sequence); break; } @@ -1718,6 +1726,7 @@ _kgspRpcDrainOneEvent OBJGPU *pGpu, OBJRPC *pRpc, NvU32 expectedFunc, + NvU32 expectedSequence, KernelGspRpcEventHandlerContext rpcHandlerContext ) { @@ -1734,8 +1743,11 @@ _kgspRpcDrainOneEvent { rpc_message_header_v *pMsgHdr = RPC_HDR; - if (pMsgHdr->function == expectedFunc) + if (pMsgHdr->function == expectedFunc && + pMsgHdr->sequence == expectedSequence) + { return NV_WARN_MORE_PROCESSING_REQUIRED; + } _kgspProcessRpcEvent(pGpu, pRpc, rpcHandlerContext); } @@ -1768,6 +1780,7 @@ _kgspRpcDrainEvents OBJGPU *pGpu, KernelGsp *pKernelGsp, NvU32 expectedFunc, + NvU32 expectedSequence, KernelGspRpcEventHandlerContext rpcHandlerContext ) { @@ -1776,7 +1789,7 @@ _kgspRpcDrainEvents while (nvStatus == NV_OK) { - nvStatus = _kgspRpcDrainOneEvent(pGpu, pRpc, expectedFunc, rpcHandlerContext); + nvStatus = _kgspRpcDrainOneEvent(pGpu, pRpc, expectedFunc, expectedSequence, rpcHandlerContext); kgspDumpGspLogs(pKernelGsp, NV_FALSE); } @@ -1871,11 +1884,12 @@ _kgspLogRpcHistoryEntry duration = _tsDiffToDuration(duration, &durationUnitsChar); NV_ERROR_LOG_DATA(pGpu, errorNum, - " %c%-4d %-4d %-21.21s 0x%016llx 0x%016llx 0x%016llx 0x%016llx %6llu%cs %c\n", + " %c%-4d %-4d %-21.21s %10u 0x%016llx 0x%016llx 0x%016llx 0x%016llx %6llu%cs %c\n", ((historyIndex == 0) ? ' ' : '-'), historyIndex, pEntry->function, _getRpcName(pEntry->function), + pEntry->sequence, pEntry->data[0], pEntry->data[1], pEntry->ts_start, @@ -1886,11 +1900,12 @@ _kgspLogRpcHistoryEntry else { NV_ERROR_LOG_DATA(pGpu, errorNum, - " %c%-4d %-4d %-21.21s 0x%016llx 0x%016llx 0x%016llx 0x%016llx %c\n", + " %c%-4d %-4d %-21.21s %10u 0x%016llx 0x%016llx 0x%016llx 0x%016llx %c\n", ((historyIndex == 0) ? 
' ' : '-'), historyIndex, pEntry->function, _getRpcName(pEntry->function), + pEntry->sequence, pEntry->data[0], pEntry->data[1], pEntry->ts_start, @@ -1918,16 +1933,16 @@ kgspLogRpcDebugInfo _kgspGetActiveRpcDebugData(pRpc, pMsgHdr->function, &activeData[0], &activeData[1]); NV_ERROR_LOG_DATA(pGpu, errorNum, - "GPU%d GSP RPC buffer contains function %d (%s) and data 0x%016llx 0x%016llx.\n", + "GPU%d GSP RPC buffer contains function %d (%s) sequence %u and data 0x%016llx 0x%016llx.\n", gpuGetInstance(pGpu), - pMsgHdr->function, _getRpcName(pMsgHdr->function), + pMsgHdr->function, _getRpcName(pMsgHdr->function), pMsgHdr->sequence, activeData[0], activeData[1]); NV_ERROR_LOG_DATA(pGpu, errorNum, "GPU%d RPC history (CPU -> GSP):\n", gpuGetInstance(pGpu)); NV_ERROR_LOG_DATA(pGpu, errorNum, - " entry function data0 data1 ts_start ts_end duration actively_polling\n"); + " entry function sequence data0 data1 ts_start ts_end duration actively_polling\n"); for (historyIndex = 0; historyIndex < rpcEntriesToLog; historyIndex++) { historyEntry = (pRpc->rpcHistoryCurrent + RPC_HISTORY_DEPTH - historyIndex) % RPC_HISTORY_DEPTH; @@ -1939,7 +1954,7 @@ kgspLogRpcDebugInfo "GPU%d RPC event history (CPU <- GSP):\n", gpuGetInstance(pGpu)); NV_ERROR_LOG_DATA(pGpu, errorNum, - " entry function data0 data1 ts_start ts_end duration during_incomplete_rpc\n"); + " entry function sequence data0 data1 ts_start ts_end duration during_incomplete_rpc\n"); for (historyIndex = 0; historyIndex < rpcEntriesToLog; historyIndex++) { historyEntry = (pRpc->rpcEventHistoryCurrent + RPC_HISTORY_DEPTH - historyIndex) % RPC_HISTORY_DEPTH; @@ -1958,7 +1973,8 @@ _kgspLogXid119 ( OBJGPU *pGpu, OBJRPC *pRpc, - NvU32 expectedFunc + NvU32 expectedFunc, + NvU32 expectedSequence ) { RpcHistoryEntry *pHistoryEntry = &pRpc->rpcHistory[pRpc->rpcHistoryCurrent]; @@ -1980,11 +1996,12 @@ _kgspLogXid119 duration = _tsDiffToDuration(ts_end - pHistoryEntry->ts_start, &durationUnitsChar); NV_ERROR_LOG(pGpu, GSP_RPC_TIMEOUT, - "Timeout after %llus of waiting for RPC response from GPU%d GSP! Expected function %d (%s) (0x%llx 0x%llx).", + "Timeout after %llus of waiting for RPC response from GPU%d GSP! Expected function %d (%s) sequence %u (0x%llx 0x%llx).", (durationUnitsChar == 'm' ? duration / 1000 : duration), gpuGetInstance(pGpu), expectedFunc, _getRpcName(expectedFunc), + expectedSequence, pHistoryEntry->data[0], pHistoryEntry->data[1]); @@ -2013,7 +2030,8 @@ _kgspLogRpcSanityCheckFailure OBJGPU *pGpu, OBJRPC *pRpc, NvU32 rpcStatus, - NvU32 expectedFunc + NvU32 expectedFunc, + NvU32 expectedSequence ) { RpcHistoryEntry *pHistoryEntry = &pRpc->rpcHistory[pRpc->rpcHistoryCurrent]; @@ -2021,11 +2039,12 @@ _kgspLogRpcSanityCheckFailure NV_ASSERT(expectedFunc == pHistoryEntry->function); NV_PRINTF(LEVEL_ERROR, - "GPU%d sanity check failed 0x%x waiting for RPC response from GSP. Expected function %d (%s) (0x%llx 0x%llx).\n", + "GPU%d sanity check failed 0x%x waiting for RPC response from GSP. 
Expected function %d (%s) sequence %u (0x%llx 0x%llx).\n", gpuGetInstance(pGpu), rpcStatus, expectedFunc, _getRpcName(expectedFunc), + expectedSequence, pHistoryEntry->data[0], pHistoryEntry->data[1]); @@ -2072,7 +2091,8 @@ _kgspRpcRecvPoll ( OBJGPU *pGpu, OBJRPC *pRpc, - NvU32 expectedFunc + NvU32 expectedFunc, + NvU32 expectedSequence ) { KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu); @@ -2165,7 +2185,7 @@ _kgspRpcRecvPoll // timeoutStatus = gpuCheckTimeout(pGpu, &timeout); - rpcStatus = _kgspRpcDrainEvents(pGpu, pKernelGsp, expectedFunc, rpcHandlerContext); + rpcStatus = _kgspRpcDrainEvents(pGpu, pKernelGsp, expectedFunc, expectedSequence, rpcHandlerContext); switch (rpcStatus) { case NV_WARN_MORE_PROCESSING_REQUIRED: @@ -2191,7 +2211,7 @@ _kgspRpcRecvPoll { if (!pRpc->bQuietPrints) { - _kgspLogRpcSanityCheckFailure(pGpu, pRpc, rpcStatus, expectedFunc); + _kgspLogRpcSanityCheckFailure(pGpu, pRpc, rpcStatus, expectedFunc, expectedSequence); pRpc->bQuietPrints = NV_TRUE; } goto done; @@ -2205,7 +2225,7 @@ _kgspRpcRecvPoll if (!pRpc->bQuietPrints) { - _kgspLogXid119(pGpu, pRpc, expectedFunc); + _kgspLogXid119(pGpu, pRpc, expectedFunc, expectedSequence); } goto done; @@ -4744,7 +4764,7 @@ kgspRpcRecvEvents_IMPL // If we do the assert will fail on NV_WARN_MORE_PROCESSING_REQUIRED, // in addition to general error codes. // - NV_ASSERT_OK(_kgspRpcDrainEvents(pGpu, pKernelGsp, NV_VGPU_MSG_FUNCTION_NUM_FUNCTIONS, KGSP_RPC_EVENT_HANDLER_CONTEXT_INTERRUPT)); + NV_ASSERT_OK(_kgspRpcDrainEvents(pGpu, pKernelGsp, NV_VGPU_MSG_FUNCTION_NUM_FUNCTIONS, 0, KGSP_RPC_EVENT_HANDLER_CONTEXT_INTERRUPT)); } /*! @@ -4766,7 +4786,7 @@ kgspWaitForRmInitDone_IMPL threadStateResetTimeout(pGpu); NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, - rpcRecvPoll(pGpu, pRpc, NV_VGPU_MSG_EVENT_GSP_INIT_DONE)); + rpcRecvPoll(pGpu, pRpc, NV_VGPU_MSG_EVENT_GSP_INIT_DONE, 0)); // // Now check if RPC really succeeded (NV_VGPU_MSG_RESULT_* are defined to @@ -5256,6 +5276,7 @@ static NV_STATUS _kgspDumpEngineFunc prbEncNestedStart(pPrbEnc, NVDEBUG_ENG_KGSP_RPC_HISTORY)); prbEncAddUInt32(pPrbEnc, NVDEBUG_ENG_KGSP_RPCINFO_FUNCTION, entry->function); + prbEncAddUInt32(pPrbEnc, NVDEBUG_ENG_KGSP_RPCINFO_SEQUENCE, entry->sequence); prbEncAddUInt64(pPrbEnc, NVDEBUG_ENG_KGSP_RPCINFO_TS_START, entry->ts_start); prbEncAddUInt64(pPrbEnc, NVDEBUG_ENG_KGSP_RPCINFO_TS_END, entry->ts_end); prbEncAddUInt32(pPrbEnc, NVDEBUG_ENG_KGSP_RPCINFO_DATA0, entry->data[0]); @@ -5276,6 +5297,7 @@ static NV_STATUS _kgspDumpEngineFunc prbEncNestedStart(pPrbEnc, NVDEBUG_ENG_KGSP_EVENT_HISTORY)); prbEncAddUInt32(pPrbEnc, NVDEBUG_ENG_KGSP_RPCINFO_FUNCTION, entry->function); + prbEncAddUInt32(pPrbEnc, NVDEBUG_ENG_KGSP_RPCINFO_SEQUENCE, entry->sequence); prbEncAddUInt64(pPrbEnc, NVDEBUG_ENG_KGSP_RPCINFO_TS_START, entry->ts_start); prbEncAddUInt64(pPrbEnc, NVDEBUG_ENG_KGSP_RPCINFO_TS_END, entry->ts_end); prbEncAddUInt32(pPrbEnc, NVDEBUG_ENG_KGSP_RPCINFO_DATA0, entry->data[0]); diff --git a/src/nvidia/src/kernel/gpu/mem_mgr/arch/maxwell/mem_mgr_gm107.c b/src/nvidia/src/kernel/gpu/mem_mgr/arch/maxwell/mem_mgr_gm107.c index 62391c2b7..87740daca 100644 --- a/src/nvidia/src/kernel/gpu/mem_mgr/arch/maxwell/mem_mgr_gm107.c +++ b/src/nvidia/src/kernel/gpu/mem_mgr/arch/maxwell/mem_mgr_gm107.c @@ -49,6 +49,7 @@ #include "vgpu/rpc.h" #include "vgpu/vgpu_events.h" +#include "nvdevid.h" // // statics @@ -1412,15 +1413,37 @@ memmgrGetRsvdSizeForSr_GM107 MemoryManager *pMemoryManager ) { + // + // Temporary WAR to override WDDM S/R buffer for specific skus + // Bug 5327051 + // + static 
const NvU16 gb20x_devid[] = { 0x2B8C }; + NvU32 pciDeviceID = DRF_VAL(_PCI, _DEVID, _DEVICE, pGpu->idInfo.PCIDeviceID); + NvBool overrideFbsrRsvdBufferSize = NV_FALSE; + + for (NvU32 i = 0; i < NV_ARRAY_ELEMENTS(gb20x_devid); i++) + { + if (pciDeviceID == gb20x_devid[i]) + { + overrideFbsrRsvdBufferSize = NV_TRUE; + break; + } + } + if (((pMemoryManager->Ram.fbTotalMemSizeMb >> 10) >= 31) || IS_GSP_CLIENT(pGpu)) { // // We need to reserve more memory for S/R if - // 1. FB size is > 32GB Bug Id: 2468357 + // 1. FB size is >= 31GB Bug Id: 2468357 // 2. Or GSP is enabled Bug Id: 4312881 // return 512 * 1024 * 1024; } + else if (overrideFbsrRsvdBufferSize) + { + // Bug 5327051: WAR to override WDDM S/R buffer for specific skus + return 300 * 1024 * 1024; + } else { return 256 * 1024 * 1024; diff --git a/src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c b/src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c index 1d9539fa5..b6c79365a 100644 --- a/src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c +++ b/src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c @@ -306,8 +306,6 @@ memdescCreate // (4k >> 12 = 1). This modification helps us to avoid overflow of variable // allocSize, in case caller of this function passes highest value of NvU64. // - // If allocSize is passed as 0, PageCount should be returned as 0. - // if (allocSize == 0) { PageCount = 0; diff --git a/src/nvidia/src/kernel/mem_mgr/standard_mem.c b/src/nvidia/src/kernel/mem_mgr/standard_mem.c index 84ed46770..4ef0535ee 100644 --- a/src/nvidia/src/kernel/mem_mgr/standard_mem.c +++ b/src/nvidia/src/kernel/mem_mgr/standard_mem.c @@ -57,7 +57,7 @@ NV_STATUS stdmemValidateParams return NV_ERR_INVALID_ARGUMENT; } - // + // // These flags don't do anything in this path. No mapping on alloc and // kernel map is controlled by TYPE // diff --git a/src/nvidia/src/kernel/rmapi/nv_gpu_ops.c b/src/nvidia/src/kernel/rmapi/nv_gpu_ops.c index c5abc71fd..aec294151 100644 --- a/src/nvidia/src/kernel/rmapi/nv_gpu_ops.c +++ b/src/nvidia/src/kernel/rmapi/nv_gpu_ops.c @@ -7694,7 +7694,7 @@ static NV_STATUS dupMemory(struct gpuDevice *device, // particular doesn't create IOMMU mappings required for the mapped GPU // to access the memory. That's a problem if the mapped GPU is different // from the GPU that the allocation was created under. Add them - // explicitly here and remove them when the memory is freed in n + // explicitly here and remove them when the memory is freed in // nvGpuOpsFreeDupedHandle(). Notably memdescMapIommu() refcounts the // mappings so it's ok to call it if the mappings are already there. // diff --git a/src/nvidia/src/kernel/rmapi/rpc_common.c b/src/nvidia/src/kernel/rmapi/rpc_common.c index f90e25c80..a0567b2ca 100644 --- a/src/nvidia/src/kernel/rmapi/rpc_common.c +++ b/src/nvidia/src/kernel/rmapi/rpc_common.c @@ -88,7 +88,8 @@ OBJRPC *initRpcObject(OBJGPU *pGpu) pRpc->timeoutCount = 0; pRpc->bQuietPrints = NV_FALSE; - // VIRTUALIZATION is disabled on DCE. Only run the below code on VGPU and GSP. + pRpc->sequence = 0; + // VIRTUALIZATION is disabled on DCE. Only run the below code on VGPU and GSP. 
rpcSetIpVersion(pGpu, pRpc, RPC_VERSION_FROM_VGX_VERSION(VGX_MAJOR_VERSION_NUMBER, VGX_MINOR_VERSION_NUMBER)); diff --git a/src/nvidia/src/kernel/vgpu/rpc.c b/src/nvidia/src/kernel/vgpu/rpc.c index 9ce5b2865..9b9e075bd 100644 --- a/src/nvidia/src/kernel/vgpu/rpc.c +++ b/src/nvidia/src/kernel/vgpu/rpc.c @@ -120,8 +120,8 @@ static NvU64 startTimeInNs, endTimeInNs, elapsedTimeInNs; static NV_STATUS updateHostVgpuFbUsage(OBJGPU *pGpu, NvHandle hClient, NvHandle hDevice, NvHandle hSubdevice); -static NV_STATUS _rpcSendMessage_VGPUGSP(OBJGPU *pGpu, OBJRPC *pRPC); -static NV_STATUS _rpcRecvPoll_VGPUGSP(OBJGPU *pGpu, OBJRPC *pRPC, NvU32 expectedFunc); +static NV_STATUS _rpcSendMessage_VGPUGSP(OBJGPU *pGpu, OBJRPC *pRPC, NvU32 *pSequence); +static NV_STATUS _rpcRecvPoll_VGPUGSP(OBJGPU *pGpu, OBJRPC *pRPC, NvU32 expectedFunc, NvU32 expectedSequence); void setGuestEccStatus(OBJGPU *pGpu); typedef NV_STATUS dma_control_copy_params_to_rpc_buffer_v(NvU32 cmd, void *Params, void *params_in); @@ -1386,6 +1386,9 @@ NV_STATUS vgpuGspSetupBuffers(OBJGPU *pGpu) return NV_ERR_NOT_SUPPORTED; } + // Modifying the DMA address size to the value supported by the hardware + osDmaSetAddressSize(pGpu->pOsGpuInfo, gpuGetPhysAddrWidth_HAL(pGpu, ADDR_SYSMEM)); + rpcSendMessage_FNPTR(pVGpu->pRpc) = _rpcSendMessage_VGPUGSP; rpcRecvPoll_FNPTR(pVGpu->pRpc) = _rpcRecvPoll_VGPUGSP; @@ -1665,28 +1668,29 @@ NV_STATUS freeRpcInfrastructure_VGPU(OBJGPU *pGpu) return rmStatus; } -NV_STATUS rpcSendMessage_IMPL(OBJGPU *pGpu, OBJRPC *pRpc) +NV_STATUS rpcSendMessage_IMPL(OBJGPU *pGpu, OBJRPC *pRpc, NvU32 *pSequence) { NV_PRINTF(LEVEL_ERROR, "virtual function not implemented.\n"); return NV_ERR_NOT_SUPPORTED; } -NV_STATUS rpcRecvPoll_IMPL(OBJGPU *pGpu, OBJRPC *pRpc, NvU32 expectedFunc) +NV_STATUS rpcRecvPoll_IMPL(OBJGPU *pGpu, OBJRPC *pRpc, NvU32 expectedFunc, NvU32 expectedSequence) { NV_PRINTF(LEVEL_ERROR, "virtual function not implemented.\n"); return NV_ERR_NOT_SUPPORTED; } -static NV_STATUS _rpcSendMessage_VGPUGSP(OBJGPU *pGpu, OBJRPC *pRpc) +static NV_STATUS _rpcSendMessage_VGPUGSP(OBJGPU *pGpu, OBJRPC *pRpc, NvU32 *pSequence) { OBJVGPU *pVGpu = GPU_GET_VGPU(pGpu); - vgpu_rpc_message_header_v->sequence = pVGpu->sequence_base++; + NV_ASSERT(pSequence != NULL); + vgpu_rpc_message_header_v->sequence = *pSequence = pVGpu->sequence_base++; return _vgpuGspSendRpcRequest(pGpu, pRpc); } -static NV_STATUS _rpcRecvPoll_VGPUGSP(OBJGPU *pGpu, OBJRPC *pRPC, NvU32 expectedFunc) +static NV_STATUS _rpcRecvPoll_VGPUGSP(OBJGPU *pGpu, OBJRPC *pRPC, NvU32 expectedFunc, NvU32 expectedSequence) { return _vgpuGspWaitForResponse(pGpu); } @@ -1722,6 +1726,15 @@ static NV_STATUS _issueRpcAndWait(OBJGPU *pGpu, OBJRPC *pRpc) pNewEntry->rpcData.rpcDataTag = vgpu_rpc_message_header_v->function; + switch (vgpu_rpc_message_header_v->function) + { + case NV_VGPU_MSG_FUNCTION_RM_API_CONTROL: + pNewEntry->rpcData.rpcExtraData = rpc_message->rm_api_control_v.params.cmd; + break; + default: + break; + } + rpcProfilerEntryCount++; osGetPerformanceCounter(&pNewEntry->rpcData.startTimeInNs); @@ -1729,13 +1742,14 @@ static NV_STATUS _issueRpcAndWait(OBJGPU *pGpu, OBJRPC *pRpc) // For HCC, cache expectedFunc value before encrypting. 
NvU32 expectedFunc = vgpu_rpc_message_header_v->function; + NvU32 expectedSequence = 0; - status = rpcSendMessage(pGpu, pRpc); + status = rpcSendMessage(pGpu, pRpc, &expectedSequence); if (status != NV_OK) { NV_PRINTF_COND(pRpc->bQuietPrints, LEVEL_INFO, LEVEL_ERROR, - "rpcSendMessage failed with status 0x%08x for fn %d!\n", - status, vgpu_rpc_message_header_v->function); + "rpcSendMessage failed with status 0x%08x for fn %d sequence %d!\n", + status, expectedFunc, expectedSequence); // // It has been observed that returning NV_ERR_BUSY_RETRY in a bad state (RPC // buffers full and not being serviced) can make things worse, i.e. turn RPC @@ -1746,20 +1760,20 @@ static NV_STATUS _issueRpcAndWait(OBJGPU *pGpu, OBJRPC *pRpc) } // Use cached expectedFunc here because vgpu_rpc_message_header_v is encrypted for HCC. - status = rpcRecvPoll(pGpu, pRpc, expectedFunc); + status = rpcRecvPoll(pGpu, pRpc, expectedFunc, expectedSequence); if (status != NV_OK) { if (status == NV_ERR_TIMEOUT) { NV_PRINTF_COND(pRpc->bQuietPrints, LEVEL_INFO, LEVEL_ERROR, - "rpcRecvPoll timedout for fn %d!\n", - vgpu_rpc_message_header_v->function); + "rpcRecvPoll timedout for fn %d sequence %u!\n", + expectedFunc, expectedSequence); } else { NV_PRINTF_COND(pRpc->bQuietPrints, LEVEL_INFO, LEVEL_ERROR, - "rpcRecvPoll failed with status 0x%08x for fn %d!\n", - status, vgpu_rpc_message_header_v->function); + "rpcRecvPoll failed with status 0x%08x for fn %d sequence %u!\n", + status, expectedFunc, expectedSequence); } return status; } @@ -1793,10 +1807,10 @@ static NV_STATUS _issueRpcAsync(OBJGPU *pGpu, OBJRPC *pRpc) // should not be called in broadcast mode NV_ASSERT_OR_RETURN(!gpumgrGetBcEnabledStatus(pGpu), NV_ERR_INVALID_STATE); - status = rpcSendMessage(pGpu, pRpc); + status = rpcSendMessage(pGpu, pRpc, NULL); if (status != NV_OK) { - NV_PRINTF(LEVEL_ERROR, "rpcSendMessage failed with status 0x%08x for fn %d!\n", + NV_PRINTF(LEVEL_ERROR, "rpcSendMessage async failed with status 0x%08x for fn %d!\n", status, vgpu_rpc_message_header_v->function); NV_ASSERT(0); // @@ -1824,6 +1838,8 @@ static NV_STATUS _issueRpcLarge NvU8 *pBuf8 = (NvU8 *)pBuffer; NV_STATUS nvStatus = NV_OK; NvU32 expectedFunc = vgpu_rpc_message_header_v->function; + NvU32 firstSequence = pRpc->sequence; + NvU32 lastSequence, waitSequence; NvU32 entryLength; NvU32 remainingSize = bufSize; NvU32 recordCount = 0; @@ -1840,7 +1856,7 @@ static NV_STATUS _issueRpcLarge // Set the correct length for this queue entry. vgpu_rpc_message_header_v->length = entryLength; - nvStatus = rpcSendMessage(pGpu, pRpc); + nvStatus = rpcSendMessage(pGpu, pRpc, &firstSequence); if (nvStatus != NV_OK) { NV_PRINTF(LEVEL_ERROR, "rpcSendMessage failed with status 0x%08x for fn %d!\n", @@ -1876,7 +1892,7 @@ static NV_STATUS _issueRpcLarge vgpu_rpc_message_header_v->length = entryLength + sizeof(rpc_message_header_v); vgpu_rpc_message_header_v->function = NV_VGPU_MSG_FUNCTION_CONTINUATION_RECORD; - nvStatus = rpcSendMessage(pGpu, pRpc); + nvStatus = rpcSendMessage(pGpu, pRpc, &lastSequence); if (nvStatus != NV_OK) { NV_PRINTF(LEVEL_ERROR, @@ -1897,6 +1913,8 @@ static NV_STATUS _issueRpcLarge recordCount++; } + NV_ASSERT(lastSequence == (firstSequence + recordCount)); + if (!bWait) { // In case of Async RPC, we are done here. @@ -1904,18 +1922,20 @@ static NV_STATUS _issueRpcLarge } // Always receive at least one.. 
- nvStatus = rpcRecvPoll(pGpu, pRpc, expectedFunc); + waitSequence = firstSequence; + + nvStatus = rpcRecvPoll(pGpu, pRpc, expectedFunc, waitSequence); if (nvStatus != NV_OK) { if (nvStatus == NV_ERR_TIMEOUT) { - NV_PRINTF(LEVEL_ERROR, "rpcRecvPoll timedout for fn %d!\n", - vgpu_rpc_message_header_v->function); + NV_PRINTF(LEVEL_ERROR, "rpcRecvPoll timedout for fn %d sequence %d!\n", + expectedFunc, waitSequence); } else { - NV_PRINTF(LEVEL_ERROR, "rpcRecvPoll failed with status 0x%08x for fn %d!\n", - nvStatus, vgpu_rpc_message_header_v->function); + NV_PRINTF(LEVEL_ERROR, "rpcRecvPoll failed with status 0x%08x for fn %d sequence %d!\n", + nvStatus, expectedFunc, waitSequence); } NV_ASSERT(0); return nvStatus; @@ -1931,26 +1951,27 @@ static NV_STATUS _issueRpcLarge remainingSize -= entryLength; pBuf8 += entryLength; + waitSequence++; // For bidirectional transfer messages, need to receive all other frames as well if (bBidirectional && (recordCount > 0)) { while (remainingSize > 0) { - nvStatus = rpcRecvPoll(pGpu, pRpc, NV_VGPU_MSG_FUNCTION_CONTINUATION_RECORD); + nvStatus = rpcRecvPoll(pGpu, pRpc, NV_VGPU_MSG_FUNCTION_CONTINUATION_RECORD, waitSequence); if (nvStatus != NV_OK) { if (nvStatus == NV_ERR_TIMEOUT) { NV_PRINTF(LEVEL_ERROR, - "rpcRecvPoll timedout for fn %d continuation record (remainingSize=0x%x)!\n", - vgpu_rpc_message_header_v->function, remainingSize); + "rpcRecvPoll timedout for fn %d sequence %d continuation record (remainingSize=0x%x)!\n", + expectedFunc, waitSequence, remainingSize); } else { NV_PRINTF(LEVEL_ERROR, - "rpcRecvPoll failed with status 0x%08x for fn %d continuation record! (remainingSize=0x%x)\n", - nvStatus, vgpu_rpc_message_header_v->function, remainingSize); + "rpcRecvPoll failed with status 0x%08x for fn %d sequence %d continuation record! (remainingSize=0x%x)\n", + nvStatus, expectedFunc, waitSequence, remainingSize); } NV_ASSERT(0); return nvStatus; @@ -1968,9 +1989,11 @@ static NV_STATUS _issueRpcLarge remainingSize -= entryLength; pBuf8 += entryLength; recordCount--; + waitSequence++; } vgpu_rpc_message_header_v->function = expectedFunc; NV_ASSERT(recordCount == 0); + NV_ASSERT(waitSequence - 1 == lastSequence); } // Now check if RPC really succeeded @@ -9577,6 +9600,7 @@ NV_STATUS rpcGspSetSystemInfo_v17_00 rpcInfo->bIsPrimary = pGpu->getProperty(pGpu, PDB_PROP_GPU_PRIMARY_DEVICE); + rpcInfo->bS0ixSupport = pSys->getProperty(pSys, PDB_PROP_SYS_SUPPORTS_S0IX); #if defined(NV_UNIX) && !RMCFG_FEATURE_MODS_FEATURES rpcInfo->isGridBuild = os_is_grid_supported(); #endif diff --git a/version.mk b/version.mk index 35b396326..3af3247a1 100644 --- a/version.mk +++ b/version.mk @@ -1,4 +1,4 @@ -NVIDIA_VERSION = 570.181 +NVIDIA_VERSION = 570.190 # This file. VERSION_MK_FILE := $(lastword $(MAKEFILE_LIST))
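
Illustrative note (not part of the patch above): the main functional change threaded through this diff is a per-message sequence number on the GSP/vGPU RPC path — OBJRPC gains a `sequence` counter, `rpcSendMessage` takes an extra `NvU32 *` that returns the sequence assigned to the outgoing message, and `rpcRecvPoll` now matches a reply on both the function id and that sequence. This keeps a late reply to an already-timed-out RPC from being mistaken for the response to the next request that happens to use the same function id. The standalone C sketch below mimics that request/response matching pattern only; every name in it (`rpc_channel_t`, `rpc_msg_t`, `rpc_send`, `rpc_wait_for_reply`, `fetch_next_message`) is hypothetical and is not taken from the driver.

/*
 * Minimal sketch of sequence-matched RPC send/poll, under the assumptions
 * stated above.  Not the driver's implementation.
 */
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>

typedef struct {
    uint32_t function;
    uint32_t sequence;
} rpc_msg_t;

typedef struct {
    uint32_t next_sequence;                 /* analogous to OBJRPC::sequence            */
    rpc_msg_t (*fetch_next_message)(void);  /* stand-in for draining the message queue  */
} rpc_channel_t;

/* Stamp the outgoing header and report the sequence the caller must wait for. */
static void rpc_send(rpc_channel_t *ch, rpc_msg_t *msg, uint32_t *out_seq)
{
    msg->sequence = ch->next_sequence++;
    if (out_seq != NULL)
        *out_seq = msg->sequence;
    /* ...hand msg to the transport here... */
}

/* Accept a reply only when both the function id and the sequence match the request. */
static bool rpc_wait_for_reply(rpc_channel_t *ch, uint32_t expected_func,
                               uint32_t expected_seq, unsigned max_polls)
{
    for (unsigned i = 0; i < max_polls; i++) {
        rpc_msg_t reply = ch->fetch_next_message();
        if (reply.function == expected_func && reply.sequence == expected_seq)
            return true;   /* this is the response to our request                    */
        /* otherwise treat it as an unrelated or stale event and keep polling        */
    }
    return false;          /* timeout; the real driver reports this via _kgspLogXid119 */
}

The same idea explains the multi-part RPC handling in _issueRpcLarge: each continuation record consumes the next sequence number, so the receive loop can step `waitSequence` forward and verify at the end that exactly `recordCount` continuation replies were consumed.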