diff --git a/README.md b/README.md index fea72f437..f1b68c059 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # NVIDIA Linux Open GPU Kernel Module Source This is the source release of the NVIDIA Linux open GPU kernel modules, -version 580.94.06. +version 580.94.10. ## How to Build @@ -17,7 +17,7 @@ as root: Note that the kernel modules built here must be used with GSP firmware and user-space NVIDIA GPU driver components from a corresponding -580.94.06 driver release. This can be achieved by installing +580.94.10 driver release. This can be achieved by installing the NVIDIA GPU driver from the .run file using the `--no-kernel-modules` option. E.g., @@ -185,7 +185,7 @@ table below). For details on feature support and limitations, see the NVIDIA GPU driver end user README here: -https://us.download.nvidia.com/XFree86/Linux-x86_64/580.94.06/README/kernel_open.html +https://us.download.nvidia.com/XFree86/Linux-x86_64/580.94.10/README/kernel_open.html For vGPU support, please refer to the README.vgpu packaged in the vGPU Host Package for more details. @@ -749,6 +749,7 @@ Subsystem Device ID. | NVIDIA A10 | 2236 10DE 1482 | | NVIDIA A10G | 2237 10DE 152F | | NVIDIA A10M | 2238 10DE 1677 | +| NVIDIA H20 NVL16 | 230E 10DE 20DF | | NVIDIA H100 NVL | 2321 10DE 1839 | | NVIDIA H800 PCIe | 2322 10DE 17A4 | | NVIDIA H800 | 2324 10DE 17A6 | @@ -949,9 +950,10 @@ Subsystem Device ID. | NVIDIA GB200 | 2941 10DE 20D5 | | NVIDIA GB200 | 2941 10DE 21C9 | | NVIDIA GB200 | 2941 10DE 21CA | +| NVIDIA DRIVE P2021 | 29BB 10DE 207C | | NVIDIA GeForce RTX 5090 | 2B85 | | NVIDIA GeForce RTX 5090 D | 2B87 | -| NVIDIA GeForce RTX 5090 D v2 | 2B8C 17AA 530C | +| NVIDIA GeForce RTX 5090 D v2 | 2B8C | | NVIDIA RTX PRO 6000 Blackwell Workstation Edition | 2BB1 1028 204B | | NVIDIA RTX PRO 6000 Blackwell Workstation Edition | 2BB1 103C 204B | | NVIDIA RTX PRO 6000 Blackwell Workstation Edition | 2BB1 10DE 204B | @@ -964,6 +966,8 @@ Subsystem Device ID. | NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition | 2BB4 103C 204C | | NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition | 2BB4 10DE 204C | | NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition | 2BB4 17AA 204C | +| NVIDIA RTX PRO 6000 Blackwell Server Edition | 2BB5 10DE 204E | +| NVIDIA RTX 6000D | 2BB9 10DE 2091 | | NVIDIA GeForce RTX 5080 | 2C02 | | NVIDIA GeForce RTX 5070 Ti | 2C05 | | NVIDIA GeForce RTX 5090 Laptop GPU | 2C18 | @@ -974,6 +978,7 @@ Subsystem Device ID. | NVIDIA RTX PRO 4500 Blackwell | 2C31 17AA 2051 | | NVIDIA RTX PRO 4000 Blackwell SFF Edition | 2C33 1028 2053 | | NVIDIA RTX PRO 4000 Blackwell SFF Edition | 2C33 103C 2053 | +| NVIDIA RTX PRO 4000 Blackwell SFF Edition | 2C33 10DE 2053 | | NVIDIA RTX PRO 4000 Blackwell SFF Edition | 2C33 17AA 2053 | | NVIDIA RTX PRO 4000 Blackwell | 2C34 1028 2052 | | NVIDIA RTX PRO 4000 Blackwell | 2C34 103C 2052 | @@ -983,22 +988,29 @@ Subsystem Device ID. 
| NVIDIA RTX PRO 4000 Blackwell Generation Laptop GPU | 2C39 | | NVIDIA GeForce RTX 5090 Laptop GPU | 2C58 | | NVIDIA GeForce RTX 5080 Laptop GPU | 2C59 | +| NVIDIA RTX PRO 5000 Blackwell Embedded GPU | 2C77 | +| NVIDIA RTX PRO 4000 Blackwell Embedded GPU | 2C79 | | NVIDIA GeForce RTX 5060 Ti | 2D04 | | NVIDIA GeForce RTX 5060 | 2D05 | | NVIDIA GeForce RTX 5070 Laptop GPU | 2D18 | | NVIDIA GeForce RTX 5060 Laptop GPU | 2D19 | | NVIDIA RTX PRO 2000 Blackwell | 2D30 1028 2054 | | NVIDIA RTX PRO 2000 Blackwell | 2D30 103C 2054 | +| NVIDIA RTX PRO 2000 Blackwell | 2D30 10DE 2054 | | NVIDIA RTX PRO 2000 Blackwell | 2D30 17AA 2054 | | NVIDIA RTX PRO 2000 Blackwell Generation Laptop GPU | 2D39 | | NVIDIA GeForce RTX 5070 Laptop GPU | 2D58 | | NVIDIA GeForce RTX 5060 Laptop GPU | 2D59 | -| NVIDIA GeForce RTX 5050 | 2D83 17AA C791 | +| NVIDIA RTX PRO 2000 Blackwell Embedded GPU | 2D79 | +| NVIDIA GeForce RTX 5050 | 2D83 | | NVIDIA GeForce RTX 5050 Laptop GPU | 2D98 | | NVIDIA RTX PRO 1000 Blackwell Generation Laptop GPU | 2DB8 | | NVIDIA RTX PRO 500 Blackwell Generation Laptop GPU | 2DB9 | | NVIDIA GeForce RTX 5050 Laptop GPU | 2DD8 | +| NVIDIA RTX PRO 500 Blackwell Embedded GPU | 2DF9 | | NVIDIA GeForce RTX 5070 | 2F04 | | NVIDIA GeForce RTX 5070 Ti Laptop GPU | 2F18 | | NVIDIA RTX PRO 3000 Blackwell Generation Laptop GPU | 2F38 | | NVIDIA GeForce RTX 5070 Ti Laptop GPU | 2F58 | +| NVIDIA B300 SXM6 AC | 3182 10DE 20E6 | +| NVIDIA GB300 | 31C2 10DE 21F1 | diff --git a/kernel-open/Kbuild b/kernel-open/Kbuild index 536552e5d..bfb3a3c95 100644 --- a/kernel-open/Kbuild +++ b/kernel-open/Kbuild @@ -79,7 +79,7 @@ ccflags-y += -I$(src)/common/inc ccflags-y += -I$(src) ccflags-y += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-format-extra-args ccflags-y += -D__KERNEL__ -DMODULE -DNVRM -ccflags-y += -DNV_VERSION_STRING=\"580.94.06\" +ccflags-y += -DNV_VERSION_STRING=\"580.94.10\" # Include and link Tegra out-of-tree modules. 
ifneq ($(wildcard /usr/src/nvidia/nvidia-oot),) diff --git a/kernel-open/common/inc/nvstatuscodes.h b/kernel-open/common/inc/nvstatuscodes.h index 98ebb7b47..440434997 100644 --- a/kernel-open/common/inc/nvstatuscodes.h +++ b/kernel-open/common/inc/nvstatuscodes.h @@ -165,6 +165,7 @@ NV_STATUS_CODE(NV_ERR_FABRIC_STATE_OUT_OF_SYNC, 0x00000087, "NVLink fabri NV_STATUS_CODE(NV_ERR_BUFFER_FULL, 0x00000088, "Buffer is full") NV_STATUS_CODE(NV_ERR_BUFFER_EMPTY, 0x00000089, "Buffer is empty") NV_STATUS_CODE(NV_ERR_MC_FLA_OFFSET_TABLE_FULL, 0x0000008A, "Multicast FLA offset table has no available slots") +NV_STATUS_CODE(NV_ERR_DMA_XFER_FAILED, 0x0000008B, "DMA transfer failed") // Warnings: NV_STATUS_CODE(NV_WARN_HOT_SWITCH, 0x00010001, "WARNING Hot switch") diff --git a/kernel-open/common/inc/os-interface.h b/kernel-open/common/inc/os-interface.h index 523368eaa..dde5c843a 100644 --- a/kernel-open/common/inc/os-interface.h +++ b/kernel-open/common/inc/os-interface.h @@ -62,6 +62,11 @@ struct os_work_queue; /* Each OS defines its own version of this opaque type */ typedef struct os_wait_queue os_wait_queue; +/* Flags needed by os_get_current_process_flags */ +#define OS_CURRENT_PROCESS_FLAG_NONE 0x0 +#define OS_CURRENT_PROCESS_FLAG_KERNEL_THREAD 0x1 +#define OS_CURRENT_PROCESS_FLAG_EXITING 0x2 + /* * --------------------------------------------------------------------------- * @@ -194,6 +199,7 @@ NV_STATUS NV_API_CALL os_open_readonly_file (const char *, void ** NV_STATUS NV_API_CALL os_open_and_read_file (const char *, NvU8 *, NvU64); NvBool NV_API_CALL os_is_nvswitch_present (void); NV_STATUS NV_API_CALL os_get_random_bytes (NvU8 *, NvU16); +NvU32 NV_API_CALL os_get_current_process_flags (void); NV_STATUS NV_API_CALL os_alloc_wait_queue (os_wait_queue **); void NV_API_CALL os_free_wait_queue (os_wait_queue *); void NV_API_CALL os_wait_uninterruptible (os_wait_queue *); diff --git a/kernel-open/nvidia-uvm/uvm_ampere_host.c b/kernel-open/nvidia-uvm/uvm_ampere_host.c index 834bf93b9..0bcbf9cf9 100644 --- a/kernel-open/nvidia-uvm/uvm_ampere_host.c +++ b/kernel-open/nvidia-uvm/uvm_ampere_host.c @@ -461,3 +461,29 @@ void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push, if (params->membar == UvmInvalidateTlbMemBarLocal) uvm_push_get_gpu(push)->parent->host_hal->membar_gpu(push); } + +void uvm_hal_ampere_host_l2_invalidate(uvm_push_t *push, uvm_aperture_t aperture) +{ + uvm_gpu_t *gpu = uvm_push_get_gpu(push); + NvU32 aperture_value; + + if (aperture == UVM_APERTURE_SYS) { + aperture_value = HWCONST(C56F, MEM_OP_D, OPERATION, L2_SYSMEM_INVALIDATE); + } + else if (uvm_aperture_is_peer(aperture)) { + aperture_value = HWCONST(C56F, MEM_OP_D, OPERATION, L2_PEERMEM_INVALIDATE); + } + else { + UVM_ASSERT_MSG(false, "Invalid aperture_type %d\n", aperture); + return; + } + + uvm_hal_membar(gpu, push, UVM_MEMBAR_SYS); + + NV_PUSH_4U(C56F, MEM_OP_A, 0, + MEM_OP_B, 0, + MEM_OP_C, 0, + MEM_OP_D, aperture_value); + + uvm_hal_membar(gpu, push, UVM_MEMBAR_SYS); +} diff --git a/kernel-open/nvidia-uvm/uvm_blackwell_host.c b/kernel-open/nvidia-uvm/uvm_blackwell_host.c index 7552863ce..d829bf445 100644 --- a/kernel-open/nvidia-uvm/uvm_blackwell_host.c +++ b/kernel-open/nvidia-uvm/uvm_blackwell_host.c @@ -347,10 +347,23 @@ uvm_hal_blackwell_access_counter_query_clear_op_gb20x(uvm_parent_gpu_t *parent_g return UVM_ACCESS_COUNTER_CLEAR_OP_TARGETED; } -// Host-specific L2 cache invalidate for non-coherent sysmem -void uvm_hal_blackwell_host_l2_invalidate_noncoh_sysmem(uvm_push_t *push) +void
uvm_hal_blackwell_host_l2_invalidate(uvm_push_t *push, uvm_aperture_t aperture) { uvm_gpu_t *gpu = uvm_push_get_gpu(push); + NvU32 aperture_value; + + if (!gpu->parent->is_integrated_gpu) { + return uvm_hal_ampere_host_l2_invalidate(push, aperture); + } + + switch (aperture) { + case UVM_APERTURE_SYS: + aperture_value = HWCONST(C96F, MEM_OP_D, OPERATION, L2_SYSMEM_NCOH_INVALIDATE); + break; + default: + UVM_ASSERT_MSG(false, "Invalid aperture_type %d\n", aperture); + return; + } // First sysmembar uvm_hal_membar(gpu, push, UVM_MEMBAR_SYS); @@ -363,7 +376,7 @@ void uvm_hal_blackwell_host_l2_invalidate_noncoh_sysmem(uvm_push_t *push) NV_PUSH_4U(C96F, MEM_OP_A, 0, MEM_OP_B, 0, MEM_OP_C, 0, - MEM_OP_D, HWCONST(C96F, MEM_OP_D, OPERATION, L2_SYSMEM_NCOH_INVALIDATE)); + MEM_OP_D, aperture_value); // Final sysmembar uvm_hal_membar(gpu, push, UVM_MEMBAR_SYS); } diff --git a/kernel-open/nvidia-uvm/uvm_hal.c b/kernel-open/nvidia-uvm/uvm_hal.c index 777d16a97..93f0e70c1 100644 --- a/kernel-open/nvidia-uvm/uvm_hal.c +++ b/kernel-open/nvidia-uvm/uvm_hal.c @@ -221,7 +221,7 @@ static uvm_hal_class_ops_t host_table[] = .access_counter_clear_all = uvm_hal_maxwell_access_counter_clear_all_unsupported, .access_counter_clear_targeted = uvm_hal_maxwell_access_counter_clear_targeted_unsupported, .access_counter_query_clear_op = uvm_hal_maxwell_access_counter_query_clear_op_unsupported, - .l2_invalidate_noncoh_sysmem = uvm_hal_host_l2_invalidate_noncoh_sysmem_unsupported, + .l2_invalidate = uvm_hal_host_l2_invalidate_unsupported, .get_time = uvm_hal_maxwell_get_time, } }, @@ -287,6 +287,7 @@ static uvm_hal_class_ops_t host_table[] = .tlb_invalidate_all = uvm_hal_ampere_host_tlb_invalidate_all, .tlb_invalidate_va = uvm_hal_ampere_host_tlb_invalidate_va, .tlb_invalidate_test = uvm_hal_ampere_host_tlb_invalidate_test, + .l2_invalidate = uvm_hal_ampere_host_l2_invalidate, } }, { @@ -315,8 +316,8 @@ static uvm_hal_class_ops_t host_table[] = .tlb_invalidate_phys = uvm_hal_blackwell_host_tlb_invalidate_phys, .tlb_invalidate_test = uvm_hal_blackwell_host_tlb_invalidate_test, .tlb_flush_prefetch = uvm_hal_blackwell_host_tlb_flush_prefetch, - .l2_invalidate_noncoh_sysmem = uvm_hal_blackwell_host_l2_invalidate_noncoh_sysmem, .access_counter_query_clear_op = uvm_hal_blackwell_access_counter_query_clear_op_gb100, + .l2_invalidate = uvm_hal_blackwell_host_l2_invalidate, } }, { @@ -1162,10 +1163,11 @@ void uvm_hal_ce_memcopy_patch_src_stub(uvm_push_t *push, uvm_gpu_address_t *src) { } -void uvm_hal_host_l2_invalidate_noncoh_sysmem_unsupported(uvm_push_t *push) +void uvm_hal_host_l2_invalidate_unsupported(uvm_push_t *push, uvm_aperture_t aperture) { uvm_gpu_t *gpu = uvm_push_get_gpu(push); UVM_ERR_PRINT("L2 cache invalidation: Called on unsupported GPU %s (arch: 0x%x, impl: 0x%x)\n", uvm_gpu_name(gpu), gpu->parent->rm_info.gpuArch, gpu->parent->rm_info.gpuImplementation); - UVM_ASSERT_MSG(false, "host l2_invalidate_noncoh_sysmem called on unsupported GPU\n"); + UVM_ASSERT_MSG(false, "L2 invalidate is not supported on %s", + uvm_parent_gpu_name(gpu->parent)); } \ No newline at end of file diff --git a/kernel-open/nvidia-uvm/uvm_hal.h b/kernel-open/nvidia-uvm/uvm_hal.h index 004ba6392..f492f5cc9 100644 --- a/kernel-open/nvidia-uvm/uvm_hal.h +++ b/kernel-open/nvidia-uvm/uvm_hal.h @@ -248,11 +248,12 @@ typedef void (*uvm_hal_host_tlb_flush_prefetch_t)(uvm_push_t *push); void uvm_hal_maxwell_host_tlb_flush_prefetch_unsupported(uvm_push_t *push); void uvm_hal_blackwell_host_tlb_flush_prefetch(uvm_push_t *push); -// L2 cache invalidate 
for non-coherent sysmem for systems with write back cache. -// These are iGPUs as of now. -typedef void (*uvm_hal_host_l2_invalidate_noncoh_sysmem_t)(uvm_push_t *push); -void uvm_hal_blackwell_host_l2_invalidate_noncoh_sysmem(uvm_push_t *push); -void uvm_hal_host_l2_invalidate_noncoh_sysmem_unsupported(uvm_push_t *push); +// Performs L2 cache invalidation for peer or system memory. +typedef void (*uvm_hal_host_l2_invalidate_t)(uvm_push_t *push, uvm_aperture_t aperture); +void uvm_hal_blackwell_host_l2_invalidate(uvm_push_t *push, uvm_aperture_t aperture); + +void uvm_hal_ampere_host_l2_invalidate(uvm_push_t *push, uvm_aperture_t aperture); +void uvm_hal_host_l2_invalidate_unsupported(uvm_push_t *push, uvm_aperture_t aperture); // By default all semaphore release operations include a membar sys before the // operation. This can be affected by using UVM_PUSH_FLAG_NEXT_* flags with @@ -822,7 +823,7 @@ struct uvm_host_hal_struct uvm_hal_host_tlb_invalidate_phys_t tlb_invalidate_phys; uvm_hal_host_tlb_invalidate_test_t tlb_invalidate_test; uvm_hal_host_tlb_flush_prefetch_t tlb_flush_prefetch; - uvm_hal_host_l2_invalidate_noncoh_sysmem_t l2_invalidate_noncoh_sysmem; + uvm_hal_host_l2_invalidate_t l2_invalidate; uvm_hal_fault_buffer_replay_t replay_faults; uvm_hal_fault_cancel_global_t cancel_faults_global; uvm_hal_fault_cancel_targeted_t cancel_faults_targeted; diff --git a/kernel-open/nvidia-uvm/uvm_map_external.c b/kernel-open/nvidia-uvm/uvm_map_external.c index af6b8f5c5..24bfca275 100644 --- a/kernel-open/nvidia-uvm/uvm_map_external.c +++ b/kernel-open/nvidia-uvm/uvm_map_external.c @@ -1276,11 +1276,20 @@ void uvm_ext_gpu_map_destroy(uvm_va_range_external_t *external_range, range_tree = uvm_ext_gpu_range_tree(external_range, mapped_gpu); - // Perform L2 cache invalidation for noncoherent sysmem mappings. - // This is done only on systems with write-back cache which is iGPUs as of now. + // Perform L2 cache invalidation for cached peer and sysmem mappings. if (ext_gpu_map->need_l2_invalidate_at_unmap) { - UVM_ASSERT(ext_gpu_map->gpu->parent->is_integrated_gpu); - status = uvm_mmu_l2_invalidate_noncoh_sysmem(mapped_gpu); + uvm_aperture_t aperture; + + // Peer cache invalidation is not targeted to a specific peer, so we + // just use UVM_APERTURE_PEER(0). + if (ext_gpu_map->is_egm) + aperture = UVM_APERTURE_PEER(0); + else if (ext_gpu_map->is_sysmem) + aperture = UVM_APERTURE_SYS; + else + aperture = UVM_APERTURE_PEER(0); + + status = uvm_mmu_l2_invalidate(mapped_gpu, aperture); UVM_ASSERT(status == NV_OK); } diff --git a/kernel-open/nvidia-uvm/uvm_mmu.c b/kernel-open/nvidia-uvm/uvm_mmu.c index 5a1fd6a44..c6e1ed256 100644 --- a/kernel-open/nvidia-uvm/uvm_mmu.c +++ b/kernel-open/nvidia-uvm/uvm_mmu.c @@ -2974,25 +2974,21 @@ NV_STATUS uvm_mmu_tlb_invalidate_phys(uvm_gpu_t *gpu) return uvm_push_end_and_wait(&push); } -NV_STATUS uvm_mmu_l2_invalidate_noncoh_sysmem(uvm_gpu_t *gpu) +NV_STATUS uvm_mmu_l2_invalidate(uvm_gpu_t *gpu, uvm_aperture_t aperture) { uvm_push_t push; NV_STATUS status; - // L2 cache invalidation is only done for systems with write-back - // cache which is iGPUs as of now. 
- UVM_ASSERT(gpu->parent->is_integrated_gpu); - status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_MEMOPS, &push, - "L2 cache invalidate for sysmem"); + "L2 cache invalidate"); if (status != NV_OK) { UVM_ERR_PRINT("L2 cache invalidation: Failed to begin push, status: %s\n", nvstatusToString(status)); return status; } - gpu->parent->host_hal->l2_invalidate_noncoh_sysmem(&push); + gpu->parent->host_hal->l2_invalidate(&push, aperture); status = uvm_push_end_and_wait(&push); if (status != NV_OK) diff --git a/kernel-open/nvidia-uvm/uvm_mmu.h b/kernel-open/nvidia-uvm/uvm_mmu.h index a969df3ac..134c73b5e 100644 --- a/kernel-open/nvidia-uvm/uvm_mmu.h +++ b/kernel-open/nvidia-uvm/uvm_mmu.h @@ -722,9 +722,8 @@ uvm_gpu_address_t uvm_mmu_gpu_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t phy // dma addresses, IOVAs, and GPAs). See uvm_dma_map_invalidation_t. NV_STATUS uvm_mmu_tlb_invalidate_phys(uvm_gpu_t *gpu); -// Invalidate L2 cache when noncoherent sysmem mappings are unmapped. -// This is done for systems with write-back cache i.e. iGPUs as of now. -NV_STATUS uvm_mmu_l2_invalidate_noncoh_sysmem(uvm_gpu_t *gpu); +// Invalidate L2 cache for peer or system memory. +NV_STATUS uvm_mmu_l2_invalidate(uvm_gpu_t *gpu, uvm_aperture_t aperture); NV_STATUS uvm_test_invalidate_tlb(UVM_TEST_INVALIDATE_TLB_PARAMS *params, struct file *filp); diff --git a/kernel-open/nvidia-uvm/uvm_va_range.h b/kernel-open/nvidia-uvm/uvm_va_range.h index 4c12cfdd3..16ff18eb5 100644 --- a/kernel-open/nvidia-uvm/uvm_va_range.h +++ b/kernel-open/nvidia-uvm/uvm_va_range.h @@ -204,8 +204,12 @@ typedef struct uvm_deferred_free_object_t deferred_free; // Flag indicating whether L2 cache invalidation is needed at unmap time. - // This is set by RM during mapping and used during unmap to determine - // if L2 cache invalidation should be performed for non coherent sysmem. + // This is set by RM during mapping and used during unmap to determine if L2 + // cache invalidation should be performed. For GPU cached system memory + // allocations on systems with a write-back cache, this is required for + // correctness. For GPU cached peer and system memory on systems with a + // write-through cache, the invalidation could be done by RM at map time; + // however, this introduces overhead during performance-sensitive sections.
bool need_l2_invalidate_at_unmap; } uvm_ext_gpu_map_t; diff --git a/kernel-open/nvidia/os-interface.c b/kernel-open/nvidia/os-interface.c index a03a3b88c..835e329d9 100644 --- a/kernel-open/nvidia/os-interface.c +++ b/kernel-open/nvidia/os-interface.c @@ -2061,6 +2061,22 @@ NV_STATUS NV_API_CALL os_get_random_bytes return NV_OK; } +NvU32 NV_API_CALL os_get_current_process_flags +( + void +) +{ + NvU32 flags = OS_CURRENT_PROCESS_FLAG_NONE; + + if (current->flags & PF_EXITING) + flags |= OS_CURRENT_PROCESS_FLAG_EXITING; + + if (current->flags & PF_KTHREAD) + flags |= OS_CURRENT_PROCESS_FLAG_KERNEL_THREAD; + + return flags; +} + NV_STATUS NV_API_CALL os_alloc_wait_queue ( os_wait_queue **wq diff --git a/src/common/inc/nvBldVer.h b/src/common/inc/nvBldVer.h index 844e22126..83ec5b74a 100644 --- a/src/common/inc/nvBldVer.h +++ b/src/common/inc/nvBldVer.h @@ -43,18 +43,18 @@ #endif #if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS) -#define NV_BUILD_BRANCH_VERSION "rel/gpu_drv/r580/VK580_65-182" -#define NV_BUILD_CHANGELIST_NUM (36741708) +#define NV_BUILD_BRANCH_VERSION "rel/gpu_drv/r580/VK580_65-186" +#define NV_BUILD_CHANGELIST_NUM (36888175) #define NV_BUILD_TYPE "Official" -#define NV_BUILD_NAME "rel/gpu_drv/r580/VK580_65-182" -#define NV_LAST_OFFICIAL_CHANGELIST_NUM (36741708) +#define NV_BUILD_NAME "rel/gpu_drv/r580/VK580_65-186" +#define NV_LAST_OFFICIAL_CHANGELIST_NUM (36888175) #else /* Windows builds */ -#define NV_BUILD_BRANCH_VERSION "VK580_65-9" -#define NV_BUILD_CHANGELIST_NUM (36741708) +#define NV_BUILD_BRANCH_VERSION "VK580_65-12" +#define NV_BUILD_CHANGELIST_NUM (36887028) #define NV_BUILD_TYPE "Official" -#define NV_BUILD_NAME "581.71" -#define NV_LAST_OFFICIAL_CHANGELIST_NUM (36741708) +#define NV_BUILD_NAME "581.90" +#define NV_LAST_OFFICIAL_CHANGELIST_NUM (36887028) #define NV_BUILD_BRANCH_BASE_VERSION R580 #endif // End buildmeister python edited section diff --git a/src/common/inc/nvUnixVersion.h b/src/common/inc/nvUnixVersion.h index 0f8a7250b..89e34cfb8 100644 --- a/src/common/inc/nvUnixVersion.h +++ b/src/common/inc/nvUnixVersion.h @@ -4,7 +4,7 @@ #if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS) || defined(NV_VMWARE) || defined(NV_QNX) || defined(NV_INTEGRITY) || \ (defined(RMCFG_FEATURE_PLATFORM_GSP) && RMCFG_FEATURE_PLATFORM_GSP == 1) -#define NV_VERSION_STRING "580.94.06" +#define NV_VERSION_STRING "580.94.10" #else diff --git a/src/common/modeset/hdmipacket/nvhdmipkt_C671.c b/src/common/modeset/hdmipacket/nvhdmipkt_C671.c index e25344dde..eb4d3096f 100644 --- a/src/common/modeset/hdmipacket/nvhdmipkt_C671.c +++ b/src/common/modeset/hdmipacket/nvhdmipkt_C671.c @@ -175,6 +175,7 @@ static NVHDMIPKT_RESULT SetFRLLinkRate(NVHDMIPKT_CLASS *pThis, const NvU32 subDevice, const NvU32 displayId, const NvBool bFakeLt, + const NvBool bDoNotSkipLt, const NvBool bLinkAssessmentOnly, const NvU32 frlRate) { @@ -184,6 +185,7 @@ static NVHDMIPKT_RESULT SetFRLLinkRate(NVHDMIPKT_CLASS *pThis, params.displayId = displayId; params.data = frlRate; params.bFakeLt = bFakeLt; + params.bDoNotSkipLt = bDoNotSkipLt; params.bLinkAssessmentOnly = bLinkAssessmentOnly; #if NVHDMIPKT_RM_CALLS_INTERNAL @@ -275,14 +277,16 @@ performLinkTraningToAssessFRLLink(NVHDMIPKT_CLASS *pThis, { // If the display is active and the maximum link rate matches the link // rate required for the current mode timings, avoid marking the set - // link configuration call as an assessment only. This prevents - // re-training after the assessment. + // link configuration call as an assessment only. 
This allows us to + // re-train the existing link now instead of after the assessment. + // In addition, do not allow link training to be skipped to ensure + // we succesfully recover an existing FRL config. const NvBool bLinkAssessmentOnly = bIsDisplayActive ? (nv0073currFRLRate != maxFRLRate) : NV_TRUE; if (SetFRLLinkRate(pThis, subDevice, displayId, - NV_FALSE /* bFakeLt */, bLinkAssessmentOnly, - maxFRLRate) == NVHDMIPKT_SUCCESS) + NV_FALSE /* bFakeLt */, NV_TRUE /* bDoNotSkipLt */, + bLinkAssessmentOnly, maxFRLRate) == NVHDMIPKT_SUCCESS) { break; } @@ -299,11 +303,13 @@ performLinkTraningToAssessFRLLink(NVHDMIPKT_CLASS *pThis, if (SetFRLLinkRate(pThis, subDevice, displayId, bFakeLt, NV_FALSE /* bLinkAssessmentOnly */, + NV_FALSE /* bDoNotSkipLt */, currFRLRate) != NVHDMIPKT_SUCCESS) { if (!bFakeLt) { if (SetFRLLinkRate(pThis, subDevice, displayId, NV_TRUE, NV_FALSE /* bLinkAssessmentOnly */, + NV_FALSE /* bDoNotSkipLt */, currFRLRate) != NVHDMIPKT_SUCCESS) { NvHdmiPkt_Assert(0); } @@ -1130,6 +1136,19 @@ hdmiQueryFRLConfigC671(NVHDMIPKT_CLASS *pThis, NvU32 bppMinX16Itr, bppMaxX16Itr; NvBool bHasPreCalcFRLData = NV_FALSE; + NvBool forceFRLRateDSC = pClientCtrl->forceFRLRate; + HDMI_FRL_DATA_RATE requestedFRLRate = pClientCtrl->frlRate; + +#if defined(NVHDMIPKT_NVKMS) + NvU32 rr = (pVidTransInfo->pTiming->pclk * (NvU64)10000) / + (pVidTransInfo->pTiming->HTotal * (NvU64)pVidTransInfo->pTiming->VTotal); + + if (!pVidTransInfo->pTiming->interlaced && (rr >= 480)) { + forceFRLRateDSC = NV_TRUE; + requestedFRLRate = dscMaxFRLRate; + } +#endif + // DSC_All_bpp = 1: // Lower the compression ratio better the pixel quality, hence a high bppTarget value will be ideal // DSC_All_bpp = 1 allows us the flexibility to use a bppTarget setting different from the primary compressed format @@ -1237,16 +1256,16 @@ hdmiQueryFRLConfigC671(NVHDMIPKT_CLASS *pThis, frlParams.compressionInfo.hSlices = NV_UNSIGNED_DIV_CEIL(pVidTransInfo->pTiming->HVisible, pClientCtrl->sliceWidth); } - if (pClientCtrl->forceFRLRate) + if (forceFRLRateDSC) { - if (pClientCtrl->frlRate > dscMaxFRLRate) + if (requestedFRLRate > dscMaxFRLRate) { result = NVHDMIPKT_FAIL; goto frlQuery_fail; } - minFRLRateItr = pClientCtrl->frlRate; - maxFRLRateItr = pClientCtrl->frlRate; + minFRLRateItr = requestedFRLRate; + maxFRLRateItr = requestedFRLRate; } if (pClientCtrl->forceBppx16) @@ -1419,6 +1438,7 @@ hdmiSetFRLConfigC671(NVHDMIPKT_CLASS *pThis, { return SetFRLLinkRate(pThis, subDevice, displayId, bFakeLt, NV_FALSE /* bLinkAssessmentOnly */, + NV_FALSE /* bDoNotSkipLt */, translateFRLRateToNv0073SetHdmiFrlConfig(pFRLConfig->frlRate)); } @@ -1432,6 +1452,7 @@ hdmiClearFRLConfigC671(NVHDMIPKT_CLASS *pThis, { return SetFRLLinkRate(pThis, subDevice, displayId, NV_FALSE, NV_FALSE /* bLinkAssessmentOnly */, + NV_FALSE /* bDoNotSkipLt */, NV0073_CTRL_HDMI_FRL_DATA_SET_FRL_RATE_NONE); } diff --git a/src/common/nvlink/inband/interface/nvlink_inband_msg.h b/src/common/nvlink/inband/interface/nvlink_inband_msg.h index 727d157ef..bce5d755e 100644 --- a/src/common/nvlink/inband/interface/nvlink_inband_msg.h +++ b/src/common/nvlink/inband/interface/nvlink_inband_msg.h @@ -84,6 +84,7 @@ typedef struct #define NVLINK_INBAND_GPU_PROBE_CAPS_ATS_SUPPORT NVBIT(3) #define NVLINK_INBAND_GPU_PROBE_CAPS_LINK_RETRAIN_SUPPORT NVBIT(4) #define NVLINK_INBAND_GPU_PROBE_CAPS_HEALTH_SUMMARY NVBIT(6) +#define NVLINK_INBAND_GPU_PROBE_CAPS_MC_RETRY NVBIT(8) /* Add more caps as need in the future */ diff --git a/src/common/sdk/nvidia/inc/ctrl/ctrl0073/ctrl0073specific.h 
b/src/common/sdk/nvidia/inc/ctrl/ctrl0073/ctrl0073specific.h index f52a49f39..3bceec0fd 100644 --- a/src/common/sdk/nvidia/inc/ctrl/ctrl0073/ctrl0073specific.h +++ b/src/common/sdk/nvidia/inc/ctrl/ctrl0073/ctrl0073specific.h @@ -1377,6 +1377,7 @@ typedef struct NV0073_CTRL_SPECIFIC_SET_HDMI_FRL_LINK_CONFIG_PARAMS { NvU32 displayId; NvU32 data; NvBool bFakeLt; + NvBool bDoNotSkipLt; NvBool bLtSkipped; NvBool bLinkAssessmentOnly; } NV0073_CTRL_SPECIFIC_SET_HDMI_FRL_LINK_CONFIG_PARAMS; diff --git a/src/common/sdk/nvidia/inc/nverror.h b/src/common/sdk/nvidia/inc/nverror.h index fa52e8bf3..6f9b9544a 100644 --- a/src/common/sdk/nvidia/inc/nverror.h +++ b/src/common/sdk/nvidia/inc/nverror.h @@ -165,7 +165,8 @@ #define ROBUST_CHANNEL_UNUSED_ERROR_170 (170) #define UNCORRECTABLE_DRAM_ERROR (171) #define UNCORRECTABLE_SRAM_ERROR (172) -#define ROBUST_CHANNEL_LAST_ERROR (172) +#define C2C_FATAL_LINK_FAILURE (173) +#define ROBUST_CHANNEL_LAST_ERROR (173) // Indexed CE reference #define ROBUST_CHANNEL_CE_ERROR(x) \ diff --git a/src/common/sdk/nvidia/inc/nvstatuscodes.h b/src/common/sdk/nvidia/inc/nvstatuscodes.h index 98ebb7b47..440434997 100644 --- a/src/common/sdk/nvidia/inc/nvstatuscodes.h +++ b/src/common/sdk/nvidia/inc/nvstatuscodes.h @@ -165,6 +165,7 @@ NV_STATUS_CODE(NV_ERR_FABRIC_STATE_OUT_OF_SYNC, 0x00000087, "NVLink fabri NV_STATUS_CODE(NV_ERR_BUFFER_FULL, 0x00000088, "Buffer is full") NV_STATUS_CODE(NV_ERR_BUFFER_EMPTY, 0x00000089, "Buffer is empty") NV_STATUS_CODE(NV_ERR_MC_FLA_OFFSET_TABLE_FULL, 0x0000008A, "Multicast FLA offset table has no available slots") +NV_STATUS_CODE(NV_ERR_DMA_XFER_FAILED, 0x0000008B, "DMA transfer failed") // Warnings: NV_STATUS_CODE(NV_WARN_HOT_SWITCH, 0x00010001, "WARNING Hot switch") diff --git a/src/common/shared/inc/g_vgpu_chip_flags.h b/src/common/shared/inc/g_vgpu_chip_flags.h index 6184ed11a..845e8c97d 100644 --- a/src/common/shared/inc/g_vgpu_chip_flags.h +++ b/src/common/shared/inc/g_vgpu_chip_flags.h @@ -621,25 +621,6 @@ ENTRY(0x2238, 0x16B8, 0x10de, "NVIDIA A10M-10C"), ENTRY(0x2238, 0x16B9, 0x10de, "NVIDIA A10M-20C"), ENTRY(0x2238, 0x16E6, 0x10de, "NVIDIA A10M-1"), ENTRY(0x2238, 0x2208, 0x10de, "NVIDIA A10M-3B"), -ENTRY(0x230E, 0x20F5, 0x10de, "NVIDIA H20L-1-15CME"), -ENTRY(0x230E, 0x20F6, 0x10de, "NVIDIA H20L-1-15C"), -ENTRY(0x230E, 0x20F7, 0x10de, "NVIDIA H20L-1-30C"), -ENTRY(0x230E, 0x20F8, 0x10de, "NVIDIA H20L-2-30C"), -ENTRY(0x230E, 0x20F9, 0x10de, "NVIDIA H20L-3-60C"), -ENTRY(0x230E, 0x20FA, 0x10de, "NVIDIA H20L-4-60C"), -ENTRY(0x230E, 0x20FB, 0x10de, "NVIDIA H20L-7-120C"), -ENTRY(0x230E, 0x20FC, 0x10de, "NVIDIA H20L-4C"), -ENTRY(0x230E, 0x20FD, 0x10de, "NVIDIA H20L-5C"), -ENTRY(0x230E, 0x20FE, 0x10de, "NVIDIA H20L-6C"), -ENTRY(0x230E, 0x20FF, 0x10de, "NVIDIA H20L-8C"), -ENTRY(0x230E, 0x2100, 0x10de, "NVIDIA H20L-10C"), -ENTRY(0x230E, 0x2101, 0x10de, "NVIDIA H20L-12C"), -ENTRY(0x230E, 0x2102, 0x10de, "NVIDIA H20L-15C"), -ENTRY(0x230E, 0x2103, 0x10de, "NVIDIA H20L-20C"), -ENTRY(0x230E, 0x2104, 0x10de, "NVIDIA H20L-30C"), -ENTRY(0x230E, 0x2105, 0x10de, "NVIDIA H20L-40C"), -ENTRY(0x230E, 0x2106, 0x10de, "NVIDIA H20L-60C"), -ENTRY(0x230E, 0x2107, 0x10de, "NVIDIA H20L-120C"), ENTRY(0x2321, 0x1853, 0x10de, "NVIDIA H100L-1-12CME"), ENTRY(0x2321, 0x1854, 0x10de, "NVIDIA H100L-1-12C"), ENTRY(0x2321, 0x1855, 0x10de, "NVIDIA H100L-1-24C"), diff --git a/src/common/shared/inc/g_vgpu_resman_specific.h b/src/common/shared/inc/g_vgpu_resman_specific.h index eeca3a11e..ad84bab6b 100644 --- a/src/common/shared/inc/g_vgpu_resman_specific.h +++ 
b/src/common/shared/inc/g_vgpu_resman_specific.h @@ -17,7 +17,6 @@ static inline void _get_chip_id_for_alias_pgpu(NvU32 *dev_id, NvU32 *subdev_id) { 0x20B7, 0x1804, 0x20B7, 0x1532 }, { 0x20B9, 0x157F, 0x20B7, 0x1532 }, { 0x20FD, 0x17F8, 0x20F5, 0x0 }, - { 0x230E, 0x20DF, 0x230E, 0x20DF }, { 0x2324, 0x17A8, 0x2324, 0x17A6 }, { 0x2329, 0x198C, 0x2329, 0x198B }, { 0x232C, 0x2064, 0x232C, 0x2063 }, @@ -122,13 +121,6 @@ static const struct { {0x20F610DE, NV2080_CTRL_GPU_PARTITION_FLAG_ONE_HALF_GPU , 1094}, // GRID A800-4-20C {0x20F610DE, NV2080_CTRL_GPU_PARTITION_FLAG_FULL_GPU , 1095}, // GRID A800-7-40C {0x20F610DE, NV2080_CTRL_GPU_PARTITION_FLAG_ONE_MINI_QUARTER_GPU , 1091}, // GRID A800-1-10C - {0x230E10DE, NV2080_CTRL_GPU_PARTITION_FLAG_ONE_EIGHTHED_GPU | DRF_DEF(2080, _CTRL_GPU_PARTITION_FLAG, _REQ_DEC_JPG_OFA, _ENABLE), 1499}, // NVIDIA H20L-1-15CME - {0x230E10DE, NV2080_CTRL_GPU_PARTITION_FLAG_ONE_EIGHTHED_GPU , 1500}, // NVIDIA H20L-1-15C - {0x230E10DE, NV2080_CTRL_GPU_PARTITION_FLAG_ONE_MINI_QUARTER_GPU , 1501}, // NVIDIA H20L-1-30C - {0x230E10DE, NV2080_CTRL_GPU_PARTITION_FLAG_ONE_QUARTER_GPU , 1502}, // NVIDIA H20L-2-30C - {0x230E10DE, NV2080_CTRL_GPU_PARTITION_FLAG_ONE_MINI_HALF_GPU , 1503}, // NVIDIA H20L-3-60C - {0x230E10DE, NV2080_CTRL_GPU_PARTITION_FLAG_ONE_HALF_GPU , 1504}, // NVIDIA H20L-4-60C - {0x230E10DE, NV2080_CTRL_GPU_PARTITION_FLAG_FULL_GPU , 1505}, // NVIDIA H20L-7-120C {0x232110DE, NV2080_CTRL_GPU_PARTITION_FLAG_ONE_EIGHTHED_GPU | DRF_DEF(2080, _CTRL_GPU_PARTITION_FLAG, _REQ_DEC_JPG_OFA, _ENABLE), 1061}, // NVIDIA H100L-1-12CME {0x232110DE, NV2080_CTRL_GPU_PARTITION_FLAG_ONE_EIGHTHED_GPU , 1062}, // NVIDIA H100L-1-12C {0x232110DE, NV2080_CTRL_GPU_PARTITION_FLAG_ONE_MINI_QUARTER_GPU , 1063}, // NVIDIA H100L-1-24C diff --git a/src/common/shared/msgq/inc/msgq/msgq.h b/src/common/shared/msgq/inc/msgq/msgq.h index ee0d6f05a..d81bbd235 100644 --- a/src/common/shared/msgq/inc/msgq/msgq.h +++ b/src/common/shared/msgq/inc/msgq/msgq.h @@ -68,8 +68,8 @@ typedef void (*msgqFcnBarrier)(void); // Function to access backend memory (if it's not memory mapped). // Keep in mind that when using it, pointers given by peek can't be trusted // Should return 0 on success. -typedef int (*msgqFcnBackendRw)(void *pDest, const void *pSrc, unsigned size, - unsigned flags, void *pArg); +typedef unsigned (*msgqFcnBackendRw)(void *pDest, const void *pSrc, unsigned size, + unsigned flags, void *pArg); /** * @brief Return size of metadata (that must be allocated) diff --git a/src/common/shared/msgq/msgq.c b/src/common/shared/msgq/msgq.c index 84714b318..76d9dd361 100644 --- a/src/common/shared/msgq/msgq.c +++ b/src/common/shared/msgq/msgq.c @@ -104,35 +104,45 @@ msgqSetBarrier(msgqHandle handle, msgqFcnBarrier fcn) /* * Helper functions to access indirect backend. */ - -sysSHARED_CODE static void +// TODO: Make these functions return NV_STATUS instead of int wherever possible.
+sysSHARED_CODE static int _backendRead32(msgqMetadata *pQueue, volatile const void *pAddr, NvU32 *pVal, unsigned flags) { if (pQueue->fcnBackendRw != NULL) { - pQueue->fcnBackendRw(pVal, (const void *)pAddr, sizeof(*pVal), - flags | FCN_FLAG_BACKEND_ACCESS_READ, - pQueue->fcnBackendRwArg); + int status = pQueue->fcnBackendRw(pVal, (const void *)pAddr, sizeof(*pVal), + flags | FCN_FLAG_BACKEND_ACCESS_READ, + pQueue->fcnBackendRwArg); + if (status != 0) + { + return -1; + } } else { *pVal = *(volatile const NvU32*)pAddr; } + return 0; } -sysSHARED_CODE static void +sysSHARED_CODE static int _backendWrite32(msgqMetadata *pQueue, volatile void *pAddr, NvU32 *pVal, unsigned flags) { if (pQueue->fcnBackendRw != NULL) { - pQueue->fcnBackendRw((void*)pAddr, pVal, sizeof(*pVal), - flags | FCN_FLAG_BACKEND_ACCESS_WRITE, - pQueue->fcnBackendRwArg); + int status = pQueue->fcnBackendRw((void*)pAddr, pVal, sizeof(*pVal), + flags | FCN_FLAG_BACKEND_ACCESS_WRITE, + pQueue->fcnBackendRwArg); + if (status != 0) + { + return -1; + } } else { *(volatile NvU32*)pAddr = *pVal; } + return 0; } /** @@ -142,7 +152,7 @@ _backendWrite32(msgqMetadata *pQueue, volatile void *pAddr, NvU32 *pVal, unsigne sysSHARED_CODE static void msgqRiscvDefaultBarrier(void) { - asm volatile("fence iorw,iorw"); + __asm__ volatile("fence iorw,iorw"); } #endif @@ -188,6 +198,7 @@ msgqTxCreate { msgqMetadata *pQueue = (msgqMetadata*)handle; msgqTxHeader *pTx; + int status; if ((pQueue == NULL) || pQueue->txLinked) { @@ -282,10 +293,15 @@ msgqTxCreate // Indirect access to backend if (pQueue->fcnBackendRw != NULL) { - pQueue->fcnBackendRw(pTx, &pQueue->tx, sizeof *pTx, - FCN_FLAG_BACKEND_ACCESS_WRITE | FCN_FLAG_BACKEND_QUEUE_TX, - pQueue->fcnBackendRwArg); - } else + status = pQueue->fcnBackendRw(pTx, &pQueue->tx, sizeof *pTx, + FCN_FLAG_BACKEND_ACCESS_WRITE | FCN_FLAG_BACKEND_QUEUE_TX, + pQueue->fcnBackendRwArg); + if (status != 0) + { + return -1; + } + } + else { memcpy(pTx, &pQueue->tx, sizeof *pTx); } @@ -315,6 +331,7 @@ sysSHARED_CODE int msgqRxLink(msgqHandle handle, const void *pBackingStore, unsigned size, unsigned msgSize) { msgqMetadata *pQueue = (msgqMetadata*)handle; + int status; if ((pQueue == NULL) || pQueue->rxLinked) { @@ -347,10 +364,14 @@ msgqRxLink(msgqHandle handle, const void *pBackingStore, unsigned size, unsigned // copy their metadata if (pQueue->fcnBackendRw != NULL) { - pQueue->fcnBackendRw(&pQueue->rx, (const void *)pQueue->pTheirTxHdr, - sizeof pQueue->rx, - FCN_FLAG_BACKEND_ACCESS_READ | FCN_FLAG_BACKEND_QUEUE_RX, - pQueue->fcnBackendRwArg); + status = pQueue->fcnBackendRw(&pQueue->rx, (const void *)pQueue->pTheirTxHdr, + sizeof pQueue->rx, + FCN_FLAG_BACKEND_ACCESS_READ | FCN_FLAG_BACKEND_QUEUE_RX, + pQueue->fcnBackendRwArg); + if (status != 0) + { + return -11; + } } else { @@ -413,8 +434,13 @@ msgqRxLink(msgqHandle handle, const void *pBackingStore, unsigned size, unsigned } pQueue->rxReadPtr = 0; - _backendWrite32(pQueue, pQueue->pReadOutgoing, &pQueue->rxReadPtr, - pQueue->rxSwapped ? FCN_FLAG_BACKEND_QUEUE_TX : FCN_FLAG_BACKEND_QUEUE_RX); + status = _backendWrite32(pQueue, pQueue->pReadOutgoing, &pQueue->rxReadPtr, + pQueue->rxSwapped ? FCN_FLAG_BACKEND_QUEUE_TX : FCN_FLAG_BACKEND_QUEUE_RX); + if (status != 0) + { + return -12; + } + if (pQueue->fcnFlush != NULL) { pQueue->fcnFlush(pQueue->pReadOutgoing, sizeof(NvU32)); @@ -451,8 +477,12 @@ msgqTxGetFreeSpace(msgqHandle handle) return 0; } - _backendRead32(pQueue, pQueue->pReadIncoming, &pQueue->txReadPtr, - pQueue->rxSwapped ? 
FCN_FLAG_BACKEND_QUEUE_RX : FCN_FLAG_BACKEND_QUEUE_TX); + if (_backendRead32(pQueue, pQueue->pReadIncoming, &pQueue->txReadPtr, + pQueue->rxSwapped ? FCN_FLAG_BACKEND_QUEUE_RX : FCN_FLAG_BACKEND_QUEUE_TX) != 0) + { + return 0; + } + if (pQueue->txReadPtr >= pQueue->tx.msgCount) { return 0; @@ -505,6 +535,7 @@ sysSHARED_CODE int msgqTxSubmitBuffers(msgqHandle handle, unsigned n) { msgqMetadata *pQueue = (msgqMetadata*)handle; + int status; if ((pQueue == NULL) || !pQueue->txLinked) { @@ -531,8 +562,19 @@ msgqTxSubmitBuffers(msgqHandle handle, unsigned n) pQueue->tx.writePtr -= pQueue->tx.msgCount; } - _backendWrite32(pQueue, pQueue->pWriteOutgoing, - &pQueue->tx.writePtr, FCN_FLAG_BACKEND_QUEUE_TX); + status = _backendWrite32(pQueue, pQueue->pWriteOutgoing, + &pQueue->tx.writePtr, FCN_FLAG_BACKEND_QUEUE_TX); + if (status != 0) + { + // restore write pointer + if (pQueue->tx.writePtr < n) + { + pQueue->tx.writePtr += pQueue->tx.msgCount; + } + + pQueue->tx.writePtr -= n; + return -2; + } // Adjust cached value for number of free elements. pQueue->txFree -= n; @@ -606,7 +648,11 @@ msgqRxGetReadAvailable(msgqHandle handle) return 0; } - _backendRead32(pQueue, pQueue->pWriteIncoming, &pQueue->rx.writePtr, FCN_FLAG_BACKEND_QUEUE_RX); + if (_backendRead32(pQueue, pQueue->pWriteIncoming, &pQueue->rx.writePtr, FCN_FLAG_BACKEND_QUEUE_RX) != 0) + { + return 0; + } + if (pQueue->rx.writePtr >= pQueue->rx.msgCount) { return 0; @@ -659,6 +705,7 @@ sysSHARED_CODE int msgqRxMarkConsumed(msgqHandle handle, unsigned n) { msgqMetadata *pQueue = (msgqMetadata*)handle; + int status; if ((pQueue == NULL) || !pQueue->rxLinked) { @@ -679,8 +726,19 @@ msgqRxMarkConsumed(msgqHandle handle, unsigned n) } // Copy to backend - _backendWrite32(pQueue, pQueue->pReadOutgoing, &pQueue->rxReadPtr, - pQueue->rxSwapped ? FCN_FLAG_BACKEND_QUEUE_TX : FCN_FLAG_BACKEND_QUEUE_RX); + status = _backendWrite32(pQueue, pQueue->pReadOutgoing, &pQueue->rxReadPtr, + pQueue->rxSwapped ? FCN_FLAG_BACKEND_QUEUE_TX : FCN_FLAG_BACKEND_QUEUE_RX); + if (status != 0) + { + // restore read pointer + if (pQueue->rxReadPtr < n) + { + pQueue->rxReadPtr += pQueue->rx.msgCount; + } + + pQueue->rxReadPtr -= n; + return -2; + } // Adjust cached value for number of available elements. pQueue->rxAvail -= n; diff --git a/src/nvidia-modeset/src/nvkms-dpy.c b/src/nvidia-modeset/src/nvkms-dpy.c index dcbc4085a..bbdf4df50 100644 --- a/src/nvidia-modeset/src/nvkms-dpy.c +++ b/src/nvidia-modeset/src/nvkms-dpy.c @@ -909,6 +909,29 @@ void nvDpyProbeMaxPixelClock(NVDpyEvoPtr pDpyEvo) pDpyEvo->maxPixelClockKHz = ((4 * 12 * 1000 * 1000 * 16) / 18); } + } else { + const NVParsedEdidEvoRec *pParsedEdid = &pDpyEvo->parsedEdid; + + if (pParsedEdid->valid) { + const NVT_EDID_INFO *pEdidInfo = &pParsedEdid->info; + /* Default Maximum HDMI TMDS character rate is 165MHz. 
*/ + NvU32 maxTmdsCharRate = 33; + + if (pEdidInfo->ext861.valid.H20_HF_VSDB && + (pEdidInfo->hdmiForumInfo.max_TMDS_char_rate > 0)) { + maxTmdsCharRate = + NV_MIN(pEdidInfo->hdmiForumInfo.max_TMDS_char_rate, 120); + } else if (pEdidInfo->ext861.valid.H14B_VSDB && + (pEdidInfo->hdmiLlcInfo.max_tmds_clock > 0)) { + maxTmdsCharRate = + NV_MIN(pEdidInfo->hdmiLlcInfo.max_tmds_clock, 68); + } + + /* Max Pixel Rate = Max TMDS character Rate * 5MHz */ + pDpyEvo->maxPixelClockKHz = + pDpyEvo->maxSingleLinkPixelClockKHz = + maxTmdsCharRate * 5000; + } } } else { /* diff --git a/src/nvidia-modeset/src/nvkms-hdmi.c b/src/nvidia-modeset/src/nvkms-hdmi.c index 23c03c21a..e7f2ca230 100644 --- a/src/nvidia-modeset/src/nvkms-hdmi.c +++ b/src/nvidia-modeset/src/nvkms-hdmi.c @@ -2036,7 +2036,13 @@ NvBool nvHdmiDpySupportsFrl(const NVDpyEvoRec *pDpyEvo) { const NVDevEvoRec *pDevEvo = pDpyEvo->pDispEvo->pDevEvo; - nvAssert(nvDpyIsHdmiEvo(pDpyEvo)); + /* + * Can't use FRL if HDMI is not supported by the GPU and the monitor + * connection. + */ + if (!nvDpyIsHdmiEvo(pDpyEvo)) { + return FALSE; + } /* Can't use FRL if disabled by kernel module param. */ if (nvkms_disable_hdmi_frl()) { @@ -2102,9 +2108,6 @@ NvBool nvHdmiIsTmdsPossible(const NVDpyEvoRec *pDpyEvo, pDpyEvo->pDispEvo->pDevEvo->caps.hdmiTmds10BpcMaxPClkMHz * 1000UL; NvU32 adjustedMaxPixelClock = (pDpyEvo->maxSingleLinkPixelClockKHz * 4ULL) / 5ULL; - NvU32 adjustedMaxEDIDPixelClock = - pDpyEvo->parsedEdid.valid ? - (pDpyEvo->parsedEdid.limits.max_pclk_10khz * 10 * 4ULL) / 5ULL : 0; /* Pixel clock must satisfy hdmiTmds10BpcMaxPClkKHz, if applicable. */ if ((hdmiTmds10BpcMaxPClkKHz > 0) && @@ -2117,12 +2120,6 @@ NvBool nvHdmiIsTmdsPossible(const NVDpyEvoRec *pDpyEvo, return FALSE; } - /* Pixel clock must also satisfy adjustedMaxEDIDPixelClock. */ - if (adjustedMaxEDIDPixelClock != 0 && - pixelClock > adjustedMaxEDIDPixelClock) { - return FALSE; - } - return TRUE; } diff --git a/src/nvidia-modeset/src/nvkms-modepool.c b/src/nvidia-modeset/src/nvkms-modepool.c index 57c7b18d9..75ef6fb4e 100644 --- a/src/nvidia-modeset/src/nvkms-modepool.c +++ b/src/nvidia-modeset/src/nvkms-modepool.c @@ -1214,43 +1214,66 @@ static NvBool ValidateModeTimings( } } - /* reject modes with too high pclk */ + /* + * Reject modes with too high pclk, except when using HDMI FRL or + * DisplayPort. FRL and DP have features like DSC that cannot be trivially + * checked against a pixel clock rate limit. Instead: + * + * - DPlib will perform link assessment to determine whether both the + * monitor and GPU can drive a particular bandwidth. + * + * - hdmipacket will perform the equivalent for FRL. 
+ * + * TMDS will only be considered on a connection capable of HDMI FRL for the + * mode being validated if nvHdmiIsTmdsPossible returns TRUE in the + * following callpath: + * + * ValidateMode + * |_ ValidateModeTimings + * |_ nvConstructHwModeTimingsEvo + * |_ GetDfpProtocol + * |_ GetDfpHdmiProtocol + * |_ nvHdmiIsTmdsPossible + */ - if ((overrides & NVKMS_MODE_VALIDATION_NO_MAX_PCLK_CHECK) == 0) { + if (!(nvHdmiDpySupportsFrl(pDpyEvo) || + nvConnectorUsesDPLib(pDpyEvo->pConnectorEvo))) { + if ((overrides & NVKMS_MODE_VALIDATION_NO_MAX_PCLK_CHECK) == 0) { - NvU32 maxPixelClockKHz = pDpyEvo->maxPixelClockKHz; - NvU32 realPixelClock = HzToKHz(pModeTimings->pixelClockHz); - if (pModeTimings->yuv420Mode != NV_YUV420_MODE_NONE) { - realPixelClock /= 2; - } - - if (realPixelClock > maxPixelClockKHz) { - NvU32 hdmi3DPixelClock = realPixelClock; - - if (pModeTimings->hdmi3D) { - hdmi3DPixelClock /= 2; + NvU32 maxPixelClockKHz = pDpyEvo->maxPixelClockKHz; + NvU32 realPixelClock = HzToKHz(pModeTimings->pixelClockHz); + if (pModeTimings->yuv420Mode != NV_YUV420_MODE_NONE) { + realPixelClock /= 2; } - if (is3DVisionStereo && - pDpyEvo->stereo3DVision.requiresModetimingPatching && - (realPixelClock - maxPixelClockKHz < 5000)) { + if (realPixelClock > maxPixelClockKHz) { + NvU32 hdmi3DPixelClock = realPixelClock; - nvAssert(!pModeTimings->hdmi3D); + if (pModeTimings->hdmi3D) { + hdmi3DPixelClock /= 2; + } - nvEvoLogInfoString(pInfoString, - "PixelClock (" NV_FMT_DIV_1000_POINT_1 " MHz) is slightly higher than Display Device maximum (" NV_FMT_DIV_1000_POINT_1 " MHz), but is within tolerance for 3D Vision Stereo.", - NV_VA_DIV_1000_POINT_1(realPixelClock), - NV_VA_DIV_1000_POINT_1(maxPixelClockKHz)); + if (is3DVisionStereo && + pDpyEvo->stereo3DVision.requiresModetimingPatching && + (realPixelClock - maxPixelClockKHz < 5000)) { - } else { + nvAssert(!pModeTimings->hdmi3D); - LogModeValidationEnd(pDispEvo, pInfoString, - "PixelClock (" NV_FMT_DIV_1000_POINT_1 " MHz%s) too high for Display Device (Max: " NV_FMT_DIV_1000_POINT_1 " MHz)", - NV_VA_DIV_1000_POINT_1(hdmi3DPixelClock), - pModeTimings->hdmi3D ? - ", doubled for HDMI 3D" : "", - NV_VA_DIV_1000_POINT_1(maxPixelClockKHz)); - return FALSE; + nvEvoLogInfoString(pInfoString, + "PixelClock (" NV_FMT_DIV_1000_POINT_1 " MHz) is slightly higher than Display Device maximum (" NV_FMT_DIV_1000_POINT_1 " MHz), but is within tolerance for 3D Vision Stereo.", + NV_VA_DIV_1000_POINT_1(realPixelClock), + NV_VA_DIV_1000_POINT_1(maxPixelClockKHz)); + + } else { + + LogModeValidationEnd(pDispEvo, pInfoString, + "PixelClock (" NV_FMT_DIV_1000_POINT_1 " MHz%s) too high for Display Device (Max: " NV_FMT_DIV_1000_POINT_1 " MHz)", + NV_VA_DIV_1000_POINT_1(hdmi3DPixelClock), + pModeTimings->hdmi3D ? + ", doubled for HDMI 3D" : "", + NV_VA_DIV_1000_POINT_1(maxPixelClockKHz)); + return FALSE; + } } } } diff --git a/src/nvidia-modeset/src/nvkms-vrr.c b/src/nvidia-modeset/src/nvkms-vrr.c index 4c8e26a71..3e2f8976d 100644 --- a/src/nvidia-modeset/src/nvkms-vrr.c +++ b/src/nvidia-modeset/src/nvkms-vrr.c @@ -254,6 +254,17 @@ nvGetAllowedDpyVrrType(const NVDpyEvoRec *pDpyEvo, const NvBool allowGsync, const enum NvKmsAllowAdaptiveSync allowAdaptiveSync) { + + if (nvDpyIsHdmiEvo(pDpyEvo)) { + /* + * Do not allow HDMI VRR if refresh rate less than + * 50Hz or Vactive < 720. 
+ */ + if ((pTimings->vVisible < 720) || (pTimings->RRx1k < 50000)) { + return NVKMS_DPY_VRR_TYPE_NONE; + } + } + /* * Mark these mode timings as indicating a VRR mode, even if the timings * don't need to be adjusted; this is used to distinguish between VRR and diff --git a/src/nvidia/arch/nvalloc/unix/include/os-interface.h b/src/nvidia/arch/nvalloc/unix/include/os-interface.h index 6eb955964..84842e17a 100644 --- a/src/nvidia/arch/nvalloc/unix/include/os-interface.h +++ b/src/nvidia/arch/nvalloc/unix/include/os-interface.h @@ -62,6 +62,11 @@ struct os_work_queue; /* Each OS defines its own version of this opaque type */ typedef struct os_wait_queue os_wait_queue; +/* Flags needed by os_get_current_process_flags */ +#define OS_CURRENT_PROCESS_FLAG_NONE 0x0 +#define OS_CURRENT_PROCESS_FLAG_KERNEL_THREAD 0x1 +#define OS_CURRENT_PROCESS_FLAG_EXITING 0x2 + /* * --------------------------------------------------------------------------- * @@ -190,6 +195,7 @@ NV_STATUS NV_API_CALL os_open_readonly_file (const char *, void ** NV_STATUS NV_API_CALL os_open_and_read_file (const char *, NvU8 *, NvU64); NvBool NV_API_CALL os_is_nvswitch_present (void); NV_STATUS NV_API_CALL os_get_random_bytes (NvU8 *, NvU16); +NvU32 NV_API_CALL os_get_current_process_flags (void); NV_STATUS NV_API_CALL os_alloc_wait_queue (os_wait_queue **); void NV_API_CALL os_free_wait_queue (os_wait_queue *); void NV_API_CALL os_wait_uninterruptible (os_wait_queue *); diff --git a/src/nvidia/arch/nvalloc/unix/src/os.c b/src/nvidia/arch/nvalloc/unix/src/os.c index 2c1a89c22..2379c382d 100644 --- a/src/nvidia/arch/nvalloc/unix/src/os.c +++ b/src/nvidia/arch/nvalloc/unix/src/os.c @@ -5074,6 +5074,18 @@ osGetRandomBytes return os_get_random_bytes(pBytes, numBytes); } +/* + * @brief Get current process flags.
+ */ +NvU32 +osGetCurrentProcessFlags +( + void +) +{ + return os_get_current_process_flags(); +} + /* * @brief Allocate wait queue * diff --git a/src/nvidia/generated/g_kern_mem_sys_nvoc.c b/src/nvidia/generated/g_kern_mem_sys_nvoc.c index 6e241eae1..1f0400e95 100644 --- a/src/nvidia/generated/g_kern_mem_sys_nvoc.c +++ b/src/nvidia/generated/g_kern_mem_sys_nvoc.c @@ -661,8 +661,7 @@ static void __nvoc_init_funcTable_KernelMemorySystem_1(KernelMemorySystem *pThis } // kmemsysNeedInvalidateGpuCacheOnMap -- halified (2 hals) body - if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x71f0ffe0UL) ) || - ( ((chipHal_HalVarIdx >> 5) == 2UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x000003e6UL) )) /* ChipHal: TU102 | TU104 | TU106 | TU116 | TU117 | GA100 | GA102 | GA103 | GA104 | GA106 | GA107 | AD102 | AD103 | AD104 | AD106 | AD107 | GH100 | GB100 | GB102 | GB110 | GB112 | GB202 | GB203 | GB205 | GB206 | GB207 */ + if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x000003e0UL) )) /* ChipHal: TU102 | TU104 | TU106 | TU116 | TU117 */ { pThis->__kmemsysNeedInvalidateGpuCacheOnMap__ = &kmemsysNeedInvalidateGpuCacheOnMap_GV100; } @@ -673,9 +672,9 @@ static void __nvoc_init_funcTable_KernelMemorySystem_1(KernelMemorySystem *pThis } // kmemsysNeedInvalidateGpuCacheOnUnmap -- halified (2 hals) body - if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x80000000UL) ) || - ( ((chipHal_HalVarIdx >> 5) == 2UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x00000c00UL) ) || - ( ((chipHal_HalVarIdx >> 5) == 3UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x00005000UL) )) /* ChipHal: GB10B | GB20B | GB20C | T234D | T264D */ + if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0xf1f0fc00UL) ) || + ( ((chipHal_HalVarIdx >> 5) == 2UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x00000fe6UL) ) || + ( ((chipHal_HalVarIdx >> 5) == 3UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x00005000UL) )) /* ChipHal: GA100 | GA102 | GA103 | GA104 | GA106 | GA107 | AD102 | AD103 | AD104 | AD106 | AD107 | GH100 | GB100 | GB102 | GB10B | GB110 | GB112 | GB202 | GB203 | GB205 | GB206 | GB207 | GB20B | GB20C | T234D | T264D */ { pThis->__kmemsysNeedInvalidateGpuCacheOnUnmap__ = &kmemsysNeedInvalidateGpuCacheOnUnmap_T194; } diff --git a/src/nvidia/generated/g_nv_name_released.h b/src/nvidia/generated/g_nv_name_released.h index bc9d4a5ed..a0fcfd284 100644 --- a/src/nvidia/generated/g_nv_name_released.h +++ b/src/nvidia/generated/g_nv_name_released.h @@ -5214,6 +5214,7 @@ static const CHIPS_RELEASED sChipsReleased[] = { { 0x2236, 0x1482, 0x10de, "NVIDIA A10" }, { 0x2237, 0x152f, 0x10de, "NVIDIA A10G" }, { 0x2238, 0x1677, 0x10de, "NVIDIA A10M" }, + { 0x230E, 0x20df, 0x10de, "NVIDIA H20 NVL16" }, { 0x2321, 0x1839, 0x10de, "NVIDIA H100 NVL" }, { 0x2322, 0x17a4, 0x10de, "NVIDIA H800 PCIe" }, { 0x2324, 0x17a6, 0x10de, "NVIDIA H800" }, @@ -5414,9 +5415,10 @@ static const CHIPS_RELEASED sChipsReleased[] = { { 0x2941, 0x20d5, 0x10de, "NVIDIA GB200" }, { 0x2941, 0x21c9, 0x10de, "NVIDIA GB200" }, { 0x2941, 0x21ca, 0x10de, "NVIDIA GB200" }, + { 0x29BB, 0x207c, 0x10de, "NVIDIA DRIVE P2021" }, { 0x2B85, 0x0000, 0x0000, "NVIDIA GeForce RTX 5090" }, { 0x2B87, 0x0000, 0x0000, "NVIDIA GeForce RTX 5090 D" }, - { 0x2B8C, 0x530c, 0x17aa, "NVIDIA GeForce RTX 5090 D v2" }, + { 0x2B8C, 0x0000, 0x0000, "NVIDIA GeForce RTX 5090 D v2" }, { 0x2BB1, 0x204b, 0x1028, "NVIDIA RTX PRO 6000 Blackwell Workstation Edition" }, { 0x2BB1, 0x204b, 
0x103c, "NVIDIA RTX PRO 6000 Blackwell Workstation Edition" }, { 0x2BB1, 0x204b, 0x10de, "NVIDIA RTX PRO 6000 Blackwell Workstation Edition" }, @@ -5429,6 +5431,8 @@ static const CHIPS_RELEASED sChipsReleased[] = { { 0x2BB4, 0x204c, 0x103c, "NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition" }, { 0x2BB4, 0x204c, 0x10de, "NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition" }, { 0x2BB4, 0x204c, 0x17aa, "NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition" }, + { 0x2BB5, 0x204e, 0x10de, "NVIDIA RTX PRO 6000 Blackwell Server Edition" }, + { 0x2BB9, 0x2091, 0x10de, "NVIDIA RTX 6000D" }, { 0x2C02, 0x0000, 0x0000, "NVIDIA GeForce RTX 5080" }, { 0x2C05, 0x0000, 0x0000, "NVIDIA GeForce RTX 5070 Ti" }, { 0x2C18, 0x0000, 0x0000, "NVIDIA GeForce RTX 5090 Laptop GPU" }, @@ -5439,6 +5443,7 @@ static const CHIPS_RELEASED sChipsReleased[] = { { 0x2C31, 0x2051, 0x17aa, "NVIDIA RTX PRO 4500 Blackwell" }, { 0x2C33, 0x2053, 0x1028, "NVIDIA RTX PRO 4000 Blackwell SFF Edition" }, { 0x2C33, 0x2053, 0x103c, "NVIDIA RTX PRO 4000 Blackwell SFF Edition" }, + { 0x2C33, 0x2053, 0x10de, "NVIDIA RTX PRO 4000 Blackwell SFF Edition" }, { 0x2C33, 0x2053, 0x17aa, "NVIDIA RTX PRO 4000 Blackwell SFF Edition" }, { 0x2C34, 0x2052, 0x1028, "NVIDIA RTX PRO 4000 Blackwell" }, { 0x2C34, 0x2052, 0x103c, "NVIDIA RTX PRO 4000 Blackwell" }, @@ -5448,25 +5453,32 @@ static const CHIPS_RELEASED sChipsReleased[] = { { 0x2C39, 0x0000, 0x0000, "NVIDIA RTX PRO 4000 Blackwell Generation Laptop GPU" }, { 0x2C58, 0x0000, 0x0000, "NVIDIA GeForce RTX 5090 Laptop GPU" }, { 0x2C59, 0x0000, 0x0000, "NVIDIA GeForce RTX 5080 Laptop GPU" }, + { 0x2C77, 0x0000, 0x0000, "NVIDIA RTX PRO 5000 Blackwell Embedded GPU" }, + { 0x2C79, 0x0000, 0x0000, "NVIDIA RTX PRO 4000 Blackwell Embedded GPU" }, { 0x2D04, 0x0000, 0x0000, "NVIDIA GeForce RTX 5060 Ti" }, { 0x2D05, 0x0000, 0x0000, "NVIDIA GeForce RTX 5060" }, { 0x2D18, 0x0000, 0x0000, "NVIDIA GeForce RTX 5070 Laptop GPU" }, { 0x2D19, 0x0000, 0x0000, "NVIDIA GeForce RTX 5060 Laptop GPU" }, { 0x2D30, 0x2054, 0x1028, "NVIDIA RTX PRO 2000 Blackwell" }, { 0x2D30, 0x2054, 0x103c, "NVIDIA RTX PRO 2000 Blackwell" }, + { 0x2D30, 0x2054, 0x10de, "NVIDIA RTX PRO 2000 Blackwell" }, { 0x2D30, 0x2054, 0x17aa, "NVIDIA RTX PRO 2000 Blackwell" }, { 0x2D39, 0x0000, 0x0000, "NVIDIA RTX PRO 2000 Blackwell Generation Laptop GPU" }, { 0x2D58, 0x0000, 0x0000, "NVIDIA GeForce RTX 5070 Laptop GPU" }, { 0x2D59, 0x0000, 0x0000, "NVIDIA GeForce RTX 5060 Laptop GPU" }, - { 0x2D83, 0xc791, 0x17aa, "NVIDIA GeForce RTX 5050" }, + { 0x2D79, 0x0000, 0x0000, "NVIDIA RTX PRO 2000 Blackwell Embedded GPU" }, + { 0x2D83, 0x0000, 0x0000, "NVIDIA GeForce RTX 5050" }, { 0x2D98, 0x0000, 0x0000, "NVIDIA GeForce RTX 5050 Laptop GPU" }, { 0x2DB8, 0x0000, 0x0000, "NVIDIA RTX PRO 1000 Blackwell Generation Laptop GPU" }, { 0x2DB9, 0x0000, 0x0000, "NVIDIA RTX PRO 500 Blackwell Generation Laptop GPU" }, { 0x2DD8, 0x0000, 0x0000, "NVIDIA GeForce RTX 5050 Laptop GPU" }, + { 0x2DF9, 0x0000, 0x0000, "NVIDIA RTX PRO 500 Blackwell Embedded GPU" }, { 0x2F04, 0x0000, 0x0000, "NVIDIA GeForce RTX 5070" }, { 0x2F18, 0x0000, 0x0000, "NVIDIA GeForce RTX 5070 Ti Laptop GPU" }, { 0x2F38, 0x0000, 0x0000, "NVIDIA RTX PRO 3000 Blackwell Generation Laptop GPU" }, { 0x2F58, 0x0000, 0x0000, "NVIDIA GeForce RTX 5070 Ti Laptop GPU" }, + { 0x3182, 0x20e6, 0x10de, "NVIDIA B300 SXM6 AC" }, + { 0x31C2, 0x21f1, 0x10de, "NVIDIA GB300" }, { 0x13BD, 0x11cc, 0x10DE, "GRID M10-0B" }, { 0x13BD, 0x11cd, 0x10DE, "GRID M10-1B" }, { 0x13BD, 0x11ce, 0x10DE, "GRID M10-0Q" }, @@ -6067,25 
+6079,6 @@ static const CHIPS_RELEASED sChipsReleased[] = { { 0x2238, 0x16b9, 0x10DE, "NVIDIA A10M-20C" }, { 0x2238, 0x16e6, 0x10DE, "NVIDIA A10M-1" }, { 0x2238, 0x2208, 0x10DE, "NVIDIA A10M-3B" }, - { 0x230E, 0x20f5, 0x10DE, "NVIDIA H20L-1-15CME" }, - { 0x230E, 0x20f6, 0x10DE, "NVIDIA H20L-1-15C" }, - { 0x230E, 0x20f7, 0x10DE, "NVIDIA H20L-1-30C" }, - { 0x230E, 0x20f8, 0x10DE, "NVIDIA H20L-2-30C" }, - { 0x230E, 0x20f9, 0x10DE, "NVIDIA H20L-3-60C" }, - { 0x230E, 0x20fa, 0x10DE, "NVIDIA H20L-4-60C" }, - { 0x230E, 0x20fb, 0x10DE, "NVIDIA H20L-7-120C" }, - { 0x230E, 0x20fc, 0x10DE, "NVIDIA H20L-4C" }, - { 0x230E, 0x20fd, 0x10DE, "NVIDIA H20L-5C" }, - { 0x230E, 0x20fe, 0x10DE, "NVIDIA H20L-6C" }, - { 0x230E, 0x20ff, 0x10DE, "NVIDIA H20L-8C" }, - { 0x230E, 0x2100, 0x10DE, "NVIDIA H20L-10C" }, - { 0x230E, 0x2101, 0x10DE, "NVIDIA H20L-12C" }, - { 0x230E, 0x2102, 0x10DE, "NVIDIA H20L-15C" }, - { 0x230E, 0x2103, 0x10DE, "NVIDIA H20L-20C" }, - { 0x230E, 0x2104, 0x10DE, "NVIDIA H20L-30C" }, - { 0x230E, 0x2105, 0x10DE, "NVIDIA H20L-40C" }, - { 0x230E, 0x2106, 0x10DE, "NVIDIA H20L-60C" }, - { 0x230E, 0x2107, 0x10DE, "NVIDIA H20L-120C" }, { 0x2321, 0x1853, 0x10DE, "NVIDIA H100L-1-12CME" }, { 0x2321, 0x1854, 0x10DE, "NVIDIA H100L-1-12C" }, { 0x2321, 0x1855, 0x10DE, "NVIDIA H100L-1-24C" }, diff --git a/src/nvidia/generated/g_os_nvoc.h b/src/nvidia/generated/g_os_nvoc.h index 08c3d1a3a..2c08adfd2 100644 --- a/src/nvidia/generated/g_os_nvoc.h +++ b/src/nvidia/generated/g_os_nvoc.h @@ -215,6 +215,11 @@ typedef struct RM_PAGEABLE_SECTION { #define OS_ALLOC_PAGES_NODE_NONE 0x0 #define OS_ALLOC_PAGES_NODE_SKIP_RECLAIM 0x1 +// Flags needed by osGetCurrentProccessFlags +#define OS_CURRENT_PROCESS_FLAG_NONE 0x0 +#define OS_CURRENT_PROCESS_FLAG_KERNEL_THREAD 0x1 +#define OS_CURRENT_PROCESS_FLAG_EXITING 0x2 + // // Structures for osPackageRegistry and osUnpackageRegistry // @@ -737,6 +742,8 @@ NvS32 osImexChannelCount(void); NV_STATUS osGetRandomBytes(NvU8 *pBytes, NvU16 numBytes); +NvU32 osGetCurrentProcessFlags(void); + NV_STATUS osAllocWaitQueue(OS_WAIT_QUEUE **ppWq); void osFreeWaitQueue(OS_WAIT_QUEUE *pWq); void osWaitUninterruptible(OS_WAIT_QUEUE *pWq); diff --git a/src/nvidia/generated/g_sysmem_scrub_nvoc.h b/src/nvidia/generated/g_sysmem_scrub_nvoc.h index f5437a5af..c91ad877e 100644 --- a/src/nvidia/generated/g_sysmem_scrub_nvoc.h +++ b/src/nvidia/generated/g_sysmem_scrub_nvoc.h @@ -78,14 +78,21 @@ typedef struct { MEMORY_DESCRIPTOR *pMemDesc; NvU64 semaphoreValue; + NODE listNode; } SysScrubEntry; -MAKE_LIST(SysScrubList, SysScrubEntry); +MAKE_INTRUSIVE_LIST(SysScrubList, SysScrubEntry, listNode); typedef struct { + // semaphore event handle doesn't take GPU lock + PORT_SPINLOCK *pSpinlock; + + // spinlock needs to be taken to use pSysmemScrubber struct SysmemScrubber *pSysmemScrubber; + NvU32 refCount; + NvU32 bWorkerQueued; } SysmemScrubberWorkerParams; @@ -124,7 +131,6 @@ struct SysmemScrubber { struct CeUtils *pCeUtils; SysScrubList asyncScrubList; NvBool bAsync; - NvBool bCallbackQueued; SysmemScrubberWorkerParams *pWorkerParams; }; diff --git a/src/nvidia/inc/kernel/core/thread_state.h b/src/nvidia/inc/kernel/core/thread_state.h index 1f721c8ed..01ac1e485 100644 --- a/src/nvidia/inc/kernel/core/thread_state.h +++ b/src/nvidia/inc/kernel/core/thread_state.h @@ -187,6 +187,8 @@ typedef struct THREAD_STATE_DB #define THREAD_STATE_FLAGS_TIMEOUT_INITED NVBIT(5) #define THREAD_STATE_FLAGS_DEVICE_INIT NVBIT(7) #define THREAD_STATE_FLAGS_STATE_FREE_CB_ENABLED NVBIT(8) +#define 
THREAD_STATE_FLAGS_IS_KERNEL_THREAD NVBIT(9) +#define THREAD_STATE_FLAGS_IS_EXITING NVBIT(10) // These Threads run exclusively between a conditional acquire #define THREAD_STATE_FLAGS_EXCLUSIVE_RUNNING (THREAD_STATE_FLAGS_IS_ISR | \ diff --git a/src/nvidia/src/kernel/core/thread_state.c b/src/nvidia/src/kernel/core/thread_state.c index c53388eaa..10f73e3e4 100644 --- a/src/nvidia/src/kernel/core/thread_state.c +++ b/src/nvidia/src/kernel/core/thread_state.c @@ -590,6 +590,8 @@ static NV_STATUS _threadStateInitCommon(THREAD_STATE_NODE *pThreadNode, NvU32 fl */ void threadStateInit(THREAD_STATE_NODE *pThreadNode, NvU32 flags) { + NvU32 osFlags; + // Isrs should be using threadStateIsrInit(). NV_ASSERT_OR_RETURN_VOID((flags & (THREAD_STATE_FLAGS_IS_ISR_LOCKLESS | THREAD_STATE_FLAGS_IS_ISR | @@ -599,6 +601,14 @@ void threadStateInit(THREAD_STATE_NODE *pThreadNode, NvU32 flags) if (!(threadStateDatabase.setupFlags & THREAD_STATE_SETUP_FLAGS_ENABLED)) return; + osFlags = osGetCurrentProcessFlags(); + + if (osFlags & OS_CURRENT_PROCESS_FLAG_KERNEL_THREAD) + flags |= THREAD_STATE_FLAGS_IS_KERNEL_THREAD; + + if (osFlags & OS_CURRENT_PROCESS_FLAG_EXITING) + flags |= THREAD_STATE_FLAGS_IS_EXITING; + // Use common initialization logic (stack-allocated) // Note: Legacy void API ignores errors for backward compatibility _threadStateInitCommon(pThreadNode, flags, NV_FALSE); diff --git a/src/nvidia/src/kernel/gpu/external_device/arch/kepler/kern_gsync_p2060.c b/src/nvidia/src/kernel/gpu/external_device/arch/kepler/kern_gsync_p2060.c index 375947afd..886d3e898 100644 --- a/src/nvidia/src/kernel/gpu/external_device/arch/kepler/kern_gsync_p2060.c +++ b/src/nvidia/src/kernel/gpu/external_device/arch/kepler/kern_gsync_p2060.c @@ -1214,6 +1214,8 @@ gsyncReadUniversalFrameCount_P2060 OBJTMR *pTmpTmr = NULL; OBJTMR *pTmr = GPU_GET_TIMER(pGpu); + NV_CHECK_OR_RETURN(LEVEL_INFO, gsyncIsFrameLocked_P2060(pThis), NV_ERR_INVALID_STATE); + if (!(pThis->FrameCountData.iface == NV_P2060_MAX_IFACES_PER_GSYNC)) { // @@ -1258,7 +1260,8 @@ gsyncReadUniversalFrameCount_P2060 // P2060 refreshrate is in 0.00001 Hz, so divide by 10000 to get Hz. // divide 1000000 by refreshRate to get the frame time in us. // - pThis->FrameCountData.frameTime = 1000000 / (pThis->RefreshRate/10000); //in us + NV_CHECK_OR_RETURN(LEVEL_INFO, pThis->RefreshRate >= 10, NV_ERR_INVALID_STATE); + pThis->FrameCountData.frameTime = 1000*1000*1000 / (pThis->RefreshRate/10); //in us // // Enable FrameCountTimerService to verify FrameCountData.initialDifference. diff --git a/src/nvidia/src/kernel/gpu/fsp/kern_fsp.c b/src/nvidia/src/kernel/gpu/fsp/kern_fsp.c index bc83cb44f..e87348b5f 100644 --- a/src/nvidia/src/kernel/gpu/fsp/kern_fsp.c +++ b/src/nvidia/src/kernel/gpu/fsp/kern_fsp.c @@ -242,6 +242,7 @@ kfspStateUnload_IMPL NvU32 flags ) { + kfspReleaseProxyImage(pGpu, pKernelFsp); return NV_OK; } diff --git a/src/nvidia/src/kernel/gpu/gpu.c b/src/nvidia/src/kernel/gpu/gpu.c index 44f8ddd95..44db3e9f3 100644 --- a/src/nvidia/src/kernel/gpu/gpu.c +++ b/src/nvidia/src/kernel/gpu/gpu.c @@ -2287,7 +2287,7 @@ gpuStateInit_IMPL } // Set a property indicating that VF BAR0 MMU TLB Invalidation register emulation is required or not. 
- if (hypervisorIsVgxHyper()) + if (hypervisorIsVgxHyper() || (RMCFG_FEATURE_PLATFORM_GSP && IS_VGPU_GSP_PLUGIN_OFFLOAD_ENABLED(pGpu))) { if ( IsdADA(pGpu) || diff --git a/src/nvidia/src/kernel/gpu/gr/kernel_graphics.c b/src/nvidia/src/kernel/gpu/gr/kernel_graphics.c index f78f0f9ff..63c9b43c4 100644 --- a/src/nvidia/src/kernel/gpu/gr/kernel_graphics.c +++ b/src/nvidia/src/kernel/gpu/gr/kernel_graphics.c @@ -788,9 +788,13 @@ cleanup: // to be allocated. We delay them until now to save memory when runs // are done without using graphics contexts! // + // For MIG ESX hypervisor, vGPU stack do not need any GR channel on host so + // skip global ctx buffer alloc to save FB memory + // if (!pKernelGraphics->globalCtxBuffersInfo.pGlobalCtxBuffers[gfid].bAllocated && (!gpuIsClientRmAllocatedCtxBufferEnabled(pGpu) || - (gpuIsSriovEnabled(pGpu) && IS_GFID_PF(gfid)))) + (gpuIsSriovEnabled(pGpu) && IS_GFID_PF(gfid) && + !(IS_MIG_IN_USE(pGpu) && hypervisorIsType(OS_HYPERVISOR_VMWARE))))) { NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, kgraphicsAllocGrGlobalCtxBuffers_HAL(pGpu, pKernelGraphics, gfid, NULL)); @@ -865,6 +869,17 @@ kgraphicsLoadStaticInfo_VF portMemCopy(pPrivate->staticInfo.pSmIssueRateModifier, sizeof(*pPrivate->staticInfo.pSmIssueRateModifier), &pVSI->smIssueRateModifier.smIssueRateModifier[grIdx], sizeof(pVSI->smIssueRateModifier.smIssueRateModifier[grIdx])); + pPrivate->staticInfo.pSmIssueThrottleCtrl = + portMemAllocNonPaged(sizeof(*pPrivate->staticInfo.pSmIssueThrottleCtrl)); + if (pPrivate->staticInfo.pSmIssueThrottleCtrl == NULL) + { + status = NV_ERR_NO_MEMORY; + goto cleanup; + } + + portMemCopy(pPrivate->staticInfo.pSmIssueThrottleCtrl, sizeof(*pPrivate->staticInfo.pSmIssueThrottleCtrl), + &pVSI->smIssueThrottleCtrl.smIssueThrottleCtrl[grIdx], sizeof(pVSI->smIssueThrottleCtrl.smIssueThrottleCtrl[grIdx])); + pPrivate->staticInfo.pPpcMasks = portMemAllocNonPaged(sizeof(*pPrivate->staticInfo.pPpcMasks)); if (pPrivate->staticInfo.pPpcMasks == NULL) { @@ -958,6 +973,28 @@ kgraphicsLoadStaticInfo_VF portMemCopy(pPrivate->staticInfo.pSmIssueRateModifier, sizeof(*pPrivate->staticInfo.pSmIssueRateModifier), &pVSI->smIssueRateModifier.smIssueRateModifier[grIdx], sizeof(pVSI->smIssueRateModifier.smIssueRateModifier[grIdx])); + pPrivate->staticInfo.pSmIssueRateModifierV2 = + portMemAllocNonPaged(sizeof(*pPrivate->staticInfo.pSmIssueRateModifierV2)); + if (pPrivate->staticInfo.pSmIssueRateModifierV2 == NULL) + { + status = NV_ERR_NO_MEMORY; + goto cleanup; + } + + portMemCopy(pPrivate->staticInfo.pSmIssueRateModifierV2, sizeof(*pPrivate->staticInfo.pSmIssueRateModifierV2), + &pVSI->smIssueRateModifierV2.smIssueRateModifierV2[grIdx], sizeof(pVSI->smIssueRateModifierV2.smIssueRateModifierV2[grIdx])); + + pPrivate->staticInfo.pSmIssueThrottleCtrl = + portMemAllocNonPaged(sizeof(*pPrivate->staticInfo.pSmIssueThrottleCtrl)); + if (pPrivate->staticInfo.pSmIssueThrottleCtrl == NULL) + { + status = NV_ERR_NO_MEMORY; + goto cleanup; + } + + portMemCopy(pPrivate->staticInfo.pSmIssueThrottleCtrl, sizeof(*pPrivate->staticInfo.pSmIssueThrottleCtrl), + &pVSI->smIssueThrottleCtrl.smIssueThrottleCtrl[grIdx], sizeof(pVSI->smIssueThrottleCtrl.smIssueThrottleCtrl[grIdx])); + pPrivate->staticInfo.pPpcMasks = portMemAllocNonPaged(sizeof(*pPrivate->staticInfo.pPpcMasks)); if (pPrivate->staticInfo.pPpcMasks == NULL) { @@ -1072,6 +1109,12 @@ cleanup : portMemFree(pPrivate->staticInfo.pSmIssueRateModifier); pPrivate->staticInfo.pSmIssueRateModifier = NULL; + portMemFree(pPrivate->staticInfo.pSmIssueRateModifierV2); + 
pPrivate->staticInfo.pSmIssueRateModifierV2 = NULL; + + portMemFree(pPrivate->staticInfo.pSmIssueThrottleCtrl); + pPrivate->staticInfo.pSmIssueThrottleCtrl = NULL; + portMemFree(pPrivate->staticInfo.pFecsTraceDefines); pPrivate->staticInfo.pFecsTraceDefines = NULL; } @@ -3355,7 +3398,6 @@ subdeviceCtrlCmdKGrGetSmIssueThrottleCtrl_IMPL NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, kmigmgrGetInstanceRefFromDevice(pGpu, pKernelMIGManager, pDevice, &ref)); - NV_ASSERT_OR_RETURN(ref.pMIGComputeInstance != NULL && ref.pKernelMIGGpuInstance != NULL, NV_ERR_INVALID_STATE); NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, kmigmgrGetLocalToGlobalEngineType(pGpu, pKernelMIGManager, ref, RM_ENGINE_TYPE_GR(0), &globalGrEngine)); diff --git a/src/nvidia/src/kernel/gpu/gr/kernel_graphics_context.c b/src/nvidia/src/kernel/gpu/gr/kernel_graphics_context.c index 69e543726..f91c538f4 100644 --- a/src/nvidia/src/kernel/gpu/gr/kernel_graphics_context.c +++ b/src/nvidia/src/kernel/gpu/gr/kernel_graphics_context.c @@ -2619,7 +2619,8 @@ kgrctxShouldManageCtxBuffers_PHYSICAL NvU32 gfid ) { - return !gpuIsClientRmAllocatedCtxBufferEnabled(pGpu) || (gpuIsSriovEnabled(pGpu) && IS_GFID_PF(gfid)); + return !gpuIsClientRmAllocatedCtxBufferEnabled(pGpu) || (gpuIsSriovEnabled(pGpu) && IS_GFID_PF(gfid) && + !(IS_MIG_IN_USE(pGpu) && hypervisorIsType(OS_HYPERVISOR_VMWARE))); } /** diff --git a/src/nvidia/src/kernel/gpu/gr/kernel_graphics_object.c b/src/nvidia/src/kernel/gpu/gr/kernel_graphics_object.c index 90b42800c..c2ed97ca2 100644 --- a/src/nvidia/src/kernel/gpu/gr/kernel_graphics_object.c +++ b/src/nvidia/src/kernel/gpu/gr/kernel_graphics_object.c @@ -28,6 +28,7 @@ #include "kernel/core/locks.h" #include "kernel/gpu/subdevice/subdevice.h" #include "vgpu/rpc.h" +#include "virtualization/hypervisor/hypervisor.h" #include "kernel/mem_mgr/gpu_vaspace.h" #include "kernel/gpu/mem_mgr/mem_mgr.h" #include "kernel/gpu/fifo/kernel_channel_group.h" @@ -520,7 +521,8 @@ kgrobjShouldCleanup_PHYSICAL ChannelDescendant *pChannelDescendant = staticCast(pKernelGraphicsObject, ChannelDescendant); NvU32 gfid = kchannelGetGfid(pChannelDescendant->pKernelChannel); - return !gpuIsClientRmAllocatedCtxBufferEnabled(pGpu) || (gpuIsSriovEnabled(pGpu) && IS_GFID_PF(gfid)); + return !gpuIsClientRmAllocatedCtxBufferEnabled(pGpu) || (gpuIsSriovEnabled(pGpu) && IS_GFID_PF(gfid) && + !(IS_MIG_IN_USE(pGpu) && hypervisorIsType(OS_HYPERVISOR_VMWARE))); } /*! diff --git a/src/nvidia/src/kernel/gpu/gsp/message_queue_cpu.c b/src/nvidia/src/kernel/gpu/gsp/message_queue_cpu.c index 80c7212c5..382360943 100644 --- a/src/nvidia/src/kernel/gpu/gsp/message_queue_cpu.c +++ b/src/nvidia/src/kernel/gpu/gsp/message_queue_cpu.c @@ -239,6 +239,16 @@ GspMsgQueuesInit memdescSetPageSize(pMQCollection->pSharedMemDesc, AT_GPU, RM_PAGE_SIZE_HUGE); memdescTagAlloc(nvStatus, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_58, pMQCollection->pSharedMemDesc); + + if (nvStatus == NV_ERR_NO_MEMORY) + { + // TODO: Bug 5299603 + NV_PRINTF(LEVEL_ERROR, "Allocation failed with big page size, retrying with default page size\n"); + memdescSetPageSize(pMQCollection->pSharedMemDesc, AT_GPU, RM_PAGE_SIZE); + memdescTagAlloc(nvStatus, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_58, + pMQCollection->pSharedMemDesc); + } + NV_ASSERT_OK_OR_GOTO(nvStatus, nvStatus, error_ret); // Create kernel mapping for command queue. 
@@ -760,7 +770,6 @@ NV_STATUS GspMsgQueueReceiveStatus(MESSAGE_QUEUE_INFO *pMQI, OBJGPU *pGpu) } exit: - pMQI->rxSeqNum++; nRet = msgqRxMarkConsumed(pMQI->hQueue, nElements); if (nRet < 0) @@ -768,6 +777,10 @@ exit: NV_PRINTF(LEVEL_ERROR, "msgqRxMarkConsumed failed: %d\n", nRet); nvStatus = NV_ERR_GENERIC; } + else + { + pMQI->rxSeqNum++; + } return nvStatus; } diff --git a/src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c b/src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c index a2a01ff0e..4dd29461b 100644 --- a/src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c +++ b/src/nvidia/src/kernel/gpu/mem_mgr/mem_desc.c @@ -2661,6 +2661,10 @@ memdescCreateSubMem pMemDescNew->_flags |= MEMDESC_FLAGS_ENCRYPTED; else pMemDescNew->_flags &= ~MEMDESC_FLAGS_ENCRYPTED; + if (pMemDesc->_flags & MEMDESC_FLAGS_ALLOC_AS_LOCALIZED) + pMemDescNew->_flags |= MEMDESC_FLAGS_ALLOC_AS_LOCALIZED; + else + pMemDescNew->_flags &= ~MEMDESC_FLAGS_ALLOC_AS_LOCALIZED; pMemDescNew->_pageSize = pMemDesc->_pageSize; pMemDescNew->pageArrayGranularity = pageArrayGranularity; pMemDescNew->_gpuCacheAttrib = pMemDesc->_gpuCacheAttrib; diff --git a/src/nvidia/src/kernel/gpu/mem_mgr/sysmem_scrub.c b/src/nvidia/src/kernel/gpu/mem_mgr/sysmem_scrub.c index bb9b016a4..ce5519df5 100644 --- a/src/nvidia/src/kernel/gpu/mem_mgr/sysmem_scrub.c +++ b/src/nvidia/src/kernel/gpu/mem_mgr/sysmem_scrub.c @@ -29,6 +29,17 @@ #include "gpu/mem_mgr/ce_utils.h" #include "nvrm_registry.h" +void +_sysmemscrubFreeWorkerParams +( + SysmemScrubberWorkerParams *pWorkerParams +) +{ + if (pWorkerParams->pSpinlock != NULL) + portSyncSpinlockDestroy(pWorkerParams->pSpinlock); + portMemFree(pWorkerParams); +} + NV_STATUS sysmemscrubConstruct_IMPL ( @@ -47,7 +58,7 @@ sysmemscrubConstruct_IMPL pSysmemScrubber->pGpu = pGpu; // Disable by default until locking issues are addressed - pSysmemScrubber->bAsync = NV_FALSE; + pSysmemScrubber->bAsync = NV_TRUE; if (osReadRegistryDword(pGpu, NV_REG_STR_RM_DISABLE_ASYNC_SYSMEM_SCRUB, &data32) == NV_OK) { @@ -56,11 +67,13 @@ sysmemscrubConstruct_IMPL pWorkerParams = portMemAllocNonPaged(sizeof (*pWorkerParams)); NV_ASSERT_OR_RETURN(pWorkerParams != NULL, NV_ERR_NO_MEMORY); + pWorkerParams->pSpinlock = portSyncSpinlockCreate(portMemAllocatorGetGlobalNonPaged()); + NV_ASSERT_TRUE_OR_GOTO(status, pWorkerParams->pSpinlock != NULL, NV_ERR_NO_MEMORY, failed); pWorkerParams->pSysmemScrubber = pSysmemScrubber; pWorkerParams->refCount = 1; pSysmemScrubber->pWorkerParams = pWorkerParams; - listInit(&pSysmemScrubber->asyncScrubList, portMemAllocatorGetGlobalNonPaged()); + listInitIntrusive(&pSysmemScrubber->asyncScrubList); ceUtilsAllocParams.flags |= DRF_DEF(0050_CEUTILS, _FLAGS, _ENABLE_COMPLETION_CB, _TRUE); NV_ASSERT_OK_OR_GOTO(status, @@ -70,7 +83,7 @@ sysmemscrubConstruct_IMPL failed: if (status != NV_OK) { - portMemFree(pWorkerParams); + _sysmemscrubFreeWorkerParams(pWorkerParams); } return status; @@ -80,24 +93,60 @@ static void _sysmemscrubProcessCompletedEntries ( SysmemScrubber *pSysmemScrubber, - NvU64 lastCompleted + SysmemScrubberWorkerParams *pWorkerParams ) { SysScrubEntry *pEntry; + SysScrubList freeList; - while ((pEntry = listHead(&pSysmemScrubber->asyncScrubList)) != NULL) + // + // Destructor sets pWorkerParams->pSysmemScrubber to NULL + // After that the workers need to return early + // Destructor is responsible for draining all the work iself + // This is done as destructor can't flush all the pending workers + // + + listInitIntrusive(&freeList); + + portSyncSpinlockAcquire(pWorkerParams->pSpinlock); + + if (pSysmemScrubber == 
NULL) { - if (pEntry->semaphoreValue > lastCompleted) - break; + // Destructor passes pSysmemScrubber directly, as pWorkerParams->pSysmemScrubber is NULL by then (see below) + pSysmemScrubber = pWorkerParams->pSysmemScrubber; + } + if (pSysmemScrubber != NULL) + { + // ceutilsDestruct() ensures that the work is completed + NvU64 lastCompleted = (pSysmemScrubber->pCeUtils == NULL) ? + NV_U64_MAX : ceutilsUpdateProgress(pSysmemScrubber->pCeUtils); + + while ((pEntry = listHead(&pSysmemScrubber->asyncScrubList)) != NULL) + { + if (pEntry->semaphoreValue > lastCompleted) + break; + + listRemove(&pSysmemScrubber->asyncScrubList, pEntry); + listAppendExisting(&freeList, pEntry); + } + } + + portSyncSpinlockRelease(pWorkerParams->pSpinlock); + + while ((pEntry = listHead(&freeList)) != NULL) + { NV_PRINTF(LEVEL_INFO, "freeing scrubbed pMemDesc=%p RefCount=%u DupCount=%u\n", pEntry->pMemDesc, pEntry->pMemDesc->RefCount, pEntry->pMemDesc->DupCount); memdescFree(pEntry->pMemDesc); memdescDestroy(pEntry->pMemDesc); - listRemove(&pSysmemScrubber->asyncScrubList, pEntry); + listRemove(&freeList, pEntry); + portMemFree(pEntry); } + + listDestroy(&freeList); } static void @@ -108,46 +157,57 @@ _sysmemscrubProcessCompletedEntriesCb ) { SysmemScrubberWorkerParams *pWorkerParams = pArg; - SysmemScrubber *pSysmemScrubber = pWorkerParams->pSysmemScrubber; - - if (--pWorkerParams->refCount == 0) - portMemFree(pWorkerParams); - - if (pSysmemScrubber == NULL) - return; NV_PRINTF(LEVEL_SILENT, "processing completed scrub work in deferred work item\n"); - pSysmemScrubber->bCallbackQueued = NV_FALSE; + portAtomicSetU32(&pWorkerParams->bWorkerQueued, NV_FALSE); - _sysmemscrubProcessCompletedEntries(pSysmemScrubber, ceutilsUpdateProgress(pSysmemScrubber->pCeUtils)); + _sysmemscrubProcessCompletedEntries(NULL, pWorkerParams); + + if (portAtomicDecrementU32(&pWorkerParams->refCount) == 0) + { + _sysmemscrubFreeWorkerParams(pWorkerParams); + } } static NvBool _sysmemscrubIsWorkPending ( - SysmemScrubber *pSysmemScrubber + SysmemScrubberWorkerParams *pWorkerParams ) { // TODO: remove this function when CeUtils migrates to SemaphoreSurface - SysScrubEntry *pEntry = listHead(&pSysmemScrubber->asyncScrubList); + SysmemScrubber *pSysmemScrubber; + SysScrubEntry *pEntry; + NvBool bWorkPending = NV_FALSE; - return pEntry != NULL && pEntry->semaphoreValue <= ceutilsUpdateProgress(pSysmemScrubber->pCeUtils); + portSyncSpinlockAcquire(pWorkerParams->pSpinlock); + pSysmemScrubber = pWorkerParams->pSysmemScrubber; + if (pSysmemScrubber != NULL) + { + pEntry = listHead(&pSysmemScrubber->asyncScrubList); + bWorkPending = pEntry != NULL && pEntry->semaphoreValue <= ceutilsUpdateProgress(pSysmemScrubber->pCeUtils); + } + portSyncSpinlockRelease(pWorkerParams->pSpinlock); + + return bWorkPending; } static void _sysmemscrubQueueProcessCompletedEntries(void *pArg) { + // The event handler can't get called after destructor, as the event gets deregistered SysmemScrubber *pSysmemScrubber = pArg; SysmemScrubberWorkerParams *pWorkerParams = pSysmemScrubber->pWorkerParams; NV_PRINTF(LEVEL_SILENT, "scrub completed callback\n"); - NV_ASSERT_OR_RETURN_VOID(rmDeviceGpuLockIsOwner(pSysmemScrubber->pGpu->gpuInstance) || rmGpuLockIsOwner()); - - if (pWorkerParams->pSysmemScrubber == NULL || pSysmemScrubber->bCallbackQueued || !_sysmemscrubIsWorkPending(pSysmemScrubber)) + if (portAtomicAddU32(&pWorkerParams->bWorkerQueued, 0) || + !_sysmemscrubIsWorkPending(pWorkerParams)) + { return; + } // queue work to run it outside interrupt context 
NV_ASSERT_OR_RETURN_VOID( @@ -157,8 +217,8 @@ _sysmemscrubQueueProcessCompletedEntries(void *pArg) OS_QUEUE_WORKITEM_FLAGS_LOCK_GPU_GROUP_DEVICE | OS_QUEUE_WORKITEM_FLAGS_FULL_GPU_SANITY) == NV_OK); - pWorkerParams->refCount++; - pSysmemScrubber->bCallbackQueued = NV_TRUE; + portAtomicSetU32(&pWorkerParams->bWorkerQueued, NV_TRUE); + portAtomicIncrementU32(&pWorkerParams->refCount); } static NV_STATUS @@ -176,11 +236,15 @@ _sysmemscrubScrubAndFreeAsync .pCompletionCallback = _sysmemscrubQueueProcessCompletedEntries, .pCompletionCallbackArg = pSysmemScrubber }; - SysScrubEntry *pEntry = listAppendNew(&pSysmemScrubber->asyncScrubList); + SysmemScrubberWorkerParams *pWorkerParams = pSysmemScrubber->pWorkerParams; + SysScrubEntry *pEntry = portMemAllocNonPaged(sizeof (*pEntry)); NV_STATUS status; NV_ASSERT_OR_RETURN(pEntry != NULL, NV_ERR_NO_MEMORY); + portSyncSpinlockAcquire(pWorkerParams->pSpinlock); + listAppendExisting(&pSysmemScrubber->asyncScrubList, pEntry); + // // RM might be holding memory references despite memory is freed by the user // This should not happen for compressed allocations, so don't handle it and clear memdesc anyway @@ -203,7 +267,9 @@ _sysmemscrubScrubAndFreeAsync else { listRemove(&pSysmemScrubber->asyncScrubList, pEntry); + portMemFree(pEntry); } + portSyncSpinlockRelease(pWorkerParams->pSpinlock); return status; } @@ -237,7 +303,7 @@ sysmemscrubScrubAndFree_IMPL NV_ASSERT(pMemDesc->Size == pMemDesc->ActualSize); // WAR: currently queuing work out of ISR can fail, clean it up here - _sysmemscrubProcessCompletedEntries(pSysmemScrubber, ceutilsUpdateProgress(pSysmemScrubber->pCeUtils)); + _sysmemscrubProcessCompletedEntries(NULL, pSysmemScrubber->pWorkerParams); if (pSysmemScrubber->bAsync && _sysmemscrubScrubAndFreeAsync(pSysmemScrubber, pMemDesc) == NV_OK) @@ -258,13 +324,18 @@ sysmemscrubDestruct_IMPL { SysmemScrubberWorkerParams *pWorkerParams = pSysmemScrubber->pWorkerParams; + portSyncSpinlockAcquire(pWorkerParams->pSpinlock); pWorkerParams->pSysmemScrubber = NULL; + portSyncSpinlockRelease(pWorkerParams->pSpinlock); objDelete(pSysmemScrubber->pCeUtils); - _sysmemscrubProcessCompletedEntries(pSysmemScrubber, NV_U64_MAX); + pSysmemScrubber->pCeUtils = NULL; - if (--pWorkerParams->refCount == 0) - portMemFree(pWorkerParams); + // pWorkerParams->pSysmemScrubber is NULL, so wokers won't run at this point + _sysmemscrubProcessCompletedEntries(pSysmemScrubber, pWorkerParams); + + if (portAtomicDecrementU32(&pWorkerParams->refCount) == 0) + _sysmemscrubFreeWorkerParams(pWorkerParams); NV_ASSERT(listCount(&pSysmemScrubber->asyncScrubList) == 0); listDestroy(&pSysmemScrubber->asyncScrubList); diff --git a/src/nvidia/src/kernel/gpu/mmu/arch/hopper/kern_gmmu_gh100.c b/src/nvidia/src/kernel/gpu/mmu/arch/hopper/kern_gmmu_gh100.c index 796c50672..19cc74b5d 100644 --- a/src/nvidia/src/kernel/gpu/mmu/arch/hopper/kern_gmmu_gh100.c +++ b/src/nvidia/src/kernel/gpu/mmu/arch/hopper/kern_gmmu_gh100.c @@ -489,6 +489,16 @@ kgmmuFaultBufferAllocSharedMemory_GH100 memdescSetPageSize(pMemDesc, AT_GPU, RM_PAGE_SIZE_HUGE); memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_131, pMemDesc); + + if (status == NV_ERR_NO_MEMORY) + { + // TODO: Bug 5299603 + NV_PRINTF(LEVEL_ERROR, "Allocation failed with big page size, retrying with default page size\n"); + memdescSetPageSize(pMemDesc, AT_GPU, RM_PAGE_SIZE); + memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_131, + pMemDesc); + } + if (status != NV_OK) { goto destroy_memdesc; diff --git 
a/src/nvidia/src/kernel/gpu/nvlink/kernel_nvlink.c b/src/nvidia/src/kernel/gpu/nvlink/kernel_nvlink.c index 891f421c8..e4908ec36 100644 --- a/src/nvidia/src/kernel/gpu/nvlink/kernel_nvlink.c +++ b/src/nvidia/src/kernel/gpu/nvlink/kernel_nvlink.c @@ -276,6 +276,32 @@ _knvlinkCheckFabricCliqueId return NV_TRUE; } +static NvBool +_knvlinkCheckFabricProbeHealth +( + OBJGPU *pGpu, + OBJGPU *pPeerGpu +) +{ + NvU32 healthStatusMask = 0; + NvU32 peerHealthStatusMask = 0; + NV_STATUS status; + + status = gpuFabricProbeGetFabricHealthStatus(pGpu->pGpuFabricProbeInfoKernel, &healthStatusMask); + NV_ASSERT_OK_OR_RETURN(status); + + status = gpuFabricProbeGetFabricHealthStatus(pPeerGpu->pGpuFabricProbeInfoKernel, &peerHealthStatusMask); + NV_ASSERT_OK_OR_RETURN(status); + + if (nvlinkGetFabricHealthSummary(healthStatusMask) == NVLINK_INBAND_FABRIC_HEALTH_SUMMARY_UNHEALTHY || + nvlinkGetFabricHealthSummary(peerHealthStatusMask) == NVLINK_INBAND_FABRIC_HEALTH_SUMMARY_UNHEALTHY) + { + return NV_FALSE; + } + + return NV_TRUE; +} + /*! * @brief Checks whether EGM addresses are valid for P2P * when GPU is connected to NVSwitch @@ -372,7 +398,8 @@ knvlinkCheckNvswitchP2pConfig_IMPL if (gpuFabricProbeIsSupported(pGpu) && gpuFabricProbeIsSupported(pPeerGpu)) { - if (!_knvlinkCheckFabricCliqueId(pGpu, pPeerGpu)) + if (!_knvlinkCheckFabricCliqueId(pGpu, pPeerGpu) || + !_knvlinkCheckFabricProbeHealth(pGpu, pPeerGpu)) { return NV_FALSE; } diff --git a/src/nvidia/src/kernel/mem_mgr/mem_multicast_fabric.c b/src/nvidia/src/kernel/mem_mgr/mem_multicast_fabric.c index 0901db1f1..c65bed261 100644 --- a/src/nvidia/src/kernel/mem_mgr/mem_multicast_fabric.c +++ b/src/nvidia/src/kernel/mem_mgr/mem_multicast_fabric.c @@ -155,10 +155,19 @@ typedef struct mem_multicast_fabric_descriptor // // Boolean to be set when an Inband request has been sent to FM - // and is currently in progress + // and is currently in progress. + // + // This flag is only set on the prime object. // NvBool bInbandReqInProgress; + // + // Boolean set when an inband request response is received. + // + // This flag is set on both prime and non-prime objects. + // + NvBool bResponseReceived; + // // Request Id associated with the Inband request in progress when // bInbandReqSent is set to true @@ -1110,27 +1119,43 @@ _memMulticastFabricDescriptorFree MEM_MULTICAST_FABRIC_TEAM_RELEASE_REQUEST); } + // + // In the process cleanup path or a deferred cleanup path, skip waiting on + // the clients which are being torn down. The process could be already in + // uninterruptible state at that point, and if for some reason GFM doesn't + // respond, we will be stuck indefinitely in the wait queue. Instead march + // on, and handle the cleanup later (see memorymulticastfabricTeamSetupResponseCallback) + // whenever GFM responds. + // + // This wait is really required for interruptible cases like NvRmFree(), + // to mimic a synchronous op. 
+ // if (pMulticastFabricDesc->bInbandReqInProgress) { - OS_WAIT_QUEUE *pWq; THREAD_STATE_NODE *pThreadNode = NULL; THREAD_STATE_FREE_CALLBACK freeCallback; - NV_ASSERT_OK(osAllocWaitQueue(&pWq)); + NV_ASSERT_OK(threadStateGetCurrent(&pThreadNode, NULL)); - if (pWq != NULL) + if (!((pThreadNode->flags & THREAD_STATE_FLAGS_IS_EXITING) || + (pThreadNode->flags & THREAD_STATE_FLAGS_IS_KERNEL_THREAD))) { - NV_ASSERT_OK(fabricMulticastCleanupCacheInsert(pFabric, - pMulticastFabricDesc->inbandReqId, - pWq)); + OS_WAIT_QUEUE *pWq = NULL; + NV_ASSERT_OK(osAllocWaitQueue(&pWq)); - NV_ASSERT_OK(threadStateGetCurrent(&pThreadNode, NULL)); + if (pWq != NULL) + { + NV_ASSERT_OK(fabricMulticastCleanupCacheInsert(pFabric, + pMulticastFabricDesc->inbandReqId, + pWq)); - freeCallback.pCb = fabricMulticastWaitOnTeamCleanupCallback; - freeCallback.pCbData = (void *)pMulticastFabricDesc->inbandReqId; - NV_ASSERT_OK(threadStateEnqueueCallbackOnFree(pThreadNode, - &freeCallback)); + freeCallback.pCb = fabricMulticastWaitOnTeamCleanupCallback; + freeCallback.pCbData = (void *)pMulticastFabricDesc->inbandReqId; + + NV_ASSERT_OK(threadStateEnqueueCallbackOnFree(pThreadNode, + &freeCallback)); + } } } @@ -1668,34 +1693,8 @@ memorymulticastfabricTeamSetupResponseCallback pMulticastFabricDesc = fabricMulticastSetupCacheGet(pFabric, requestId); - if ((pMulticastFabricDesc != NULL) && (mcTeamStatus == NV_ERR_BUSY_RETRY)) + if (pMulticastFabricDesc != NULL) { - NvBool bRetrySuccess; - - portSyncRwLockAcquireWrite(pMulticastFabricDesc->pLock); - - pMulticastFabricDesc->bInbandReqInProgress = NV_FALSE; - - _memMulticastFabricAttachGpuPostProcessor(pGpu, - pMulticastFabricDesc, - mcTeamStatus, - mcTeamHandle, - mcAddressBase, - mcAddressSize); - - bRetrySuccess = pMulticastFabricDesc->bInbandReqInProgress; - - portSyncRwLockReleaseWrite(pMulticastFabricDesc->pLock); - - if (!bRetrySuccess) - fabricMulticastSetupCacheDelete(pFabric, requestId); - - portSyncRwLockReleaseWrite(pFabric->pMulticastFabricModuleLock); - } - else if (pMulticastFabricDesc != NULL) - { - fabricMulticastSetupCacheDelete(pFabric, requestId); - // // We have now safely acquired pMulticastFabricDesc->lock, which // should block the destructor from removing pMulticastFabricDesc @@ -1709,14 +1708,20 @@ memorymulticastfabricTeamSetupResponseCallback // portSyncRwLockReleaseWrite(pFabric->pMulticastFabricModuleLock); - pMulticastFabricDesc->bInbandReqInProgress = NV_FALSE; + if (!pMulticastFabricDesc->bResponseReceived) + { + pMulticastFabricDesc->bInbandReqInProgress = NV_FALSE; - _memMulticastFabricAttachGpuPostProcessor(pGpu, - pMulticastFabricDesc, - mcTeamStatus, - mcTeamHandle, - mcAddressBase, - mcAddressSize); + // This call sets `bInbandReqInProgress` on a successful retry. 
+ _memMulticastFabricAttachGpuPostProcessor(pGpu, + pMulticastFabricDesc, + mcTeamStatus, + mcTeamHandle, + mcAddressBase, + mcAddressSize); + + pMulticastFabricDesc->bResponseReceived = !pMulticastFabricDesc->bInbandReqInProgress; + } portSyncRwLockReleaseWrite(pMulticastFabricDesc->pLock); } diff --git a/src/nvidia/src/kernel/os/os_init.c b/src/nvidia/src/kernel/os/os_init.c index 16cc4d48a..c442f8c80 100644 --- a/src/nvidia/src/kernel/os/os_init.c +++ b/src/nvidia/src/kernel/os/os_init.c @@ -299,9 +299,9 @@ NvU32 vgpuDevReadReg032( OBJSYS *pSys = SYS_GET_INSTANCE(); OBJHYPERVISOR *pHypervisor = SYS_GET_HYPERVISOR(pSys); - if(!pGpu || - !pHypervisor || !pHypervisor->bDetected || !pHypervisor->bIsHVMGuest || - !GPU_GET_KERNEL_BIF(pGpu)) + + if (!pGpu || !GPU_GET_KERNEL_BIF(pGpu) || + (!IS_VIRTUAL(pGpu) && !(pHypervisor && pHypervisor->bDetected && pHypervisor->bIsHVMGuest))) { *vgpuHandled = NV_FALSE; return 0; diff --git a/src/nvidia/src/kernel/vgpu/rpcstructurecopy.c b/src/nvidia/src/kernel/vgpu/rpcstructurecopy.c index 115d76822..231b85d93 100644 --- a/src/nvidia/src/kernel/vgpu/rpcstructurecopy.c +++ b/src/nvidia/src/kernel/vgpu/rpcstructurecopy.c @@ -393,7 +393,7 @@ NV_STATUS deserialize_NV2080_CTRL_GR_GET_SM_ISSUE_RATE_MODIFIER_V2_PARAMS_v2B_06 grSmIssueRateModifierV2->smIssueRateModifierListSize = gr_get_sm_issue_rate_modifier_v2B_06->smIssueRateModifierListSize; - if (gr_get_sm_issue_rate_modifier_v2B_06->smIssueRateModifierListSize >= NV2080_CTRL_GR_SM_ISSUE_RATE_MODIFIER_V2_MAX_LIST_SIZE_v2B_06) + if (gr_get_sm_issue_rate_modifier_v2B_06->smIssueRateModifierListSize > NV2080_CTRL_GR_SM_ISSUE_RATE_MODIFIER_V2_MAX_LIST_SIZE_v2B_06) { return NV_ERR_OUT_OF_RANGE; } @@ -433,7 +433,7 @@ NV_STATUS deserialize_NV2080_CTRL_GR_GET_SM_ISSUE_THROTTLE_CTRL_PARAMS_v2B_10(NV grSmIssueThrottleCtrl->smIssueThrottleCtrlListSize = gr_get_sm_issue_throttle_ctrl_v2B_10->smIssueThrottleCtrlListSize; - if (gr_get_sm_issue_throttle_ctrl_v2B_10->smIssueThrottleCtrlListSize >= NV2080_CTRL_GR_SM_ISSUE_THROTTLE_CTRL_MAX_LIST_SIZE_v2B_10) + if (gr_get_sm_issue_throttle_ctrl_v2B_10->smIssueThrottleCtrlListSize > NV2080_CTRL_GR_SM_ISSUE_THROTTLE_CTRL_MAX_LIST_SIZE_v2B_10) { return NV_ERR_OUT_OF_RANGE; } @@ -859,7 +859,7 @@ NV_STATUS deserialize_NV2080_CTRL_INTERNAL_STATIC_GR_GET_SM_ISSUE_RATE_MODIFIER_ { smIssueRateModifierV2->smIssueRateModifierV2[i].smIssueRateModifierListSize = rate_modifier_v2B_06->smIssueRateModifierV2[i].smIssueRateModifierListSize; - if (rate_modifier_v2B_06->smIssueRateModifierV2[i].smIssueRateModifierListSize >= NV2080_CTRL_GR_SM_ISSUE_RATE_MODIFIER_V2_MAX_LIST_SIZE_v2B_06) + if (rate_modifier_v2B_06->smIssueRateModifierV2[i].smIssueRateModifierListSize > NV2080_CTRL_GR_SM_ISSUE_RATE_MODIFIER_V2_MAX_LIST_SIZE_v2B_06) { return NV_ERR_OUT_OF_RANGE; } @@ -903,7 +903,7 @@ NV_STATUS deserialize_NV2080_CTRL_INTERNAL_STATIC_GR_GET_SM_ISSUE_THROTTLE_CTRL_ { smIssueThrottleCtrl->smIssueThrottleCtrl[i].smIssueThrottleCtrlListSize = throttle_ctrl_v2B_10->smIssueThrottleCtrl[i].smIssueThrottleCtrlListSize; - if (throttle_ctrl_v2B_10->smIssueThrottleCtrl[i].smIssueThrottleCtrlListSize >= NV2080_CTRL_GR_SM_ISSUE_THROTTLE_CTRL_MAX_LIST_SIZE_v2B_10) + if (throttle_ctrl_v2B_10->smIssueThrottleCtrl[i].smIssueThrottleCtrlListSize > NV2080_CTRL_GR_SM_ISSUE_THROTTLE_CTRL_MAX_LIST_SIZE_v2B_10) { return NV_ERR_OUT_OF_RANGE; } diff --git a/src/nvidia/src/kernel/virtualization/kernel_vgpu_mgr.c b/src/nvidia/src/kernel/virtualization/kernel_vgpu_mgr.c index a7019acd3..e98973d6b 100644 --- 
a/src/nvidia/src/kernel/virtualization/kernel_vgpu_mgr.c +++ b/src/nvidia/src/kernel/virtualization/kernel_vgpu_mgr.c @@ -1322,9 +1322,22 @@ kvgpumgrGuestRegister(OBJGPU *pGpu, } } + /* On device-vm, swizzId is reserved during A084 object creation */ + if (IS_MIG_ENABLED(pGpu) && (osIsVgpuDeviceVmPresent() == NV_OK)) + { + NvU32 partitionFlag; + + if (swizzId >= KMIGMGR_MAX_GPU_SWIZZID) + return NV_ERR_INVALID_ARGUMENT; + + NV_ASSERT_OK_OR_RETURN(kvgpumgrGetPartitionFlag(vgpuType, &partitionFlag)); + + NV_ASSERT_OK_OR_RETURN(kvgpumgrGetSwizzId(pGpu, pPhysGpuInfo, partitionFlag, + pPhysGpuInfo->vgpuTypes[vgpuTypeIdx], &swizzId)); + + } + /* - * For MIG mode, vGPU type is already validated based on swizzid in - * NVA081_CTRL_CMD_VGPU_CONFIG_[GET_FREE|VALIDATE]_SWIZZID RmCtrl. * For heterogeneous vGPU mode, vGPU type is already validated based on placement ID * in NVA081_CTRL_CMD_VGPU_CONFIG_UPDATE_HETEROGENEOUS_INFO RmCtrl. * Both the RmCtrls are done before allocating the A084 object. @@ -2101,14 +2114,17 @@ kvgpumgrGetSwizzId(OBJGPU *pGpu, NvU32 id; NV_STATUS rmStatus = NV_OK; VGPU_TYPE *existingVgpuTypeInfo = NULL; + NvBool bIsSwizzIdReserved = NV_FALSE; swizzIdInUseMask = kmigmgrGetSwizzIdInUseMask(pGpu, pKernelMIGManager); - *swizzId = KMIGMGR_SWIZZID_INVALID; - // Determine valid swizzids not assigned to any vGPU device. for (id = 0; id < KMIGMGR_MAX_GPU_SWIZZID; id++) { + //If specified GI is present, ignore other GIs + if ((*swizzId != KMIGMGR_SWIZZID_INVALID) && (*swizzId != id)) + continue; + if (NVBIT64(id) & swizzIdInUseMask) { KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance; @@ -2173,13 +2189,14 @@ kvgpumgrGetSwizzId(OBJGPU *pGpu, { NV_ASSERT_OK_OR_RETURN(_kvgpumgrSetAssignedSwizzIdMask(pGpu, vgpuTypeInfo, pKernelMIGGpuInstance->swizzId)); *swizzId = pKernelMIGGpuInstance->swizzId; + bIsSwizzIdReserved = NV_TRUE; break; } } } } - if (*swizzId == KMIGMGR_SWIZZID_INVALID) + if (bIsSwizzIdReserved == NV_FALSE) { return NV_ERR_INVALID_STATE; } diff --git a/src/nvidia/src/kernel/virtualization/vgpuconfigapi.c b/src/nvidia/src/kernel/virtualization/vgpuconfigapi.c index 335ae56b2..abf7c2717 100644 --- a/src/nvidia/src/kernel/virtualization/vgpuconfigapi.c +++ b/src/nvidia/src/kernel/virtualization/vgpuconfigapi.c @@ -1317,6 +1317,7 @@ vgpuconfigapiCtrlCmdVgpuConfigGetFreeSwizzId_IMPL NV_ASSERT_OK_OR_RETURN( kvgpumgrGetVgpuTypeInfo(pParams->vgpuTypeId, &vgpuTypeInfo)); + pParams->swizzId = KMIGMGR_SWIZZID_INVALID; NV_ASSERT_OK_OR_RETURN( kvgpumgrGetSwizzId(pGpu, pPhysGpuInfo, partitionFlag, vgpuTypeInfo, &pParams->swizzId)); } diff --git a/version.mk b/version.mk index 08a250493..97958be83 100644 --- a/version.mk +++ b/version.mk @@ -1,5 +1,5 @@ -NVIDIA_VERSION = 580.94.06 -NVIDIA_NVID_VERSION = 580.94.06 +NVIDIA_VERSION = 580.94.10 +NVIDIA_NVID_VERSION = 580.94.10 NVIDIA_NVID_EXTRA = # This file.
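
The sysmem_scrub.c changes above drop the bCallbackQueued flag and instead share a spinlock-protected SysmemScrubberWorkerParams block between the scrubber and its deferred work items: the destructor clears pSysmemScrubber under the spinlock so a late-running worker sees NULL and returns early, and the block itself is freed by whichever side drops the last atomic reference. Below is a minimal standalone sketch of that teardown pattern only, using C11 atomics and a pthread mutex in place of the RM portAtomic*/portSyncSpinlock* primitives; the names (worker_params_t, scrubber_t, params_release) are illustrative and are not driver symbols.

/*
 * Sketch of the detach-under-lock + shared-refcount teardown used by the
 * sysmem scrubber changes above.  Owner and each queued worker hold one
 * reference to the shared params block; the owner's destructor detaches
 * itself under the lock, and the last reference dropped frees the block.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct scrubber scrubber_t;

typedef struct {
    pthread_mutex_t lock;      /* guards the back-pointer */
    scrubber_t     *owner;     /* NULL once the owner is destroyed */
    atomic_uint     refcount;  /* owner + each queued worker */
} worker_params_t;

struct scrubber {
    worker_params_t *params;
};

static void params_release(worker_params_t *p)
{
    /* whoever drops the last reference frees the shared block */
    if (atomic_fetch_sub(&p->refcount, 1) == 1) {
        pthread_mutex_destroy(&p->lock);
        free(p);
    }
}

static void *worker(void *arg)
{
    worker_params_t *p = arg;

    pthread_mutex_lock(&p->lock);
    if (p->owner != NULL)
        printf("worker: owner still alive, draining completed entries\n");
    else
        printf("worker: owner already destroyed, returning early\n");
    pthread_mutex_unlock(&p->lock);

    params_release(p);         /* drop the reference taken at queue time */
    return NULL;
}

int main(void)
{
    scrubber_t s;
    pthread_t  tid;

    worker_params_t *p = calloc(1, sizeof(*p));
    if (p == NULL)
        return 1;
    pthread_mutex_init(&p->lock, NULL);
    p->owner = &s;
    atomic_init(&p->refcount, 1);      /* owner's reference */
    s.params = p;

    /* queue a worker: take a reference before handing the block over */
    atomic_fetch_add(&p->refcount, 1);
    pthread_create(&tid, NULL, worker, p);

    /* destructor: detach under the lock, then drop the owner's reference */
    pthread_mutex_lock(&p->lock);
    p->owner = NULL;
    pthread_mutex_unlock(&p->lock);
    params_release(p);

    pthread_join(tid, NULL);
    return 0;
}

The split into a back-pointer plus a reference count reflects the constraint called out in the comments above: the destructor cannot flush a work item that is already queued, so detaching under the lock turns the worker into a no-op while the refcount guarantees the shared block outlives whichever party finishes last.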