560.28.03

Gaurav Juvekar
2024-07-19 15:45:15 -07:00
parent 5fdf5032fb
commit 448d5cc656
859 changed files with 165424 additions and 91129 deletions


@@ -0,0 +1,329 @@
/*******************************************************************************
Copyright (c) 2012-2015 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*******************************************************************************/
#ifndef _clc96f_h_
#define _clc96f_h_
#ifdef __cplusplus
extern "C" {
#endif
#include "nvtypes.h"
/* class BLACKWELL_CHANNEL_GPFIFO */
/*
* Documentation for BLACKWELL_CHANNEL_GPFIFO can be found in dev_pbdma.ref,
* chapter "User Control Registers". It is documented as device NV_UDMA.
* The GPFIFO format itself is also documented in dev_pbdma.ref,
* NV_PPBDMA_GP_ENTRY_*. The pushbuffer format is documented in dev_ram.ref,
* chapter "FIFO DMA RAM", NV_FIFO_DMA_*.
*
* Note there is no .mfs file for this class.
*/
#define BLACKWELL_CHANNEL_GPFIFO_A (0x0000C96F)
#define NVC96F_TYPEDEF BLACKWELL_CHANNELChannelGPFifoA
/* dma flow control data structure */
typedef volatile struct Nvc96fControl_struct {
NvU32 Ignored00[0x23]; /* 0000-008b*/
NvU32 GPPut; /* GP FIFO put offset 008c-008f*/
NvU32 Ignored01[0x5c]; /* 0090-01ff*/
} Nvc96fControl, BlackwellAControlGPFifo;
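/*
 * Illustrative sketch (not part of the class header): the structure above is
 * the per-channel flow-control (USERD) area. After appending entries to the
 * GPFIFO ring, the driver advances GPPut so the PBDMA fetches them. The
 * function and parameter names below are assumptions of this example only.
 */
static inline void nvc96f_example_advance_gpput(Nvc96fControl *userd,
                                                NvU32 new_put,
                                                NvU32 num_gpfifo_entries)
{
    /* GPPut is a GPFIFO entry index that wraps at the ring size; writing it
     * makes the newly appended entries visible to the PBDMA. */
    userd->GPPut = new_put % num_gpfifo_entries;
}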
/* fields and values */
#define NVC96F_NUMBER_OF_SUBCHANNELS (8)
#define NVC96F_SET_OBJECT (0x00000000)
#define NVC96F_SET_OBJECT_NVCLASS 15:0
#define NVC96F_SET_OBJECT_ENGINE 20:16
#define NVC96F_SET_OBJECT_ENGINE_SW 0x0000001f
#define NVC96F_NOP (0x00000008)
#define NVC96F_NOP_HANDLE 31:0
#define NVC96F_NON_STALL_INTERRUPT (0x00000020)
#define NVC96F_NON_STALL_INTERRUPT_HANDLE 31:0
#define NVC96F_FB_FLUSH (0x00000024) // Deprecated - use MEMBAR TYPE SYS_MEMBAR
#define NVC96F_FB_FLUSH_HANDLE 31:0
// NOTE - MEM_OP_A and MEM_OP_B have been replaced in gp100 with methods for
// specifying the page address for a targeted TLB invalidate and the uTLB for
// a targeted REPLAY_CANCEL for UVM.
// The previous MEM_OP_A/B functionality is in MEM_OP_C/D, with slightly
// rearranged fields.
#define NVC96F_MEM_OP_A (0x00000028)
#define NVC96F_MEM_OP_A_TLB_INVALIDATE_CANCEL_TARGET_CLIENT_UNIT_ID 5:0 // only relevant for REPLAY_CANCEL_TARGETED
#define NVC96F_MEM_OP_A_TLB_INVALIDATE_INVALIDATION_SIZE 5:0 // Used to specify size of invalidate, used for invalidates which are not of the REPLAY_CANCEL_TARGETED type
#define NVC96F_MEM_OP_A_TLB_INVALIDATE_CANCEL_TARGET_GPC_ID 10:6 // only relevant for REPLAY_CANCEL_TARGETED
#define NVC96F_MEM_OP_A_TLB_INVALIDATE_INVAL_SCOPE 7:6 // only relevant for invalidates with NVC96F_MEM_OP_C_TLB_INVALIDATE_REPLAY_NONE for invalidating link TLB only, or non-link TLB only or all TLBs
#define NVC96F_MEM_OP_A_TLB_INVALIDATE_INVAL_SCOPE_ALL_TLBS 0
#define NVC96F_MEM_OP_A_TLB_INVALIDATE_INVAL_SCOPE_LINK_TLBS 1
#define NVC96F_MEM_OP_A_TLB_INVALIDATE_INVAL_SCOPE_NON_LINK_TLBS 2
#define NVC96F_MEM_OP_A_TLB_INVALIDATE_INVAL_SCOPE_RSVRVD 3
#define NVC96F_MEM_OP_A_TLB_INVALIDATE_CANCEL_MMU_ENGINE_ID 8:0 // only relevant for REPLAY_CANCEL_VA_GLOBAL
#define NVC96F_MEM_OP_A_TLB_INVALIDATE_SYSMEMBAR 11:11
#define NVC96F_MEM_OP_A_TLB_INVALIDATE_SYSMEMBAR_EN 0x00000001
#define NVC96F_MEM_OP_A_TLB_INVALIDATE_SYSMEMBAR_DIS 0x00000000
#define NVC96F_MEM_OP_A_TLB_INVALIDATE_TARGET_ADDR_LO 31:12
#define NVC96F_MEM_OP_B (0x0000002c)
#define NVC96F_MEM_OP_B_TLB_INVALIDATE_TARGET_ADDR_HI 31:0
#define NVC96F_MEM_OP_C (0x00000030)
#define NVC96F_MEM_OP_C_MEMBAR_TYPE 2:0
#define NVC96F_MEM_OP_C_MEMBAR_TYPE_SYS_MEMBAR 0x00000000
#define NVC96F_MEM_OP_C_MEMBAR_TYPE_MEMBAR 0x00000001
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_PDB 0:0
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_PDB_ONE 0x00000000
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_PDB_ALL 0x00000001 // Probably nonsensical for MMU_TLB_INVALIDATE_TARGETED
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_GPC 1:1
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_GPC_ENABLE 0x00000000
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_GPC_DISABLE 0x00000001
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_REPLAY 4:2 // only relevant if GPC ENABLE
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_REPLAY_NONE 0x00000000
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_REPLAY_START 0x00000001
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_REPLAY_START_ACK_ALL 0x00000002
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_REPLAY_CANCEL_TARGETED 0x00000003
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_REPLAY_CANCEL_GLOBAL 0x00000004
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_REPLAY_CANCEL_VA_GLOBAL 0x00000005
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_ACK_TYPE 6:5 // only relevant if GPC ENABLE
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_ACK_TYPE_NONE 0x00000000
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_ACK_TYPE_GLOBALLY 0x00000001
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_ACK_TYPE_INTRANODE 0x00000002
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_ACCESS_TYPE 9:7 //only relevant for REPLAY_CANCEL_VA_GLOBAL
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_ACCESS_TYPE_VIRT_READ 0
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_ACCESS_TYPE_VIRT_WRITE 1
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_ACCESS_TYPE_VIRT_ATOMIC_STRONG 2
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_ACCESS_TYPE_VIRT_RSVRVD 3
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_ACCESS_TYPE_VIRT_ATOMIC_WEAK 4
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_ACCESS_TYPE_VIRT_ATOMIC_ALL 5
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_ACCESS_TYPE_VIRT_WRITE_AND_ATOMIC 6
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_ACCESS_TYPE_VIRT_ALL 7
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL 9:7 // Invalidate affects this level and all below
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_ALL 0x00000000 // Invalidate tlb caches at all levels of the page table
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_PTE_ONLY 0x00000001
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE0 0x00000002
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE1 0x00000003
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE2 0x00000004
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3 0x00000005
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE4 0x00000006
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE5 0x00000007
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_PDB_APERTURE 11:10 // only relevant if PDB_ONE
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_PDB_APERTURE_VID_MEM 0x00000000
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_PDB_APERTURE_SYS_MEM_COHERENT 0x00000002
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_PDB_APERTURE_SYS_MEM_NONCOHERENT 0x00000003
#define NVC96F_MEM_OP_C_TLB_INVALIDATE_PDB_ADDR_LO 31:12 // only relevant if PDB_ONE
#define NVC96F_MEM_OP_C_ACCESS_COUNTER_CLR_TARGETED_NOTIFY_TAG 19:0
// MEM_OP_D MUST be preceded by MEM_OPs A-C.
#define NVC96F_MEM_OP_D (0x00000034)
#define NVC96F_MEM_OP_D_TLB_INVALIDATE_PDB_ADDR_HI 26:0 // only relevant if PDB_ONE
#define NVC96F_MEM_OP_D_OPERATION 31:27
#define NVC96F_MEM_OP_D_OPERATION_MEMBAR 0x00000005
#define NVC96F_MEM_OP_D_OPERATION_MMU_TLB_INVALIDATE 0x00000009
#define NVC96F_MEM_OP_D_OPERATION_MMU_TLB_INVALIDATE_TARGETED 0x0000000a
#define NVC96F_MEM_OP_D_OPERATION_MMU_OPERATION 0x0000000b
#define NVC96F_MEM_OP_D_OPERATION_L2_PEERMEM_INVALIDATE 0x0000000d
#define NVC96F_MEM_OP_D_OPERATION_L2_SYSMEM_INVALIDATE 0x0000000e
// CLEAN_LINES is an alias for Tegra/GPU IP usage
#define NVC96F_MEM_OP_B_OPERATION_L2_INVALIDATE_CLEAN_LINES 0x0000000e
#define NVC96F_MEM_OP_D_OPERATION_L2_CLEAN_COMPTAGS 0x0000000f
#define NVC96F_MEM_OP_D_OPERATION_L2_FLUSH_DIRTY 0x00000010
#define NVC96F_MEM_OP_D_OPERATION_L2_SYSMEM_NCOH_INVALIDATE 0x00000011
#define NVC96F_MEM_OP_D_OPERATION_L2_SYSMEM_COH_INVALIDATE 0x00000012
#define NVC96F_MEM_OP_D_OPERATION_L2_WAIT_FOR_SYS_PENDING_READS 0x00000015
#define NVC96F_MEM_OP_D_OPERATION_ACCESS_COUNTER_CLR 0x00000016
#define NVC96F_MEM_OP_D_ACCESS_COUNTER_CLR_TYPE 1:0
#define NVC96F_MEM_OP_D_ACCESS_COUNTER_CLR_TYPE_MIMC 0x00000000
#define NVC96F_MEM_OP_D_ACCESS_COUNTER_CLR_TYPE_MOMC 0x00000001
#define NVC96F_MEM_OP_D_ACCESS_COUNTER_CLR_TYPE_ALL 0x00000002
#define NVC96F_MEM_OP_D_ACCESS_COUNTER_CLR_TYPE_TARGETED 0x00000003
#define NVC96F_MEM_OP_D_ACCESS_COUNTER_CLR_TARGETED_TYPE 2:2
#define NVC96F_MEM_OP_D_ACCESS_COUNTER_CLR_TARGETED_TYPE_MIMC 0x00000000
#define NVC96F_MEM_OP_D_ACCESS_COUNTER_CLR_TARGETED_TYPE_MOMC 0x00000001
#define NVC96F_MEM_OP_D_ACCESS_COUNTER_CLR_TARGETED_BANK 6:3
#define NVC96F_MEM_OP_D_MMU_OPERATION_TYPE 23:20
#define NVC96F_MEM_OP_D_MMU_OPERATION_TYPE_RESERVED 0x00000000
#define NVC96F_MEM_OP_D_MMU_OPERATION_TYPE_VIDMEM_ACCESS_BIT_DUMP 0x00000001
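/*
 * Illustrative sketch (not part of the class header): a system-scope memory
 * barrier expressed with MEM_OP_A..D. Per the note above, MEM_OP_D must come
 * last. The "push" callback stands in for whatever mechanism the caller uses
 * to emit (method, data) pairs and is an assumption of this example; shift
 * amounts follow the bit ranges listed above.
 */
static inline void nvc96f_example_sys_membar(void (*push)(NvU32 method, NvU32 data))
{
    push(NVC96F_MEM_OP_A, 0);
    push(NVC96F_MEM_OP_B, 0);
    /* MEMBAR_TYPE is bits 2:0; SYS_MEMBAR encodes as 0. */
    push(NVC96F_MEM_OP_C, NVC96F_MEM_OP_C_MEMBAR_TYPE_SYS_MEMBAR << 0);
    /* OPERATION is bits 31:27; MEMBAR encodes as 0x5. */
    push(NVC96F_MEM_OP_D, NVC96F_MEM_OP_D_OPERATION_MEMBAR << 27);
}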
#define NVC96F_SEM_ADDR_LO (0x0000005c)
#define NVC96F_SEM_ADDR_LO_OFFSET 31:2
#define NVC96F_SEM_ADDR_HI (0x00000060)
#define NVC96F_SEM_ADDR_HI_OFFSET 24:0
#define NVC96F_SEM_PAYLOAD_LO (0x00000064)
#define NVC96F_SEM_PAYLOAD_LO_PAYLOAD 31:0
#define NVC96F_SEM_PAYLOAD_HI (0x00000068)
#define NVC96F_SEM_PAYLOAD_HI_PAYLOAD 31:0
#define NVC96F_SEM_EXECUTE (0x0000006c)
#define NVC96F_SEM_EXECUTE_OPERATION 2:0
#define NVC96F_SEM_EXECUTE_OPERATION_ACQUIRE 0x00000000
#define NVC96F_SEM_EXECUTE_OPERATION_RELEASE 0x00000001
#define NVC96F_SEM_EXECUTE_OPERATION_ACQ_STRICT_GEQ 0x00000002
#define NVC96F_SEM_EXECUTE_OPERATION_ACQ_CIRC_GEQ 0x00000003
#define NVC96F_SEM_EXECUTE_OPERATION_ACQ_AND 0x00000004
#define NVC96F_SEM_EXECUTE_OPERATION_ACQ_NOR 0x00000005
#define NVC96F_SEM_EXECUTE_OPERATION_REDUCTION 0x00000006
#define NVC96F_SEM_EXECUTE_ACQUIRE_SWITCH_TSG 12:12
#define NVC96F_SEM_EXECUTE_ACQUIRE_SWITCH_TSG_DIS 0x00000000
#define NVC96F_SEM_EXECUTE_ACQUIRE_SWITCH_TSG_EN 0x00000001
#define NVC96F_SEM_EXECUTE_ACQUIRE_RECHECK 18:18
#define NVC96F_SEM_EXECUTE_ACQUIRE_RECHECK_DIS 0x00000000
#define NVC96F_SEM_EXECUTE_ACQUIRE_RECHECK_EN 0x00000001
#define NVC96F_SEM_EXECUTE_RELEASE_WFI 20:20
#define NVC96F_SEM_EXECUTE_RELEASE_WFI_DIS 0x00000000
#define NVC96F_SEM_EXECUTE_RELEASE_WFI_EN 0x00000001
#define NVC96F_SEM_EXECUTE_PAYLOAD_SIZE 24:24
#define NVC96F_SEM_EXECUTE_PAYLOAD_SIZE_32BIT 0x00000000
#define NVC96F_SEM_EXECUTE_PAYLOAD_SIZE_64BIT 0x00000001
#define NVC96F_SEM_EXECUTE_RELEASE_TIMESTAMP 25:25
#define NVC96F_SEM_EXECUTE_RELEASE_TIMESTAMP_DIS 0x00000000
#define NVC96F_SEM_EXECUTE_RELEASE_TIMESTAMP_EN 0x00000001
#define NVC96F_SEM_EXECUTE_REDUCTION 30:27
#define NVC96F_SEM_EXECUTE_REDUCTION_IMIN 0x00000000
#define NVC96F_SEM_EXECUTE_REDUCTION_IMAX 0x00000001
#define NVC96F_SEM_EXECUTE_REDUCTION_IXOR 0x00000002
#define NVC96F_SEM_EXECUTE_REDUCTION_IAND 0x00000003
#define NVC96F_SEM_EXECUTE_REDUCTION_IOR 0x00000004
#define NVC96F_SEM_EXECUTE_REDUCTION_IADD 0x00000005
#define NVC96F_SEM_EXECUTE_REDUCTION_INC 0x00000006
#define NVC96F_SEM_EXECUTE_REDUCTION_DEC 0x00000007
#define NVC96F_SEM_EXECUTE_REDUCTION_FORMAT 31:31
#define NVC96F_SEM_EXECUTE_REDUCTION_FORMAT_SIGNED 0x00000000
#define NVC96F_SEM_EXECUTE_REDUCTION_FORMAT_UNSIGNED 0x00000001
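/*
 * Illustrative sketch (not part of the class header): a basic 32-bit
 * semaphore release using the SEM_* methods above. "push" stands in for the
 * caller's method emission mechanism and, like the function and parameter
 * names, is an assumption of this example.
 */
static inline void nvc96f_example_sem_release_32(void (*push)(NvU32 method, NvU32 data),
                                                 NvU64 sem_gpu_va, NvU32 payload)
{
    push(NVC96F_SEM_ADDR_LO,    (NvU32)sem_gpu_va & ~0x3u);   /* OFFSET is bits 31:2 */
    push(NVC96F_SEM_ADDR_HI,    (NvU32)(sem_gpu_va >> 32));
    push(NVC96F_SEM_PAYLOAD_LO, payload);
    push(NVC96F_SEM_PAYLOAD_HI, 0);
    /* OPERATION (bits 2:0) = RELEASE; 32-bit payload, no WFI, no timestamp. */
    push(NVC96F_SEM_EXECUTE,    NVC96F_SEM_EXECUTE_OPERATION_RELEASE << 0);
}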
#define NVC96F_WFI (0x00000078)
#define NVC96F_WFI_SCOPE 0:0
#define NVC96F_WFI_SCOPE_CURRENT_SCG_TYPE 0x00000000
#define NVC96F_WFI_SCOPE_CURRENT_VEID 0x00000000
#define NVC96F_WFI_SCOPE_ALL 0x00000001
#define NVC96F_YIELD (0x00000080)
#define NVC96F_YIELD_OP 1:0
#define NVC96F_YIELD_OP_NOP 0x00000000
#define NVC96F_YIELD_OP_TSG 0x00000003
#define NVC96F_CLEAR_FAULTED (0x00000084)
// Note: RM provides the HANDLE as an opaque value; the internal detail fields
// are intentionally not exposed to the driver through these defines.
#define NVC96F_CLEAR_FAULTED_HANDLE 30:0
#define NVC96F_CLEAR_FAULTED_TYPE 31:31
#define NVC96F_CLEAR_FAULTED_TYPE_PBDMA_FAULTED 0x00000000
#define NVC96F_CLEAR_FAULTED_TYPE_ENG_FAULTED 0x00000001
/* GPFIFO entry format */
#define NVC96F_GP_ENTRY__SIZE 8
#define NVC96F_GP_ENTRY0_FETCH 0:0
#define NVC96F_GP_ENTRY0_FETCH_UNCONDITIONAL 0x00000000
#define NVC96F_GP_ENTRY0_FETCH_CONDITIONAL 0x00000001
#define NVC96F_GP_ENTRY0_GET 31:2
#define NVC96F_GP_ENTRY0_OPERAND 31:0
#define NVC96F_GP_ENTRY0_PB_EXTENDED_BASE_OPERAND 24:8
#define NVC96F_GP_ENTRY1_GET_HI 7:0
#define NVC96F_GP_ENTRY1_LEVEL 9:9
#define NVC96F_GP_ENTRY1_LEVEL_MAIN 0x00000000
#define NVC96F_GP_ENTRY1_LEVEL_SUBROUTINE 0x00000001
#define NVC96F_GP_ENTRY1_LENGTH 30:10
#define NVC96F_GP_ENTRY1_SYNC 31:31
#define NVC96F_GP_ENTRY1_SYNC_PROCEED 0x00000000
#define NVC96F_GP_ENTRY1_SYNC_WAIT 0x00000001
#define NVC96F_GP_ENTRY1_OPCODE 7:0
#define NVC96F_GP_ENTRY1_OPCODE_NOP 0x00000000
#define NVC96F_GP_ENTRY1_OPCODE_ILLEGAL 0x00000001
#define NVC96F_GP_ENTRY1_OPCODE_GP_CRC 0x00000002
#define NVC96F_GP_ENTRY1_OPCODE_PB_CRC 0x00000003
#define NVC96F_GP_ENTRY1_OPCODE_SET_PB_SEGMENT_EXTENDED_BASE 0x00000004
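/*
 * Illustrative sketch (not part of the class header): packing one 8-byte
 * GPFIFO entry that points at a pushbuffer segment. "pb_gpu_va" must be
 * 4-byte aligned and "size_bytes" a multiple of 4; the names are assumptions
 * of this example.
 */
static inline NvU64 nvc96f_example_make_gp_entry(NvU64 pb_gpu_va, NvU32 size_bytes)
{
    /* ENTRY0: GET (bits 31:2) holds pushbuffer VA bits 31:2. */
    NvU32 entry0 = (NvU32)(pb_gpu_va & 0xfffffffc);
    /* ENTRY1: GET_HI (bits 7:0) holds VA bits 39:32; LENGTH (bits 30:10) is in 4-byte words. */
    NvU32 entry1 = (NvU32)((pb_gpu_va >> 32) & 0xff) | ((size_bytes / 4) << 10);
    return ((NvU64)entry1 << 32) | entry0;
}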
/* dma method formats */
#define NVC96F_DMA_METHOD_ADDRESS_OLD 12:2
#define NVC96F_DMA_METHOD_ADDRESS 11:0
#define NVC96F_DMA_SUBDEVICE_MASK 15:4
#define NVC96F_DMA_METHOD_SUBCHANNEL 15:13
#define NVC96F_DMA_TERT_OP 17:16
#define NVC96F_DMA_TERT_OP_GRP0_INC_METHOD (0x00000000)
#define NVC96F_DMA_TERT_OP_GRP0_SET_SUB_DEV_MASK (0x00000001)
#define NVC96F_DMA_TERT_OP_GRP0_STORE_SUB_DEV_MASK (0x00000002)
#define NVC96F_DMA_TERT_OP_GRP0_USE_SUB_DEV_MASK (0x00000003)
#define NVC96F_DMA_TERT_OP_GRP2_NON_INC_METHOD (0x00000000)
#define NVC96F_DMA_METHOD_COUNT_OLD 28:18
#define NVC96F_DMA_METHOD_COUNT 28:16
#define NVC96F_DMA_IMMD_DATA 28:16
#define NVC96F_DMA_SEC_OP 31:29
#define NVC96F_DMA_SEC_OP_GRP0_USE_TERT (0x00000000)
#define NVC96F_DMA_SEC_OP_INC_METHOD (0x00000001)
#define NVC96F_DMA_SEC_OP_GRP2_USE_TERT (0x00000002)
#define NVC96F_DMA_SEC_OP_NON_INC_METHOD (0x00000003)
#define NVC96F_DMA_SEC_OP_IMMD_DATA_METHOD (0x00000004)
#define NVC96F_DMA_SEC_OP_ONE_INC (0x00000005)
#define NVC96F_DMA_SEC_OP_RESERVED6 (0x00000006)
#define NVC96F_DMA_SEC_OP_END_PB_SEGMENT (0x00000007)
/* dma incrementing method format */
#define NVC96F_DMA_INCR_ADDRESS 11:0
#define NVC96F_DMA_INCR_SUBCHANNEL 15:13
#define NVC96F_DMA_INCR_COUNT 28:16
#define NVC96F_DMA_INCR_OPCODE 31:29
#define NVC96F_DMA_INCR_OPCODE_VALUE (0x00000001)
#define NVC96F_DMA_INCR_DATA 31:0
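/*
 * Illustrative sketch (not part of the class header): packing the header
 * dword of an incrementing method group. The ADDRESS field (bits 11:0)
 * carries the method byte offset divided by 4; function and parameter names
 * are assumptions of this example.
 */
static inline NvU32 nvc96f_example_incr_header(NvU32 subchannel, NvU32 method, NvU32 count)
{
    return (NVC96F_DMA_INCR_OPCODE_VALUE << 29) |   /* OPCODE     bits 31:29 */
           (count << 16)                        |   /* COUNT      bits 28:16 */
           (subchannel << 13)                   |   /* SUBCHANNEL bits 15:13 */
           (method >> 2);                           /* ADDRESS    bits 11:0  */
}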
/* dma non-incrementing method format */
#define NVC96F_DMA_NONINCR_ADDRESS 11:0
#define NVC96F_DMA_NONINCR_SUBCHANNEL 15:13
#define NVC96F_DMA_NONINCR_COUNT 28:16
#define NVC96F_DMA_NONINCR_OPCODE 31:29
#define NVC96F_DMA_NONINCR_OPCODE_VALUE (0x00000003)
#define NVC96F_DMA_NONINCR_DATA 31:0
/* dma increment-once method format */
#define NVC96F_DMA_ONEINCR_ADDRESS 11:0
#define NVC96F_DMA_ONEINCR_SUBCHANNEL 15:13
#define NVC96F_DMA_ONEINCR_COUNT 28:16
#define NVC96F_DMA_ONEINCR_OPCODE 31:29
#define NVC96F_DMA_ONEINCR_OPCODE_VALUE (0x00000005)
#define NVC96F_DMA_ONEINCR_DATA 31:0
/* dma no-operation format */
#define NVC96F_DMA_NOP (0x00000000)
/* dma immediate-data format */
#define NVC96F_DMA_IMMD_ADDRESS 11:0
#define NVC96F_DMA_IMMD_SUBCHANNEL 15:13
#define NVC96F_DMA_IMMD_DATA 28:16
#define NVC96F_DMA_IMMD_OPCODE 31:29
#define NVC96F_DMA_IMMD_OPCODE_VALUE (0x00000004)
/* dma set sub-device mask format */
#define NVC96F_DMA_SET_SUBDEVICE_MASK_VALUE 15:4
#define NVC96F_DMA_SET_SUBDEVICE_MASK_OPCODE 31:16
#define NVC96F_DMA_SET_SUBDEVICE_MASK_OPCODE_VALUE (0x00000001)
/* dma store sub-device mask format */
#define NVC96F_DMA_STORE_SUBDEVICE_MASK_VALUE 15:4
#define NVC96F_DMA_STORE_SUBDEVICE_MASK_OPCODE 31:16
#define NVC96F_DMA_STORE_SUBDEVICE_MASK_OPCODE_VALUE (0x00000002)
/* dma use sub-device mask format */
#define NVC96F_DMA_USE_SUBDEVICE_MASK_OPCODE 31:16
#define NVC96F_DMA_USE_SUBDEVICE_MASK_OPCODE_VALUE (0x00000003)
/* dma end-segment format */
#define NVC96F_DMA_ENDSEG_OPCODE 31:29
#define NVC96F_DMA_ENDSEG_OPCODE_VALUE (0x00000007)
/* dma legacy incrementing/non-incrementing formats */
#define NVC96F_DMA_ADDRESS 12:2
#define NVC96F_DMA_SUBCH 15:13
#define NVC96F_DMA_OPCODE3 17:16
#define NVC96F_DMA_OPCODE3_NONE (0x00000000)
#define NVC96F_DMA_COUNT 28:18
#define NVC96F_DMA_OPCODE 31:29
#define NVC96F_DMA_OPCODE_METHOD (0x00000000)
#define NVC96F_DMA_OPCODE_NONINC_METHOD (0x00000002)
#define NVC96F_DMA_DATA 31:0
#ifdef __cplusplus
}; /* extern "C" */
#endif
#endif /* _clc96f_h_ */


@@ -0,0 +1,460 @@
/*******************************************************************************
Copyright (c) 1993-2004 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*******************************************************************************/
#include "nvtypes.h"
#ifndef _clc9b5_h_
#define _clc9b5_h_
#ifdef __cplusplus
extern "C" {
#endif
#define BLACKWELL_DMA_COPY_A (0x0000C9B5)
typedef volatile struct _clc9b5_tag0 {
NvV32 Reserved00[0x40];
NvV32 Nop; // 0x00000100 - 0x00000103
NvV32 Reserved01[0xF];
NvV32 PmTrigger; // 0x00000140 - 0x00000143
NvV32 Reserved02[0x36];
NvV32 SetMonitoredFenceType; // 0x0000021C - 0x0000021F
NvV32 SetMonitoredFenceSignalAddrBaseUpper; // 0x00000220 - 0x00000223
NvV32 SetMonitoredFenceSignalAddrBaseLower; // 0x00000224 - 0x00000227
NvV32 Reserved03[0x6];
NvV32 SetSemaphoreA; // 0x00000240 - 0x00000243
NvV32 SetSemaphoreB; // 0x00000244 - 0x00000247
NvV32 SetSemaphorePayload; // 0x00000248 - 0x0000024B
NvV32 SetSemaphorePayloadUpper; // 0x0000024C - 0x0000024F
NvV32 Reserved04[0x1];
NvV32 SetRenderEnableA; // 0x00000254 - 0x00000257
NvV32 SetRenderEnableB; // 0x00000258 - 0x0000025B
NvV32 SetRenderEnableC; // 0x0000025C - 0x0000025F
NvV32 SetSrcPhysMode; // 0x00000260 - 0x00000263
NvV32 SetDstPhysMode; // 0x00000264 - 0x00000267
NvV32 Reserved05[0x26];
NvV32 LaunchDma; // 0x00000300 - 0x00000303
NvV32 Reserved06[0x3F];
NvV32 OffsetInUpper; // 0x00000400 - 0x00000403
NvV32 OffsetInLower; // 0x00000404 - 0x00000407
NvV32 OffsetOutUpper; // 0x00000408 - 0x0000040B
NvV32 OffsetOutLower; // 0x0000040C - 0x0000040F
NvV32 PitchIn; // 0x00000410 - 0x00000413
NvV32 PitchOut; // 0x00000414 - 0x00000417
NvV32 LineLengthIn; // 0x00000418 - 0x0000041B
NvV32 LineCount; // 0x0000041C - 0x0000041F
NvV32 Reserved07[0x38];
NvV32 SetSecureCopyMode; // 0x00000500 - 0x00000503
NvV32 SetDecryptIv0; // 0x00000504 - 0x00000507
NvV32 SetDecryptIv1; // 0x00000508 - 0x0000050B
NvV32 SetDecryptIv2; // 0x0000050C - 0x0000050F
NvV32 Reserved_SetAESCounter; // 0x00000510 - 0x00000513
NvV32 SetDecryptAuthTagCompareAddrUpper; // 0x00000514 - 0x00000517
NvV32 SetDecryptAuthTagCompareAddrLower; // 0x00000518 - 0x0000051B
NvV32 Reserved08[0x5];
NvV32 SetEncryptAuthTagAddrUpper; // 0x00000530 - 0x00000533
NvV32 SetEncryptAuthTagAddrLower; // 0x00000534 - 0x00000537
NvV32 SetEncryptIvAddrUpper; // 0x00000538 - 0x0000053B
NvV32 SetEncryptIvAddrLower; // 0x0000053C - 0x0000053F
NvV32 Reserved09[0x10];
NvV32 SetCompressionParameters; // 0x00000580 - 0x00000583
NvV32 SetDecompressOutLength; // 0x00000584 - 0x00000587
NvV32 SetDecompressOutLengthAddrUpper; // 0x00000588 - 0x0000058B
NvV32 SetDecompressOutLengthAddrLower; // 0x0000058C - 0x0000058F
NvV32 SetDecompressChecksum; // 0x00000590 - 0x00000593
NvV32 Reserved10[0x5A];
NvV32 SetMemoryScrubParameters; // 0x000006FC - 0x000006FF
NvV32 SetRemapConstA; // 0x00000700 - 0x00000703
NvV32 SetRemapConstB; // 0x00000704 - 0x00000707
NvV32 SetRemapComponents; // 0x00000708 - 0x0000070B
NvV32 SetDstBlockSize; // 0x0000070C - 0x0000070F
NvV32 SetDstWidth; // 0x00000710 - 0x00000713
NvV32 SetDstHeight; // 0x00000714 - 0x00000717
NvV32 SetDstDepth; // 0x00000718 - 0x0000071B
NvV32 SetDstLayer; // 0x0000071C - 0x0000071F
NvV32 SetDstOrigin; // 0x00000720 - 0x00000723
NvV32 Reserved11[0x1];
NvV32 SetSrcBlockSize; // 0x00000728 - 0x0000072B
NvV32 SetSrcWidth; // 0x0000072C - 0x0000072F
NvV32 SetSrcHeight; // 0x00000730 - 0x00000733
NvV32 SetSrcDepth; // 0x00000734 - 0x00000737
NvV32 SetSrcLayer; // 0x00000738 - 0x0000073B
NvV32 SetSrcOrigin; // 0x0000073C - 0x0000073F
NvV32 Reserved12[0x1];
NvV32 SrcOriginX; // 0x00000744 - 0x00000747
NvV32 SrcOriginY; // 0x00000748 - 0x0000074B
NvV32 DstOriginX; // 0x0000074C - 0x0000074F
NvV32 DstOriginY; // 0x00000750 - 0x00000753
NvV32 Reserved13[0x270];
NvV32 PmTriggerEnd; // 0x00001114 - 0x00001117
NvV32 Reserved14[0x3BA];
} blackwell_dma_copy_aControlPio;
#define NVC9B5_NOP (0x00000100)
#define NVC9B5_NOP_PARAMETER 31:0
#define NVC9B5_PM_TRIGGER (0x00000140)
#define NVC9B5_PM_TRIGGER_V 31:0
#define NVC9B5_SET_MONITORED_FENCE_TYPE (0x0000021C)
#define NVC9B5_SET_MONITORED_FENCE_TYPE_TYPE 0:0
#define NVC9B5_SET_MONITORED_FENCE_TYPE_TYPE_MONITORED_FENCE (0x00000000)
#define NVC9B5_SET_MONITORED_FENCE_TYPE_TYPE_MONITORED_FENCE_EXT (0x00000001)
#define NVC9B5_SET_MONITORED_FENCE_SIGNAL_ADDR_BASE_UPPER (0x00000220)
#define NVC9B5_SET_MONITORED_FENCE_SIGNAL_ADDR_BASE_UPPER_UPPER 24:0
#define NVC9B5_SET_MONITORED_FENCE_SIGNAL_ADDR_BASE_LOWER (0x00000224)
#define NVC9B5_SET_MONITORED_FENCE_SIGNAL_ADDR_BASE_LOWER_LOWER 31:0
#define NVC9B5_SET_SEMAPHORE_A (0x00000240)
#define NVC9B5_SET_SEMAPHORE_A_UPPER 24:0
#define NVC9B5_SET_SEMAPHORE_B (0x00000244)
#define NVC9B5_SET_SEMAPHORE_B_LOWER 31:0
#define NVC9B5_SET_SEMAPHORE_PAYLOAD (0x00000248)
#define NVC9B5_SET_SEMAPHORE_PAYLOAD_PAYLOAD 31:0
#define NVC9B5_SET_SEMAPHORE_PAYLOAD_UPPER (0x0000024C)
#define NVC9B5_SET_SEMAPHORE_PAYLOAD_UPPER_PAYLOAD 31:0
#define NVC9B5_SET_RENDER_ENABLE_A (0x00000254)
#define NVC9B5_SET_RENDER_ENABLE_A_UPPER 24:0
#define NVC9B5_SET_RENDER_ENABLE_B (0x00000258)
#define NVC9B5_SET_RENDER_ENABLE_B_LOWER 31:0
#define NVC9B5_SET_RENDER_ENABLE_C (0x0000025C)
#define NVC9B5_SET_RENDER_ENABLE_C_MODE 2:0
#define NVC9B5_SET_RENDER_ENABLE_C_MODE_FALSE (0x00000000)
#define NVC9B5_SET_RENDER_ENABLE_C_MODE_TRUE (0x00000001)
#define NVC9B5_SET_RENDER_ENABLE_C_MODE_CONDITIONAL (0x00000002)
#define NVC9B5_SET_RENDER_ENABLE_C_MODE_RENDER_IF_EQUAL (0x00000003)
#define NVC9B5_SET_RENDER_ENABLE_C_MODE_RENDER_IF_NOT_EQUAL (0x00000004)
#define NVC9B5_SET_SRC_PHYS_MODE (0x00000260)
#define NVC9B5_SET_SRC_PHYS_MODE_TARGET 1:0
#define NVC9B5_SET_SRC_PHYS_MODE_TARGET_LOCAL_FB (0x00000000)
#define NVC9B5_SET_SRC_PHYS_MODE_TARGET_COHERENT_SYSMEM (0x00000001)
#define NVC9B5_SET_SRC_PHYS_MODE_TARGET_NONCOHERENT_SYSMEM (0x00000002)
#define NVC9B5_SET_SRC_PHYS_MODE_TARGET_PEERMEM (0x00000003)
#define NVC9B5_SET_SRC_PHYS_MODE_BASIC_KIND 5:2
#define NVC9B5_SET_SRC_PHYS_MODE_PEER_ID 8:6
#define NVC9B5_SET_SRC_PHYS_MODE_FLA 9:9
#define NVC9B5_SET_DST_PHYS_MODE (0x00000264)
#define NVC9B5_SET_DST_PHYS_MODE_TARGET 1:0
#define NVC9B5_SET_DST_PHYS_MODE_TARGET_LOCAL_FB (0x00000000)
#define NVC9B5_SET_DST_PHYS_MODE_TARGET_COHERENT_SYSMEM (0x00000001)
#define NVC9B5_SET_DST_PHYS_MODE_TARGET_NONCOHERENT_SYSMEM (0x00000002)
#define NVC9B5_SET_DST_PHYS_MODE_TARGET_PEERMEM (0x00000003)
#define NVC9B5_SET_DST_PHYS_MODE_BASIC_KIND 5:2
#define NVC9B5_SET_DST_PHYS_MODE_PEER_ID 8:6
#define NVC9B5_SET_DST_PHYS_MODE_FLA 9:9
#define NVC9B5_LAUNCH_DMA (0x00000300)
#define NVC9B5_LAUNCH_DMA_DATA_TRANSFER_TYPE 1:0
#define NVC9B5_LAUNCH_DMA_DATA_TRANSFER_TYPE_NONE (0x00000000)
#define NVC9B5_LAUNCH_DMA_DATA_TRANSFER_TYPE_PIPELINED (0x00000001)
#define NVC9B5_LAUNCH_DMA_DATA_TRANSFER_TYPE_NON_PIPELINED (0x00000002)
#define NVC9B5_LAUNCH_DMA_FLUSH_ENABLE 2:2
#define NVC9B5_LAUNCH_DMA_FLUSH_ENABLE_FALSE (0x00000000)
#define NVC9B5_LAUNCH_DMA_FLUSH_ENABLE_TRUE (0x00000001)
#define NVC9B5_LAUNCH_DMA_FLUSH_TYPE 25:25
#define NVC9B5_LAUNCH_DMA_FLUSH_TYPE_SYS (0x00000000)
#define NVC9B5_LAUNCH_DMA_FLUSH_TYPE_GL (0x00000001)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_TYPE 4:3
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_TYPE_NONE (0x00000000)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_TYPE_RELEASE_SEMAPHORE_NO_TIMESTAMP (0x00000001)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_TYPE_RELEASE_SEMAPHORE_WITH_TIMESTAMP (0x00000002)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_TYPE_RELEASE_ONE_WORD_SEMAPHORE (0x00000001)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_TYPE_RELEASE_FOUR_WORD_SEMAPHORE (0x00000002)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_TYPE_RELEASE_CONDITIONAL_INTR_SEMAPHORE (0x00000003)
#define NVC9B5_LAUNCH_DMA_INTERRUPT_TYPE 6:5
#define NVC9B5_LAUNCH_DMA_INTERRUPT_TYPE_NONE (0x00000000)
#define NVC9B5_LAUNCH_DMA_INTERRUPT_TYPE_BLOCKING (0x00000001)
#define NVC9B5_LAUNCH_DMA_INTERRUPT_TYPE_NON_BLOCKING (0x00000002)
#define NVC9B5_LAUNCH_DMA_SRC_MEMORY_LAYOUT 7:7
#define NVC9B5_LAUNCH_DMA_SRC_MEMORY_LAYOUT_BLOCKLINEAR (0x00000000)
#define NVC9B5_LAUNCH_DMA_SRC_MEMORY_LAYOUT_PITCH (0x00000001)
#define NVC9B5_LAUNCH_DMA_DST_MEMORY_LAYOUT 8:8
#define NVC9B5_LAUNCH_DMA_DST_MEMORY_LAYOUT_BLOCKLINEAR (0x00000000)
#define NVC9B5_LAUNCH_DMA_DST_MEMORY_LAYOUT_PITCH (0x00000001)
#define NVC9B5_LAUNCH_DMA_MULTI_LINE_ENABLE 9:9
#define NVC9B5_LAUNCH_DMA_MULTI_LINE_ENABLE_FALSE (0x00000000)
#define NVC9B5_LAUNCH_DMA_MULTI_LINE_ENABLE_TRUE (0x00000001)
#define NVC9B5_LAUNCH_DMA_REMAP_ENABLE 10:10
#define NVC9B5_LAUNCH_DMA_REMAP_ENABLE_FALSE (0x00000000)
#define NVC9B5_LAUNCH_DMA_REMAP_ENABLE_TRUE (0x00000001)
#define NVC9B5_LAUNCH_DMA_COMPRESSION_ENABLE 11:11
#define NVC9B5_LAUNCH_DMA_COMPRESSION_ENABLE_FALSE (0x00000000)
#define NVC9B5_LAUNCH_DMA_COMPRESSION_ENABLE_TRUE (0x00000001)
#define NVC9B5_LAUNCH_DMA_SRC_TYPE 12:12
#define NVC9B5_LAUNCH_DMA_SRC_TYPE_VIRTUAL (0x00000000)
#define NVC9B5_LAUNCH_DMA_SRC_TYPE_PHYSICAL (0x00000001)
#define NVC9B5_LAUNCH_DMA_DST_TYPE 13:13
#define NVC9B5_LAUNCH_DMA_DST_TYPE_VIRTUAL (0x00000000)
#define NVC9B5_LAUNCH_DMA_DST_TYPE_PHYSICAL (0x00000001)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION 17:14
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_IMIN (0x00000000)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_IMAX (0x00000001)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_IXOR (0x00000002)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_IAND (0x00000003)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_IOR (0x00000004)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_IADD (0x00000005)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_INC (0x00000006)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_DEC (0x00000007)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_INVALIDA (0x00000008)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_INVALIDB (0x00000009)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_FADD (0x0000000A)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_FMIN (0x0000000B)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_FMAX (0x0000000C)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_INVALIDC (0x0000000D)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_INVALIDD (0x0000000E)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_INVALIDE (0x0000000F)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_SIGN 18:18
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_SIGN_SIGNED (0x00000000)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_SIGN_UNSIGNED (0x00000001)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_ENABLE 19:19
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_ENABLE_FALSE (0x00000000)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_REDUCTION_ENABLE_TRUE (0x00000001)
#define NVC9B5_LAUNCH_DMA_COPY_TYPE 21:20
#define NVC9B5_LAUNCH_DMA_COPY_TYPE_PROT2PROT (0x00000000)
#define NVC9B5_LAUNCH_DMA_COPY_TYPE_DEFAULT (0x00000000)
#define NVC9B5_LAUNCH_DMA_COPY_TYPE_SECURE (0x00000001)
#define NVC9B5_LAUNCH_DMA_COPY_TYPE_NONPROT2NONPROT (0x00000002)
#define NVC9B5_LAUNCH_DMA_COPY_TYPE_RESERVED (0x00000003)
#define NVC9B5_LAUNCH_DMA_VPRMODE 22:22
#define NVC9B5_LAUNCH_DMA_VPRMODE_VPR_NONE (0x00000000)
#define NVC9B5_LAUNCH_DMA_VPRMODE_VPR_VID2VID (0x00000001)
#define NVC9B5_LAUNCH_DMA_MEMORY_SCRUB_ENABLE 23:23
#define NVC9B5_LAUNCH_DMA_MEMORY_SCRUB_ENABLE_FALSE (0x00000000)
#define NVC9B5_LAUNCH_DMA_MEMORY_SCRUB_ENABLE_TRUE (0x00000001)
#define NVC9B5_LAUNCH_DMA_RESERVED_START_OF_COPY 24:24
#define NVC9B5_LAUNCH_DMA_DISABLE_PLC 26:26
#define NVC9B5_LAUNCH_DMA_DISABLE_PLC_FALSE (0x00000000)
#define NVC9B5_LAUNCH_DMA_DISABLE_PLC_TRUE (0x00000001)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_PAYLOAD_SIZE 27:27
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_PAYLOAD_SIZE_ONE_WORD (0x00000000)
#define NVC9B5_LAUNCH_DMA_SEMAPHORE_PAYLOAD_SIZE_TWO_WORD (0x00000001)
#define NVC9B5_LAUNCH_DMA_RESERVED_ERR_CODE 31:28
#define NVC9B5_OFFSET_IN_UPPER (0x00000400)
#define NVC9B5_OFFSET_IN_UPPER_UPPER 24:0
#define NVC9B5_OFFSET_IN_LOWER (0x00000404)
#define NVC9B5_OFFSET_IN_LOWER_VALUE 31:0
#define NVC9B5_OFFSET_OUT_UPPER (0x00000408)
#define NVC9B5_OFFSET_OUT_UPPER_UPPER 24:0
#define NVC9B5_OFFSET_OUT_LOWER (0x0000040C)
#define NVC9B5_OFFSET_OUT_LOWER_VALUE 31:0
#define NVC9B5_PITCH_IN (0x00000410)
#define NVC9B5_PITCH_IN_VALUE 31:0
#define NVC9B5_PITCH_OUT (0x00000414)
#define NVC9B5_PITCH_OUT_VALUE 31:0
#define NVC9B5_LINE_LENGTH_IN (0x00000418)
#define NVC9B5_LINE_LENGTH_IN_VALUE 31:0
#define NVC9B5_LINE_COUNT (0x0000041C)
#define NVC9B5_LINE_COUNT_VALUE 31:0
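/*
 * Illustrative sketch (not part of the class header): a minimal 1D
 * virtual-to-virtual copy with this class. "push" stands in for the caller's
 * method emission mechanism and is an assumption of this example; shift
 * amounts follow the bit ranges of the LAUNCH_DMA fields above.
 * MULTI_LINE_ENABLE is left FALSE, so LineCount is not used.
 */
static inline void nvc9b5_example_copy_1d(void (*push)(NvU32 method, NvU32 data),
                                          NvU64 src_va, NvU64 dst_va, NvU32 size_bytes)
{
    push(NVC9B5_OFFSET_IN_UPPER,  (NvU32)(src_va >> 32));
    push(NVC9B5_OFFSET_IN_LOWER,  (NvU32)src_va);
    push(NVC9B5_OFFSET_OUT_UPPER, (NvU32)(dst_va >> 32));
    push(NVC9B5_OFFSET_OUT_LOWER, (NvU32)dst_va);
    push(NVC9B5_LINE_LENGTH_IN,   size_bytes);
    push(NVC9B5_LAUNCH_DMA,
         (NVC9B5_LAUNCH_DMA_DATA_TRANSFER_TYPE_NON_PIPELINED << 0)  |  /* bits 1:0 */
         (NVC9B5_LAUNCH_DMA_FLUSH_ENABLE_TRUE << 2)                 |  /* bit  2   */
         (NVC9B5_LAUNCH_DMA_SRC_MEMORY_LAYOUT_PITCH << 7)           |  /* bit  7   */
         (NVC9B5_LAUNCH_DMA_DST_MEMORY_LAYOUT_PITCH << 8)           |  /* bit  8   */
         (NVC9B5_LAUNCH_DMA_SRC_TYPE_VIRTUAL << 12)                 |  /* bit  12  */
         (NVC9B5_LAUNCH_DMA_DST_TYPE_VIRTUAL << 13));                  /* bit  13  */
}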
#define NVC9B5_SET_SECURE_COPY_MODE (0x00000500)
#define NVC9B5_SET_SECURE_COPY_MODE_MODE 0:0
#define NVC9B5_SET_SECURE_COPY_MODE_MODE_ENCRYPT (0x00000000)
#define NVC9B5_SET_SECURE_COPY_MODE_MODE_DECRYPT (0x00000001)
#define NVC9B5_SET_SECURE_COPY_MODE_RESERVED_SRC_TARGET 20:19
#define NVC9B5_SET_SECURE_COPY_MODE_RESERVED_SRC_TARGET_LOCAL_FB (0x00000000)
#define NVC9B5_SET_SECURE_COPY_MODE_RESERVED_SRC_TARGET_COHERENT_SYSMEM (0x00000001)
#define NVC9B5_SET_SECURE_COPY_MODE_RESERVED_SRC_TARGET_NONCOHERENT_SYSMEM (0x00000002)
#define NVC9B5_SET_SECURE_COPY_MODE_RESERVED_SRC_TARGET_PEERMEM (0x00000003)
#define NVC9B5_SET_SECURE_COPY_MODE_RESERVED_SRC_PEER_ID 23:21
#define NVC9B5_SET_SECURE_COPY_MODE_RESERVED_SRC_FLA 24:24
#define NVC9B5_SET_SECURE_COPY_MODE_RESERVED_DST_TARGET 26:25
#define NVC9B5_SET_SECURE_COPY_MODE_RESERVED_DST_TARGET_LOCAL_FB (0x00000000)
#define NVC9B5_SET_SECURE_COPY_MODE_RESERVED_DST_TARGET_COHERENT_SYSMEM (0x00000001)
#define NVC9B5_SET_SECURE_COPY_MODE_RESERVED_DST_TARGET_NONCOHERENT_SYSMEM (0x00000002)
#define NVC9B5_SET_SECURE_COPY_MODE_RESERVED_DST_TARGET_PEERMEM (0x00000003)
#define NVC9B5_SET_SECURE_COPY_MODE_RESERVED_DST_PEER_ID 29:27
#define NVC9B5_SET_SECURE_COPY_MODE_RESERVED_DST_FLA 30:30
#define NVC9B5_SET_SECURE_COPY_MODE_RESERVED_END_OF_COPY 31:31
#define NVC9B5_SET_DECRYPT_IV0 (0x00000504)
#define NVC9B5_SET_DECRYPT_IV0_VALUE 31:0
#define NVC9B5_SET_DECRYPT_IV1 (0x00000508)
#define NVC9B5_SET_DECRYPT_IV1_VALUE 31:0
#define NVC9B5_SET_DECRYPT_IV2 (0x0000050C)
#define NVC9B5_SET_DECRYPT_IV2_VALUE 31:0
#define NVC9B5_RESERVED_SET_AESCOUNTER (0x00000510)
#define NVC9B5_RESERVED_SET_AESCOUNTER_VALUE 31:0
#define NVC9B5_SET_DECRYPT_AUTH_TAG_COMPARE_ADDR_UPPER (0x00000514)
#define NVC9B5_SET_DECRYPT_AUTH_TAG_COMPARE_ADDR_UPPER_UPPER 24:0
#define NVC9B5_SET_DECRYPT_AUTH_TAG_COMPARE_ADDR_LOWER (0x00000518)
#define NVC9B5_SET_DECRYPT_AUTH_TAG_COMPARE_ADDR_LOWER_LOWER 31:0
#define NVC9B5_SET_ENCRYPT_AUTH_TAG_ADDR_UPPER (0x00000530)
#define NVC9B5_SET_ENCRYPT_AUTH_TAG_ADDR_UPPER_UPPER 24:0
#define NVC9B5_SET_ENCRYPT_AUTH_TAG_ADDR_LOWER (0x00000534)
#define NVC9B5_SET_ENCRYPT_AUTH_TAG_ADDR_LOWER_LOWER 31:0
#define NVC9B5_SET_ENCRYPT_IV_ADDR_UPPER (0x00000538)
#define NVC9B5_SET_ENCRYPT_IV_ADDR_UPPER_UPPER 24:0
#define NVC9B5_SET_ENCRYPT_IV_ADDR_LOWER (0x0000053C)
#define NVC9B5_SET_ENCRYPT_IV_ADDR_LOWER_LOWER 31:0
#define NVC9B5_SET_COMPRESSION_PARAMETERS (0x00000580)
#define NVC9B5_SET_COMPRESSION_PARAMETERS_OPERATION 0:0
#define NVC9B5_SET_COMPRESSION_PARAMETERS_OPERATION_DECOMPRESS (0x00000000)
#define NVC9B5_SET_COMPRESSION_PARAMETERS_OPERATION_COMPRESS (0x00000001)
#define NVC9B5_SET_COMPRESSION_PARAMETERS_ALGO 3:1
#define NVC9B5_SET_COMPRESSION_PARAMETERS_ALGO_SNAPPY (0x00000000)
#define NVC9B5_SET_COMPRESSION_PARAMETERS_ALGO_LZ4_DATA_ONLY (0x00000001)
#define NVC9B5_SET_COMPRESSION_PARAMETERS_ALGO_LZ4_BLOCK (0x00000002)
#define NVC9B5_SET_COMPRESSION_PARAMETERS_ALGO_LZ4_BLOCK_CHECKSUM (0x00000003)
#define NVC9B5_SET_COMPRESSION_PARAMETERS_ALGO_DEFLATE (0x00000004)
#define NVC9B5_SET_COMPRESSION_PARAMETERS_ALGO_SNAPPY_WITH_LONG_FETCH (0x00000005)
#define NVC9B5_SET_COMPRESSION_PARAMETERS_CHECK_SUM 29:28
#define NVC9B5_SET_COMPRESSION_PARAMETERS_CHECK_SUM_NONE (0x00000000)
#define NVC9B5_SET_COMPRESSION_PARAMETERS_CHECK_SUM_ADLER32 (0x00000001)
#define NVC9B5_SET_COMPRESSION_PARAMETERS_CHECK_SUM_CRC32 (0x00000002)
#define NVC9B5_SET_COMPRESSION_PARAMETERS_CHECK_SUM_SNAPPY_CRC (0x00000003)
#define NVC9B5_SET_DECOMPRESS_OUT_LENGTH (0x00000584)
#define NVC9B5_SET_DECOMPRESS_OUT_LENGTH_V 31:0
#define NVC9B5_SET_DECOMPRESS_OUT_LENGTH_ADDR_UPPER (0x00000588)
#define NVC9B5_SET_DECOMPRESS_OUT_LENGTH_ADDR_UPPER_UPPER 24:0
#define NVC9B5_SET_DECOMPRESS_OUT_LENGTH_ADDR_LOWER (0x0000058C)
#define NVC9B5_SET_DECOMPRESS_OUT_LENGTH_ADDR_LOWER_LOWER 31:0
#define NVC9B5_SET_DECOMPRESS_CHECKSUM (0x00000590)
#define NVC9B5_SET_DECOMPRESS_CHECKSUM_V 31:0
#define NVC9B5_SET_MEMORY_SCRUB_PARAMETERS (0x000006FC)
#define NVC9B5_SET_MEMORY_SCRUB_PARAMETERS_DISCARDABLE 0:0
#define NVC9B5_SET_MEMORY_SCRUB_PARAMETERS_DISCARDABLE_FALSE (0x00000000)
#define NVC9B5_SET_MEMORY_SCRUB_PARAMETERS_DISCARDABLE_TRUE (0x00000001)
#define NVC9B5_SET_REMAP_CONST_A (0x00000700)
#define NVC9B5_SET_REMAP_CONST_A_V 31:0
#define NVC9B5_SET_REMAP_CONST_B (0x00000704)
#define NVC9B5_SET_REMAP_CONST_B_V 31:0
#define NVC9B5_SET_REMAP_COMPONENTS (0x00000708)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_X 2:0
#define NVC9B5_SET_REMAP_COMPONENTS_DST_X_SRC_X (0x00000000)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_X_SRC_Y (0x00000001)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_X_SRC_Z (0x00000002)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_X_SRC_W (0x00000003)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_X_CONST_A (0x00000004)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_X_CONST_B (0x00000005)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_X_NO_WRITE (0x00000006)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_Y 6:4
#define NVC9B5_SET_REMAP_COMPONENTS_DST_Y_SRC_X (0x00000000)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_Y_SRC_Y (0x00000001)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_Y_SRC_Z (0x00000002)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_Y_SRC_W (0x00000003)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_Y_CONST_A (0x00000004)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_Y_CONST_B (0x00000005)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_Y_NO_WRITE (0x00000006)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_Z 10:8
#define NVC9B5_SET_REMAP_COMPONENTS_DST_Z_SRC_X (0x00000000)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_Z_SRC_Y (0x00000001)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_Z_SRC_Z (0x00000002)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_Z_SRC_W (0x00000003)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_Z_CONST_A (0x00000004)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_Z_CONST_B (0x00000005)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_Z_NO_WRITE (0x00000006)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_W 14:12
#define NVC9B5_SET_REMAP_COMPONENTS_DST_W_SRC_X (0x00000000)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_W_SRC_Y (0x00000001)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_W_SRC_Z (0x00000002)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_W_SRC_W (0x00000003)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_W_CONST_A (0x00000004)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_W_CONST_B (0x00000005)
#define NVC9B5_SET_REMAP_COMPONENTS_DST_W_NO_WRITE (0x00000006)
#define NVC9B5_SET_REMAP_COMPONENTS_COMPONENT_SIZE 17:16
#define NVC9B5_SET_REMAP_COMPONENTS_COMPONENT_SIZE_ONE (0x00000000)
#define NVC9B5_SET_REMAP_COMPONENTS_COMPONENT_SIZE_TWO (0x00000001)
#define NVC9B5_SET_REMAP_COMPONENTS_COMPONENT_SIZE_THREE (0x00000002)
#define NVC9B5_SET_REMAP_COMPONENTS_COMPONENT_SIZE_FOUR (0x00000003)
#define NVC9B5_SET_REMAP_COMPONENTS_NUM_SRC_COMPONENTS 21:20
#define NVC9B5_SET_REMAP_COMPONENTS_NUM_SRC_COMPONENTS_ONE (0x00000000)
#define NVC9B5_SET_REMAP_COMPONENTS_NUM_SRC_COMPONENTS_TWO (0x00000001)
#define NVC9B5_SET_REMAP_COMPONENTS_NUM_SRC_COMPONENTS_THREE (0x00000002)
#define NVC9B5_SET_REMAP_COMPONENTS_NUM_SRC_COMPONENTS_FOUR (0x00000003)
#define NVC9B5_SET_REMAP_COMPONENTS_NUM_DST_COMPONENTS 25:24
#define NVC9B5_SET_REMAP_COMPONENTS_NUM_DST_COMPONENTS_ONE (0x00000000)
#define NVC9B5_SET_REMAP_COMPONENTS_NUM_DST_COMPONENTS_TWO (0x00000001)
#define NVC9B5_SET_REMAP_COMPONENTS_NUM_DST_COMPONENTS_THREE (0x00000002)
#define NVC9B5_SET_REMAP_COMPONENTS_NUM_DST_COMPONENTS_FOUR (0x00000003)
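/*
 * Illustrative sketch (not part of the class header): a 32-bit memset can be
 * expressed as a copy with remap enabled, sourcing every destination element
 * from REMAP_CONST_A. The value below packs DST_X = CONST_A with a four-byte
 * component size and one destination component; shift amounts follow the bit
 * ranges above. Under this usage LAUNCH_DMA sets REMAP_ENABLE_TRUE and
 * LINE_LENGTH_IN is counted in elements of the component size (an assumption
 * of this sketch).
 */
static inline NvU32 nvc9b5_example_remap_memset32_components(void)
{
    return (NVC9B5_SET_REMAP_COMPONENTS_DST_X_CONST_A          << 0)  |  /* bits 2:0   */
           (NVC9B5_SET_REMAP_COMPONENTS_COMPONENT_SIZE_FOUR    << 16) |  /* bits 17:16 */
           (NVC9B5_SET_REMAP_COMPONENTS_NUM_SRC_COMPONENTS_ONE << 20) |  /* bits 21:20 */
           (NVC9B5_SET_REMAP_COMPONENTS_NUM_DST_COMPONENTS_ONE << 24);   /* bits 25:24 */
}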
#define NVC9B5_SET_DST_BLOCK_SIZE (0x0000070C)
#define NVC9B5_SET_DST_BLOCK_SIZE_WIDTH 3:0
#define NVC9B5_SET_DST_BLOCK_SIZE_WIDTH_ONE_GOB (0x00000000)
#define NVC9B5_SET_DST_BLOCK_SIZE_HEIGHT 7:4
#define NVC9B5_SET_DST_BLOCK_SIZE_HEIGHT_ONE_GOB (0x00000000)
#define NVC9B5_SET_DST_BLOCK_SIZE_HEIGHT_TWO_GOBS (0x00000001)
#define NVC9B5_SET_DST_BLOCK_SIZE_HEIGHT_FOUR_GOBS (0x00000002)
#define NVC9B5_SET_DST_BLOCK_SIZE_HEIGHT_EIGHT_GOBS (0x00000003)
#define NVC9B5_SET_DST_BLOCK_SIZE_HEIGHT_SIXTEEN_GOBS (0x00000004)
#define NVC9B5_SET_DST_BLOCK_SIZE_HEIGHT_THIRTYTWO_GOBS (0x00000005)
#define NVC9B5_SET_DST_BLOCK_SIZE_DEPTH 11:8
#define NVC9B5_SET_DST_BLOCK_SIZE_DEPTH_ONE_GOB (0x00000000)
#define NVC9B5_SET_DST_BLOCK_SIZE_DEPTH_TWO_GOBS (0x00000001)
#define NVC9B5_SET_DST_BLOCK_SIZE_DEPTH_FOUR_GOBS (0x00000002)
#define NVC9B5_SET_DST_BLOCK_SIZE_DEPTH_EIGHT_GOBS (0x00000003)
#define NVC9B5_SET_DST_BLOCK_SIZE_DEPTH_SIXTEEN_GOBS (0x00000004)
#define NVC9B5_SET_DST_BLOCK_SIZE_DEPTH_THIRTYTWO_GOBS (0x00000005)
#define NVC9B5_SET_DST_BLOCK_SIZE_GOB_HEIGHT 15:12
#define NVC9B5_SET_DST_BLOCK_SIZE_GOB_HEIGHT_GOB_HEIGHT_FERMI_8 (0x00000001)
#define NVC9B5_SET_DST_WIDTH (0x00000710)
#define NVC9B5_SET_DST_WIDTH_V 31:0
#define NVC9B5_SET_DST_HEIGHT (0x00000714)
#define NVC9B5_SET_DST_HEIGHT_V 31:0
#define NVC9B5_SET_DST_DEPTH (0x00000718)
#define NVC9B5_SET_DST_DEPTH_V 31:0
#define NVC9B5_SET_DST_LAYER (0x0000071C)
#define NVC9B5_SET_DST_LAYER_V 31:0
#define NVC9B5_SET_DST_ORIGIN (0x00000720)
#define NVC9B5_SET_DST_ORIGIN_X 15:0
#define NVC9B5_SET_DST_ORIGIN_Y 31:16
#define NVC9B5_SET_SRC_BLOCK_SIZE (0x00000728)
#define NVC9B5_SET_SRC_BLOCK_SIZE_WIDTH 3:0
#define NVC9B5_SET_SRC_BLOCK_SIZE_WIDTH_ONE_GOB (0x00000000)
#define NVC9B5_SET_SRC_BLOCK_SIZE_HEIGHT 7:4
#define NVC9B5_SET_SRC_BLOCK_SIZE_HEIGHT_ONE_GOB (0x00000000)
#define NVC9B5_SET_SRC_BLOCK_SIZE_HEIGHT_TWO_GOBS (0x00000001)
#define NVC9B5_SET_SRC_BLOCK_SIZE_HEIGHT_FOUR_GOBS (0x00000002)
#define NVC9B5_SET_SRC_BLOCK_SIZE_HEIGHT_EIGHT_GOBS (0x00000003)
#define NVC9B5_SET_SRC_BLOCK_SIZE_HEIGHT_SIXTEEN_GOBS (0x00000004)
#define NVC9B5_SET_SRC_BLOCK_SIZE_HEIGHT_THIRTYTWO_GOBS (0x00000005)
#define NVC9B5_SET_SRC_BLOCK_SIZE_DEPTH 11:8
#define NVC9B5_SET_SRC_BLOCK_SIZE_DEPTH_ONE_GOB (0x00000000)
#define NVC9B5_SET_SRC_BLOCK_SIZE_DEPTH_TWO_GOBS (0x00000001)
#define NVC9B5_SET_SRC_BLOCK_SIZE_DEPTH_FOUR_GOBS (0x00000002)
#define NVC9B5_SET_SRC_BLOCK_SIZE_DEPTH_EIGHT_GOBS (0x00000003)
#define NVC9B5_SET_SRC_BLOCK_SIZE_DEPTH_SIXTEEN_GOBS (0x00000004)
#define NVC9B5_SET_SRC_BLOCK_SIZE_DEPTH_THIRTYTWO_GOBS (0x00000005)
#define NVC9B5_SET_SRC_BLOCK_SIZE_GOB_HEIGHT 15:12
#define NVC9B5_SET_SRC_BLOCK_SIZE_GOB_HEIGHT_GOB_HEIGHT_FERMI_8 (0x00000001)
#define NVC9B5_SET_SRC_WIDTH (0x0000072C)
#define NVC9B5_SET_SRC_WIDTH_V 31:0
#define NVC9B5_SET_SRC_HEIGHT (0x00000730)
#define NVC9B5_SET_SRC_HEIGHT_V 31:0
#define NVC9B5_SET_SRC_DEPTH (0x00000734)
#define NVC9B5_SET_SRC_DEPTH_V 31:0
#define NVC9B5_SET_SRC_LAYER (0x00000738)
#define NVC9B5_SET_SRC_LAYER_V 31:0
#define NVC9B5_SET_SRC_ORIGIN (0x0000073C)
#define NVC9B5_SET_SRC_ORIGIN_X 15:0
#define NVC9B5_SET_SRC_ORIGIN_Y 31:16
#define NVC9B5_SRC_ORIGIN_X (0x00000744)
#define NVC9B5_SRC_ORIGIN_X_VALUE 31:0
#define NVC9B5_SRC_ORIGIN_Y (0x00000748)
#define NVC9B5_SRC_ORIGIN_Y_VALUE 31:0
#define NVC9B5_DST_ORIGIN_X (0x0000074C)
#define NVC9B5_DST_ORIGIN_X_VALUE 31:0
#define NVC9B5_DST_ORIGIN_Y (0x00000750)
#define NVC9B5_DST_ORIGIN_Y_VALUE 31:0
#define NVC9B5_PM_TRIGGER_END (0x00001114)
#define NVC9B5_PM_TRIGGER_END_V 31:0
#ifdef __cplusplus
}; /* extern "C" */
#endif
#endif // _clc9b5_h_


@@ -34,6 +34,7 @@
#define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GA100 (0x00000170)
#define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GH100 (0x00000180)
#define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_AD100 (0x00000190)
#define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GB100 (0x000001A0)
/* valid ARCHITECTURE_GP10x implementation values */
#define NV2080_CTRL_MC_ARCH_INFO_IMPLEMENTATION_GP100 (0x00000000)


@@ -0,0 +1,546 @@
/*******************************************************************************
Copyright (c) 2003-2016 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*******************************************************************************/
#ifndef __gb100_dev_fault_h__
#define __gb100_dev_fault_h__
/* This file is autogenerated. Do not edit */
#define NV_PFAULT /* ----G */
#define NV_PFAULT_MMU_ENG_ID_GRAPHICS 384 /* */
#define NV_PFAULT_MMU_ENG_ID_DISPLAY 1 /* */
#define NV_PFAULT_MMU_ENG_ID_GSP 2 /* */
#define NV_PFAULT_MMU_ENG_ID_IFB 55 /* */
#define NV_PFAULT_MMU_ENG_ID_FLA 4 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1 256 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2 320 /* */
#define NV_PFAULT_MMU_ENG_ID_SEC 6 /* */
#define NV_PFAULT_MMU_ENG_ID_FSP 7 /* */
#define NV_PFAULT_MMU_ENG_ID_PERF 10 /* */
#define NV_PFAULT_MMU_ENG_ID_PERF0 10 /* */
#define NV_PFAULT_MMU_ENG_ID_PERF1 11 /* */
#define NV_PFAULT_MMU_ENG_ID_PERF2 12 /* */
#define NV_PFAULT_MMU_ENG_ID_PERF3 13 /* */
#define NV_PFAULT_MMU_ENG_ID_PERF4 14 /* */
#define NV_PFAULT_MMU_ENG_ID_PERF5 15 /* */
#define NV_PFAULT_MMU_ENG_ID_PERF6 16 /* */
#define NV_PFAULT_MMU_ENG_ID_PERF7 17 /* */
#define NV_PFAULT_MMU_ENG_ID_PERF8 18 /* */
#define NV_PFAULT_MMU_ENG_ID_PERF9 19 /* */
#define NV_PFAULT_MMU_ENG_ID_GSPLITE 20 /* */
#define NV_PFAULT_MMU_ENG_ID_NVDEC 28 /* */
#define NV_PFAULT_MMU_ENG_ID_NVDEC0 28 /* */
#define NV_PFAULT_MMU_ENG_ID_NVDEC1 29 /* */
#define NV_PFAULT_MMU_ENG_ID_NVDEC2 30 /* */
#define NV_PFAULT_MMU_ENG_ID_NVDEC3 31 /* */
#define NV_PFAULT_MMU_ENG_ID_NVDEC4 32 /* */
#define NV_PFAULT_MMU_ENG_ID_NVDEC5 33 /* */
#define NV_PFAULT_MMU_ENG_ID_NVDEC6 34 /* */
#define NV_PFAULT_MMU_ENG_ID_NVDEC7 35 /* */
#define NV_PFAULT_MMU_ENG_ID_NVJPG0 36 /* */
#define NV_PFAULT_MMU_ENG_ID_NVJPG1 37 /* */
#define NV_PFAULT_MMU_ENG_ID_NVJPG2 38 /* */
#define NV_PFAULT_MMU_ENG_ID_NVJPG3 39 /* */
#define NV_PFAULT_MMU_ENG_ID_NVJPG4 40 /* */
#define NV_PFAULT_MMU_ENG_ID_NVJPG5 41 /* */
#define NV_PFAULT_MMU_ENG_ID_NVJPG6 42 /* */
#define NV_PFAULT_MMU_ENG_ID_NVJPG7 43 /* */
#define NV_PFAULT_MMU_ENG_ID_GRCOPY 65 /* */
#define NV_PFAULT_MMU_ENG_ID_CE0 65 /* */
#define NV_PFAULT_MMU_ENG_ID_CE1 66 /* */
#define NV_PFAULT_MMU_ENG_ID_CE2 67 /* */
#define NV_PFAULT_MMU_ENG_ID_CE3 68 /* */
#define NV_PFAULT_MMU_ENG_ID_CE4 69 /* */
#define NV_PFAULT_MMU_ENG_ID_CE5 70 /* */
#define NV_PFAULT_MMU_ENG_ID_CE6 71 /* */
#define NV_PFAULT_MMU_ENG_ID_CE7 72 /* */
#define NV_PFAULT_MMU_ENG_ID_CE8 73 /* */
#define NV_PFAULT_MMU_ENG_ID_CE9 74 /* */
#define NV_PFAULT_MMU_ENG_ID_CE10 75 /* */
#define NV_PFAULT_MMU_ENG_ID_CE11 76 /* */
#define NV_PFAULT_MMU_ENG_ID_CE12 77 /* */
#define NV_PFAULT_MMU_ENG_ID_CE13 78 /* */
#define NV_PFAULT_MMU_ENG_ID_CE14 79 /* */
#define NV_PFAULT_MMU_ENG_ID_CE15 80 /* */
#define NV_PFAULT_MMU_ENG_ID_CE16 81 /* */
#define NV_PFAULT_MMU_ENG_ID_CE17 82 /* */
#define NV_PFAULT_MMU_ENG_ID_CE18 83 /* */
#define NV_PFAULT_MMU_ENG_ID_CE19 84 /* */
#define NV_PFAULT_MMU_ENG_ID_PWR_PMU 5 /* */
#define NV_PFAULT_MMU_ENG_ID_PTP 3 /* */
#define NV_PFAULT_MMU_ENG_ID_NVENC0 44 /* */
#define NV_PFAULT_MMU_ENG_ID_NVENC1 45 /* */
#define NV_PFAULT_MMU_ENG_ID_NVENC2 46 /* */
#define NV_PFAULT_MMU_ENG_ID_NVENC3 47 /* */
#define NV_PFAULT_MMU_ENG_ID_OFA0 48 /* */
#define NV_PFAULT_MMU_ENG_ID_PHYSICAL 56 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST0 85 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST1 86 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST2 87 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST3 88 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST4 89 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST5 90 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST6 91 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST7 92 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST8 93 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST9 94 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST10 95 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST11 96 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST12 97 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST13 98 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST14 99 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST15 100 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST16 101 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST17 102 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST18 103 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST19 104 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST20 105 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST21 106 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST22 107 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST23 108 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST24 109 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST25 110 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST26 111 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST27 112 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST28 113 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST29 114 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST30 115 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST31 116 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST32 117 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST33 118 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST34 119 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST35 120 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST36 121 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST37 122 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST38 123 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST39 124 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST40 125 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST41 126 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST42 127 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST43 128 /* */
#define NV_PFAULT_MMU_ENG_ID_HOST44 129 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN0 256 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN1 257 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN2 258 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN3 259 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN4 260 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN5 261 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN6 262 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN7 263 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN8 264 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN9 265 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN10 266 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN11 267 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN12 268 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN13 269 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN14 270 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN15 271 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN16 272 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN17 273 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN18 274 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN19 275 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN20 276 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN21 277 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN22 278 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN23 279 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN24 280 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN25 281 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN26 282 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN27 283 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN28 284 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN29 285 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN30 286 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN31 287 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN32 288 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN33 289 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN34 290 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN35 291 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN36 292 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN37 293 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN38 294 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN39 295 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN40 296 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN41 297 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN42 298 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN43 299 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN44 300 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN45 301 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN46 302 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN47 303 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN48 304 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN49 305 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN50 306 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN51 307 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN52 308 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN53 309 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN54 310 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN55 311 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN56 312 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN57 313 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN58 314 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN59 315 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN60 316 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN61 317 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN62 318 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR1_FN63 319 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN0 320 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN1 321 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN2 322 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN3 323 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN4 324 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN5 325 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN6 326 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN7 327 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN8 328 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN9 329 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN10 330 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN11 331 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN12 332 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN13 333 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN14 334 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN15 335 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN16 336 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN17 337 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN18 338 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN19 339 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN20 340 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN21 341 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN22 342 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN23 343 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN24 344 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN25 345 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN26 346 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN27 347 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN28 348 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN29 349 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN30 350 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN31 351 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN32 352 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN33 353 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN34 354 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN35 355 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN36 356 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN37 357 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN38 358 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN39 359 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN40 360 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN41 361 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN42 362 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN43 363 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN44 364 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN45 365 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN46 366 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN47 367 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN48 368 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN49 369 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN50 370 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN51 371 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN52 372 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN53 373 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN54 374 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN55 375 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN56 376 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN57 377 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN58 378 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN59 379 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN60 380 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN61 381 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN62 382 /* */
#define NV_PFAULT_MMU_ENG_ID_BAR2_FN63 383 /* */
#define NV_PFAULT_FAULT_TYPE 4:0 /* */
#define NV_PFAULT_FAULT_TYPE_PDE 0x00000000 /* */
#define NV_PFAULT_FAULT_TYPE_PDE_SIZE 0x00000001 /* */
#define NV_PFAULT_FAULT_TYPE_PTE 0x00000002 /* */
#define NV_PFAULT_FAULT_TYPE_VA_LIMIT_VIOLATION 0x00000003 /* */
#define NV_PFAULT_FAULT_TYPE_UNBOUND_INST_BLOCK 0x00000004 /* */
#define NV_PFAULT_FAULT_TYPE_PRIV_VIOLATION 0x00000005 /* */
#define NV_PFAULT_FAULT_TYPE_RO_VIOLATION 0x00000006 /* */
#define NV_PFAULT_FAULT_TYPE_WO_VIOLATION 0x00000007 /* */
#define NV_PFAULT_FAULT_TYPE_PITCH_MASK_VIOLATION 0x00000008 /* */
#define NV_PFAULT_FAULT_TYPE_WORK_CREATION 0x00000009 /* */
#define NV_PFAULT_FAULT_TYPE_UNSUPPORTED_APERTURE 0x0000000a /* */
#define NV_PFAULT_FAULT_TYPE_CC_VIOLATION 0x0000000b /* */
#define NV_PFAULT_FAULT_TYPE_UNSUPPORTED_KIND 0x0000000c /* */
#define NV_PFAULT_FAULT_TYPE_REGION_VIOLATION 0x0000000d /* */
#define NV_PFAULT_FAULT_TYPE_POISONED 0x0000000e /* */
#define NV_PFAULT_FAULT_TYPE_ATOMIC_VIOLATION 0x0000000f /* */
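/*
 * Illustrative sketch (not part of the original header): the definitions in
 * this file use the NVIDIA high:low bit-range convention, so FAULT_TYPE
 * occupies bits 4:0 of the fault-information word it is decoded from. A
 * helper such as the one below (an assumption of this example, not an NVIDIA
 * macro) extracts that field.
 */
static inline unsigned int nv_pfault_example_fault_type(unsigned int fault_info)
{
    return fault_info & 0x1f;   /* bits 4:0, e.g. 0x2 == NV_PFAULT_FAULT_TYPE_PTE */
}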
#define NV_PFAULT_CLIENT 14:8 /* */
#define NV_PFAULT_CLIENT_GPC_T1_0 0x00000000 /* */
#define NV_PFAULT_CLIENT_GPC_T1_1 0x00000001 /* */
#define NV_PFAULT_CLIENT_GPC_T1_2 0x00000002 /* */
#define NV_PFAULT_CLIENT_GPC_T1_3 0x00000003 /* */
#define NV_PFAULT_CLIENT_GPC_T1_4 0x00000004 /* */
#define NV_PFAULT_CLIENT_GPC_T1_5 0x00000005 /* */
#define NV_PFAULT_CLIENT_GPC_T1_6 0x00000006 /* */
#define NV_PFAULT_CLIENT_GPC_T1_7 0x00000007 /* */
#define NV_PFAULT_CLIENT_GPC_PE_0 0x00000008 /* */
#define NV_PFAULT_CLIENT_GPC_PE_1 0x00000009 /* */
#define NV_PFAULT_CLIENT_GPC_PE_2 0x0000000A /* */
#define NV_PFAULT_CLIENT_GPC_PE_3 0x0000000B /* */
#define NV_PFAULT_CLIENT_GPC_PE_4 0x0000000C /* */
#define NV_PFAULT_CLIENT_GPC_PE_5 0x0000000D /* */
#define NV_PFAULT_CLIENT_GPC_PE_6 0x0000000E /* */
#define NV_PFAULT_CLIENT_GPC_PE_7 0x0000000F /* */
#define NV_PFAULT_CLIENT_GPC_RAST 0x00000010 /* */
#define NV_PFAULT_CLIENT_GPC_GCC 0x00000011 /* */
#define NV_PFAULT_CLIENT_GPC_GPCCS 0x00000012 /* */
#define NV_PFAULT_CLIENT_GPC_PROP_0 0x00000013 /* */
#define NV_PFAULT_CLIENT_GPC_PROP_1 0x00000014 /* */
#define NV_PFAULT_CLIENT_GPC_PROP_2 0x00000015 /* */
#define NV_PFAULT_CLIENT_GPC_PROP_3 0x00000016 /* */
#define NV_PFAULT_CLIENT_GPC_T1_8 0x00000021 /* */
#define NV_PFAULT_CLIENT_GPC_T1_9 0x00000022 /* */
#define NV_PFAULT_CLIENT_GPC_T1_10 0x00000023 /* */
#define NV_PFAULT_CLIENT_GPC_T1_11 0x00000024 /* */
#define NV_PFAULT_CLIENT_GPC_T1_12 0x00000025 /* */
#define NV_PFAULT_CLIENT_GPC_T1_13 0x00000026 /* */
#define NV_PFAULT_CLIENT_GPC_T1_14 0x00000027 /* */
#define NV_PFAULT_CLIENT_GPC_T1_15 0x00000028 /* */
#define NV_PFAULT_CLIENT_GPC_TPCCS_0 0x00000029 /* */
#define NV_PFAULT_CLIENT_GPC_TPCCS_1 0x0000002A /* */
#define NV_PFAULT_CLIENT_GPC_TPCCS_2 0x0000002B /* */
#define NV_PFAULT_CLIENT_GPC_TPCCS_3 0x0000002C /* */
#define NV_PFAULT_CLIENT_GPC_TPCCS_4 0x0000002D /* */
#define NV_PFAULT_CLIENT_GPC_TPCCS_5 0x0000002E /* */
#define NV_PFAULT_CLIENT_GPC_TPCCS_6 0x0000002F /* */
#define NV_PFAULT_CLIENT_GPC_TPCCS_7 0x00000030 /* */
#define NV_PFAULT_CLIENT_GPC_PE_8 0x00000031 /* */
#define NV_PFAULT_CLIENT_GPC_PE_9 0x00000032 /* */
#define NV_PFAULT_CLIENT_GPC_TPCCS_8 0x00000033 /* */
#define NV_PFAULT_CLIENT_GPC_TPCCS_9 0x00000034 /* */
#define NV_PFAULT_CLIENT_GPC_T1_16 0x00000035 /* */
#define NV_PFAULT_CLIENT_GPC_T1_17 0x00000036 /* */
#define NV_PFAULT_CLIENT_GPC_T1_18 0x00000037 /* */
#define NV_PFAULT_CLIENT_GPC_T1_19 0x00000038 /* */
#define NV_PFAULT_CLIENT_GPC_PE_10 0x00000039 /* */
#define NV_PFAULT_CLIENT_GPC_PE_11 0x0000003A /* */
#define NV_PFAULT_CLIENT_GPC_TPCCS_10 0x0000003B /* */
#define NV_PFAULT_CLIENT_GPC_TPCCS_11 0x0000003C /* */
#define NV_PFAULT_CLIENT_GPC_T1_20 0x0000003D /* */
#define NV_PFAULT_CLIENT_GPC_T1_21 0x0000003E /* */
#define NV_PFAULT_CLIENT_GPC_T1_22 0x0000003F /* */
#define NV_PFAULT_CLIENT_GPC_T1_23 0x00000040 /* */
#define NV_PFAULT_CLIENT_GPC_PE_12 0x00000041 /* */
#define NV_PFAULT_CLIENT_GPC_PE_13 0x00000042 /* */
#define NV_PFAULT_CLIENT_GPC_TPCCS_12 0x00000043 /* */
#define NV_PFAULT_CLIENT_GPC_TPCCS_13 0x00000044 /* */
#define NV_PFAULT_CLIENT_GPC_T1_24 0x00000045 /* */
#define NV_PFAULT_CLIENT_GPC_T1_25 0x00000046 /* */
#define NV_PFAULT_CLIENT_GPC_T1_26 0x00000047 /* */
#define NV_PFAULT_CLIENT_GPC_T1_27 0x00000048 /* */
#define NV_PFAULT_CLIENT_GPC_PE_14 0x00000049 /* */
#define NV_PFAULT_CLIENT_GPC_PE_15 0x0000004A /* */
#define NV_PFAULT_CLIENT_GPC_TPCCS_14 0x0000004B /* */
#define NV_PFAULT_CLIENT_GPC_TPCCS_15 0x0000004C /* */
#define NV_PFAULT_CLIENT_GPC_T1_28 0x0000004D /* */
#define NV_PFAULT_CLIENT_GPC_T1_29 0x0000004E /* */
#define NV_PFAULT_CLIENT_GPC_T1_30 0x0000004F /* */
#define NV_PFAULT_CLIENT_GPC_T1_31 0x00000050 /* */
#define NV_PFAULT_CLIENT_GPC_PE_16 0x00000051 /* */
#define NV_PFAULT_CLIENT_GPC_PE_17 0x00000052 /* */
#define NV_PFAULT_CLIENT_GPC_TPCCS_16 0x00000053 /* */
#define NV_PFAULT_CLIENT_GPC_TPCCS_17 0x00000054 /* */
#define NV_PFAULT_CLIENT_GPC_T1_32 0x00000055 /* */
#define NV_PFAULT_CLIENT_GPC_T1_33 0x00000056 /* */
#define NV_PFAULT_CLIENT_GPC_T1_34 0x00000057 /* */
#define NV_PFAULT_CLIENT_GPC_T1_35 0x00000058 /* */
#define NV_PFAULT_CLIENT_GPC_PE_18 0x00000059 /* */
#define NV_PFAULT_CLIENT_GPC_PE_19 0x0000005A /* */
#define NV_PFAULT_CLIENT_GPC_TPCCS_18 0x0000005B /* */
#define NV_PFAULT_CLIENT_GPC_TPCCS_19 0x0000005C /* */
#define NV_PFAULT_CLIENT_GPC_T1_36 0x0000005D /* */
#define NV_PFAULT_CLIENT_GPC_T1_37 0x0000005E /* */
#define NV_PFAULT_CLIENT_GPC_T1_38 0x0000005F /* */
#define NV_PFAULT_CLIENT_GPC_T1_39 0x00000060 /* */
#define NV_PFAULT_CLIENT_GPC_ROP_0 0x00000070 /* */
#define NV_PFAULT_CLIENT_GPC_ROP_1 0x00000071 /* */
#define NV_PFAULT_CLIENT_GPC_ROP_2 0x00000072 /* */
#define NV_PFAULT_CLIENT_GPC_ROP_3 0x00000073 /* */
#define NV_PFAULT_CLIENT_GPC_GPM 0x00000017 /* */
#define NV_PFAULT_CLIENT_HUB_VIP 0x00000000 /* */
#define NV_PFAULT_CLIENT_HUB_CE0 0x00000001 /* */
#define NV_PFAULT_CLIENT_HUB_CE1 0x00000002 /* */
#define NV_PFAULT_CLIENT_HUB_DNISO 0x00000003 /* */
#define NV_PFAULT_CLIENT_HUB_DISPNISO 0x00000003 /* */
#define NV_PFAULT_CLIENT_HUB_FE0 0x00000004 /* */
#define NV_PFAULT_CLIENT_HUB_FE 0x00000004 /* */
#define NV_PFAULT_CLIENT_HUB_FECS0 0x00000005 /* */
#define NV_PFAULT_CLIENT_HUB_FECS 0x00000005 /* */
#define NV_PFAULT_CLIENT_HUB_HOST 0x00000006 /* */
#define NV_PFAULT_CLIENT_HUB_HOST_CPU 0x00000007 /* */
#define NV_PFAULT_CLIENT_HUB_HOST_CPU_NB 0x00000008 /* */
#define NV_PFAULT_CLIENT_HUB_ISO 0x00000009 /* */
#define NV_PFAULT_CLIENT_HUB_MMU 0x0000000A /* */
#define NV_PFAULT_CLIENT_HUB_NVDEC0 0x0000000B /* */
#define NV_PFAULT_CLIENT_HUB_NVDEC 0x0000000B /* */
#define NV_PFAULT_CLIENT_HUB_CE3 0x0000000C /* */
#define NV_PFAULT_CLIENT_HUB_NVENC1 0x0000000D /* */
#define NV_PFAULT_CLIENT_HUB_NISO 0x0000000E /* */
#define NV_PFAULT_CLIENT_HUB_ACTRS 0x0000000E /* */
#define NV_PFAULT_CLIENT_HUB_P2P 0x0000000F /* */
#define NV_PFAULT_CLIENT_HUB_PD 0x00000010 /* */
#define NV_PFAULT_CLIENT_HUB_PD0 0x00000010 /* */
#define NV_PFAULT_CLIENT_HUB_PERF0 0x00000011 /* */
#define NV_PFAULT_CLIENT_HUB_PERF 0x00000011 /* */
#define NV_PFAULT_CLIENT_HUB_PMU 0x00000012 /* */
#define NV_PFAULT_CLIENT_HUB_RASTERTWOD 0x00000013 /* */
#define NV_PFAULT_CLIENT_HUB_RASTERTWOD0 0x00000013 /* */
#define NV_PFAULT_CLIENT_HUB_SCC 0x00000014 /* */
#define NV_PFAULT_CLIENT_HUB_SCC0 0x00000014 /* */
#define NV_PFAULT_CLIENT_HUB_SCC_NB 0x00000015 /* */
#define NV_PFAULT_CLIENT_HUB_SCC_NB0 0x00000015 /* */
#define NV_PFAULT_CLIENT_HUB_SEC 0x00000016 /* */
#define NV_PFAULT_CLIENT_HUB_SSYNC 0x00000017 /* */
#define NV_PFAULT_CLIENT_HUB_SSYNC0 0x00000017 /* */
#define NV_PFAULT_CLIENT_HUB_GRCOPY 0x00000018 /* */
#define NV_PFAULT_CLIENT_HUB_CE2 0x00000018 /* */
#define NV_PFAULT_CLIENT_HUB_XV 0x00000019 /* */
#define NV_PFAULT_CLIENT_HUB_MMU_NB 0x0000001A /* */
#define NV_PFAULT_CLIENT_HUB_NVENC0 0x0000001B /* */
#define NV_PFAULT_CLIENT_HUB_NVENC 0x0000001B /* */
#define NV_PFAULT_CLIENT_HUB_DFALCON 0x0000001C /* */
#define NV_PFAULT_CLIENT_HUB_SKED0 0x0000001D /* */
#define NV_PFAULT_CLIENT_HUB_SKED 0x0000001D /* */
#define NV_PFAULT_CLIENT_HUB_PD1 0x0000001E /* */
#define NV_PFAULT_CLIENT_HUB_DONT_CARE 0x0000001F /* */
#define NV_PFAULT_CLIENT_HUB_HSCE0 0x00000020 /* */
#define NV_PFAULT_CLIENT_HUB_HSCE1 0x00000021 /* */
#define NV_PFAULT_CLIENT_HUB_HSCE2 0x00000022 /* */
#define NV_PFAULT_CLIENT_HUB_HSCE3 0x00000023 /* */
#define NV_PFAULT_CLIENT_HUB_HSCE4 0x00000024 /* */
#define NV_PFAULT_CLIENT_HUB_HSCE5 0x00000025 /* */
#define NV_PFAULT_CLIENT_HUB_HSCE6 0x00000026 /* */
#define NV_PFAULT_CLIENT_HUB_HSCE7 0x00000027 /* */
#define NV_PFAULT_CLIENT_HUB_SSYNC1 0x00000028 /* */
#define NV_PFAULT_CLIENT_HUB_SSYNC2 0x00000029 /* */
#define NV_PFAULT_CLIENT_HUB_HSHUB 0x0000002A /* */
#define NV_PFAULT_CLIENT_HUB_PTP_X0 0x0000002B /* */
#define NV_PFAULT_CLIENT_HUB_PTP_X1 0x0000002C /* */
#define NV_PFAULT_CLIENT_HUB_PTP_X2 0x0000002D /* */
#define NV_PFAULT_CLIENT_HUB_PTP_X3 0x0000002E /* */
#define NV_PFAULT_CLIENT_HUB_PTP_X4 0x0000002F /* */
#define NV_PFAULT_CLIENT_HUB_PTP_X5 0x00000030 /* */
#define NV_PFAULT_CLIENT_HUB_PTP_X6 0x00000031 /* */
#define NV_PFAULT_CLIENT_HUB_PTP_X7 0x00000032 /* */
#define NV_PFAULT_CLIENT_HUB_NVENC2 0x00000033 /* */
#define NV_PFAULT_CLIENT_HUB_VPR_SCRUBBER0 0x00000034 /* */
#define NV_PFAULT_CLIENT_HUB_VPR_SCRUBBER1 0x00000035 /* */
#define NV_PFAULT_CLIENT_HUB_SSYNC3 0x00000036 /* */
#define NV_PFAULT_CLIENT_HUB_FBFALCON 0x00000037 /* */
#define NV_PFAULT_CLIENT_HUB_CE_SHIM 0x00000038 /* */
#define NV_PFAULT_CLIENT_HUB_CE_SHIM0 0x00000038 /* */
#define NV_PFAULT_CLIENT_HUB_GSP 0x00000039 /* */
#define NV_PFAULT_CLIENT_HUB_NVDEC1 0x0000003A /* */
#define NV_PFAULT_CLIENT_HUB_NVDEC2 0x0000003B /* */
#define NV_PFAULT_CLIENT_HUB_NVJPG0 0x0000003C /* */
#define NV_PFAULT_CLIENT_HUB_NVDEC3 0x0000003D /* */
#define NV_PFAULT_CLIENT_HUB_NVDEC4 0x0000003E /* */
#define NV_PFAULT_CLIENT_HUB_OFA0 0x0000003F /* */
#define NV_PFAULT_CLIENT_HUB_SCC1 0x00000040 /* */
#define NV_PFAULT_CLIENT_HUB_SCC_NB1 0x00000041 /* */
#define NV_PFAULT_CLIENT_HUB_SCC2 0x00000042 /* */
#define NV_PFAULT_CLIENT_HUB_SCC_NB2 0x00000043 /* */
#define NV_PFAULT_CLIENT_HUB_SCC3 0x00000044 /* */
#define NV_PFAULT_CLIENT_HUB_SCC_NB3 0x00000045 /* */
#define NV_PFAULT_CLIENT_HUB_RASTERTWOD1 0x00000046 /* */
#define NV_PFAULT_CLIENT_HUB_RASTERTWOD2 0x00000047 /* */
#define NV_PFAULT_CLIENT_HUB_RASTERTWOD3 0x00000048 /* */
#define NV_PFAULT_CLIENT_HUB_GSPLITE1 0x00000049 /* */
#define NV_PFAULT_CLIENT_HUB_GSPLITE2 0x0000004A /* */
#define NV_PFAULT_CLIENT_HUB_GSPLITE3 0x0000004B /* */
#define NV_PFAULT_CLIENT_HUB_PD2 0x0000004C /* */
#define NV_PFAULT_CLIENT_HUB_PD3 0x0000004D /* */
#define NV_PFAULT_CLIENT_HUB_FE1 0x0000004E /* */
#define NV_PFAULT_CLIENT_HUB_FE2 0x0000004F /* */
#define NV_PFAULT_CLIENT_HUB_FE3 0x00000050 /* */
#define NV_PFAULT_CLIENT_HUB_FE4 0x00000051 /* */
#define NV_PFAULT_CLIENT_HUB_FE5 0x00000052 /* */
#define NV_PFAULT_CLIENT_HUB_FE6 0x00000053 /* */
#define NV_PFAULT_CLIENT_HUB_FE7 0x00000054 /* */
#define NV_PFAULT_CLIENT_HUB_FECS1 0x00000055 /* */
#define NV_PFAULT_CLIENT_HUB_FECS2 0x00000056 /* */
#define NV_PFAULT_CLIENT_HUB_FECS3 0x00000057 /* */
#define NV_PFAULT_CLIENT_HUB_FECS4 0x00000058 /* */
#define NV_PFAULT_CLIENT_HUB_FECS5 0x00000059 /* */
#define NV_PFAULT_CLIENT_HUB_FECS6 0x0000005A /* */
#define NV_PFAULT_CLIENT_HUB_FECS7 0x0000005B /* */
#define NV_PFAULT_CLIENT_HUB_SKED1 0x0000005C /* */
#define NV_PFAULT_CLIENT_HUB_SKED2 0x0000005D /* */
#define NV_PFAULT_CLIENT_HUB_SKED3 0x0000005E /* */
#define NV_PFAULT_CLIENT_HUB_SKED4 0x0000005F /* */
#define NV_PFAULT_CLIENT_HUB_SKED5 0x00000060 /* */
#define NV_PFAULT_CLIENT_HUB_SKED6 0x00000061 /* */
#define NV_PFAULT_CLIENT_HUB_SKED7 0x00000062 /* */
#define NV_PFAULT_CLIENT_HUB_ESC 0x00000063 /* */
#define NV_PFAULT_CLIENT_HUB_ESC0 0x00000063 /* */
#define NV_PFAULT_CLIENT_HUB_ESC1 0x00000064 /* */
#define NV_PFAULT_CLIENT_HUB_ESC2 0x00000065 /* */
#define NV_PFAULT_CLIENT_HUB_ESC3 0x00000066 /* */
#define NV_PFAULT_CLIENT_HUB_ESC4 0x00000067 /* */
#define NV_PFAULT_CLIENT_HUB_ESC5 0x00000068 /* */
#define NV_PFAULT_CLIENT_HUB_ESC6 0x00000069 /* */
#define NV_PFAULT_CLIENT_HUB_ESC7 0x0000006a /* */
#define NV_PFAULT_CLIENT_HUB_ESC8 0x0000006b /* */
#define NV_PFAULT_CLIENT_HUB_ESC9 0x0000006c /* */
#define NV_PFAULT_CLIENT_HUB_ESC10 0x0000006d /* */
#define NV_PFAULT_CLIENT_HUB_ESC11 0x0000006e /* */
#define NV_PFAULT_CLIENT_HUB_NVDEC5 0x0000006F /* */
#define NV_PFAULT_CLIENT_HUB_NVDEC6 0x00000070 /* */
#define NV_PFAULT_CLIENT_HUB_NVDEC7 0x00000071 /* */
#define NV_PFAULT_CLIENT_HUB_NVJPG1 0x00000072 /* */
#define NV_PFAULT_CLIENT_HUB_NVJPG2 0x00000073 /* */
#define NV_PFAULT_CLIENT_HUB_NVJPG3 0x00000074 /* */
#define NV_PFAULT_CLIENT_HUB_NVJPG4 0x00000075 /* */
#define NV_PFAULT_CLIENT_HUB_NVJPG5 0x00000076 /* */
#define NV_PFAULT_CLIENT_HUB_NVJPG6 0x00000077 /* */
#define NV_PFAULT_CLIENT_HUB_NVJPG7 0x00000078 /* */
#define NV_PFAULT_CLIENT_HUB_FSP 0x00000079 /* */
#define NV_PFAULT_CLIENT_HUB_BSI 0x0000007A /* */
#define NV_PFAULT_CLIENT_HUB_GSPLITE 0x0000007B /* */
#define NV_PFAULT_CLIENT_HUB_GSPLITE0 0x0000007B /* */
#define NV_PFAULT_CLIENT_HUB_VPR_SCRUBBER2 0x0000007C /* */
#define NV_PFAULT_CLIENT_HUB_VPR_SCRUBBER3 0x0000007D /* */
#define NV_PFAULT_CLIENT_HUB_VPR_SCRUBBER4 0x0000007E /* */
#define NV_PFAULT_CLIENT_HUB_NVENC3 0x0000007F /* */
#define NV_PFAULT_ACCESS_TYPE 19:16 /* */
#define NV_PFAULT_ACCESS_TYPE_READ 0x00000000 /* */
#define NV_PFAULT_ACCESS_TYPE_WRITE 0x00000001 /* */
#define NV_PFAULT_ACCESS_TYPE_ATOMIC 0x00000002 /* */
#define NV_PFAULT_ACCESS_TYPE_PREFETCH 0x00000003 /* */
#define NV_PFAULT_ACCESS_TYPE_VIRT_READ 0x00000000 /* */
#define NV_PFAULT_ACCESS_TYPE_VIRT_WRITE 0x00000001 /* */
#define NV_PFAULT_ACCESS_TYPE_VIRT_ATOMIC 0x00000002 /* */
#define NV_PFAULT_ACCESS_TYPE_VIRT_ATOMIC_STRONG 0x00000002 /* */
#define NV_PFAULT_ACCESS_TYPE_VIRT_PREFETCH 0x00000003 /* */
#define NV_PFAULT_ACCESS_TYPE_VIRT_ATOMIC_WEAK 0x00000004 /* */
#define NV_PFAULT_ACCESS_TYPE_PHYS_READ 0x00000008 /* */
#define NV_PFAULT_ACCESS_TYPE_PHYS_WRITE 0x00000009 /* */
#define NV_PFAULT_ACCESS_TYPE_PHYS_ATOMIC 0x0000000a /* */
#define NV_PFAULT_ACCESS_TYPE_PHYS_PREFETCH 0x0000000b /* */
#define NV_PFAULT_MMU_CLIENT_TYPE 20:20 /* */
#define NV_PFAULT_MMU_CLIENT_TYPE_GPC 0x00000000 /* */
#define NV_PFAULT_MMU_CLIENT_TYPE_HUB 0x00000001 /* */
#define NV_PFAULT_GPC_ID 28:24 /* */
#define NV_PFAULT_PROTECTED_MODE 29:29 /* */
#define NV_PFAULT_REPLAYABLE_FAULT_EN 30:30 /* */
#define NV_PFAULT_VALID 31:31 /* */
#endif // __gb100_dev_fault_h__
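The HI:LO values above (for example NV_PFAULT_FAULT_TYPE at 4:0) are bit ranges meant to be consumed by ternary-based field macros, comparable to NVIDIA's DRF_* helpers. Below is a minimal, self-contained sketch of decoding one 32-bit fault word with these defines; the FIELD_* helpers and fault_is_replayable_virt_write() are illustrative stand-ins, not part of the header or the driver.

#include "nvtypes.h"

// Extract the low/high bounds of a HI:LO range via the ternary trick, then
// build a right-aligned mask and pull the field out of a 32-bit word.
#define FIELD_LO(range)      ((0 ? range) % 32)
#define FIELD_HI(range)      ((1 ? range) % 32)
#define FIELD_MASK(range)    (0xFFFFFFFFU >> (31 - FIELD_HI(range) + FIELD_LO(range)))
#define FIELD_VAL(range, v)  (((NvU32)(v) >> FIELD_LO(range)) & FIELD_MASK(range))

static NvBool fault_is_replayable_virt_write(NvU32 fault_word)
{
    // Only a VALID entry carries meaningful fields.
    if (!FIELD_VAL(NV_PFAULT_VALID, fault_word))
        return NV_FALSE;

    // Replayable faults can be retried once the missing mapping is installed.
    if (!FIELD_VAL(NV_PFAULT_REPLAYABLE_FAULT_EN, fault_word))
        return NV_FALSE;

    return FIELD_VAL(NV_PFAULT_ACCESS_TYPE, fault_word) == NV_PFAULT_ACCESS_TYPE_VIRT_WRITE;
}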

View File

@@ -0,0 +1,560 @@
/*******************************************************************************
Copyright (c) 2003-2016 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*******************************************************************************/
#ifndef __gb100_dev_mmu_h__
#define __gb100_dev_mmu_h__
/* This file is autogenerated. Do not edit */
#define NV_MMU_PDE /* ----G */
#define NV_MMU_PDE_APERTURE_BIG (0*32+1):(0*32+0) /* RWXVF */
#define NV_MMU_PDE_APERTURE_BIG_INVALID 0x00000000 /* RW--V */
#define NV_MMU_PDE_APERTURE_BIG_VIDEO_MEMORY 0x00000001 /* RW--V */
#define NV_MMU_PDE_APERTURE_BIG_SYSTEM_COHERENT_MEMORY 0x00000002 /* RW--V */
#define NV_MMU_PDE_APERTURE_BIG_SYSTEM_NON_COHERENT_MEMORY 0x00000003 /* RW--V */
#define NV_MMU_PDE_SIZE (0*32+3):(0*32+2) /* RWXVF */
#define NV_MMU_PDE_SIZE_FULL 0x00000000 /* RW--V */
#define NV_MMU_PDE_SIZE_HALF 0x00000001 /* RW--V */
#define NV_MMU_PDE_SIZE_QUARTER 0x00000002 /* RW--V */
#define NV_MMU_PDE_SIZE_EIGHTH 0x00000003 /* RW--V */
#define NV_MMU_PDE_ADDRESS_BIG_SYS (0*32+31):(0*32+4) /* RWXVF */
#define NV_MMU_PDE_ADDRESS_BIG_VID (0*32+31-3):(0*32+4) /* RWXVF */
#define NV_MMU_PDE_ADDRESS_BIG_VID_PEER (0*32+31):(0*32+32-3) /* RWXVF */
#define NV_MMU_PDE_ADDRESS_BIG_VID_PEER_0 0x00000000 /* RW--V */
#define NV_MMU_PDE_APERTURE_SMALL (1*32+1):(1*32+0) /* RWXVF */
#define NV_MMU_PDE_APERTURE_SMALL_INVALID 0x00000000 /* RW--V */
#define NV_MMU_PDE_APERTURE_SMALL_VIDEO_MEMORY 0x00000001 /* RW--V */
#define NV_MMU_PDE_APERTURE_SMALL_SYSTEM_COHERENT_MEMORY 0x00000002 /* RW--V */
#define NV_MMU_PDE_APERTURE_SMALL_SYSTEM_NON_COHERENT_MEMORY 0x00000003 /* RW--V */
#define NV_MMU_PDE_VOL_SMALL (1*32+2):(1*32+2) /* RWXVF */
#define NV_MMU_PDE_VOL_SMALL_TRUE 0x00000001 /* RW--V */
#define NV_MMU_PDE_VOL_SMALL_FALSE 0x00000000 /* RW--V */
#define NV_MMU_PDE_VOL_BIG (1*32+3):(1*32+3) /* RWXVF */
#define NV_MMU_PDE_VOL_BIG_TRUE 0x00000001 /* RW--V */
#define NV_MMU_PDE_VOL_BIG_FALSE 0x00000000 /* RW--V */
#define NV_MMU_PDE_ADDRESS_SMALL_SYS (1*32+31):(1*32+4) /* RWXVF */
#define NV_MMU_PDE_ADDRESS_SMALL_VID (1*32+31-3):(1*32+4) /* RWXVF */
#define NV_MMU_PDE_ADDRESS_SMALL_VID_PEER (1*32+31):(1*32+32-3) /* RWXVF */
#define NV_MMU_PDE_ADDRESS_SMALL_VID_PEER_0 0x00000000 /* RW--V */
#define NV_MMU_PDE_ADDRESS_SHIFT 0x0000000c /* */
#define NV_MMU_PDE__SIZE 8
#define NV_MMU_PTE /* ----G */
#define NV_MMU_PTE_VALID (0*32+0):(0*32+0) /* RWXVF */
#define NV_MMU_PTE_VALID_TRUE 0x1 /* RW--V */
#define NV_MMU_PTE_VALID_FALSE 0x0 /* RW--V */
#define NV_MMU_PTE_PRIVILEGE (0*32+1):(0*32+1) /* RWXVF */
#define NV_MMU_PTE_PRIVILEGE_TRUE 0x1 /* RW--V */
#define NV_MMU_PTE_PRIVILEGE_FALSE 0x0 /* RW--V */
#define NV_MMU_PTE_READ_ONLY (0*32+2):(0*32+2) /* RWXVF */
#define NV_MMU_PTE_READ_ONLY_TRUE 0x1 /* RW--V */
#define NV_MMU_PTE_READ_ONLY_FALSE 0x0 /* RW--V */
#define NV_MMU_PTE_ENCRYPTED (0*32+3):(0*32+3) /* RWXVF */
#define NV_MMU_PTE_ENCRYPTED_TRUE 0x00000001 /* R---V */
#define NV_MMU_PTE_ENCRYPTED_FALSE 0x00000000 /* R---V */
#define NV_MMU_PTE_ADDRESS_SYS (0*32+31):(0*32+4) /* RWXVF */
#define NV_MMU_PTE_ADDRESS_VID (0*32+31-3):(0*32+4) /* RWXVF */
#define NV_MMU_PTE_ADDRESS_VID_PEER (0*32+31):(0*32+32-3) /* RWXVF */
#define NV_MMU_PTE_ADDRESS_VID_PEER_0 0x00000000 /* RW--V */
#define NV_MMU_PTE_ADDRESS_VID_PEER_1 0x00000001 /* RW--V */
#define NV_MMU_PTE_ADDRESS_VID_PEER_2 0x00000002 /* RW--V */
#define NV_MMU_PTE_ADDRESS_VID_PEER_3 0x00000003 /* RW--V */
#define NV_MMU_PTE_ADDRESS_VID_PEER_4 0x00000004 /* RW--V */
#define NV_MMU_PTE_ADDRESS_VID_PEER_5 0x00000005 /* RW--V */
#define NV_MMU_PTE_ADDRESS_VID_PEER_6 0x00000006 /* RW--V */
#define NV_MMU_PTE_ADDRESS_VID_PEER_7 0x00000007 /* RW--V */
#define NV_MMU_PTE_VOL (1*32+0):(1*32+0) /* RWXVF */
#define NV_MMU_PTE_VOL_TRUE 0x00000001 /* RW--V */
#define NV_MMU_PTE_VOL_FALSE 0x00000000 /* RW--V */
#define NV_MMU_PTE_APERTURE (1*32+2):(1*32+1) /* RWXVF */
#define NV_MMU_PTE_APERTURE_VIDEO_MEMORY 0x00000000 /* RW--V */
#define NV_MMU_PTE_APERTURE_PEER_MEMORY 0x00000001 /* RW--V */
#define NV_MMU_PTE_APERTURE_SYSTEM_COHERENT_MEMORY 0x00000002 /* RW--V */
#define NV_MMU_PTE_APERTURE_SYSTEM_NON_COHERENT_MEMORY 0x00000003 /* RW--V */
#define NV_MMU_PTE_LOCK (1*32+3):(1*32+3) /* RWXVF */
#define NV_MMU_PTE_LOCK_TRUE 0x1 /* RW--V */
#define NV_MMU_PTE_LOCK_FALSE 0x0 /* RW--V */
#define NV_MMU_PTE_ATOMIC_DISABLE (1*32+3):(1*32+3) /* RWXVF */
#define NV_MMU_PTE_ATOMIC_DISABLE_TRUE 0x1 /* RW--V */
#define NV_MMU_PTE_ATOMIC_DISABLE_FALSE 0x0 /* RW--V */
#define NV_MMU_PTE_COMPTAGLINE (1*32+20+11):(1*32+12) /* RWXVF */
#define NV_MMU_PTE_READ_DISABLE (1*32+30):(1*32+30) /* RWXVF */
#define NV_MMU_PTE_READ_DISABLE_TRUE 0x1 /* RW--V */
#define NV_MMU_PTE_READ_DISABLE_FALSE 0x0 /* RW--V */
#define NV_MMU_PTE_WRITE_DISABLE (1*32+31):(1*32+31) /* RWXVF */
#define NV_MMU_PTE_WRITE_DISABLE_TRUE 0x1 /* RW--V */
#define NV_MMU_PTE_WRITE_DISABLE_FALSE 0x0 /* RW--V */
#define NV_MMU_PTE_ADDRESS_SHIFT 0x0000000c /* */
#define NV_MMU_PTE__SIZE 8
#define NV_MMU_PTE_COMPTAGS_NONE 0x0 /* */
#define NV_MMU_PTE_COMPTAGS_1 0x1 /* */
#define NV_MMU_PTE_COMPTAGS_2 0x2 /* */
#define NV_MMU_PTE_KIND (1*32+7):(1*32+4) /* RWXVF */
#define NV_MMU_PTE_KIND_INVALID 0x07 /* R---V */
#define NV_MMU_PTE_KIND_PITCH 0x00 /* R---V */
#define NV_MMU_PTE_KIND_GENERIC_MEMORY 0x6 /* R---V */
#define NV_MMU_PTE_KIND_Z16 0x1 /* R---V */
#define NV_MMU_PTE_KIND_S8 0x2 /* R---V */
#define NV_MMU_PTE_KIND_S8Z24 0x3 /* R---V */
#define NV_MMU_PTE_KIND_ZF32_X24S8 0x4 /* R---V */
#define NV_MMU_PTE_KIND_Z24S8 0x5 /* R---V */
#define NV_MMU_PTE_KIND_GENERIC_MEMORY_COMPRESSIBLE 0x8 /* R---V */
#define NV_MMU_PTE_KIND_GENERIC_MEMORY_COMPRESSIBLE_DISABLE_PLC 0x9 /* R---V */
#define NV_MMU_PTE_KIND_S8_COMPRESSIBLE_DISABLE_PLC 0xA /* R---V */
#define NV_MMU_PTE_KIND_Z16_COMPRESSIBLE_DISABLE_PLC 0xB /* R---V */
#define NV_MMU_PTE_KIND_S8Z24_COMPRESSIBLE_DISABLE_PLC 0xC /* R---V */
#define NV_MMU_PTE_KIND_ZF32_X24S8_COMPRESSIBLE_DISABLE_PLC 0xD /* R---V */
#define NV_MMU_PTE_KIND_Z24S8_COMPRESSIBLE_DISABLE_PLC 0xE /* R---V */
#define NV_MMU_PTE_KIND_SMSKED_MESSAGE 0xF /* R---V */
#define NV_MMU_VER1_PDE /* ----G */
#define NV_MMU_VER1_PDE_APERTURE_BIG (0*32+1):(0*32+0) /* RWXVF */
#define NV_MMU_VER1_PDE_APERTURE_BIG_INVALID 0x00000000 /* RW--V */
#define NV_MMU_VER1_PDE_APERTURE_BIG_VIDEO_MEMORY 0x00000001 /* RW--V */
#define NV_MMU_VER1_PDE_APERTURE_BIG_SYSTEM_COHERENT_MEMORY 0x00000002 /* RW--V */
#define NV_MMU_VER1_PDE_APERTURE_BIG_SYSTEM_NON_COHERENT_MEMORY 0x00000003 /* RW--V */
#define NV_MMU_VER1_PDE_SIZE (0*32+3):(0*32+2) /* RWXVF */
#define NV_MMU_VER1_PDE_SIZE_FULL 0x00000000 /* RW--V */
#define NV_MMU_VER1_PDE_SIZE_HALF 0x00000001 /* RW--V */
#define NV_MMU_VER1_PDE_SIZE_QUARTER 0x00000002 /* RW--V */
#define NV_MMU_VER1_PDE_SIZE_EIGHTH 0x00000003 /* RW--V */
#define NV_MMU_VER1_PDE_ADDRESS_BIG_SYS (0*32+31):(0*32+4) /* RWXVF */
#define NV_MMU_VER1_PDE_ADDRESS_BIG_VID (0*32+31-3):(0*32+4) /* RWXVF */
#define NV_MMU_VER1_PDE_ADDRESS_BIG_VID_PEER (0*32+31):(0*32+32-3) /* RWXVF */
#define NV_MMU_VER1_PDE_ADDRESS_BIG_VID_PEER_0 0x00000000 /* RW--V */
#define NV_MMU_VER1_PDE_APERTURE_SMALL (1*32+1):(1*32+0) /* RWXVF */
#define NV_MMU_VER1_PDE_APERTURE_SMALL_INVALID 0x00000000 /* RW--V */
#define NV_MMU_VER1_PDE_APERTURE_SMALL_VIDEO_MEMORY 0x00000001 /* RW--V */
#define NV_MMU_VER1_PDE_APERTURE_SMALL_SYSTEM_COHERENT_MEMORY 0x00000002 /* RW--V */
#define NV_MMU_VER1_PDE_APERTURE_SMALL_SYSTEM_NON_COHERENT_MEMORY 0x00000003 /* RW--V */
#define NV_MMU_VER1_PDE_VOL_SMALL (1*32+2):(1*32+2) /* RWXVF */
#define NV_MMU_VER1_PDE_VOL_SMALL_TRUE 0x00000001 /* RW--V */
#define NV_MMU_VER1_PDE_VOL_SMALL_FALSE 0x00000000 /* RW--V */
#define NV_MMU_VER1_PDE_VOL_BIG (1*32+3):(1*32+3) /* RWXVF */
#define NV_MMU_VER1_PDE_VOL_BIG_TRUE 0x00000001 /* RW--V */
#define NV_MMU_VER1_PDE_VOL_BIG_FALSE 0x00000000 /* RW--V */
#define NV_MMU_VER1_PDE_ADDRESS_SMALL_SYS (1*32+31):(1*32+4) /* RWXVF */
#define NV_MMU_VER1_PDE_ADDRESS_SMALL_VID (1*32+31-3):(1*32+4) /* RWXVF */
#define NV_MMU_VER1_PDE_ADDRESS_SMALL_VID_PEER (1*32+31):(1*32+32-3) /* RWXVF */
#define NV_MMU_VER1_PDE_ADDRESS_SMALL_VID_PEER_0 0x00000000 /* RW--V */
#define NV_MMU_VER1_PDE_ADDRESS_SHIFT 0x0000000c /* */
#define NV_MMU_VER1_PDE__SIZE 8
#define NV_MMU_VER1_PTE /* ----G */
#define NV_MMU_VER1_PTE_VALID (0*32+0):(0*32+0) /* RWXVF */
#define NV_MMU_VER1_PTE_VALID_TRUE 0x1 /* RW--V */
#define NV_MMU_VER1_PTE_VALID_FALSE 0x0 /* RW--V */
#define NV_MMU_VER1_PTE_PRIVILEGE (0*32+1):(0*32+1) /* RWXVF */
#define NV_MMU_VER1_PTE_PRIVILEGE_TRUE 0x1 /* RW--V */
#define NV_MMU_VER1_PTE_PRIVILEGE_FALSE 0x0 /* RW--V */
#define NV_MMU_VER1_PTE_READ_ONLY (0*32+2):(0*32+2) /* RWXVF */
#define NV_MMU_VER1_PTE_READ_ONLY_TRUE 0x1 /* RW--V */
#define NV_MMU_VER1_PTE_READ_ONLY_FALSE 0x0 /* RW--V */
#define NV_MMU_VER1_PTE_ENCRYPTED (0*32+3):(0*32+3) /* RWXVF */
#define NV_MMU_VER1_PTE_ENCRYPTED_TRUE 0x00000001 /* R---V */
#define NV_MMU_VER1_PTE_ENCRYPTED_FALSE 0x00000000 /* R---V */
#define NV_MMU_VER1_PTE_ADDRESS_SYS (0*32+31):(0*32+4) /* RWXVF */
#define NV_MMU_VER1_PTE_ADDRESS_VID (0*32+31-3):(0*32+4) /* RWXVF */
#define NV_MMU_VER1_PTE_ADDRESS_VID_PEER (0*32+31):(0*32+32-3) /* RWXVF */
#define NV_MMU_VER1_PTE_ADDRESS_VID_PEER_0 0x00000000 /* RW--V */
#define NV_MMU_VER1_PTE_ADDRESS_VID_PEER_1 0x00000001 /* RW--V */
#define NV_MMU_VER1_PTE_ADDRESS_VID_PEER_2 0x00000002 /* RW--V */
#define NV_MMU_VER1_PTE_ADDRESS_VID_PEER_3 0x00000003 /* RW--V */
#define NV_MMU_VER1_PTE_ADDRESS_VID_PEER_4 0x00000004 /* RW--V */
#define NV_MMU_VER1_PTE_ADDRESS_VID_PEER_5 0x00000005 /* RW--V */
#define NV_MMU_VER1_PTE_ADDRESS_VID_PEER_6 0x00000006 /* RW--V */
#define NV_MMU_VER1_PTE_ADDRESS_VID_PEER_7 0x00000007 /* RW--V */
#define NV_MMU_VER1_PTE_VOL (1*32+0):(1*32+0) /* RWXVF */
#define NV_MMU_VER1_PTE_VOL_TRUE 0x00000001 /* RW--V */
#define NV_MMU_VER1_PTE_VOL_FALSE 0x00000000 /* RW--V */
#define NV_MMU_VER1_PTE_APERTURE (1*32+2):(1*32+1) /* RWXVF */
#define NV_MMU_VER1_PTE_APERTURE_VIDEO_MEMORY 0x00000000 /* RW--V */
#define NV_MMU_VER1_PTE_APERTURE_PEER_MEMORY 0x00000001 /* RW--V */
#define NV_MMU_VER1_PTE_APERTURE_SYSTEM_COHERENT_MEMORY 0x00000002 /* RW--V */
#define NV_MMU_VER1_PTE_APERTURE_SYSTEM_NON_COHERENT_MEMORY 0x00000003 /* RW--V */
#define NV_MMU_VER1_PTE_ATOMIC_DISABLE (1*32+3):(1*32+3) /* RWXVF */
#define NV_MMU_VER1_PTE_ATOMIC_DISABLE_TRUE 0x1 /* RW--V */
#define NV_MMU_VER1_PTE_ATOMIC_DISABLE_FALSE 0x0 /* RW--V */
#define NV_MMU_VER1_PTE_COMPTAGLINE (1*32+20+11):(1*32+12) /* RWXVF */
#define NV_MMU_VER1_PTE_KIND (1*32+11):(1*32+4) /* RWXVF */
#define NV_MMU_VER1_PTE_ADDRESS_SHIFT 0x0000000c /* */
#define NV_MMU_VER1_PTE__SIZE 8
#define NV_MMU_VER1_PTE_COMPTAGS_NONE 0x0 /* */
#define NV_MMU_VER1_PTE_COMPTAGS_1 0x1 /* */
#define NV_MMU_VER1_PTE_COMPTAGS_2 0x2 /* */
#define NV_MMU_NEW_PDE /* ----G */
#define NV_MMU_NEW_PDE_IS_PTE 0:0 /* RWXVF */
#define NV_MMU_NEW_PDE_IS_PTE_TRUE 0x1 /* RW--V */
#define NV_MMU_NEW_PDE_IS_PTE_FALSE 0x0 /* RW--V */
#define NV_MMU_NEW_PDE_IS_PDE 0:0 /* RWXVF */
#define NV_MMU_NEW_PDE_IS_PDE_TRUE 0x0 /* RW--V */
#define NV_MMU_NEW_PDE_IS_PDE_FALSE 0x1 /* RW--V */
#define NV_MMU_NEW_PDE_VALID 0:0 /* RWXVF */
#define NV_MMU_NEW_PDE_VALID_TRUE 0x1 /* RW--V */
#define NV_MMU_NEW_PDE_VALID_FALSE 0x0 /* RW--V */
#define NV_MMU_NEW_PDE_APERTURE 2:1 /* RWXVF */
#define NV_MMU_NEW_PDE_APERTURE_INVALID 0x00000000 /* RW--V */
#define NV_MMU_NEW_PDE_APERTURE_VIDEO_MEMORY 0x00000001 /* RW--V */
#define NV_MMU_NEW_PDE_APERTURE_SYSTEM_COHERENT_MEMORY 0x00000002 /* RW--V */
#define NV_MMU_NEW_PDE_APERTURE_SYSTEM_NON_COHERENT_MEMORY 0x00000003 /* RW--V */
#define NV_MMU_NEW_PDE_VOL 3:3 /* RWXVF */
#define NV_MMU_NEW_PDE_VOL_TRUE 0x00000001 /* RW--V */
#define NV_MMU_NEW_PDE_VOL_FALSE 0x00000000 /* RW--V */
#define NV_MMU_NEW_PDE_NO_ATS 5:5 /* RWXVF */
#define NV_MMU_NEW_PDE_NO_ATS_TRUE 0x1 /* RW--V */
#define NV_MMU_NEW_PDE_NO_ATS_FALSE 0x0 /* RW--V */
#define NV_MMU_NEW_PDE_ADDRESS_SYS 53:8 /* RWXVF */
#define NV_MMU_NEW_PDE_ADDRESS_VID (35-3):8 /* RWXVF */
#define NV_MMU_NEW_PDE_ADDRESS_VID_PEER 35:(36-3) /* RWXVF */
#define NV_MMU_NEW_PDE_ADDRESS_VID_PEER_0 0x00000000 /* RW--V */
#define NV_MMU_NEW_PDE_ADDRESS_SHIFT 0x0000000c /* */
#define NV_MMU_NEW_PDE__SIZE 8
#define NV_MMU_NEW_DUAL_PDE /* ----G */
#define NV_MMU_NEW_DUAL_PDE_IS_PTE 0:0 /* RWXVF */
#define NV_MMU_NEW_DUAL_PDE_IS_PTE_TRUE 0x1 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_IS_PTE_FALSE 0x0 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_IS_PDE 0:0 /* RWXVF */
#define NV_MMU_NEW_DUAL_PDE_IS_PDE_TRUE 0x0 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_IS_PDE_FALSE 0x1 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_VALID 0:0 /* RWXVF */
#define NV_MMU_NEW_DUAL_PDE_VALID_TRUE 0x1 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_VALID_FALSE 0x0 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_APERTURE_BIG 2:1 /* RWXVF */
#define NV_MMU_NEW_DUAL_PDE_APERTURE_BIG_INVALID 0x00000000 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_APERTURE_BIG_VIDEO_MEMORY 0x00000001 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_APERTURE_BIG_SYSTEM_COHERENT_MEMORY 0x00000002 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_APERTURE_BIG_SYSTEM_NON_COHERENT_MEMORY 0x00000003 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_VOL_BIG 3:3 /* RWXVF */
#define NV_MMU_NEW_DUAL_PDE_VOL_BIG_TRUE 0x00000001 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_VOL_BIG_FALSE 0x00000000 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_NO_ATS 5:5 /* RWXVF */
#define NV_MMU_NEW_DUAL_PDE_NO_ATS_TRUE 0x1 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_NO_ATS_FALSE 0x0 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_ADDRESS_BIG_SYS 53:(8-4) /* RWXVF */
#define NV_MMU_NEW_DUAL_PDE_ADDRESS_BIG_VID (35-3):(8-4) /* RWXVF */
#define NV_MMU_NEW_DUAL_PDE_ADDRESS_BIG_VID_PEER 35:(36-3) /* RWXVF */
#define NV_MMU_NEW_DUAL_PDE_ADDRESS_BIG_VID_PEER_0 0x00000000 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_APERTURE_SMALL 66:65 /* RWXVF */
#define NV_MMU_NEW_DUAL_PDE_APERTURE_SMALL_INVALID 0x00000000 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_APERTURE_SMALL_VIDEO_MEMORY 0x00000001 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_APERTURE_SMALL_SYSTEM_COHERENT_MEMORY 0x00000002 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_APERTURE_SMALL_SYSTEM_NON_COHERENT_MEMORY 0x00000003 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_VOL_SMALL 67:67 /* RWXVF */
#define NV_MMU_NEW_DUAL_PDE_VOL_SMALL_TRUE 0x00000001 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_VOL_SMALL_FALSE 0x00000000 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_ADDRESS_SMALL_SYS 117:72 /* RWXVF */
#define NV_MMU_NEW_DUAL_PDE_ADDRESS_SMALL_VID (99-3):72 /* RWXVF */
#define NV_MMU_NEW_DUAL_PDE_ADDRESS_SMALL_VID_PEER 99:(100-3) /* RWXVF */
#define NV_MMU_NEW_DUAL_PDE_ADDRESS_SMALL_VID_PEER_0 0x00000000 /* RW--V */
#define NV_MMU_NEW_DUAL_PDE_ADDRESS_SHIFT 0x0000000c /* */
#define NV_MMU_NEW_DUAL_PDE_ADDRESS_BIG_SHIFT 8 /* */
#define NV_MMU_NEW_DUAL_PDE__SIZE 16
#define NV_MMU_NEW_PTE /* ----G */
#define NV_MMU_NEW_PTE_VALID 0:0 /* RWXVF */
#define NV_MMU_NEW_PTE_VALID_TRUE 0x1 /* RW--V */
#define NV_MMU_NEW_PTE_VALID_FALSE 0x0 /* RW--V */
#define NV_MMU_NEW_PTE_APERTURE 2:1 /* RWXVF */
#define NV_MMU_NEW_PTE_APERTURE_VIDEO_MEMORY 0x00000000 /* RW--V */
#define NV_MMU_NEW_PTE_APERTURE_PEER_MEMORY 0x00000001 /* RW--V */
#define NV_MMU_NEW_PTE_APERTURE_SYSTEM_COHERENT_MEMORY 0x00000002 /* RW--V */
#define NV_MMU_NEW_PTE_APERTURE_SYSTEM_NON_COHERENT_MEMORY 0x00000003 /* RW--V */
#define NV_MMU_NEW_PTE_VOL 3:3 /* RWXVF */
#define NV_MMU_NEW_PTE_VOL_TRUE 0x00000001 /* RW--V */
#define NV_MMU_NEW_PTE_VOL_FALSE 0x00000000 /* RW--V */
#define NV_MMU_NEW_PTE_ENCRYPTED 4:4 /* RWXVF */
#define NV_MMU_NEW_PTE_ENCRYPTED_TRUE 0x00000001 /* R---V */
#define NV_MMU_NEW_PTE_ENCRYPTED_FALSE 0x00000000 /* R---V */
#define NV_MMU_NEW_PTE_PRIVILEGE 5:5 /* RWXVF */
#define NV_MMU_NEW_PTE_PRIVILEGE_TRUE 0x1 /* RW--V */
#define NV_MMU_NEW_PTE_PRIVILEGE_FALSE 0x0 /* RW--V */
#define NV_MMU_NEW_PTE_READ_ONLY 6:6 /* RWXVF */
#define NV_MMU_NEW_PTE_READ_ONLY_TRUE 0x1 /* RW--V */
#define NV_MMU_NEW_PTE_READ_ONLY_FALSE 0x0 /* RW--V */
#define NV_MMU_NEW_PTE_ATOMIC_DISABLE 7:7 /* RWXVF */
#define NV_MMU_NEW_PTE_ATOMIC_DISABLE_TRUE 0x1 /* RW--V */
#define NV_MMU_NEW_PTE_ATOMIC_DISABLE_FALSE 0x0 /* RW--V */
#define NV_MMU_NEW_PTE_ADDRESS_SYS 53:8 /* RWXVF */
#define NV_MMU_NEW_PTE_ADDRESS_VID (35-3):8 /* RWXVF */
#define NV_MMU_NEW_PTE_ADDRESS_VID_PEER 35:(36-3) /* RWXVF */
#define NV_MMU_NEW_PTE_ADDRESS_VID_PEER_0 0x00000000 /* RW--V */
#define NV_MMU_NEW_PTE_ADDRESS_VID_PEER_1 0x00000001 /* RW--V */
#define NV_MMU_NEW_PTE_ADDRESS_VID_PEER_2 0x00000002 /* RW--V */
#define NV_MMU_NEW_PTE_ADDRESS_VID_PEER_3 0x00000003 /* RW--V */
#define NV_MMU_NEW_PTE_ADDRESS_VID_PEER_4 0x00000004 /* RW--V */
#define NV_MMU_NEW_PTE_ADDRESS_VID_PEER_5 0x00000005 /* RW--V */
#define NV_MMU_NEW_PTE_ADDRESS_VID_PEER_6 0x00000006 /* RW--V */
#define NV_MMU_NEW_PTE_ADDRESS_VID_PEER_7 0x00000007 /* RW--V */
#define NV_MMU_NEW_PTE_COMPTAGLINE (20+35):36 /* RWXVF */
#define NV_MMU_NEW_PTE_KIND 63:56 /* RWXVF */
#define NV_MMU_NEW_PTE_ADDRESS_SHIFT 0x0000000c /* */
#define NV_MMU_NEW_PTE__SIZE 8
#define NV_MMU_VER2_PDE /* ----G */
#define NV_MMU_VER2_PDE_IS_PTE 0:0 /* RWXVF */
#define NV_MMU_VER2_PDE_IS_PTE_TRUE 0x1 /* RW--V */
#define NV_MMU_VER2_PDE_IS_PTE_FALSE 0x0 /* RW--V */
#define NV_MMU_VER2_PDE_IS_PDE 0:0 /* RWXVF */
#define NV_MMU_VER2_PDE_IS_PDE_TRUE 0x0 /* RW--V */
#define NV_MMU_VER2_PDE_IS_PDE_FALSE 0x1 /* RW--V */
#define NV_MMU_VER2_PDE_VALID 0:0 /* RWXVF */
#define NV_MMU_VER2_PDE_VALID_TRUE 0x1 /* RW--V */
#define NV_MMU_VER2_PDE_VALID_FALSE 0x0 /* RW--V */
#define NV_MMU_VER2_PDE_APERTURE 2:1 /* RWXVF */
#define NV_MMU_VER2_PDE_APERTURE_INVALID 0x00000000 /* RW--V */
#define NV_MMU_VER2_PDE_APERTURE_VIDEO_MEMORY 0x00000001 /* RW--V */
#define NV_MMU_VER2_PDE_APERTURE_SYSTEM_COHERENT_MEMORY 0x00000002 /* RW--V */
#define NV_MMU_VER2_PDE_APERTURE_SYSTEM_NON_COHERENT_MEMORY 0x00000003 /* RW--V */
#define NV_MMU_VER2_PDE_VOL 3:3 /* RWXVF */
#define NV_MMU_VER2_PDE_VOL_TRUE 0x00000001 /* RW--V */
#define NV_MMU_VER2_PDE_VOL_FALSE 0x00000000 /* RW--V */
#define NV_MMU_VER2_PDE_NO_ATS 5:5 /* RWXVF */
#define NV_MMU_VER2_PDE_NO_ATS_TRUE 0x1 /* RW--V */
#define NV_MMU_VER2_PDE_NO_ATS_FALSE 0x0 /* RW--V */
#define NV_MMU_VER2_PDE_ADDRESS_SYS 53:8 /* RWXVF */
#define NV_MMU_VER2_PDE_ADDRESS_VID (35-3):8 /* RWXVF */
#define NV_MMU_VER2_PDE_ADDRESS_VID_PEER 35:(36-3) /* RWXVF */
#define NV_MMU_VER2_PDE_ADDRESS_VID_PEER_0 0x00000000 /* RW--V */
#define NV_MMU_VER2_PDE_ADDRESS_SHIFT 0x0000000c /* */
#define NV_MMU_VER2_PDE__SIZE 8
#define NV_MMU_VER2_DUAL_PDE /* ----G */
#define NV_MMU_VER2_DUAL_PDE_IS_PTE 0:0 /* RWXVF */
#define NV_MMU_VER2_DUAL_PDE_IS_PTE_TRUE 0x1 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_IS_PTE_FALSE 0x0 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_IS_PDE 0:0 /* RWXVF */
#define NV_MMU_VER2_DUAL_PDE_IS_PDE_TRUE 0x0 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_IS_PDE_FALSE 0x1 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_VALID 0:0 /* RWXVF */
#define NV_MMU_VER2_DUAL_PDE_VALID_TRUE 0x1 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_VALID_FALSE 0x0 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_APERTURE_BIG 2:1 /* RWXVF */
#define NV_MMU_VER2_DUAL_PDE_APERTURE_BIG_INVALID 0x00000000 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_APERTURE_BIG_VIDEO_MEMORY 0x00000001 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_APERTURE_BIG_SYSTEM_COHERENT_MEMORY 0x00000002 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_APERTURE_BIG_SYSTEM_NON_COHERENT_MEMORY 0x00000003 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_VOL_BIG 3:3 /* RWXVF */
#define NV_MMU_VER2_DUAL_PDE_VOL_BIG_TRUE 0x00000001 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_VOL_BIG_FALSE 0x00000000 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_NO_ATS 5:5 /* RWXVF */
#define NV_MMU_VER2_DUAL_PDE_NO_ATS_TRUE 0x1 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_NO_ATS_FALSE 0x0 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_ADDRESS_BIG_SYS 53:(8-4) /* RWXVF */
#define NV_MMU_VER2_DUAL_PDE_ADDRESS_BIG_VID (35-3):(8-4) /* RWXVF */
#define NV_MMU_VER2_DUAL_PDE_ADDRESS_BIG_VID_PEER 35:(36-3) /* RWXVF */
#define NV_MMU_VER2_DUAL_PDE_ADDRESS_BIG_VID_PEER_0 0x00000000 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_APERTURE_SMALL 66:65 /* RWXVF */
#define NV_MMU_VER2_DUAL_PDE_APERTURE_SMALL_INVALID 0x00000000 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_APERTURE_SMALL_VIDEO_MEMORY 0x00000001 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_APERTURE_SMALL_SYSTEM_COHERENT_MEMORY 0x00000002 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_APERTURE_SMALL_SYSTEM_NON_COHERENT_MEMORY 0x00000003 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_VOL_SMALL 67:67 /* RWXVF */
#define NV_MMU_VER2_DUAL_PDE_VOL_SMALL_TRUE 0x00000001 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_VOL_SMALL_FALSE 0x00000000 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_ADDRESS_SMALL_SYS 117:72 /* RWXVF */
#define NV_MMU_VER2_DUAL_PDE_ADDRESS_SMALL_VID (99-3):72 /* RWXVF */
#define NV_MMU_VER2_DUAL_PDE_ADDRESS_SMALL_VID_PEER 99:(100-3) /* RWXVF */
#define NV_MMU_VER2_DUAL_PDE_ADDRESS_SMALL_VID_PEER_0 0x00000000 /* RW--V */
#define NV_MMU_VER2_DUAL_PDE_ADDRESS_SHIFT 0x0000000c /* */
#define NV_MMU_VER2_DUAL_PDE_ADDRESS_BIG_SHIFT 8 /* */
#define NV_MMU_VER2_DUAL_PDE__SIZE 16
#define NV_MMU_VER2_PTE /* ----G */
#define NV_MMU_VER2_PTE_VALID 0:0 /* RWXVF */
#define NV_MMU_VER2_PTE_VALID_TRUE 0x1 /* RW--V */
#define NV_MMU_VER2_PTE_VALID_FALSE 0x0 /* RW--V */
#define NV_MMU_VER2_PTE_APERTURE 2:1 /* RWXVF */
#define NV_MMU_VER2_PTE_APERTURE_VIDEO_MEMORY 0x00000000 /* RW--V */
#define NV_MMU_VER2_PTE_APERTURE_PEER_MEMORY 0x00000001 /* RW--V */
#define NV_MMU_VER2_PTE_APERTURE_SYSTEM_COHERENT_MEMORY 0x00000002 /* RW--V */
#define NV_MMU_VER2_PTE_APERTURE_SYSTEM_NON_COHERENT_MEMORY 0x00000003 /* RW--V */
#define NV_MMU_VER2_PTE_VOL 3:3 /* RWXVF */
#define NV_MMU_VER2_PTE_VOL_TRUE 0x00000001 /* RW--V */
#define NV_MMU_VER2_PTE_VOL_FALSE 0x00000000 /* RW--V */
#define NV_MMU_VER2_PTE_ENCRYPTED 4:4 /* RWXVF */
#define NV_MMU_VER2_PTE_ENCRYPTED_TRUE 0x00000001 /* R---V */
#define NV_MMU_VER2_PTE_ENCRYPTED_FALSE 0x00000000 /* R---V */
#define NV_MMU_VER2_PTE_PRIVILEGE 5:5 /* RWXVF */
#define NV_MMU_VER2_PTE_PRIVILEGE_TRUE 0x1 /* RW--V */
#define NV_MMU_VER2_PTE_PRIVILEGE_FALSE 0x0 /* RW--V */
#define NV_MMU_VER2_PTE_READ_ONLY 6:6 /* RWXVF */
#define NV_MMU_VER2_PTE_READ_ONLY_TRUE 0x1 /* RW--V */
#define NV_MMU_VER2_PTE_READ_ONLY_FALSE 0x0 /* RW--V */
#define NV_MMU_VER2_PTE_ATOMIC_DISABLE 7:7 /* RWXVF */
#define NV_MMU_VER2_PTE_ATOMIC_DISABLE_TRUE 0x1 /* RW--V */
#define NV_MMU_VER2_PTE_ATOMIC_DISABLE_FALSE 0x0 /* RW--V */
#define NV_MMU_VER2_PTE_ADDRESS_SYS 53:8 /* RWXVF */
#define NV_MMU_VER2_PTE_ADDRESS_VID (35-3):8 /* RWXVF */
#define NV_MMU_VER2_PTE_ADDRESS_VID_PEER 35:(36-3) /* RWXVF */
#define NV_MMU_VER2_PTE_ADDRESS_VID_PEER_0 0x00000000 /* RW--V */
#define NV_MMU_VER2_PTE_ADDRESS_VID_PEER_1 0x00000001 /* RW--V */
#define NV_MMU_VER2_PTE_ADDRESS_VID_PEER_2 0x00000002 /* RW--V */
#define NV_MMU_VER2_PTE_ADDRESS_VID_PEER_3 0x00000003 /* RW--V */
#define NV_MMU_VER2_PTE_ADDRESS_VID_PEER_4 0x00000004 /* RW--V */
#define NV_MMU_VER2_PTE_ADDRESS_VID_PEER_5 0x00000005 /* RW--V */
#define NV_MMU_VER2_PTE_ADDRESS_VID_PEER_6 0x00000006 /* RW--V */
#define NV_MMU_VER2_PTE_ADDRESS_VID_PEER_7 0x00000007 /* RW--V */
#define NV_MMU_VER2_PTE_COMPTAGLINE (20+35):36 /* RWXVF */
#define NV_MMU_VER2_PTE_KIND 63:56 /* RWXVF */
#define NV_MMU_VER2_PTE_ADDRESS_SHIFT 0x0000000c /* */
#define NV_MMU_VER2_PTE__SIZE 8
#define NV_MMU_VER3_PDE /* ----G */
#define NV_MMU_VER3_PDE_IS_PTE 0:0 /* RWXVF */
#define NV_MMU_VER3_PDE_IS_PTE_TRUE 0x1 /* RW--V */
#define NV_MMU_VER3_PDE_IS_PTE_FALSE 0x0 /* RW--V */
#define NV_MMU_VER3_PDE_VALID 0:0 /* RWXVF */
#define NV_MMU_VER3_PDE_VALID_TRUE 0x1 /* RW--V */
#define NV_MMU_VER3_PDE_VALID_FALSE 0x0 /* RW--V */
#define NV_MMU_VER3_PDE_APERTURE 2:1 /* RWXVF */
#define NV_MMU_VER3_PDE_APERTURE_INVALID 0x00000000 /* RW--V */
#define NV_MMU_VER3_PDE_APERTURE_VIDEO_MEMORY 0x00000001 /* RW--V */
#define NV_MMU_VER3_PDE_APERTURE_SYSTEM_COHERENT_MEMORY 0x00000002 /* RW--V */
#define NV_MMU_VER3_PDE_APERTURE_SYSTEM_NON_COHERENT_MEMORY 0x00000003 /* RW--V */
#define NV_MMU_VER3_PDE_PCF 5:3 /* RWXVF */
#define NV_MMU_VER3_PDE_PCF_VALID_CACHED_ATS_ALLOWED__OR__INVALID_ATS_ALLOWED 0x00000000 /* RW--V */
#define NV_MMU_VER3_PDE_PCF_VALID_CACHED_ATS_ALLOWED 0x00000000 /* RW--V */
#define NV_MMU_VER3_PDE_PCF_INVALID_ATS_ALLOWED 0x00000000 /* RW--V */
#define NV_MMU_VER3_PDE_PCF_VALID_UNCACHED_ATS_ALLOWED__OR__SPARSE_ATS_ALLOWED 0x00000001 /* RW--V */
#define NV_MMU_VER3_PDE_PCF_VALID_UNCACHED_ATS_ALLOWED 0x00000001 /* RW--V */
#define NV_MMU_VER3_PDE_PCF_SPARSE_ATS_ALLOWED 0x00000001 /* RW--V */
#define NV_MMU_VER3_PDE_PCF_VALID_CACHED_ATS_NOT_ALLOWED__OR__INVALID_ATS_NOT_ALLOWED 0x00000002 /* RW--V */
#define NV_MMU_VER3_PDE_PCF_VALID_CACHED_ATS_NOT_ALLOWED 0x00000002 /* RW--V */
#define NV_MMU_VER3_PDE_PCF_INVALID_ATS_NOT_ALLOWED 0x00000002 /* RW--V */
#define NV_MMU_VER3_PDE_PCF_VALID_UNCACHED_ATS_NOT_ALLOWED__OR__SPARSE_ATS_NOT_ALLOWED 0x00000003 /* RW--V */
#define NV_MMU_VER3_PDE_PCF_VALID_UNCACHED_ATS_NOT_ALLOWED 0x00000003 /* RW--V */
#define NV_MMU_VER3_PDE_PCF_SPARSE_ATS_NOT_ALLOWED 0x00000003 /* RW--V */
#define NV_MMU_VER3_PDE_ADDRESS 51:12 /* RWXVF */
#define NV_MMU_VER3_PDE_ADDRESS_SHIFT 0x0000000c /* */
#define NV_MMU_VER3_PDE__SIZE 8
#define NV_MMU_VER3_DUAL_PDE /* ----G */
#define NV_MMU_VER3_DUAL_PDE_IS_PTE 0:0 /* RWXVF */
#define NV_MMU_VER3_DUAL_PDE_IS_PTE_TRUE 0x1 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_IS_PTE_FALSE 0x0 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_VALID 0:0 /* RWXVF */
#define NV_MMU_VER3_DUAL_PDE_VALID_TRUE 0x1 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_VALID_FALSE 0x0 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_APERTURE_BIG 2:1 /* RWXVF */
#define NV_MMU_VER3_DUAL_PDE_APERTURE_BIG_INVALID 0x00000000 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_APERTURE_BIG_VIDEO_MEMORY 0x00000001 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_APERTURE_BIG_SYSTEM_COHERENT_MEMORY 0x00000002 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_APERTURE_BIG_SYSTEM_NON_COHERENT_MEMORY 0x00000003 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_BIG 5:3 /* RWXVF */
#define NV_MMU_VER3_DUAL_PDE_PCF_BIG_VALID_CACHED_ATS_ALLOWED__OR__INVALID_ATS_ALLOWED 0x00000000 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_BIG_VALID_CACHED_ATS_ALLOWED 0x00000000 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_BIG_INVALID_ATS_ALLOWED 0x00000000 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_BIG_VALID_UNCACHED_ATS_ALLOWED__OR__SPARSE_ATS_ALLOWED 0x00000001 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_BIG_VALID_UNCACHED_ATS_ALLOWED 0x00000001 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_BIG_SPARSE_ATS_ALLOWED 0x00000001 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_BIG_VALID_CACHED_ATS_NOT_ALLOWED__OR__INVALID_ATS_NOT_ALLOWED 0x00000002 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_BIG_VALID_CACHED_ATS_NOT_ALLOWED 0x00000002 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_BIG_INVALID_ATS_NOT_ALLOWED 0x00000002 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_BIG_VALID_UNCACHED_ATS_NOT_ALLOWED__OR__SPARSE_ATS_NOT_ALLOWED 0x00000003 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_BIG_VALID_UNCACHED_ATS_NOT_ALLOWED 0x00000003 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_BIG_SPARSE_ATS_NOT_ALLOWED 0x00000003 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_ADDRESS_BIG 51:8 /* RWXVF */
#define NV_MMU_VER3_DUAL_PDE_APERTURE_SMALL 66:65 /* RWXVF */
#define NV_MMU_VER3_DUAL_PDE_APERTURE_SMALL_INVALID 0x00000000 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_APERTURE_SMALL_VIDEO_MEMORY 0x00000001 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_APERTURE_SMALL_SYSTEM_COHERENT_MEMORY 0x00000002 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_APERTURE_SMALL_SYSTEM_NON_COHERENT_MEMORY 0x00000003 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_SMALL 69:67 /* RWXVF */
#define NV_MMU_VER3_DUAL_PDE_PCF_SMALL_VALID_CACHED_ATS_ALLOWED__OR__INVALID_ATS_ALLOWED 0x00000000 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_SMALL_VALID_CACHED_ATS_ALLOWED 0x00000000 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_SMALL_INVALID_ATS_ALLOWED 0x00000000 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_SMALL_VALID_UNCACHED_ATS_ALLOWED__OR__SPARSE_ATS_ALLOWED 0x00000001 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_SMALL_VALID_UNCACHED_ATS_ALLOWED 0x00000001 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_SMALL_SPARSE_ATS_ALLOWED 0x00000001 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_SMALL_VALID_CACHED_ATS_NOT_ALLOWED__OR__INVALID_ATS_NOT_ALLOWED 0x00000002 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_SMALL_VALID_CACHED_ATS_NOT_ALLOWED 0x00000002 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_SMALL_INVALID_ATS_NOT_ALLOWED 0x00000002 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_SMALL_VALID_UNCACHED_ATS_NOT_ALLOWED__OR__SPARSE_ATS_NOT_ALLOWED 0x00000003 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_SMALL_VALID_UNCACHED_ATS_NOT_ALLOWED 0x00000003 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_PCF_SMALL_SPARSE_ATS_NOT_ALLOWED 0x00000003 /* RW--V */
#define NV_MMU_VER3_DUAL_PDE_ADDRESS_SMALL 115:76 /* RWXVF */
#define NV_MMU_VER3_DUAL_PDE_ADDRESS_SHIFT 0x0000000c /* */
#define NV_MMU_VER3_DUAL_PDE_ADDRESS_BIG_SHIFT 8 /* */
#define NV_MMU_VER3_DUAL_PDE__SIZE 16
#define NV_MMU_VER3_PTE /* ----G */
#define NV_MMU_VER3_PTE_VALID 0:0 /* RWXVF */
#define NV_MMU_VER3_PTE_VALID_TRUE 0x1 /* RW--V */
#define NV_MMU_VER3_PTE_VALID_FALSE 0x0 /* RW--V */
#define NV_MMU_VER3_PTE_APERTURE 2:1 /* RWXVF */
#define NV_MMU_VER3_PTE_APERTURE_VIDEO_MEMORY 0x00000000 /* RW--V */
#define NV_MMU_VER3_PTE_APERTURE_PEER_MEMORY 0x00000001 /* RW--V */
#define NV_MMU_VER3_PTE_APERTURE_SYSTEM_COHERENT_MEMORY 0x00000002 /* RW--V */
#define NV_MMU_VER3_PTE_APERTURE_SYSTEM_NON_COHERENT_MEMORY 0x00000003 /* RW--V */
#define NV_MMU_VER3_PTE_PCF 7:3 /* RWXVF */
#define NV_MMU_VER3_PTE_PCF_INVALID 0x00000000 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_SPARSE 0x00000001 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_MAPPING_NOWHERE 0x00000002 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_NO_VALID_4KB_PAGE 0x00000003 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_REGULAR_RW_ATOMIC_CACHED_ACE 0x00000000 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_REGULAR_RW_ATOMIC_UNCACHED_ACE 0x00000001 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_PRIVILEGE_RW_ATOMIC_CACHED_ACE 0x00000002 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_PRIVILEGE_RW_ATOMIC_UNCACHED_ACE 0x00000003 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_REGULAR_RO_ATOMIC_CACHED_ACE 0x00000004 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_REGULAR_RO_ATOMIC_UNCACHED_ACE 0x00000005 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_PRIVILEGE_RO_ATOMIC_CACHED_ACE 0x00000006 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_PRIVILEGE_RO_ATOMIC_UNCACHED_ACE 0x00000007 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_REGULAR_RW_NO_ATOMIC_CACHED_ACE 0x00000008 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_REGULAR_RW_NO_ATOMIC_UNCACHED_ACE 0x00000009 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_PRIVILEGE_RW_NO_ATOMIC_CACHED_ACE 0x0000000A /* RW--V */
#define NV_MMU_VER3_PTE_PCF_PRIVILEGE_RW_NO_ATOMIC_UNCACHED_ACE 0x0000000B /* RW--V */
#define NV_MMU_VER3_PTE_PCF_REGULAR_RO_NO_ATOMIC_CACHED_ACE 0x0000000C /* RW--V */
#define NV_MMU_VER3_PTE_PCF_REGULAR_RO_NO_ATOMIC_UNCACHED_ACE 0x0000000D /* RW--V */
#define NV_MMU_VER3_PTE_PCF_PRIVILEGE_RO_NO_ATOMIC_CACHED_ACE 0x0000000E /* RW--V */
#define NV_MMU_VER3_PTE_PCF_PRIVILEGE_RO_NO_ATOMIC_UNCACHED_ACE 0x0000000F /* RW--V */
#define NV_MMU_VER3_PTE_PCF_REGULAR_RW_ATOMIC_CACHED_ACD 0x00000010 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_REGULAR_RW_ATOMIC_UNCACHED_ACD 0x00000011 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_PRIVILEGE_RW_ATOMIC_CACHED_ACD 0x00000012 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_PRIVILEGE_RW_ATOMIC_UNCACHED_ACD 0x00000013 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_REGULAR_RO_ATOMIC_CACHED_ACD 0x00000014 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_REGULAR_RO_ATOMIC_UNCACHED_ACD 0x00000015 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_PRIVILEGE_RO_ATOMIC_CACHED_ACD 0x00000016 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_PRIVILEGE_RO_ATOMIC_UNCACHED_ACD 0x00000017 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_REGULAR_RW_NO_ATOMIC_CACHED_ACD 0x00000018 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_REGULAR_RW_NO_ATOMIC_UNCACHED_ACD 0x00000019 /* RW--V */
#define NV_MMU_VER3_PTE_PCF_PRIVILEGE_RW_NO_ATOMIC_CACHED_ACD 0x0000001A /* RW--V */
#define NV_MMU_VER3_PTE_PCF_PRIVILEGE_RW_NO_ATOMIC_UNCACHED_ACD 0x0000001B /* RW--V */
#define NV_MMU_VER3_PTE_PCF_REGULAR_RO_NO_ATOMIC_CACHED_ACD 0x0000001C /* RW--V */
#define NV_MMU_VER3_PTE_PCF_REGULAR_RO_NO_ATOMIC_UNCACHED_ACD 0x0000001D /* RW--V */
#define NV_MMU_VER3_PTE_PCF_PRIVILEGE_RO_NO_ATOMIC_CACHED_ACD 0x0000001E /* RW--V */
#define NV_MMU_VER3_PTE_PCF_PRIVILEGE_RO_NO_ATOMIC_UNCACHED_ACD 0x0000001F /* RW--V */
#define NV_MMU_VER3_PTE_KIND 11:8 /* RWXVF */
#define NV_MMU_VER3_PTE_ADDRESS 51:12 /* RWXVF */
#define NV_MMU_VER3_PTE_ADDRESS_SYS 51:12 /* RWXVF */
#define NV_MMU_VER3_PTE_ADDRESS_PEER 51:12 /* RWXVF */
#define NV_MMU_VER3_PTE_ADDRESS_VID 39:12 /* RWXVF */
#define NV_MMU_VER3_PTE_PEER_ID 63:(64-3) /* RWXVF */
#define NV_MMU_VER3_PTE_PEER_ID_0 0x00000000 /* RW--V */
#define NV_MMU_VER3_PTE_PEER_ID_1 0x00000001 /* RW--V */
#define NV_MMU_VER3_PTE_PEER_ID_2 0x00000002 /* RW--V */
#define NV_MMU_VER3_PTE_PEER_ID_3 0x00000003 /* RW--V */
#define NV_MMU_VER3_PTE_PEER_ID_4 0x00000004 /* RW--V */
#define NV_MMU_VER3_PTE_PEER_ID_5 0x00000005 /* RW--V */
#define NV_MMU_VER3_PTE_PEER_ID_6 0x00000006 /* RW--V */
#define NV_MMU_VER3_PTE_PEER_ID_7 0x00000007 /* RW--V */
#define NV_MMU_VER3_PTE_ADDRESS_SHIFT 0x0000000c /* */
#define NV_MMU_VER3_PTE__SIZE 8
#define NV_MMU_CLIENT /* ----G */
#define NV_MMU_CLIENT_KIND 2:0 /* RWXVF */
#define NV_MMU_CLIENT_KIND_Z16 0x1 /* R---V */
#define NV_MMU_CLIENT_KIND_S8 0x2 /* R---V */
#define NV_MMU_CLIENT_KIND_S8Z24 0x3 /* R---V */
#define NV_MMU_CLIENT_KIND_ZF32_X24S8 0x4 /* R---V */
#define NV_MMU_CLIENT_KIND_Z24S8 0x5 /* R---V */
#define NV_MMU_CLIENT_KIND_GENERIC_MEMORY 0x6 /* R---V */
#define NV_MMU_CLIENT_KIND_INVALID 0x7 /* R---V */
#endif // __gb100_dev_mmu_h__
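To show how these packed ranges compose, here is a short sketch that assembles a version-3 leaf PTE (8 bytes, per NV_MMU_VER3_PTE__SIZE) for a single 4 KiB video-memory page. The FIELD64_* helpers, the function itself, and the choice of PCF and kind values are illustrative assumptions rather than driver code; phys_addr is assumed to be 4 KiB aligned.

#include "nvtypes.h"

// Shift a field value into position using the LO bound of a HI:LO range.
#define FIELD64_LO(range)        ((NvU64)(0 ? range))
#define FIELD64_PUT(range, val)  (((NvU64)(val)) << FIELD64_LO(range))

static NvU64 make_ver3_vidmem_pte(NvU64 phys_addr)
{
    NvU64 pte = 0;

    pte |= FIELD64_PUT(NV_MMU_VER3_PTE_VALID,    NV_MMU_VER3_PTE_VALID_TRUE);
    pte |= FIELD64_PUT(NV_MMU_VER3_PTE_APERTURE, NV_MMU_VER3_PTE_APERTURE_VIDEO_MEMORY);
    // PCF 0x00: regular, read/write, atomics enabled, cached (ACE variant).
    pte |= FIELD64_PUT(NV_MMU_VER3_PTE_PCF,      NV_MMU_VER3_PTE_PCF_REGULAR_RW_ATOMIC_CACHED_ACE);
    pte |= FIELD64_PUT(NV_MMU_VER3_PTE_KIND,     NV_MMU_PTE_KIND_GENERIC_MEMORY);
    // The ADDRESS field holds the physical address shifted down by 12 bits.
    pte |= FIELD64_PUT(NV_MMU_VER3_PTE_ADDRESS,  phys_addr >> NV_MMU_VER3_PTE_ADDRESS_SHIFT);

    return pte;
}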

View File

@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2016 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2016-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -176,7 +176,7 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),
{
    unsigned i, j;
    const static unsigned attempts = 3;
    static const unsigned attempts = 3;
    struct task_struct *thread[3];
    for (i = 0;; i++) {

View File

@@ -6,6 +6,10 @@ NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_conf_computing.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_sec2_test.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_maxwell_sec2.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_hopper_sec2.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_blackwell.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_blackwell_fault_buffer.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_blackwell_mmu.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_blackwell_host.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_common.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_linux.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_debug_optimized.c
@@ -72,6 +76,7 @@ NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_turing_host.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_ampere.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_ampere_ce.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_ampere_host.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_ampere_fault_buffer.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_ampere_mmu.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_hopper.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_hopper_fault_buffer.c

View File

@@ -114,6 +114,7 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += mempolicy_has_unified_nodes
NV_CONFTEST_TYPE_COMPILE_TESTS += mempolicy_has_home_node
NV_CONFTEST_TYPE_COMPILE_TESTS += mpol_preferred_many_present
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_interval_notifier
NV_CONFTEST_TYPE_COMPILE_TESTS += fault_flag_remote_present
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_int_active_memcg
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_migrate_vma_setup

View File

@@ -58,7 +58,7 @@
#ifndef _UVM_H_
#define _UVM_H_
#define UVM_API_LATEST_REVISION 11
#define UVM_API_LATEST_REVISION 12
#if !defined(UVM_API_REVISION)
#error "please define UVM_API_REVISION macro to a desired version number or UVM_API_LATEST_REVISION macro"
@@ -167,7 +167,7 @@ NV_STATUS UvmSetDriverVersion(NvU32 major, NvU32 changelist);
//
// Error codes:
// NV_ERR_NOT_SUPPORTED:
// The Linux kernel is not able to support UVM. This could be because
// The kernel is not able to support UVM. This could be because
// the kernel is too old, or because it lacks a feature that UVM
// requires. The kernel log will have details.
//
@@ -1448,7 +1448,9 @@ NV_STATUS UvmAllocSemaphorePool(void *base,
//
// preferredCpuMemoryNode: (INPUT)
// Preferred CPU NUMA memory node used if the destination processor is
// the CPU.
// the CPU. -1 indicates no preference, in which case the pages used
// can be on any of the available CPU NUMA nodes. If NUMA is disabled
// only 0 and -1 are allowed.
//
// Error codes:
// NV_ERR_INVALID_ADDRESS:
@@ -1462,6 +1464,11 @@ NV_STATUS UvmAllocSemaphorePool(void *base,
// The VA range exceeds the largest virtual address supported by the
// destination processor.
//
// NV_ERR_INVALID_ARGUMENT:
// preferredCpuMemoryNode is not a valid CPU NUMA node or it corresponds
// to a NUMA node ID for a registered GPU. If NUMA is disabled, it
// indicates that preferredCpuMemoryNode was not either 0 or -1.
//
// NV_ERR_INVALID_DEVICE:
// destinationUuid does not represent a valid processor such as a CPU or
// a GPU with a GPU VA space registered for it. Or destinationUuid is a
@@ -1528,8 +1535,9 @@ NV_STATUS UvmMigrate(void *base,
//
// preferredCpuMemoryNode: (INPUT)
// Preferred CPU NUMA memory node used if the destination processor is
// the CPU. This argument is ignored if the given virtual address range
// corresponds to managed memory.
// the CPU. -1 indicates no preference, in which case the pages used
// can be on any of the available CPU NUMA nodes. If NUMA is disabled
// only 0 and -1 are allowed.
//
// semaphoreAddress: (INPUT)
// Base address of the semaphore.
@@ -1586,8 +1594,8 @@ NV_STATUS UvmMigrateAsync(void *base,
//
// Migrates the backing of all virtual address ranges associated with the given
// range group to the specified destination processor. The behavior of this API
// is equivalent to calling UvmMigrate on each VA range associated with this
// range group.
// is equivalent to calling UvmMigrate with preferredCpuMemoryNode = -1 on each
// VA range associated with this range group.
//
// Any errors encountered during migration are returned immediately. No attempt
// is made to migrate the remaining unmigrated ranges and the ranges that are
@@ -2169,7 +2177,8 @@ NV_STATUS UvmMapDynamicParallelismRegion(void *base,
//
// If any page in the VA range has a preferred location, then the migration and
// mapping policies associated with this API take precedence over those related
// to the preferred location.
// to the preferred location. If the preferred location is a specific CPU NUMA
// node, that NUMA node will be used for a CPU-resident copy of the page.
//
// If any pages in this VA range have any processors present in their
// accessed-by list, the migration and mapping policies associated with this
@@ -2300,7 +2309,7 @@ NV_STATUS UvmDisableReadDuplication(void *base,
// UvmPreventMigrationRangeGroups has not been called on the range group that
// those pages are associated with, then the migration and mapping policies
// associated with UvmEnableReadDuplication override the policies outlined
// above. Note that enabling read duplication on on any pages in this VA range
// above. Note that enabling read duplication on any pages in this VA range
// does not clear the state set by this API for those pages. It merely overrides
// the policies associated with this state until read duplication is disabled
// for those pages.
@@ -2333,7 +2342,8 @@ NV_STATUS UvmDisableReadDuplication(void *base,
// preferredCpuMemoryNode: (INPUT)
// Preferred CPU NUMA memory node used if preferredLocationUuid is the
// UUID of the CPU. -1 is a special value which indicates all CPU nodes
// allowed by the global and thread memory policies.
// allowed by the global and thread memory policies. If NUMA is disabled
// only 0 and -1 are allowed.
//
// Errors:
// NV_ERR_INVALID_ADDRESS:
@@ -3486,7 +3496,7 @@ NvLength UvmToolsGetNumberOfCounters(void);
//
// version: (INPUT)
// Requested version for events or counters.
// See UvmEventEntry_V1 and UvmEventEntry_V2.
// See UvmToolsEventQueueVersion.
//
// event_buffer: (INPUT)
// User allocated buffer. Must be page-aligned. Must be large enough to
@@ -3510,10 +3520,16 @@ NvLength UvmToolsGetNumberOfCounters(void);
// Session handle does not refer to a valid session
//
// NV_ERR_INVALID_ARGUMENT:
// The version is not UvmEventEntry_V1 or UvmEventEntry_V2.
// The version is not UvmToolsEventQueueVersion_V1 or
// UvmToolsEventQueueVersion_V2.
// One of the parameters: event_buffer, event_buffer_size, event_control
// is not valid
//
// NV_ERR_NOT_SUPPORTED:
// The requested version queue could not be created
// (i.e., the UVM kernel driver is older and doesn't support
// UvmToolsEventQueueVersion_V2).
//
// NV_ERR_INSUFFICIENT_RESOURCES:
// There could be multiple reasons for this error. One would be that
// it's not possible to allocate a queue of requested size. Another
@@ -3966,57 +3982,51 @@ NV_STATUS UvmToolsWriteProcessMemory(UvmToolsSessionHandle session,
// version: (INPUT)
// Requested version for the UUID table returned. The version must
// match the requested version of the event queue created with
// UvmToolsCreateEventQueue().
// See UvmEventEntry_V1 and UvmEventEntry_V2.
// UvmToolsCreateEventQueue(). See UvmToolsEventQueueVersion.
// If the version of the event queue does not match the version of the
// UUID table, the behavior is undefined.
//
// table: (OUTPUT)
// Array of processor UUIDs, including the CPU's UUID which is always
// at index zero. The srcIndex and dstIndex fields of the
// UvmEventMigrationInfo struct index this array. Unused indices will
// have a UUID of zero. Version UvmEventEntry_V1 only uses GPU UUIDs
// for the UUID of the physical GPU and only supports a single SMC
// partition registered per process. Version UvmEventEntry_V2 supports
// multiple SMC partitions registered per process and uses physical GPU
// UUIDs if the GPU is not SMC capable or SMC enabled and GPU instance
// UUIDs for SMC partitions.
// The table pointer can be NULL in which case, the size of the table
// needed to hold all the UUIDs is returned in 'count'.
//
// table_size: (INPUT)
// The size of the table in number of array elements. This can be
// zero if the table pointer is NULL.
//
// count: (OUTPUT)
// On output, it is set by UVM to the number of UUIDs needed to hold
// all the UUIDs, including any gaps in the table due to unregistered
// GPUs.
// at index zero. The number of elements in the array must be greater
// or equal to UVM_MAX_PROCESSORS_V1 if the version is
// UvmToolsEventQueueVersion_V1 and UVM_MAX_PROCESSORS if the version is
// UvmToolsEventQueueVersion_V2.
// The srcIndex and dstIndex fields of the UvmEventMigrationInfo struct
// index this array. Unused indices will have a UUID of zero.
// If version is UvmToolsEventQueueVersion_V1 then the reported UUID
// will be that of the corresponding physical GPU, even if multiple SMC
// partitions are registered under that physical GPU. If version is
// UvmToolsEventQueueVersion_V2 then the reported UUID will be the GPU
// instance UUID if SMC is enabled, otherwise it will be the UUID of
// the physical GPU.
//
// Error codes:
// NV_ERR_INVALID_ADDRESS:
// writing to table failed or the count pointer was invalid.
// writing to table failed.
//
// NV_ERR_INVALID_ARGUMENT:
// The version is not UvmEventEntry_V1 or UvmEventEntry_V2.
// The count pointer is NULL.
// See UvmToolsEventQueueVersion.
// The version is not UvmToolsEventQueueVersion_V1 or
// UvmToolsEventQueueVersion_V2.
//
// NV_WARN_MISMATCHED_TARGET:
// The kernel returned a table suitable for UvmEventEntry_V1 events.
// (i.e., the kernel is older and doesn't support UvmEventEntry_V2).
// NV_ERR_NOT_SUPPORTED:
// The kernel is not able to support the requested version
// (i.e., the UVM kernel driver is older and doesn't support
// UvmToolsEventQueueVersion_V2).
//
// NV_ERR_NO_MEMORY:
// Internal memory allocation failed.
//------------------------------------------------------------------------------
#if UVM_API_REV_IS_AT_MOST(10)
NV_STATUS UvmToolsGetProcessorUuidTable(UvmToolsSessionHandle session,
NvProcessorUuid *table,
NvLength *count);
#else
#if UVM_API_REV_IS_AT_MOST(11)
NV_STATUS UvmToolsGetProcessorUuidTable(UvmToolsSessionHandle session,
UvmToolsEventQueueVersion version,
NvProcessorUuid *table,
NvLength table_size,
NvLength *count);
#else
NV_STATUS UvmToolsGetProcessorUuidTable(UvmToolsSessionHandle session,
UvmToolsEventQueueVersion version,
NvProcessorUuid *table);
#endif
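//
// Illustrative usage sketch (not part of the header; the surrounding setup is
// assumed): with the latest API revision, a tools process holding a session
// handle can request the V2 table and fall back to V1 when the kernel driver
// is older. UVM_MAX_PROCESSORS is assumed to be at least
// UVM_MAX_PROCESSORS_V1, so one array serves both layouts:
//
//     NvProcessorUuid table[UVM_MAX_PROCESSORS];
//     NV_STATUS status = UvmToolsGetProcessorUuidTable(session,
//                                                      UvmToolsEventQueueVersion_V2,
//                                                      table);
//     if (status == NV_ERR_NOT_SUPPORTED) {
//         // Older kernel driver: only the V1 table layout is available.
//         status = UvmToolsGetProcessorUuidTable(session,
//                                                UvmToolsEventQueueVersion_V1,
//                                                table);
//     }
//
// Index zero is always the CPU; the srcIndex and dstIndex fields of
// UvmEventMigrationInfo index into this table.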
//------------------------------------------------------------------------------

View File

@@ -0,0 +1,75 @@
/*******************************************************************************
Copyright (c) 2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*******************************************************************************/
#include "uvm_linux.h"
#include "uvm_global.h"
#include "uvm_gpu.h"
#include "uvm_hal.h"
#include "hwref/ampere/ga100/dev_fault.h"
static bool client_id_ce(NvU16 client_id)
{
if (client_id >= NV_PFAULT_CLIENT_HUB_HSCE0 && client_id <= NV_PFAULT_CLIENT_HUB_HSCE9)
return true;
if (client_id >= NV_PFAULT_CLIENT_HUB_HSCE10 && client_id <= NV_PFAULT_CLIENT_HUB_HSCE15)
return true;
switch (client_id) {
case NV_PFAULT_CLIENT_HUB_CE0:
case NV_PFAULT_CLIENT_HUB_CE1:
case NV_PFAULT_CLIENT_HUB_CE2:
return true;
}
return false;
}
uvm_mmu_engine_type_t uvm_hal_ampere_fault_buffer_get_mmu_engine_type(NvU16 mmu_engine_id,
uvm_fault_client_type_t client_type,
NvU16 client_id)
{
// Servicing CE and Host (HUB clients) faults.
if (client_type == UVM_FAULT_CLIENT_TYPE_HUB) {
if (client_id_ce(client_id)) {
UVM_ASSERT(mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_CE0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_CE9);
return UVM_MMU_ENGINE_TYPE_CE;
}
if (client_id == NV_PFAULT_CLIENT_HUB_HOST || client_id == NV_PFAULT_CLIENT_HUB_ESC) {
UVM_ASSERT(mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_HOST0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_HOST31);
return UVM_MMU_ENGINE_TYPE_HOST;
}
}
// We shouldn't be servicing faults from any other engines other than GR.
UVM_ASSERT_MSG(client_id <= NV_PFAULT_CLIENT_GPC_ROP_3, "Unexpected client ID: 0x%x\n", client_id);
UVM_ASSERT_MSG(mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_GRAPHICS && mmu_engine_id < NV_PFAULT_MMU_ENG_ID_BAR1,
"Unexpected engine ID: 0x%x\n",
mmu_engine_id);
UVM_ASSERT(client_type == UVM_FAULT_CLIENT_TYPE_GPC);
return UVM_MMU_ENGINE_TYPE_GRAPHICS;
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2018-2020 NVIDIA Corporation
Copyright (c) 2018-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -36,21 +36,6 @@
#include "uvm_ampere_fault_buffer.h"
#include "hwref/ampere/ga100/dev_fault.h"
uvm_mmu_engine_type_t uvm_hal_ampere_mmu_engine_id_to_type(NvU16 mmu_engine_id)
{
if (mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_HOST0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_HOST31)
return UVM_MMU_ENGINE_TYPE_HOST;
if (mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_CE0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_CE9)
return UVM_MMU_ENGINE_TYPE_CE;
// We shouldn't be servicing faults from any other engines
UVM_ASSERT_MSG(mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_GRAPHICS && mmu_engine_id < NV_PFAULT_MMU_ENG_ID_BAR1,
"Unexpected engine ID: 0x%x\n", mmu_engine_id);
return UVM_MMU_ENGINE_TYPE_GRAPHICS;
}
static NvU32 page_table_depth_ampere(NvU64 page_size)
{
// The common-case is page_size == UVM_PAGE_SIZE_2M, hence the first check

View File

@@ -0,0 +1,105 @@
/*******************************************************************************
Copyright (c) 2022-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*******************************************************************************/
#include "uvm_global.h"
#include "uvm_hal.h"
#include "uvm_gpu.h"
#include "uvm_mem.h"
#include "uvm_blackwell_fault_buffer.h"
void uvm_hal_blackwell_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
{
parent_gpu->tlb_batch.va_invalidate_supported = true;
parent_gpu->tlb_batch.va_range_invalidate_supported = true;
// TODO: Bug 1767241: Run benchmarks to figure out a good number
parent_gpu->tlb_batch.max_ranges = 8;
parent_gpu->utlb_per_gpc_count = uvm_blackwell_get_utlbs_per_gpc(parent_gpu);
parent_gpu->fault_buffer_info.replayable.utlb_count = parent_gpu->rm_info.maxGpcCount *
parent_gpu->utlb_per_gpc_count;
{
uvm_fault_buffer_entry_t *dummy;
UVM_ASSERT(parent_gpu->fault_buffer_info.replayable.utlb_count <= (1 <<
(sizeof(dummy->fault_source.utlb_id) * 8)));
}
// A single top level PDE on Blackwell covers 64 PB and that's the minimum
// size that can be used.
parent_gpu->rm_va_base = 0;
parent_gpu->rm_va_size = 64 * UVM_SIZE_1PB;
parent_gpu->uvm_mem_va_base = parent_gpu->rm_va_size + 384 * UVM_SIZE_1TB;
parent_gpu->uvm_mem_va_size = UVM_MEM_VA_SIZE;
// See uvm_mmu.h for mapping placement
parent_gpu->flat_vidmem_va_base = (64 * UVM_SIZE_1PB) + (32 * UVM_SIZE_1TB);
// TODO: Bug 3953852: Set this to true pending Blackwell changes
parent_gpu->ce_phys_vidmem_write_supported = !uvm_parent_gpu_is_coherent(parent_gpu);
parent_gpu->peer_copy_mode = g_uvm_global.peer_copy_mode;
// All GR context buffers may be mapped to 57-bit wide VAs. All "compute" units
// accessing GR context buffers support the 57-bit VA range.
parent_gpu->max_channel_va = 1ull << 57;
parent_gpu->max_host_va = 1ull << 57;
// Blackwell can map sysmem with any page size
parent_gpu->can_map_sysmem_with_large_pages = true;
// Prefetch instructions will generate faults
parent_gpu->prefetch_fault_supported = true;
// Blackwell can place GPFIFO in vidmem
parent_gpu->gpfifo_in_vidmem_supported = true;
parent_gpu->replayable_faults_supported = true;
parent_gpu->non_replayable_faults_supported = true;
parent_gpu->access_counters_supported = true;
parent_gpu->access_counters_can_use_physical_addresses = false;
parent_gpu->fault_cancel_va_supported = true;
parent_gpu->scoped_atomics_supported = true;
parent_gpu->has_clear_faulted_channel_sw_method = true;
parent_gpu->has_clear_faulted_channel_method = false;
parent_gpu->smc.supported = true;
parent_gpu->sparse_mappings_supported = true;
parent_gpu->map_remap_larger_page_promotion = false;
parent_gpu->plc_supported = true;
parent_gpu->no_ats_range_required = true;
}

View File

@@ -0,0 +1,122 @@
/*******************************************************************************
Copyright (c) 2023-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*******************************************************************************/
#include "uvm_linux.h"
#include "uvm_global.h"
#include "uvm_gpu.h"
#include "uvm_hal.h"
#include "uvm_hal_types.h"
#include "hwref/blackwell/gb100/dev_fault.h"
#include "clc369.h"
// NV_PFAULT_FAULT_TYPE_COMPRESSION_FAILURE fault type is deprecated on
// Blackwell.
uvm_fault_type_t uvm_hal_blackwell_fault_buffer_get_fault_type(const NvU32 *fault_entry)
{
NvU32 hw_fault_type_value = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, FAULT_TYPE);
switch (hw_fault_type_value) {
case NV_PFAULT_FAULT_TYPE_PDE:
return UVM_FAULT_TYPE_INVALID_PDE;
case NV_PFAULT_FAULT_TYPE_PTE:
return UVM_FAULT_TYPE_INVALID_PTE;
case NV_PFAULT_FAULT_TYPE_RO_VIOLATION:
return UVM_FAULT_TYPE_WRITE;
case NV_PFAULT_FAULT_TYPE_ATOMIC_VIOLATION:
return UVM_FAULT_TYPE_ATOMIC;
case NV_PFAULT_FAULT_TYPE_WO_VIOLATION:
return UVM_FAULT_TYPE_READ;
case NV_PFAULT_FAULT_TYPE_PDE_SIZE:
return UVM_FAULT_TYPE_PDE_SIZE;
case NV_PFAULT_FAULT_TYPE_VA_LIMIT_VIOLATION:
return UVM_FAULT_TYPE_VA_LIMIT_VIOLATION;
case NV_PFAULT_FAULT_TYPE_UNBOUND_INST_BLOCK:
return UVM_FAULT_TYPE_UNBOUND_INST_BLOCK;
case NV_PFAULT_FAULT_TYPE_PRIV_VIOLATION:
return UVM_FAULT_TYPE_PRIV_VIOLATION;
case NV_PFAULT_FAULT_TYPE_PITCH_MASK_VIOLATION:
return UVM_FAULT_TYPE_PITCH_MASK_VIOLATION;
case NV_PFAULT_FAULT_TYPE_WORK_CREATION:
return UVM_FAULT_TYPE_WORK_CREATION;
case NV_PFAULT_FAULT_TYPE_UNSUPPORTED_APERTURE:
return UVM_FAULT_TYPE_UNSUPPORTED_APERTURE;
case NV_PFAULT_FAULT_TYPE_CC_VIOLATION:
return UVM_FAULT_TYPE_CC_VIOLATION;
case NV_PFAULT_FAULT_TYPE_UNSUPPORTED_KIND:
return UVM_FAULT_TYPE_UNSUPPORTED_KIND;
case NV_PFAULT_FAULT_TYPE_REGION_VIOLATION:
return UVM_FAULT_TYPE_REGION_VIOLATION;
case NV_PFAULT_FAULT_TYPE_POISONED:
return UVM_FAULT_TYPE_POISONED;
}
UVM_ASSERT_MSG(false, "Invalid fault type value: %d\n", hw_fault_type_value);
return UVM_FAULT_TYPE_COUNT;
}
static bool client_id_ce(NvU16 client_id)
{
if (client_id >= NV_PFAULT_CLIENT_HUB_HSCE0 && client_id <= NV_PFAULT_CLIENT_HUB_HSCE7)
return true;
switch (client_id) {
case NV_PFAULT_CLIENT_HUB_CE0:
case NV_PFAULT_CLIENT_HUB_CE1:
case NV_PFAULT_CLIENT_HUB_CE2:
case NV_PFAULT_CLIENT_HUB_CE3:
return true;
}
return false;
}
uvm_mmu_engine_type_t uvm_hal_blackwell_fault_buffer_get_mmu_engine_type(NvU16 mmu_engine_id,
uvm_fault_client_type_t client_type,
NvU16 client_id)
{
// Servicing CE and Host (HUB clients) faults.
if (client_type == UVM_FAULT_CLIENT_TYPE_HUB) {
if (client_id_ce(client_id)) {
UVM_ASSERT(mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_CE0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_CE19);
return UVM_MMU_ENGINE_TYPE_CE;
}
if (client_id == NV_PFAULT_CLIENT_HUB_HOST ||
(client_id >= NV_PFAULT_CLIENT_HUB_ESC0 && client_id <= NV_PFAULT_CLIENT_HUB_ESC11)) {
UVM_ASSERT((mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_HOST0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_HOST44) ||
(mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_GRAPHICS));
return UVM_MMU_ENGINE_TYPE_HOST;
}
}
// We shouldn't be servicing faults from any other engines other than GR.
UVM_ASSERT_MSG(client_id <= NV_PFAULT_CLIENT_GPC_ROP_3, "Unexpected client ID: 0x%x\n", client_id);
UVM_ASSERT_MSG(mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_GRAPHICS, "Unexpected engine ID: 0x%x\n", mmu_engine_id);
UVM_ASSERT(client_type == UVM_FAULT_CLIENT_TYPE_GPC);
return UVM_MMU_ENGINE_TYPE_GRAPHICS;
}

View File

@@ -0,0 +1,92 @@
/*******************************************************************************
Copyright (c) 2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*******************************************************************************/
#ifndef __UVM_HAL_BLACKWELL_FAULT_BUFFER_H__
#define __UVM_HAL_BLACKWELL_FAULT_BUFFER_H__
#include "nvtypes.h"
#include "uvm_common.h"
#include "uvm_gpu.h"
// There are up to 10 TPCs per GPC in Blackwell, and there are 2 LTP uTLBs per
// TPC. In addition, there is one active RGG uTLB per GPC. Each TPC has a number of
// clients that can make requests to its uTLBs: 1xTPCCS, 1xPE, 2xT1. Requests
// from these units are routed as follows to the 2 LTP uTLBs:
//
// --------                     ---------
// | T1_0 | ------------------> | uTLB0 |
// --------                     ---------
//
// --------                     ---------
// | T1_1 | ------------------> | uTLB1 |
// --------              -----> ---------
//                       |          ^
// --------              |          |
// |  PE  | -------------+          |
// --------                         |
//                                  |
// ---------                        |
// | TPCCS | -----------------------+
// ---------
//
//
// The client ids are local to their GPC and the id mapping is linear across
// TPCs: TPC_n has TPCCS_n, PE_n, T1_p, and T1_q, where p=2*n and q=p+1.
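// For example, TPC_3 has clients TPCCS_3, PE_3, T1_6 and T1_7: T1_6 feeds LTP
// uTLB 6 on its own, while T1_7, PE_3 and TPCCS_3 all share LTP uTLB 7 (see
// uvm_hal_blackwell_mmu_client_id_to_utlb_id() in uvm_blackwell_mmu.c).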
//
// NV_PFAULT_CLIENT_GPC_LTP_UTLB_n and NV_PFAULT_CLIENT_GPC_RGG_UTLB enums can
// be ignored. These will never be reported in a fault message, and should
// never be used in an invalidate. Therefore, we define our own values.
typedef enum {
UVM_BLACKWELL_GPC_UTLB_ID_RGG = 0,
UVM_BLACKWELL_GPC_UTLB_ID_LTP0 = 1,
UVM_BLACKWELL_GPC_UTLB_ID_LTP1 = 2,
UVM_BLACKWELL_GPC_UTLB_ID_LTP2 = 3,
UVM_BLACKWELL_GPC_UTLB_ID_LTP3 = 4,
UVM_BLACKWELL_GPC_UTLB_ID_LTP4 = 5,
UVM_BLACKWELL_GPC_UTLB_ID_LTP5 = 6,
UVM_BLACKWELL_GPC_UTLB_ID_LTP6 = 7,
UVM_BLACKWELL_GPC_UTLB_ID_LTP7 = 8,
UVM_BLACKWELL_GPC_UTLB_ID_LTP8 = 9,
UVM_BLACKWELL_GPC_UTLB_ID_LTP9 = 10,
UVM_BLACKWELL_GPC_UTLB_ID_LTP10 = 11,
UVM_BLACKWELL_GPC_UTLB_ID_LTP11 = 12,
UVM_BLACKWELL_GPC_UTLB_ID_LTP12 = 13,
UVM_BLACKWELL_GPC_UTLB_ID_LTP13 = 14,
UVM_BLACKWELL_GPC_UTLB_ID_LTP14 = 15,
UVM_BLACKWELL_GPC_UTLB_ID_LTP15 = 16,
UVM_BLACKWELL_GPC_UTLB_ID_LTP16 = 17,
UVM_BLACKWELL_GPC_UTLB_ID_LTP17 = 18,
UVM_BLACKWELL_GPC_UTLB_ID_LTP18 = 19,
UVM_BLACKWELL_GPC_UTLB_ID_LTP19 = 20,
UVM_BLACKWELL_GPC_UTLB_COUNT,
} uvm_blackwell_gpc_utlb_id_t;
static NvU32 uvm_blackwell_get_utlbs_per_gpc(uvm_parent_gpu_t *parent_gpu)
{
NvU32 utlbs = parent_gpu->rm_info.maxTpcPerGpcCount * 2 + 1;
UVM_ASSERT(utlbs <= UVM_BLACKWELL_GPC_UTLB_COUNT);
return utlbs;
}
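// Worked example (illustrative): with rm_info.maxTpcPerGpcCount == 10, the
// helper above returns 10 * 2 + 1 == 21, which is exactly
// UVM_BLACKWELL_GPC_UTLB_COUNT (20 LTP uTLBs plus the single RGG uTLB).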
#endif

View File

@@ -0,0 +1,256 @@
/*******************************************************************************
Copyright (c) 2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*******************************************************************************/
#include "uvm_hal.h"
#include "uvm_push.h"
#include "uvm_push_macros.h"
#include "clc96f.h"
// TODO: Bug 3210931: Rename HOST references and files to ESCHED.
void uvm_hal_blackwell_host_tlb_invalidate_all(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
uvm_membar_t membar)
{
NvU32 aperture_value;
NvU32 page_table_level;
NvU32 pdb_lo;
NvU32 pdb_hi;
NvU32 ack_value = 0;
NvU32 sysmembar_value = 0;
UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
if (pdb.aperture == UVM_APERTURE_VID)
aperture_value = HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, VID_MEM);
else
aperture_value = HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, SYS_MEM_COHERENT);
UVM_ASSERT_MSG(IS_ALIGNED(pdb.address, 1 << 12), "pdb 0x%llx\n", pdb.address);
pdb.address >>= 12;
pdb_lo = pdb.address & HWMASK(C96F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
pdb_hi = pdb.address >> HWSIZE(C96F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
// PDE4 is the highest level on Blackwell, see the comment in
// uvm_blackwell_mmu.c for details.
UVM_ASSERT_MSG(depth < NVC96F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE4, "depth %u", depth);
page_table_level = NVC96F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE4 - depth;
if (membar != UVM_MEMBAR_NONE)
ack_value = HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
if (membar == UVM_MEMBAR_SYS)
sysmembar_value = HWCONST(C96F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
else
sysmembar_value = HWCONST(C96F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
NV_PUSH_4U(C96F, MEM_OP_A, sysmembar_value |
HWCONST(C96F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS),
MEM_OP_B, 0,
MEM_OP_C, HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
HWVALUE(C96F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO, pdb_lo) |
HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_GPC, ENABLE) |
HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
HWVALUE(C96F, MEM_OP_C, TLB_INVALIDATE_PAGE_TABLE_LEVEL, page_table_level) |
aperture_value |
ack_value,
MEM_OP_D, HWCONST(C96F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE) |
HWVALUE(C96F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
}
void uvm_hal_blackwell_host_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU64 page_size,
uvm_membar_t membar)
{
NvU32 aperture_value;
NvU32 page_table_level;
NvU32 pdb_lo;
NvU32 pdb_hi;
NvU32 ack_value = 0;
NvU32 sysmembar_value = 0;
NvU32 va_lo;
NvU32 va_hi;
NvU64 end;
NvU64 actual_base;
NvU64 actual_size;
NvU64 actual_end;
NvU32 log2_invalidation_size;
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%llx\n", page_size);
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%llx\n", base, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%llx\n", size, page_size);
UVM_ASSERT_MSG(size > 0, "size 0x%llx\n", size);
// The invalidation size must be a power-of-two number of pages containing
// the passed interval
end = base + size - 1;
log2_invalidation_size = __fls((unsigned long)(end ^ base)) + 1;
if (log2_invalidation_size == 64) {
// Invalidate everything
gpu->parent->host_hal->tlb_invalidate_all(push, pdb, depth, membar);
return;
}
// The hardware aligns the target address down to the invalidation size.
actual_size = 1ULL << log2_invalidation_size;
actual_base = UVM_ALIGN_DOWN(base, actual_size);
actual_end = actual_base + actual_size - 1;
UVM_ASSERT(actual_end >= end);
// The invalidation size field expects log2(invalidation size in 4K), not
// log2(invalidation size in bytes)
log2_invalidation_size -= 12;
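// Worked example (illustrative): base == 0x200000 and size == 0x3000 give
// end == 0x202fff and (end ^ base) == 0x2fff, so __fls() returns 13 and the
// invalidation covers 1 << 14 bytes (16KB). 0x200000 is already 16KB-aligned,
// so the invalidated range [0x200000, 0x203fff] contains the requested
// interval, and after the subtraction above the programmed field value is 2.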
// Address to invalidate, as a multiple of 4K.
base >>= 12;
va_lo = base & HWMASK(C96F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
va_hi = base >> HWSIZE(C96F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
if (pdb.aperture == UVM_APERTURE_VID)
aperture_value = HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, VID_MEM);
else
aperture_value = HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, SYS_MEM_COHERENT);
UVM_ASSERT_MSG(IS_ALIGNED(pdb.address, 1 << 12), "pdb 0x%llx\n", pdb.address);
pdb.address >>= 12;
pdb_lo = pdb.address & HWMASK(C96F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
pdb_hi = pdb.address >> HWSIZE(C96F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
// PDE4 is the highest level on Blackwell, see the comment in
// uvm_blackwell_mmu.c for details.
UVM_ASSERT_MSG(depth < NVC96F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE4, "depth %u", depth);
page_table_level = NVC96F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE4 - depth;
if (membar != UVM_MEMBAR_NONE)
ack_value = HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
if (membar == UVM_MEMBAR_SYS)
sysmembar_value = HWCONST(C96F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
else
sysmembar_value = HWCONST(C96F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
NV_PUSH_4U(C96F, MEM_OP_A, HWVALUE(C96F, MEM_OP_A, TLB_INVALIDATE_INVALIDATION_SIZE, log2_invalidation_size) |
sysmembar_value |
HWCONST(C96F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS) |
HWVALUE(C96F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo),
MEM_OP_B, HWVALUE(C96F, MEM_OP_B, TLB_INVALIDATE_TARGET_ADDR_HI, va_hi),
MEM_OP_C, HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
HWVALUE(C96F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO, pdb_lo) |
HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_GPC, ENABLE) |
HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
HWVALUE(C96F, MEM_OP_C, TLB_INVALIDATE_PAGE_TABLE_LEVEL, page_table_level) |
aperture_value |
ack_value,
MEM_OP_D, HWCONST(C96F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE_TARGETED) |
HWVALUE(C96F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
}
void uvm_hal_blackwell_host_tlb_invalidate_test(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
UVM_TEST_INVALIDATE_TLB_PARAMS *params)
{
NvU32 ack_value = 0;
NvU32 sysmembar_value = 0;
NvU32 invalidate_gpc_value = 0;
NvU32 aperture_value = 0;
NvU32 pdb_lo = 0;
NvU32 pdb_hi = 0;
NvU32 page_table_level = 0;
UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
if (pdb.aperture == UVM_APERTURE_VID)
aperture_value = HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, VID_MEM);
else
aperture_value = HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, SYS_MEM_COHERENT);
UVM_ASSERT_MSG(IS_ALIGNED(pdb.address, 1 << 12), "pdb 0x%llx\n", pdb.address);
pdb.address >>= 12;
pdb_lo = pdb.address & HWMASK(C96F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
pdb_hi = pdb.address >> HWSIZE(C96F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
if (params->page_table_level != UvmInvalidatePageTableLevelAll) {
// PDE4 is the highest level on Blackwell, see the comment in
// uvm_blackwell_mmu.c for details.
page_table_level = min((NvU32)UvmInvalidatePageTableLevelPde4, params->page_table_level) - 1;
}
if (params->membar != UvmInvalidateTlbMemBarNone)
ack_value = HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
if (params->membar == UvmInvalidateTlbMemBarSys)
sysmembar_value = HWCONST(C96F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
else
sysmembar_value = HWCONST(C96F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
if (params->disable_gpc_invalidate)
invalidate_gpc_value = HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_GPC, DISABLE);
else
invalidate_gpc_value = HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_GPC, ENABLE);
if (params->target_va_mode == UvmTargetVaModeTargeted) {
NvU64 va = params->va >> 12;
NvU32 va_lo = va & HWMASK(C96F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
NvU32 va_hi = va >> HWSIZE(C96F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
NV_PUSH_4U(C96F, MEM_OP_A, sysmembar_value |
HWCONST(C96F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS) |
HWVALUE(C96F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo),
MEM_OP_B, HWVALUE(C96F, MEM_OP_B, TLB_INVALIDATE_TARGET_ADDR_HI, va_hi),
MEM_OP_C, HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
HWVALUE(C96F, MEM_OP_C, TLB_INVALIDATE_PAGE_TABLE_LEVEL, page_table_level) |
HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
HWVALUE(C96F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO, pdb_lo) |
invalidate_gpc_value |
aperture_value |
ack_value,
MEM_OP_D, HWCONST(C96F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE_TARGETED) |
HWVALUE(C96F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
}
else {
NV_PUSH_4U(C96F, MEM_OP_A, sysmembar_value |
HWCONST(C96F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS),
MEM_OP_B, 0,
MEM_OP_C, HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
HWVALUE(C96F, MEM_OP_C, TLB_INVALIDATE_PAGE_TABLE_LEVEL, page_table_level) |
HWCONST(C96F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
HWVALUE(C96F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO, pdb_lo) |
invalidate_gpc_value |
aperture_value |
ack_value,
MEM_OP_D, HWCONST(C96F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE) |
HWVALUE(C96F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
}
}

View File

@@ -0,0 +1,165 @@
/*******************************************************************************
Copyright (c) 2022-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*******************************************************************************/
// On Blackwell, the UVM page tree 'depth' maps to hardware as follows:
//
// UVM depth   HW level                            VA bits
//     0       PDE4                                56:56
//     1       PDE3                                55:47
//     2       PDE2 (or 256G PTE)                  46:38
//     3       PDE1 (or 512M PTE)                  37:29
//     4       PDE0 (dual 64K/4K PDE, or 2M PTE)   28:21
//     5       PTE_64K / PTE_4K                    20:16 / 20:12
#include "uvm_types.h"
#include "uvm_global.h"
#include "uvm_hal.h"
#include "uvm_hal_types.h"
#include "uvm_blackwell_fault_buffer.h"
#include "hwref/blackwell/gb100/dev_fault.h"
#include "hwref/blackwell/gb100/dev_mmu.h"
static uvm_mmu_mode_hal_t blackwell_mmu_mode_hal;
static NvU32 page_table_depth_blackwell(NvU64 page_size)
{
switch (page_size) {
case UVM_PAGE_SIZE_2M:
return 4;
case UVM_PAGE_SIZE_512M:
return 3;
case UVM_PAGE_SIZE_256G:
return 2;
default:
return 5;
}
}
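// For example, page_table_depth_blackwell(UVM_PAGE_SIZE_256G) returns 2,
// matching the "PDE2 (or 256G PTE)" row of the depth table at the top of this
// file, while 64K and 4K pages fall through to the default depth of 5, the
// leaf PTE level.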
static NvU64 page_sizes_blackwell(void)
{
return UVM_PAGE_SIZE_256G | UVM_PAGE_SIZE_512M | UVM_PAGE_SIZE_2M | UVM_PAGE_SIZE_64K | UVM_PAGE_SIZE_4K;
}
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_blackwell(NvU64 big_page_size)
{
static bool initialized = false;
UVM_ASSERT(big_page_size == UVM_PAGE_SIZE_64K || big_page_size == UVM_PAGE_SIZE_128K);
// TODO: Bug 1789555: RM should reject the creation of GPU VA spaces with
// 128K big page size for Pascal+ GPUs
if (big_page_size == UVM_PAGE_SIZE_128K)
return NULL;
if (!initialized) {
uvm_mmu_mode_hal_t *hopper_mmu_mode_hal = uvm_hal_mmu_mode_hopper(big_page_size);
UVM_ASSERT(hopper_mmu_mode_hal);
// The assumption made is that arch_hal->mmu_mode_hal() will be called
// under the global lock the first time, so check it here.
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
blackwell_mmu_mode_hal = *hopper_mmu_mode_hal;
blackwell_mmu_mode_hal.page_table_depth = page_table_depth_blackwell;
blackwell_mmu_mode_hal.page_sizes = page_sizes_blackwell;
initialized = true;
}
return &blackwell_mmu_mode_hal;
}
NvU16 uvm_hal_blackwell_mmu_client_id_to_utlb_id(NvU16 client_id)
{
switch (client_id) {
case NV_PFAULT_CLIENT_GPC_RAST:
case NV_PFAULT_CLIENT_GPC_GCC:
case NV_PFAULT_CLIENT_GPC_GPCCS:
return UVM_BLACKWELL_GPC_UTLB_ID_RGG;
case NV_PFAULT_CLIENT_GPC_T1_0:
return UVM_BLACKWELL_GPC_UTLB_ID_LTP0;
case NV_PFAULT_CLIENT_GPC_T1_1:
case NV_PFAULT_CLIENT_GPC_PE_0:
case NV_PFAULT_CLIENT_GPC_TPCCS_0:
return UVM_BLACKWELL_GPC_UTLB_ID_LTP1;
case NV_PFAULT_CLIENT_GPC_T1_2:
return UVM_BLACKWELL_GPC_UTLB_ID_LTP2;
case NV_PFAULT_CLIENT_GPC_T1_3:
case NV_PFAULT_CLIENT_GPC_PE_1:
case NV_PFAULT_CLIENT_GPC_TPCCS_1:
return UVM_BLACKWELL_GPC_UTLB_ID_LTP3;
case NV_PFAULT_CLIENT_GPC_T1_4:
return UVM_BLACKWELL_GPC_UTLB_ID_LTP4;
case NV_PFAULT_CLIENT_GPC_T1_5:
case NV_PFAULT_CLIENT_GPC_PE_2:
case NV_PFAULT_CLIENT_GPC_TPCCS_2:
return UVM_BLACKWELL_GPC_UTLB_ID_LTP5;
case NV_PFAULT_CLIENT_GPC_T1_6:
return UVM_BLACKWELL_GPC_UTLB_ID_LTP6;
case NV_PFAULT_CLIENT_GPC_T1_7:
case NV_PFAULT_CLIENT_GPC_PE_3:
case NV_PFAULT_CLIENT_GPC_TPCCS_3:
return UVM_BLACKWELL_GPC_UTLB_ID_LTP7;
case NV_PFAULT_CLIENT_GPC_T1_8:
return UVM_BLACKWELL_GPC_UTLB_ID_LTP8;
case NV_PFAULT_CLIENT_GPC_T1_9:
case NV_PFAULT_CLIENT_GPC_PE_4:
case NV_PFAULT_CLIENT_GPC_TPCCS_4:
return UVM_BLACKWELL_GPC_UTLB_ID_LTP9;
case NV_PFAULT_CLIENT_GPC_T1_10:
return UVM_BLACKWELL_GPC_UTLB_ID_LTP10;
case NV_PFAULT_CLIENT_GPC_T1_11:
case NV_PFAULT_CLIENT_GPC_PE_5:
case NV_PFAULT_CLIENT_GPC_TPCCS_5:
return UVM_BLACKWELL_GPC_UTLB_ID_LTP11;
case NV_PFAULT_CLIENT_GPC_T1_12:
return UVM_BLACKWELL_GPC_UTLB_ID_LTP12;
case NV_PFAULT_CLIENT_GPC_T1_13:
case NV_PFAULT_CLIENT_GPC_PE_6:
case NV_PFAULT_CLIENT_GPC_TPCCS_6:
return UVM_BLACKWELL_GPC_UTLB_ID_LTP13;
case NV_PFAULT_CLIENT_GPC_T1_14:
return UVM_BLACKWELL_GPC_UTLB_ID_LTP14;
case NV_PFAULT_CLIENT_GPC_T1_15:
case NV_PFAULT_CLIENT_GPC_PE_7:
case NV_PFAULT_CLIENT_GPC_TPCCS_7:
return UVM_BLACKWELL_GPC_UTLB_ID_LTP15;
case NV_PFAULT_CLIENT_GPC_T1_16:
return UVM_BLACKWELL_GPC_UTLB_ID_LTP16;
case NV_PFAULT_CLIENT_GPC_T1_17:
case NV_PFAULT_CLIENT_GPC_PE_8:
case NV_PFAULT_CLIENT_GPC_TPCCS_8:
return UVM_BLACKWELL_GPC_UTLB_ID_LTP17;
case NV_PFAULT_CLIENT_GPC_T1_18:
return UVM_BLACKWELL_GPC_UTLB_ID_LTP18;
case NV_PFAULT_CLIENT_GPC_T1_19:
case NV_PFAULT_CLIENT_GPC_PE_9:
case NV_PFAULT_CLIENT_GPC_TPCCS_9:
return UVM_BLACKWELL_GPC_UTLB_ID_LTP19;
default:
UVM_ASSERT_MSG(false, "Invalid client value: 0x%x\n", client_id);
}
return 0;
}

View File

@@ -361,7 +361,6 @@ static NV_STATUS channel_reserve_and_lock_in_pool(uvm_channel_pool_t *pool, uvm_
NV_STATUS status;
uvm_channel_update_progress(channel);
index = uvm_channel_index_in_pool(channel);
channel_pool_lock(pool);
@@ -493,25 +492,20 @@ static NvU32 channel_get_available_push_info_index(uvm_channel_t *channel)
static void channel_semaphore_gpu_encrypt_payload(uvm_push_t *push, NvU64 semaphore_va)
{
NvU32 iv_index;
uvm_gpu_address_t notifier_gpu_va;
uvm_gpu_address_t auth_tag_gpu_va;
uvm_gpu_address_t semaphore_gpu_va;
uvm_gpu_address_t encrypted_payload_gpu_va;
uvm_gpu_t *gpu = push->gpu;
uvm_channel_t *channel = push->channel;
uvm_gpu_semaphore_t *semaphore = &channel->tracking_sem.semaphore;
uvm_gpu_address_t notifier_gpu_va = uvm_gpu_semaphore_get_notifier_gpu_va(semaphore);
uvm_gpu_address_t auth_tag_gpu_va = uvm_gpu_semaphore_get_auth_tag_gpu_va(semaphore);
uvm_gpu_address_t encrypted_payload_gpu_va = uvm_gpu_semaphore_get_encrypted_payload_gpu_va(semaphore);
uvm_gpu_address_t semaphore_gpu_va = uvm_gpu_address_virtual(semaphore_va);
UvmCslIv *iv_cpu_addr = semaphore->conf_computing.ivs;
NvU32 payload_size = sizeof(*semaphore->payload);
NvU32 payload_size = sizeof(*uvm_gpu_semaphore_get_encrypted_payload_cpu_va(semaphore));
NvU32 *last_pushed_notifier = &semaphore->conf_computing.last_pushed_notifier;
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
UVM_ASSERT(uvm_channel_is_ce(channel));
encrypted_payload_gpu_va = uvm_rm_mem_get_gpu_va(semaphore->conf_computing.encrypted_payload, gpu, false);
notifier_gpu_va = uvm_rm_mem_get_gpu_va(semaphore->conf_computing.notifier, gpu, false);
auth_tag_gpu_va = uvm_rm_mem_get_gpu_va(semaphore->conf_computing.auth_tag, gpu, false);
semaphore_gpu_va = uvm_gpu_address_virtual(semaphore_va);
iv_index = ((*last_pushed_notifier + 2) / 2) % channel->num_gpfifo_entries;
uvm_conf_computing_log_gpu_encryption(channel, &iv_cpu_addr[iv_index]);
@@ -1710,59 +1704,24 @@ static void free_conf_computing_buffers(uvm_channel_t *channel)
channel->conf_computing.static_pb_protected_sysmem = NULL;
channel->conf_computing.push_crypto_bundles = NULL;
uvm_rm_mem_free(channel->tracking_sem.semaphore.conf_computing.encrypted_payload);
uvm_rm_mem_free(channel->tracking_sem.semaphore.conf_computing.notifier);
uvm_rm_mem_free(channel->tracking_sem.semaphore.conf_computing.auth_tag);
uvm_kvfree(channel->tracking_sem.semaphore.conf_computing.ivs);
channel->tracking_sem.semaphore.conf_computing.encrypted_payload = NULL;
channel->tracking_sem.semaphore.conf_computing.notifier = NULL;
channel->tracking_sem.semaphore.conf_computing.auth_tag = NULL;
channel->tracking_sem.semaphore.conf_computing.ivs = NULL;
}
static NV_STATUS alloc_conf_computing_buffers_semaphore(uvm_channel_t *channel)
{
uvm_gpu_semaphore_t *semaphore = &channel->tracking_sem.semaphore;
uvm_gpu_t *gpu = uvm_channel_get_gpu(channel);
NV_STATUS status;
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
UVM_ASSERT(uvm_channel_is_ce(channel));
status = uvm_rm_mem_alloc_and_map_cpu(gpu,
UVM_RM_MEM_TYPE_SYS,
sizeof(semaphore->conf_computing.last_pushed_notifier),
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
&semaphore->conf_computing.notifier);
if (status != NV_OK)
return status;
status = uvm_rm_mem_alloc_and_map_cpu(gpu,
UVM_RM_MEM_TYPE_SYS,
sizeof(*channel->tracking_sem.semaphore.payload),
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
&semaphore->conf_computing.encrypted_payload);
if (status != NV_OK)
return status;
status = uvm_rm_mem_alloc_and_map_cpu(gpu,
UVM_RM_MEM_TYPE_SYS,
UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
&semaphore->conf_computing.auth_tag);
if (status != NV_OK)
return status;
semaphore->conf_computing.ivs = uvm_kvmalloc_zero(sizeof(*semaphore->conf_computing.ivs)
* channel->num_gpfifo_entries);
* channel->num_gpfifo_entries);
if (!semaphore->conf_computing.ivs)
return NV_ERR_NO_MEMORY;
return status;
return NV_OK;
}
static NV_STATUS alloc_conf_computing_buffers_wlc(uvm_channel_t *channel)
@@ -2380,24 +2339,41 @@ static NV_STATUS channel_pool_add(uvm_channel_manager_t *channel_manager,
return status;
}
static bool ce_usable_for_channel_type(uvm_channel_type_t type, const UvmGpuCopyEngineCaps *cap)
static bool ce_is_usable(const UvmGpuCopyEngineCaps *cap)
{
if (!cap->supported || cap->grce)
return false;
return cap->supported && !cap->grce;
}
switch (type) {
case UVM_CHANNEL_TYPE_CPU_TO_GPU:
case UVM_CHANNEL_TYPE_GPU_TO_CPU:
return cap->sysmem;
case UVM_CHANNEL_TYPE_GPU_INTERNAL:
case UVM_CHANNEL_TYPE_MEMOPS:
return true;
case UVM_CHANNEL_TYPE_GPU_TO_GPU:
return cap->p2p;
default:
UVM_ASSERT_MSG(false, "Unexpected channel type 0x%x\n", type);
return false;
// Check that all asynchronous CEs are usable, and that there is at least one
// such CE.
static NV_STATUS ces_validate(uvm_channel_manager_t *manager, const UvmGpuCopyEngineCaps *ces_caps)
{
unsigned ce;
bool found_usable_ce = false;
for (ce = 0; ce < UVM_COPY_ENGINE_COUNT_MAX; ++ce) {
const UvmGpuCopyEngineCaps *ce_caps = ces_caps + ce;
if (!ce_is_usable(ce_caps))
continue;
found_usable_ce = true;
// All channels may need to release their semaphore to sysmem.
// All CEs are expected to have the sysmem flag set.
if (!ce_caps->sysmem)
return NV_ERR_NOT_SUPPORTED;
// While P2P capabilities are only required for transfers between GPUs,
// in practice all CEs are expected to have the corresponding flag set.
if (!ce_caps->p2p)
return NV_ERR_NOT_SUPPORTED;
}
if (!found_usable_ce)
return NV_ERR_NOT_SUPPORTED;
return NV_OK;
}
static unsigned ce_usage_count(NvU32 ce, const unsigned *preferred_ce)
@@ -2426,15 +2402,13 @@ static int compare_ce_for_channel_type(const UvmGpuCopyEngineCaps *ce_caps,
const UvmGpuCopyEngineCaps *cap0 = ce_caps + ce_index0;
const UvmGpuCopyEngineCaps *cap1 = ce_caps + ce_index1;
UVM_ASSERT(ce_usable_for_channel_type(type, cap0));
UVM_ASSERT(ce_usable_for_channel_type(type, cap1));
UVM_ASSERT(ce_index0 < UVM_COPY_ENGINE_COUNT_MAX);
UVM_ASSERT(ce_index1 < UVM_COPY_ENGINE_COUNT_MAX);
UVM_ASSERT(ce_index0 != ce_index1);
switch (type) {
// For CPU to GPU fast sysmem read is the most important
case UVM_CHANNEL_TYPE_CPU_TO_GPU:
// For CPU to GPU fast sysmem read is the most important
if (cap0->sysmemRead != cap1->sysmemRead)
return cap1->sysmemRead - cap0->sysmemRead;
@@ -2444,8 +2418,8 @@ static int compare_ce_for_channel_type(const UvmGpuCopyEngineCaps *ce_caps,
break;
// For GPU to CPU fast sysmem write is the most important
case UVM_CHANNEL_TYPE_GPU_TO_CPU:
// For GPU to CPU fast sysmem write is the most important
if (cap0->sysmemWrite != cap1->sysmemWrite)
return cap1->sysmemWrite - cap0->sysmemWrite;
@@ -2455,8 +2429,8 @@ static int compare_ce_for_channel_type(const UvmGpuCopyEngineCaps *ce_caps,
break;
// For GPU to GPU prefer the LCE with the most PCEs
case UVM_CHANNEL_TYPE_GPU_TO_GPU:
// Prefer the LCE with the most PCEs
{
int pce_diff = (int)hweight32(cap1->cePceMask) - (int)hweight32(cap0->cePceMask);
@@ -2466,10 +2440,10 @@ static int compare_ce_for_channel_type(const UvmGpuCopyEngineCaps *ce_caps,
break;
// For GPU_INTERNAL we want the max possible bandwidth for CEs. For now
// assume that the number of PCEs is a good measure.
// TODO: Bug 1735254: Add a direct CE query for local FB bandwidth
case UVM_CHANNEL_TYPE_GPU_INTERNAL:
// We want the max possible bandwidth for CEs used for GPU_INTERNAL,
// for now assume that the number of PCEs is a good measure.
// TODO: Bug 1735254: Add a direct CE query for local FB bandwidth
{
int pce_diff = (int)hweight32(cap1->cePceMask) - (int)hweight32(cap0->cePceMask);
@@ -2483,11 +2457,15 @@ static int compare_ce_for_channel_type(const UvmGpuCopyEngineCaps *ce_caps,
break;
// For MEMOPS we mostly care about latency which should be better with
// less used CEs (although we only know about our own usage and not
// system-wide) so just break out to get the default ordering which
// prioritizes usage count.
case UVM_CHANNEL_TYPE_MEMOPS:
// For MEMOPS we mostly care about latency which should be better
// with less used CEs (although we only know about our own usage and
// not system-wide) so just break out to get the default ordering
// which prioritizes usage count.
// For WLC we only care about using a dedicated CE, which requires
// knowing the global CE mappings. For now just rely on the default
// ordering, which results on selecting an unused CE (if available).
case UVM_CHANNEL_TYPE_WLC:
break;
default:
@@ -2510,54 +2488,104 @@ static int compare_ce_for_channel_type(const UvmGpuCopyEngineCaps *ce_caps,
return ce_index0 - ce_index1;
}
// Identify usable CEs, and select the preferred CE for a given channel type.
static NV_STATUS pick_ce_for_channel_type(uvm_channel_manager_t *manager,
const UvmGpuCopyEngineCaps *ce_caps,
uvm_channel_type_t type,
unsigned *preferred_ce)
// Select the preferred CE for the given channel types.
static void pick_ces_for_channel_types(uvm_channel_manager_t *manager,
const UvmGpuCopyEngineCaps *ce_caps,
uvm_channel_type_t *channel_types,
unsigned num_channel_types,
unsigned *preferred_ce)
{
NvU32 i;
NvU32 best_ce = UVM_COPY_ENGINE_COUNT_MAX;
unsigned i;
UVM_ASSERT(type < UVM_CHANNEL_TYPE_CE_COUNT);
// In Confidential Computing, do not mark all usable CEs, only the preferred
// ones, because non-preferred CE channels are guaranteed to not be used.
bool mark_all_usable_ces = !g_uvm_global.conf_computing_enabled;
for (i = 0; i < UVM_COPY_ENGINE_COUNT_MAX; ++i) {
const UvmGpuCopyEngineCaps *cap = ce_caps + i;
for (i = 0; i < num_channel_types; ++i) {
unsigned ce;
unsigned best_ce = UVM_COPY_ENGINE_COUNT_MAX;
uvm_channel_type_t type = channel_types[i];
if (!ce_usable_for_channel_type(type, cap))
continue;
for (ce = 0; ce < UVM_COPY_ENGINE_COUNT_MAX; ++ce) {
if (!ce_is_usable(ce_caps + ce))
continue;
__set_bit(i, manager->ce_mask);
if (mark_all_usable_ces)
__set_bit(ce, manager->ce_mask);
if (best_ce == UVM_COPY_ENGINE_COUNT_MAX) {
best_ce = i;
continue;
if (best_ce == UVM_COPY_ENGINE_COUNT_MAX) {
best_ce = ce;
continue;
}
if (compare_ce_for_channel_type(ce_caps, type, ce, best_ce, preferred_ce) < 0)
best_ce = ce;
}
if (compare_ce_for_channel_type(ce_caps, type, i, best_ce, preferred_ce) < 0)
best_ce = i;
}
UVM_ASSERT(best_ce != UVM_COPY_ENGINE_COUNT_MAX);
if (best_ce == UVM_COPY_ENGINE_COUNT_MAX) {
UVM_ERR_PRINT("Failed to find a suitable CE for channel type %s\n", uvm_channel_type_to_string(type));
return NV_ERR_NOT_SUPPORTED;
}
preferred_ce[type] = best_ce;
preferred_ce[type] = best_ce;
return NV_OK;
// Preferred CEs are always marked as usable.
if (type < UVM_CHANNEL_TYPE_CE_COUNT)
__set_bit(best_ce, manager->ce_mask);
}
}
static NV_STATUS channel_manager_pick_copy_engines(uvm_channel_manager_t *manager, unsigned *preferred_ce)
static void pick_ces(uvm_channel_manager_t *manager, const UvmGpuCopyEngineCaps *ce_caps, unsigned *preferred_ce)
{
NV_STATUS status;
unsigned i;
UvmGpuCopyEnginesCaps *ces_caps;
// The order in which CEs are picked for each type matters: the selection is
// affected by each CE's usage count, which increases every time a CE is
// selected. MEMOPS has the lowest priority as it only cares about low CE
// usage to improve latency.
uvm_channel_type_t types[] = {UVM_CHANNEL_TYPE_CPU_TO_GPU,
UVM_CHANNEL_TYPE_GPU_TO_CPU,
UVM_CHANNEL_TYPE_GPU_INTERNAL,
UVM_CHANNEL_TYPE_GPU_TO_GPU,
UVM_CHANNEL_TYPE_MEMOPS};
UVM_ASSERT(!g_uvm_global.conf_computing_enabled);
pick_ces_for_channel_types(manager, ce_caps, types, ARRAY_SIZE(types), preferred_ce);
}
static void pick_ces_conf_computing(uvm_channel_manager_t *manager,
const UvmGpuCopyEngineCaps *ce_caps,
unsigned *preferred_ce)
{
unsigned best_wlc_ce;
// The WLC type must go last so an unused CE is chosen, if available
uvm_channel_type_t types[] = {UVM_CHANNEL_TYPE_CPU_TO_GPU,
UVM_CHANNEL_TYPE_GPU_TO_CPU,
UVM_CHANNEL_TYPE_GPU_INTERNAL,
UVM_CHANNEL_TYPE_MEMOPS,
UVM_CHANNEL_TYPE_WLC};
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
pick_ces_for_channel_types(manager, ce_caps, types, ARRAY_SIZE(types), preferred_ce);
// Direct transfers between GPUs are disallowed in Confidential Computing,
// but the preferred CE is still set to an arbitrary value for consistency.
preferred_ce[UVM_CHANNEL_TYPE_GPU_TO_GPU] = preferred_ce[UVM_CHANNEL_TYPE_GPU_TO_CPU];
best_wlc_ce = preferred_ce[UVM_CHANNEL_TYPE_WLC];
// TODO: Bug 4576908: in HCC, the WLC type should not share a CE with any
// channel type other than LCIC. The assertion should be a check instead.
UVM_ASSERT(ce_usage_count(best_wlc_ce, preferred_ce) == 0);
}
static NV_STATUS channel_manager_pick_ces(uvm_channel_manager_t *manager, unsigned *preferred_ce)
{
NV_STATUS status;
UvmGpuCopyEnginesCaps *ces_caps;
uvm_channel_type_t type;
for (type = 0; type < UVM_CHANNEL_TYPE_COUNT; type++)
preferred_ce[type] = UVM_COPY_ENGINE_COUNT_MAX;
ces_caps = uvm_kvmalloc_zero(sizeof(*ces_caps));
if (!ces_caps)
return NV_ERR_NO_MEMORY;
@@ -2566,16 +2594,14 @@ static NV_STATUS channel_manager_pick_copy_engines(uvm_channel_manager_t *manage
if (status != NV_OK)
goto out;
// The order of picking CEs for each type matters as it's affected by the
// usage count of each CE and it increases every time a CE is selected.
// MEMOPS has the least priority as it only cares about low usage of the
// CE to improve latency
for (i = 0; i < ARRAY_SIZE(types); ++i) {
status = pick_ce_for_channel_type(manager, ces_caps->copyEngineCaps, types[i], preferred_ce);
if (status != NV_OK)
goto out;
}
status = ces_validate(manager, ces_caps->copyEngineCaps);
if (status != NV_OK)
goto out;
if (g_uvm_global.conf_computing_enabled)
pick_ces_conf_computing(manager, ces_caps->copyEngineCaps, preferred_ce);
else
pick_ces(manager, ces_caps->copyEngineCaps, preferred_ce);
out:
uvm_kvfree(ces_caps);
@@ -2641,7 +2667,7 @@ static const char *buffer_location_to_string(UVM_BUFFER_LOCATION loc)
else if (loc == UVM_BUFFER_LOCATION_DEFAULT)
return "auto";
UVM_ASSERT_MSG(false, "Invalid buffer locationvalue %d\n", loc);
UVM_ASSERT_MSG(false, "Invalid buffer location value %d\n", loc);
return NULL;
}
@@ -2818,7 +2844,9 @@ static NV_STATUS channel_manager_create_ce_pools(uvm_channel_manager_t *manager,
// A pool is created for each usable CE, even if it has not been selected as
// the preferred CE for any type, because as more information is discovered
// (for example, a pair of peer GPUs is added) we may start using the
// previously idle pools.
// previously idle pools. Configurations in which non-preferred CEs are
// guaranteed to remain unused may skip marking those engines as usable.
for_each_set_bit(ce, manager->ce_mask, UVM_COPY_ENGINE_COUNT_MAX) {
NV_STATUS status;
uvm_channel_pool_t *pool = NULL;
@@ -3005,17 +3033,15 @@ static NV_STATUS setup_lcic_schedule(uvm_channel_t *paired_wlc, uvm_channel_t *l
// Reuse WLC sysmem allocation
NvU64 gpu_unprotected = uvm_rm_mem_get_gpu_uvm_va(paired_wlc->conf_computing.static_pb_unprotected_sysmem, gpu);
char *cpu_unprotected = paired_wlc->conf_computing.static_pb_unprotected_sysmem_cpu;
uvm_gpu_semaphore_t *lcic_gpu_semaphore = &lcic->tracking_sem.semaphore;
uvm_gpu_semaphore_t *lcic_semaphore = &lcic->tracking_sem.semaphore;
uvm_gpu_address_t notifier_src_entry_addr = lcic->conf_computing.static_notifier_entry_unprotected_sysmem_gpu_va;
uvm_gpu_address_t notifier_src_exit_addr = lcic->conf_computing.static_notifier_exit_unprotected_sysmem_gpu_va;
uvm_gpu_address_t notifier_dst_addr = uvm_rm_mem_get_gpu_va(lcic_gpu_semaphore->conf_computing.notifier,
gpu,
false);
uvm_gpu_address_t encrypted_payload_gpu_va =
uvm_rm_mem_get_gpu_va(lcic_gpu_semaphore->conf_computing.encrypted_payload, gpu, false);
uvm_gpu_address_t notifier_dst_addr = uvm_gpu_semaphore_get_notifier_gpu_va(lcic_semaphore);
uvm_gpu_address_t encrypted_payload_gpu_va = uvm_gpu_semaphore_get_encrypted_payload_gpu_va(lcic_semaphore);
uvm_gpu_address_t auth_tag_gpu_va = uvm_gpu_semaphore_get_auth_tag_gpu_va(lcic_semaphore);
uvm_gpu_address_t semaphore_gpu_va = uvm_gpu_address_virtual(uvm_channel_tracking_semaphore_get_gpu_va(lcic));
uvm_gpu_address_t auth_tag_gpu_va = uvm_rm_mem_get_gpu_va(lcic_gpu_semaphore->conf_computing.auth_tag, gpu, false);
NvU32 payload_size = sizeof(*lcic->tracking_sem.semaphore.payload);
NvU32 payload_size = sizeof(*uvm_gpu_semaphore_get_encrypted_payload_cpu_va(lcic_semaphore));
NvU32 notifier_size = sizeof(*lcic->conf_computing.static_notifier_entry_unprotected_sysmem_cpu);
NvU64 *lcic_gpfifo_entries;
@@ -3194,12 +3220,8 @@ static NV_STATUS channel_manager_create_conf_computing_pools(uvm_channel_manager
manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_SEC2] = sec2_pool;
// Use the same CE as CPU TO GPU channels for WLC/LCIC
// Both need to use the same engine for the fixed schedule to work.
// TODO: Bug 3981928: [hcc][uvm] Optimize parameters of WLC/LCIC secure
// work launch
// Find a metric to select the best CE to use
wlc_lcic_ce_index = preferred_ce[UVM_CHANNEL_TYPE_CPU_TO_GPU];
// WLC and LCIC must use the same engine for the fixed schedule to work.
wlc_lcic_ce_index = preferred_ce[UVM_CHANNEL_TYPE_WLC];
// Create WLC/LCIC pools. This should be done early, CE channels use
// them for secure launch. The WLC pool must be created before the LCIC.
@@ -3228,14 +3250,10 @@ static NV_STATUS channel_manager_create_conf_computing_pools(uvm_channel_manager
static NV_STATUS channel_manager_create_pools(uvm_channel_manager_t *manager)
{
NV_STATUS status;
uvm_channel_type_t type;
unsigned max_channel_pools;
unsigned preferred_ce[UVM_CHANNEL_TYPE_CE_COUNT];
unsigned preferred_ce[UVM_CHANNEL_TYPE_COUNT];
for (type = 0; type < ARRAY_SIZE(preferred_ce); type++)
preferred_ce[type] = UVM_COPY_ENGINE_COUNT_MAX;
status = channel_manager_pick_copy_engines(manager, preferred_ce);
status = channel_manager_pick_ces(manager, preferred_ce);
if (status != NV_OK)
return status;
@@ -3496,7 +3514,7 @@ static void uvm_channel_print_info(uvm_channel_t *channel, struct seq_file *s)
UVM_SEQ_OR_DBG_PRINT(s, "get %u\n", channel->gpu_get);
UVM_SEQ_OR_DBG_PRINT(s, "put %u\n", channel->cpu_put);
UVM_SEQ_OR_DBG_PRINT(s, "Semaphore GPU VA 0x%llx\n", uvm_channel_tracking_semaphore_get_gpu_va(channel));
UVM_SEQ_OR_DBG_PRINT(s, "Semaphore CPU VA 0x%llx\n", (NvU64)(uintptr_t)channel->tracking_sem.semaphore.payload);
UVM_SEQ_OR_DBG_PRINT(s, "Semaphore CPU VA 0x%llx\n", (NvU64)uvm_gpu_semaphore_get_cpu_va(&channel->tracking_sem.semaphore));
channel_pool_unlock(channel->pool);
}

View File

@@ -418,7 +418,7 @@ struct uvm_channel_manager_struct
unsigned num_channel_pools;
// Mask containing the indexes of the usable Copy Engines. Each usable CE
// has at least one pool associated with it.
// has at least one pool of type UVM_CHANNEL_POOL_TYPE_CE associated with it
DECLARE_BITMAP(ce_mask, UVM_COPY_ENGINE_COUNT_MAX);
struct

View File

@@ -793,7 +793,7 @@ done:
// This test verifies that concurrent pushes using the same channel pool
// select different channels, when the Confidential Computing feature is
// enabled.
NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
static NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
{
NV_STATUS status = NV_OK;
uvm_channel_pool_t *pool;
@@ -853,7 +853,7 @@ error:
return status;
}
NV_STATUS test_channel_iv_rotation(uvm_va_space_t *va_space)
static NV_STATUS test_channel_iv_rotation(uvm_va_space_t *va_space)
{
uvm_gpu_t *gpu;
@@ -948,7 +948,7 @@ release:
return NV_OK;
}
NV_STATUS test_write_ctrl_gpfifo_noop(uvm_va_space_t *va_space)
static NV_STATUS test_write_ctrl_gpfifo_noop(uvm_va_space_t *va_space)
{
uvm_gpu_t *gpu;
@@ -987,7 +987,7 @@ NV_STATUS test_write_ctrl_gpfifo_noop(uvm_va_space_t *va_space)
return NV_OK;
}
NV_STATUS test_write_ctrl_gpfifo_and_pushes(uvm_va_space_t *va_space)
static NV_STATUS test_write_ctrl_gpfifo_and_pushes(uvm_va_space_t *va_space)
{
uvm_gpu_t *gpu;
@@ -1035,7 +1035,7 @@ NV_STATUS test_write_ctrl_gpfifo_and_pushes(uvm_va_space_t *va_space)
return NV_OK;
}
NV_STATUS test_write_ctrl_gpfifo_tight(uvm_va_space_t *va_space)
static NV_STATUS test_write_ctrl_gpfifo_tight(uvm_va_space_t *va_space)
{
NV_STATUS status = NV_OK;
uvm_gpu_t *gpu;

View File

@@ -469,6 +469,7 @@ NV_STATUS uvm_conf_computing_cpu_decrypt(uvm_channel_t *channel,
size,
(const NvU8 *) src_cipher,
src_iv,
NV_U32_MAX,
(NvU8 *) dst_plain,
NULL,
0,
@@ -485,6 +486,8 @@ NV_STATUS uvm_conf_computing_fault_decrypt(uvm_parent_gpu_t *parent_gpu,
NvU8 valid)
{
NV_STATUS status;
NvU32 fault_entry_size = parent_gpu->fault_buffer_hal->entry_size(parent_gpu);
UvmCslContext *csl_context = &parent_gpu->fault_buffer_info.rm_info.replayable.cslCtx;
// There is no dedicated lock for the CSL context associated with replayable
// faults. The mutual exclusion required by the RM CSL API is enforced by
@@ -494,36 +497,48 @@ NV_STATUS uvm_conf_computing_fault_decrypt(uvm_parent_gpu_t *parent_gpu,
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
status = nvUvmInterfaceCslDecrypt(&parent_gpu->fault_buffer_info.rm_info.replayable.cslCtx,
parent_gpu->fault_buffer_hal->entry_size(parent_gpu),
status = nvUvmInterfaceCslLogEncryption(csl_context, UVM_CSL_OPERATION_DECRYPT, fault_entry_size);
// Informing RM of an encryption/decryption should not fail
UVM_ASSERT(status == NV_OK);
status = nvUvmInterfaceCslDecrypt(csl_context,
fault_entry_size,
(const NvU8 *) src_cipher,
NULL,
NV_U32_MAX,
(NvU8 *) dst_plain,
&valid,
sizeof(valid),
(const NvU8 *) auth_tag_buffer);
if (status != NV_OK)
if (status != NV_OK) {
UVM_ERR_PRINT("nvUvmInterfaceCslDecrypt() failed: %s, GPU %s\n",
nvstatusToString(status),
uvm_parent_gpu_name(parent_gpu));
}
return status;
}
void uvm_conf_computing_fault_increment_decrypt_iv(uvm_parent_gpu_t *parent_gpu, NvU64 increment)
void uvm_conf_computing_fault_increment_decrypt_iv(uvm_parent_gpu_t *parent_gpu)
{
NV_STATUS status;
NvU32 fault_entry_size = parent_gpu->fault_buffer_hal->entry_size(parent_gpu);
UvmCslContext *csl_context = &parent_gpu->fault_buffer_info.rm_info.replayable.cslCtx;
// See comment in uvm_conf_computing_fault_decrypt
UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.replayable_faults.service_lock));
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
status = nvUvmInterfaceCslIncrementIv(&parent_gpu->fault_buffer_info.rm_info.replayable.cslCtx,
UVM_CSL_OPERATION_DECRYPT,
increment,
NULL);
status = nvUvmInterfaceCslLogEncryption(csl_context, UVM_CSL_OPERATION_DECRYPT, fault_entry_size);
// Informing RM of an encryption/decryption should not fail
UVM_ASSERT(status == NV_OK);
status = nvUvmInterfaceCslIncrementIv(csl_context, UVM_CSL_OPERATION_DECRYPT, 1, NULL);
UVM_ASSERT(status == NV_OK);
}

View File

@@ -191,12 +191,12 @@ NV_STATUS uvm_conf_computing_fault_decrypt(uvm_parent_gpu_t *parent_gpu,
NvU8 valid);
// Increment the CPU-side decrypt IV of the CSL context associated with
// replayable faults. The function is a no-op if the given increment is zero.
// replayable faults.
//
// The IV associated with a fault CSL context is a 64-bit counter.
//
// Locking: this function must be invoked while holding the replayable ISR lock.
void uvm_conf_computing_fault_increment_decrypt_iv(uvm_parent_gpu_t *parent_gpu, NvU64 increment);
void uvm_conf_computing_fault_increment_decrypt_iv(uvm_parent_gpu_t *parent_gpu);
// Query the number of remaining messages before IV needs to be rotated.
void uvm_conf_computing_query_message_pools(uvm_channel_t *channel,

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -119,10 +119,6 @@ static NV_STATUS verify_mapping_info(uvm_va_space_t *va_space,
if (memory_owning_gpu == NULL)
return NV_ERR_INVALID_DEVICE;
// TODO: Bug 1903234: Once RM supports indirect peer mappings, we'll need to
// update this test since the aperture will be SYS. Depending on how
// RM implements things, we might not be able to compare the physical
// addresses either.
aperture = get_aperture(va_space, memory_owning_gpu, memory_mapping_gpu, memory_info, sli_supported);
if (is_cacheable(ext_mapping_info, aperture))

View File

@@ -81,6 +81,8 @@ static uvm_gpu_link_type_t get_gpu_link_type(UVM_LINK_TYPE link_type)
return UVM_GPU_LINK_NVLINK_3;
case UVM_LINK_TYPE_NVLINK_4:
return UVM_GPU_LINK_NVLINK_4;
case UVM_LINK_TYPE_NVLINK_5:
return UVM_GPU_LINK_NVLINK_5;
case UVM_LINK_TYPE_C2C:
return UVM_GPU_LINK_C2C;
default:
@@ -460,7 +462,7 @@ static const char *uvm_gpu_virt_type_string(UVM_VIRT_MODE virtMode)
static const char *uvm_gpu_link_type_string(uvm_gpu_link_type_t link_type)
{
BUILD_BUG_ON(UVM_GPU_LINK_MAX != 7);
BUILD_BUG_ON(UVM_GPU_LINK_MAX != 8);
switch (link_type) {
UVM_ENUM_STRING_CASE(UVM_GPU_LINK_INVALID);
@@ -469,6 +471,7 @@ static const char *uvm_gpu_link_type_string(uvm_gpu_link_type_t link_type)
UVM_ENUM_STRING_CASE(UVM_GPU_LINK_NVLINK_2);
UVM_ENUM_STRING_CASE(UVM_GPU_LINK_NVLINK_3);
UVM_ENUM_STRING_CASE(UVM_GPU_LINK_NVLINK_4);
UVM_ENUM_STRING_CASE(UVM_GPU_LINK_NVLINK_5);
UVM_ENUM_STRING_CASE(UVM_GPU_LINK_C2C);
UVM_ENUM_STRING_DEFAULT();
}
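// A small self-contained sketch of the pattern used above: tie a compile-time
// check to the enum size so that adding a value (such as NVLINK_5) without
// updating the string switch fails to build. The enum, the names, and the use
// of _Static_assert instead of BUILD_BUG_ON are illustrative assumptions.
typedef enum {
    EXAMPLE_LINK_INVALID = 0,
    EXAMPLE_LINK_PCIE,
    EXAMPLE_LINK_NVLINK,
    EXAMPLE_LINK_MAX
} example_link_t;

static const char *example_link_string(example_link_t link)
{
    // Bump this check (and the switch) whenever a new enum value is added.
    _Static_assert(EXAMPLE_LINK_MAX == 3, "example_link_string is out of date");

    switch (link) {
        case EXAMPLE_LINK_INVALID: return "INVALID";
        case EXAMPLE_LINK_PCIE:    return "PCIE";
        case EXAMPLE_LINK_NVLINK:  return "NVLINK";
        default:                   return "UNKNOWN";
    }
}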
@@ -1679,12 +1682,9 @@ static void remove_gpu(uvm_gpu_t *gpu)
// TODO: Bug 2008200: Add and remove the GPU in a more reasonable spot.
uvm_conf_computing_gpu_deinit(gpu);
// TODO: Bug 2844714: If the parent is not being freed, the following
// gpu_table_lock is only needed to protect concurrent
// find_first_valid_gpu() in BH from the __clear_bit here. After
// find_first_valid_gpu() is removed, gpu_table_lock should only be acquired
// and released in the free_parent case.
//
// If the parent is not being freed, the following gpu_table_lock is only
// needed to protect concurrent uvm_parent_gpu_find_first_valid_gpu() in BH
// from the __clear_bit here.
// In the free_parent case, gpu_table_lock protects the top half from the
// uvm_global_remove_parent_gpu() call.
uvm_spin_lock_irqsave(&g_uvm_global.gpu_table_lock);
@@ -2262,18 +2262,6 @@ static void set_optimal_p2p_write_ces(const UvmGpuP2PCapsParams *p2p_caps_params
ce0 = p2p_caps_params->optimalNvlinkWriteCEs[sorted ? 0 : 1];
ce1 = p2p_caps_params->optimalNvlinkWriteCEs[sorted ? 1 : 0];
// Indirect peers communicate through the CPU, so the optimal CE
// should match the one selected for writing to system memory
if (peer_caps->is_indirect_peer) {
uvm_channel_pool_t *pool;
pool = gpu0->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_GPU_TO_CPU];
UVM_ASSERT(ce0 == pool->engine_index);
pool = gpu1->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_GPU_TO_CPU];
UVM_ASSERT(ce1 == pool->engine_index);
}
uvm_channel_manager_set_p2p_ce(gpu0->channel_manager, gpu1, ce0);
uvm_channel_manager_set_p2p_ce(gpu1->channel_manager, gpu0, ce1);
}
@@ -2369,66 +2357,45 @@ static NV_STATUS init_peer_access(uvm_gpu_t *gpu0,
peer_caps->total_link_line_rate_mbyte_per_s = p2p_caps_params->totalLinkLineRateMBps;
// Initialize peer ids and establish peer mappings
peer_caps->is_indirect_peer = (p2p_caps_params->indirectAccess == NV_TRUE);
// Peer id from min(gpu_id0, gpu_id1) -> max(gpu_id0, gpu_id1)
peer_caps->peer_ids[0] = p2p_caps_params->peerIds[0];
if (peer_caps->is_indirect_peer) {
UVM_ASSERT(gpu0->mem_info.numa.enabled);
UVM_ASSERT(gpu1->mem_info.numa.enabled);
// Peer id from max(gpu_id0, gpu_id1) -> min(gpu_id0, gpu_id1)
peer_caps->peer_ids[1] = p2p_caps_params->peerIds[1];
status = uvm_pmm_gpu_indirect_peer_init(&gpu0->pmm, gpu1);
if (status != NV_OK)
return status;
// Establish peer mappings from each GPU to the other.
status = uvm_mmu_create_peer_identity_mappings(gpu0, gpu1);
if (status != NV_OK)
return status;
status = uvm_pmm_gpu_indirect_peer_init(&gpu1->pmm, gpu0);
if (status != NV_OK)
return status;
status = uvm_mmu_create_peer_identity_mappings(gpu1, gpu0);
if (status != NV_OK)
return status;
set_optimal_p2p_write_ces(p2p_caps_params, peer_caps, gpu0, gpu1);
UVM_ASSERT(peer_caps->total_link_line_rate_mbyte_per_s == 0);
}
else {
// Peer id from min(gpu_id0, gpu_id1) -> max(gpu_id0, gpu_id1)
peer_caps->peer_ids[0] = p2p_caps_params->peerIds[0];
set_optimal_p2p_write_ces(p2p_caps_params, peer_caps, gpu0, gpu1);
// Peer id from max(gpu_id0, gpu_id1) -> min(gpu_id0, gpu_id1)
peer_caps->peer_ids[1] = p2p_caps_params->peerIds[1];
UVM_ASSERT(uvm_gpu_get(gpu0->id) == gpu0);
UVM_ASSERT(uvm_gpu_get(gpu1->id) == gpu1);
// Establish peer mappings from each GPU to the other. Indirect peers
// do not require identity mappings since they use sysmem aperture to
// communicate.
status = uvm_mmu_create_peer_identity_mappings(gpu0, gpu1);
if (status != NV_OK)
return status;
// In the case of NVLINK peers, this initialization will happen during
// add_gpu. As soon as the peer info table is assigned below, the access
// counter bottom half could start operating on the GPU being newly
// added and inspecting the peer caps, so all of the appropriate
// initialization must happen before this point.
uvm_spin_lock(&gpu0->peer_info.peer_gpus_lock);
status = uvm_mmu_create_peer_identity_mappings(gpu1, gpu0);
if (status != NV_OK)
return status;
uvm_processor_mask_set(&gpu0->peer_info.peer_gpu_mask, gpu1->id);
UVM_ASSERT(gpu0->peer_info.peer_gpus[uvm_id_gpu_index(gpu1->id)] == NULL);
gpu0->peer_info.peer_gpus[uvm_id_gpu_index(gpu1->id)] = gpu1;
set_optimal_p2p_write_ces(p2p_caps_params, peer_caps, gpu0, gpu1);
uvm_spin_unlock(&gpu0->peer_info.peer_gpus_lock);
uvm_spin_lock(&gpu1->peer_info.peer_gpus_lock);
UVM_ASSERT(uvm_gpu_get(gpu0->id) == gpu0);
UVM_ASSERT(uvm_gpu_get(gpu1->id) == gpu1);
uvm_processor_mask_set(&gpu1->peer_info.peer_gpu_mask, gpu0->id);
UVM_ASSERT(gpu1->peer_info.peer_gpus[uvm_id_gpu_index(gpu0->id)] == NULL);
gpu1->peer_info.peer_gpus[uvm_id_gpu_index(gpu0->id)] = gpu0;
// In the case of NVLINK peers, this initialization will happen during
// add_gpu. As soon as the peer info table is assigned below, the access
// counter bottom half could start operating on the GPU being newly
// added and inspecting the peer caps, so all of the appropriate
// initialization must happen before this point.
uvm_spin_lock(&gpu0->peer_info.peer_gpus_lock);
uvm_processor_mask_set(&gpu0->peer_info.peer_gpu_mask, gpu1->id);
UVM_ASSERT(gpu0->peer_info.peer_gpus[uvm_id_gpu_index(gpu1->id)] == NULL);
gpu0->peer_info.peer_gpus[uvm_id_gpu_index(gpu1->id)] = gpu1;
uvm_spin_unlock(&gpu0->peer_info.peer_gpus_lock);
uvm_spin_lock(&gpu1->peer_info.peer_gpus_lock);
uvm_processor_mask_set(&gpu1->peer_info.peer_gpu_mask, gpu0->id);
UVM_ASSERT(gpu1->peer_info.peer_gpus[uvm_id_gpu_index(gpu0->id)] == NULL);
gpu1->peer_info.peer_gpus[uvm_id_gpu_index(gpu0->id)] = gpu0;
uvm_spin_unlock(&gpu1->peer_info.peer_gpus_lock);
}
uvm_spin_unlock(&gpu1->peer_info.peer_gpus_lock);
return init_procfs_peer_files(gpu0, gpu1);
}
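// A hedged sketch (hypothetical helper, not the driver's code) of how the two
// peer ids recorded above could be consumed: slot 0 maps the lower-numbered
// GPU to the higher-numbered one and slot 1 the reverse, so a local GPU picks
// its slot by comparing id values. Assumes the two ids differ.
static NvU8 example_peer_id_for_direction(const uvm_gpu_peer_t *peer_caps,
                                          NvU32 local_id_value,
                                          NvU32 remote_id_value)
{
    return peer_caps->peer_ids[local_id_value < remote_id_value ? 0 : 1];
}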
@@ -2496,7 +2463,6 @@ static NV_STATUS enable_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
goto cleanup;
// Sanity checks
UVM_ASSERT(p2p_caps_params.indirectAccess == NV_FALSE);
UVM_ASSERT(p2p_caps_params.p2pLink == UVM_LINK_TYPE_PCIE);
status = init_peer_access(gpu0, gpu1, &p2p_caps_params, peer_caps);
@@ -2526,29 +2492,26 @@ static NV_STATUS enable_nvlink_peer_access(uvm_gpu_t *gpu0,
UVM_ASSERT(peer_caps->ref_count == 0);
peer_caps->ref_count = 1;
if (!p2p_caps_params->indirectAccess) {
// Create P2P object for direct NVLink peers
status = create_p2p_object(gpu0, gpu1, &p2p_handle);
if (status != NV_OK) {
UVM_ERR_PRINT("failed to create a P2P object with error: %s, for GPU1:%s and GPU2:%s \n",
nvstatusToString(status),
uvm_gpu_name(gpu0),
uvm_gpu_name(gpu1));
return status;
}
UVM_ASSERT(p2p_handle != 0);
// Store the handle in the global table.
peer_caps->p2p_handle = p2p_handle;
// Update p2p caps after p2p object creation as it generates the peer
// ids
status = get_p2p_caps(gpu0, gpu1, p2p_caps_params);
if (status != NV_OK)
goto cleanup;
// Create P2P object for direct NVLink peers
status = create_p2p_object(gpu0, gpu1, &p2p_handle);
if (status != NV_OK) {
UVM_ERR_PRINT("failed to create a P2P object with error: %s, for GPU1:%s and GPU2:%s \n",
nvstatusToString(status),
uvm_gpu_name(gpu0),
uvm_gpu_name(gpu1));
return status;
}
UVM_ASSERT(p2p_handle != 0);
// Store the handle in the global table.
peer_caps->p2p_handle = p2p_handle;
// Update p2p caps after p2p object creation as it generates the peer ids.
status = get_p2p_caps(gpu0, gpu1, p2p_caps_params);
if (status != NV_OK)
goto cleanup;
status = init_peer_access(gpu0, gpu1, p2p_caps_params, peer_caps);
if (status != NV_OK)
goto cleanup;
@@ -2583,11 +2546,6 @@ static NV_STATUS discover_nvlink_peers(uvm_gpu_t *gpu)
if (p2p_caps_params.p2pLink == UVM_LINK_TYPE_NONE || p2p_caps_params.p2pLink == UVM_LINK_TYPE_PCIE)
continue;
// Indirect peers are only supported when onlined as NUMA nodes, because
// we want to use vm_insert_page and dma_map_page.
if (p2p_caps_params.indirectAccess && (!gpu->mem_info.numa.enabled || !other_gpu->mem_info.numa.enabled))
continue;
status = enable_nvlink_peer_access(gpu, other_gpu, &p2p_caps_params);
if (status != NV_OK)
goto cleanup;
@@ -2676,32 +2634,25 @@ static void disable_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
deinit_procfs_peer_cap_files(peer_caps);
p2p_handle = peer_caps->p2p_handle;
UVM_ASSERT(p2p_handle);
if (peer_caps->is_indirect_peer) {
uvm_pmm_gpu_indirect_peer_destroy(&gpu0->pmm, gpu1);
uvm_pmm_gpu_indirect_peer_destroy(&gpu1->pmm, gpu0);
}
else {
UVM_ASSERT(p2p_handle);
uvm_mmu_destroy_peer_identity_mappings(gpu0, gpu1);
uvm_mmu_destroy_peer_identity_mappings(gpu1, gpu0);
uvm_mmu_destroy_peer_identity_mappings(gpu0, gpu1);
uvm_mmu_destroy_peer_identity_mappings(gpu1, gpu0);
uvm_rm_locked_call_void(nvUvmInterfaceP2pObjectDestroy(uvm_global_session_handle(), p2p_handle));
uvm_rm_locked_call_void(nvUvmInterfaceP2pObjectDestroy(uvm_global_session_handle(), p2p_handle));
UVM_ASSERT(uvm_gpu_get(gpu0->id) == gpu0);
UVM_ASSERT(uvm_gpu_get(gpu1->id) == gpu1);
UVM_ASSERT(uvm_gpu_get(gpu0->id) == gpu0);
UVM_ASSERT(uvm_gpu_get(gpu1->id) == gpu1);
uvm_spin_lock(&gpu0->peer_info.peer_gpus_lock);
uvm_processor_mask_clear(&gpu0->peer_info.peer_gpu_mask, gpu1->id);
gpu0->peer_info.peer_gpus[uvm_id_gpu_index(gpu1->id)] = NULL;
uvm_spin_unlock(&gpu0->peer_info.peer_gpus_lock);
uvm_spin_lock(&gpu0->peer_info.peer_gpus_lock);
uvm_processor_mask_clear(&gpu0->peer_info.peer_gpu_mask, gpu1->id);
gpu0->peer_info.peer_gpus[uvm_id_gpu_index(gpu1->id)] = NULL;
uvm_spin_unlock(&gpu0->peer_info.peer_gpus_lock);
uvm_spin_lock(&gpu1->peer_info.peer_gpus_lock);
uvm_processor_mask_clear(&gpu1->peer_info.peer_gpu_mask, gpu0->id);
gpu1->peer_info.peer_gpus[uvm_id_gpu_index(gpu0->id)] = NULL;
uvm_spin_unlock(&gpu1->peer_info.peer_gpus_lock);
}
uvm_spin_lock(&gpu1->peer_info.peer_gpus_lock);
uvm_processor_mask_clear(&gpu1->peer_info.peer_gpu_mask, gpu0->id);
gpu1->peer_info.peer_gpus[uvm_id_gpu_index(gpu0->id)] = NULL;
uvm_spin_unlock(&gpu1->peer_info.peer_gpus_lock);
// Flush the access counter buffer to avoid getting stale notifications for
// accesses to GPUs to which peer access is being disabled. This is also
@@ -2741,10 +2692,6 @@ static uvm_aperture_t uvm_gpu_peer_caps_aperture(uvm_gpu_peer_t *peer_caps, uvm_
{
size_t peer_index;
// Indirect peers are accessed as sysmem addresses
if (peer_caps->is_indirect_peer)
return UVM_APERTURE_SYS;
// MIG instances in the same physical GPU have vidmem addresses
if (local_gpu->parent == remote_gpu->parent)
return UVM_APERTURE_VID;
@@ -2795,6 +2742,7 @@ uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_p
for_each_gpu_id_in_mask(id, &gpu->peer_info.peer_gpu_mask) {
uvm_gpu_t *other_gpu = gpu->peer_info.peer_gpus[uvm_id_gpu_index(id)];
UVM_ASSERT(other_gpu);
UVM_ASSERT(!uvm_gpus_are_smc_peers(gpu, other_gpu));
if (uvm_gpus_are_nvswitch_connected(gpu, other_gpu)) {
// NVSWITCH connected systems use an extended physical address to
@@ -2831,7 +2779,7 @@ static NvU64 instance_ptr_to_key(uvm_gpu_phys_address_t instance_ptr)
// Instance pointers must be 4k aligned and they must have either VID or SYS
// apertures. Compress them as much as we can both to guarantee that the key
// fits within 64 bits, and to make the table as shallow as possible.
// fits within 64 bits, and to make the key space as small as possible.
UVM_ASSERT(IS_ALIGNED(instance_ptr.address, UVM_PAGE_SIZE_4K));
UVM_ASSERT(instance_ptr.aperture == UVM_APERTURE_VID || instance_ptr.aperture == UVM_APERTURE_SYS);
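// A minimal sketch of one possible compression, assuming the 4K alignment lets
// the low 12 bits be dropped and a single extra bit is enough to tell the VID
// and SYS apertures apart. The helper name and exact bit layout are
// illustrative assumptions, not the driver's actual encoding.
static NvU64 example_instance_ptr_key(NvU64 address_4k_aligned, bool is_sysmem)
{
    NvU64 key = address_4k_aligned >> 12;          // alignment bits carry no information

    return (key << 1) | (is_sysmem ? 1ULL : 0ULL); // one bit encodes the aperture
}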
@@ -2848,7 +2796,7 @@ static NV_STATUS parent_gpu_add_user_channel_subctx_info(uvm_parent_gpu_t *paren
uvm_rb_tree_node_t *channel_tree_node;
uvm_user_channel_subctx_info_t *channel_subctx_info;
uvm_user_channel_subctx_info_t *new_channel_subctx_info = NULL;
uvm_va_space_t *va_space = user_channel->gpu_va_space->va_space;
uvm_gpu_va_space_t *gpu_va_space = user_channel->gpu_va_space;
if (!user_channel->in_subctx)
return NV_OK;
@@ -2892,21 +2840,21 @@ static NV_STATUS parent_gpu_add_user_channel_subctx_info(uvm_parent_gpu_t *paren
user_channel->subctx_info = channel_subctx_info;
// Register the VA space of the channel subcontext info descriptor, or
// Register the GPU VA space of the channel subcontext info descriptor, or
// check that the existing one matches the channel's
if (channel_subctx_info->subctxs[user_channel->subctx_id].refcount++ > 0) {
UVM_ASSERT_MSG(channel_subctx_info->subctxs[user_channel->subctx_id].va_space == va_space,
"CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: expected VA space 0x%llx but got 0x%llx instead\n",
UVM_ASSERT_MSG(channel_subctx_info->subctxs[user_channel->subctx_id].gpu_va_space == gpu_va_space,
"CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: expected GPU VA space 0x%llx but got 0x%llx instead\n",
user_channel->hw_runlist_id,
user_channel->hw_channel_id,
instance_ptr.address,
uvm_aperture_string(instance_ptr.aperture),
user_channel->subctx_id,
user_channel->tsg.id,
(NvU64)va_space,
(NvU64)channel_subctx_info->subctxs[user_channel->subctx_id].va_space);
UVM_ASSERT_MSG(channel_subctx_info->subctxs[user_channel->subctx_id].va_space != NULL,
"CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: VA space is NULL\n",
(NvU64)gpu_va_space,
(NvU64)channel_subctx_info->subctxs[user_channel->subctx_id].gpu_va_space);
UVM_ASSERT_MSG(channel_subctx_info->subctxs[user_channel->subctx_id].gpu_va_space != NULL,
"CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: GPU VA space is NULL\n",
user_channel->hw_runlist_id,
user_channel->hw_channel_id,
instance_ptr.address,
@@ -2923,17 +2871,17 @@ static NV_STATUS parent_gpu_add_user_channel_subctx_info(uvm_parent_gpu_t *paren
user_channel->tsg.id);
}
else {
UVM_ASSERT_MSG(channel_subctx_info->subctxs[user_channel->subctx_id].va_space == NULL,
"CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: expected VA space NULL but got 0x%llx instead\n",
UVM_ASSERT_MSG(channel_subctx_info->subctxs[user_channel->subctx_id].gpu_va_space == NULL,
"CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: expected GPU VA space NULL but got 0x%llx instead\n",
user_channel->hw_runlist_id,
user_channel->hw_channel_id,
instance_ptr.address,
uvm_aperture_string(instance_ptr.aperture),
user_channel->subctx_id,
user_channel->tsg.id,
(NvU64)channel_subctx_info->subctxs[user_channel->subctx_id].va_space);
(NvU64)channel_subctx_info->subctxs[user_channel->subctx_id].gpu_va_space);
channel_subctx_info->subctxs[user_channel->subctx_id].va_space = va_space;
channel_subctx_info->subctxs[user_channel->subctx_id].gpu_va_space = gpu_va_space;
}
++channel_subctx_info->total_refcount;
@@ -2957,7 +2905,7 @@ static void parent_gpu_remove_user_channel_subctx_info_locked(uvm_parent_gpu_t *
uvm_user_channel_t *user_channel)
{
uvm_gpu_phys_address_t instance_ptr = user_channel->instance_ptr.addr;
uvm_va_space_t *va_space = user_channel->gpu_va_space->va_space;
uvm_gpu_va_space_t *gpu_va_space = user_channel->gpu_va_space;
uvm_assert_spinlock_locked(&parent_gpu->instance_ptr_table_lock);
@@ -2986,16 +2934,17 @@ static void parent_gpu_remove_user_channel_subctx_info_locked(uvm_parent_gpu_t *
user_channel->subctx_id,
user_channel->tsg.id);
UVM_ASSERT_MSG(user_channel->subctx_info->subctxs[user_channel->subctx_id].va_space == va_space,
"CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: expected VA space 0x%llx but got 0x%llx instead\n",
UVM_ASSERT_MSG(user_channel->subctx_info->subctxs[user_channel->subctx_id].gpu_va_space == gpu_va_space,
"CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: "
"expected GPU VA space 0x%llx but got 0x%llx instead\n",
user_channel->hw_runlist_id,
user_channel->hw_channel_id,
instance_ptr.address,
uvm_aperture_string(instance_ptr.aperture),
user_channel->subctx_id,
user_channel->tsg.id,
(NvU64)va_space,
(NvU64)user_channel->subctx_info->subctxs[user_channel->subctx_id].va_space);
(NvU64)gpu_va_space,
(NvU64)user_channel->subctx_info->subctxs[user_channel->subctx_id].gpu_va_space);
UVM_ASSERT_MSG(user_channel->subctx_info->total_refcount > 0,
"CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: TSG refcount is 0\n",
@@ -3008,7 +2957,7 @@ static void parent_gpu_remove_user_channel_subctx_info_locked(uvm_parent_gpu_t *
// Decrement VA space refcount. If it gets to zero, unregister the pointer
if (--user_channel->subctx_info->subctxs[user_channel->subctx_id].refcount == 0)
user_channel->subctx_info->subctxs[user_channel->subctx_id].va_space = NULL;
user_channel->subctx_info->subctxs[user_channel->subctx_id].gpu_va_space = NULL;
if (--user_channel->subctx_info->total_refcount == 0) {
uvm_rb_tree_remove(&parent_gpu->tsg_table, &user_channel->subctx_info->node);
@@ -3091,7 +3040,7 @@ static uvm_user_channel_t *instance_ptr_to_user_channel(uvm_parent_gpu_t *parent
return get_user_channel(instance_node);
}
static uvm_va_space_t *user_channel_and_subctx_to_va_space(uvm_user_channel_t *user_channel, NvU32 subctx_id)
static uvm_gpu_va_space_t *user_channel_and_subctx_to_gpu_va_space(uvm_user_channel_t *user_channel, NvU32 subctx_id)
{
uvm_user_channel_subctx_info_t *channel_subctx_info;
@@ -3119,28 +3068,31 @@ static uvm_va_space_t *user_channel_and_subctx_to_va_space(uvm_user_channel_t *u
// uncleanly and work from that subcontext continues running with work from
// other subcontexts.
if (channel_subctx_info->subctxs[subctx_id].refcount == 0) {
UVM_ASSERT(channel_subctx_info->subctxs[subctx_id].va_space == NULL);
UVM_ASSERT(channel_subctx_info->subctxs[subctx_id].gpu_va_space == NULL);
}
else {
UVM_ASSERT_MSG(channel_subctx_info->subctxs[subctx_id].va_space,
"instance_ptr {0x%llx:%s} in TSG %u: no VA space for SubCTX %u\n",
UVM_ASSERT_MSG(channel_subctx_info->subctxs[subctx_id].gpu_va_space,
"instance_ptr {0x%llx:%s} in TSG %u: no GPU VA space for SubCTX %u\n",
user_channel->instance_ptr.addr.address,
uvm_aperture_string(user_channel->instance_ptr.addr.aperture),
user_channel->tsg.id,
subctx_id);
}
return channel_subctx_info->subctxs[subctx_id].va_space;
return channel_subctx_info->subctxs[subctx_id].gpu_va_space;
}
NV_STATUS uvm_parent_gpu_fault_entry_to_va_space(uvm_parent_gpu_t *parent_gpu,
uvm_fault_buffer_entry_t *fault,
uvm_va_space_t **out_va_space)
const uvm_fault_buffer_entry_t *fault,
uvm_va_space_t **out_va_space,
uvm_gpu_t **out_gpu)
{
uvm_user_channel_t *user_channel;
uvm_gpu_va_space_t *gpu_va_space;
NV_STATUS status = NV_OK;
*out_va_space = NULL;
*out_gpu = NULL;
uvm_spin_lock(&parent_gpu->instance_ptr_table_lock);
@@ -3161,8 +3113,10 @@ NV_STATUS uvm_parent_gpu_fault_entry_to_va_space(uvm_parent_gpu_t *parent_gpu,
// We can safely access user_channel->gpu_va_space under the
// instance_ptr_table_lock since gpu_va_space is set to NULL after this
// function is called in uvm_user_channel_detach
UVM_ASSERT(uvm_gpu_va_space_state(user_channel->gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
*out_va_space = user_channel->gpu_va_space->va_space;
gpu_va_space = user_channel->gpu_va_space;
UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
*out_va_space = gpu_va_space->va_space;
*out_gpu = gpu_va_space->gpu;
}
else {
NvU32 ve_id = fault->fault_source.ve_id;
@@ -3172,12 +3126,17 @@ NV_STATUS uvm_parent_gpu_fault_entry_to_va_space(uvm_parent_gpu_t *parent_gpu,
ve_id -= user_channel->smc_engine_ve_id_offset;
*out_va_space = user_channel_and_subctx_to_va_space(user_channel, ve_id);
gpu_va_space = user_channel_and_subctx_to_gpu_va_space(user_channel, ve_id);
// Instance pointer is valid but the fault targets a non-existent
// subcontext.
if (!*out_va_space)
if (gpu_va_space) {
*out_va_space = gpu_va_space->va_space;
*out_gpu = gpu_va_space->gpu;
}
else {
status = NV_ERR_PAGE_TABLE_NOT_AVAIL;
}
}
exit_unlock:
@@ -3187,13 +3146,16 @@ exit_unlock:
}
NV_STATUS uvm_parent_gpu_access_counter_entry_to_va_space(uvm_parent_gpu_t *parent_gpu,
uvm_access_counter_buffer_entry_t *entry,
uvm_va_space_t **out_va_space)
const uvm_access_counter_buffer_entry_t *entry,
uvm_va_space_t **out_va_space,
uvm_gpu_t **out_gpu)
{
uvm_user_channel_t *user_channel;
uvm_gpu_va_space_t *gpu_va_space;
NV_STATUS status = NV_OK;
*out_va_space = NULL;
*out_gpu = NULL;
UVM_ASSERT(entry->address.is_virtual);
uvm_spin_lock(&parent_gpu->instance_ptr_table_lock);
@@ -3209,13 +3171,20 @@ NV_STATUS uvm_parent_gpu_access_counter_entry_to_va_space(uvm_parent_gpu_t *pare
"Access counter packet contains SubCTX %u for channel not in subctx\n",
entry->virtual_info.ve_id);
UVM_ASSERT(uvm_gpu_va_space_state(user_channel->gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
*out_va_space = user_channel->gpu_va_space->va_space;
gpu_va_space = user_channel->gpu_va_space;
UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
*out_va_space = gpu_va_space->va_space;
*out_gpu = gpu_va_space->gpu;
}
else {
*out_va_space = user_channel_and_subctx_to_va_space(user_channel, entry->virtual_info.ve_id);
if (!*out_va_space)
gpu_va_space = user_channel_and_subctx_to_gpu_va_space(user_channel, entry->virtual_info.ve_id);
if (gpu_va_space) {
*out_va_space = gpu_va_space->va_space;
*out_gpu = gpu_va_space->gpu;
}
else {
status = NV_ERR_PAGE_TABLE_NOT_AVAIL;
}
}
exit_unlock:

View File

@@ -279,6 +279,10 @@ struct uvm_fault_service_batch_context_struct
// pick one to be the target of the cancel sequence.
uvm_va_space_t *fatal_va_space;
// TODO: Bug 3900733: refactor service_fault_batch_for_cancel() to handle
// iterating over multiple GPU VA spaces and remove fatal_gpu.
uvm_gpu_t *fatal_gpu;
bool has_throttled_faults;
NvU32 num_invalid_prefetch_faults;
@@ -593,6 +597,7 @@ typedef enum
UVM_GPU_LINK_NVLINK_2,
UVM_GPU_LINK_NVLINK_3,
UVM_GPU_LINK_NVLINK_4,
UVM_GPU_LINK_NVLINK_5,
UVM_GPU_LINK_C2C,
UVM_GPU_LINK_MAX
} uvm_gpu_link_type_t;
@@ -1265,11 +1270,6 @@ struct uvm_gpu_peer_struct
// peer_id[1] from max(gpu_id_1, gpu_id_2) -> min(gpu_id_1, gpu_id_2)
NvU8 peer_ids[2];
// Indirect peers are GPUs which can coherently access each others' memory
// over NVLINK, but are routed through the CPU using the SYS aperture rather
// than a PEER aperture
NvU8 is_indirect_peer : 1;
// The link type between the peer GPUs, currently either PCIe or NVLINK.
// This field is used to determine when this peer struct has been
// initialized (link_type != UVM_GPU_LINK_INVALID). NVLink peers are
@@ -1278,8 +1278,8 @@ struct uvm_gpu_peer_struct
uvm_gpu_link_type_t link_type;
// Maximum unidirectional bandwidth between the peers in megabytes per
// second, not taking into account the protocols' overhead. The reported
// bandwidth for indirect peers is zero. See UvmGpuP2PCapsParams.
// second, not taking into account the protocols' overhead.
// See UvmGpuP2PCapsParams.
NvU32 total_link_line_rate_mbyte_per_s;
// For PCIe, the number of times that this has been retained by a VA space.
@@ -1423,19 +1423,9 @@ static bool uvm_gpus_are_nvswitch_connected(const uvm_gpu_t *gpu0, const uvm_gpu
return false;
}
static bool uvm_gpus_are_indirect_peers(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
static bool uvm_gpus_are_smc_peers(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
uvm_gpu_peer_t *peer_caps = uvm_gpu_peer_caps(gpu0, gpu1);
if (peer_caps->link_type != UVM_GPU_LINK_INVALID && peer_caps->is_indirect_peer) {
UVM_ASSERT(gpu0->mem_info.numa.enabled);
UVM_ASSERT(gpu1->mem_info.numa.enabled);
UVM_ASSERT(peer_caps->link_type != UVM_GPU_LINK_PCIE);
UVM_ASSERT(!uvm_gpus_are_nvswitch_connected(gpu0, gpu1));
return true;
}
return false;
return gpu0->parent == gpu1->parent;
}
// Retrieve the virtual address corresponding to the given vidmem physical
@@ -1620,16 +1610,25 @@ void uvm_parent_gpu_remove_user_channel(uvm_parent_gpu_t *parent_gpu, uvm_user_c
// NV_ERR_PAGE_TABLE_NOT_AVAIL Entry's instance pointer is valid but the entry
// targets an invalid subcontext
//
// out_va_space is valid if NV_OK is returned, otherwise it's NULL. The caller
// is responsible for ensuring that the returned va_space can't be destroyed,
// so these functions should only be called from the bottom half.
// out_va_space is valid if NV_OK is returned, otherwise it's NULL.
// out_gpu is valid if NV_OK is returned, otherwise it's NULL.
// The caller is responsible for ensuring that the returned va_space and gpu
// can't be destroyed, so this function should only be called from the bottom
// half.
NV_STATUS uvm_parent_gpu_fault_entry_to_va_space(uvm_parent_gpu_t *parent_gpu,
uvm_fault_buffer_entry_t *fault,
uvm_va_space_t **out_va_space);
const uvm_fault_buffer_entry_t *fault,
uvm_va_space_t **out_va_space,
uvm_gpu_t **out_gpu);
// Return the GPU VA space for the given instance pointer and ve_id in the
// access counter entry. This function can only be used for virtual address
// entries.
// The return values are the same as uvm_parent_gpu_fault_entry_to_va_space()
// but for virtual access counter entries.
NV_STATUS uvm_parent_gpu_access_counter_entry_to_va_space(uvm_parent_gpu_t *parent_gpu,
uvm_access_counter_buffer_entry_t *entry,
uvm_va_space_t **out_va_space);
const uvm_access_counter_buffer_entry_t *entry,
uvm_va_space_t **out_va_space,
uvm_gpu_t **out_gpu);
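// A hedged usage sketch (hypothetical caller, not part of this header): the
// contract above suggests translating each entry, treating
// NV_ERR_PAGE_TABLE_NOT_AVAIL as "valid channel, stale subcontext", and only
// dereferencing the outputs on NV_OK while the bottom half keeps them alive.
static bool example_translate_fault(uvm_parent_gpu_t *parent_gpu, const uvm_fault_buffer_entry_t *fault)
{
    uvm_va_space_t *va_space;
    uvm_gpu_t *gpu;
    NV_STATUS status = uvm_parent_gpu_fault_entry_to_va_space(parent_gpu, fault, &va_space, &gpu);

    if (status != NV_OK) {
        // Both outputs are NULL here; drop or cancel the fault based on status.
        return false;
    }

    // va_space and gpu are safe to use for the remainder of the bottom half.
    return true;
}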
typedef enum
{

View File

@@ -734,9 +734,18 @@ static int cmp_sort_virt_notifications_by_instance_ptr(const void *_a, const voi
return cmp_access_counter_instance_ptr(a, b);
}
// Compare two GPUs
static inline int cmp_gpu(const uvm_gpu_t *a, const uvm_gpu_t *b)
{
NvU32 id_a = a ? uvm_id_value(a->id) : 0;
NvU32 id_b = b ? uvm_id_value(b->id) : 0;
return UVM_CMP_DEFAULT(id_a, id_b);
}
// Sort comparator for pointers to GVA access counter notification buffer
// entries that sorts by va_space, and fault address.
static int cmp_sort_virt_notifications_by_va_space_address(const void *_a, const void *_b)
// entries that sorts by va_space, GPU ID, and fault address.
static int cmp_sort_virt_notifications_by_va_space_gpu_address(const void *_a, const void *_b)
{
const uvm_access_counter_buffer_entry_t **a = (const uvm_access_counter_buffer_entry_t **)_a;
const uvm_access_counter_buffer_entry_t **b = (const uvm_access_counter_buffer_entry_t **)_b;
@@ -747,6 +756,10 @@ static int cmp_sort_virt_notifications_by_va_space_address(const void *_a, const
if (result != 0)
return result;
result = cmp_gpu((*a)->gpu, (*b)->gpu);
if (result != 0)
return result;
return UVM_CMP_DEFAULT((*a)->address.address, (*b)->address.address);
}
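// A standalone sketch of the three-level ordering this comparator produces;
// the struct and helpers below are illustrative stand-ins. Sorting by
// (va_space, gpu, address) lets later passes walk the array once and treat
// each run of equal (va_space, gpu) pairs as a single service batch.
typedef struct {
    NvU64 va_space;  // stands in for the uvm_va_space_t pointer value
    NvU32 gpu_id;    // stands in for uvm_id_value(gpu->id), 0 when gpu is NULL
    NvU64 address;
} example_notification_key_t;

static int example_cmp_u64(NvU64 a, NvU64 b)
{
    return (a > b) - (a < b);
}

static int example_cmp_notification_key(const example_notification_key_t *a,
                                        const example_notification_key_t *b)
{
    int result = example_cmp_u64(a->va_space, b->va_space);

    if (result != 0)
        return result;

    result = example_cmp_u64(a->gpu_id, b->gpu_id);
    if (result != 0)
        return result;

    return example_cmp_u64(a->address, b->address);
}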
@@ -774,7 +787,7 @@ typedef enum
NOTIFICATION_FETCH_MODE_ALL,
} notification_fetch_mode_t;
static NvU32 fetch_access_counter_buffer_entries(uvm_gpu_t *gpu,
static NvU32 fetch_access_counter_buffer_entries(uvm_parent_gpu_t *parent_gpu,
uvm_access_counter_service_batch_context_t *batch_context,
notification_fetch_mode_t fetch_mode)
{
@@ -783,12 +796,12 @@ static NvU32 fetch_access_counter_buffer_entries(uvm_gpu_t *gpu,
NvU32 notification_index;
uvm_access_counter_buffer_entry_t *notification_cache;
uvm_spin_loop_t spin;
uvm_access_counter_buffer_info_t *access_counters = &gpu->parent->access_counter_buffer_info;
uvm_access_counter_buffer_info_t *access_counters = &parent_gpu->access_counter_buffer_info;
NvU32 last_instance_ptr_idx = 0;
uvm_aperture_t last_aperture = UVM_APERTURE_PEER_MAX;
UVM_ASSERT(uvm_sem_is_locked(&gpu->parent->isr.access_counters.service_lock));
UVM_ASSERT(gpu->parent->access_counters_supported);
UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.access_counters.service_lock));
UVM_ASSERT(parent_gpu->access_counters_supported);
notification_cache = batch_context->notification_cache;
@@ -819,7 +832,7 @@ static NvU32 fetch_access_counter_buffer_entries(uvm_gpu_t *gpu,
// We cannot just wait for the last entry (the one pointed to by put) to become valid; we have to poll
// each entry individually since entries can be written out of order
UVM_SPIN_WHILE(!gpu->parent->access_counter_buffer_hal->entry_is_valid(gpu->parent, get), &spin) {
UVM_SPIN_WHILE(!parent_gpu->access_counter_buffer_hal->entry_is_valid(parent_gpu, get), &spin) {
// We have some entry to work on. Let's do the rest later.
if (fetch_mode != NOTIFICATION_FETCH_MODE_ALL && notification_index > 0)
goto done;
@@ -829,7 +842,7 @@ static NvU32 fetch_access_counter_buffer_entries(uvm_gpu_t *gpu,
smp_mb__after_atomic();
// Got valid bit set. Let's cache.
gpu->parent->access_counter_buffer_hal->parse_entry(gpu->parent, get, current_entry);
parent_gpu->access_counter_buffer_hal->parse_entry(parent_gpu, get, current_entry);
if (current_entry->address.is_virtual) {
batch_context->virt.notifications[batch_context->virt.num_notifications++] = current_entry;
@@ -845,26 +858,38 @@ static NvU32 fetch_access_counter_buffer_entries(uvm_gpu_t *gpu,
}
}
else {
const NvU64 translation_size = get_config_for_type(access_counters, current_entry->counter_type)->translation_size;
NvU64 translation_size;
uvm_gpu_t *gpu;
translation_size = get_config_for_type(access_counters,
current_entry->counter_type)->translation_size;
current_entry->address.address = UVM_ALIGN_DOWN(current_entry->address.address, translation_size);
batch_context->phys.notifications[batch_context->phys.num_notifications++] = current_entry;
current_entry->physical_info.resident_id =
uvm_gpu_get_processor_id_by_address(gpu, uvm_gpu_phys_address(current_entry->address.aperture,
current_entry->address.address));
if (batch_context->phys.is_single_aperture) {
if (batch_context->phys.num_notifications == 1)
last_aperture = current_entry->address.aperture;
else if (current_entry->address.aperture != last_aperture)
batch_context->phys.is_single_aperture = false;
gpu = uvm_parent_gpu_find_first_valid_gpu(parent_gpu);
if (!gpu) {
current_entry->physical_info.resident_id = UVM_ID_INVALID;
current_entry->gpu = NULL;
}
else {
current_entry->gpu = gpu;
current_entry->physical_info.resident_id =
uvm_gpu_get_processor_id_by_address(gpu, uvm_gpu_phys_address(current_entry->address.aperture,
current_entry->address.address));
if (current_entry->counter_type == UVM_ACCESS_COUNTER_TYPE_MOMC)
UVM_ASSERT(uvm_id_equal(current_entry->physical_info.resident_id, gpu->id));
else
UVM_ASSERT(!uvm_id_equal(current_entry->physical_info.resident_id, gpu->id));
if (batch_context->phys.is_single_aperture) {
if (batch_context->phys.num_notifications == 1)
last_aperture = current_entry->address.aperture;
else if (current_entry->address.aperture != last_aperture)
batch_context->phys.is_single_aperture = false;
}
if (current_entry->counter_type == UVM_ACCESS_COUNTER_TYPE_MOMC)
UVM_ASSERT(uvm_id_equal(current_entry->physical_info.resident_id, gpu->id));
else
UVM_ASSERT(!uvm_id_equal(current_entry->physical_info.resident_id, gpu->id));
}
}
++notification_index;
@@ -874,7 +899,7 @@ static NvU32 fetch_access_counter_buffer_entries(uvm_gpu_t *gpu,
}
done:
write_get(gpu->parent, get);
write_get(parent_gpu, get);
return notification_index;
}
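// A simplified, self-contained sketch of the fetch loop above: walk from GET
// towards PUT but check each entry's valid flag individually, since the GPU
// may publish entries out of order. Unlike the real loop, this sketch stops at
// the first entry that is not yet valid instead of spin-waiting. All names and
// the fixed ring size are illustrative assumptions.
#define EXAMPLE_RING_SIZE 32

typedef struct {
    volatile NvU32 valid;
    NvU64 payload;
} example_ring_entry_t;

static NvU32 example_fetch_ready_entries(example_ring_entry_t *ring,
                                         NvU32 get,
                                         NvU32 put,
                                         NvU64 *out,
                                         NvU32 max_out)
{
    NvU32 fetched = 0;

    while (get != put && fetched < max_out) {
        if (!ring[get].valid)
            break;                            // stop at the first unpublished entry

        out[fetched++] = ring[get].payload;
        ring[get].valid = 0;                  // hand the slot back to the producer
        get = (get + 1) % EXAMPLE_RING_SIZE;
    }

    return fetched;                           // the caller then writes the new GET
}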
@@ -895,12 +920,16 @@ static void translate_virt_notifications_instance_ptrs(uvm_parent_gpu_t *parent_
// simply be ignored in subsequent processing.
status = uvm_parent_gpu_access_counter_entry_to_va_space(parent_gpu,
current_entry,
&current_entry->virtual_info.va_space);
if (status != NV_OK)
&current_entry->virtual_info.va_space,
&current_entry->gpu);
if (status != NV_OK) {
UVM_ASSERT(current_entry->virtual_info.va_space == NULL);
UVM_ASSERT(current_entry->gpu == NULL);
}
}
else {
current_entry->virtual_info.va_space = batch_context->virt.notifications[i - 1]->virtual_info.va_space;
current_entry->gpu = batch_context->virt.notifications[i - 1]->gpu;
}
}
}
@@ -924,7 +953,7 @@ static void preprocess_virt_notifications(uvm_parent_gpu_t *parent_gpu,
sort(batch_context->virt.notifications,
batch_context->virt.num_notifications,
sizeof(*batch_context->virt.notifications),
cmp_sort_virt_notifications_by_va_space_address,
cmp_sort_virt_notifications_by_va_space_gpu_address,
NULL);
}
@@ -942,13 +971,17 @@ static void preprocess_phys_notifications(uvm_access_counter_service_batch_conte
}
}
static NV_STATUS notify_tools_and_process_flags(uvm_gpu_t *gpu,
uvm_access_counter_buffer_entry_t **notification_start,
NvU32 num_entries,
NvU32 flags)
static NV_STATUS notify_tools_broadcast_and_process_flags(uvm_parent_gpu_t *parent_gpu,
uvm_access_counter_buffer_entry_t **notification_start,
NvU32 num_entries,
NvU32 flags)
{
uvm_gpu_t *gpu = uvm_parent_gpu_find_first_valid_gpu(parent_gpu);
NV_STATUS status = NV_OK;
if (!gpu)
return NV_OK;
if (uvm_enable_builtin_tests) {
// TODO: Bug 4310744: [UVM][TOOLS] Attribute access counter tools events
// to va_space instead of broadcasting.
@@ -964,6 +997,31 @@ static NV_STATUS notify_tools_and_process_flags(uvm_gpu_t *gpu,
return status;
}
static NV_STATUS notify_tools_and_process_flags(uvm_va_space_t *va_space,
uvm_gpu_t *gpu,
uvm_access_counter_buffer_entry_t **notification_start,
NvU32 num_entries,
NvU32 flags)
{
NV_STATUS status = NV_OK;
if (uvm_enable_builtin_tests) {
NvU32 i;
for (i = 0; i < num_entries; i++) {
uvm_tools_record_access_counter(va_space,
gpu->id,
notification_start[i],
flags & UVM_ACCESS_COUNTER_PHYS_ON_MANAGED);
}
}
if (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR)
status = access_counter_clear_notifications(gpu, notification_start, num_entries);
return status;
}
static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
uvm_va_block_t *va_block,
uvm_va_block_retry_t *va_block_retry,
@@ -1169,13 +1227,13 @@ static void reverse_mappings_to_va_block_page_mask(uvm_va_block_t *va_block,
}
}
static NV_STATUS service_phys_single_va_block(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context,
static NV_STATUS service_phys_single_va_block(uvm_access_counter_service_batch_context_t *batch_context,
const uvm_access_counter_buffer_entry_t *current_entry,
const uvm_reverse_map_t *reverse_mappings,
size_t num_reverse_mappings,
NvU32 *out_flags)
{
uvm_gpu_t *gpu = current_entry->gpu;
size_t index;
uvm_va_block_t *va_block = reverse_mappings[0].va_block;
uvm_va_space_t *va_space = NULL;
@@ -1262,8 +1320,7 @@ done:
return status;
}
static NV_STATUS service_phys_va_blocks(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context,
static NV_STATUS service_phys_va_blocks(uvm_access_counter_service_batch_context_t *batch_context,
const uvm_access_counter_buffer_entry_t *current_entry,
const uvm_reverse_map_t *reverse_mappings,
size_t num_reverse_mappings,
@@ -1276,8 +1333,7 @@ static NV_STATUS service_phys_va_blocks(uvm_gpu_t *gpu,
for (index = 0; index < num_reverse_mappings; ++index) {
NvU32 out_flags_local = 0;
status = service_phys_single_va_block(gpu,
batch_context,
status = service_phys_single_va_block(batch_context,
current_entry,
reverse_mappings + index,
1,
@@ -1326,8 +1382,7 @@ static bool are_reverse_mappings_on_single_block(const uvm_reverse_map_t *revers
// Service the given translation range. It will return the count of the reverse
// mappings found during servicing in num_reverse_mappings, even if the function
// doesn't return NV_OK.
static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,
uvm_gpu_t *resident_gpu,
static NV_STATUS service_phys_notification_translation(uvm_gpu_t *resident_gpu,
uvm_access_counter_service_batch_context_t *batch_context,
const uvm_gpu_access_counter_type_config_t *config,
const uvm_access_counter_buffer_entry_t *current_entry,
@@ -1336,6 +1391,7 @@ static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,
size_t *num_reverse_mappings,
NvU32 *out_flags)
{
uvm_gpu_t *gpu = current_entry->gpu;
NV_STATUS status;
NvU32 region_start, region_end;
@@ -1373,16 +1429,14 @@ static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,
// Service all the translations
if (are_reverse_mappings_on_single_block(batch_context->phys.translations, *num_reverse_mappings)) {
status = service_phys_single_va_block(gpu,
batch_context,
status = service_phys_single_va_block(batch_context,
current_entry,
batch_context->phys.translations,
*num_reverse_mappings,
out_flags);
}
else {
status = service_phys_va_blocks(gpu,
batch_context,
status = service_phys_va_blocks(batch_context,
current_entry,
batch_context->phys.translations,
*num_reverse_mappings,
@@ -1392,14 +1446,14 @@ static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,
return status;
}
static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context,
const uvm_access_counter_buffer_entry_t *current_entry,
NvU32 *out_flags)
static NV_STATUS service_phys_notification(uvm_access_counter_service_batch_context_t *batch_context,
uvm_access_counter_buffer_entry_t *current_entry)
{
NvU64 address;
NvU64 translation_index;
uvm_access_counter_buffer_info_t *access_counters = &gpu->parent->access_counter_buffer_info;
uvm_gpu_t *gpu = current_entry->gpu;
uvm_parent_gpu_t *parent_gpu = gpu->parent;
uvm_access_counter_buffer_info_t *access_counters = &parent_gpu->access_counter_buffer_info;
uvm_access_counter_type_t counter_type = current_entry->counter_type;
const uvm_gpu_access_counter_type_config_t *config = get_config_for_type(access_counters, counter_type);
unsigned long sub_granularity;
@@ -1429,14 +1483,13 @@ static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
// fall outside of the allocatable address range. We just drop
// them.
if (address >= resident_gpu->mem_info.max_allocatable_address)
return NV_OK;
goto out;
}
for (translation_index = 0; translation_index < config->translations_per_counter; ++translation_index) {
size_t num_reverse_mappings;
NvU32 out_flags_local = 0;
status = service_phys_notification_translation(gpu,
resident_gpu,
status = service_phys_notification_translation(resident_gpu,
batch_context,
config,
current_entry,
@@ -1457,37 +1510,32 @@ static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
}
if (uvm_enable_builtin_tests)
*out_flags |= ((total_reverse_mappings != 0) ? UVM_ACCESS_COUNTER_PHYS_ON_MANAGED : 0);
if (status == NV_OK && (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR))
*out_flags |= UVM_ACCESS_COUNTER_ACTION_CLEAR;
flags |= ((total_reverse_mappings != 0) ? UVM_ACCESS_COUNTER_PHYS_ON_MANAGED : 0);
out:
notify_tools_broadcast_and_process_flags(parent_gpu, &current_entry, 1, flags);
return status;
}
// TODO: Bug 2018899: Add statistics for dropped access counter notifications
static NV_STATUS service_phys_notifications(uvm_gpu_t *gpu,
static NV_STATUS service_phys_notifications(uvm_parent_gpu_t *parent_gpu,
uvm_access_counter_service_batch_context_t *batch_context)
{
NvU32 i;
uvm_access_counter_buffer_entry_t **notifications = batch_context->phys.notifications;
UVM_ASSERT(gpu->parent->access_counters_can_use_physical_addresses);
UVM_ASSERT(parent_gpu->access_counters_can_use_physical_addresses);
preprocess_phys_notifications(batch_context);
for (i = 0; i < batch_context->phys.num_notifications; ++i) {
NV_STATUS status;
uvm_access_counter_buffer_entry_t *current_entry = notifications[i];
NvU32 flags = 0;
if (!UVM_ID_IS_VALID(current_entry->physical_info.resident_id))
continue;
status = service_phys_notification(gpu, batch_context, current_entry, &flags);
notify_tools_and_process_flags(gpu, &notifications[i], 1, flags);
status = service_phys_notification(batch_context, current_entry);
if (status != NV_OK)
return status;
}
@@ -1624,16 +1672,14 @@ static NV_STATUS service_virt_notifications_in_block(uvm_gpu_va_space_t *gpu_va_
uvm_access_counter_buffer_entry_t *current_entry = notifications[i];
NvU64 address = current_entry->address.address;
if ((current_entry->virtual_info.va_space == va_space) && (address <= va_block->end)) {
expand_notification_block(gpu_va_space,
va_block,
batch_context->block_service_context.block_context,
accessed_pages,
current_entry);
}
else {
if (current_entry->virtual_info.va_space != va_space || current_entry->gpu != gpu || address > va_block->end)
break;
}
expand_notification_block(gpu_va_space,
va_block,
batch_context->block_service_context.block_context,
accessed_pages,
current_entry);
}
*out_index = i;
@@ -1648,7 +1694,7 @@ static NV_STATUS service_virt_notifications_in_block(uvm_gpu_va_space_t *gpu_va_
if (status == NV_OK)
flags |= UVM_ACCESS_COUNTER_ACTION_CLEAR;
flags_status = notify_tools_and_process_flags(gpu, &notifications[index], *out_index - index, flags);
flags_status = notify_tools_and_process_flags(va_space, gpu, &notifications[index], *out_index - index, flags);
if ((status == NV_OK) && (flags_status != NV_OK))
status = flags_status;
@@ -1687,7 +1733,7 @@ static NV_STATUS service_virt_notification_ats(uvm_gpu_va_space_t *gpu_va_space,
if (!vma) {
// Clear the notification entry to continue receiving access counter
// notifications when a new VMA is allocated in this range.
status = notify_tools_and_process_flags(gpu, &notifications[index], 1, flags);
status = notify_tools_and_process_flags(va_space, gpu, &notifications[index], 1, flags);
*out_index = index + 1;
return status;
}
@@ -1701,10 +1747,10 @@ static NV_STATUS service_virt_notification_ats(uvm_gpu_va_space_t *gpu_va_space,
uvm_access_counter_buffer_entry_t *current_entry = notifications[i];
address = current_entry->address.address;
if ((current_entry->virtual_info.va_space == va_space) && (address < end))
uvm_page_mask_set(&ats_context->accessed_mask, (address - base) / PAGE_SIZE);
else
if (current_entry->virtual_info.va_space != va_space || current_entry->gpu != gpu || address >= end)
break;
uvm_page_mask_set(&ats_context->accessed_mask, (address - base) / PAGE_SIZE);
}
*out_index = i;
@@ -1719,7 +1765,7 @@ static NV_STATUS service_virt_notification_ats(uvm_gpu_va_space_t *gpu_va_space,
if (status != NV_OK)
flags &= ~UVM_ACCESS_COUNTER_ACTION_CLEAR;
flags_status = notify_tools_and_process_flags(gpu, &notifications[index], *out_index - index, flags);
flags_status = notify_tools_and_process_flags(va_space, gpu, &notifications[index], *out_index - index, flags);
if ((status == NV_OK) && (flags_status != NV_OK))
status = flags_status;
@@ -1771,7 +1817,7 @@ static NV_STATUS service_virt_notifications_batch(uvm_gpu_va_space_t *gpu_va_spa
status = service_virt_notifications_in_block(gpu_va_space, mm, va_block, batch_context, index, out_index);
}
else {
status = notify_tools_and_process_flags(gpu_va_space->gpu, batch_context->virt.notifications, 1, flags);
status = notify_tools_and_process_flags(va_space, gpu_va_space->gpu, batch_context->virt.notifications, 1, flags);
*out_index = index + 1;
}
}
@@ -1801,7 +1847,11 @@ static NV_STATUS service_virt_notifications_batch(uvm_gpu_va_space_t *gpu_va_spa
// Clobber status to continue processing the rest of the notifications
// in the batch.
status = notify_tools_and_process_flags(gpu_va_space->gpu, batch_context->virt.notifications, 1, flags);
status = notify_tools_and_process_flags(va_space,
gpu_va_space->gpu,
batch_context->virt.notifications,
1,
flags);
*out_index = index + 1;
}
@@ -1809,7 +1859,7 @@ static NV_STATUS service_virt_notifications_batch(uvm_gpu_va_space_t *gpu_va_spa
return status;
}
static NV_STATUS service_virt_notifications(uvm_gpu_t *gpu,
static NV_STATUS service_virt_notifications(uvm_parent_gpu_t *parent_gpu,
uvm_access_counter_service_batch_context_t *batch_context)
{
NvU32 i = 0;
@@ -1817,18 +1867,19 @@ static NV_STATUS service_virt_notifications(uvm_gpu_t *gpu,
struct mm_struct *mm = NULL;
uvm_va_space_t *va_space = NULL;
uvm_va_space_t *prev_va_space = NULL;
uvm_gpu_t *prev_gpu = NULL;
uvm_gpu_va_space_t *gpu_va_space = NULL;
// TODO: Bug 4299018: Add support for virtual access counter migrations on
// 4K page sizes.
if (PAGE_SIZE == UVM_PAGE_SIZE_4K) {
return notify_tools_and_process_flags(gpu,
batch_context->virt.notifications,
batch_context->virt.num_notifications,
0);
return notify_tools_broadcast_and_process_flags(parent_gpu,
batch_context->virt.notifications,
batch_context->virt.num_notifications,
0);
}
preprocess_virt_notifications(gpu->parent, batch_context);
preprocess_virt_notifications(parent_gpu, batch_context);
while (i < batch_context->virt.num_notifications) {
uvm_access_counter_buffer_entry_t *current_entry = batch_context->virt.notifications[i];
@@ -1842,25 +1893,38 @@ static NV_STATUS service_virt_notifications(uvm_gpu_t *gpu,
uvm_va_space_mm_release_unlock(prev_va_space, mm);
mm = NULL;
gpu_va_space = NULL;
prev_gpu = NULL;
}
// Acquire locks for the new va_space.
if (va_space) {
mm = uvm_va_space_mm_retain_lock(va_space);
uvm_va_space_down_read(va_space);
gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
}
prev_va_space = va_space;
}
if (va_space && gpu_va_space && uvm_va_space_has_access_counter_migrations(va_space)) {
status = service_virt_notifications_batch(gpu_va_space, mm, batch_context, i, &i);
if (va_space) {
if (prev_gpu != current_entry->gpu) {
prev_gpu = current_entry->gpu;
gpu_va_space = uvm_gpu_va_space_get(va_space, current_entry->gpu);
}
if (gpu_va_space && uvm_va_space_has_access_counter_migrations(va_space)) {
status = service_virt_notifications_batch(gpu_va_space, mm, batch_context, i, &i);
}
else {
status = notify_tools_and_process_flags(va_space,
current_entry->gpu,
&batch_context->virt.notifications[i],
1,
0);
i++;
}
}
else {
status = notify_tools_and_process_flags(gpu, &batch_context->virt.notifications[i], 1, 0);
status = notify_tools_broadcast_and_process_flags(parent_gpu, &batch_context->virt.notifications[i], 1, 0);
i++;
}
@@ -1876,19 +1940,18 @@ static NV_STATUS service_virt_notifications(uvm_gpu_t *gpu,
return status;
}
void uvm_gpu_service_access_counters(uvm_gpu_t *gpu)
void uvm_parent_gpu_service_access_counters(uvm_parent_gpu_t *parent_gpu)
{
NV_STATUS status = NV_OK;
uvm_access_counter_service_batch_context_t *batch_context = &gpu->parent->access_counter_buffer_info.batch_service_context;
uvm_access_counter_service_batch_context_t *batch_context = &parent_gpu->access_counter_buffer_info.batch_service_context;
UVM_ASSERT(gpu->parent->access_counters_supported);
UVM_ASSERT(parent_gpu->access_counters_supported);
if (gpu->parent->access_counter_buffer_info.notifications_ignored_count > 0)
if (parent_gpu->access_counter_buffer_info.notifications_ignored_count > 0)
return;
while (1) {
batch_context->num_cached_notifications = fetch_access_counter_buffer_entries(gpu,
batch_context->num_cached_notifications = fetch_access_counter_buffer_entries(parent_gpu,
batch_context,
NOTIFICATION_FETCH_MODE_BATCH_READY);
if (batch_context->num_cached_notifications == 0)
@@ -1897,13 +1960,13 @@ void uvm_gpu_service_access_counters(uvm_gpu_t *gpu)
++batch_context->batch_id;
if (batch_context->virt.num_notifications) {
status = service_virt_notifications(gpu, batch_context);
status = service_virt_notifications(parent_gpu, batch_context);
if (status != NV_OK)
break;
}
if (batch_context->phys.num_notifications) {
status = service_phys_notifications(gpu, batch_context);
status = service_phys_notifications(parent_gpu, batch_context);
if (status != NV_OK)
break;
}
@@ -1912,7 +1975,7 @@ void uvm_gpu_service_access_counters(uvm_gpu_t *gpu)
if (status != NV_OK) {
UVM_DBG_PRINT("Error %s servicing access counter notifications on GPU: %s\n",
nvstatusToString(status),
uvm_gpu_name(gpu));
uvm_parent_gpu_name(parent_gpu));
}
}

View File

@@ -31,7 +31,7 @@ NV_STATUS uvm_parent_gpu_init_access_counters(uvm_parent_gpu_t *parent_gpu);
void uvm_parent_gpu_deinit_access_counters(uvm_parent_gpu_t *parent_gpu);
bool uvm_parent_gpu_access_counters_pending(uvm_parent_gpu_t *parent_gpu);
void uvm_gpu_service_access_counters(uvm_gpu_t *gpu);
void uvm_parent_gpu_service_access_counters(uvm_parent_gpu_t *parent_gpu);
void uvm_parent_gpu_access_counter_buffer_flush(uvm_parent_gpu_t *parent_gpu);

View File

@@ -479,17 +479,14 @@ void uvm_parent_gpu_deinit_isr(uvm_parent_gpu_t *parent_gpu)
uvm_kvfree(parent_gpu->isr.access_counters.stats.cpu_exec_count);
}
static uvm_gpu_t *find_first_valid_gpu(uvm_parent_gpu_t *parent_gpu)
uvm_gpu_t *uvm_parent_gpu_find_first_valid_gpu(uvm_parent_gpu_t *parent_gpu)
{
uvm_gpu_t *gpu;
// When SMC is enabled, there's no longer a 1:1 relationship between the
// parent and the partitions. But because all relevant interrupt paths
// are shared, as is the fault reporting logic, it's sufficient here
// to proceed with any valid uvm_gpu_t, even if the corresponding partition
// didn't cause all, or even any of the interrupts.
// The bottom half handlers will later find the appropriate partitions by
// attributing the notifications to VA spaces as necessary.
// parent and the partitions. It's sufficient to return any valid uvm_gpu_t
// since the purpose is to have a channel and push buffer for operations
// that affect the whole parent GPU.
if (parent_gpu->smc.enabled) {
NvU32 sub_processor_index;
@@ -518,13 +515,8 @@ static uvm_gpu_t *find_first_valid_gpu(uvm_parent_gpu_t *parent_gpu)
static void replayable_faults_isr_bottom_half(void *args)
{
uvm_parent_gpu_t *parent_gpu = (uvm_parent_gpu_t *)args;
uvm_gpu_t *gpu;
unsigned int cpu;
gpu = find_first_valid_gpu(parent_gpu);
if (gpu == NULL)
goto put_kref;
UVM_ASSERT(parent_gpu->replayable_faults_supported);
// Record the lock ownership
@@ -545,11 +537,10 @@ static void replayable_faults_isr_bottom_half(void *args)
++parent_gpu->isr.replayable_faults.stats.cpu_exec_count[cpu];
put_cpu();
uvm_gpu_service_replayable_faults(gpu);
uvm_parent_gpu_service_replayable_faults(parent_gpu);
uvm_parent_gpu_replayable_faults_isr_unlock(parent_gpu);
put_kref:
// It is OK to drop a reference on the parent GPU if a bottom half has
// been retriggered within uvm_parent_gpu_replayable_faults_isr_unlock,
// because the rescheduling added an additional reference.
@@ -564,13 +555,8 @@ static void replayable_faults_isr_bottom_half_entry(void *args)
static void non_replayable_faults_isr_bottom_half(void *args)
{
uvm_parent_gpu_t *parent_gpu = (uvm_parent_gpu_t *)args;
uvm_gpu_t *gpu;
unsigned int cpu;
gpu = find_first_valid_gpu(parent_gpu);
if (gpu == NULL)
goto put_kref;
UVM_ASSERT(parent_gpu->non_replayable_faults_supported);
uvm_parent_gpu_non_replayable_faults_isr_lock(parent_gpu);
@@ -584,11 +570,10 @@ static void non_replayable_faults_isr_bottom_half(void *args)
++parent_gpu->isr.non_replayable_faults.stats.cpu_exec_count[cpu];
put_cpu();
uvm_gpu_service_non_replayable_fault_buffer(gpu);
uvm_parent_gpu_service_non_replayable_fault_buffer(parent_gpu);
uvm_parent_gpu_non_replayable_faults_isr_unlock(parent_gpu);
put_kref:
uvm_parent_gpu_kref_put(parent_gpu);
}
@@ -600,13 +585,8 @@ static void non_replayable_faults_isr_bottom_half_entry(void *args)
static void access_counters_isr_bottom_half(void *args)
{
uvm_parent_gpu_t *parent_gpu = (uvm_parent_gpu_t *)args;
uvm_gpu_t *gpu;
unsigned int cpu;
gpu = find_first_valid_gpu(parent_gpu);
if (gpu == NULL)
goto put_kref;
UVM_ASSERT(parent_gpu->access_counters_supported);
uvm_record_lock(&parent_gpu->isr.access_counters.service_lock, UVM_LOCK_FLAGS_MODE_SHARED);
@@ -620,11 +600,10 @@ static void access_counters_isr_bottom_half(void *args)
++parent_gpu->isr.access_counters.stats.cpu_exec_count[cpu];
put_cpu();
uvm_gpu_service_access_counters(gpu);
uvm_parent_gpu_service_access_counters(parent_gpu);
uvm_parent_gpu_access_counters_isr_unlock(parent_gpu);
put_kref:
uvm_parent_gpu_kref_put(parent_gpu);
}
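// A hedged sketch of the remaining usage pattern for
// uvm_parent_gpu_find_first_valid_gpu(): callers that need a channel or push
// buffer for a whole-parent operation grab any registered partition and bail
// out when none is registered. The wrapper below is hypothetical.
static NV_STATUS example_whole_parent_operation(uvm_parent_gpu_t *parent_gpu)
{
    uvm_gpu_t *gpu = uvm_parent_gpu_find_first_valid_gpu(parent_gpu);

    if (!gpu)
        return NV_OK;   // no MIG instance registered, nothing to push work to

    // ... push work through any of the parent's partitions here ...
    return NV_OK;
}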

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -193,4 +193,10 @@ void uvm_parent_gpu_access_counters_intr_disable(uvm_parent_gpu_t *parent_gpu);
// parent_gpu->isr.interrupts_lock must be held to call this function.
void uvm_parent_gpu_access_counters_intr_enable(uvm_parent_gpu_t *parent_gpu);
// Return the first valid GPU given the parent GPU or NULL if no MIG instances
// are registered. This should only be called from bottom halves or if the
// g_uvm_global.global_lock is held so that the returned pointer remains valid.
//
uvm_gpu_t *uvm_parent_gpu_find_first_valid_gpu(uvm_parent_gpu_t *parent_gpu);
#endif // __UVM_GPU_ISR_H__

View File

@@ -212,6 +212,7 @@ static NV_STATUS fetch_non_replayable_fault_buffer_entries(uvm_parent_gpu_t *par
// Make sure that all fields in the entry are properly initialized
fault_entry->va_space = NULL;
fault_entry->gpu = NULL;
fault_entry->is_fatal = (fault_entry->fault_type >= UVM_FAULT_TYPE_FATAL);
fault_entry->filtered = false;
@@ -235,7 +236,7 @@ static NV_STATUS fetch_non_replayable_fault_buffer_entries(uvm_parent_gpu_t *par
return NV_OK;
}
static bool use_clear_faulted_channel_sw_method(uvm_gpu_t *gpu)
static bool use_clear_faulted_channel_sw_method(uvm_parent_gpu_t *parent_gpu)
{
// If true, UVM uses a SW method to request RM to do the clearing on its
// behalf.
@@ -243,7 +244,7 @@ static bool use_clear_faulted_channel_sw_method(uvm_gpu_t *gpu)
// In SRIOV, the UVM (guest) driver does not have access to the privileged
// registers used to clear the faulted bit.
if (uvm_parent_gpu_is_virt_mode_sriov(gpu->parent))
if (uvm_parent_gpu_is_virt_mode_sriov(parent_gpu))
use_sw_method = true;
// In Confidential Computing access to the privileged registers is blocked,
@@ -253,17 +254,17 @@ static bool use_clear_faulted_channel_sw_method(uvm_gpu_t *gpu)
use_sw_method = true;
if (use_sw_method)
UVM_ASSERT(gpu->parent->has_clear_faulted_channel_sw_method);
UVM_ASSERT(parent_gpu->has_clear_faulted_channel_sw_method);
return use_sw_method;
}
static NV_STATUS clear_faulted_method_on_gpu(uvm_gpu_t *gpu,
uvm_user_channel_t *user_channel,
static NV_STATUS clear_faulted_method_on_gpu(uvm_user_channel_t *user_channel,
const uvm_fault_buffer_entry_t *fault_entry,
NvU32 batch_id,
uvm_tracker_t *tracker)
{
uvm_gpu_t *gpu = user_channel->gpu;
NV_STATUS status;
uvm_push_t push;
uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
@@ -283,7 +284,7 @@ static NV_STATUS clear_faulted_method_on_gpu(uvm_gpu_t *gpu,
return status;
}
if (use_clear_faulted_channel_sw_method(gpu))
if (use_clear_faulted_channel_sw_method(gpu->parent))
gpu->parent->host_hal->clear_faulted_channel_sw_method(&push, user_channel, fault_entry);
else
gpu->parent->host_hal->clear_faulted_channel_method(&push, user_channel, fault_entry);
@@ -305,12 +306,12 @@ static NV_STATUS clear_faulted_method_on_gpu(uvm_gpu_t *gpu,
return status;
}
static NV_STATUS clear_faulted_register_on_gpu(uvm_gpu_t *gpu,
uvm_user_channel_t *user_channel,
static NV_STATUS clear_faulted_register_on_gpu(uvm_user_channel_t *user_channel,
const uvm_fault_buffer_entry_t *fault_entry,
NvU32 batch_id,
uvm_tracker_t *tracker)
{
uvm_gpu_t *gpu = user_channel->gpu;
NV_STATUS status;
UVM_ASSERT(!gpu->parent->has_clear_faulted_channel_method);
@@ -328,25 +329,26 @@ static NV_STATUS clear_faulted_register_on_gpu(uvm_gpu_t *gpu,
return NV_OK;
}
static NV_STATUS clear_faulted_on_gpu(uvm_gpu_t *gpu,
uvm_user_channel_t *user_channel,
static NV_STATUS clear_faulted_on_gpu(uvm_user_channel_t *user_channel,
const uvm_fault_buffer_entry_t *fault_entry,
NvU32 batch_id,
uvm_tracker_t *tracker)
{
if (gpu->parent->has_clear_faulted_channel_method || use_clear_faulted_channel_sw_method(gpu))
return clear_faulted_method_on_gpu(gpu, user_channel, fault_entry, batch_id, tracker);
uvm_gpu_t *gpu = user_channel->gpu;
return clear_faulted_register_on_gpu(gpu, user_channel, fault_entry, batch_id, tracker);
if (gpu->parent->has_clear_faulted_channel_method || use_clear_faulted_channel_sw_method(gpu->parent))
return clear_faulted_method_on_gpu(user_channel, fault_entry, batch_id, tracker);
return clear_faulted_register_on_gpu(user_channel, fault_entry, batch_id, tracker);
}
static NV_STATUS service_managed_fault_in_block_locked(uvm_gpu_t *gpu,
uvm_va_block_t *va_block,
static NV_STATUS service_managed_fault_in_block_locked(uvm_va_block_t *va_block,
uvm_va_block_retry_t *va_block_retry,
uvm_fault_buffer_entry_t *fault_entry,
uvm_service_block_context_t *service_context,
const bool hmm_migratable)
{
uvm_gpu_t *gpu = fault_entry->gpu;
NV_STATUS status = NV_OK;
uvm_page_index_t page_index;
uvm_perf_thrashing_hint_t thrashing_hint;
@@ -441,13 +443,13 @@ static NV_STATUS service_managed_fault_in_block_locked(uvm_gpu_t *gpu,
return status;
}
static NV_STATUS service_managed_fault_in_block(uvm_gpu_t *gpu,
uvm_va_block_t *va_block,
static NV_STATUS service_managed_fault_in_block(uvm_va_block_t *va_block,
uvm_fault_buffer_entry_t *fault_entry,
const bool hmm_migratable)
{
NV_STATUS status, tracker_status;
uvm_va_block_retry_t va_block_retry;
uvm_gpu_t *gpu = fault_entry->gpu;
uvm_service_block_context_t *service_context = &gpu->parent->fault_buffer_info.non_replayable.block_service_context;
service_context->operation = UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS;
@@ -459,8 +461,7 @@ static NV_STATUS service_managed_fault_in_block(uvm_gpu_t *gpu,
uvm_mutex_lock(&va_block->lock);
status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, &va_block_retry,
service_managed_fault_in_block_locked(gpu,
va_block,
service_managed_fault_in_block_locked(va_block,
&va_block_retry,
fault_entry,
service_context,
@@ -502,16 +503,14 @@ static void kill_channel_delayed_entry(void *user_channel)
UVM_ENTRY_VOID(kill_channel_delayed(user_channel));
}
static void schedule_kill_channel(uvm_gpu_t *gpu,
uvm_fault_buffer_entry_t *fault_entry,
uvm_user_channel_t *user_channel)
static void schedule_kill_channel(uvm_fault_buffer_entry_t *fault_entry, uvm_user_channel_t *user_channel)
{
uvm_va_space_t *va_space = fault_entry->va_space;
uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
uvm_parent_gpu_t *parent_gpu = fault_entry->gpu->parent;
uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &parent_gpu->fault_buffer_info.non_replayable;
void *packet = (char *)non_replayable_faults->shadow_buffer_copy +
(fault_entry->non_replayable.buffer_index * gpu->parent->fault_buffer_hal->entry_size(gpu->parent));
(fault_entry->non_replayable.buffer_index * parent_gpu->fault_buffer_hal->entry_size(parent_gpu));
UVM_ASSERT(gpu);
UVM_ASSERT(va_space);
UVM_ASSERT(user_channel);
@@ -522,7 +521,7 @@ static void schedule_kill_channel(uvm_gpu_t *gpu,
user_channel->kill_channel.va_space = va_space;
// Save the packet to be handled by RM in the channel structure
memcpy(user_channel->kill_channel.fault_packet, packet, gpu->parent->fault_buffer_hal->entry_size(gpu->parent));
memcpy(user_channel->kill_channel.fault_packet, packet, parent_gpu->fault_buffer_hal->entry_size(parent_gpu));
// Retain the channel here so it is not prematurely destroyed. It will be
// released after forwarding the fault to RM in kill_channel_delayed.
@@ -533,7 +532,7 @@ static void schedule_kill_channel(uvm_gpu_t *gpu,
kill_channel_delayed_entry,
user_channel);
nv_kthread_q_schedule_q_item(&gpu->parent->isr.kill_channel_q,
nv_kthread_q_schedule_q_item(&parent_gpu->isr.kill_channel_q,
&user_channel->kill_channel.kill_channel_q_item);
}
@@ -550,6 +549,7 @@ static NV_STATUS service_non_managed_fault(uvm_gpu_va_space_t *gpu_va_space,
uvm_fault_buffer_entry_t *fault_entry,
NV_STATUS lookup_status)
{
uvm_va_space_t *va_space = gpu_va_space->va_space;
uvm_gpu_t *gpu = gpu_va_space->gpu;
uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
uvm_ats_fault_invalidate_t *ats_invalidate = &non_replayable_faults->ats_invalidate;
@@ -557,9 +557,11 @@ static NV_STATUS service_non_managed_fault(uvm_gpu_va_space_t *gpu_va_space,
NV_STATUS fatal_fault_status = NV_ERR_INVALID_ADDRESS;
UVM_ASSERT(!fault_entry->is_fatal);
UVM_ASSERT(fault_entry->va_space == va_space);
UVM_ASSERT(fault_entry->gpu == gpu);
// Avoid dropping fault events when the VA block is not found or cannot be created
uvm_perf_event_notify_gpu_fault(&fault_entry->va_space->perf_events,
uvm_perf_event_notify_gpu_fault(&va_space->perf_events,
NULL,
gpu->id,
UVM_ID_INVALID,
@@ -584,11 +586,11 @@ static NV_STATUS service_non_managed_fault(uvm_gpu_va_space_t *gpu_va_space,
ats_invalidate->tlb_batch_pending = false;
va_range_next = uvm_va_space_iter_first(gpu_va_space->va_space, fault_entry->fault_address, ~0ULL);
va_range_next = uvm_va_space_iter_first(va_space, fault_entry->fault_address, ~0ULL);
// The VA isn't managed. See if ATS knows about it.
vma = find_vma_intersection(mm, fault_address, fault_address + 1);
if (!vma || uvm_ats_check_in_gmmu_region(gpu_va_space->va_space, fault_address, va_range_next)) {
if (!vma || uvm_ats_check_in_gmmu_region(va_space, fault_address, va_range_next)) {
// Do not return error due to logical errors in the application
status = NV_OK;
@@ -631,19 +633,24 @@ static NV_STATUS service_non_managed_fault(uvm_gpu_va_space_t *gpu_va_space,
return status;
}
static NV_STATUS service_fault_once(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fault_entry, const bool hmm_migratable)
static NV_STATUS service_fault_once(uvm_parent_gpu_t *parent_gpu,
uvm_fault_buffer_entry_t *fault_entry,
const bool hmm_migratable)
{
NV_STATUS status;
uvm_user_channel_t *user_channel;
uvm_va_block_t *va_block;
uvm_va_space_t *va_space = NULL;
uvm_va_space_t *va_space;
struct mm_struct *mm;
uvm_gpu_va_space_t *gpu_va_space;
uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
uvm_va_block_context_t *va_block_context =
gpu->parent->fault_buffer_info.non_replayable.block_service_context.block_context;
uvm_gpu_t *gpu;
uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &parent_gpu->fault_buffer_info.non_replayable;
uvm_va_block_context_t *va_block_context = non_replayable_faults->block_service_context.block_context;
status = uvm_parent_gpu_fault_entry_to_va_space(gpu->parent, fault_entry, &va_space);
status = uvm_parent_gpu_fault_entry_to_va_space(parent_gpu,
fault_entry,
&va_space,
&gpu);
if (status != NV_OK) {
// The VA space lookup will fail if we're running concurrently with
// removal of the channel from the VA space (channel unregister, GPU VA
@@ -657,10 +664,12 @@ static NV_STATUS service_fault_once(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fa
// replayable faults only use the address space of their channel.
UVM_ASSERT(status == NV_ERR_INVALID_CHANNEL);
UVM_ASSERT(!va_space);
UVM_ASSERT(!gpu);
return NV_OK;
}
UVM_ASSERT(va_space);
UVM_ASSERT(gpu);
// If an mm is registered with the VA space, we have to retain it
// in order to lock it before locking the VA space. It is guaranteed
@@ -671,8 +680,7 @@ static NV_STATUS service_fault_once(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fa
uvm_va_space_down_read(va_space);
gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
if (!gpu_va_space) {
// The va_space might have gone away. See the comment above.
status = NV_OK;
@@ -680,6 +688,7 @@ static NV_STATUS service_fault_once(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fa
}
fault_entry->va_space = va_space;
fault_entry->gpu = gpu;
user_channel = uvm_gpu_va_space_get_user_channel(gpu_va_space, fault_entry->instance_ptr);
if (!user_channel) {
@@ -692,26 +701,25 @@ static NV_STATUS service_fault_once(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fa
if (!fault_entry->is_fatal) {
if (mm) {
status = uvm_va_block_find_create(fault_entry->va_space,
status = uvm_va_block_find_create(va_space,
fault_entry->fault_address,
&va_block_context->hmm.vma,
&va_block);
}
else {
status = uvm_va_block_find_create_managed(fault_entry->va_space,
status = uvm_va_block_find_create_managed(va_space,
fault_entry->fault_address,
&va_block);
}
if (status == NV_OK)
status = service_managed_fault_in_block(gpu_va_space->gpu, va_block, fault_entry, hmm_migratable);
status = service_managed_fault_in_block(va_block, fault_entry, hmm_migratable);
else
status = service_non_managed_fault(gpu_va_space, mm, fault_entry, status);
    // We are done; clear the faulted bit on the channel so it can be
    // re-scheduled
if (status == NV_OK && !fault_entry->is_fatal) {
status = clear_faulted_on_gpu(gpu,
user_channel,
status = clear_faulted_on_gpu(user_channel,
fault_entry,
non_replayable_faults->batch_id,
&non_replayable_faults->fault_service_tracker);
@@ -720,13 +728,13 @@ static NV_STATUS service_fault_once(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fa
}
if (fault_entry->is_fatal)
uvm_tools_record_gpu_fatal_fault(gpu->id, fault_entry->va_space, fault_entry, fault_entry->fatal_reason);
uvm_tools_record_gpu_fatal_fault(gpu->id, va_space, fault_entry, fault_entry->fatal_reason);
if (fault_entry->is_fatal ||
(status != NV_OK &&
status != NV_WARN_MORE_PROCESSING_REQUIRED &&
status != NV_WARN_MISMATCHED_TARGET))
schedule_kill_channel(gpu, fault_entry, user_channel);
schedule_kill_channel(fault_entry, user_channel);
exit_no_channel:
uvm_va_space_up_read(va_space);
@@ -735,22 +743,23 @@ exit_no_channel:
if (status != NV_OK &&
status != NV_WARN_MORE_PROCESSING_REQUIRED &&
status != NV_WARN_MISMATCHED_TARGET)
UVM_DBG_PRINT("Error servicing non-replayable faults on GPU: %s\n", uvm_gpu_name(gpu));
UVM_DBG_PRINT("Error servicing non-replayable faults on GPU: %s\n",
uvm_parent_gpu_name(parent_gpu));
return status;
}
static NV_STATUS service_fault(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fault_entry)
static NV_STATUS service_fault(uvm_parent_gpu_t *parent_gpu, uvm_fault_buffer_entry_t *fault_entry)
{
uvm_service_block_context_t *service_context =
&gpu->parent->fault_buffer_info.non_replayable.block_service_context;
&parent_gpu->fault_buffer_info.non_replayable.block_service_context;
NV_STATUS status;
bool hmm_migratable = true;
service_context->num_retries = 0;
do {
status = service_fault_once(gpu, fault_entry, hmm_migratable);
status = service_fault_once(parent_gpu, fault_entry, hmm_migratable);
if (status == NV_WARN_MISMATCHED_TARGET) {
hmm_migratable = false;
status = NV_WARN_MORE_PROCESSING_REQUIRED;
@@ -760,7 +769,7 @@ static NV_STATUS service_fault(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fault_e
return status;
}
void uvm_gpu_service_non_replayable_fault_buffer(uvm_gpu_t *gpu)
void uvm_parent_gpu_service_non_replayable_fault_buffer(uvm_parent_gpu_t *parent_gpu)
{
NvU32 cached_faults;
@@ -772,7 +781,7 @@ void uvm_gpu_service_non_replayable_fault_buffer(uvm_gpu_t *gpu)
NV_STATUS status;
NvU32 i;
status = fetch_non_replayable_fault_buffer_entries(gpu->parent, &cached_faults);
status = fetch_non_replayable_fault_buffer_entries(parent_gpu, &cached_faults);
if (status != NV_OK)
return;
@@ -780,7 +789,7 @@ void uvm_gpu_service_non_replayable_fault_buffer(uvm_gpu_t *gpu)
// non-replayable faults since getting multiple faults on the same
// memory region is not very likely
for (i = 0; i < cached_faults; ++i) {
status = service_fault(gpu, &gpu->parent->fault_buffer_info.non_replayable.fault_cache[i]);
status = service_fault(parent_gpu, &parent_gpu->fault_buffer_info.non_replayable.fault_cache[i]);
if (status != NV_OK)
return;
}

View File

@@ -28,7 +28,7 @@
bool uvm_parent_gpu_non_replayable_faults_pending(uvm_parent_gpu_t *parent_gpu);
void uvm_gpu_service_non_replayable_fault_buffer(uvm_gpu_t *gpu);
void uvm_parent_gpu_service_non_replayable_fault_buffer(uvm_parent_gpu_t *parent_gpu);
NV_STATUS uvm_parent_gpu_fault_buffer_init_non_replayable_faults(uvm_parent_gpu_t *parent_gpu);

File diff suppressed because it is too large

View File

@@ -73,5 +73,5 @@ void uvm_parent_gpu_disable_prefetch_faults(uvm_parent_gpu_t *parent_gpu);
// Service pending replayable faults on the given GPU. This function must be
// only called from the ISR bottom half
void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu);
void uvm_parent_gpu_service_replayable_faults(uvm_parent_gpu_t *parent_gpu);
#endif // __UVM_GPU_PAGE_FAULT_H__

View File

@@ -60,6 +60,17 @@ struct uvm_gpu_semaphore_pool_page_struct
// Allocation backing the page
uvm_rm_mem_t *memory;
struct {
// Unprotected sysmem storing encrypted value of semaphores
uvm_rm_mem_t *encrypted_payload_memory;
// Unprotected sysmem storing encryption auth tags
uvm_rm_mem_t *auth_tag_memory;
// Unprotected sysmem storing plain text notifier values
uvm_rm_mem_t *notifier_memory;
} conf_computing;
// Pool the page is part of
uvm_gpu_semaphore_pool_t *pool;
@@ -80,26 +91,6 @@ static bool gpu_semaphore_is_secure(uvm_gpu_semaphore_t *semaphore)
return gpu_semaphore_pool_is_secure(semaphore->page->pool);
}
static NvU32 get_index(uvm_gpu_semaphore_t *semaphore)
{
NvU32 offset;
NvU32 index;
if (gpu_semaphore_is_secure(semaphore))
return semaphore->conf_computing.index;
UVM_ASSERT(semaphore->payload != NULL);
UVM_ASSERT(semaphore->page != NULL);
offset = (char*)semaphore->payload - (char*)uvm_rm_mem_get_cpu_va(semaphore->page->memory);
UVM_ASSERT(offset % UVM_SEMAPHORE_SIZE == 0);
index = offset / UVM_SEMAPHORE_SIZE;
UVM_ASSERT(index < UVM_SEMAPHORE_COUNT_PER_PAGE);
return index;
}
// Use canary values on debug builds to catch semaphore use-after-free. We can
// catch release-after-free by simply setting the payload to a known value at
// free, then checking it on alloc or pool free, but catching acquire-after-free
@@ -150,34 +141,83 @@ static bool gpu_can_access_semaphore_pool(uvm_gpu_t *gpu, uvm_rm_mem_t *rm_mem)
return ((uvm_rm_mem_get_gpu_uvm_va(rm_mem, gpu) + rm_mem->size - 1) < gpu->parent->max_host_va);
}
// Secure semaphore pools are allocated in the CPR of vidmem and only mapped to
// the owning GPU, as no other processor has access to it.
static NV_STATUS pool_alloc_secure_page(uvm_gpu_semaphore_pool_t *pool,
uvm_gpu_semaphore_pool_page_t *pool_page,
uvm_rm_mem_type_t memory_type)
static void pool_page_free_buffers(uvm_gpu_semaphore_pool_page_t *page)
{
uvm_rm_mem_free(page->memory);
page->memory = NULL;
if (gpu_semaphore_pool_is_secure(page->pool)) {
uvm_rm_mem_free(page->conf_computing.encrypted_payload_memory);
uvm_rm_mem_free(page->conf_computing.auth_tag_memory);
uvm_rm_mem_free(page->conf_computing.notifier_memory);
page->conf_computing.encrypted_payload_memory = NULL;
page->conf_computing.auth_tag_memory = NULL;
page->conf_computing.notifier_memory = NULL;
}
else {
UVM_ASSERT(!page->conf_computing.encrypted_payload_memory);
UVM_ASSERT(!page->conf_computing.auth_tag_memory);
UVM_ASSERT(!page->conf_computing.notifier_memory);
}
}
static NV_STATUS pool_page_alloc_buffers(uvm_gpu_semaphore_pool_page_t *page)
{
NV_STATUS status;
uvm_gpu_semaphore_pool_t *pool = page->pool;
uvm_rm_mem_type_t memory_type = (pool->aperture == UVM_APERTURE_SYS) ? UVM_RM_MEM_TYPE_SYS : UVM_RM_MEM_TYPE_GPU;
size_t align = 0;
bool map_all = true;
align = gpu_semaphore_pool_is_secure(pool) ? UVM_CONF_COMPUTING_BUF_ALIGNMENT : 0;
map_all = gpu_semaphore_pool_is_secure(pool) ? false : true;
UVM_ASSERT(gpu_semaphore_pool_is_secure(pool));
status = uvm_rm_mem_alloc(pool->gpu,
memory_type,
UVM_SEMAPHORE_PAGE_SIZE,
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
&pool_page->memory);
if (map_all)
status = uvm_rm_mem_alloc_and_map_all(pool->gpu, memory_type, UVM_SEMAPHORE_PAGE_SIZE, align, &page->memory);
else
status = uvm_rm_mem_alloc(pool->gpu, memory_type, UVM_SEMAPHORE_PAGE_SIZE, align, &page->memory);
if (status != NV_OK)
return status;
goto error;
if (!gpu_semaphore_pool_is_secure(pool))
return NV_OK;
status = uvm_rm_mem_alloc_and_map_cpu(pool->gpu,
UVM_RM_MEM_TYPE_SYS,
UVM_SEMAPHORE_PAGE_SIZE,
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
&page->conf_computing.encrypted_payload_memory);
if (status != NV_OK)
goto error;
BUILD_BUG_ON(UVM_CONF_COMPUTING_AUTH_TAG_SIZE % UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT);
status = uvm_rm_mem_alloc_and_map_cpu(pool->gpu,
UVM_RM_MEM_TYPE_SYS,
UVM_SEMAPHORE_COUNT_PER_PAGE * UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
&page->conf_computing.auth_tag_memory);
if (status != NV_OK)
goto error;
status = uvm_rm_mem_alloc_and_map_cpu(pool->gpu,
UVM_RM_MEM_TYPE_SYS,
UVM_SEMAPHORE_COUNT_PER_PAGE * sizeof(NvU32),
0,
&page->conf_computing.notifier_memory);
if (status != NV_OK)
goto error;
return NV_OK;
error:
pool_page_free_buffers(page);
return status;
}
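/* Summary of the allocations above (descriptive only): for a secure
 * (Confidential Computing) pool, each page ends up with four buffers: the
 * semaphore page itself in the CPR of vidmem, mapped only to the owning GPU,
 * plus three unprotected sysmem buffers holding the encrypted payloads, the
 * per-semaphore auth tags, and the plain-text notifiers. Any failure unwinds
 * through pool_page_free_buffers(), which tolerates partially-initialized
 * pages. */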
static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)
{
NV_STATUS status;
uvm_gpu_semaphore_pool_page_t *pool_page;
NvU32 *payloads;
size_t i;
uvm_rm_mem_type_t memory_type = (pool->aperture == UVM_APERTURE_SYS) ? UVM_RM_MEM_TYPE_SYS : UVM_RM_MEM_TYPE_GPU;
uvm_assert_mutex_locked(&pool->mutex);
@@ -188,24 +228,9 @@ static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)
pool_page->pool = pool;
// Whenever the Confidential Computing feature is enabled, engines can
// access semaphores only in the CPR of vidmem. Mapping to other GPUs is
// also disabled.
if (gpu_semaphore_pool_is_secure(pool)) {
status = pool_alloc_secure_page(pool, pool_page, memory_type);
if (status != NV_OK)
goto error;
}
else {
status = uvm_rm_mem_alloc_and_map_all(pool->gpu,
memory_type,
UVM_SEMAPHORE_PAGE_SIZE,
0,
&pool_page->memory);
status = pool_page_alloc_buffers(pool_page);
if (status != NV_OK)
goto error;
}
// Verify the GPU can access the semaphore pool.
UVM_ASSERT(gpu_can_access_semaphore_pool(pool->gpu, pool_page->memory));
@@ -217,7 +242,9 @@ static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)
pool->free_semaphores_count += UVM_SEMAPHORE_COUNT_PER_PAGE;
if (semaphore_uses_canary(pool)) {
payloads = uvm_rm_mem_get_cpu_va(pool_page->memory);
size_t i;
NvU32 *payloads = uvm_rm_mem_get_cpu_va(pool_page->memory);
for (i = 0; i < UVM_SEMAPHORE_COUNT_PER_PAGE; i++)
payloads[i] = make_canary(0);
}
@@ -253,7 +280,7 @@ static void pool_free_page(uvm_gpu_semaphore_pool_page_t *page)
pool->free_semaphores_count -= UVM_SEMAPHORE_COUNT_PER_PAGE;
list_del(&page->all_pages_node);
uvm_rm_mem_free(page->memory);
pool_page_free_buffers(page);
uvm_kvfree(page);
}
@@ -273,19 +300,22 @@ NV_STATUS uvm_gpu_semaphore_alloc(uvm_gpu_semaphore_pool_t *pool, uvm_gpu_semaph
goto done;
list_for_each_entry(page, &pool->pages, all_pages_node) {
NvU32 semaphore_index = find_first_bit(page->free_semaphores, UVM_SEMAPHORE_COUNT_PER_PAGE);
const NvU32 semaphore_index = find_first_bit(page->free_semaphores, UVM_SEMAPHORE_COUNT_PER_PAGE);
UVM_ASSERT(semaphore_index <= UVM_SEMAPHORE_COUNT_PER_PAGE);
if (semaphore_index == UVM_SEMAPHORE_COUNT_PER_PAGE)
continue;
if (gpu_semaphore_pool_is_secure(pool)) {
semaphore->conf_computing.index = semaphore_index;
}
else {
semaphore->payload = (NvU32*)((char*)uvm_rm_mem_get_cpu_va(page->memory) +
semaphore_index * UVM_SEMAPHORE_SIZE);
}
semaphore->page = page;
semaphore->index = semaphore_index;
if (gpu_semaphore_pool_is_secure(pool)) {
            // Reset the notifier to prevent detection of a false attack when
            // checking for an updated value
*uvm_gpu_semaphore_get_notifier_cpu_va(semaphore) = semaphore->conf_computing.last_observed_notifier;
}
if (semaphore_uses_canary(pool))
UVM_ASSERT(is_canary(uvm_gpu_semaphore_get_payload(semaphore)));
@@ -311,7 +341,6 @@ void uvm_gpu_semaphore_free(uvm_gpu_semaphore_t *semaphore)
{
uvm_gpu_semaphore_pool_page_t *page;
uvm_gpu_semaphore_pool_t *pool;
NvU32 index;
UVM_ASSERT(semaphore);
@@ -323,7 +352,6 @@ void uvm_gpu_semaphore_free(uvm_gpu_semaphore_t *semaphore)
return;
pool = page->pool;
index = get_index(semaphore);
// Write a known value lower than the current payload in an attempt to catch
// release-after-free and acquire-after-free.
@@ -333,10 +361,9 @@ void uvm_gpu_semaphore_free(uvm_gpu_semaphore_t *semaphore)
uvm_mutex_lock(&pool->mutex);
semaphore->page = NULL;
semaphore->payload = NULL;
++pool->free_semaphores_count;
__set_bit(index, page->free_semaphores);
__set_bit(semaphore->index, page->free_semaphores);
uvm_mutex_unlock(&pool->mutex);
}
@@ -449,18 +476,70 @@ NvU64 uvm_gpu_semaphore_get_gpu_proxy_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu
NvU64 uvm_gpu_semaphore_get_gpu_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu_t *gpu, bool is_proxy_va_space)
{
NvU32 index = get_index(semaphore);
NvU64 base_va = uvm_rm_mem_get_gpu_va(semaphore->page->memory, gpu, is_proxy_va_space).address;
return base_va + UVM_SEMAPHORE_SIZE * index;
return base_va + semaphore->index * UVM_SEMAPHORE_SIZE;
}
NvU32 *uvm_gpu_semaphore_get_cpu_va(uvm_gpu_semaphore_t *semaphore)
{
char *base_va;
if (gpu_semaphore_is_secure(semaphore))
return &semaphore->conf_computing.cached_payload;
base_va = uvm_rm_mem_get_cpu_va(semaphore->page->memory);
return (NvU32*)(base_va + semaphore->index * UVM_SEMAPHORE_SIZE);
}
NvU32 *uvm_gpu_semaphore_get_encrypted_payload_cpu_va(uvm_gpu_semaphore_t *semaphore)
{
char *encrypted_base_va = uvm_rm_mem_get_cpu_va(semaphore->page->conf_computing.encrypted_payload_memory);
return (NvU32*)(encrypted_base_va + semaphore->index * UVM_SEMAPHORE_SIZE);
}
uvm_gpu_address_t uvm_gpu_semaphore_get_encrypted_payload_gpu_va(uvm_gpu_semaphore_t *semaphore)
{
NvU64 encrypted_base_va = uvm_rm_mem_get_gpu_uvm_va(semaphore->page->conf_computing.encrypted_payload_memory,
semaphore->page->pool->gpu);
return uvm_gpu_address_virtual_unprotected(encrypted_base_va + semaphore->index * UVM_SEMAPHORE_SIZE);
}
NvU32 *uvm_gpu_semaphore_get_notifier_cpu_va(uvm_gpu_semaphore_t *semaphore)
{
char *notifier_base_va = uvm_rm_mem_get_cpu_va(semaphore->page->conf_computing.notifier_memory);
return (NvU32*)(notifier_base_va + semaphore->index * sizeof(NvU32));
}
uvm_gpu_address_t uvm_gpu_semaphore_get_notifier_gpu_va(uvm_gpu_semaphore_t *semaphore)
{
NvU64 notifier_base_va = uvm_rm_mem_get_gpu_uvm_va(semaphore->page->conf_computing.notifier_memory,
semaphore->page->pool->gpu);
return uvm_gpu_address_virtual_unprotected(notifier_base_va + semaphore->index * sizeof(NvU32));
}
void *uvm_gpu_semaphore_get_auth_tag_cpu_va(uvm_gpu_semaphore_t *semaphore)
{
char *auth_tag_base_va = uvm_rm_mem_get_cpu_va(semaphore->page->conf_computing.auth_tag_memory);
return (void*)(auth_tag_base_va + semaphore->index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
}
uvm_gpu_address_t uvm_gpu_semaphore_get_auth_tag_gpu_va(uvm_gpu_semaphore_t *semaphore)
{
NvU64 auth_tag_base_va = uvm_rm_mem_get_gpu_uvm_va(semaphore->page->conf_computing.auth_tag_memory,
semaphore->page->pool->gpu);
return uvm_gpu_address_virtual_unprotected(auth_tag_base_va + semaphore->index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
}
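/* Layout implied by the accessors above: each semaphore's slots are located
 * purely by its page-relative index:
 *
 *   payload           : page->memory                                  + index * UVM_SEMAPHORE_SIZE
 *   encrypted payload : page->conf_computing.encrypted_payload_memory + index * UVM_SEMAPHORE_SIZE
 *   notifier          : page->conf_computing.notifier_memory          + index * sizeof(NvU32)
 *   auth tag          : page->conf_computing.auth_tag_memory          + index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE
 */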
NvU32 uvm_gpu_semaphore_get_payload(uvm_gpu_semaphore_t *semaphore)
{
if (gpu_semaphore_is_secure(semaphore))
return UVM_GPU_READ_ONCE(semaphore->conf_computing.cached_payload);
return UVM_GPU_READ_ONCE(*semaphore->payload);
return UVM_GPU_READ_ONCE(*uvm_gpu_semaphore_get_cpu_va(semaphore));
}
void uvm_gpu_semaphore_set_payload(uvm_gpu_semaphore_t *semaphore, NvU32 payload)
@@ -477,10 +556,7 @@ void uvm_gpu_semaphore_set_payload(uvm_gpu_semaphore_t *semaphore, NvU32 payload
// the GPU correctly even on non-SMP).
mb();
if (gpu_semaphore_is_secure(semaphore))
UVM_GPU_WRITE_ONCE(semaphore->conf_computing.cached_payload, payload);
else
UVM_GPU_WRITE_ONCE(*semaphore->payload, payload);
UVM_GPU_WRITE_ONCE(*uvm_gpu_semaphore_get_cpu_va(semaphore), payload);
}
// This function is intended to catch channels which have been left dangling in
@@ -507,7 +583,7 @@ static bool tracking_semaphore_check_gpu(uvm_gpu_tracking_semaphore_t *tracking_
return true;
}
bool tracking_semaphore_uses_mutex(uvm_gpu_tracking_semaphore_t *tracking_semaphore)
static bool tracking_semaphore_uses_mutex(uvm_gpu_tracking_semaphore_t *tracking_semaphore)
{
UVM_ASSERT(tracking_semaphore_check_gpu(tracking_semaphore));
@@ -571,9 +647,7 @@ static void uvm_gpu_semaphore_encrypted_payload_update(uvm_channel_t *channel, u
NV_STATUS status = NV_OK;
NvU8 local_auth_tag[UVM_CONF_COMPUTING_AUTH_TAG_SIZE];
UvmCslIv *ivs_cpu_addr = semaphore->conf_computing.ivs;
void *auth_tag_cpu_addr = uvm_rm_mem_get_cpu_va(semaphore->conf_computing.auth_tag);
NvU32 *gpu_notifier_cpu_addr = (NvU32 *)uvm_rm_mem_get_cpu_va(semaphore->conf_computing.notifier);
NvU32 *payload_cpu_addr = (NvU32 *)uvm_rm_mem_get_cpu_va(semaphore->conf_computing.encrypted_payload);
NvU32 *gpu_notifier_cpu_addr = uvm_gpu_semaphore_get_notifier_cpu_va(semaphore);
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
UVM_ASSERT(uvm_channel_is_ce(channel));
@@ -596,8 +670,8 @@ static void uvm_gpu_semaphore_encrypted_payload_update(uvm_channel_t *channel, u
smp_mb__after_atomic();
iv_index = (gpu_notifier / 2) % channel->num_gpfifo_entries;
memcpy(local_auth_tag, auth_tag_cpu_addr, sizeof(local_auth_tag));
local_payload = UVM_READ_ONCE(*payload_cpu_addr);
memcpy(local_auth_tag, uvm_gpu_semaphore_get_auth_tag_cpu_va(semaphore), sizeof(local_auth_tag));
local_payload = UVM_READ_ONCE(*uvm_gpu_semaphore_get_encrypted_payload_cpu_va(semaphore));
memcpy(&local_iv, &ivs_cpu_addr[iv_index], sizeof(local_iv));
// Make sure the second read of notifier happens after
@@ -650,7 +724,7 @@ static NvU64 update_completed_value_locked(uvm_gpu_tracking_semaphore_t *trackin
else
uvm_assert_spinlock_locked(&tracking_semaphore->s_lock);
if (tracking_semaphore->semaphore.conf_computing.encrypted_payload) {
if (gpu_semaphore_is_secure(&tracking_semaphore->semaphore)) {
// TODO: Bug 4008734: [UVM][HCC] Extend secure tracking semaphore
        // mechanism to all semaphores
uvm_channel_t *channel = container_of(tracking_semaphore, uvm_channel_t, tracking_sem);
@@ -690,7 +764,7 @@ static NvU64 update_completed_value_locked(uvm_gpu_tracking_semaphore_t *trackin
UVM_ASSERT_MSG_RELEASE(new_value - old_value <= UVM_GPU_SEMAPHORE_MAX_JUMP,
"GPU %s unexpected semaphore (CPU VA 0x%llx) jump from 0x%llx to 0x%llx\n",
uvm_gpu_name(tracking_semaphore->semaphore.page->pool->gpu),
(NvU64)(uintptr_t)tracking_semaphore->semaphore.payload,
(NvU64)(uintptr_t)uvm_gpu_semaphore_get_cpu_va(&tracking_semaphore->semaphore),
old_value, new_value);
// Use an atomic write even though the lock is held so that the value can

View File

@@ -45,15 +45,13 @@ struct uvm_gpu_semaphore_struct
// The semaphore pool page the semaphore came from
uvm_gpu_semaphore_pool_page_t *page;
// Pointer to the memory location
NvU32 *payload;
    // Index of the semaphore in the semaphore page
NvU16 index;
struct {
NvU16 index;
NvU32 cached_payload;
uvm_rm_mem_t *encrypted_payload;
uvm_rm_mem_t *notifier;
uvm_rm_mem_t *auth_tag;
UvmCslIv *ivs;
NvU32 cached_payload;
NvU32 last_pushed_notifier;
NvU32 last_observed_notifier;
} conf_computing;
@@ -151,6 +149,17 @@ NvU64 uvm_gpu_semaphore_get_gpu_proxy_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu
NvU64 uvm_gpu_semaphore_get_gpu_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu_t *gpu, bool is_proxy_va_space);
NvU32 *uvm_gpu_semaphore_get_cpu_va(uvm_gpu_semaphore_t *semaphore);
NvU32 *uvm_gpu_semaphore_get_encrypted_payload_cpu_va(uvm_gpu_semaphore_t *semaphore);
uvm_gpu_address_t uvm_gpu_semaphore_get_encrypted_payload_gpu_va(uvm_gpu_semaphore_t *semaphore);
NvU32 *uvm_gpu_semaphore_get_notifier_cpu_va(uvm_gpu_semaphore_t *semaphore);
uvm_gpu_address_t uvm_gpu_semaphore_get_notifier_gpu_va(uvm_gpu_semaphore_t *semaphore);
void *uvm_gpu_semaphore_get_auth_tag_cpu_va(uvm_gpu_semaphore_t *semaphore);
uvm_gpu_address_t uvm_gpu_semaphore_get_auth_tag_gpu_va(uvm_gpu_semaphore_t *semaphore);
// Read the 32-bit payload of the semaphore
// Notably doesn't provide any memory ordering guarantees and needs to be used with
// care. For an example of what needs to be considered see

View File

@@ -44,6 +44,8 @@
#include "clc7b5.h"
#include "clc86f.h"
#include "clc8b5.h"
#include "clc96f.h"
#include "clc9b5.h"
static int uvm_downgrade_force_membar_sys = 1;
module_param(uvm_downgrade_force_membar_sys, uint, 0644);
@@ -164,6 +166,11 @@ static uvm_hal_class_ops_t ce_table[] =
.decrypt = uvm_hal_hopper_ce_decrypt,
},
},
{
.id = BLACKWELL_DMA_COPY_A,
.parent_id = HOPPER_DMA_COPY_A,
.u.ce_ops = {},
},
};
// Table for GPFIFO functions. Same idea as the copy engine table.
@@ -286,6 +293,15 @@ static uvm_hal_class_ops_t host_table[] =
.set_gpfifo_pushbuffer_segment_base = uvm_hal_hopper_host_set_gpfifo_pushbuffer_segment_base,
}
},
{
.id = BLACKWELL_CHANNEL_GPFIFO_A,
.parent_id = HOPPER_CHANNEL_GPFIFO_A,
.u.host_ops = {
.tlb_invalidate_all = uvm_hal_blackwell_host_tlb_invalidate_all,
.tlb_invalidate_va = uvm_hal_blackwell_host_tlb_invalidate_va,
.tlb_invalidate_test = uvm_hal_blackwell_host_tlb_invalidate_test,
}
},
};
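/* Note on the table layout (an assumption; the mechanism itself is not shown
 * in this diff): rows such as the BLACKWELL_* entries above list only the ops
 * that change and name a parent_id. Presumably table initialization copies
 * every unset function pointer from the parent row (looked up via
 * ops_find_by_id()), so an empty .u.host_ops or .u.ce_ops struct simply
 * inherits the Hopper behavior. */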
static uvm_hal_class_ops_t arch_table[] =
@@ -297,7 +313,6 @@ static uvm_hal_class_ops_t arch_table[] =
.mmu_mode_hal = uvm_hal_mmu_mode_maxwell,
.enable_prefetch_faults = uvm_hal_maxwell_mmu_enable_prefetch_faults_unsupported,
.disable_prefetch_faults = uvm_hal_maxwell_mmu_disable_prefetch_faults_unsupported,
.mmu_engine_id_to_type = uvm_hal_maxwell_mmu_engine_id_to_type_unsupported,
.mmu_client_id_to_utlb_id = uvm_hal_maxwell_mmu_client_id_to_utlb_id_unsupported,
}
},
@@ -323,7 +338,6 @@ static uvm_hal_class_ops_t arch_table[] =
.u.arch_ops = {
.init_properties = uvm_hal_volta_arch_init_properties,
.mmu_mode_hal = uvm_hal_mmu_mode_volta,
.mmu_engine_id_to_type = uvm_hal_volta_mmu_engine_id_to_type,
.mmu_client_id_to_utlb_id = uvm_hal_volta_mmu_client_id_to_utlb_id,
},
},
@@ -333,7 +347,6 @@ static uvm_hal_class_ops_t arch_table[] =
.u.arch_ops = {
.init_properties = uvm_hal_turing_arch_init_properties,
.mmu_mode_hal = uvm_hal_mmu_mode_turing,
.mmu_engine_id_to_type = uvm_hal_turing_mmu_engine_id_to_type,
},
},
{
@@ -342,7 +355,6 @@ static uvm_hal_class_ops_t arch_table[] =
.u.arch_ops = {
.init_properties = uvm_hal_ampere_arch_init_properties,
.mmu_mode_hal = uvm_hal_mmu_mode_ampere,
.mmu_engine_id_to_type = uvm_hal_ampere_mmu_engine_id_to_type,
.mmu_client_id_to_utlb_id = uvm_hal_ampere_mmu_client_id_to_utlb_id,
},
},
@@ -359,10 +371,18 @@ static uvm_hal_class_ops_t arch_table[] =
.u.arch_ops = {
.init_properties = uvm_hal_hopper_arch_init_properties,
.mmu_mode_hal = uvm_hal_mmu_mode_hopper,
.mmu_engine_id_to_type = uvm_hal_hopper_mmu_engine_id_to_type,
.mmu_client_id_to_utlb_id = uvm_hal_hopper_mmu_client_id_to_utlb_id,
},
},
{
.id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GB100,
.parent_id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GH100,
.u.arch_ops = {
.init_properties = uvm_hal_blackwell_arch_init_properties,
.mmu_mode_hal = uvm_hal_mmu_mode_blackwell,
.mmu_client_id_to_utlb_id = uvm_hal_blackwell_mmu_client_id_to_utlb_id,
}
},
};
static uvm_hal_class_ops_t fault_buffer_table[] =
@@ -377,6 +397,7 @@ static uvm_hal_class_ops_t fault_buffer_table[] =
.read_get = uvm_hal_maxwell_fault_buffer_read_get_unsupported,
.write_get = uvm_hal_maxwell_fault_buffer_write_get_unsupported,
.get_ve_id = uvm_hal_maxwell_fault_buffer_get_ve_id_unsupported,
.get_mmu_engine_type = uvm_hal_maxwell_fault_buffer_get_mmu_engine_type_unsupported,
.parse_replayable_entry = uvm_hal_maxwell_fault_buffer_parse_replayable_entry_unsupported,
.entry_is_valid = uvm_hal_maxwell_fault_buffer_entry_is_valid_unsupported,
.entry_clear_valid = uvm_hal_maxwell_fault_buffer_entry_clear_valid_unsupported,
@@ -415,6 +436,7 @@ static uvm_hal_class_ops_t fault_buffer_table[] =
.read_get = uvm_hal_volta_fault_buffer_read_get,
.write_get = uvm_hal_volta_fault_buffer_write_get,
.get_ve_id = uvm_hal_volta_fault_buffer_get_ve_id,
.get_mmu_engine_type = uvm_hal_volta_fault_buffer_get_mmu_engine_type,
.parse_replayable_entry = uvm_hal_volta_fault_buffer_parse_replayable_entry,
.parse_non_replayable_entry = uvm_hal_volta_fault_buffer_parse_non_replayable_entry,
.get_fault_type = uvm_hal_volta_fault_buffer_get_fault_type,
@@ -426,12 +448,15 @@ static uvm_hal_class_ops_t fault_buffer_table[] =
.u.fault_buffer_ops = {
.disable_replayable_faults = uvm_hal_turing_disable_replayable_faults,
.clear_replayable_faults = uvm_hal_turing_clear_replayable_faults,
.get_mmu_engine_type = uvm_hal_turing_fault_buffer_get_mmu_engine_type,
}
},
{
.id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GA100,
.parent_id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_TU100,
.u.fault_buffer_ops = {}
.u.fault_buffer_ops = {
.get_mmu_engine_type = uvm_hal_ampere_fault_buffer_get_mmu_engine_type,
}
},
{
.id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_AD100,
@@ -443,6 +468,15 @@ static uvm_hal_class_ops_t fault_buffer_table[] =
.parent_id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_AD100,
.u.fault_buffer_ops = {
.get_ve_id = uvm_hal_hopper_fault_buffer_get_ve_id,
.get_mmu_engine_type = uvm_hal_hopper_fault_buffer_get_mmu_engine_type,
}
},
{
.id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GB100,
.parent_id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GH100,
.u.fault_buffer_ops = {
.get_fault_type = uvm_hal_blackwell_fault_buffer_get_fault_type,
.get_mmu_engine_type = uvm_hal_blackwell_fault_buffer_get_mmu_engine_type,
}
},
};
@@ -507,6 +541,11 @@ static uvm_hal_class_ops_t access_counter_buffer_table[] =
.parent_id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_AD100,
.u.access_counter_buffer_ops = {}
},
{
.id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GB100,
.parent_id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GH100,
.u.access_counter_buffer_ops = {}
},
};
static uvm_hal_class_ops_t sec2_table[] =
@@ -560,6 +599,11 @@ static uvm_hal_class_ops_t sec2_table[] =
.decrypt = uvm_hal_hopper_sec2_decrypt,
}
},
{
.id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GB100,
.parent_id = NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GH100,
.u.sec2_ops = {}
},
};
static inline uvm_hal_class_ops_t *ops_find_by_id(uvm_hal_class_ops_t *table, NvU32 row_count, NvU32 id)
@@ -787,6 +831,9 @@ void uvm_hal_tlb_invalidate_membar(uvm_push_t *push, uvm_membar_t membar)
gpu = uvm_push_get_gpu(push);
// TLB invalidate on Blackwell+ GPUs should not use a standalone membar.
UVM_ASSERT(gpu->parent->rm_info.gpuArch < NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GB100);
for (i = 0; i < gpu->parent->num_hshub_tlb_invalidate_membars; i++)
gpu->parent->host_hal->membar_gpu(push);
@@ -892,7 +939,7 @@ const char *uvm_fault_access_type_string(uvm_fault_access_type_t fault_access_ty
const char *uvm_fault_type_string(uvm_fault_type_t fault_type)
{
BUILD_BUG_ON(UVM_FAULT_TYPE_COUNT != 16);
BUILD_BUG_ON(UVM_FAULT_TYPE_COUNT != 17);
switch (fault_type) {
UVM_ENUM_STRING_CASE(UVM_FAULT_TYPE_INVALID_PDE);
@@ -911,6 +958,7 @@ const char *uvm_fault_type_string(uvm_fault_type_t fault_type)
UVM_ENUM_STRING_CASE(UVM_FAULT_TYPE_UNSUPPORTED_KIND);
UVM_ENUM_STRING_CASE(UVM_FAULT_TYPE_REGION_VIOLATION);
UVM_ENUM_STRING_CASE(UVM_FAULT_TYPE_POISONED);
UVM_ENUM_STRING_CASE(UVM_FAULT_TYPE_CC_VIOLATION);
UVM_ENUM_STRING_DEFAULT();
}
}

View File

@@ -124,6 +124,10 @@ void uvm_hal_hopper_host_tlb_invalidate_all(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
uvm_membar_t membar);
void uvm_hal_blackwell_host_tlb_invalidate_all(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
uvm_membar_t membar);
// Issue a TLB invalidate applying to the specified VA range in a PDB.
//
@@ -197,6 +201,13 @@ void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
NvU64 size,
NvU64 page_size,
uvm_membar_t membar);
void uvm_hal_blackwell_host_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU64 page_size,
uvm_membar_t membar);
typedef void (*uvm_hal_host_tlb_invalidate_test_t)(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
@@ -216,6 +227,9 @@ void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,
void uvm_hal_hopper_host_tlb_invalidate_test(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
UVM_TEST_INVALIDATE_TLB_PARAMS *params);
void uvm_hal_blackwell_host_tlb_invalidate_test(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
UVM_TEST_INVALIDATE_TLB_PARAMS *params);
// By default all semaphore release operations include a membar sys before the
// operation. This can be affected by using UVM_PUSH_FLAG_NEXT_* flags with
@@ -457,6 +471,7 @@ void uvm_hal_turing_arch_init_properties(uvm_parent_gpu_t *parent_gpu);
void uvm_hal_ampere_arch_init_properties(uvm_parent_gpu_t *parent_gpu);
void uvm_hal_ada_arch_init_properties(uvm_parent_gpu_t *parent_gpu);
void uvm_hal_hopper_arch_init_properties(uvm_parent_gpu_t *parent_gpu);
void uvm_hal_blackwell_arch_init_properties(uvm_parent_gpu_t *parent_gpu);
// Retrieve the page-tree HAL for a given big page size
typedef uvm_mmu_mode_hal_t *(*uvm_hal_lookup_mode_hal_t)(NvU64 big_page_size);
@@ -468,27 +483,19 @@ uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_volta(NvU64 big_page_size);
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_turing(NvU64 big_page_size);
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_ampere(NvU64 big_page_size);
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_hopper(NvU64 big_page_size);
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_blackwell(NvU64 big_page_size);
void uvm_hal_maxwell_mmu_enable_prefetch_faults_unsupported(uvm_parent_gpu_t *parent_gpu);
void uvm_hal_maxwell_mmu_disable_prefetch_faults_unsupported(uvm_parent_gpu_t *parent_gpu);
void uvm_hal_pascal_mmu_enable_prefetch_faults(uvm_parent_gpu_t *parent_gpu);
void uvm_hal_pascal_mmu_disable_prefetch_faults(uvm_parent_gpu_t *parent_gpu);
// Convert a faulted MMU engine ID to a UVM engine type. Only engines which have
// faults serviced by UVM are handled. On Pascal the only such engine is
// GRAPHICS, so no translation is provided.
typedef uvm_mmu_engine_type_t (*uvm_hal_mmu_engine_id_to_type_t)(NvU16 mmu_engine_id);
uvm_mmu_engine_type_t uvm_hal_maxwell_mmu_engine_id_to_type_unsupported(NvU16 mmu_engine_id);
uvm_mmu_engine_type_t uvm_hal_volta_mmu_engine_id_to_type(NvU16 mmu_engine_id);
uvm_mmu_engine_type_t uvm_hal_turing_mmu_engine_id_to_type(NvU16 mmu_engine_id);
uvm_mmu_engine_type_t uvm_hal_ampere_mmu_engine_id_to_type(NvU16 mmu_engine_id);
uvm_mmu_engine_type_t uvm_hal_hopper_mmu_engine_id_to_type(NvU16 mmu_engine_id);
typedef NvU16 (*uvm_hal_mmu_client_id_to_utlb_id_t)(NvU16 client_id);
NvU16 uvm_hal_maxwell_mmu_client_id_to_utlb_id_unsupported(NvU16 client_id);
NvU16 uvm_hal_pascal_mmu_client_id_to_utlb_id(NvU16 client_id);
NvU16 uvm_hal_volta_mmu_client_id_to_utlb_id(NvU16 client_id);
NvU16 uvm_hal_ampere_mmu_client_id_to_utlb_id(NvU16 client_id);
NvU16 uvm_hal_hopper_mmu_client_id_to_utlb_id(NvU16 client_id);
NvU16 uvm_hal_blackwell_mmu_client_id_to_utlb_id(NvU16 client_id);
// Replayable faults
typedef void (*uvm_hal_enable_replayable_faults_t)(uvm_parent_gpu_t *parent_gpu);
@@ -498,6 +505,9 @@ typedef NvU32 (*uvm_hal_fault_buffer_read_put_t)(uvm_parent_gpu_t *parent_gpu);
typedef NvU32 (*uvm_hal_fault_buffer_read_get_t)(uvm_parent_gpu_t *parent_gpu);
typedef void (*uvm_hal_fault_buffer_write_get_t)(uvm_parent_gpu_t *parent_gpu, NvU32 get);
typedef NvU8 (*uvm_hal_fault_buffer_get_ve_id_t)(NvU16 mmu_engine_id, uvm_mmu_engine_type_t mmu_engine_type);
typedef uvm_mmu_engine_type_t (*uvm_hal_fault_buffer_get_mmu_engine_type_t)(NvU16 mmu_engine_id,
uvm_fault_client_type_t client_type,
NvU16 client_id);
// Parse the replayable entry at the given buffer index. This also clears the
// valid bit of the entry in the buffer.
@@ -535,6 +545,9 @@ NvU32 uvm_hal_maxwell_fault_buffer_read_put_unsupported(uvm_parent_gpu_t *parent
NvU32 uvm_hal_maxwell_fault_buffer_read_get_unsupported(uvm_parent_gpu_t *parent_gpu);
void uvm_hal_maxwell_fault_buffer_write_get_unsupported(uvm_parent_gpu_t *parent_gpu, NvU32 index);
NvU8 uvm_hal_maxwell_fault_buffer_get_ve_id_unsupported(NvU16 mmu_engine_id, uvm_mmu_engine_type_t mmu_engine_type);
uvm_mmu_engine_type_t uvm_hal_maxwell_fault_buffer_get_mmu_engine_type_unsupported(NvU16 mmu_engine_id,
uvm_fault_client_type_t client_type,
NvU16 client_id);
uvm_fault_type_t uvm_hal_maxwell_fault_buffer_get_fault_type_unsupported(const NvU32 *fault_entry);
void uvm_hal_pascal_enable_replayable_faults(uvm_parent_gpu_t *parent_gpu);
@@ -550,12 +563,31 @@ NvU32 uvm_hal_volta_fault_buffer_read_put(uvm_parent_gpu_t *parent_gpu);
NvU32 uvm_hal_volta_fault_buffer_read_get(uvm_parent_gpu_t *parent_gpu);
void uvm_hal_volta_fault_buffer_write_get(uvm_parent_gpu_t *parent_gpu, NvU32 index);
NvU8 uvm_hal_volta_fault_buffer_get_ve_id(NvU16 mmu_engine_id, uvm_mmu_engine_type_t mmu_engine_type);
uvm_mmu_engine_type_t uvm_hal_volta_fault_buffer_get_mmu_engine_type(NvU16 mmu_engine_id,
uvm_fault_client_type_t client_type,
NvU16 client_id);
uvm_fault_type_t uvm_hal_volta_fault_buffer_get_fault_type(const NvU32 *fault_entry);
void uvm_hal_turing_disable_replayable_faults(uvm_parent_gpu_t *parent_gpu);
void uvm_hal_turing_clear_replayable_faults(uvm_parent_gpu_t *parent_gpu, NvU32 get);
uvm_mmu_engine_type_t uvm_hal_turing_fault_buffer_get_mmu_engine_type(NvU16 mmu_engine_id,
uvm_fault_client_type_t client_type,
NvU16 client_id);
uvm_mmu_engine_type_t uvm_hal_ampere_fault_buffer_get_mmu_engine_type(NvU16 mmu_engine_id,
uvm_fault_client_type_t client_type,
NvU16 client_id);
NvU8 uvm_hal_hopper_fault_buffer_get_ve_id(NvU16 mmu_engine_id, uvm_mmu_engine_type_t mmu_engine_type);
uvm_mmu_engine_type_t uvm_hal_hopper_fault_buffer_get_mmu_engine_type(NvU16 mmu_engine_id,
uvm_fault_client_type_t client_type,
NvU16 client_id);
uvm_mmu_engine_type_t uvm_hal_blackwell_fault_buffer_get_mmu_engine_type(NvU16 mmu_engine_id,
uvm_fault_client_type_t client_type,
NvU16 client_id);
uvm_fault_type_t uvm_hal_blackwell_fault_buffer_get_fault_type(const NvU32 *fault_entry);
bool uvm_hal_maxwell_fault_buffer_entry_is_valid_unsupported(uvm_parent_gpu_t *parent_gpu, NvU32 index);
void uvm_hal_maxwell_fault_buffer_entry_clear_valid_unsupported(uvm_parent_gpu_t *parent_gpu, NvU32 index);
@@ -779,7 +811,6 @@ struct uvm_arch_hal_struct
uvm_hal_lookup_mode_hal_t mmu_mode_hal;
uvm_hal_mmu_enable_prefetch_faults_t enable_prefetch_faults;
uvm_hal_mmu_disable_prefetch_faults_t disable_prefetch_faults;
uvm_hal_mmu_engine_id_to_type_t mmu_engine_id_to_type;
uvm_hal_mmu_client_id_to_utlb_id_t mmu_client_id_to_utlb_id;
};
@@ -792,6 +823,7 @@ struct uvm_fault_buffer_hal_struct
uvm_hal_fault_buffer_read_get_t read_get;
uvm_hal_fault_buffer_write_get_t write_get;
uvm_hal_fault_buffer_get_ve_id_t get_ve_id;
uvm_hal_fault_buffer_get_mmu_engine_type_t get_mmu_engine_type;
uvm_hal_fault_buffer_parse_replayable_entry_t parse_replayable_entry;
uvm_hal_fault_buffer_entry_is_valid_t entry_is_valid;
uvm_hal_fault_buffer_entry_clear_valid_t entry_clear_valid;

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -300,6 +300,7 @@ typedef enum
UVM_FAULT_TYPE_UNSUPPORTED_KIND,
UVM_FAULT_TYPE_REGION_VIOLATION,
UVM_FAULT_TYPE_POISONED,
UVM_FAULT_TYPE_CC_VIOLATION,
UVM_FAULT_TYPE_COUNT
} uvm_fault_type_t;
@@ -399,6 +400,7 @@ struct uvm_fault_buffer_entry_struct
//
uvm_va_space_t *va_space;
uvm_gpu_t *gpu;
// This is set to true when some fault could not be serviced and a
// cancel command needs to be issued
@@ -490,9 +492,9 @@ struct uvm_access_counter_buffer_entry_struct
// Address of the region for which a notification was sent
uvm_gpu_address_t address;
// These fields are only valid if address.is_virtual is true
union
{
// These fields are only valid if address.is_virtual is true
struct
{
// Instance pointer of one of the channels in the TSG that triggered
@@ -522,9 +524,14 @@ struct uvm_access_counter_buffer_entry_struct
// a regular processor id because P2P is not allowed between
// partitioned GPUs.
uvm_processor_id_t resident_id;
} physical_info;
};
// This is the GPU that triggered the notification. Note that physical
// address based notifications are only supported on non-MIG-capable GPUs.
uvm_gpu_t *gpu;
// Number of times the tracked region was accessed since the last time it
// was cleared. Counter values saturate at the maximum value supported by
// the GPU (2^16 - 1 in Volta)

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -284,8 +284,10 @@ static void hmm_va_block_unregister_gpu(uvm_va_block_t *va_block,
// Reset preferred location and accessed-by of policy nodes if needed.
uvm_for_each_va_policy_node_in(node, va_block, va_block->start, va_block->end) {
if (uvm_id_equal(node->policy.preferred_location, gpu->id))
if (uvm_va_policy_preferred_location_equal(&node->policy, gpu->id, NUMA_NO_NODE)) {
node->policy.preferred_location = UVM_ID_INVALID;
node->policy.preferred_nid = NUMA_NO_NODE;
}
uvm_processor_mask_clear(&node->policy.accessed_by, gpu->id);
}
@@ -1704,8 +1706,6 @@ static void gpu_chunk_remove(uvm_va_block_t *va_block,
return;
}
// TODO: Bug 3898467: unmap indirect peers when freeing GPU chunks
uvm_mmu_chunk_unmap(gpu_chunk, &va_block->tracker);
gpu_state->chunks[page_index] = NULL;
}
@@ -1754,8 +1754,6 @@ static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block,
if (status != NV_OK)
return status;
// TODO: Bug 3898467: map indirect peers.
uvm_processor_mask_set(&va_block->resident, id);
uvm_page_mask_set(&gpu_state->resident, page_index);
@@ -2276,7 +2274,7 @@ static NV_STATUS populate_region(uvm_va_block_t *va_block,
// uvm_hmm_invalidate() should handle that if the underlying page
// is invalidated.
// Also note there can be an allocated page due to GPU-to-GPU
// migration between non-peer or indirect peer GPUs.
// migration between non-peer GPUs.
continue;
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2020 NVIDIA Corporation
Copyright (c) 2020-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -21,6 +21,7 @@
*******************************************************************************/
#include "uvm_hal.h"
#include "uvm_hal_types.h"
#include "hwref/hopper/gh100/dev_fault.h"
@@ -40,3 +41,49 @@ NvU8 uvm_hal_hopper_fault_buffer_get_ve_id(NvU16 mmu_engine_id, uvm_mmu_engine_t
return 0;
}
}
static bool client_id_ce(NvU16 client_id)
{
if (client_id >= NV_PFAULT_CLIENT_HUB_HSCE0 && client_id <= NV_PFAULT_CLIENT_HUB_HSCE9)
return true;
if (client_id >= NV_PFAULT_CLIENT_HUB_HSCE10 && client_id <= NV_PFAULT_CLIENT_HUB_HSCE15)
return true;
switch (client_id) {
case NV_PFAULT_CLIENT_HUB_CE0:
case NV_PFAULT_CLIENT_HUB_CE1:
case NV_PFAULT_CLIENT_HUB_CE2:
case NV_PFAULT_CLIENT_HUB_CE3:
return true;
}
return false;
}
uvm_mmu_engine_type_t uvm_hal_hopper_fault_buffer_get_mmu_engine_type(NvU16 mmu_engine_id,
uvm_fault_client_type_t client_type,
NvU16 client_id)
{
// Servicing CE and Host (HUB clients) faults.
if (client_type == UVM_FAULT_CLIENT_TYPE_HUB) {
if (client_id_ce(client_id)) {
UVM_ASSERT(mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_CE0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_CE9);
return UVM_MMU_ENGINE_TYPE_CE;
}
if (client_id == NV_PFAULT_CLIENT_HUB_HOST || client_id == NV_PFAULT_CLIENT_HUB_ESC) {
UVM_ASSERT(mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_HOST0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_HOST44);
return UVM_MMU_ENGINE_TYPE_HOST;
}
}
    // We shouldn't be servicing faults from any engines other than GR.
UVM_ASSERT_MSG(client_id <= NV_PFAULT_CLIENT_GPC_ROP_3, "Unexpected client ID: 0x%x\n", client_id);
UVM_ASSERT_MSG(mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_GRAPHICS, "Unexpected engine ID: 0x%x\n", mmu_engine_id);
UVM_ASSERT(client_type == UVM_FAULT_CLIENT_TYPE_GPC);
return UVM_MMU_ENGINE_TYPE_GRAPHICS;
}
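/* Assumed call pattern (not shown in this hunk): fault parsing is expected to
 * classify the engine first and then derive the VE id from the result,
 * roughly:
 *
 *   uvm_mmu_engine_type_t type;
 *   type  = parent_gpu->fault_buffer_hal->get_mmu_engine_type(engine_id, client_type, client_id);
 *   ve_id = parent_gpu->fault_buffer_hal->get_ve_id(engine_id, type);
 */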

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2020-2023 NVIDIA Corporation
Copyright (c) 2020-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -47,20 +47,6 @@
#define ATS_ALLOWED 0
#define ATS_NOT_ALLOWED 1
uvm_mmu_engine_type_t uvm_hal_hopper_mmu_engine_id_to_type(NvU16 mmu_engine_id)
{
if (mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_HOST0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_HOST44)
return UVM_MMU_ENGINE_TYPE_HOST;
if (mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_CE0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_CE9)
return UVM_MMU_ENGINE_TYPE_CE;
// We shouldn't be servicing faults from any other engines
UVM_ASSERT_MSG(mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_GRAPHICS, "Unexpected engine ID: 0x%x\n", mmu_engine_id);
return UVM_MMU_ENGINE_TYPE_GRAPHICS;
}
static NvU32 page_table_depth_hopper(NvU64 page_size)
{
    // The common case is page_size == UVM_PAGE_SIZE_2M, hence the first check

View File

@@ -837,12 +837,6 @@ typedef struct
// Initialize any tracker object such as a queue or counter
// UvmToolsCreateEventQueue, UvmToolsCreateProcessAggregateCounters,
// UvmToolsCreateProcessorCounters.
// Note that the order of structure elements has the version as the last field.
// This is used to tell whether the kernel supports V2 events or not because
// the V1 UVM_TOOLS_INIT_EVENT_TRACKER ioctl would not read or update that
// field but V2 will. This is needed because it is possible to create an event
// queue before CUDA is initialized which means UvmSetDriverVersion() hasn't
// been called yet and the kernel version is unknown.
//
#define UVM_TOOLS_INIT_EVENT_TRACKER UVM_IOCTL_BASE(56)
typedef struct
@@ -853,9 +847,8 @@ typedef struct
NvProcessorUuid processor; // IN
NvU32 allProcessors; // IN
NvU32 uvmFd; // IN
NvU32 version; // IN (UvmToolsEventQueueVersion)
NV_STATUS rmStatus; // OUT
NvU32 requestedVersion; // IN
NvU32 grantedVersion; // OUT
} UVM_TOOLS_INIT_EVENT_TRACKER_PARAMS;
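//
// A hedged usage sketch (only the field names come from this header; the
// version values below are hypothetical): a tools client requests the newest
// event format it understands and then honors whatever the kernel grants.
//
//     UVM_TOOLS_INIT_EVENT_TRACKER_PARAMS params = {0};
//     params.requestedVersion = 2;        // hypothetical "V2" event format
//     // ...fill in queueBuffer, queueBufferSize, etc., then issue the ioctl...
//     if (params.rmStatus == NV_OK && params.grantedVersion == 1)
//         ;                               // fall back to parsing V1 entries
//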
//
@@ -936,20 +929,13 @@ typedef struct
//
// UvmToolsGetProcessorUuidTable
// Note that tablePtr != 0 and count == 0 means that tablePtr is assumed to be
// an array of size UVM_MAX_PROCESSORS_V1 and that only UvmEventEntry_V1
// processor IDs (physical GPU UUIDs) will be reported.
// tablePtr == 0 and count == 0 can be used to query how many processors are
// present in order to dynamically allocate the correct size array since the
// total number of processors is returned in 'count'.
//
#define UVM_TOOLS_GET_PROCESSOR_UUID_TABLE UVM_IOCTL_BASE(64)
typedef struct
{
NvU64 tablePtr NV_ALIGN_BYTES(8); // IN
NvU32 count; // IN/OUT
NvU32 version; // IN (UvmToolsEventQueueVersion)
NV_STATUS rmStatus; // OUT
NvU32 version; // OUT
} UVM_TOOLS_GET_PROCESSOR_UUID_TABLE_PARAMS;
//

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2021-2023 NVIDIA Corporation
Copyright (c) 2021-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -74,6 +74,14 @@ NvU8 uvm_hal_maxwell_fault_buffer_get_ve_id_unsupported(NvU16 mmu_engine_id, uvm
return 0;
}
uvm_mmu_engine_type_t uvm_hal_maxwell_fault_buffer_get_mmu_engine_type_unsupported(NvU16 mmu_engine_id,
uvm_fault_client_type_t client_type,
NvU16 client_id)
{
UVM_ASSERT_MSG(false, "fault_buffer_get_mmu_engine_type is not supported on Maxwell GPUs.\n");
return UVM_MMU_ENGINE_TYPE_GRAPHICS;
}
uvm_fault_type_t uvm_hal_maxwell_fault_buffer_get_fault_type_unsupported(const NvU32 *fault_entry)
{
UVM_ASSERT_MSG(false, "fault_buffer_get_fault_type is not supported.\n");

View File

@@ -38,6 +38,7 @@
#include "uvm_forward_decl.h"
#include "uvm_gpu.h"
#include "uvm_mmu.h"
#include "uvm_hal.h"
#include "uvm_push_macros.h"
#include "hwref/maxwell/gm107/dev_mmu.h"
@@ -375,12 +376,6 @@ void uvm_hal_maxwell_mmu_disable_prefetch_faults_unsupported(uvm_parent_gpu_t *p
UVM_ASSERT_MSG(false, "mmu disable_prefetch_faults called on Maxwell GPU\n");
}
uvm_mmu_engine_type_t uvm_hal_maxwell_mmu_engine_id_to_type_unsupported(NvU16 mmu_engine_id)
{
UVM_ASSERT(0);
return UVM_MMU_ENGINE_TYPE_COUNT;
}
NvU16 uvm_hal_maxwell_mmu_client_id_to_utlb_id_unsupported(NvU16 client_id)
{
UVM_ASSERT(0);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -324,8 +324,7 @@ uvm_gpu_phys_address_t uvm_mem_gpu_physical(uvm_mem_t *mem, uvm_gpu_t *gpu, NvU6
uvm_gpu_address_t uvm_mem_gpu_address_physical(uvm_mem_t *mem, uvm_gpu_t *gpu, NvU64 offset, NvU64 size);
// Helper to get an address suitable for accessing_gpu (which may be the backing
// GPU) to access with CE. Note that mappings for indirect peers are not
// created automatically.
// GPU) to access with CE.
uvm_gpu_address_t uvm_mem_gpu_address_copy(uvm_mem_t *mem, uvm_gpu_t *accessing_gpu, NvU64 offset, NvU64 size);
static bool uvm_mem_is_sysmem(uvm_mem_t *mem)

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -33,7 +33,7 @@
static const size_t sysmem_alloc_sizes[] = { 1, PAGE_SIZE - 1, PAGE_SIZE, 7 * PAGE_SIZE };
static NvU32 first_page_size(NvU32 page_sizes)
static NvU64 first_page_size(NvU64 page_sizes)
{
return page_sizes & ~(page_sizes - 1);
}
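// first_page_size() isolates the lowest set bit, i.e. the smallest page size
// in the mask. A worked example with a hypothetical 4K | 64K | 2M mask:
//
//     NvU64 mask = 0x1000 | 0x10000 | 0x200000;   // 0x211000
//     first_page_size(mask);                      // == 0x1000  (4K)
//     first_page_size(mask & ~(NvU64)0x1000);     // == 0x10000 (64K), which is how
//                                                 // for_each_page_size() walks the
//                                                 // mask from smallest to largest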
@@ -43,7 +43,7 @@ static NvU32 first_page_size(NvU32 page_sizes)
page_size; \
page_size = first_page_size((page_sizes) & ~(page_size | (page_size - 1))))
static inline NV_STATUS __alloc_map_sysmem(NvU64 size, uvm_gpu_t *gpu, uvm_mem_t **sys_mem)
static inline NV_STATUS mem_alloc_sysmem_and_map_cpu_kernel(NvU64 size, uvm_gpu_t *gpu, uvm_mem_t **sys_mem)
{
if (g_uvm_global.conf_computing_enabled)
return uvm_mem_alloc_sysmem_dma_and_map_cpu_kernel(size, gpu, current->mm, sys_mem);
@@ -67,7 +67,7 @@ static NV_STATUS check_accessible_from_gpu(uvm_gpu_t *gpu, uvm_mem_t *mem)
UVM_ASSERT(uvm_mem_physical_size(mem) >= verif_size);
UVM_ASSERT(verif_size >= sizeof(*sys_verif));
TEST_NV_CHECK_GOTO(__alloc_map_sysmem(verif_size, gpu, &sys_mem), done);
TEST_NV_CHECK_GOTO(mem_alloc_sysmem_and_map_cpu_kernel(verif_size, gpu, &sys_mem), done);
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(sys_mem, gpu), done);
sys_verif = (NvU64*)uvm_mem_get_cpu_addr_kernel(sys_mem);
@@ -100,9 +100,9 @@ static NV_STATUS check_accessible_from_gpu(uvm_gpu_t *gpu, uvm_mem_t *mem)
"Memcopy %zd bytes from virtual sys_mem 0x%llx to %s mem 0x%llx [mem loc: %s, page size: %u]",
size_this_time,
sys_mem_gpu_address.address,
mem_gpu_address.is_virtual? "virtual" : "physical",
mem_gpu_address.is_virtual ? "virtual" : "physical",
mem_gpu_address.address,
uvm_mem_is_sysmem(mem)? "sys" : "vid",
uvm_mem_is_sysmem(mem) ? "sys" : "vid",
mem->chunk_size);
gpu->parent->ce_hal->memcopy(&push, mem_gpu_address, sys_mem_gpu_address, size_this_time);
@@ -140,7 +140,7 @@ static NV_STATUS check_accessible_from_gpu(uvm_gpu_t *gpu, uvm_mem_t *mem)
"Memcopy %zd bytes from virtual mem 0x%llx to %s sys_mem 0x%llx",
size_this_time,
mem_gpu_address.address,
sys_mem_gpu_address.is_virtual? "virtual" : "physical",
sys_mem_gpu_address.is_virtual ? "virtual" : "physical",
sys_mem_gpu_address.address);
gpu->parent->ce_hal->memcopy(&push, sys_mem_gpu_address, mem_gpu_address, size_this_time);
@@ -252,10 +252,9 @@ static NV_STATUS test_alloc_sysmem(uvm_va_space_t *va_space, NvU64 page_size, si
params.page_size = page_size;
params.mm = current->mm;
status = uvm_mem_alloc(&params, &mem);
TEST_CHECK_GOTO(status == NV_OK, error);
TEST_NV_CHECK_GOTO(uvm_mem_alloc(&params, &mem), error);
TEST_CHECK_GOTO(test_map_cpu(mem) == NV_OK, error);
TEST_NV_CHECK_GOTO(test_map_cpu(mem), error);
for_each_va_space_gpu(gpu, va_space)
TEST_NV_CHECK_GOTO(test_map_gpu(mem, gpu), error);
@@ -266,6 +265,7 @@ static NV_STATUS test_alloc_sysmem(uvm_va_space_t *va_space, NvU64 page_size, si
error:
uvm_mem_free(mem);
return status;
}
@@ -352,13 +352,15 @@ static NV_STATUS test_all(uvm_va_space_t *va_space)
NvU32 current_alloc = 0;
// Create allocations of these sizes
static const size_t sizes[] = {1, 4, 16, 1024, 4096, 1024 * 1024, 7 * 1024 * 1024 + 17 };
static const size_t sizes[] = { 1, 4, 16, 1024, 4096, 1024 * 1024, 7 * 1024 * 1024 + 17 };
// Pascal+ can map sysmem with 4K, 64K and 2M PTEs, other GPUs can only use
// 4K. Test all of the sizes supported by Pascal+ and 128K to match big page
// size on pre-Pascal GPUs with 128K big page size.
// Ampere+ also supports 512M PTEs, but since UVM's maximum chunk size is
// 2M, we don't test for this page size.
// Blackwell+ also supports 256G PTEs and the above holds for this case too.
static const NvU64 cpu_chunk_sizes = PAGE_SIZE | UVM_PAGE_SIZE_64K | UVM_PAGE_SIZE_128K | UVM_PAGE_SIZE_2M;
// All supported page sizes will be tested; the CPU has the most with 4 and +1
@@ -366,7 +368,6 @@ static NV_STATUS test_all(uvm_va_space_t *va_space)
static const int max_supported_page_sizes = 4 + 1;
int i;
// TODO: Bug 3839176: the test is waived on Confidential Computing because
// it assumes that the GPU can access system memory without using encryption.
if (g_uvm_global.conf_computing_enabled)
@@ -386,13 +387,13 @@ static NV_STATUS test_all(uvm_va_space_t *va_space)
return NV_ERR_NO_MEMORY;
for (i = 0; i < ARRAY_SIZE(sizes); ++i) {
NvU32 page_size = 0;
NvU64 page_size = 0;
uvm_mem_t *mem;
if (should_test_page_size(sizes[i], UVM_PAGE_SIZE_DEFAULT)) {
status = test_alloc_sysmem(va_space, UVM_PAGE_SIZE_DEFAULT, sizes[i], &mem);
if (status != NV_OK) {
UVM_TEST_PRINT("Failed to alloc sysmem size %zd, page_size default\n", sizes[i], page_size);
UVM_TEST_PRINT("Failed to alloc sysmem size %zd, page_size default\n", sizes[i]);
goto cleanup;
}
all_mem[current_alloc++] = mem;
@@ -404,14 +405,14 @@ static NV_STATUS test_all(uvm_va_space_t *va_space)
status = test_alloc_sysmem(va_space, page_size, sizes[i], &mem);
if (status != NV_OK) {
UVM_TEST_PRINT("Failed to alloc sysmem size %zd, page_size %u\n", sizes[i], page_size);
UVM_TEST_PRINT("Failed to alloc sysmem size %zd, page_size %llu\n", sizes[i], page_size);
goto cleanup;
}
all_mem[current_alloc++] = mem;
}
for_each_va_space_gpu(gpu, va_space) {
NvU32 page_sizes = gpu->address_space_tree.hal->page_sizes();
NvU64 page_sizes = gpu->address_space_tree.hal->page_sizes();
UVM_ASSERT(max_supported_page_sizes >= hweight_long(page_sizes));
@@ -428,7 +429,7 @@ static NV_STATUS test_all(uvm_va_space_t *va_space)
for_each_page_size(page_size, page_sizes) {
status = test_alloc_vidmem(gpu, page_size, sizes[i], &mem);
if (status != NV_OK) {
UVM_TEST_PRINT("Test alloc vidmem failed, page_size %u size %zd GPU %s\n",
UVM_TEST_PRINT("Test alloc vidmem failed, page_size %llu size %zd GPU %s\n",
page_size,
sizes[i],
uvm_gpu_name(gpu));
@@ -461,17 +462,17 @@ cleanup:
static NV_STATUS test_basic_vidmem(uvm_gpu_t *gpu)
{
NV_STATUS status = NV_OK;
NvU32 page_size;
NvU32 page_sizes = gpu->address_space_tree.hal->page_sizes();
NvU32 biggest_page_size = uvm_mmu_biggest_page_size_up_to(&gpu->address_space_tree, UVM_CHUNK_SIZE_MAX);
NvU32 smallest_page_size = page_sizes & ~(page_sizes - 1);
NvU64 page_size;
NvU64 page_sizes = gpu->address_space_tree.hal->page_sizes();
NvU64 biggest_page_size = uvm_mmu_biggest_page_size_up_to(&gpu->address_space_tree, UVM_CHUNK_SIZE_MAX);
NvU64 smallest_page_size = page_sizes & ~(page_sizes - 1);
uvm_mem_t *mem = NULL;
page_sizes &= UVM_CHUNK_SIZES_MASK;
for_each_page_size(page_size, page_sizes) {
TEST_CHECK_GOTO(uvm_mem_alloc_vidmem(page_size - 1, gpu, &mem) == NV_OK, done);
if (gpu->mem_info.numa.enabled)
TEST_CHECK_GOTO(mem->chunk_size >= PAGE_SIZE && mem->chunk_size <= max(page_size, (NvU32)PAGE_SIZE), done);
TEST_CHECK_GOTO(mem->chunk_size >= PAGE_SIZE && mem->chunk_size <= max(page_size, (NvU64)PAGE_SIZE), done);
else
TEST_CHECK_GOTO(mem->chunk_size < page_size || page_size == smallest_page_size, done);
uvm_mem_free(mem);
@@ -479,14 +480,14 @@ static NV_STATUS test_basic_vidmem(uvm_gpu_t *gpu)
TEST_CHECK_GOTO(uvm_mem_alloc_vidmem(page_size, gpu, &mem) == NV_OK, done);
if (gpu->mem_info.numa.enabled)
TEST_CHECK_GOTO(mem->chunk_size == max(page_size, (NvU32)PAGE_SIZE), done);
TEST_CHECK_GOTO(mem->chunk_size == max(page_size, (NvU64)PAGE_SIZE), done);
else
TEST_CHECK_GOTO(mem->chunk_size == page_size, done);
uvm_mem_free(mem);
mem = NULL;
}
TEST_CHECK_GOTO(uvm_mem_alloc_vidmem(5 * ((NvU64)biggest_page_size) - 1, gpu, &mem) == NV_OK, done);
TEST_CHECK_GOTO(uvm_mem_alloc_vidmem(5 * biggest_page_size - 1, gpu, &mem) == NV_OK, done);
TEST_CHECK_GOTO(mem->chunk_size == biggest_page_size, done);
done:
@@ -569,6 +570,135 @@ done:
return status;
}
static NV_STATUS check_huge_page_from_gpu(uvm_gpu_t *gpu, uvm_mem_t *mem, NvU64 offset)
{
NV_STATUS status = NV_OK;
uvm_mem_t *sys_mem = NULL;
uvm_push_t push;
NvU64 *sys_verif;
NvU64 *expected_value;
NvU64 verif_size = mem->size;
uvm_gpu_address_t mem_gpu_address, sys_mem_gpu_address;
UVM_ASSERT(uvm_mem_physical_size(mem) >= verif_size);
TEST_NV_CHECK_GOTO(mem_alloc_sysmem_and_map_cpu_kernel(verif_size, gpu, &sys_mem), done);
sys_verif = uvm_mem_get_cpu_addr_kernel(sys_mem);
memset(sys_verif, 0x0, mem->size);
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(sys_mem, gpu), done);
mem_gpu_address = uvm_gpu_address_virtual(offset);
sys_mem_gpu_address = uvm_mem_gpu_address_virtual_kernel(sys_mem, gpu);
TEST_NV_CHECK_GOTO(uvm_push_begin(gpu->channel_manager,
UVM_CHANNEL_TYPE_GPU_TO_CPU,
&push,
"Memcopy %llu bytes from virtual mem 0x%llx to virtual sys_mem 0x%llx",
verif_size,
mem_gpu_address.address,
sys_mem_gpu_address.address),
done);
gpu->parent->ce_hal->memcopy(&push, sys_mem_gpu_address, mem_gpu_address, verif_size);
TEST_NV_CHECK_GOTO(uvm_push_end_and_wait(&push), done);
expected_value = uvm_mem_get_cpu_addr_kernel(mem);
TEST_CHECK_GOTO(memcmp(sys_verif, expected_value, verif_size) == 0, done);
done:
uvm_mem_free(sys_mem);
return status;
}
static NvU64 test_pte_maker(uvm_page_table_range_vec_t *range_vec, NvU64 offset, void *phys_addr)
{
uvm_page_tree_t *tree = range_vec->tree;
uvm_gpu_phys_address_t phys = uvm_gpu_phys_address(UVM_APERTURE_SYS, (NvU64)phys_addr);
return tree->hal->make_pte(phys.aperture, phys.address, UVM_PROT_READ_ONLY, UVM_MMU_PTE_FLAGS_NONE);
}
static NV_STATUS test_huge_page_size(uvm_va_space_t *va_space, uvm_gpu_t *gpu, NvU64 page_size)
{
NV_STATUS status = NV_OK;
uvm_mem_t *mem = NULL;
size_t size = PAGE_SIZE;
NvU64 *cpu_addr;
NvU64 huge_gpu_va;
NvU64 gpu_phys_addr;
uvm_page_table_range_vec_t *range_vec;
NvU8 value = 0xA5;
// TODO: Bug 3839176: the test is waived on Confidential Computing because
// it assumes that GPU can access system memory without using encryption.
if (g_uvm_global.conf_computing_enabled)
return NV_OK;
TEST_NV_CHECK_GOTO(mem_alloc_sysmem_and_map_cpu_kernel(size, gpu, &mem), cleanup);
cpu_addr = uvm_mem_get_cpu_addr_kernel(mem);
memset(cpu_addr, value, mem->size);
// Map it on the GPU (uvm_mem base area); this creates a GPU physical address
// for the sysmem mapping.
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_phys(mem, gpu), cleanup);
huge_gpu_va = UVM_ALIGN_UP(gpu->parent->uvm_mem_va_base + gpu->parent->uvm_mem_va_size, page_size);
TEST_CHECK_GOTO(IS_ALIGNED(huge_gpu_va, page_size), cleanup);
TEST_CHECK_GOTO((huge_gpu_va + page_size) < (1ull << gpu->address_space_tree.hal->num_va_bits()), cleanup);
// Map huge_gpu_va manually: page_size is larger than the largest uvm_mem_t
// chunk/page size, so the uvm_mem_gpu_kernel() helpers cannot be used.
TEST_NV_CHECK_GOTO(uvm_page_table_range_vec_create(&gpu->address_space_tree,
huge_gpu_va,
page_size,
page_size,
UVM_PMM_ALLOC_FLAGS_NONE,
&range_vec), cleanup);
gpu_phys_addr = uvm_mem_gpu_physical(mem, gpu, 0, size).address;
TEST_NV_CHECK_GOTO(uvm_page_table_range_vec_write_ptes(range_vec,
UVM_MEMBAR_NONE,
test_pte_maker,
(void *)gpu_phys_addr), cleanup_range);
// Despite the huge page_size mapping, only PAGE_SIZE of it is backed by an
// allocation owned by the test. Compute the offset within the huge page so
// that only this segment is verified.
TEST_NV_CHECK_GOTO(check_huge_page_from_gpu(gpu, mem, huge_gpu_va + (gpu_phys_addr % page_size)),
cleanup_range);
cleanup_range:
uvm_page_table_range_vec_destroy(range_vec);
range_vec = NULL;
cleanup:
uvm_mem_free(mem);
return status;
}
// Check GPU access to memory through a 512MB+ page size mapping.
// The test allocates a PAGE_SIZE sysmem page, but uses the GMMU to map a huge
// page size area. It maps the allocated page into this area and uses the CE
// to access it, thus exercising a memory access through a huge page.
static NV_STATUS test_huge_pages(uvm_va_space_t *va_space, uvm_gpu_t *gpu)
{
NvU64 page_sizes = gpu->address_space_tree.hal->page_sizes();
NvU64 page_size = 0;
for_each_page_size(page_size, page_sizes) {
if (page_size < UVM_PAGE_SIZE_512M)
continue;
TEST_NV_CHECK_RET(test_huge_page_size(va_space, gpu, page_size));
}
return NV_OK;
}
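// Illustrative sketch of the bitmask walk that for_each_page_size() performs
// above, assuming iteration goes from the smallest set bit upward.
// walk_page_sizes() is a hypothetical helper used only to show the pattern;
// it is not part of the driver.
static void walk_page_sizes(NvU64 page_sizes)
{
    NvU64 remaining = page_sizes;

    while (remaining != 0) {
        // Isolate the lowest set bit, i.e. the smallest remaining page size.
        // This is the same trick test_basic_vidmem() uses for
        // smallest_page_size.
        NvU64 page_size = remaining & ~(remaining - 1);

        // ... use page_size, e.g. skip anything below UVM_PAGE_SIZE_512M ...

        remaining &= ~page_size;
    }
}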
static NV_STATUS test_basic(uvm_va_space_t *va_space)
{
uvm_gpu_t *gpu;
@@ -579,6 +709,7 @@ static NV_STATUS test_basic(uvm_va_space_t *va_space)
TEST_NV_CHECK_RET(test_basic_vidmem(gpu));
TEST_NV_CHECK_RET(test_basic_sysmem_dma(gpu));
TEST_NV_CHECK_RET(test_basic_dma_pool(gpu));
TEST_NV_CHECK_RET(test_huge_pages(va_space, gpu));
}
return NV_OK;


@@ -589,7 +589,7 @@ static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space,
skipped_migrate = true;
}
else if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, dest_id) &&
!uvm_id_equal(dest_id, policy->preferred_location)) {
!uvm_va_policy_preferred_location_equal(policy, dest_id, NUMA_NO_NODE)) {
// Don't migrate to a non-faultable GPU that is in UVM-Lite mode,
// unless it's the preferred location
status = NV_ERR_INVALID_DEVICE;


@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2018-2023 NVIDIA Corporation
Copyright (c) 2018-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -52,10 +52,6 @@ static NV_STATUS migrate_vma_page_copy_address(struct page *page,
uvm_gpu_t *owning_gpu = UVM_ID_IS_CPU(resident_id)? NULL: uvm_va_space_get_gpu(va_space, resident_id);
const bool can_copy_from = uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(copying_gpu->id)],
resident_id);
const bool direct_peer = owning_gpu &&
(owning_gpu != copying_gpu) &&
can_copy_from &&
!uvm_gpu_peer_caps(owning_gpu, copying_gpu)->is_indirect_peer;
UVM_ASSERT(page_index < state->num_pages);
@@ -65,15 +61,13 @@ static NV_STATUS migrate_vma_page_copy_address(struct page *page,
// Local vidmem address
*gpu_addr = uvm_gpu_address_copy(owning_gpu, uvm_gpu_page_to_phys_address(owning_gpu, page));
}
else if (direct_peer) {
// Direct GPU peer
else if (owning_gpu && can_copy_from) {
uvm_gpu_identity_mapping_t *gpu_peer_mappings = uvm_gpu_get_peer_mapping(copying_gpu, owning_gpu->id);
uvm_gpu_phys_address_t phys_addr = uvm_gpu_page_to_phys_address(owning_gpu, page);
*gpu_addr = uvm_gpu_address_virtual(gpu_peer_mappings->base + phys_addr.address);
}
else {
// Sysmem/Indirect Peer
NV_STATUS status = uvm_parent_gpu_map_cpu_page(copying_gpu->parent, page, &state->dma.addrs[page_index]);
if (status != NV_OK)
@@ -507,7 +501,7 @@ static NV_STATUS migrate_vma_copy_pages(struct vm_area_struct *vma,
return NV_OK;
}
void migrate_vma_cleanup_pages(unsigned long *dst, unsigned long npages)
static void migrate_vma_cleanup_pages(unsigned long *dst, unsigned long npages)
{
unsigned long i;
@@ -523,7 +517,7 @@ void migrate_vma_cleanup_pages(unsigned long *dst, unsigned long npages)
}
}
void uvm_migrate_vma_alloc_and_copy(struct migrate_vma *args, migrate_vma_state_t *state)
static void migrate_vma_alloc_and_copy(struct migrate_vma *args, migrate_vma_state_t *state)
{
struct vm_area_struct *vma = args->vma;
unsigned long start = args->start;
@@ -553,12 +547,13 @@ void uvm_migrate_vma_alloc_and_copy(struct migrate_vma *args, migrate_vma_state_
migrate_vma_cleanup_pages(args->dst, state->num_pages);
}
void uvm_migrate_vma_alloc_and_copy_helper(struct vm_area_struct *vma,
const unsigned long *src,
unsigned long *dst,
unsigned long start,
unsigned long end,
void *private)
#if defined(CONFIG_MIGRATE_VMA_HELPER)
static void migrate_vma_alloc_and_copy_helper(struct vm_area_struct *vma,
const unsigned long *src,
unsigned long *dst,
unsigned long start,
unsigned long end,
void *private)
{
struct migrate_vma args =
{
@@ -569,10 +564,11 @@ void uvm_migrate_vma_alloc_and_copy_helper(struct vm_area_struct *vma,
.end = end,
};
uvm_migrate_vma_alloc_and_copy(&args, (migrate_vma_state_t *) private);
migrate_vma_alloc_and_copy(&args, (migrate_vma_state_t *) private);
}
#endif
void uvm_migrate_vma_finalize_and_map(struct migrate_vma *args, migrate_vma_state_t *state)
static void uvm_migrate_vma_finalize_and_map(struct migrate_vma *args, migrate_vma_state_t *state)
{
unsigned long i;
@@ -642,12 +638,13 @@ void uvm_migrate_vma_finalize_and_map(struct migrate_vma *args, migrate_vma_stat
UVM_ASSERT(!bitmap_intersects(state->populate_pages_mask, state->allocation_failed_mask, state->num_pages));
}
void uvm_migrate_vma_finalize_and_map_helper(struct vm_area_struct *vma,
const unsigned long *src,
const unsigned long *dst,
unsigned long start,
unsigned long end,
void *private)
#if defined(CONFIG_MIGRATE_VMA_HELPER)
static void migrate_vma_finalize_and_map_helper(struct vm_area_struct *vma,
const unsigned long *src,
const unsigned long *dst,
unsigned long start,
unsigned long end,
void *private)
{
struct migrate_vma args =
{
@@ -660,6 +657,7 @@ void uvm_migrate_vma_finalize_and_map_helper(struct vm_area_struct *vma,
uvm_migrate_vma_finalize_and_map(&args, (migrate_vma_state_t *) private);
}
#endif
static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *state)
{
@@ -668,8 +666,8 @@ static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *s
#if defined(CONFIG_MIGRATE_VMA_HELPER)
static const struct migrate_vma_ops uvm_migrate_vma_ops =
{
.alloc_and_copy = uvm_migrate_vma_alloc_and_copy_helper,
.finalize_and_map = uvm_migrate_vma_finalize_and_map_helper,
.alloc_and_copy = migrate_vma_alloc_and_copy_helper,
.finalize_and_map = migrate_vma_finalize_and_map_helper,
};
ret = migrate_vma(&uvm_migrate_vma_ops, args->vma, args->start, args->end, args->src, args->dst, state);
@@ -685,7 +683,7 @@ static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *s
if (ret < 0)
return errno_to_nv_status(ret);
uvm_migrate_vma_alloc_and_copy(args, state);
migrate_vma_alloc_and_copy(args, state);
if (state->status == NV_OK) {
migrate_vma_pages(args);
uvm_migrate_vma_finalize_and_map(args, state);


@@ -150,23 +150,6 @@ struct migrate_vma {
unsigned long start;
unsigned long end;
};
void uvm_migrate_vma_alloc_and_copy_helper(struct vm_area_struct *vma,
const unsigned long *src,
unsigned long *dst,
unsigned long start,
unsigned long end,
void *private);
void uvm_migrate_vma_finalize_and_map_helper(struct vm_area_struct *vma,
const unsigned long *src,
const unsigned long *dst,
unsigned long start,
unsigned long end,
void *private);
#else
void uvm_migrate_vma_alloc_and_copy(struct migrate_vma *args, migrate_vma_state_t *state);
void uvm_migrate_vma_finalize_and_map(struct migrate_vma *args, migrate_vma_state_t *state);
#endif // CONFIG_MIGRATE_VMA_HELPER
// Populates the given VA range and tries to migrate all the pages to dst_id. If


@@ -1349,7 +1349,7 @@ NV_STATUS uvm_page_tree_wait(uvm_page_tree_t *tree)
static NV_STATUS try_get_ptes(uvm_page_tree_t *tree,
NvU64 page_size,
NvU64 start,
NvLength size,
NvU64 size,
uvm_page_table_range_t *range,
NvU32 *cur_depth,
uvm_page_directory_t **dir_cache)
@@ -1379,9 +1379,9 @@ static NV_STATUS try_get_ptes(uvm_page_tree_t *tree,
// This algorithm will work with unaligned ranges, but the caller's intent
// is unclear
UVM_ASSERT_MSG(start % page_size == 0 && size % page_size == 0,
"start 0x%llx size 0x%zx page_size 0x%llx\n",
"start 0x%llx size 0x%llx page_size 0x%llx\n",
start,
(size_t)size,
size,
page_size);
// The GPU should be capable of addressing the passed range
@@ -1444,7 +1444,7 @@ static NV_STATUS try_get_ptes(uvm_page_tree_t *tree,
return write_gpu_state(tree, page_size, invalidate_depth, used_count, dirs_used);
}
static NV_STATUS map_remap(uvm_page_tree_t *tree, NvU64 start, NvLength size, uvm_page_table_range_t *range)
static NV_STATUS map_remap(uvm_page_tree_t *tree, NvU64 start, NvU64 size, uvm_page_table_range_t *range)
{
NV_STATUS status;
uvm_push_t push;
@@ -1502,7 +1502,7 @@ static NV_STATUS map_remap(uvm_page_tree_t *tree, NvU64 start, NvLength size, uv
NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
NvU64 page_size,
NvU64 start,
NvLength size,
NvU64 size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *range)
{
@@ -1547,7 +1547,7 @@ NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree,
NvU64 page_size,
NvU64 start,
NvLength size,
NvU64 size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *range)
{
@@ -2076,13 +2076,13 @@ static NV_STATUS uvm_page_table_range_vec_write_ptes_cpu(uvm_page_table_range_ve
uvm_mmu_page_table_alloc_t *dir = &range->table->phys_alloc;
NvU32 entry;
for (entry = range->start_index; entry < range->entry_count; ++entry) {
for (entry = 0; entry < range->entry_count; ++entry) {
NvU64 pte_bits[2] = {pte_maker(range_vec, offset, caller_data), 0};
if (entry_size == 8)
uvm_mmu_page_table_cpu_memset_8(tree->gpu, dir, entry, pte_bits[0], 1);
uvm_mmu_page_table_cpu_memset_8(tree->gpu, dir, range->start_index + entry, pte_bits[0], 1);
else
uvm_mmu_page_table_cpu_memset_16(tree->gpu, dir, entry, pte_bits, 1);
uvm_mmu_page_table_cpu_memset_16(tree->gpu, dir, range->start_index + entry, pte_bits, 1);
offset += range_vec->page_size;
}
@@ -2310,7 +2310,7 @@ bool uvm_mmu_parent_gpu_needs_dynamic_sysmem_mapping(uvm_parent_gpu_t *parent_gp
return uvm_parent_gpu_is_virt_mode_sriov_heavy(parent_gpu);
}
NV_STATUS create_static_vidmem_mapping(uvm_gpu_t *gpu)
static NV_STATUS create_static_vidmem_mapping(uvm_gpu_t *gpu)
{
NvU64 page_size;
NvU64 size;
@@ -2406,9 +2406,9 @@ void uvm_mmu_init_gpu_chunk_sizes(uvm_parent_gpu_t *parent_gpu)
// to handle allocating multiple chunks per page.
parent_gpu->mmu_user_chunk_sizes = sizes & PAGE_MASK;
// Ampere+ GPUs support 512MB page size, however, the maximum chunk size is
// 2MB(i.e., UVM_CHUNK_SIZE_MAX), therefore we mask out any supported page
// size greater than UVM_CHUNK_SIZE_MAX from the chunk size list.
// The maximum chunk size is 2MB (i.e., UVM_CHUNK_SIZE_MAX), therefore we
// mask out any supported page size greater than UVM_CHUNK_SIZE_MAX from
// the chunk size list.
parent_gpu->mmu_user_chunk_sizes &= UVM_CHUNK_SIZES_MASK;
parent_gpu->mmu_kernel_chunk_sizes = allocation_sizes_for_big_page_size(parent_gpu, UVM_PAGE_SIZE_64K) |


@@ -39,10 +39,10 @@
// The following memory regions are not to scale. The memory layout is linear,
// i.e., no canonical form address conversion.
//
// Hopper:
// Hopper-Blackwell:
// +----------------+ 128PB
// | |
// | (not used) |
// | (not used)* | * See note(1)
// | |
// ------------------
// |uvm_mem_t(128GB)| (uvm_mem_va_size)
@@ -66,7 +66,7 @@
// Pascal-Ada:
// +----------------+ 512TB
// | |
// | (not used) |
// | (not used)* | * See note(1)
// | |
// ------------------
// |uvm_mem_t(128GB)| (uvm_mem_va_size)
@@ -107,6 +107,9 @@
// | rm_mem(128GB) | (rm_va_size)
// | |
// +----------------+ 0 (rm_va_base)
//
// Note (1): This region is used in unit tests, see
// tests/uvm_mem_test.c:test_huge_pages().
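// That test picks a huge-page-aligned virtual address just past
// uvm_mem_va_base + uvm_mem_va_size, i.e. inside this otherwise unused
// region, so its hand-built GMMU mapping cannot collide with regular
// uvm_mem_t mappings.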
// Maximum memory of any GPU.
#define UVM_GPU_MAX_PHYS_MEM (UVM_SIZE_1TB)
@@ -376,7 +379,7 @@ void uvm_page_tree_deinit(uvm_page_tree_t *tree);
NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree,
NvU64 page_size,
NvU64 start,
NvLength size,
NvU64 size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *range);
@@ -386,7 +389,7 @@ NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree,
NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
NvU64 page_size,
NvU64 start,
NvLength size,
NvU64 size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *range);


@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -46,6 +46,9 @@
// HOPPER_*
#include "clc8b5.h"
#include "clc86f.h"
// BLACKWELL_*
#include "clc96f.h"
#include "clc9b5.h"
// ARCHITECTURE_*
#include "ctrl2080mc.h"
@@ -672,6 +675,77 @@ static NV_STATUS get_single_page_512m(uvm_gpu_t *gpu)
return NV_OK;
}
static NV_STATUS alloc_256g_memory(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
NvLength size = 256 * UVM_SIZE_1GB;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_256G, 0, size, &range), NV_OK);
TEST_CHECK_RET(range.entry_count == 1);
TEST_CHECK_RET(range.table->depth == 2);
TEST_CHECK_RET(range.start_index == 0);
TEST_CHECK_RET(range.page_size == UVM_PAGE_SIZE_256G);
TEST_CHECK_RET(tree.root->ref_count == 1);
TEST_CHECK_RET(tree.root->entries[0]->ref_count == 1);
TEST_CHECK_RET(tree.root->entries[0]->entries[0]->ref_count == 1);
TEST_CHECK_RET(range.table == tree.root->entries[0]->entries[0]);
uvm_page_tree_put_ptes(&tree, &range);
UVM_ASSERT(tree.root->ref_count == 0);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
static NV_STATUS alloc_adjacent_256g_memory(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range1;
uvm_page_table_range_t range2;
NvLength size = 256 * UVM_SIZE_1GB;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_256G, size, size, &range1), NV_OK);
TEST_CHECK_RET(range1.entry_count == 1);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_256G, 0, size, &range2), NV_OK);
TEST_CHECK_RET(range2.entry_count == 1);
TEST_CHECK_RET(range1.table == range2.table);
TEST_CHECK_RET(range1.table == tree.root->entries[0]->entries[0]);
TEST_CHECK_RET(range1.start_index == 1);
TEST_CHECK_RET(range2.start_index == 0);
uvm_page_tree_put_ptes(&tree, &range1);
uvm_page_tree_put_ptes(&tree, &range2);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
static NV_STATUS get_single_page_256g(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
// use a start address not at the beginning of a PDE2 entry's range
NvU64 start = 3 * 256 * UVM_SIZE_1GB;
NvLength size = 256 * UVM_SIZE_1GB;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_256G, start, size, &range), NV_OK);
TEST_CHECK_RET(range.entry_count == 1);
TEST_CHECK_RET(range.table->depth == 2);
TEST_CHECK_RET(range.page_size == UVM_PAGE_SIZE_256G);
uvm_page_tree_put_ptes(&tree, &range);
TEST_CHECK_RET(tree.root->ref_count == 0);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
static NV_STATUS get_entire_table_4k(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
@@ -719,6 +793,29 @@ static NV_STATUS get_entire_table_512m(uvm_gpu_t *gpu)
return NV_OK;
}
static NV_STATUS get_entire_table_256g(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
NvU64 start = 1UL << 48;
NvLength size = 512UL * UVM_PAGE_SIZE_256G;
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_256G, start, size, &range), NV_OK);
TEST_CHECK_RET(range.table == tree.root->entries[0]->entries[2]);
TEST_CHECK_RET(range.entry_count == 512);
TEST_CHECK_RET(range.table->depth == 2);
TEST_CHECK_RET(range.page_size == UVM_PAGE_SIZE_256G);
TEST_CHECK_RET(tree.root->ref_count == 1);
uvm_page_tree_put_ptes(&tree, &range);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
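// The expected indices in get_entire_table_256g() follow from the GMMU v3
// extents (see test_tlb_invalidates_gmmu_v3() below): each depth-2 (PDE2)
// entry covers 256GB, so with 512 entries per directory a depth-1 (PDE3)
// entry covers 512 * 256GB = 128TB and a root entry covers 512 * 128TB = 64PB.
// start = 1 << 48 = 256TB therefore selects root entry 0 and PDE3 entry
// 256TB / 128TB = 2, i.e. tree.root->entries[0]->entries[2], and 512 * 256GB
// exactly fills that table.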
static NV_STATUS split_4k_from_2m(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
@@ -805,6 +902,43 @@ static NV_STATUS split_2m_from_512m(uvm_gpu_t *gpu)
return NV_OK;
}
static NV_STATUS split_512m_from_256g(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range_256g;
uvm_page_table_range_t range_adj;
uvm_page_table_range_t range_512m;
NvU64 start = 1UL << 48;
NvLength size = UVM_PAGE_SIZE_256G;
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_256G, start, size, &range_256g), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_256G, start + size, size, &range_adj), NV_OK);
TEST_CHECK_RET(range_256g.entry_count == 1);
TEST_CHECK_RET(range_256g.table->depth == 2);
TEST_CHECK_RET(range_adj.entry_count == 1);
TEST_CHECK_RET(range_adj.table->depth == 2);
// Need to release the 256G page so that the reference count is right.
uvm_page_tree_put_ptes(&tree, &range_256g);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_512M, start, size, &range_512m), NV_OK);
TEST_CHECK_RET(range_512m.entry_count == 512);
TEST_CHECK_RET(range_512m.table->depth == 3);
TEST_CHECK_RET(range_512m.table == tree.root->entries[0]->entries[2]->entries[0]);
TEST_CHECK_RET(range_512m.start_index == 0);
// Free everything
uvm_page_tree_put_ptes(&tree, &range_adj);
uvm_page_tree_put_ptes(&tree, &range_512m);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
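// 512MB pages live one level deeper than 256GB pages (depth 3 vs depth 2),
// and a 256GB span holds exactly 256GB / 512MB = 512 of them, which is what
// the depth and entry_count checks in split_512m_from_256g() verify.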
static NV_STATUS get_512mb_range(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
@@ -843,6 +977,25 @@ static NV_STATUS get_2gb_range(uvm_gpu_t *gpu)
return NV_OK;
}
static NV_STATUS get_1tb_range(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
NvU64 start = UVM_SIZE_1TB;
NvU64 size = start;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_256G, start, size, &range), NV_OK);
TEST_CHECK_RET(range.entry_count == 4);
TEST_CHECK_RET(range.table->depth == 2);
TEST_CHECK_RET(range.start_index == 4);
uvm_page_tree_put_ptes(&tree, &range);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
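// With 256GB per PDE2 entry, the 1TB range above spans 1TB / 256GB = 4
// entries, and a start offset of 1TB lands at entry index 4, matching the
// entry_count and start_index checks.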
static NV_STATUS get_two_free_apart(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
@@ -1040,7 +1193,7 @@ static NV_STATUS fast_split_double_backoff(uvm_gpu_t *gpu)
return NV_OK;
}
static NV_STATUS test_tlb_invalidates(uvm_gpu_t *gpu)
static NV_STATUS test_tlb_invalidates_gmmu_v2(uvm_gpu_t *gpu)
{
NV_STATUS status = NV_OK;
uvm_page_tree_t tree;
@@ -1103,11 +1256,80 @@ static NV_STATUS test_tlb_invalidates(uvm_gpu_t *gpu)
return status;
}
static NV_STATUS test_tlb_invalidates_gmmu_v3(uvm_gpu_t *gpu)
{
NV_STATUS status = NV_OK;
uvm_page_tree_t tree;
uvm_page_table_range_t entries[6];
int i;
// Depth 5
NvU64 extent_pte = UVM_PAGE_SIZE_2M;
// Depth 4
NvU64 extent_pde0 = extent_pte * (1ull << 8);
// Depth 3
NvU64 extent_pde1 = extent_pde0 * (1ull << 9);
// Depth 2
NvU64 extent_pde2 = extent_pde1 * (1ull << 9);
// Depth 1
NvU64 extent_pde3 = extent_pde2 * (1ull << 9);
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
fake_tlb_invals_enable();
TEST_CHECK_RET(assert_entry_invalidate(&tree, UVM_PAGE_SIZE_4K, 0, 0, true));
TEST_CHECK_RET(assert_entry_invalidate(&tree, UVM_PAGE_SIZE_4K, 0, 0, true));
TEST_CHECK_RET(test_page_tree_get_entry(&tree, UVM_PAGE_SIZE_4K, 0, &entries[0]) == NV_OK);
TEST_CHECK_RET(assert_and_reset_last_invalidate(0, false));
TEST_CHECK_RET(assert_entry_no_invalidate(&tree, UVM_PAGE_SIZE_4K, extent_pte - UVM_PAGE_SIZE_4K));
TEST_CHECK_RET(assert_entry_invalidate(&tree, UVM_PAGE_SIZE_64K, 0, 4, true));
TEST_CHECK_RET(test_page_tree_get_entry(&tree, UVM_PAGE_SIZE_64K, 0, &entries[1]) == NV_OK);
TEST_CHECK_RET(assert_and_reset_last_invalidate(4, false));
TEST_CHECK_RET(test_page_tree_get_entry(&tree, UVM_PAGE_SIZE_4K, extent_pde0, &entries[2]) == NV_OK);
TEST_CHECK_RET(assert_and_reset_last_invalidate(3, false));
TEST_CHECK_RET(test_page_tree_get_entry(&tree, UVM_PAGE_SIZE_4K, extent_pde1, &entries[3]) == NV_OK);
TEST_CHECK_RET(assert_and_reset_last_invalidate(2, false));
TEST_CHECK_RET(test_page_tree_get_entry(&tree, UVM_PAGE_SIZE_4K, extent_pde2, &entries[4]) == NV_OK);
TEST_CHECK_RET(assert_and_reset_last_invalidate(1, false));
TEST_CHECK_RET(test_page_tree_get_entry(&tree, UVM_PAGE_SIZE_4K, extent_pde3, &entries[5]) == NV_OK);
TEST_CHECK_RET(assert_and_reset_last_invalidate(0, false));
for (i = 5; i > 1; --i) {
uvm_page_tree_put_ptes(&tree, &entries[i]);
TEST_CHECK_RET(assert_and_reset_last_invalidate(5 - i, true));
}
uvm_page_tree_put_ptes(&tree, &entries[0]);
TEST_CHECK_RET(assert_and_reset_last_invalidate(4, true));
uvm_page_tree_put_ptes(&tree, &entries[1]);
TEST_CHECK_RET(assert_and_reset_last_invalidate(0, true));
fake_tlb_invals_disable();
uvm_page_tree_deinit(&tree);
return status;
}
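// Spelled out, the extents above are: extent_pte = 2MB, extent_pde0 =
// 2MB * 256 = 512MB, extent_pde1 = 512MB * 512 = 256GB, extent_pde2 =
// 256GB * 512 = 128TB and extent_pde3 = 128TB * 512 = 64PB. Each
// test_page_tree_get_entry() call at one of these boundaries forces a new
// directory one level closer to the root, which is why the expected
// invalidate depths step from 4 down to 0 across those calls.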
static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree,
NvU64 base,
NvU64 size,
NvU32 min_page_size,
NvU32 max_page_size)
NvU64 min_page_size,
NvU64 max_page_size)
{
NV_STATUS status = NV_OK;
uvm_push_t push;
@@ -1129,7 +1351,7 @@ static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree,
uvm_tlb_batch_begin(tree, &batch);
for (j = 0; j < i; ++j) {
NvU32 used_max_page_size = (j & 1) ? max_page_size : min_page_size;
NvU64 used_max_page_size = (j & 1) ? max_page_size : min_page_size;
NvU32 expected_range_depth = tree->hal->page_table_depth(used_max_page_size);
expected_inval_all_depth = min(expected_inval_all_depth, expected_range_depth);
uvm_tlb_batch_invalidate(&batch,
@@ -1143,7 +1365,7 @@ static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree,
uvm_tlb_batch_end(&batch, &push, UVM_MEMBAR_NONE);
for (j = 0; j < i; ++j) {
NvU32 used_max_page_size = (j & 1) ? max_page_size : min_page_size;
NvU64 used_max_page_size = (j & 1) ? max_page_size : min_page_size;
NvU32 expected_range_depth = tree->hal->page_table_depth(used_max_page_size);
bool allow_inval_all = (total_pages > gpu->parent->tlb_batch.max_pages) ||
!gpu->parent->tlb_batch.va_invalidate_supported ||
@@ -1515,7 +1737,7 @@ static uvm_mmu_page_table_alloc_t fake_table_alloc(uvm_aperture_t aperture, NvU6
// Queries the supported page sizes of the GPU (uvm_gpu_t) and fills the
// page_sizes array up to MAX_NUM_PAGE_SIZES. Returns the number of elements
// in page_sizes.
size_t get_page_sizes(uvm_gpu_t *gpu, NvU64 *page_sizes)
static size_t get_page_sizes(uvm_gpu_t *gpu, NvU64 *page_sizes)
{
unsigned long page_size_log2;
unsigned long page_sizes_bitvec;
@@ -1572,6 +1794,11 @@ static NV_STATUS entry_test_page_size_hopper(uvm_gpu_t *gpu, size_t page_size)
return NV_OK;
}
static NV_STATUS entry_test_page_size_blackwell(uvm_gpu_t *gpu, size_t page_size)
{
return entry_test_page_size_hopper(gpu, page_size);
}
typedef NV_STATUS (*entry_test_page_size_func)(uvm_gpu_t *gpu, size_t page_size);
static NV_STATUS entry_test_maxwell(uvm_gpu_t *gpu)
@@ -1583,7 +1810,8 @@ static NV_STATUS entry_test_maxwell(uvm_gpu_t *gpu)
uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
uvm_mmu_mode_hal_t *hal;
uvm_page_directory_t dir;
NvU32 i, j, big_page_size, page_size;
NvU64 big_page_size, page_size;
NvU32 i, j;
dir.depth = 0;
@@ -2049,6 +2277,11 @@ cleanup:
return status;
}
static NV_STATUS entry_test_blackwell(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
{
return entry_test_ampere(gpu, entry_test_page_size_blackwell);
}
static NV_STATUS alloc_4k_maxwell(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
@@ -2086,7 +2319,7 @@ static NV_STATUS alloc_4k_maxwell(uvm_gpu_t *gpu)
return NV_OK;
}
static NV_STATUS shrink_test(uvm_gpu_t *gpu, NvU32 big_page_size, NvU32 page_size)
static NV_STATUS shrink_test(uvm_gpu_t *gpu, NvU32 big_page_size, NvU64 page_size)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
@@ -2138,7 +2371,7 @@ static NV_STATUS shrink_test(uvm_gpu_t *gpu, NvU32 big_page_size, NvU32 page_siz
return NV_OK;
}
static NV_STATUS get_upper_test(uvm_gpu_t *gpu, NvU32 big_page_size, NvU32 page_size)
static NV_STATUS get_upper_test(uvm_gpu_t *gpu, NvU32 big_page_size, NvU64 page_size)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range, upper_range;
@@ -2291,6 +2524,14 @@ static NV_STATUS fake_gpu_init_hopper(uvm_gpu_t *fake_gpu)
fake_gpu);
}
static NV_STATUS fake_gpu_init_blackwell(uvm_gpu_t *fake_gpu)
{
return fake_gpu_init(BLACKWELL_CHANNEL_GPFIFO_A,
BLACKWELL_DMA_COPY_A,
NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GB100,
fake_gpu);
}
static NV_STATUS maxwell_test_page_tree(uvm_gpu_t *maxwell)
{
// create a fake Maxwell GPU for this test.
@@ -2350,7 +2591,7 @@ static NV_STATUS pascal_test_page_tree(uvm_gpu_t *pascal)
MEM_NV_CHECK_RET(check_sizes(pascal), NV_OK);
MEM_NV_CHECK_RET(fast_split_normal(pascal), NV_OK);
MEM_NV_CHECK_RET(fast_split_double_backoff(pascal), NV_OK);
MEM_NV_CHECK_RET(test_tlb_invalidates(pascal), NV_OK);
MEM_NV_CHECK_RET(test_tlb_invalidates_gmmu_v2(pascal), NV_OK);
MEM_NV_CHECK_RET(test_tlb_batch_invalidates(pascal, page_sizes, num_page_sizes), NV_OK);
// Run the test again with a bigger limit on max pages
@@ -2406,7 +2647,7 @@ static NV_STATUS ampere_test_page_tree(uvm_gpu_t *ampere)
MEM_NV_CHECK_RET(entry_test_ampere(ampere, entry_test_page_size_ampere), NV_OK);
// TLB invalidate
MEM_NV_CHECK_RET(test_tlb_invalidates(ampere), NV_OK);
MEM_NV_CHECK_RET(test_tlb_invalidates_gmmu_v2(ampere), NV_OK);
// TLB batch invalidate
MEM_NV_CHECK_RET(test_tlb_batch_invalidates(ampere, page_sizes, num_page_sizes), NV_OK);
@@ -2441,6 +2682,55 @@ static NV_STATUS hopper_test_page_tree(uvm_gpu_t *hopper)
return NV_OK;
}
static NV_STATUS blackwell_test_page_tree(uvm_gpu_t *blackwell)
{
NvU32 i, tlb_batch_saved_max_pages;
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
size_t num_page_sizes;
TEST_CHECK_RET(fake_gpu_init_blackwell(blackwell) == NV_OK);
num_page_sizes = get_page_sizes(blackwell, page_sizes);
UVM_ASSERT(num_page_sizes > 0);
MEM_NV_CHECK_RET(alloc_256g_memory(blackwell), NV_OK);
MEM_NV_CHECK_RET(alloc_adjacent_256g_memory(blackwell), NV_OK);
MEM_NV_CHECK_RET(get_single_page_256g(blackwell), NV_OK);
MEM_NV_CHECK_RET(get_entire_table_256g(blackwell), NV_OK);
// Although there is no support for the 256G page size for managed memory,
// we run tests that split a 256G page into 512x512M pages because UVM
// handles the PTEs for all supported page sizes.
MEM_NV_CHECK_RET(split_512m_from_256g(blackwell), NV_OK);
MEM_NV_CHECK_RET(get_1tb_range(blackwell), NV_OK);
MEM_NV_CHECK_RET(entry_test_blackwell(blackwell, entry_test_page_size_blackwell), NV_OK);
// TLB invalidate
MEM_NV_CHECK_RET(test_tlb_invalidates_gmmu_v3(blackwell), NV_OK);
// TLB batch invalidate
MEM_NV_CHECK_RET(test_tlb_batch_invalidates(blackwell, page_sizes, num_page_sizes), NV_OK);
// Run the test again with a bigger limit on max pages
tlb_batch_saved_max_pages = blackwell->parent->tlb_batch.max_pages;
blackwell->parent->tlb_batch.max_pages = 1024 * 1024;
MEM_NV_CHECK_RET(test_tlb_batch_invalidates(blackwell, page_sizes, num_page_sizes), NV_OK);
blackwell->parent->tlb_batch.max_pages = tlb_batch_saved_max_pages;
// And with per VA invalidates disabled
blackwell->parent->tlb_batch.va_invalidate_supported = false;
MEM_NV_CHECK_RET(test_tlb_batch_invalidates(blackwell, page_sizes, num_page_sizes), NV_OK);
blackwell->parent->tlb_batch.va_invalidate_supported = true;
for (i = 0; i < num_page_sizes; i++) {
MEM_NV_CHECK_RET(shrink_test(blackwell, BIG_PAGE_SIZE_PASCAL, page_sizes[i]), NV_OK);
MEM_NV_CHECK_RET(get_upper_test(blackwell, BIG_PAGE_SIZE_PASCAL, page_sizes[i]), NV_OK);
MEM_NV_CHECK_RET(test_range_vec(blackwell, BIG_PAGE_SIZE_PASCAL, page_sizes[i]), NV_OK);
}
return NV_OK;
}
NV_STATUS uvm_test_page_tree(UVM_TEST_PAGE_TREE_PARAMS *params, struct file *filp)
{
NV_STATUS status = NV_OK;
@@ -2481,6 +2771,7 @@ NV_STATUS uvm_test_page_tree(UVM_TEST_PAGE_TREE_PARAMS *params, struct file *fil
TEST_NV_CHECK_GOTO(volta_test_page_tree(gpu), done);
TEST_NV_CHECK_GOTO(ampere_test_page_tree(gpu), done);
TEST_NV_CHECK_GOTO(hopper_test_page_tree(gpu), done);
TEST_NV_CHECK_GOTO(blackwell_test_page_tree(gpu), done);
done:
fake_tlb_invals_free();


@@ -323,10 +323,3 @@ NvU32 uvm_hal_pascal_fault_buffer_entry_size(uvm_parent_gpu_t *parent_gpu)
{
return NVB069_FAULT_BUF_SIZE;
}
void uvm_hal_pascal_fault_buffer_parse_non_replayable_entry_unsupported(uvm_parent_gpu_t *parent_gpu,
void *fault_packet,
uvm_fault_buffer_entry_t *buffer_entry)
{
UVM_ASSERT_MSG(false, "fault_buffer_parse_non_replayable_entry called on Pascal GPU\n");
}


@@ -37,6 +37,7 @@
#include "uvm_global.h"
#include "uvm_gpu.h"
#include "uvm_mmu.h"
#include "uvm_hal.h"
#include "uvm_push_macros.h"
#include "uvm_pascal_fault_buffer.h"
#include "hwref/pascal/gp100/dev_fault.h"


@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2019 NVIDIA Corporation
Copyright (c) 2016-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -152,9 +152,7 @@ NV_STATUS uvm_test_peer_identity_mappings(UVM_TEST_PEER_IDENTITY_MAPPINGS_PARAMS
goto done;
}
// Indirect peers don't use identity mappings
if (!uvm_processor_mask_test(&va_space->can_access[uvm_id_value(gpu_a->id)], gpu_b->id) ||
uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu_a->id)], gpu_b->id)) {
if (!uvm_processor_mask_test(&va_space->can_access[uvm_id_value(gpu_a->id)], gpu_b->id)) {
status = NV_ERR_INVALID_DEVICE;
goto done;
}


@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -133,7 +133,7 @@
//
// - PMM root chunk bit locks
// Each bit lock protects the corresponding root chunk's allocation, freeing
// from/to PMA, root chunk trackers, and root chunk indirect_peer mappings.
// from/to PMA, and root chunk trackers.
//
// - PMA allocation/eviction lock
// A read-write semaphore used by the eviction path to flush any pending
@@ -1183,216 +1183,15 @@ void uvm_pmm_gpu_merge_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
uvm_mutex_unlock(&pmm->lock);
}
static void root_chunk_unmap_indirect_peer(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk, uvm_gpu_t *other_gpu)
{
uvm_gpu_root_chunk_indirect_peer_t *indirect_peer;
size_t index = root_chunk_index(pmm, root_chunk);
long long new_count;
NV_STATUS status;
indirect_peer = &pmm->root_chunks.indirect_peer[uvm_id_gpu_index(other_gpu->id)];
uvm_assert_root_chunk_locked(pmm, root_chunk);
UVM_ASSERT(indirect_peer->dma_addrs);
UVM_ASSERT(root_chunk->chunk.state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED);
UVM_ASSERT(uvm_processor_mask_test(&root_chunk->indirect_peers_mapped, other_gpu->id));
// The tracker could have work which requires the indirect peer mappings to
// remain until finished, such as PTE unmaps of this chunk from indirect
// peers, so we need to wait. We also need to wait on the entire tracker,
// not just other_gpu's entries, because there might be implicit chained
// dependencies in the tracker.
//
// We know there can't be any other work which requires these mappings:
// - If we're freeing the root chunk back to PMA or switching types of the
// root chunk, nothing else can reference the chunk.
//
// - If the chunk is still allocated then global peer access must be in the
// process of being disabled, say because one of the GPUs is being
// unregistered. We know that all VA spaces must have already called
// disable_peers and have waited on those PTE unmaps. The chunk could be
// freed concurrently with this indirect peer unmap, but that will be
// serialized by the root chunk lock.
status = uvm_tracker_wait(&root_chunk->tracker);
if (status != NV_OK)
UVM_ASSERT(uvm_global_get_status() != NV_OK);
uvm_parent_gpu_unmap_cpu_pages(other_gpu->parent, indirect_peer->dma_addrs[index], UVM_CHUNK_SIZE_MAX);
uvm_processor_mask_clear(&root_chunk->indirect_peers_mapped, other_gpu->id);
new_count = atomic64_dec_return(&indirect_peer->map_count);
UVM_ASSERT(new_count >= 0);
}
static void root_chunk_unmap_indirect_peers(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk)
{
uvm_gpu_id_t other_gpu_id;
// Root chunks should use a global processor mask as they are not bound to
// a specific VA space. However, indirect peers are not supported when SMC
// partitioning is enabled and, therefore, we can obtain the uvm_gpu_t
// object directly from the uvm_parent_gpu_t object's id.
for_each_gpu_id_in_mask(other_gpu_id, &root_chunk->indirect_peers_mapped) {
uvm_gpu_t *other_gpu = uvm_gpu_get(other_gpu_id);
root_chunk_unmap_indirect_peer(pmm, root_chunk, other_gpu);
}
}
NV_STATUS uvm_pmm_gpu_indirect_peer_init(uvm_pmm_gpu_t *pmm, uvm_gpu_t *accessing_gpu)
{
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
NvU64 *dma_addrs;
uvm_gpu_root_chunk_indirect_peer_t *indirect_peer;
NV_STATUS status = NV_OK;
indirect_peer = &pmm->root_chunks.indirect_peer[uvm_id_gpu_index(accessing_gpu->id)];
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
UVM_ASSERT(uvm_gpus_are_indirect_peers(gpu, accessing_gpu));
UVM_ASSERT(!indirect_peer->dma_addrs);
UVM_ASSERT(atomic64_read(&indirect_peer->map_count) == 0);
// Each root chunk tracks whether it has a mapping to a given indirect peer,
// so we don't need to initialize this array.
dma_addrs = uvm_kvmalloc(pmm->root_chunks.count * sizeof(dma_addrs[0]));
if (!dma_addrs)
status = NV_ERR_NO_MEMORY;
else
indirect_peer->dma_addrs = dma_addrs;
return status;
}
static bool check_indirect_peer_empty(uvm_pmm_gpu_t *pmm, uvm_gpu_t *other_gpu)
{
uvm_gpu_root_chunk_indirect_peer_t *indirect_peer;
size_t i;
indirect_peer = &pmm->root_chunks.indirect_peer[uvm_id_gpu_index(other_gpu->id)];
for (i = 0; i < pmm->root_chunks.count; i++) {
uvm_gpu_root_chunk_t *root_chunk = &pmm->root_chunks.array[i];
// This doesn't take the root chunk lock because checking the mask is an
// atomic operation.
if (uvm_processor_mask_test(&root_chunk->indirect_peers_mapped, other_gpu->id)) {
UVM_ASSERT(atomic64_read(&indirect_peer->map_count) > 0);
return false;
}
}
UVM_ASSERT(atomic64_read(&indirect_peer->map_count) == 0);
return true;
}
void uvm_pmm_gpu_indirect_peer_destroy(uvm_pmm_gpu_t *pmm, uvm_gpu_t *other_gpu)
{
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
uvm_gpu_root_chunk_indirect_peer_t *indirect_peer;
size_t i;
indirect_peer = &pmm->root_chunks.indirect_peer[uvm_id_gpu_index(other_gpu->id)];
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
UVM_ASSERT(uvm_gpus_are_indirect_peers(gpu, other_gpu));
if (!indirect_peer->dma_addrs) {
UVM_ASSERT(check_indirect_peer_empty(pmm, other_gpu));
return;
}
// Just go over all root chunks and unmap them. This is slow, but it is not
// a frequent operation.
for (i = 0; i < pmm->root_chunks.count && atomic64_read(&indirect_peer->map_count); i++) {
uvm_gpu_root_chunk_t *root_chunk = &pmm->root_chunks.array[i];
// Take the root chunk lock to prevent chunks from transitioning in or
// out of the PMA_OWNED state, and to serialize updates to the tracker
// and indirect_peers_mapped mask. Note that indirect peers besides
// other_gpu could be trying to create mappings concurrently.
root_chunk_lock(pmm, root_chunk);
if (root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED)
UVM_ASSERT(uvm_processor_mask_empty(&root_chunk->indirect_peers_mapped));
else if (uvm_processor_mask_test(&root_chunk->indirect_peers_mapped, other_gpu->id))
root_chunk_unmap_indirect_peer(pmm, root_chunk, other_gpu);
root_chunk_unlock(pmm, root_chunk);
}
UVM_ASSERT(check_indirect_peer_empty(pmm, other_gpu));
uvm_kvfree(indirect_peer->dma_addrs);
indirect_peer->dma_addrs = NULL;
}
NV_STATUS uvm_pmm_gpu_indirect_peer_map(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_gpu_t *accessing_gpu)
{
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
uvm_gpu_root_chunk_indirect_peer_t *indirect_peer;
uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
size_t index = root_chunk_index(pmm, root_chunk);
NV_STATUS status = NV_OK;
indirect_peer = &pmm->root_chunks.indirect_peer[uvm_id_gpu_index(accessing_gpu->id)];
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED ||
chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
UVM_ASSERT(uvm_gpus_are_indirect_peers(gpu, accessing_gpu));
UVM_ASSERT(indirect_peer->dma_addrs);
// Serialize:
// - Concurrent mappings to this root chunk (same or different GPUs)
// - Concurrent unmappings of this root chunk (must be a different GPU)
root_chunk_lock(pmm, root_chunk);
if (!uvm_processor_mask_test(&root_chunk->indirect_peers_mapped, accessing_gpu->id)) {
status = uvm_parent_gpu_map_cpu_pages(accessing_gpu->parent,
uvm_gpu_chunk_to_page(pmm, &root_chunk->chunk),
UVM_CHUNK_SIZE_MAX,
&indirect_peer->dma_addrs[index]);
if (status == NV_OK) {
uvm_processor_mask_set(&root_chunk->indirect_peers_mapped, accessing_gpu->id);
atomic64_inc(&indirect_peer->map_count);
}
}
root_chunk_unlock(pmm, root_chunk);
return status;
}
NvU64 uvm_pmm_gpu_indirect_peer_addr(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_gpu_t *accessing_gpu)
{
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
uvm_gpu_root_chunk_indirect_peer_t *indirect_peer;
uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
size_t index = root_chunk_index(pmm, root_chunk);
NvU64 chunk_offset = chunk->address - root_chunk->chunk.address;
indirect_peer = &pmm->root_chunks.indirect_peer[uvm_id_gpu_index(accessing_gpu->id)];
UVM_ASSERT(uvm_gpus_are_indirect_peers(gpu, accessing_gpu));
UVM_ASSERT(indirect_peer->dma_addrs);
UVM_ASSERT(uvm_processor_mask_test(&root_chunk->indirect_peers_mapped, accessing_gpu->id));
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED ||
chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
return indirect_peer->dma_addrs[index] + chunk_offset;
}
uvm_gpu_phys_address_t uvm_pmm_gpu_peer_phys_address(uvm_pmm_gpu_t *pmm,
uvm_gpu_chunk_t *chunk,
uvm_gpu_t *accessing_gpu)
{
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
uvm_gpu_peer_t *peer_caps = uvm_gpu_peer_caps(accessing_gpu, gpu);
uvm_aperture_t aperture = uvm_gpu_peer_aperture(accessing_gpu, gpu);
NvU64 addr;
if (peer_caps->is_indirect_peer)
addr = uvm_pmm_gpu_indirect_peer_addr(pmm, chunk, accessing_gpu);
else if (uvm_gpus_are_nvswitch_connected(accessing_gpu, gpu))
if (uvm_gpus_are_nvswitch_connected(accessing_gpu, gpu))
addr = chunk->address + gpu->parent->nvswitch_info.fabric_memory_window_start;
else
addr = chunk->address;
@@ -1405,15 +1204,10 @@ uvm_gpu_address_t uvm_pmm_gpu_peer_copy_address(uvm_pmm_gpu_t *pmm,
uvm_gpu_t *accessing_gpu)
{
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
uvm_gpu_peer_t *peer_caps = uvm_gpu_peer_caps(accessing_gpu, gpu);
uvm_gpu_identity_mapping_t *gpu_peer_mapping;
if (peer_caps->is_indirect_peer ||
(accessing_gpu->parent->peer_copy_mode == UVM_GPU_PEER_COPY_MODE_PHYSICAL)) {
// Indirect peers are accessed as sysmem addresses, so they don't need
// to use identity mappings.
if (accessing_gpu->parent->peer_copy_mode == UVM_GPU_PEER_COPY_MODE_PHYSICAL)
return uvm_gpu_address_from_phys(uvm_pmm_gpu_peer_phys_address(pmm, chunk, accessing_gpu));
}
UVM_ASSERT(accessing_gpu->parent->peer_copy_mode == UVM_GPU_PEER_COPY_MODE_VIRTUAL);
gpu_peer_mapping = uvm_gpu_get_peer_mapping(accessing_gpu, gpu->id);
@@ -1800,12 +1594,6 @@ static NV_STATUS pick_and_evict_root_chunk(uvm_pmm_gpu_t *pmm,
chunk->address,
nvstatusToString(status));
// Unmap any indirect peer physical mappings for this chunk, since
// kernel chunks generally don't need them.
root_chunk_lock(pmm, root_chunk);
root_chunk_unmap_indirect_peers(pmm, root_chunk);
root_chunk_unlock(pmm, root_chunk);
uvm_spin_lock(&pmm->list_lock);
chunk->type = type;
uvm_spin_unlock(&pmm->list_lock);
@@ -2273,8 +2061,6 @@ void free_root_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk, free_
root_chunk_lock(pmm, root_chunk);
root_chunk_unmap_indirect_peers(pmm, root_chunk);
status = uvm_tracker_wait_deinit(&root_chunk->tracker);
if (status != NV_OK) {
// TODO: Bug 1766184: Handle RC/ECC. For now just go ahead and free the chunk anyway.
@@ -2467,30 +2253,6 @@ static bool check_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
UVM_ASSERT(chunk_size == uvm_chunk_find_last_size(chunk_sizes));
}
if (uvm_pmm_sysmem_mappings_indirect_supported()) {
uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
uvm_gpu_id_t other_gpu_id;
root_chunk_lock(pmm, root_chunk);
// See root_chunk_unmap_indirect_peers for the usage of uvm_gpu_get
for_each_gpu_id_in_mask(other_gpu_id, &root_chunk->indirect_peers_mapped) {
uvm_gpu_t *other_gpu = uvm_gpu_get(other_gpu_id);
NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(pmm, chunk, other_gpu);
uvm_reverse_map_t reverse_map;
size_t num_mappings;
num_mappings = uvm_pmm_sysmem_mappings_dma_to_virt(&other_gpu->pmm_reverse_sysmem_mappings,
peer_addr,
uvm_gpu_chunk_get_size(chunk),
&reverse_map,
1);
UVM_ASSERT(num_mappings == 0);
}
root_chunk_unlock(pmm, root_chunk);
}
return true;
}
@@ -3734,11 +3496,6 @@ void uvm_pmm_gpu_deinit(uvm_pmm_gpu_t *pmm)
uvm_bit_locks_deinit(&pmm->root_chunks.bitlocks);
for (i = 0; i < ARRAY_SIZE(pmm->root_chunks.indirect_peer); i++) {
UVM_ASSERT(pmm->root_chunks.indirect_peer[i].dma_addrs == NULL);
UVM_ASSERT(atomic64_read(&pmm->root_chunks.indirect_peer[i].map_count) == 0);
}
if (pmm->root_chunks.array) {
// Make sure that all chunks have been returned to PMA
for (i = 0; i < pmm->root_chunks.count; ++i) {
@@ -3918,7 +3675,7 @@ static NV_STATUS test_check_pma_allocated_chunks(uvm_pmm_gpu_t *pmm,
root_chunk = root_chunk_from_address(pmm, address);
if (!IS_ALIGNED(address, params->page_size)) {
UVM_TEST_PRINT("Returned unaligned address 0x%llx page size %u\n", address, params->page_size);
UVM_TEST_PRINT("Returned unaligned address 0x%llx page size %llu\n", address, params->page_size);
status = NV_ERR_INVALID_STATE;
}


@@ -304,46 +304,8 @@ typedef struct uvm_gpu_root_chunk_struct
//
// Protected by the corresponding root chunk bit lock.
uvm_tracker_t tracker;
// Indirect peers which have IOMMU mappings to this root chunk. The mapped
// addresses are stored in this root chunk's index in
// uvm_pmm_gpu_t::root_chunks.indirect_peer[id].dma_addrs.
//
// Protected by the corresponding root chunk bit lock.
//
// We can use a regular processor id because indirect peers are not allowed
// between partitioned GPUs when SMC is enabled.
uvm_processor_mask_t indirect_peers_mapped;
} uvm_gpu_root_chunk_t;
typedef struct
{
// Indirect peers are GPUs which can coherently access this GPU's memory,
// but are routed through an intermediate processor. Indirect peers access
// each other's memory with the SYS aperture rather than a PEER aperture,
// meaning they need IOMMU mappings:
//
// accessing_gpu ==> IOMMU ==> CPU ==> owning_gpu (this GPU)
//
// This array has one entry per root chunk on this GPU. Each entry
// contains the IOMMU address accessing_gpu needs to use in order to
// access this GPU's root chunk. The root chunks are mapped as whole
// regions both for tracking simplicity and to allow GPUs to map with
// large PTEs.
//
// An array entry is valid iff accessing_gpu's ID is set in the
// corresponding root chunk's indirect_peers_mapped mask.
//
// Management of these addresses would be simpler if they were stored
// in the root chunks themselves, but in the common case there are only
// a small number of indirect peers in a system. Dynamic array
// allocation per indirect peer wastes less memory.
NvU64 *dma_addrs;
// Number of this GPU's root chunks mapped for each indirect peer.
atomic64_t map_count;
} uvm_gpu_root_chunk_indirect_peer_t;
typedef struct uvm_pmm_gpu_struct
{
// Sizes of the MMU
@@ -388,8 +350,6 @@ typedef struct uvm_pmm_gpu_struct
// or workqueue.
struct list_head va_block_lazy_free;
nv_kthread_q_item_t va_block_lazy_free_q_item;
uvm_gpu_root_chunk_indirect_peer_t indirect_peer[UVM_ID_MAX_GPUS];
} root_chunks;
#if UVM_IS_CONFIG_HMM()
@@ -592,31 +552,6 @@ void uvm_pmm_gpu_sync(uvm_pmm_gpu_t *pmm);
// Mark an allocated chunk as evicted
void uvm_pmm_gpu_mark_chunk_evicted(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
// Initialize indirect peer state so accessing_gpu is ready to create mappings
// to pmm's root chunks.
//
// Locking: The global lock must be held.
NV_STATUS uvm_pmm_gpu_indirect_peer_init(uvm_pmm_gpu_t *pmm, uvm_gpu_t *accessing_gpu);
// Tear down indirect peer state from other_gpu to pmm's GPU. Any existing IOMMU
// mappings from other_gpu to this GPU are torn down.
//
// Locking: The global lock must be held.
void uvm_pmm_gpu_indirect_peer_destroy(uvm_pmm_gpu_t *pmm, uvm_gpu_t *other_gpu);
// Create an IOMMU mapping to allow accessing_gpu to access chunk on pmm's GPU.
// chunk can be any size, and can be mapped more than once (the address will not
// change). The address can be retrieved using uvm_pmm_gpu_indirect_peer_addr.
//
// Note that there is no corresponding unmap call. The mappings will be removed
// automatically as necessary when the chunk is freed. This allows mappings to
// be reused as much as possible.
NV_STATUS uvm_pmm_gpu_indirect_peer_map(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_gpu_t *accessing_gpu);
// Retrieve the system address accessing_gpu must use to access this chunk.
// uvm_pmm_gpu_indirect_peer_map must have been called first.
NvU64 uvm_pmm_gpu_indirect_peer_addr(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_gpu_t *accessing_gpu);
// Returns the physical address for use by accessing_gpu of a vidmem allocation
// on the peer pmm->gpu. This address can be used for making PTEs on
// accessing_gpu, but not for copying between the two GPUs. For that, use


@@ -49,13 +49,6 @@ struct uvm_pmm_sysmem_mappings_struct
uvm_mutex_t reverse_map_lock;
};
// See comments in uvm_linux.h
#ifdef NV_RADIX_TREE_REPLACE_SLOT_PRESENT
#define uvm_pmm_sysmem_mappings_indirect_supported() true
#else
#define uvm_pmm_sysmem_mappings_indirect_supported() false
#endif
// Global initialization/exit functions, that need to be called during driver
// initialization/tear-down. These are needed to allocate/free global internal
// data structures.
@@ -78,35 +71,11 @@ NV_STATUS uvm_pmm_sysmem_mappings_add_gpu_mapping(uvm_pmm_sysmem_mappings_t *sys
uvm_va_block_t *va_block,
uvm_processor_id_t owner);
static NV_STATUS uvm_pmm_sysmem_mappings_add_gpu_chunk_mapping(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
NvU64 dma_addr,
NvU64 virt_addr,
NvU64 region_size,
uvm_va_block_t *va_block,
uvm_gpu_id_t owner)
{
if (!uvm_pmm_sysmem_mappings_indirect_supported())
return NV_OK;
return uvm_pmm_sysmem_mappings_add_gpu_mapping(sysmem_mappings,
dma_addr,
virt_addr,
region_size,
va_block,
owner);
}
// If the GPU used to initialize sysmem_mappings supports access counters, the
// entries for the physical region starting at dma_addr are removed from the
// reverse map.
void uvm_pmm_sysmem_mappings_remove_gpu_mapping(uvm_pmm_sysmem_mappings_t *sysmem_mappings, NvU64 dma_addr);
static void uvm_pmm_sysmem_mappings_remove_gpu_chunk_mapping(uvm_pmm_sysmem_mappings_t *sysmem_mappings, NvU64 dma_addr)
{
if (uvm_pmm_sysmem_mappings_indirect_supported())
uvm_pmm_sysmem_mappings_remove_gpu_mapping(sysmem_mappings, dma_addr);
}
// Like uvm_pmm_sysmem_mappings_remove_gpu_mapping but it doesn't assert if the
// mapping doesn't exist. See uvm_va_block_evict_chunks for more information.
void uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction(uvm_pmm_sysmem_mappings_t *sysmem_mappings, NvU64 dma_addr);
@@ -118,14 +87,6 @@ void uvm_pmm_sysmem_mappings_reparent_gpu_mapping(uvm_pmm_sysmem_mappings_t *sys
NvU64 dma_addr,
uvm_va_block_t *va_block);
static void uvm_pmm_sysmem_mappings_reparent_gpu_chunk_mapping(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
NvU64 dma_addr,
uvm_va_block_t *va_block)
{
if (uvm_pmm_sysmem_mappings_indirect_supported())
uvm_pmm_sysmem_mappings_reparent_gpu_mapping(sysmem_mappings, dma_addr, va_block);
}
// If the GPU used to initialize sysmem_mappings supports access counters, the
// mapping for the region starting at dma_addr is split into regions of
// new_region_size. new_region_size must be a power of two and smaller than the
@@ -134,16 +95,6 @@ NV_STATUS uvm_pmm_sysmem_mappings_split_gpu_mappings(uvm_pmm_sysmem_mappings_t *
NvU64 dma_addr,
NvU64 new_region_size);
static NV_STATUS uvm_pmm_sysmem_mappings_split_gpu_chunk_mappings(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
NvU64 dma_addr,
NvU64 new_region_size)
{
if (!uvm_pmm_sysmem_mappings_indirect_supported())
return NV_OK;
return uvm_pmm_sysmem_mappings_split_gpu_mappings(sysmem_mappings, dma_addr, new_region_size);
}
// If the GPU used to initialize sysmem_mappings supports access counters, all
// the mappings within the region [dma_addr, dma_addr + new_region_size) are
// merged into a single mapping. new_region_size must be a power of two. The
@@ -153,14 +104,6 @@ void uvm_pmm_sysmem_mappings_merge_gpu_mappings(uvm_pmm_sysmem_mappings_t *sysme
NvU64 dma_addr,
NvU64 new_region_size);
static void uvm_pmm_sysmem_mappings_merge_gpu_chunk_mappings(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
NvU64 dma_addr,
NvU64 new_region_size)
{
if (uvm_pmm_sysmem_mappings_indirect_supported())
uvm_pmm_sysmem_mappings_merge_gpu_mappings(sysmem_mappings, dma_addr, new_region_size);
}
// Obtain the {va_block, virt_addr} information for the mappings in the given
// [dma_addr:dma_addr + region_size) range. dma_addr and region_size must be
// page-aligned.


@@ -565,13 +565,7 @@ NV_STATUS uvm_test_pmm_sysmem(UVM_TEST_PMM_SYSMEM_PARAMS *params, struct file *f
uvm_mutex_lock(&g_uvm_global.global_lock);
uvm_va_space_down_write(va_space);
if (uvm_pmm_sysmem_mappings_indirect_supported()) {
status = test_pmm_sysmem_reverse_map(va_space, params->range_address1, params->range_address2);
}
else {
UVM_TEST_PRINT("Skipping kernel_driver_pmm_sysmem test due to lack of support for radix_tree_replace_slot in Linux 4.10");
status = NV_OK;
}
status = test_pmm_sysmem_reverse_map(va_space, params->range_address1, params->range_address2);
uvm_va_space_up_write(va_space);
uvm_mutex_unlock(&g_uvm_global.global_lock);
@@ -1220,9 +1214,9 @@ done:
return status;
}
NV_STATUS do_test_cpu_chunk_free(uvm_cpu_chunk_t *chunk,
uvm_va_space_t *va_space,
const uvm_processor_mask_t *test_gpus)
static NV_STATUS do_test_cpu_chunk_free(uvm_cpu_chunk_t *chunk,
uvm_va_space_t *va_space,
const uvm_processor_mask_t *test_gpus)
{
NV_STATUS status = NV_OK;
uvm_cpu_chunk_t **split_chunks;
@@ -1318,8 +1312,8 @@ done_free:
return status;
}
NV_STATUS test_cpu_chunk_free(uvm_va_space_t *va_space,
const uvm_processor_mask_t *test_gpus)
static NV_STATUS test_cpu_chunk_free(uvm_va_space_t *va_space,
const uvm_processor_mask_t *test_gpus)
{
uvm_cpu_chunk_t *chunk;
uvm_chunk_sizes_mask_t alloc_sizes = uvm_cpu_chunk_get_allocation_sizes();

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -898,11 +898,11 @@ NV_STATUS uvm_test_pmm_check_leak(UVM_TEST_PMM_CHECK_LEAK_PARAMS *params, struct
return status;
}
NV_STATUS __test_pmm_async_alloc_type(uvm_va_space_t *va_space,
uvm_gpu_t *gpu,
size_t num_chunks,
uvm_pmm_gpu_memory_type_t mem_type,
size_t work_iterations)
static NV_STATUS __test_pmm_async_alloc_type(uvm_va_space_t *va_space,
uvm_gpu_t *gpu,
size_t num_chunks,
uvm_pmm_gpu_memory_type_t mem_type,
size_t work_iterations)
{
NV_STATUS status;
NV_STATUS tracker_status = NV_OK;
@@ -1199,120 +1199,6 @@ exit_unlock:
return status;
}
static NV_STATUS test_indirect_peers(uvm_gpu_t *owning_gpu, uvm_gpu_t *accessing_gpu)
{
uvm_pmm_gpu_t *pmm = &owning_gpu->pmm;
size_t chunk_size = uvm_chunk_find_first_size(pmm->chunk_sizes[UVM_PMM_GPU_MEMORY_TYPE_USER]);
uvm_gpu_chunk_t *parent_chunk = NULL;
uvm_gpu_chunk_t **chunks = NULL;
size_t i, num_chunks = UVM_CHUNK_SIZE_MAX / chunk_size;
NV_STATUS tracker_status, status = NV_OK;
uvm_mem_t *verif_mem = NULL;
uvm_tracker_t tracker = UVM_TRACKER_INIT();
uvm_gpu_address_t local_addr;
uvm_gpu_address_t peer_addr;
NvU32 init_val = 0x12345678;
NvU32 new_val = 0xabcdc0de;
chunks = uvm_kvmalloc_zero(num_chunks * sizeof(chunks[0]));
if (!chunks)
return NV_ERR_NO_MEMORY;
TEST_NV_CHECK_GOTO(uvm_mem_alloc_sysmem_and_map_cpu_kernel(UVM_CHUNK_SIZE_MAX, current->mm, &verif_mem), out);
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(verif_mem, owning_gpu), out);
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(verif_mem, accessing_gpu), out);
// Allocate a root chunk then split it to test multiple mappings across
// contiguous chunks under the same root.
TEST_NV_CHECK_GOTO(uvm_pmm_gpu_alloc_user(pmm,
1,
UVM_CHUNK_SIZE_MAX,
UVM_PMM_ALLOC_FLAGS_EVICT,
&parent_chunk,
NULL), out);
TEST_NV_CHECK_GOTO(uvm_pmm_gpu_split_chunk(pmm, parent_chunk, chunk_size, chunks), out);
parent_chunk = NULL;
// Verify contiguity and multiple mappings under a root chunk
for (i = 0; i < num_chunks; i++) {
TEST_NV_CHECK_GOTO(uvm_pmm_gpu_indirect_peer_map(pmm, chunks[i], accessing_gpu), out);
TEST_CHECK_GOTO(uvm_pmm_gpu_indirect_peer_addr(pmm, chunks[i], accessing_gpu) ==
uvm_pmm_gpu_indirect_peer_addr(pmm, chunks[0], accessing_gpu) + i * chunk_size,
out);
}
// Check that accessing_gpu can read and write
local_addr = chunk_copy_addr(owning_gpu, chunks[0]);
peer_addr = uvm_pmm_gpu_peer_copy_address(&owning_gpu->pmm, chunks[0], accessing_gpu);
// Init on local GPU
TEST_NV_CHECK_GOTO(do_memset_4(owning_gpu, local_addr, init_val, UVM_CHUNK_SIZE_MAX, &tracker), out);
// Read using indirect peer and verify
TEST_NV_CHECK_GOTO(gpu_mem_check(accessing_gpu,
verif_mem,
peer_addr,
UVM_CHUNK_SIZE_MAX,
init_val,
&tracker), out);
// Write from indirect peer
TEST_NV_CHECK_GOTO(do_memset_4(accessing_gpu, peer_addr, new_val, UVM_CHUNK_SIZE_MAX, &tracker), out);
// Read using local gpu and verify
TEST_NV_CHECK_GOTO(gpu_mem_check(owning_gpu, verif_mem, local_addr, UVM_CHUNK_SIZE_MAX, new_val, &tracker), out);
out:
tracker_status = uvm_tracker_wait_deinit(&tracker);
if (status == NV_OK && tracker_status != NV_OK) {
UVM_TEST_PRINT("Tracker wait failed\n");
status = tracker_status;
}
if (parent_chunk) {
uvm_pmm_gpu_free(pmm, parent_chunk, NULL);
}
else {
for (i = 0; i < num_chunks; i++) {
if (chunks[i])
uvm_pmm_gpu_free(pmm, chunks[i], NULL);
}
}
if (verif_mem)
uvm_mem_free(verif_mem);
uvm_kvfree(chunks);
return status;
}
NV_STATUS uvm_test_pmm_indirect_peers(UVM_TEST_PMM_INDIRECT_PEERS_PARAMS *params, struct file *filp)
{
NV_STATUS status = NV_OK;
uvm_va_space_t *va_space = uvm_va_space_get(filp);
uvm_gpu_t *owning_gpu, *accessing_gpu;
bool ran_test = false;
uvm_va_space_down_read(va_space);
for_each_va_space_gpu(owning_gpu, va_space) {
for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(owning_gpu->id)]) {
ran_test = true;
status = test_indirect_peers(owning_gpu, accessing_gpu);
if (status != NV_OK)
goto out;
}
}
if (!ran_test)
status = NV_WARN_NOTHING_TO_DO;
out:
uvm_va_space_up_read(va_space);
return status;
}
static NV_STATUS test_chunk_with_elevated_page(uvm_gpu_t *gpu)
{
uvm_pmm_gpu_t *pmm = &gpu->pmm;

View File

@@ -671,6 +671,9 @@ static NV_STATUS va_block_set_read_duplication_locked(uvm_va_block_t *va_block,
uvm_assert_mutex_locked(&va_block->lock);
// Force CPU page residency to be on the preferred NUMA node.
va_block_context->make_resident.dest_nid = uvm_va_range_get_policy(va_block->va_range)->preferred_nid;
for_each_id_in_mask(src_id, &va_block->resident) {
NV_STATUS status;
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id, NUMA_NO_NODE);

View File

@@ -53,7 +53,7 @@ static bool is_write_populate(struct vm_area_struct *vma, uvm_populate_permissio
}
}
NV_STATUS uvm_handle_fault(struct vm_area_struct *vma, unsigned long start, unsigned long vma_num_pages, bool write)
static NV_STATUS handle_fault(struct vm_area_struct *vma, unsigned long start, unsigned long vma_num_pages, bool write)
{
NV_STATUS status = NV_OK;
@@ -61,7 +61,7 @@ NV_STATUS uvm_handle_fault(struct vm_area_struct *vma, unsigned long start, unsi
unsigned int ret = 0;
unsigned int fault_flags = write ? FAULT_FLAG_WRITE : 0;
#ifdef FAULT_FLAG_REMOTE
#if defined(NV_MM_HAS_FAULT_FLAG_REMOTE)
fault_flags |= (FAULT_FLAG_REMOTE);
#endif
@@ -133,7 +133,7 @@ NV_STATUS uvm_populate_pageable_vma(struct vm_area_struct *vma,
if (uvm_managed_vma)
uvm_record_unlock_mmap_lock_read(mm);
status = uvm_handle_fault(vma, start, vma_num_pages, !!(gup_flags & FOLL_WRITE));
status = handle_fault(vma, start, vma_num_pages, !!(gup_flags & FOLL_WRITE));
if (status != NV_OK)
goto out;

View File

@@ -102,16 +102,8 @@ void uvm_parent_gpus_from_processor_mask(uvm_parent_processor_mask_t *parent_mas
bool uvm_numa_id_eq(int nid0, int nid1)
{
UVM_ASSERT(nid0 == -1 || nid0 < MAX_NUMNODES);
UVM_ASSERT(nid1 == -1 || nid1 < MAX_NUMNODES);
if ((nid0 == NUMA_NO_NODE || nid1 == NUMA_NO_NODE) && nodes_weight(node_possible_map) == 1) {
if (nid0 == NUMA_NO_NODE)
nid0 = first_node(node_possible_map);
if (nid1 == NUMA_NO_NODE)
nid1 = first_node(node_possible_map);
}
UVM_ASSERT(nid0 >= NUMA_NO_NODE && nid0 < MAX_NUMNODES);
UVM_ASSERT(nid1 >= NUMA_NO_NODE && nid1 < MAX_NUMNODES);
return nid0 == nid1;
}
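/*
 * Illustrative sketch, not part of this change: with the single-possible-node
 * special case removed above, NUMA_NO_NODE only compares equal to
 * NUMA_NO_NODE. A standalone userspace equivalent of the new comparison,
 * using hypothetical names:
 */
#include <stdbool.h>

#define EXAMPLE_NUMA_NO_NODE (-1)

static bool example_numa_id_eq(int nid0, int nid1)
{
    /* The driver additionally asserts both ids lie in [NUMA_NO_NODE, MAX_NUMNODES) */
    return nid0 == nid1;
}

/* example_numa_id_eq(EXAMPLE_NUMA_NO_NODE, 0) is now false even on a system
 * whose only possible node is node 0; the removed code normalized
 * NUMA_NO_NODE to that node before comparing. */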

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -103,6 +103,8 @@
#define UVM_SUBCHANNEL_C86F UVM_SUBCHANNEL_HOST
#define UVM_SUBCHANNEL_C8B5 UVM_SUBCHANNEL_CE
#define UVM_SUBCHANNEL_C96F UVM_SUBCHANNEL_HOST
// Channel for UVM SW methods. This is defined in nv_uvm_types.h. RM does not
// care about the specific number as long as it's bigger than the largest HW
// value. For example, Kepler reserves subchannels 5-7 for software objects.

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -560,20 +560,12 @@ static NV_STATUS test_push_exactly_max_push(uvm_gpu_t *gpu,
static NvU32 test_count_idle_chunks(uvm_pushbuffer_t *pushbuffer)
{
NvU32 i;
NvU32 count = 0;
for (i = 0; i < UVM_PUSHBUFFER_CHUNKS; ++i)
count += test_bit(i, pushbuffer->idle_chunks) ? 1 : 0;
return count;
return bitmap_weight(pushbuffer->idle_chunks, UVM_PUSHBUFFER_CHUNKS);
}
static NvU32 test_count_available_chunks(uvm_pushbuffer_t *pushbuffer)
{
NvU32 i;
NvU32 count = 0;
for (i = 0; i < UVM_PUSHBUFFER_CHUNKS; ++i)
count += test_bit(i, pushbuffer->available_chunks) ? 1 : 0;
return count;
return bitmap_weight(pushbuffer->available_chunks, UVM_PUSHBUFFER_CHUNKS);
}
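/*
 * Illustrative sketch, not part of this change: the two helpers above replace
 * per-bit test_bit() loops with a single population count over the bitmap.
 * A standalone userspace equivalent of that equivalence (hypothetical names;
 * assumes bits past the last valid one are zero):
 */
#include <limits.h>

#define EXAMPLE_BITS_PER_LONG (CHAR_BIT * sizeof(unsigned long))

static unsigned example_weight_by_loop(const unsigned long *bits, unsigned nbits)
{
    unsigned i, count = 0;

    for (i = 0; i < nbits; i++) {
        if (bits[i / EXAMPLE_BITS_PER_LONG] & (1UL << (i % EXAMPLE_BITS_PER_LONG)))
            count++;
    }
    return count;
}

static unsigned example_weight_by_popcount(const unsigned long *bits, unsigned nwords)
{
    unsigned i, count = 0;

    for (i = 0; i < nwords; i++)
        count += (unsigned)__builtin_popcountl(bits[i]);
    return count;
}

/* For a 64-bit bitmap held in a single unsigned long on LP64, both helpers
 * report the same number of set bits. */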
// Reuse the whole pushbuffer 4 times, one UVM_MAX_PUSH_SIZE at a time
@@ -859,10 +851,6 @@ static bool can_do_peer_copies(uvm_va_space_t *va_space, uvm_gpu_t *gpu_a, uvm_g
UVM_ASSERT(uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(gpu_b->id)], gpu_a->id));
// TODO: Bug 2028875. Indirect peers are not supported for now.
if (uvm_gpus_are_indirect_peers(gpu_a, gpu_b))
return false;
return true;
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -124,24 +124,23 @@ static NV_STATUS uvm_test_verify_bh_affinity(uvm_intr_handler_t *isr, int node)
static NV_STATUS uvm_test_numa_check_affinity(UVM_TEST_NUMA_CHECK_AFFINITY_PARAMS *params, struct file *filp)
{
uvm_gpu_t *gpu;
NV_STATUS status;
uvm_rm_user_object_t user_rm_va_space = {
.rm_control_fd = -1,
.user_client = params->client,
.user_object = params->smc_part_ref
};
NV_STATUS status = NV_OK;
if (!UVM_THREAD_AFFINITY_SUPPORTED())
return NV_ERR_NOT_SUPPORTED;
status = uvm_gpu_retain_by_uuid(&params->gpu_uuid, &user_rm_va_space, &gpu);
if (status != NV_OK)
return status;
uvm_mutex_lock(&g_uvm_global.global_lock);
gpu = uvm_gpu_get_by_uuid(&params->gpu_uuid);
if (!gpu) {
status = NV_ERR_INVALID_DEVICE;
goto unlock;
}
// If the GPU is not attached to a NUMA node, there is nothing to do.
if (gpu->parent->closest_cpu_numa_node == NUMA_NO_NODE) {
status = NV_ERR_NOT_SUPPORTED;
goto release;
goto unlock;
}
if (gpu->parent->replayable_faults_supported) {
@@ -150,7 +149,7 @@ static NV_STATUS uvm_test_numa_check_affinity(UVM_TEST_NUMA_CHECK_AFFINITY_PARAM
gpu->parent->closest_cpu_numa_node);
uvm_parent_gpu_replayable_faults_isr_unlock(gpu->parent);
if (status != NV_OK)
goto release;
goto unlock;
if (gpu->parent->non_replayable_faults_supported) {
uvm_parent_gpu_non_replayable_faults_isr_lock(gpu->parent);
@@ -158,7 +157,7 @@ static NV_STATUS uvm_test_numa_check_affinity(UVM_TEST_NUMA_CHECK_AFFINITY_PARAM
gpu->parent->closest_cpu_numa_node);
uvm_parent_gpu_non_replayable_faults_isr_unlock(gpu->parent);
if (status != NV_OK)
goto release;
goto unlock;
}
if (gpu->parent->access_counters_supported) {
@@ -168,8 +167,9 @@ static NV_STATUS uvm_test_numa_check_affinity(UVM_TEST_NUMA_CHECK_AFFINITY_PARAM
uvm_parent_gpu_access_counters_isr_unlock(gpu->parent);
}
}
release:
uvm_gpu_release(gpu);
unlock:
uvm_mutex_unlock(&g_uvm_global.global_lock);
return status;
}
@@ -275,7 +275,6 @@ long uvm_test_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_SET_PAGE_THRASHING_POLICY, uvm_test_set_page_thrashing_policy);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_PMM_SYSMEM, uvm_test_pmm_sysmem);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_PMM_REVERSE_MAP, uvm_test_pmm_reverse_map);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_PMM_INDIRECT_PEERS, uvm_test_pmm_indirect_peers);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_VA_SPACE_MM_RETAIN, uvm_test_va_space_mm_retain);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_PMM_CHUNK_WITH_ELEVATED_PAGE, uvm_test_pmm_chunk_with_elevated_page);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_VA_SPACE_INJECT_ERROR, uvm_test_va_space_inject_error);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2021 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -132,7 +132,6 @@ NV_STATUS uvm_test_pma_alloc_free(UVM_TEST_PMA_ALLOC_FREE_PARAMS *params, struct
NV_STATUS uvm_test_pma_get_batch_size(UVM_TEST_PMA_GET_BATCH_SIZE_PARAMS *params, struct file *filp);
NV_STATUS uvm_test_pmm_alloc_free_root(UVM_TEST_PMM_ALLOC_FREE_ROOT_PARAMS *params, struct file *filp);
NV_STATUS uvm_test_pmm_inject_pma_evict_error(UVM_TEST_PMM_INJECT_PMA_EVICT_ERROR_PARAMS *params, struct file *filp);
NV_STATUS uvm_test_pmm_indirect_peers(UVM_TEST_PMM_INDIRECT_PEERS_PARAMS *params, struct file *filp);
NV_STATUS uvm_test_pmm_query_pma_stats(UVM_TEST_PMM_QUERY_PMA_STATS_PARAMS *params, struct file *filp);
NV_STATUS uvm_test_perf_events_sanity(UVM_TEST_PERF_EVENTS_SANITY_PARAMS *params, struct file *filp);

View File

@@ -552,9 +552,9 @@ typedef struct
// If user_pages_allocation_retry_force_count is non-0 then the next count user
// memory allocations under the VA block will be forced to do allocation-retry.
//
// If cpu_pages_allocation_error_count is not zero, the subsequent operations
// that need to allocate CPU pages will fail with NV_ERR_NO_MEMORY for
// cpu_pages_allocation_error_count times. If cpu_pages_allocation_error_count
// If cpu_chunk_allocation_error_count is not zero, the subsequent operations
// that need to allocate CPU chunks will fail with NV_ERR_NO_MEMORY for
// cpu_chunk_allocation_error_count times. If cpu_chunk_allocation_error_count
// is equal to ~0U, the count is infinite.
//
// If eviction_failure is NV_TRUE, the next eviction attempt from the VA block
@@ -591,10 +591,10 @@ typedef struct
NvU64 lookup_address NV_ALIGN_BYTES(8); // In
NvU32 page_table_allocation_retry_force_count; // In
NvU32 user_pages_allocation_retry_force_count; // In
NvU32 cpu_chunk_allocation_size_mask; // In
NvU64 cpu_chunk_allocation_size_mask; // In
NvS32 cpu_chunk_allocation_target_id; // In
NvS32 cpu_chunk_allocation_actual_id; // In
NvU32 cpu_pages_allocation_error_count; // In
NvU32 cpu_chunk_allocation_error_count; // In
NvBool eviction_error; // In
NvBool populate_error; // In
NV_STATUS rmStatus; // Out
@@ -648,7 +648,7 @@ typedef struct
// The size of the virtual mapping covering lookup_address on each
// mapped_on processor.
NvU32 page_size[UVM_MAX_PROCESSORS]; // Out
NvU64 page_size[UVM_MAX_PROCESSORS]; // Out
// Array of processors which have physical memory populated that would back
// lookup_address if it was resident.
@@ -879,7 +879,7 @@ typedef struct
typedef struct
{
NvProcessorUuid gpu_uuid; // In
NvU32 page_size;
NvU64 page_size;
NvBool contiguous;
NvU64 num_pages NV_ALIGN_BYTES(8); // In
NvU64 phys_begin NV_ALIGN_BYTES(8); // In
@@ -1065,12 +1065,6 @@ typedef struct
NV_STATUS rmStatus; // Out
} UVM_TEST_PMM_REVERSE_MAP_PARAMS;
#define UVM_TEST_PMM_INDIRECT_PEERS UVM_TEST_IOCTL_BASE(66)
typedef struct
{
NV_STATUS rmStatus; // Out
} UVM_TEST_PMM_INDIRECT_PEERS_PARAMS;
// Calls uvm_va_space_mm_retain on a VA space, operates on the mm, optionally
// sleeps for a while, then releases the va_space_mm and returns. The idea is to
// simulate retaining a va_space_mm from a thread like the GPU fault handler
@@ -1210,8 +1204,6 @@ typedef struct
typedef struct
{
NvProcessorUuid gpu_uuid; // In
NvHandle client; // In
NvHandle smc_part_ref; // In
NV_STATUS rmStatus; // Out
} UVM_TEST_NUMA_CHECK_AFFINITY_PARAMS;
@@ -1387,7 +1379,7 @@ typedef struct
#define UVM_TEST_GET_CPU_CHUNK_ALLOC_SIZES UVM_TEST_IOCTL_BASE(91)
typedef struct
{
NvU32 alloc_size_mask; // Out
NvU64 alloc_size_mask; // Out
NvU32 rmStatus; // Out
} UVM_TEST_GET_CPU_CHUNK_ALLOC_SIZES_PARAMS;
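/*
 * Illustrative sketch, not part of this change: several size fields in this
 * file widen from NvU32 to NvU64. Assuming the allocation-size masks encode
 * each supported power-of-two size as a single set bit (bit position
 * log2(size)), a 32-bit mask cannot describe sizes of 4GB or larger while a
 * 64-bit mask can. Hypothetical standalone helpers:
 */
#include <stdint.h>

static uint64_t example_size_to_mask_bit(uint64_t size)
{
    /* A power-of-two size is itself a single set bit in the mask */
    return size;
}

static int example_mask_supports_size(uint64_t mask, uint64_t size)
{
    return (mask & example_size_to_mask_bit(size)) != 0;
}

/* e.g. a mask of (4096 | 65536 | 2097152) advertises 4KB, 64KB and 2MB chunks:
 * example_mask_supports_size(4096 | 65536 | 2097152, 65536) == 1 */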

View File

@@ -59,7 +59,7 @@ static void tlb_batch_flush_invalidate_per_va(uvm_tlb_batch_t *batch, uvm_push_t
// Use the depth of the max page size as it's the broadest
NvU32 depth = tree->hal->page_table_depth(max_page_size);
UVM_ASSERT(hweight32(entry->page_sizes) > 0);
UVM_ASSERT(hweight64(entry->page_sizes) > 0);
// Do the required membar only after the last invalidate
if (i == batch->count - 1)

View File

@@ -26,6 +26,7 @@
#include "uvm_gpu.h"
#include "uvm_hal.h"
#include "uvm_tools.h"
#include "uvm_tools_init.h"
#include "uvm_va_space.h"
#include "uvm_api.h"
#include "uvm_hal_types.h"
@@ -751,6 +752,7 @@ static UvmEventFaultType g_hal_to_tools_fault_type_table[UVM_FAULT_TYPE_COUNT] =
[UVM_FAULT_TYPE_UNSUPPORTED_KIND] = UvmFaultTypeUnsupportedKind,
[UVM_FAULT_TYPE_REGION_VIOLATION] = UvmFaultTypeRegionViolation,
[UVM_FAULT_TYPE_POISONED] = UvmFaultTypePoison,
[UVM_FAULT_TYPE_CC_VIOLATION] = UvmFaultTypeCcViolation,
};
// TODO: add new value for weak atomics in tools
@@ -942,65 +944,57 @@ static void record_migration_events(void *args)
migration_data_t *mig;
migration_data_t *next;
uvm_va_space_t *va_space = block_mig->va_space;
NvU64 gpu_timestamp = block_mig->start_timestamp_gpu;
uvm_down_read(&va_space->tools.lock);
if (tools_is_event_enabled_version(va_space, UvmEventTypeMigration, UvmToolsEventQueueVersion_V1)) {
UvmEventEntry_V1 entry;
UvmEventMigrationInfo_V1 *info = &entry.eventData.migration;
list_for_each_entry_safe(mig, next, &block_mig->events, events_node) {
UVM_ASSERT(mig->bytes > 0);
list_del(&mig->events_node);
// Initialize fields that are constant throughout the whole block
memset(&entry, 0, sizeof(entry));
info->eventType = UvmEventTypeMigration;
info->srcIndex = uvm_parent_id_value_from_processor_id(block_mig->src);
info->dstIndex = uvm_parent_id_value_from_processor_id(block_mig->dst);
info->beginTimeStamp = block_mig->start_timestamp_cpu;
info->endTimeStamp = block_mig->end_timestamp_cpu;
info->rangeGroupId = block_mig->range_group_id;
list_for_each_entry_safe(mig, next, &block_mig->events, events_node) {
UVM_ASSERT(mig->bytes > 0);
list_del(&mig->events_node);
if (tools_is_event_enabled_version(va_space, UvmEventTypeMigration, UvmToolsEventQueueVersion_V1)) {
UvmEventEntry_V1 entry;
UvmEventMigrationInfo_V1 *info = &entry.eventData.migration;
// Initialize fields that are constant throughout the whole block
memset(&entry, 0, sizeof(entry));
info->eventType = UvmEventTypeMigration;
info->srcIndex = uvm_parent_id_value_from_processor_id(block_mig->src);
info->dstIndex = uvm_parent_id_value_from_processor_id(block_mig->dst);
info->beginTimeStamp = block_mig->start_timestamp_cpu;
info->endTimeStamp = block_mig->end_timestamp_cpu;
info->rangeGroupId = block_mig->range_group_id;
info->address = mig->address;
info->migratedBytes = mig->bytes;
info->beginTimeStampGpu = gpu_timestamp;
info->endTimeStampGpu = mig->end_timestamp_gpu;
info->migrationCause = mig->cause;
gpu_timestamp = mig->end_timestamp_gpu;
kmem_cache_free(g_tools_migration_data_cache, mig);
uvm_tools_record_event_v1(va_space, &entry);
}
}
if (tools_is_event_enabled_version(va_space, UvmEventTypeMigration, UvmToolsEventQueueVersion_V2)) {
UvmEventEntry_V2 entry;
UvmEventMigrationInfo_V2 *info = &entry.eventData.migration;
// Initialize fields that are constant throughout the whole block
memset(&entry, 0, sizeof(entry));
info->eventType = UvmEventTypeMigration;
info->srcIndex = uvm_id_value(block_mig->src);
info->dstIndex = uvm_id_value(block_mig->dst);
info->beginTimeStamp = block_mig->start_timestamp_cpu;
info->endTimeStamp = block_mig->end_timestamp_cpu;
info->rangeGroupId = block_mig->range_group_id;
list_for_each_entry_safe(mig, next, &block_mig->events, events_node) {
UVM_ASSERT(mig->bytes > 0);
list_del(&mig->events_node);
if (tools_is_event_enabled_version(va_space, UvmEventTypeMigration, UvmToolsEventQueueVersion_V2)) {
UvmEventEntry_V2 entry;
UvmEventMigrationInfo_V2 *info = &entry.eventData.migration;
// Initialize fields that are constant throughout the whole block
memset(&entry, 0, sizeof(entry));
info->eventType = UvmEventTypeMigration;
info->srcIndex = uvm_id_value(block_mig->src);
info->dstIndex = uvm_id_value(block_mig->dst);
info->beginTimeStamp = block_mig->start_timestamp_cpu;
info->endTimeStamp = block_mig->end_timestamp_cpu;
info->rangeGroupId = block_mig->range_group_id;
info->address = mig->address;
info->migratedBytes = mig->bytes;
info->beginTimeStampGpu = gpu_timestamp;
info->endTimeStampGpu = mig->end_timestamp_gpu;
info->migrationCause = mig->cause;
gpu_timestamp = mig->end_timestamp_gpu;
kmem_cache_free(g_tools_migration_data_cache, mig);
uvm_tools_record_event_v2(va_space, &entry);
}
gpu_timestamp = mig->end_timestamp_gpu;
kmem_cache_free(g_tools_migration_data_cache, mig);
}
uvm_up_read(&va_space->tools.lock);
@@ -1879,49 +1873,44 @@ static void record_map_remote_events(void *args)
uvm_va_space_t *va_space = block_map_remote->va_space;
uvm_down_read(&va_space->tools.lock);
if (tools_is_event_enabled_version(va_space, UvmEventTypeMapRemote, UvmToolsEventQueueVersion_V1)) {
UvmEventEntry_V1 entry;
list_for_each_entry_safe(map_remote, next, &block_map_remote->events, events_node) {
list_del(&map_remote->events_node);
memset(&entry, 0, sizeof(entry));
if (tools_is_event_enabled_version(va_space, UvmEventTypeMapRemote, UvmToolsEventQueueVersion_V1)) {
UvmEventEntry_V1 entry;
entry.eventData.mapRemote.eventType = UvmEventTypeMapRemote;
entry.eventData.mapRemote.srcIndex = uvm_parent_id_value_from_processor_id(block_map_remote->src);
entry.eventData.mapRemote.dstIndex = uvm_parent_id_value_from_processor_id(block_map_remote->dst);
entry.eventData.mapRemote.mapRemoteCause = block_map_remote->cause;
entry.eventData.mapRemote.timeStamp = block_map_remote->timestamp;
memset(&entry, 0, sizeof(entry));
list_for_each_entry_safe(map_remote, next, &block_map_remote->events, events_node) {
list_del(&map_remote->events_node);
entry.eventData.mapRemote.address = map_remote->address;
entry.eventData.mapRemote.size = map_remote->size;
entry.eventData.mapRemote.timeStampGpu = map_remote->timestamp_gpu;
kmem_cache_free(g_tools_map_remote_data_cache, map_remote);
entry.eventData.mapRemote.eventType = UvmEventTypeMapRemote;
entry.eventData.mapRemote.srcIndex = uvm_parent_id_value_from_processor_id(block_map_remote->src);
entry.eventData.mapRemote.dstIndex = uvm_parent_id_value_from_processor_id(block_map_remote->dst);
entry.eventData.mapRemote.mapRemoteCause = block_map_remote->cause;
entry.eventData.mapRemote.timeStamp = block_map_remote->timestamp;
entry.eventData.mapRemote.address = map_remote->address;
entry.eventData.mapRemote.size = map_remote->size;
entry.eventData.mapRemote.timeStampGpu = map_remote->timestamp_gpu;
uvm_tools_record_event_v1(va_space, &entry);
}
}
if (tools_is_event_enabled_version(va_space, UvmEventTypeMapRemote, UvmToolsEventQueueVersion_V2)) {
UvmEventEntry_V2 entry;
memset(&entry, 0, sizeof(entry));
if (tools_is_event_enabled_version(va_space, UvmEventTypeMapRemote, UvmToolsEventQueueVersion_V2)) {
UvmEventEntry_V2 entry;
entry.eventData.mapRemote.eventType = UvmEventTypeMapRemote;
entry.eventData.mapRemote.srcIndex = uvm_id_value(block_map_remote->src);
entry.eventData.mapRemote.dstIndex = uvm_id_value(block_map_remote->dst);
entry.eventData.mapRemote.mapRemoteCause = block_map_remote->cause;
entry.eventData.mapRemote.timeStamp = block_map_remote->timestamp;
memset(&entry, 0, sizeof(entry));
list_for_each_entry_safe(map_remote, next, &block_map_remote->events, events_node) {
list_del(&map_remote->events_node);
entry.eventData.mapRemote.address = map_remote->address;
entry.eventData.mapRemote.size = map_remote->size;
entry.eventData.mapRemote.timeStampGpu = map_remote->timestamp_gpu;
kmem_cache_free(g_tools_map_remote_data_cache, map_remote);
entry.eventData.mapRemote.eventType = UvmEventTypeMapRemote;
entry.eventData.mapRemote.srcIndex = uvm_id_value(block_map_remote->src);
entry.eventData.mapRemote.dstIndex = uvm_id_value(block_map_remote->dst);
entry.eventData.mapRemote.mapRemoteCause = block_map_remote->cause;
entry.eventData.mapRemote.timeStamp = block_map_remote->timestamp;
entry.eventData.mapRemote.address = map_remote->address;
entry.eventData.mapRemote.size = map_remote->size;
entry.eventData.mapRemote.timeStampGpu = map_remote->timestamp_gpu;
uvm_tools_record_event_v2(va_space, &entry);
}
kmem_cache_free(g_tools_map_remote_data_cache, map_remote);
}
uvm_up_read(&va_space->tools.lock);
@@ -2064,15 +2053,15 @@ NV_STATUS uvm_api_tools_init_event_tracker(UVM_TOOLS_INIT_EVENT_TRACKER_PARAMS *
NV_STATUS status = NV_OK;
uvm_tools_event_tracker_t *event_tracker;
if (params->requestedVersion != UvmToolsEventQueueVersion_V1 &&
params->requestedVersion != UvmToolsEventQueueVersion_V2)
if (params->version != UvmToolsEventQueueVersion_V1 &&
params->version != UvmToolsEventQueueVersion_V2)
return NV_ERR_INVALID_ARGUMENT;
event_tracker = nv_kmem_cache_zalloc(g_tools_event_tracker_cache, NV_UVM_GFP_FLAGS);
if (event_tracker == NULL)
return NV_ERR_NO_MEMORY;
event_tracker->version = params->requestedVersion;
event_tracker->version = params->version;
event_tracker->uvm_file = fget(params->uvmFd);
if (event_tracker->uvm_file == NULL) {
@@ -2155,8 +2144,6 @@ NV_STATUS uvm_api_tools_init_event_tracker(UVM_TOOLS_INIT_EVENT_TRACKER_PARAMS *
goto fail;
}
params->grantedVersion = params->requestedVersion;
return NV_OK;
fail:
@@ -2690,32 +2677,22 @@ NV_STATUS uvm_api_tools_get_processor_uuid_table(UVM_TOOLS_GET_PROCESSOR_UUID_TA
NvProcessorUuid *uuids;
NvU64 remaining;
uvm_gpu_t *gpu;
NvU32 count = params->count;
uvm_va_space_t *va_space = uvm_va_space_get(filp);
NvU32 version = UvmToolsEventQueueVersion_V2;
NvU32 version = params->version;
NvU32 count;
// Prior to Multi-MIG support, params->count was always zero meaning the
// input array was size UVM_MAX_PROCESSORS_V1 or 33 at that time.
if (count == 0 && params->tablePtr) {
version = UvmToolsEventQueueVersion_V1;
if (version == UvmToolsEventQueueVersion_V1)
count = UVM_MAX_PROCESSORS_V1;
}
else if (count == 0 || count > UVM_ID_MAX_PROCESSORS) {
// Note that we don't rely on the external API definition
// UVM_MAX_PROCESSORS since the kernel determines the array size needed
// and reports the number of processors found to the caller.
else if (version == UvmToolsEventQueueVersion_V2)
count = UVM_ID_MAX_PROCESSORS;
}
// Return which version of the table is being returned.
params->version = version;
else
return NV_ERR_INVALID_ARGUMENT;
uuids = uvm_kvmalloc_zero(sizeof(NvProcessorUuid) * count);
if (uuids == NULL)
return NV_ERR_NO_MEMORY;
uvm_uuid_copy(&uuids[UVM_ID_CPU_VALUE], &NV_PROCESSOR_UUID_CPU_DEFAULT);
params->count = 1;
uvm_va_space_down_read(va_space);
for_each_va_space_gpu(gpu, va_space) {
@@ -2733,20 +2710,11 @@ NV_STATUS uvm_api_tools_get_processor_uuid_table(UVM_TOOLS_GET_PROCESSOR_UUID_TA
uuid = &gpu->uuid;
}
if (id_value < count)
uvm_uuid_copy(&uuids[id_value], uuid);
// Return the actual count even if the UUID isn't returned due to
// limited input array size.
if (id_value + 1 > params->count)
params->count = id_value + 1;
uvm_uuid_copy(&uuids[id_value], uuid);
}
uvm_va_space_up_read(va_space);
if (params->tablePtr)
remaining = nv_copy_to_user((void *)params->tablePtr, uuids, sizeof(NvProcessorUuid) * count);
else
remaining = 0;
remaining = nv_copy_to_user((void *)params->tablePtr, uuids, sizeof(NvProcessorUuid) * count);
uvm_kvfree(uuids);
if (remaining != 0)

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2019 NVIDIA Corporation
Copyright (c) 2016-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -110,6 +110,11 @@ void uvm_tools_broadcast_access_counter(uvm_gpu_t *gpu,
const uvm_access_counter_buffer_entry_t *buffer_entry,
bool on_managed_phys);
void uvm_tools_record_access_counter(uvm_va_space_t *va_space,
uvm_gpu_id_t gpu_id,
const uvm_access_counter_buffer_entry_t *buffer_entry,
bool on_managed_phys);
void uvm_tools_test_hmm_split_invalidate(uvm_va_space_t *va_space);
// schedules completed events and then waits for them to be dispatched

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2021 NVIDIA Corporation
Copyright (c) 2021-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -25,6 +25,7 @@
#include "uvm_global.h"
#include "uvm_gpu.h"
#include "uvm_hal.h"
#include "hwref/turing/tu102/dev_fault.h"
static void clear_replayable_faults_interrupt(uvm_parent_gpu_t *parent_gpu)
{
@@ -65,3 +66,58 @@ void uvm_hal_turing_disable_replayable_faults(uvm_parent_gpu_t *parent_gpu)
// interrupt condition is no longer true.
clear_replayable_faults_interrupt(parent_gpu);
}
static bool client_id_ce(NvU16 client_id)
{
if (client_id >= NV_PFAULT_CLIENT_HUB_HSCE0 && client_id <= NV_PFAULT_CLIENT_HUB_HSCE9)
return true;
switch (client_id) {
case NV_PFAULT_CLIENT_HUB_CE0:
case NV_PFAULT_CLIENT_HUB_CE1:
case NV_PFAULT_CLIENT_HUB_CE2:
return true;
}
return false;
}
static bool client_id_host(NvU16 client_id)
{
switch (client_id) {
case NV_PFAULT_CLIENT_HUB_HOST:
case NV_PFAULT_CLIENT_HUB_HOST_CPU:
return true;
}
return false;
}
uvm_mmu_engine_type_t uvm_hal_turing_fault_buffer_get_mmu_engine_type(NvU16 mmu_engine_id,
uvm_fault_client_type_t client_type,
NvU16 client_id)
{
// Servicing CE and Host (HUB clients) faults.
if (client_type == UVM_FAULT_CLIENT_TYPE_HUB) {
if (client_id_ce(client_id)) {
UVM_ASSERT(mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_CE0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_CE8);
return UVM_MMU_ENGINE_TYPE_CE;
}
if (client_id_host(client_id)) {
UVM_ASSERT(mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_HOST0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_HOST14);
return UVM_MMU_ENGINE_TYPE_HOST;
}
}
// We shouldn't be servicing faults from any engines other than GR.
UVM_ASSERT_MSG(client_id <= NV_PFAULT_CLIENT_GPC_T1_39, "Unexpected client ID: 0x%x\n", client_id);
UVM_ASSERT_MSG(mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_GRAPHICS && mmu_engine_id < NV_PFAULT_MMU_ENG_ID_BAR1,
"Unexpected engine ID: 0x%x\n",
mmu_engine_id);
UVM_ASSERT(client_type == UVM_FAULT_CLIENT_TYPE_GPC);
return UVM_MMU_ENGINE_TYPE_GRAPHICS;
}
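/*
 * Illustrative usage sketch, not part of this change: a replayable fault from
 * a HUB copy-engine client resolves to the CE engine type, while GPC clients
 * fall through to GRAPHICS. The identifiers below are the ones used in the
 * function above; the wrapper itself is hypothetical and relies on the
 * driver-internal headers.
 */
static uvm_mmu_engine_type_t example_turing_ce_fault_engine_type(void)
{
    /* client_id_ce() matches HUB_CE0 and the engine id lies in the asserted
     * CE0..CE8 range, so this returns UVM_MMU_ENGINE_TYPE_CE. */
    return uvm_hal_turing_fault_buffer_get_mmu_engine_type(NV_PFAULT_MMU_ENG_ID_CE0,
                                                           UVM_FAULT_CLIENT_TYPE_HUB,
                                                           NV_PFAULT_CLIENT_HUB_CE0);
}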

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2020 NVIDIA Corporation
Copyright (c) 2017-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -167,18 +167,3 @@ uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_turing(NvU64 big_page_size)
return &turing_mmu_mode_hal;
}
uvm_mmu_engine_type_t uvm_hal_turing_mmu_engine_id_to_type(NvU16 mmu_engine_id)
{
if (mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_HOST0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_HOST14)
return UVM_MMU_ENGINE_TYPE_HOST;
if (mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_CE0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_CE8)
return UVM_MMU_ENGINE_TYPE_CE;
// We shouldn't be servicing faults from any other engines
UVM_ASSERT_MSG(mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_GRAPHICS && mmu_engine_id < NV_PFAULT_MMU_ENG_ID_BAR1,
"Unexpected engine ID: 0x%x\n", mmu_engine_id);
return UVM_MMU_ENGINE_TYPE_GRAPHICS;
}

View File

@@ -491,6 +491,7 @@ typedef enum
UvmFaultTypeUnsupportedKind = 13,
UvmFaultTypeRegionViolation = 14,
UvmFaultTypePoison = 15,
UvmFaultTypeCcViolation = 16,
// ---- Add new values above this line
UvmEventNumFaultTypes
} UvmEventFaultType;

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -959,6 +959,7 @@ NV_STATUS uvm_test_check_channel_va_space(UVM_TEST_CHECK_CHANNEL_VA_SPACE_PARAMS
uvm_va_space_t *va_space = NULL;
uvm_va_space_t *channel_va_space;
uvm_gpu_t *gpu;
uvm_gpu_t *channel_gpu;
uvm_fault_buffer_entry_t fault_entry;
UvmGpuChannelInstanceInfo *channel_info;
NV_STATUS status;
@@ -1003,7 +1004,7 @@ NV_STATUS uvm_test_check_channel_va_space(UVM_TEST_CHECK_CHANNEL_VA_SPACE_PARAMS
goto out;
}
// Craft enough of the fault entry to do a VA space translation
// Craft enough of the fault entry to do a VA space translation.
fault_entry.fault_type = UVM_FAULT_TYPE_INVALID_PTE;
if (channel_info->sysmem)
@@ -1034,10 +1035,13 @@ NV_STATUS uvm_test_check_channel_va_space(UVM_TEST_CHECK_CHANNEL_VA_SPACE_PARAMS
// We can ignore the return code because this ioctl only cares about whether
// the provided channel + VEID matches the provided VA space. In all of the
// non-NV_OK cases the translation will fail and we should return
// NV_ERR_INVALID_CHANNEL. channel_va_space == NULL for all such cases.
(void)uvm_parent_gpu_fault_entry_to_va_space(gpu->parent, &fault_entry, &channel_va_space);
// NV_ERR_INVALID_CHANNEL. channel_va_space and channel_gpu are NULL for all such cases.
(void)uvm_parent_gpu_fault_entry_to_va_space(gpu->parent,
&fault_entry,
&channel_va_space,
&channel_gpu);
if (channel_va_space == va_space)
if (channel_va_space == va_space && channel_gpu == gpu)
status = NV_OK;
else
status = NV_ERR_INVALID_CHANNEL;

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -30,9 +30,9 @@
#include "uvm_rb_tree.h"
#include "nv-kref.h"
// This structure contains the VA spaces of all the subcontexts in a TSG. It
// This structure contains the GPU VA spaces of all the subcontexts in a TSG. It
// is stored in a per-GPU UVM RB tree and is required to perform instance_ptr
// to VA space translations when channels are registered in a subcontext,
// to GPU VA space translations when channels are registered in a subcontext,
// since SM fault/access counter notification packets may report any
// instance_ptr in the TSG.
typedef struct
@@ -46,7 +46,7 @@ typedef struct
// Array of per-subcontext information
struct
{
uvm_va_space_t *va_space;
uvm_gpu_va_space_t *gpu_va_space;
// Number of instance pointers referencing this specific subcontext
NvU32 refcount;

View File

@@ -725,8 +725,9 @@ bool uvm_va_block_cpu_is_region_resident_on(uvm_va_block_t *va_block, int nid, u
}
// Return the preferred NUMA node ID for the block's policy.
// If the preferred node ID is NUMA_NO_NODE, the current NUMA node ID
// is returned.
// If the preferred node ID is NUMA_NO_NODE, the nearest NUMA node ID
// with memory is returned. In most cases, this should be the current
// NUMA node.
static int uvm_va_block_context_get_node(uvm_va_block_context_t *va_block_context)
{
if (va_block_context->make_resident.dest_nid != NUMA_NO_NODE)
@@ -1410,102 +1411,6 @@ error:
return status;
}
static NV_STATUS block_sysmem_mappings_add_gpu_chunk(uvm_va_block_t *block,
uvm_gpu_t *local_gpu,
uvm_gpu_chunk_t *chunk,
uvm_gpu_t *accessing_gpu)
{
NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&local_gpu->pmm, chunk, accessing_gpu);
return uvm_pmm_sysmem_mappings_add_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings,
peer_addr,
block->start + chunk->va_block_page_index * PAGE_SIZE,
uvm_gpu_chunk_get_size(chunk),
block,
local_gpu->id);
}
static void block_sysmem_mappings_remove_gpu_chunk(uvm_gpu_t *local_gpu,
uvm_gpu_chunk_t *chunk,
uvm_gpu_t *accessing_gpu)
{
NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&local_gpu->pmm, chunk, accessing_gpu);
uvm_pmm_sysmem_mappings_remove_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings, peer_addr);
}
static NV_STATUS block_gpu_map_all_chunks_indirect_peer(uvm_va_block_t *block,
uvm_gpu_t *local_gpu,
uvm_gpu_t *accessing_gpu)
{
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, local_gpu->id);
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
size_t num_chunks, i;
NV_STATUS status;
UVM_ASSERT(uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(local_gpu->id)],
accessing_gpu->id));
// If no chunks are allocated currently, the mappings will be created later
// at chunk allocation.
if (!gpu_state || !gpu_state->chunks)
return NV_OK;
num_chunks = block_num_gpu_chunks(block, local_gpu);
for (i = 0; i < num_chunks; i++) {
uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
if (!chunk)
continue;
status = uvm_pmm_gpu_indirect_peer_map(&local_gpu->pmm, chunk, accessing_gpu);
if (status != NV_OK)
goto error;
status = block_sysmem_mappings_add_gpu_chunk(block, local_gpu, chunk, accessing_gpu);
if (status != NV_OK)
goto error;
}
return NV_OK;
error:
while (i-- > 0) {
uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
if (chunk) {
// Indirect peer mappings are removed lazily by PMM, so if an error
// occurs the mappings established above will be removed when the
// chunk is freed later on. We only need to remove the sysmem
// reverse mappings.
block_sysmem_mappings_remove_gpu_chunk(local_gpu, chunk, accessing_gpu);
}
}
return status;
}
// Mappings for indirect peers are removed lazily by PMM, but we need to remove
// the entries from the reverse map.
static void block_gpu_unmap_all_chunks_indirect_peer(uvm_va_block_t *block,
uvm_gpu_t *local_gpu,
uvm_gpu_t *accessing_gpu)
{
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, local_gpu->id);
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
size_t num_chunks, i;
UVM_ASSERT(uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(local_gpu->id)],
accessing_gpu->id));
// Exit if no chunks are allocated currently.
if (!gpu_state || !gpu_state->chunks)
return;
num_chunks = block_num_gpu_chunks(block, local_gpu);
for (i = 0; i < num_chunks; i++) {
uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
if (chunk)
block_sysmem_mappings_remove_gpu_chunk(local_gpu, chunk, accessing_gpu);
}
}
// Retrieves the gpu_state for the given GPU. The returned pointer is
// internally managed and will be allocated (and freed) automatically,
// rather than by the caller.
@@ -1628,65 +1533,6 @@ void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_regio
uvm_processor_mask_clear(&va_block->resident, UVM_ID_CPU);
}
// Create physical mappings to allow other GPUs to access this chunk.
static NV_STATUS block_map_indirect_peers_to_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
{
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
uvm_gpu_t *accessing_gpu, *remove_gpu;
NV_STATUS status;
// Unlike uvm_va_block_map_cpu_chunk_on_gpus, this function isn't called on
// the eviction path, so we can assume that the VA space is locked.
//
// TODO: Bug 2007346: In the future we may want to enable eviction to peers,
// meaning we may need to allocate peer memory and map it on the
// eviction path. That will require making sure that peers can't be
// enabled or disabled either in the VA space or globally within this
// function.
uvm_assert_rwsem_locked(&va_space->lock);
uvm_assert_mutex_locked(&block->lock);
for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
status = uvm_pmm_gpu_indirect_peer_map(&gpu->pmm, chunk, accessing_gpu);
if (status != NV_OK)
goto error;
status = block_sysmem_mappings_add_gpu_chunk(block, gpu, chunk, accessing_gpu);
if (status != NV_OK)
goto error;
}
return NV_OK;
error:
for_each_va_space_gpu_in_mask(remove_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
if (remove_gpu == accessing_gpu)
break;
// Indirect peer mappings are removed lazily by PMM, so if an error
// occurs the mappings established above will be removed when the
// chunk is freed later on. We only need to remove the sysmem
// reverse mappings.
block_sysmem_mappings_remove_gpu_chunk(gpu, chunk, remove_gpu);
}
return status;
}
static void block_unmap_indirect_peers_from_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
{
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
uvm_gpu_t *peer_gpu;
uvm_assert_rwsem_locked(&va_space->lock);
uvm_assert_mutex_locked(&block->lock);
// Indirect peer mappings are removed lazily by PMM, so we only need to
// remove the sysmem reverse mappings.
for_each_va_space_gpu_in_mask(peer_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)])
block_sysmem_mappings_remove_gpu_chunk(gpu, chunk, peer_gpu);
}
// Mark a CPU page as dirty.
static void block_mark_cpu_page_dirty(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
{
@@ -1711,33 +1557,6 @@ static bool block_cpu_page_is_dirty(uvm_va_block_t *block, uvm_page_index_t page
return uvm_cpu_chunk_is_dirty(chunk, page_index - chunk_region.first);
}
static NV_STATUS block_alloc_cpu_chunk_inject_error(uvm_va_block_t *block,
uvm_chunk_size_t alloc_size,
uvm_cpu_chunk_alloc_flags_t flags,
int nid,
uvm_cpu_chunk_t **chunk)
{
uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
if (block_test) {
// Return out of memory error if the tests have requested it. As opposed
// to other error injection settings, this one fails N times and then
// succeeds.
// TODO: Bug 3701182: This will print a warning in Linux kernels newer
// than 5.16.0-rc1+.
if (block_test->inject_cpu_pages_allocation_error_count) {
if (block_test->inject_cpu_pages_allocation_error_count != ~(NvU32)0)
block_test->inject_cpu_pages_allocation_error_count--;
return NV_ERR_NO_MEMORY;
}
if (block_test->cpu_chunk_allocation_actual_id != NUMA_NO_NODE)
nid = block_test->cpu_chunk_allocation_actual_id;
}
return uvm_cpu_chunk_alloc(alloc_size, flags, nid, chunk);
}
// Allocate a CPU chunk with the given properties. This may involve retrying if
// allocations fail. Allocating larger chunk sizes takes priority over
// allocating on the specified node in the following manner:
@@ -1754,19 +1573,36 @@ static NV_STATUS block_alloc_cpu_chunk(uvm_va_block_t *block,
int nid,
uvm_cpu_chunk_t **chunk)
{
uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
NV_STATUS status = NV_ERR_NO_MEMORY;
uvm_chunk_size_t alloc_size;
bool numa_fallback = false;
if (block_test) {
// Return out of memory error if the tests have requested it. As opposed
// to other error injection settings, this one fails N times and then
// succeeds.
// TODO: Bug 3701182: This will print a warning in Linux kernels newer
// than 5.16.0-rc1+.
if (block_test->inject_cpu_chunk_allocation_error_count) {
if (block_test->inject_cpu_chunk_allocation_error_count != ~(NvU32)0)
block_test->inject_cpu_chunk_allocation_error_count--;
return NV_ERR_NO_MEMORY;
}
if (block_test->cpu_chunk_allocation_actual_id != NUMA_NO_NODE)
nid = block_test->cpu_chunk_allocation_actual_id;
}
for_each_chunk_size_rev(alloc_size, cpu_allocation_sizes) {
status = block_alloc_cpu_chunk_inject_error(block, alloc_size, flags, nid, chunk);
status = uvm_cpu_chunk_alloc(alloc_size, flags, nid, chunk);
if (status == NV_OK)
break;
if (flags & UVM_CPU_CHUNK_ALLOC_FLAGS_STRICT) {
flags &= ~UVM_CPU_CHUNK_ALLOC_FLAGS_STRICT;
numa_fallback = true;
status = block_alloc_cpu_chunk_inject_error(block, alloc_size, flags, NUMA_NO_NODE, chunk);
status = uvm_cpu_chunk_alloc(alloc_size, flags, NUMA_NO_NODE, chunk);
if (status == NV_OK)
break;
}
@@ -2066,6 +1902,7 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
uvm_page_mask_t *allocated_mask;
uvm_cpu_chunk_alloc_flags_t alloc_flags = UVM_CPU_CHUNK_ALLOC_FLAGS_NONE;
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
const uvm_va_policy_t *policy = uvm_va_policy_get_region(block, populate_region);
uvm_page_index_t page_index;
uvm_gpu_id_t id;
int preferred_nid = block_context->make_resident.dest_nid;
@@ -2073,6 +1910,10 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
if (block_test && block_test->cpu_chunk_allocation_target_id != NUMA_NO_NODE)
preferred_nid = block_test->cpu_chunk_allocation_target_id;
// If the VA range has a preferred NUMA node, use it.
if (preferred_nid == NUMA_NO_NODE)
preferred_nid = policy->preferred_nid;
// TODO: Bug 4158598: Using NUMA_NO_NODE for staging allocations is sub-optimal.
if (preferred_nid != NUMA_NO_NODE) {
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, preferred_nid);
@@ -2123,13 +1964,12 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
uvm_page_mask_t *node_pages_mask = &block_context->make_resident.node_pages_mask;
uvm_chunk_sizes_mask_t allocation_sizes;
if (uvm_page_mask_test(allocated_mask, page_index)) {
if (uvm_page_mask_test(allocated_mask, page_index) ||
uvm_va_block_cpu_is_page_resident_on(block, preferred_nid, page_index)) {
page_index = uvm_va_block_next_unset_page_in_mask(populate_region, allocated_mask, page_index) - 1;
continue;
}
UVM_ASSERT(!uvm_va_block_cpu_is_page_resident_on(block, preferred_nid, page_index));
allocation_sizes = block_calculate_largest_alloc_size(block,
page_index,
allocated_mask,
@@ -3129,17 +2969,13 @@ static NV_STATUS block_populate_gpu_chunk(uvm_va_block_t *block,
// compile-time that it can store VA Block page indexes.
BUILD_BUG_ON(PAGES_PER_UVM_VA_BLOCK >= PAGE_SIZE);
status = block_map_indirect_peers_to_gpu_chunk(block, gpu, chunk);
if (status != NV_OK)
goto chunk_unmap;
if (block_test && block_test->inject_populate_error) {
block_test->inject_populate_error = false;
// Use NV_ERR_MORE_PROCESSING_REQUIRED to force a retry rather than
// causing a fatal OOM failure.
status = NV_ERR_MORE_PROCESSING_REQUIRED;
goto chunk_unmap_indirect_peers;
goto chunk_unmap;
}
// Record the used chunk so that it can be unpinned at the end of the whole
@@ -3149,9 +2985,6 @@ static NV_STATUS block_populate_gpu_chunk(uvm_va_block_t *block,
return NV_OK;
chunk_unmap_indirect_peers:
block_unmap_indirect_peers_from_gpu_chunk(block, gpu, chunk);
chunk_unmap:
uvm_mmu_chunk_unmap(chunk, &block->tracker);
@@ -3326,8 +3159,9 @@ static uvm_gpu_chunk_t *block_phys_page_chunk(uvm_va_block_t *block, block_phys_
return chunk;
}
// Get the physical GPU address of a block's page from the POV of the specified GPU
// This is the address that should be used for making PTEs for the specified GPU.
// Get the physical GPU address of a block's page from the POV of the specified
// GPU. This is the address that should be used for making PTEs for the
// specified GPU.
static uvm_gpu_phys_address_t block_phys_page_address(uvm_va_block_t *block,
block_phys_page_t block_page,
uvm_gpu_t *gpu)
@@ -4033,7 +3867,7 @@ static NV_STATUS block_copy_pages(uvm_va_block_t *va_block,
UVM_ASSERT(dst_chunk);
UVM_ASSERT(uvm_cpu_chunk_get_size(src_chunk) >= uvm_va_block_region_size(region));
UVM_ASSERT(uvm_cpu_chunk_get_size(src_chunk) <= uvm_cpu_chunk_get_size(dst_chunk));
UVM_ASSERT(uvm_va_block_region_size(region) <= uvm_cpu_chunk_get_size(dst_chunk));
// CPU-to-CPU copies using memcpy() don't have any inherent ordering with
// copies using GPU CEs. So, we have to make sure that all previously
@@ -5128,7 +4962,7 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
uvm_page_mask_t *dst_resident_mask;
uvm_page_mask_t *migrated_pages;
uvm_page_mask_t *staged_pages;
uvm_page_mask_t *first_touch_mask;
uvm_page_mask_t *scratch_residency_mask;
// TODO: Bug 3660922: need to implement HMM read duplication support.
UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
@@ -5147,6 +4981,10 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
uvm_assert_mutex_locked(&va_block->lock);
UVM_ASSERT(!uvm_va_block_is_dead(va_block));
scratch_residency_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS);
if (!scratch_residency_mask)
return NV_ERR_NO_MEMORY;
// For pages that are entering read-duplication we need to unmap remote
// mappings and revoke RW and higher access permissions.
//
@@ -5173,12 +5011,12 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
status = block_prep_read_duplicate_mapping(va_block, va_block_context, src_id, region, preprocess_page_mask);
if (status != NV_OK)
return status;
goto out;
}
status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask);
if (status != NV_OK)
return status;
goto out;
status = block_copy_resident_pages(va_block,
va_block_context,
@@ -5188,22 +5026,17 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
prefetch_page_mask,
UVM_VA_BLOCK_TRANSFER_MODE_COPY);
if (status != NV_OK)
return status;
goto out;
// Pages that weren't resident anywhere else were populated at the
// destination directly. Mark them as resident now, since there were no
// errors from block_copy_resident_pages() above.
// Note that va_block_context->scratch_page_mask is passed to
// block_copy_set_first_touch_residency() which is generally unsafe but in
// this case, block_copy_set_first_touch_residency() copies page_mask
// before scratch_page_mask could be clobbered.
migrated_pages = &va_block_context->make_resident.pages_migrated;
first_touch_mask = &va_block_context->scratch_page_mask;
uvm_page_mask_init_from_region(first_touch_mask, region, page_mask);
uvm_page_mask_andnot(first_touch_mask, first_touch_mask, migrated_pages);
uvm_page_mask_init_from_region(scratch_residency_mask, region, page_mask);
uvm_page_mask_andnot(scratch_residency_mask, scratch_residency_mask, migrated_pages);
if (!uvm_page_mask_empty(first_touch_mask))
block_copy_set_first_touch_residency(va_block, va_block_context, dest_id, region, first_touch_mask);
if (!uvm_page_mask_empty(scratch_residency_mask))
block_copy_set_first_touch_residency(va_block, va_block_context, dest_id, region, scratch_residency_mask);
staged_pages = &va_block_context->make_resident.pages_staged;
if (!UVM_ID_IS_CPU(dest_id) && !uvm_page_mask_empty(staged_pages)) {
@@ -5215,6 +5048,18 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
if (!uvm_page_mask_empty(migrated_pages)) {
if (UVM_ID_IS_CPU(dest_id)) {
// Check if the CPU is already in the resident set of processors.
// We need to do this since we can't have multiple NUMA nodes with
// resident pages.
// If any of the migrated pages were already resident on the CPU, the
// residency has to be switched to the destination NUMA node.
if (uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) &&
uvm_page_mask_and(scratch_residency_mask,
uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE),
migrated_pages)) {
uvm_va_block_cpu_clear_resident_all_chunks(va_block, va_block_context, scratch_residency_mask);
}
uvm_va_block_cpu_set_resident_all_chunks(va_block, va_block_context, migrated_pages);
}
else {
@@ -5243,7 +5088,9 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
// Check state of all chunks after residency change.
// TODO: Bug 4207783: Check both CPU and GPU chunks.
UVM_ASSERT(block_check_cpu_chunks(va_block));
return NV_OK;
out:
kmem_cache_free(g_uvm_page_mask_cache, scratch_residency_mask);
return status;
}
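/*
 * Illustrative sketch, not part of this change: the first-touch handling above
 * is plain mask arithmetic -- pages requested in the region that were not
 * migrated from anywhere were populated directly at the destination. A
 * standalone userspace analogue with hypothetical 8-page masks:
 */
#include <stdint.h>

static uint8_t example_first_touch_mask(uint8_t requested_pages, uint8_t migrated_pages)
{
    /* Analogous to uvm_page_mask_init_from_region() followed by
     * uvm_page_mask_andnot() with the migrated-pages mask */
    return requested_pages & (uint8_t)~migrated_pages;
}

/* e.g. requested 0x3c, migrated 0x0c: pages 0x30 are first-touch resident at
 * the destination. */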
// Looks up the current CPU mapping state of page from the
@@ -5322,35 +5169,6 @@ static bool block_has_valid_mapping_cpu(uvm_va_block_t *block, uvm_va_block_regi
return true;
}
static bool block_check_chunk_indirect_peers(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
{
uvm_gpu_t *accessing_gpu;
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
if (!uvm_pmm_sysmem_mappings_indirect_supported())
return true;
for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
uvm_reverse_map_t reverse_map;
size_t num_mappings;
num_mappings = uvm_pmm_sysmem_mappings_dma_to_virt(&accessing_gpu->pmm_reverse_sysmem_mappings,
peer_addr,
uvm_gpu_chunk_get_size(chunk),
&reverse_map,
1);
UVM_ASSERT(num_mappings == 1);
UVM_ASSERT(reverse_map.va_block == block);
UVM_ASSERT(reverse_map.region.first == chunk->va_block_page_index);
UVM_ASSERT(uvm_va_block_region_size(reverse_map.region) == uvm_gpu_chunk_get_size(chunk));
uvm_va_block_release_no_destroy(reverse_map.va_block);
}
return true;
}
// Sanity check the given GPU's chunks array
static bool block_check_gpu_chunks(uvm_va_block_t *block, uvm_gpu_id_t id)
{
@@ -5408,8 +5226,6 @@ static bool block_check_gpu_chunks(uvm_va_block_t *block, uvm_gpu_id_t id)
UVM_ASSERT(chunk->va_block == block);
UVM_ASSERT(chunk->va_block_page_index == page_index);
UVM_ASSERT(block_check_chunk_indirect_peers(block, gpu, chunk));
}
page_index += chunk_size / PAGE_SIZE;
@@ -5528,13 +5344,15 @@ static bool block_check_mappings_page(uvm_va_block_t *block,
*block->read_duplicated_pages.bitmap);
// Test read_duplicated_pages mask
UVM_ASSERT_MSG((uvm_processor_mask_get_count(resident_processors) <= 1 &&
!uvm_page_mask_test(&block->read_duplicated_pages, page_index)) ||
(uvm_processor_mask_get_count(resident_processors) > 1 &&
uvm_page_mask_test(&block->read_duplicated_pages, page_index)),
UVM_ASSERT_MSG((!uvm_page_mask_test(&block->read_duplicated_pages, page_index) &&
uvm_processor_mask_get_count(resident_processors) <= 1) ||
(uvm_page_mask_test(&block->read_duplicated_pages, page_index) &&
uvm_processor_mask_get_count(resident_processors) >= 1),
"Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
*resident_processors->bitmap,
*read_mappings->bitmap, *write_mappings->bitmap, *atomic_mappings->bitmap,
*read_mappings->bitmap,
*write_mappings->bitmap,
*atomic_mappings->bitmap,
*va_space->system_wide_atomics_enabled_processors.bitmap,
*block->read_duplicated_pages.bitmap);
@@ -5613,17 +5431,6 @@ static bool block_check_mappings_page(uvm_va_block_t *block,
*va_space->system_wide_atomics_enabled_processors.bitmap);
for_each_id_in_mask(id, read_mappings) {
UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(id)], residency));
if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(residency)], id)) {
uvm_gpu_t *resident_gpu = uvm_va_space_get_gpu(va_space, residency);
uvm_gpu_t *mapped_gpu = uvm_va_space_get_gpu(va_space, id);
uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block,
block_phys_page(residency, NUMA_NO_NODE, page_index),
NULL);
// This function will assert if no mapping exists
(void)uvm_pmm_gpu_indirect_peer_addr(&resident_gpu->pmm, chunk, mapped_gpu);
}
}
}
}
@@ -6018,7 +5825,7 @@ static bool block_has_remote_mapping_gpu(uvm_va_block_t *block,
if (uvm_page_mask_empty(mapped_pages))
return false;
return !uvm_id_equal(uvm_va_range_get_policy(block->va_range)->preferred_location, gpu_id);
return !uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(block->va_range), gpu_id, NUMA_NO_NODE);
}
// Remote pages are pages which are mapped but not resident locally
@@ -8361,6 +8168,7 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
uvm_va_block_context_t *block_context,
uvm_gpu_t *gpu,
uvm_processor_id_t resident_id,
int resident_nid,
uvm_page_mask_t *map_page_mask,
uvm_prot_t new_prot,
uvm_tracker_t *out_tracker)
@@ -8370,7 +8178,7 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
uvm_push_t push;
NV_STATUS status;
uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask;
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id, NUMA_NO_NODE);
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id, resident_nid);
uvm_pte_bits_gpu_t pte_bit;
uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(new_prot);
uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
@@ -8379,8 +8187,10 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
UVM_ASSERT(map_page_mask);
UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], gpu->id));
if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id))
UVM_ASSERT(uvm_id_equal(resident_id, uvm_va_range_get_policy(va_block->va_range)->preferred_location));
if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id)) {
uvm_va_policy_t *policy = uvm_va_range_get_policy(va_block->va_range);
UVM_ASSERT(uvm_va_policy_preferred_location_equal(policy, resident_id, policy->preferred_nid));
}
UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask,
map_page_mask,
@@ -8482,18 +8292,27 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
return uvm_tracker_add_push_safe(out_tracker, &push);
}
// allowed_nid_mask is only valid if the CPU is set in allowed_mask.
static void map_get_allowed_destinations(uvm_va_block_t *block,
uvm_va_block_context_t *va_block_context,
const uvm_va_policy_t *policy,
uvm_processor_id_t id,
uvm_processor_mask_t *allowed_mask)
uvm_processor_mask_t *allowed_mask,
nodemask_t *allowed_nid_mask)
{
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
*allowed_nid_mask = node_possible_map;
if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), id)) {
// UVM-Lite can only map resident pages on the preferred location
uvm_processor_mask_zero(allowed_mask);
uvm_processor_mask_set(allowed_mask, policy->preferred_location);
if (UVM_ID_IS_CPU(policy->preferred_location) &&
!uvm_va_policy_preferred_location_equal(policy, UVM_ID_CPU, NUMA_NO_NODE)) {
nodes_clear(*allowed_nid_mask);
node_set(policy->preferred_nid, *allowed_nid_mask);
}
}
else if ((uvm_va_policy_is_read_duplicate(policy, va_space) ||
(uvm_id_equal(policy->preferred_location, id) &&
@@ -8536,6 +8355,7 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
uvm_page_mask_t *running_page_mask = &va_block_context->mapping.map_running_page_mask;
NV_STATUS status = NV_OK;
const uvm_va_policy_t *policy = uvm_va_policy_get_region(va_block, region);
nodemask_t *allowed_nid_destinations;
va_block_context->mapping.cause = cause;
@@ -8585,10 +8405,20 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
if (!allowed_destinations)
return NV_ERR_NO_MEMORY;
allowed_nid_destinations = uvm_kvmalloc(sizeof(*allowed_nid_destinations));
if (!allowed_nid_destinations) {
uvm_processor_mask_cache_free(allowed_destinations);
return NV_ERR_NO_MEMORY;
}
// Map per resident location so we can more easily detect physically-
// contiguous mappings.
map_get_allowed_destinations(va_block, va_block_context, policy, id, allowed_destinations);
map_get_allowed_destinations(va_block,
va_block_context,
policy,
id,
allowed_destinations,
allowed_nid_destinations);
for_each_closest_id(resident_id, allowed_destinations, id, va_space) {
if (UVM_ID_IS_CPU(id)) {
status = block_map_cpu_to(va_block,
@@ -8599,11 +8429,30 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
new_prot,
out_tracker);
}
else if (UVM_ID_IS_CPU(resident_id)) {
int nid;
            // map_get_allowed_destinations() will set the mask of CPU NUMA
// nodes that should be mapped.
for_each_node_mask(nid, *allowed_nid_destinations) {
status = block_map_gpu_to(va_block,
va_block_context,
gpu,
resident_id,
nid,
running_page_mask,
new_prot,
out_tracker);
if (status != NV_OK)
break;
}
}
else {
status = block_map_gpu_to(va_block,
va_block_context,
gpu,
resident_id,
NUMA_NO_NODE,
running_page_mask,
new_prot,
out_tracker);
@@ -8618,6 +8467,7 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
}
uvm_processor_mask_cache_free(allowed_destinations);
uvm_kvfree(allowed_nid_destinations);
return status;
}
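A minimal standalone sketch of the per-NUMA-node mapping pass introduced above: the nodemask is modeled as a 64-bit mask and block_map_gpu_to() is replaced by a map_gpu_to_node() stub, so every name below is an illustrative assumption rather than driver code.
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#define NUMA_NO_NODE (-1)
// Stub standing in for block_map_gpu_to(); it only records the node used.
static int map_gpu_to_node(int resident_nid)
{
    printf("map GPU PTEs against resident node %d\n", resident_nid);
    return 0;
}
// CPU-resident pages can be spread across several NUMA nodes, so the GPU
// mapping pass runs once per allowed node. GPU-resident memory has no CPU
// node and is mapped once with NUMA_NO_NODE.
static int map_for_residency(bool resident_on_cpu, uint64_t allowed_nid_mask)
{
    int status = 0;
    int nid;
    if (!resident_on_cpu)
        return map_gpu_to_node(NUMA_NO_NODE);
    for (nid = 0; nid < 64 && status == 0; nid++) {
        if (allowed_nid_mask & (1ull << nid))
            status = map_gpu_to_node(nid);
    }
    return status;
}
int main(void)
{
    // Example: pages resident on CPU NUMA nodes 0 and 2.
    return map_for_residency(true, (1ull << 0) | (1ull << 2));
}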
@@ -9031,7 +8881,7 @@ static void block_destroy_gpu_state(uvm_va_block_t *block, uvm_va_block_context_
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id);
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
uvm_gpu_va_space_t *gpu_va_space;
uvm_gpu_t *gpu, *other_gpu;
uvm_gpu_t *gpu;
if (!gpu_state)
return;
@@ -9041,32 +8891,14 @@ static void block_destroy_gpu_state(uvm_va_block_t *block, uvm_va_block_context_
// Unmap PTEs and free page tables
gpu = uvm_va_space_get_gpu(va_space, id);
gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
if (gpu_va_space) {
if (gpu_va_space)
uvm_va_block_remove_gpu_va_space(block, gpu_va_space, block_context);
}
UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, id));
// No processor should have this GPU mapped at this point
UVM_ASSERT(block_check_processor_not_mapped(block, block_context, id));
// We need to remove the mappings of the indirect peers from the reverse
// map when the GPU state is being destroyed (for example, on
// unregister_gpu) and when peer access between indirect peers is disabled.
// However, we need to avoid double mapping removals. There are two
// possible scenarios:
// - Disable peer access first. This will remove all mappings between A and
// B GPUs, and the indirect_peers bit is cleared. Thus, the later call to
// unregister_gpu will not operate on that pair of GPUs.
// - Unregister GPU first. This will remove all mappings from all indirect
// peers to the GPU being unregistered. It will also destroy its GPU state.
// Subsequent calls to disable peers will remove the mappings from the GPU
// being unregistered, but never to the GPU being unregistered (since it no
// longer has a valid GPU state).
for_each_va_space_gpu_in_mask(other_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)])
block_gpu_unmap_all_chunks_indirect_peer(block, gpu, other_gpu);
if (gpu_state->chunks) {
size_t i, num_chunks;
@@ -9212,33 +9044,6 @@ void uvm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block,
UVM_ASSERT(block_check_mappings(va_block, block_context));
}
NV_STATUS uvm_va_block_enable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
NV_STATUS status;
uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
UVM_ASSERT(uvm_gpu_peer_caps(gpu0, gpu1)->link_type != UVM_GPU_LINK_INVALID);
uvm_assert_rwsem_locked_write(&va_space->lock);
uvm_assert_mutex_locked(&va_block->lock);
if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu0->id)], gpu1->id)) {
status = block_gpu_map_all_chunks_indirect_peer(va_block, gpu0, gpu1);
if (status != NV_OK)
return status;
status = block_gpu_map_all_chunks_indirect_peer(va_block, gpu1, gpu0);
if (status != NV_OK) {
block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu0, gpu1);
return status;
}
}
// TODO: Bug 1767224: Refactor the uvm_va_block_set_accessed_by logic so we
// call it here.
return NV_OK;
}
void uvm_va_block_disable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
@@ -9251,12 +9056,6 @@ void uvm_va_block_disable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gp
uvm_assert_mutex_locked(&va_block->lock);
// See comment in block_destroy_gpu_state
if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu0->id)], gpu1->id)) {
block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu0, gpu1);
block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu1, gpu0);
}
// If either of the GPUs doesn't have GPU state then nothing could be mapped
// between them.
if (!uvm_va_block_gpu_state_get(va_block, gpu0->id) || !uvm_va_block_gpu_state_get(va_block, gpu1->id))
@@ -9588,8 +9387,6 @@ static void block_gpu_release_region(uvm_va_block_t *va_block,
if (!gpu_chunk)
continue;
// TODO: Bug 3898467: unmap indirect peers when freeing GPU chunks
uvm_mmu_chunk_unmap(gpu_chunk, &va_block->tracker);
// The GPU chunk will be freed when the device private reference drops.
@@ -9777,29 +9574,11 @@ static void block_gpu_chunk_get_split_state(uvm_va_block_t *block,
state->chunk_index = block_gpu_chunk_index_range(block, start, size, gpu, page_index, &state->chunk_size);
}
static void block_merge_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
{
uvm_gpu_t *accessing_gpu;
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
uvm_pmm_gpu_merge_chunk(&gpu->pmm, chunk);
for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
uvm_pmm_sysmem_mappings_merge_gpu_chunk_mappings(&accessing_gpu->pmm_reverse_sysmem_mappings,
peer_addr,
uvm_gpu_chunk_get_size(chunk));
}
}
// Perform any chunk splitting and array growing required for this block split,
// but don't actually move chunk pointers anywhere.
static NV_STATUS block_presplit_gpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu)
{
uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id);
uvm_gpu_t *accessing_gpu;
uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing);
uvm_gpu_chunk_t **temp_chunks;
uvm_gpu_chunk_t *original_chunk, *curr_chunk;
uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
@@ -9864,17 +9643,6 @@ static NV_STATUS block_presplit_gpu_chunks(uvm_va_block_t *existing, uvm_va_bloc
if (status != NV_OK)
goto error;
// Split physical GPU mappings for indirect peers
for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, curr_chunk, accessing_gpu);
status = uvm_pmm_sysmem_mappings_split_gpu_chunk_mappings(&accessing_gpu->pmm_reverse_sysmem_mappings,
peer_addr,
subchunk_size);
if (status != NV_OK)
goto error;
}
if (subchunk_size == new_state.chunk_size)
break;
@@ -9900,7 +9668,7 @@ static NV_STATUS block_presplit_gpu_chunks(uvm_va_block_t *existing, uvm_va_bloc
error:
// On error we need to leave the chunk in its initial state
block_merge_chunk(existing, gpu, original_chunk);
uvm_pmm_gpu_merge_chunk(&gpu->pmm, original_chunk);
return status;
}
@@ -10338,7 +10106,7 @@ error:
if (!chunk || chunk->state != UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
continue;
block_merge_chunk(existing, gpu, chunk);
uvm_pmm_gpu_merge_chunk(&gpu->pmm, chunk);
// We could attempt to shrink the chunks array back down, but it doesn't
// hurt much to have it larger than necessary, and we'd have to handle
@@ -10628,11 +10396,9 @@ static void block_split_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_g
uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing);
uvm_gpu_va_space_t *gpu_va_space;
uvm_gpu_t *gpu;
uvm_gpu_t *accessing_gpu;
size_t new_pages = uvm_va_block_num_cpu_pages(new);
size_t existing_pages, existing_pages_4k, existing_pages_big, new_pages_big;
uvm_pte_bits_gpu_t pte_bit;
size_t num_chunks, i;
uvm_cpu_chunk_t *cpu_chunk;
uvm_page_index_t page_index;
int nid;
@@ -10659,23 +10425,6 @@ static void block_split_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_g
block_copy_split_gpu_chunks(existing, new, gpu);
num_chunks = block_num_gpu_chunks(new, gpu);
// Reparent GPU mappings for indirect peers
for (i = 0; i < num_chunks; ++i) {
uvm_gpu_chunk_t *chunk = new_gpu_state->chunks[i];
if (!chunk)
continue;
for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
uvm_pmm_sysmem_mappings_reparent_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings,
peer_addr,
new);
}
}
block_split_page_mask(&existing_gpu_state->resident,
existing_pages,
&new_gpu_state->resident,
@@ -11180,8 +10929,8 @@ NV_STATUS uvm_va_block_add_mappings_after_migration(uvm_va_block_t *va_block,
// so uvm_va_block_map will be a no-op.
uvm_processor_mask_and(map_uvm_lite_gpus, map_other_processors, block_get_uvm_lite_gpus(va_block));
if (!uvm_processor_mask_empty(map_uvm_lite_gpus) &&
uvm_id_equal(new_residency, preferred_location)) {
for_each_id_in_mask(map_processor_id, map_uvm_lite_gpus) {
uvm_va_policy_preferred_location_equal(policy, new_residency, va_block_context->make_resident.dest_nid)) {
for_each_id_in_mask (map_processor_id, map_uvm_lite_gpus) {
status = uvm_va_block_map(va_block,
va_block_context,
map_processor_id,
@@ -11642,6 +11391,10 @@ static int block_select_node_residency(uvm_va_block_t *va_block,
// For GPU faults, the bottom half is pinned to CPUs closest to their GPU.
// Therefore, in both cases, we can use numa_mem_id() to get the NUMA node
// ID of the faulting processor.
// Note that numa_mem_id() returns the nearest node with memory. In most
    // cases, this will be the current NUMA node. However, if the current
    // node does not have any memory, the nearest node with memory is what
    // we want anyway.
int current_nid = numa_mem_id();
bool may_read_duplicate = can_read_duplicate(va_block, page_index, policy, thrashing_hint);
@@ -11665,7 +11418,12 @@ static int block_select_node_residency(uvm_va_block_t *va_block,
// If read duplication is enabled and the page is also resident on the CPU,
// keep its current NUMA node residency.
if (may_read_duplicate && uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index))
return block_get_page_node_residency(va_block, page_index);
return NUMA_NO_NODE;
// The new_residency processor is the CPU and the preferred location is not
// the CPU. If the page is resident on the CPU, keep its current residency.
if (uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index))
return NUMA_NO_NODE;
return current_nid;
}
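A minimal standalone model of the residency-node decision shown in this hunk, covering only the branches visible above; the block and policy queries are reduced to booleans supplied by the caller, and all names are illustrative.
#include <stdbool.h>
#define NUMA_NO_NODE (-1)
// Returns the CPU NUMA node that new CPU-resident pages should land on.
// NUMA_NO_NODE means "keep the page's current node".
static int select_node_residency(int current_nid,         // numa_mem_id() of the servicing CPU
                                 bool may_read_duplicate,  // read duplication applies to this page
                                 bool resident_on_cpu)     // the page already has a CPU copy
{
    // Read-duplicated pages that already have a CPU copy keep their node.
    if (may_read_duplicate && resident_on_cpu)
        return NUMA_NO_NODE;
    // A page that is already CPU resident also keeps its current node.
    if (resident_on_cpu)
        return NUMA_NO_NODE;
    // Otherwise allocate near the faulting CPU (nearest node with memory).
    return current_nid;
}
int main(void)
{
    return select_node_residency(0, false, false) == 0 ? 0 : 1;
}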
@@ -13132,7 +12890,6 @@ NV_STATUS uvm_va_block_evict_chunks(uvm_va_block_t *va_block,
goto out;
for (i = 0; i < num_gpu_chunks; ++i) {
uvm_gpu_id_t accessing_gpu_id;
uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
if (!chunk)
@@ -13140,45 +12897,6 @@ NV_STATUS uvm_va_block_evict_chunks(uvm_va_block_t *va_block,
if (!uvm_gpu_chunk_same_root(chunk, root_chunk))
continue;
// Remove the mappings of indirect peers from the reverse map. We
// access the indirect peer mask from the VA space without holding the
// VA space lock. Therefore, we can race with enable_peer/disable_peer
// operations. However this is fine:
//
// The enable_peer sequence is as follows:
//
// set_bit in va_space->indirect_peers
// uvm_va_block_enable_peer;
//
// - If we read the mask BEFORE it is set or AFTER the mapping has
// been added to the map there is no race.
// - If we read the mask AFTER it is set but BEFORE adding the mapping
// to the reverse map, we will try to remove it although it is not
// there yet. Therefore, we use
// uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction, which does
// not check if the mapping is present in the reverse map.
//
// The disable_peer sequence is as follows:
//
// uvm_va_block_disable_peer;
// clear_bit in va_space->indirect_peers
//
// - If we read the mask BEFORE the mapping has been added to the map
// or AFTER the bit has been cleared, there is no race.
// - If we read the mask AFTER the mapping has been removed and BEFORE
// the bit is cleared, we will try to remove the mapping, too.
// Again, uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction works
// in this scenario.
// Obtain the uvm_gpu_t directly via the parent GPU's id since indirect
// peers are not supported when SMC is enabled.
for_each_gpu_id_in_mask(accessing_gpu_id, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
uvm_gpu_t *accessing_gpu = uvm_va_space_get_gpu(va_space, accessing_gpu_id);
NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction(&accessing_gpu->pmm_reverse_sysmem_mappings,
peer_addr);
}
uvm_mmu_chunk_unmap(chunk, tracker);
uvm_pmm_gpu_mark_chunk_evicted(&gpu->pmm, gpu_state->chunks[i]);
@@ -13311,8 +13029,8 @@ NV_STATUS uvm_test_va_block_inject_error(UVM_TEST_VA_BLOCK_INJECT_ERROR_PARAMS *
if (params->eviction_error)
va_block_test->inject_eviction_error = params->eviction_error;
if (params->cpu_pages_allocation_error_count)
va_block_test->inject_cpu_pages_allocation_error_count = params->cpu_pages_allocation_error_count;
if (params->cpu_chunk_allocation_error_count)
va_block_test->inject_cpu_chunk_allocation_error_count = params->cpu_chunk_allocation_error_count;
if (params->populate_error)
va_block_test->inject_populate_error = params->populate_error;
@@ -13651,63 +13369,24 @@ NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params,
++count;
}
if (params->resident_on_count == 1) {
if (uvm_processor_mask_test(resident_on_mask, UVM_ID_CPU)) {
if (uvm_pmm_sysmem_mappings_indirect_supported()) {
for_each_gpu_id(id) {
NvU64 page_size = uvm_va_block_page_size_processor(block, id, page_index);
uvm_reverse_map_t sysmem_page;
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page_resident(block, page_index);
size_t num_pages;
uvm_gpu_t *gpu;
if (params->resident_on_count == 1 && !uvm_processor_mask_test(resident_on_mask, UVM_ID_CPU)) {
uvm_gpu_id_t id = uvm_processor_mask_find_first_id(resident_on_mask);
uvm_reverse_map_t gpu_mapping;
size_t num_pages;
uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
uvm_gpu_phys_address_t phys_addr;
if (!uvm_va_block_gpu_state_get(block, id))
continue;
phys_addr = uvm_va_block_gpu_phys_page_address(block, page_index, gpu);
num_pages = uvm_pmm_gpu_phys_to_virt(&gpu->pmm, phys_addr.address, PAGE_SIZE, &gpu_mapping);
gpu = uvm_va_space_get_gpu(va_space, id);
// Chunk may be in TEMP_PINNED state so it may not have a VA block
// assigned. In that case, we don't get a valid translation.
if (num_pages > 0) {
UVM_ASSERT(num_pages == 1);
UVM_ASSERT(gpu_mapping.va_block == block);
UVM_ASSERT(uvm_reverse_map_start(&gpu_mapping) == addr);
if (!gpu->parent->access_counters_can_use_physical_addresses)
continue;
num_pages = uvm_pmm_sysmem_mappings_dma_to_virt(&gpu->pmm_reverse_sysmem_mappings,
uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu),
uvm_cpu_chunk_get_size(chunk),
&sysmem_page,
1);
if (page_size > 0)
UVM_ASSERT(num_pages == 1);
else
UVM_ASSERT(num_pages <= 1);
if (num_pages == 1) {
UVM_ASSERT(sysmem_page.va_block == block);
UVM_ASSERT(uvm_reverse_map_start(&sysmem_page) <= addr);
UVM_ASSERT(uvm_reverse_map_end(&sysmem_page) > addr);
++release_block_count;
}
}
}
}
else {
uvm_gpu_id_t id = uvm_processor_mask_find_first_id(resident_on_mask);
uvm_reverse_map_t gpu_mapping;
size_t num_pages;
uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
uvm_gpu_phys_address_t phys_addr;
phys_addr = uvm_va_block_gpu_phys_page_address(block, page_index, gpu);
num_pages = uvm_pmm_gpu_phys_to_virt(&gpu->pmm, phys_addr.address, PAGE_SIZE, &gpu_mapping);
// Chunk may be in TEMP_PINNED state so it may not have a VA block
// assigned. In that case, we don't get a valid translation.
if (num_pages > 0) {
UVM_ASSERT(num_pages == 1);
UVM_ASSERT(gpu_mapping.va_block == block);
UVM_ASSERT(uvm_reverse_map_start(&gpu_mapping) == addr);
++release_block_count;
}
++release_block_count;
}
}

View File

@@ -542,7 +542,7 @@ struct uvm_va_block_wrapper_struct
// uvm_cpu_chunk_allocation_sizes module parameter.
NvU32 cpu_chunk_allocation_size_mask;
// Subsequent operations that need to allocate CPU pages will fail. As
// Subsequent operations that need to allocate CPU chunks will fail. As
// opposed to other error injection settings, this one fails N times
// and then succeeds instead of failing on the Nth try. A value of ~0u
// means fail indefinitely.
@@ -550,7 +550,7 @@ struct uvm_va_block_wrapper_struct
// the state of the VA blocks after the failure. However, some tests
// use kernels to trigger migrations and a fault replay could trigger
// a successful migration if this error flag is cleared.
NvU32 inject_cpu_pages_allocation_error_count;
NvU32 inject_cpu_chunk_allocation_error_count;
// A NUMA node ID from which any CPU chunks will be allocated.
// This will override any other setting and/or policy.
@@ -1158,11 +1158,6 @@ void uvm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block,
uvm_gpu_va_space_t *gpu_va_space,
uvm_va_block_context_t *block_context);
// Creates any mappings necessary in this VA block between the two GPUs, in
// either direction.
// LOCKING: The caller must hold the va_block lock
NV_STATUS uvm_va_block_enable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
// Unmaps all page tables in this VA block which have peer mappings between
// the two GPUs, in either direction.
// LOCKING: The caller must hold the va_block lock

View File

@@ -105,6 +105,12 @@ bool uvm_va_policy_preferred_location_equal(const uvm_va_policy_t *policy, uvm_p
{
bool equal = uvm_id_equal(policy->preferred_location, proc);
if (!UVM_ID_IS_CPU(policy->preferred_location))
UVM_ASSERT(policy->preferred_nid == NUMA_NO_NODE);
if (!UVM_ID_IS_CPU(proc))
UVM_ASSERT(cpu_numa_id == NUMA_NO_NODE);
if (equal && UVM_ID_IS_CPU(policy->preferred_location))
equal = uvm_numa_id_eq(policy->preferred_nid, cpu_numa_id);
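A minimal standalone model of the NUMA-aware comparison above, using simplified stand-in types; nid_eq() assumes strict node equality here, while the driver delegates that check to uvm_numa_id_eq(), whose NUMA_NO_NODE handling is not shown in this hunk.
#include <stdbool.h>
#define NUMA_NO_NODE (-1)
enum proc { PROC_CPU, PROC_GPU0 };      // stand-in for uvm_processor_id_t
struct policy {                         // stand-in for uvm_va_policy_t
    enum proc preferred_location;
    int preferred_nid;                  // only meaningful when the CPU is preferred
};
// Assumed strict equality; the real helper may treat NUMA_NO_NODE specially.
static bool nid_eq(int nid0, int nid1)
{
    return nid0 == nid1;
}
// Processors must match; when the preferred location is the CPU, the
// preferred NUMA node must also match the caller-supplied node.
static bool preferred_location_equal(const struct policy *policy, enum proc proc, int cpu_numa_id)
{
    bool equal = (policy->preferred_location == proc);
    if (equal && policy->preferred_location == PROC_CPU)
        equal = nid_eq(policy->preferred_nid, cpu_numa_id);
    return equal;
}
int main(void)
{
    struct policy p = { PROC_CPU, 1 };
    return preferred_location_equal(&p, PROC_CPU, 1) ? 0 : 1;
}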
@@ -656,7 +662,7 @@ const uvm_va_policy_t *uvm_va_policy_set_preferred_location(uvm_va_block_t *va_b
// and that the policy is changing.
UVM_ASSERT(node->node.start >= start);
UVM_ASSERT(node->node.end <= end);
UVM_ASSERT(!uvm_id_equal(node->policy.preferred_location, processor_id));
UVM_ASSERT(!uvm_va_policy_preferred_location_equal(&node->policy, processor_id, cpu_node_id));
}
node->policy.preferred_location = processor_id;

View File

@@ -781,15 +781,6 @@ static NV_STATUS uvm_va_range_enable_peer_managed(uvm_va_range_t *va_range, uvm_
for_each_va_block_in_va_range(va_range, va_block) {
// TODO: Bug 1767224: Refactor the uvm_va_block_set_accessed_by logic
// into uvm_va_block_enable_peer.
uvm_mutex_lock(&va_block->lock);
status = uvm_va_block_enable_peer(va_block, gpu0, gpu1);
uvm_mutex_unlock(&va_block->lock);
if (status != NV_OK)
return status;
// For UVM-Lite at most one GPU needs to map the peer GPU if it's the
// preferred location, but it doesn't hurt to just try mapping both.
if (gpu0_accessed_by) {
@@ -868,9 +859,9 @@ static void uvm_va_range_disable_peer_managed(uvm_va_range_t *va_range, uvm_gpu_
// preferred location. If peer mappings are being disabled to the
// preferred location, then unmap the other GPU.
// Nothing to do otherwise.
if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu0->id))
if (uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), gpu0->id, NUMA_NO_NODE))
uvm_lite_gpu_to_unmap = gpu1;
else if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu1->id))
else if (uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), gpu1->id, NUMA_NO_NODE))
uvm_lite_gpu_to_unmap = gpu0;
else
return;
@@ -951,7 +942,7 @@ static void va_range_unregister_gpu_managed(uvm_va_range_t *va_range, uvm_gpu_t
// Reset preferred location and accessed-by of VA ranges if needed
// Note: ignoring the return code of uvm_va_range_set_preferred_location since this
// will only return on error when setting a preferred location, not on a reset
if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu->id))
if (uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), gpu->id, NUMA_NO_NODE))
(void)uvm_va_range_set_preferred_location(va_range, UVM_ID_INVALID, NUMA_NO_NODE, mm, NULL);
uvm_va_range_unset_accessed_by(va_range, gpu->id, NULL);
@@ -1683,7 +1674,7 @@ void uvm_va_range_unset_accessed_by(uvm_va_range_t *va_range,
// If a UVM-Lite GPU is being removed from the accessed_by mask, it will
// also stop being a UVM-Lite GPU unless it's also the preferred location.
if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, processor_id) &&
!uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, processor_id)) {
!uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), processor_id, NUMA_NO_NODE)) {
range_unmap(va_range, processor_id, out_tracker);
}

View File

@@ -155,11 +155,6 @@ static bool va_space_check_processors_masks(uvm_va_space_t *va_space)
&va_space->can_copy_from[uvm_id_value(processor)]));
}
// Peers
UVM_ASSERT(!processor_mask_array_test(va_space->indirect_peers, processor, processor));
UVM_ASSERT(uvm_processor_mask_subset(&va_space->indirect_peers[uvm_id_value(processor)],
&va_space->has_native_atomics[uvm_id_value(processor)]));
// Atomics
UVM_ASSERT(processor_mask_array_test(va_space->has_native_atomics, processor, processor));
@@ -375,8 +370,6 @@ static void unregister_gpu(uvm_va_space_t *va_space,
processor_mask_array_clear(va_space->has_nvlink, UVM_ID_CPU, gpu->id);
UVM_ASSERT(processor_mask_array_empty(va_space->has_nvlink, gpu->id));
UVM_ASSERT(processor_mask_array_empty(va_space->indirect_peers, gpu->id));
processor_mask_array_clear(va_space->has_native_atomics, gpu->id, gpu->id);
processor_mask_array_clear(va_space->has_native_atomics, gpu->id, UVM_ID_CPU);
processor_mask_array_clear(va_space->has_native_atomics, UVM_ID_CPU, gpu->id);
@@ -1035,8 +1028,6 @@ static void disable_peers(uvm_va_space_t *va_space,
processor_mask_array_clear(va_space->can_copy_from, gpu1->id, gpu0->id);
processor_mask_array_clear(va_space->has_nvlink, gpu0->id, gpu1->id);
processor_mask_array_clear(va_space->has_nvlink, gpu1->id, gpu0->id);
processor_mask_array_clear(va_space->indirect_peers, gpu0->id, gpu1->id);
processor_mask_array_clear(va_space->indirect_peers, gpu1->id, gpu0->id);
processor_mask_array_clear(va_space->has_native_atomics, gpu0->id, gpu1->id);
processor_mask_array_clear(va_space->has_native_atomics, gpu1->id, gpu0->id);
@@ -1100,15 +1091,6 @@ static NV_STATUS enable_peers(uvm_va_space_t *va_space, uvm_gpu_t *gpu0, uvm_gpu
processor_mask_array_set(va_space->has_native_atomics, gpu0->id, gpu1->id);
processor_mask_array_set(va_space->has_native_atomics, gpu1->id, gpu0->id);
if (peer_caps->is_indirect_peer) {
UVM_ASSERT(peer_caps->link_type >= UVM_GPU_LINK_NVLINK_2);
UVM_ASSERT(gpu0->mem_info.numa.enabled);
UVM_ASSERT(gpu1->mem_info.numa.enabled);
processor_mask_array_set(va_space->indirect_peers, gpu0->id, gpu1->id);
processor_mask_array_set(va_space->indirect_peers, gpu1->id, gpu0->id);
}
}
else if (gpu0->parent == gpu1->parent) {
processor_mask_array_set(va_space->has_native_atomics, gpu0->id, gpu1->id);
@@ -1587,45 +1569,19 @@ error_gpu_release:
return status;
}
static NvU32 find_gpu_va_space_index(uvm_va_space_t *va_space,
uvm_parent_gpu_t *parent_gpu)
{
uvm_gpu_id_t gpu_id;
NvU32 index = UVM_ID_MAX_PROCESSORS;
// TODO: Bug 4351121: this conversion from parent ID to gpu ID depends on
// the fact that only one partition is registered per va_space per physical
// GPU. This code will need to change when multiple MIG instances are
// supported.
for_each_sub_processor_id_in_parent_gpu(gpu_id, parent_gpu->id) {
if (uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu_id)) {
UVM_ASSERT(index == UVM_ID_MAX_PROCESSORS);
index = uvm_id_gpu_index(gpu_id);
}
}
return index;
}
uvm_gpu_va_space_t *uvm_gpu_va_space_get_by_parent_gpu(uvm_va_space_t *va_space,
uvm_parent_gpu_t *parent_gpu)
uvm_gpu_va_space_t *uvm_gpu_va_space_get(uvm_va_space_t *va_space, uvm_gpu_t *gpu)
{
uvm_gpu_va_space_t *gpu_va_space;
NvU32 gpu_index;
uvm_assert_rwsem_locked(&va_space->lock);
if (!parent_gpu)
if (!gpu || !uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->id))
return NULL;
gpu_index = find_gpu_va_space_index(va_space, parent_gpu);
if (gpu_index == UVM_ID_MAX_PROCESSORS)
return NULL;
gpu_va_space = va_space->gpu_va_spaces[gpu_index];
gpu_va_space = va_space->gpu_va_spaces[uvm_id_gpu_index(gpu->id)];
UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
UVM_ASSERT(gpu_va_space->va_space == va_space);
UVM_ASSERT(gpu_va_space->gpu->parent == parent_gpu);
UVM_ASSERT(gpu_va_space->gpu == gpu);
return gpu_va_space;
}
@@ -1772,25 +1728,10 @@ uvm_processor_id_t uvm_processor_mask_find_closest_id(uvm_va_space_t *va_space,
uvm_mutex_lock(&va_space->closest_processors.mask_mutex);
if (uvm_processor_mask_and(mask, candidates, &va_space->has_nvlink[uvm_id_value(src)])) {
// NvLink peers
uvm_processor_mask_t *indirect_peers;
uvm_processor_mask_t *direct_peers = &va_space->closest_processors.direct_peers;
indirect_peers = &va_space->indirect_peers[uvm_id_value(src)];
if (uvm_processor_mask_andnot(direct_peers, mask, indirect_peers)) {
// Direct peers, prioritizing GPU peers over CPU
closest_id = uvm_processor_mask_find_first_gpu_id(direct_peers);
if (UVM_ID_IS_INVALID(closest_id))
closest_id = UVM_ID_CPU;
}
else {
// Indirect peers
UVM_ASSERT(UVM_ID_IS_GPU(src));
UVM_ASSERT(!uvm_processor_mask_test(mask, UVM_ID_CPU));
closest_id = uvm_processor_mask_find_first_gpu_id(mask);
}
// Direct peers, prioritizing GPU peers over CPU
closest_id = uvm_processor_mask_find_first_gpu_id(mask);
if (UVM_ID_IS_INVALID(closest_id))
closest_id = UVM_ID_CPU;
}
else if (uvm_processor_mask_and(mask, candidates, &va_space->can_access[uvm_id_value(src)])) {
// If source is GPU, prioritize PCIe peers over CPU

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -249,12 +249,6 @@ struct uvm_va_space_struct
// for atomics in HW. This is a subset of accessible_from.
uvm_processor_mask_t has_native_atomics[UVM_ID_MAX_PROCESSORS];
// Pre-computed masks that contain, for each processor memory, a mask with
// the processors that are indirect peers. Indirect peers can access each
// other's memory like regular peers, but with additional latency and/or bw
// penalty.
uvm_processor_mask_t indirect_peers[UVM_ID_MAX_PROCESSORS];
// Mask of gpu_va_spaces registered with the va space
// indexed by gpu->id
uvm_processor_mask_t registered_gpu_va_spaces;
@@ -373,11 +367,7 @@ struct uvm_va_space_struct
// uvm_processor_mask_find_closest_id.
uvm_processor_mask_t mask;
// Temporary mask to hold direct_peers in
// uvm_processor_mask_find_closest_id.
uvm_processor_mask_t direct_peers;
// Protects the mask and direct_peers above.
// Protects the mask above.
uvm_mutex_t mask_mutex;
} closest_processors;
@@ -646,24 +636,9 @@ static uvm_gpu_va_space_state_t uvm_gpu_va_space_state(uvm_gpu_va_space_t *gpu_v
return gpu_va_space->state;
}
// Return the GPU VA space for the given physical GPU.
// Return the GPU VA space for the given GPU.
// Locking: the va_space lock must be held.
uvm_gpu_va_space_t *uvm_gpu_va_space_get_by_parent_gpu(uvm_va_space_t *va_space,
uvm_parent_gpu_t *parent_gpu);
static uvm_gpu_va_space_t *uvm_gpu_va_space_get(uvm_va_space_t *va_space, uvm_gpu_t *gpu)
{
uvm_gpu_va_space_t *gpu_va_space;
if (!gpu)
return NULL;
gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
if (gpu_va_space)
UVM_ASSERT(gpu_va_space->gpu == gpu);
return gpu_va_space;
}
uvm_gpu_va_space_t *uvm_gpu_va_space_get(uvm_va_space_t *va_space, uvm_gpu_t *gpu);
#define for_each_gpu_va_space(__gpu_va_space, __va_space) \
for (__gpu_va_space = \
@@ -758,7 +733,6 @@ static uvm_gpu_t *uvm_processor_mask_find_next_va_space_gpu(const uvm_processor_
// - src itself
// - Direct NVLINK GPU peers if src is CPU or GPU (1)
// - NVLINK CPU if src is GPU
// - Indirect NVLINK GPU peers if src is GPU
// - PCIe peers if src is GPU (2)
// - CPU if src is GPU
// - Deterministic selection from the pool of candidates

View File

@@ -199,12 +199,10 @@ void uvm_hal_volta_access_counter_buffer_parse_entry(uvm_parent_gpu_t *parent_gp
buffer_entry->virtual_info.mmu_engine_id =
READ_HWVALUE_MW(access_counter_entry, C365, NOTIFY_BUF_ENTRY, MMU_ENGINE_ID);
// MMU engine id aligns with the fault buffer packets. Therefore, we
// reuse the helpers to compute the MMU engine type and the VE ID from
// the fault buffer class
buffer_entry->virtual_info.mmu_engine_type =
parent_gpu->arch_hal->mmu_engine_id_to_type(buffer_entry->virtual_info.mmu_engine_id);
buffer_entry->virtual_info.mmu_engine_type = UVM_MMU_ENGINE_TYPE_GRAPHICS;
// MMU engine id aligns with the fault buffer packets. Therefore, we
// reuse the helper to compute the VE ID from the fault buffer class.
buffer_entry->virtual_info.ve_id =
parent_gpu->fault_buffer_hal->get_ve_id(buffer_entry->virtual_info.mmu_engine_id,
buffer_entry->virtual_info.mmu_engine_type);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -106,8 +106,7 @@ static uvm_fault_access_type_t get_fault_access_type(const NvU32 *fault_entry)
{
NvU32 hw_access_type_value = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, ACCESS_TYPE);
switch (hw_access_type_value)
{
switch (hw_access_type_value) {
case NV_PFAULT_ACCESS_TYPE_PHYS_READ:
case NV_PFAULT_ACCESS_TYPE_VIRT_READ:
return UVM_FAULT_ACCESS_TYPE_READ;
@@ -133,8 +132,7 @@ static bool is_fault_address_virtual(const NvU32 *fault_entry)
{
NvU32 hw_access_type_value = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, ACCESS_TYPE);
switch (hw_access_type_value)
{
switch (hw_access_type_value) {
case NV_PFAULT_ACCESS_TYPE_PHYS_READ:
case NV_PFAULT_ACCESS_TYPE_PHYS_WRITE:
case NV_PFAULT_ACCESS_TYPE_PHYS_ATOMIC:
@@ -157,8 +155,7 @@ uvm_fault_type_t uvm_hal_volta_fault_buffer_get_fault_type(const NvU32 *fault_en
{
NvU32 hw_fault_type_value = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, FAULT_TYPE);
switch (hw_fault_type_value)
{
switch (hw_fault_type_value) {
case NV_PFAULT_FAULT_TYPE_PDE:
return UVM_FAULT_TYPE_INVALID_PDE;
case NV_PFAULT_FAULT_TYPE_PTE:
@@ -203,8 +200,7 @@ static uvm_fault_client_type_t get_fault_client_type(const NvU32 *fault_entry)
{
NvU32 hw_client_type_value = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, MMU_CLIENT_TYPE);
switch (hw_client_type_value)
{
switch (hw_client_type_value) {
case NV_PFAULT_MMU_CLIENT_TYPE_GPC:
return UVM_FAULT_CLIENT_TYPE_GPC;
case NV_PFAULT_MMU_CLIENT_TYPE_HUB:
@@ -220,8 +216,7 @@ static uvm_aperture_t get_fault_inst_aperture(const NvU32 *fault_entry)
{
NvU32 hw_aperture_value = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, INST_APERTURE);
switch (hw_aperture_value)
{
switch (hw_aperture_value) {
case NVC369_BUF_ENTRY_INST_APERTURE_VID_MEM:
return UVM_APERTURE_VID;
case NVC369_BUF_ENTRY_INST_APERTURE_SYS_MEM_COHERENT:
@@ -261,6 +256,59 @@ static UvmFaultMetadataPacket *get_fault_buffer_entry_metadata(uvm_parent_gpu_t
return fault_entry_metadata + index;
}
static bool client_id_ce(NvU16 client_id)
{
if (client_id >= NV_PFAULT_CLIENT_HUB_HSCE0 && client_id <= NV_PFAULT_CLIENT_HUB_HSCE9)
return true;
switch (client_id) {
case NV_PFAULT_CLIENT_HUB_CE0:
case NV_PFAULT_CLIENT_HUB_CE1:
case NV_PFAULT_CLIENT_HUB_CE2:
return true;
}
return false;
}
static bool client_id_host(NvU16 client_id)
{
switch (client_id) {
case NV_PFAULT_CLIENT_HUB_HOST:
case NV_PFAULT_CLIENT_HUB_HOST_CPU:
return true;
}
return false;
}
uvm_mmu_engine_type_t uvm_hal_volta_fault_buffer_get_mmu_engine_type(NvU16 mmu_engine_id,
uvm_fault_client_type_t client_type,
NvU16 client_id)
{
// Servicing CE and Host (HUB clients) faults.
if (client_type == UVM_FAULT_CLIENT_TYPE_HUB) {
if (client_id_ce(client_id)) {
UVM_ASSERT(mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_CE0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_CE8);
return UVM_MMU_ENGINE_TYPE_CE;
}
if (client_id_host(client_id)) {
UVM_ASSERT(mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_HOST0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_HOST13);
return UVM_MMU_ENGINE_TYPE_HOST;
}
}
    // We shouldn't be servicing faults from any engines other than GR.
UVM_ASSERT_MSG(client_id <= NV_PFAULT_CLIENT_GPC_T1_39, "Unexpected client ID: 0x%x\n", client_id);
UVM_ASSERT_MSG(mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_GRAPHICS, "Unexpected engine ID: 0x%x\n", mmu_engine_id);
UVM_ASSERT(client_type == UVM_FAULT_CLIENT_TYPE_GPC);
return UVM_MMU_ENGINE_TYPE_GRAPHICS;
}
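A reduced standalone model of the classification rule above; the client-ID range checks are abstracted into is_ce and is_host flags because the NV_PFAULT_* numeric ranges are not reproduced here, so the snippet is purely illustrative.
#include <stdbool.h>
typedef enum { ENGINE_GRAPHICS, ENGINE_CE, ENGINE_HOST } engine_type_t;
typedef enum { CLIENT_GPC, CLIENT_HUB } client_type_t;
// HUB faults from a copy engine or Host client are attributed to the CE or
// HOST MMU engine type; any other serviced fault is expected to be graphics.
static engine_type_t classify_engine(client_type_t client_type, bool is_ce, bool is_host)
{
    if (client_type == CLIENT_HUB) {
        if (is_ce)
            return ENGINE_CE;
        if (is_host)
            return ENGINE_HOST;
    }
    return ENGINE_GRAPHICS;
}
int main(void)
{
    return classify_engine(CLIENT_HUB, true, false) == ENGINE_CE ? 0 : 1;
}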
static void parse_fault_entry_common(uvm_parent_gpu_t *parent_gpu,
NvU32 *fault_entry,
uvm_fault_buffer_entry_t *buffer_entry)
@@ -272,6 +320,7 @@ static void parse_fault_entry_common(uvm_parent_gpu_t *parent_gpu,
addr_hi = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, INST_HI);
addr_lo = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, INST_LO);
buffer_entry->instance_ptr.address = addr_lo + (addr_hi << HWSIZE_MW(C369, BUF_ENTRY, INST_LO));
// HW value contains the 4K page number. Shift to build the full address
buffer_entry->instance_ptr.address <<= 12;
@@ -279,6 +328,7 @@ static void parse_fault_entry_common(uvm_parent_gpu_t *parent_gpu,
addr_hi = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, ADDR_HI);
addr_lo = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, ADDR_LO);
// HW value contains the 4K page number. Shift to build the full address
buffer_entry->fault_address = (addr_lo + (addr_hi << HWSIZE_MW(C369, BUF_ENTRY, ADDR_LO))) << 12;
buffer_entry->fault_address = uvm_parent_gpu_canonical_address(parent_gpu, buffer_entry->fault_address);
@@ -321,7 +371,9 @@ static void parse_fault_entry_common(uvm_parent_gpu_t *parent_gpu,
BUILD_BUG_ON(sizeof(buffer_entry->fault_source.mmu_engine_id) * 8 < DRF_SIZE_MW(NVC369_BUF_ENTRY_ENGINE_ID));
buffer_entry->fault_source.mmu_engine_type =
parent_gpu->arch_hal->mmu_engine_id_to_type(buffer_entry->fault_source.mmu_engine_id);
parent_gpu->fault_buffer_hal->get_mmu_engine_type(buffer_entry->fault_source.mmu_engine_id,
buffer_entry->fault_source.client_type,
buffer_entry->fault_source.client_id);
buffer_entry->fault_source.ve_id =
parent_gpu->fault_buffer_hal->get_ve_id(buffer_entry->fault_source.mmu_engine_id,

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2023 NVIDIA Corporation
Copyright (c) 2017-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -281,20 +281,6 @@ uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_volta(NvU64 big_page_size)
return &volta_mmu_mode_hal;
}
uvm_mmu_engine_type_t uvm_hal_volta_mmu_engine_id_to_type(NvU16 mmu_engine_id)
{
if (mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_HOST0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_HOST13)
return UVM_MMU_ENGINE_TYPE_HOST;
if (mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_CE0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_CE8)
return UVM_MMU_ENGINE_TYPE_CE;
// We shouldn't be servicing faults from any other engines
UVM_ASSERT_MSG(mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_GRAPHICS, "Unexpected engine ID: 0x%x\n", mmu_engine_id);
return UVM_MMU_ENGINE_TYPE_GRAPHICS;
}
NvU16 uvm_hal_volta_mmu_client_id_to_utlb_id(NvU16 client_id)
{
switch (client_id) {