550.54.14

Bernhard Stoeckner
2024-02-23 16:37:56 +01:00
parent 91676d6628
commit 476bd34534
186 changed files with 42509 additions and 37629 deletions

View File

@@ -58,7 +58,7 @@
#ifndef _UVM_H_
#define _UVM_H_
#define UVM_API_LATEST_REVISION 9
#define UVM_API_LATEST_REVISION 11
#if !defined(UVM_API_REVISION)
#error "please define UVM_API_REVISION macro to a desired version number or UVM_API_LATEST_REVISION macro"
@@ -297,7 +297,9 @@ NV_STATUS UvmIsPageableMemoryAccessSupported(NvBool *pageableMemAccess);
//
// Arguments:
// gpuUuid: (INPUT)
// UUID of the GPU for which pageable memory access support is queried.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, or the GPU instance UUID of the partition for which
// pageable memory access support is queried.
//
// pageableMemAccess: (OUTPUT)
// Returns true (non-zero) if the GPU represented by gpuUuid supports
@@ -327,6 +329,12 @@ NV_STATUS UvmIsPageableMemoryAccessSupportedOnGpu(const NvProcessorUuid *gpuUuid
// usage. Calling UvmRegisterGpu multiple times on the same GPU from the same
// process results in an error.
//
// After successfully registering a GPU partition, all subsequent API calls
// which take a NvProcessorUuid argument (including UvmGpuMappingAttributes),
// must use the GI partition UUID which can be obtained with
// NvRmControl(NVC637_CTRL_CMD_GET_UUID). Otherwise, if the GPU is not SMC
// capable or SMC enabled, the physical GPU UUID must be used.
//
// Arguments:
// gpuUuid: (INPUT)
// UUID of the physical GPU to register.
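To make the GI UUID convention above concrete, a small hedged sketch; giUuid is assumed to hold the GPU instance UUID obtained via NvRmControl(NVC637_CTRL_CMD_GET_UUID), and only signatures visible in this header are used:

    // Hedged sketch: once an SMC partition is registered, every later call that
    // takes an NvProcessorUuid must receive the GI partition UUID.
    NvProcessorUuid giUuid;          // assumed: filled in via NVC637_CTRL_CMD_GET_UUID
    NvBool pageableMemAccess;
    NV_STATUS status = UvmIsPageableMemoryAccessSupportedOnGpu(&giUuid, &pageableMemAccess);
    if (status == NV_OK && pageableMemAccess) {
        // The partition supports pageable memory access.
    }
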
@@ -431,7 +439,8 @@ NV_STATUS UvmRegisterGpuSmc(const NvProcessorUuid *gpuUuid,
//
// Arguments:
// gpuUuid: (INPUT)
// UUID of the GPU to unregister.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, or the GPU instance UUID of the partition to unregister.
//
// Error codes:
// NV_ERR_INVALID_DEVICE:
@@ -489,7 +498,8 @@ NV_STATUS UvmUnregisterGpu(const NvProcessorUuid *gpuUuid);
//
// Arguments:
// gpuUuid: (INPUT)
// UUID of the GPU to register.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, or the GPU instance UUID of the partition to register.
//
// platformParams: (INPUT)
// On Linux: RM ctrl fd, hClient and hVaSpace.
@@ -560,7 +570,9 @@ NV_STATUS UvmRegisterGpuVaSpace(const NvProcessorUuid *gpuUuid,
//
// Arguments:
// gpuUuid: (INPUT)
// UUID of the GPU whose VA space should be unregistered.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, or the GPU instance UUID of the partition whose VA space
// should be unregistered.
//
// Error codes:
// NV_ERR_INVALID_DEVICE:
@@ -590,7 +602,7 @@ NV_STATUS UvmUnregisterGpuVaSpace(const NvProcessorUuid *gpuUuid);
//
// The two GPUs must be connected via PCIe. An error is returned if the GPUs are
// not connected or are connected over an interconnect different than PCIe
// (NVLink, for example).
// (NVLink or SMC partitions, for example).
//
// If both GPUs have GPU VA spaces registered for them, the two GPU VA spaces
// must support the same set of page sizes for GPU mappings.
@@ -603,10 +615,12 @@ NV_STATUS UvmUnregisterGpuVaSpace(const NvProcessorUuid *gpuUuid);
//
// Arguments:
// gpuUuidA: (INPUT)
// UUID of GPU A.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, or the GPU instance UUID of partition A.
//
// gpuUuidB: (INPUT)
// UUID of GPU B.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, or the GPU instance UUID of partition B.
//
// Error codes:
// NV_ERR_NO_MEMORY:
@@ -652,10 +666,12 @@ NV_STATUS UvmEnablePeerAccess(const NvProcessorUuid *gpuUuidA,
//
// Arguments:
// gpuUuidA: (INPUT)
// UUID of GPU A.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, or the GPU instance UUID of partition A.
//
// gpuUuidB: (INPUT)
// UUID of GPU B.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, or the GPU instance UUID of partition B.
//
// Error codes:
// NV_ERR_INVALID_DEVICE:
@@ -700,7 +716,9 @@ NV_STATUS UvmDisablePeerAccess(const NvProcessorUuid *gpuUuidA,
//
// Arguments:
// gpuUuid: (INPUT)
// UUID of the GPU that the channel is associated with.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, or the GPU instance UUID of the partition that the channel is
// associated with.
//
// platformParams: (INPUT)
// On Linux: RM ctrl fd, hClient and hChannel.
@@ -1139,11 +1157,14 @@ NV_STATUS UvmAllowMigrationRangeGroups(const NvU64 *rangeGroupIds,
// Length, in bytes, of the range.
//
// preferredLocationUuid: (INPUT)
// UUID of the preferred location for this VA range.
// UUID of the CPU, UUID of the physical GPU if the GPU is not SMC
// capable or SMC enabled, or the GPU instance UUID of the partition of
// the preferred location for this VA range.
//
// accessedByUuids: (INPUT)
// UUIDs of all processors that should have persistent mappings to this
// VA range.
// UUID of the CPU, UUID of the physical GPUs if the GPUs are not SMC
// capable or SMC enabled, or the GPU instance UUID of the partitions
// that should have persistent mappings to this VA range.
//
// accessedByCount: (INPUT)
// Number of elements in the accessedByUuids array.
@@ -1421,7 +1442,9 @@ NV_STATUS UvmAllocSemaphorePool(void *base,
// Length, in bytes, of the range.
//
// destinationUuid: (INPUT)
// UUID of the destination processor to migrate pages to.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, the GPU instance UUID of the partition, or the CPU UUID to
// migrate pages to.
//
// preferredCpuMemoryNode: (INPUT)
// Preferred CPU NUMA memory node used if the destination processor is
@@ -1499,7 +1522,9 @@ NV_STATUS UvmMigrate(void *base,
// Length, in bytes, of the range.
//
// destinationUuid: (INPUT)
// UUID of the destination processor to migrate pages to.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, the GPU instance UUID of the partition, or the CPU UUID to
// migrate pages to.
//
// preferredCpuMemoryNode: (INPUT)
// Preferred CPU NUMA memory node used if the destination processor is
@@ -1576,7 +1601,9 @@ NV_STATUS UvmMigrateAsync(void *base,
// Id of the range group whose associated VA ranges have to be migrated.
//
// destinationUuid: (INPUT)
// UUID of the destination processor to migrate pages to.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, the GPU instance UUID of the partition, or the CPU UUID to
// migrate pages to.
//
// Error codes:
// NV_ERR_OBJECT_NOT_FOUND:
@@ -1938,7 +1965,9 @@ NV_STATUS UvmMapExternalAllocation(void *base,
//
//
// gpuUuid: (INPUT)
// UUID of the GPU to map the sparse region on.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, or the GPU instance UUID of the partition to map the sparse
// region on.
//
// Errors:
// NV_ERR_INVALID_ADDRESS:
@@ -1995,7 +2024,9 @@ NV_STATUS UvmMapExternalSparse(void *base,
// The length of the virtual address range.
//
// gpuUuid: (INPUT)
// UUID of the GPU to unmap the VA range from.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, or the GPU instance UUID of the partition to unmap the VA
// range from.
//
// Errors:
// NV_ERR_INVALID_ADDRESS:
@@ -2062,7 +2093,9 @@ NV_STATUS UvmUnmapExternalAllocation(void *base,
// supported by the GPU.
//
// gpuUuid: (INPUT)
// UUID of the GPU to map the dynamic parallelism region on.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, or the GPU instance UUID of the partition to map the
// dynamic parallelism region on.
//
// Errors:
// NV_ERR_UVM_ADDRESS_IN_USE:
@@ -2293,7 +2326,9 @@ NV_STATUS UvmDisableReadDuplication(void *base,
// Length, in bytes, of the range.
//
// preferredLocationUuid: (INPUT)
// UUID of the preferred location.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, the GPU instance UUID of the partition, or the CPU UUID of
// the preferred location.
//
// preferredCpuMemoryNode: (INPUT)
// Preferred CPU NUMA memory node used if preferredLocationUuid is the
@@ -2469,8 +2504,9 @@ NV_STATUS UvmUnsetPreferredLocation(void *base,
// Length, in bytes, of the range.
//
// accessedByUuid: (INPUT)
// UUID of the processor that should have pages in the the VA range
// mapped when possible.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, the GPU instance UUID of the partition, or the CPU UUID
// that should have pages in the VA range mapped when possible.
//
// Errors:
// NV_ERR_INVALID_ADDRESS:
@@ -2538,8 +2574,10 @@ NV_STATUS UvmSetAccessedBy(void *base,
// Length, in bytes, of the range.
//
// accessedByUuid: (INPUT)
// UUID of the processor from which any policies set by
// UvmSetAccessedBy should be revoked for the given VA range.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, the GPU instance UUID of the partition, or the CPU UUID
// from which any policies set by UvmSetAccessedBy should be revoked
// for the given VA range.
//
// Errors:
// NV_ERR_INVALID_ADDRESS:
@@ -2597,7 +2635,9 @@ NV_STATUS UvmUnsetAccessedBy(void *base,
//
// Arguments:
// gpuUuid: (INPUT)
// UUID of the GPU to enable software-assisted system-wide atomics on.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, or the GPU instance UUID of the partition to enable
// software-assisted system-wide atomics on.
//
// Error codes:
// NV_ERR_NO_MEMORY:
@@ -2633,7 +2673,9 @@ NV_STATUS UvmEnableSystemWideAtomics(const NvProcessorUuid *gpuUuid);
//
// Arguments:
// gpuUuid: (INPUT)
// UUID of the GPU to disable software-assisted system-wide atomics on.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, or the GPU instance UUID of the partition to disable
// software-assisted system-wide atomics on.
//
// Error codes:
// NV_ERR_INVALID_DEVICE:
@@ -2862,7 +2904,9 @@ NV_STATUS UvmDebugCountersEnable(UvmDebugSession session,
// Name of the counter in that scope.
//
// gpu: (INPUT)
// Gpuid of the scoped GPU. This parameter is ignored in AllGpu scopes.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, or the GPU instance UUID of the partition of the scoped GPU.
// This parameter is ignored in AllGpu scopes.
//
// pCounterHandle: (OUTPUT)
// Handle to the counter address.
@@ -2916,7 +2960,7 @@ NV_STATUS UvmDebugGetCounterVal(UvmDebugSession session,
// UvmEventQueueCreate
//
// This call creates an event queue of the given size.
// No events are added in the queue till they are enabled by the user.
// No events are added in the queue until they are enabled by the user.
// Event queue data is visible to the user even after the target process dies
// if the session is active and queue is not freed.
//
@@ -2967,7 +3011,7 @@ NV_STATUS UvmEventQueueCreate(UvmDebugSession sessionHandle,
// UvmEventQueueDestroy
//
// This call frees all internal resources associated with the queue, including
// upinning of the memory associated with that queue. Freeing user buffer is
// unpinning of the memory associated with that queue. Freeing user buffer is
// the responsibility of the caller. The event queue might also be destroyed as a side
// effect of destroying a session associated with this queue.
//
@@ -3151,9 +3195,9 @@ NV_STATUS UvmEventGetNotificationHandles(UvmEventQueueHandle *queueHandleArray,
// UvmEventGetGpuUuidTable
//
// Each migration event entry contains the gpu index to/from where data is
// migrated. This index maps to a corresponding gpu UUID in the gpuUuidTable.
// Using indices saves on the size of each event entry. This API provides the
// gpuIndex to gpuUuid relation to the user.
// migrated. This index maps to a corresponding physical gpu UUID in the
// gpuUuidTable. Using indices saves on the size of each event entry. This API
// provides the gpuIndex to gpuUuid relation to the user.
//
// This API does not access the queue state maintained in the user
// library and so the user doesn't need to acquire a lock to protect the
@@ -3161,9 +3205,9 @@ NV_STATUS UvmEventGetNotificationHandles(UvmEventQueueHandle *queueHandleArray,
//
// Arguments:
// gpuUuidTable: (OUTPUT)
// The return value is an array of UUIDs. The array index is the
// corresponding gpuIndex. There can be at max 32 gpus associated with
// UVM, so array size is 32.
// The return value is an array of physical GPU UUIDs. The array index
// is the corresponding gpuIndex. There can be at max 32 GPUs
// associated with UVM, so array size is 32.
//
// validCount: (OUTPUT)
// The system doesn't normally contain 32 GPUs. This field gives the
@@ -3222,7 +3266,7 @@ NV_STATUS UvmEventGetGpuUuidTable(NvProcessorUuid *gpuUuidTable,
//------------------------------------------------------------------------------
NV_STATUS UvmEventFetch(UvmDebugSession sessionHandle,
UvmEventQueueHandle queueHandle,
UvmEventEntry *pBuffer,
UvmEventEntry_V1 *pBuffer,
NvU64 *nEntries);
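A hedged usage sketch for the V1 prototype above; sessionHandle and queueHandle are assumed to come from an already-created debug session and event queue, and the in/out convention of nEntries is an assumption:

    UvmEventEntry_V1 entries[64];
    NvU64 nEntries = 64;    // assumed: capacity on input, number of entries returned on output
    NV_STATUS status = UvmEventFetch(sessionHandle, queueHandle, entries, &nEntries);
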
//------------------------------------------------------------------------------
@@ -3418,10 +3462,15 @@ NV_STATUS UvmToolsDestroySession(UvmToolsSessionHandle session);
// 4. Destroy event Queue using UvmToolsDestroyEventQueue
//
#if UVM_API_REV_IS_AT_MOST(10)
// This is deprecated and replaced by sizeof(UvmToolsEventControlData_V1) or
// sizeof(UvmToolsEventControlData_V2).
NvLength UvmToolsGetEventControlSize(void);
// This is deprecated and replaced by sizeof(UvmEventEntry_V1) or
// sizeof(UvmEventEntry_V2).
NvLength UvmToolsGetEventEntrySize(void);
#endif
NvLength UvmToolsGetNumberOfCounters(void);
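A minimal sketch of the replacement suggested by the deprecation notes above, keyed off the same revision macro:

    #if UVM_API_REV_IS_AT_MOST(10)
        NvLength entrySize   = UvmToolsGetEventEntrySize();
        NvLength controlSize = UvmToolsGetEventControlSize();
    #else
        NvLength entrySize   = sizeof(UvmEventEntry_V2);              // or UvmEventEntry_V1
        NvLength controlSize = sizeof(UvmToolsEventControlData_V2);   // or UvmToolsEventControlData_V1
    #endif
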
@@ -3436,6 +3485,12 @@ NvLength UvmToolsGetNumberOfCounters(void);
// session: (INPUT)
// Handle to the tools session.
//
// version: (INPUT)
// Requested version for events or counters.
// See UvmEventEntry_V1 and UvmEventEntry_V2.
// UvmToolsEventControlData_V2::version records the entry version that
// will be generated.
//
// event_buffer: (INPUT)
// User allocated buffer. Must be page-aligned. Must be large enough to
// hold at least event_buffer_size events. Gets pinned until queue is
@@ -3447,10 +3502,9 @@ NvLength UvmToolsGetNumberOfCounters(void);
//
// event_control (INPUT)
// User allocated buffer. Must be page-aligned. Must be large enough to
// hold UvmToolsEventControlData (although single page-size allocation
// should be more than enough). One could call
// UvmToolsGetEventControlSize() function to find out current size of
// UvmToolsEventControlData. Gets pinned until queue is destroyed.
// hold UvmToolsEventControlData_V1 if version is UvmEventEntry_V1 or
// UvmToolsEventControlData_V2 (although single page-size allocation
// should be more than enough). Gets pinned until queue is destroyed.
//
// queue: (OUTPUT)
// Handle to the created queue.
@@ -3460,22 +3514,32 @@ NvLength UvmToolsGetNumberOfCounters(void);
// Session handle does not refer to a valid session
//
// NV_ERR_INVALID_ARGUMENT:
// The version is not UvmEventEntry_V1 or UvmEventEntry_V2.
// One of the parameters: event_buffer, event_buffer_size, event_control
// is not valid
//
// NV_ERR_INSUFFICIENT_RESOURCES:
// There could be multiple reasons for this error. One would be that it's
// not possible to allocate a queue of requested size. Another would be
// that either event_buffer or event_control memory couldn't be pinned
// (e.g. because of OS limitation of pinnable memory). Also it could not
// have been possible to create UvmToolsEventQueueDescriptor.
// There could be multiple reasons for this error. One would be that
// it's not possible to allocate a queue of requested size. Another
// would be either event_buffer or event_control memory couldn't be
// pinned (e.g. because of OS limitation of pinnable memory). Also it
// could not have been possible to create UvmToolsEventQueueDescriptor.
//
//------------------------------------------------------------------------------
#if UVM_API_REV_IS_AT_MOST(10)
NV_STATUS UvmToolsCreateEventQueue(UvmToolsSessionHandle session,
void *event_buffer,
NvLength event_buffer_size,
void *event_control,
UvmToolsEventQueueHandle *queue);
#else
NV_STATUS UvmToolsCreateEventQueue(UvmToolsSessionHandle session,
UvmToolsEventQueueVersion version,
void *event_buffer,
NvLength event_buffer_size,
void *event_control,
UvmToolsEventQueueHandle *queue);
#endif
UvmToolsEventQueueDescriptor UvmToolsGetEventQueueDescriptor(UvmToolsEventQueueHandle queue);
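A hedged sketch of creating a V2 queue per the buffer requirements above; the UvmToolsEventQueueVersion_V2 enumerator name, the 4 KiB page size, and the buffer sizes are assumptions, and session is an existing UvmToolsSessionHandle:

    #include <stdlib.h>   // aligned_alloc

    size_t buf_bytes = 16 * 4096;                              // assumed size, page-aligned
    void *event_buffer  = aligned_alloc(4096, buf_bytes);      // must be page-aligned
    void *event_control = aligned_alloc(4096, 4096);           // one page is described as sufficient
    NvLength event_buffer_size = buf_bytes / sizeof(UvmEventEntry_V2);   // capacity in events
    UvmToolsEventQueueHandle queue;
    NV_STATUS status = UvmToolsCreateEventQueue(session,
                                                UvmToolsEventQueueVersion_V2,   // assumed enumerator
                                                event_buffer,
                                                event_buffer_size,
                                                event_control,
                                                &queue);
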
@@ -3512,7 +3576,7 @@ NV_STATUS UvmToolsSetNotificationThreshold(UvmToolsEventQueueHandle queue,
//------------------------------------------------------------------------------
// UvmToolsDestroyEventQueue
//
// Destroys all internal resources associated with the queue. It unpinns the
// Destroys all internal resources associated with the queue. It unpins the
// buffers provided in UvmToolsCreateEventQueue. Event Queue is also auto
// destroyed when corresponding session gets destroyed.
//
@@ -3534,7 +3598,7 @@ NV_STATUS UvmToolsDestroyEventQueue(UvmToolsEventQueueHandle queue);
// UvmToolsEventQueueEnableEvents
//
// This call enables a particular event type in the event queue. All events are
// disabled by default. Any event type is considered listed if and only if it's
// disabled by default. Any event type is considered listed if and only if its
// corresponding value is equal to 1 (in other words, bit is set). Disabled
// events listed in eventTypeFlags are going to be enabled. Enabled events and
// events not listed in eventTypeFlags are not affected by this call.
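For example, a hedged sketch of building the mask this paragraph describes; the UvmEventType* enumerator names and the NvU64 type of the second parameter are assumptions:

    // Set the bit of each event type to enable; already-enabled and unlisted
    // types are left untouched, as described above.
    NvU64 eventTypeFlags = (1ULL << UvmEventTypeMigration) |
                           (1ULL << UvmEventTypeGpuFault);     // assumed enumerators
    NV_STATUS status = UvmToolsEventQueueEnableEvents(queue, eventTypeFlags);
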
@@ -3567,7 +3631,7 @@ NV_STATUS UvmToolsEventQueueEnableEvents(UvmToolsEventQueueHandle queue,
// UvmToolsEventQueueDisableEvents
//
// This call disables a particular event type in the event queue. Any event type
// is considered listed if and only if it's corresponding value is equal to 1
// is considered listed if and only if its corresponding value is equal to 1
// (in other words, bit is set). Enabled events listed in eventTypeFlags are
// going to be disabled. Disabled events and events not listed in eventTypeFlags
// are not affected by this call.
@@ -3605,7 +3669,7 @@ NV_STATUS UvmToolsEventQueueDisableEvents(UvmToolsEventQueueHandle queue,
//
// Counters position follows the layout of the memory that UVM driver decides to
// use. To obtain particular counter value, user should perform consecutive
// atomic reads at a a given buffer + offset address.
// atomic reads at a given buffer + offset address.
//
// It is not defined what is the initial value of a counter. User should rely on
// a difference between each snapshot.
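A hedged sketch of the snapshot-difference pattern described above; counters_buffer and counter_offset are assumed names for the mapped counters buffer and a counter's offset, and plain aligned 64-bit loads stand in for the atomic reads:

    volatile NvU64 *counter = (volatile NvU64 *)((char *)counters_buffer + counter_offset);
    NvU64 first = *counter;             // first snapshot
    // ... run the workload of interest ...
    NvU64 second = *counter;            // second snapshot
    NvU64 delta = second - first;       // only the difference is meaningful
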
@@ -3628,9 +3692,9 @@ NV_STATUS UvmToolsEventQueueDisableEvents(UvmToolsEventQueueHandle queue,
// Provided session is not valid
//
// NV_ERR_INSUFFICIENT_RESOURCES
// There could be multiple reasons for this error. One would be that it's
// not possible to allocate counters structure. Another would be that
// either event_buffer or event_control memory couldn't be pinned
// There could be multiple reasons for this error. One would be that
// it's not possible to allocate counters structure. Another would be
// that either event_buffer or event_control memory couldn't be pinned
// (e.g. because of OS limitation of pinnable memory)
//
//------------------------------------------------------------------------------
@@ -3641,12 +3705,12 @@ NV_STATUS UvmToolsCreateProcessAggregateCounters(UvmToolsSessionHandle session
//------------------------------------------------------------------------------
// UvmToolsCreateProcessorCounters
//
// Creates the counters structure for tracking per-process counters.
// Creates the counters structure for tracking per-processor counters.
// These counters are disabled by default.
//
// Counters position follows the layout of the memory that UVM driver decides to
// use. To obtain particular counter value, user should perform consecutive
// atomic reads at a a given buffer + offset address.
// atomic reads at a given buffer + offset address.
//
// It is not defined what is the initial value of a counter. User should rely on
// a difference between each snapshot.
@@ -3662,7 +3726,9 @@ NV_STATUS UvmToolsCreateProcessAggregateCounters(UvmToolsSessionHandle session
// counters are destroyed.
//
// processorUuid: (INPUT)
// UUID of the resource, for which counters will provide statistic data.
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, the GPU instance UUID of the partition, or the CPU UUID of
// the resource, for which counters will provide statistic data.
//
// counters: (OUTPUT)
// Handle to the created counters.
@@ -3672,9 +3738,9 @@ NV_STATUS UvmToolsCreateProcessAggregateCounters(UvmToolsSessionHandle session
// session handle does not refer to a valid tools session
//
// NV_ERR_INSUFFICIENT_RESOURCES
// There could be multiple reasons for this error. One would be that it's
// not possible to allocate counters structure. Another would be that
// either event_buffer or event_control memory couldn't be pinned
// There could be multiple reasons for this error. One would be that
// it's not possible to allocate counters structure. Another would be
// that either event_buffer or event_control memory couldn't be pinned
// (e.g. because of OS limitation of pinnable memory)
//
// NV_ERR_INVALID_ARGUMENT
@@ -3690,7 +3756,7 @@ NV_STATUS UvmToolsCreateProcessorCounters(UvmToolsSessionHandle session,
// UvmToolsDestroyCounters
//
// Destroys all internal resources associated with this counters structure.
// It unpinns the buffer provided in UvmToolsCreate*Counters. Counters structure
// It unpins the buffer provided in UvmToolsCreate*Counters. Counters structure
// also gets destroyed when the corresponding session is destroyed.
//
// Arguments:
@@ -3711,7 +3777,7 @@ NV_STATUS UvmToolsDestroyCounters(UvmToolsCountersHandle counters);
// UvmToolsEnableCounters
//
// This call enables certain counter types in the counters structure. Any
// counter type is considered listed if and only if it's corresponding value is
// counter type is considered listed if and only if its corresponding value is
// equal to 1 (in other words, bit is set). Disabled counter types listed in
// counterTypeFlags are going to be enabled. Already enabled counter types and
// counter types not listed in counterTypeFlags are not affected by this call.
@@ -3745,7 +3811,7 @@ NV_STATUS UvmToolsEnableCounters(UvmToolsCountersHandle counters,
// UvmToolsDisableCounters
//
// This call disables certain counter types in the counters structure. Any
// counter type is considered listed if and only if it's corresponding value is
// counter type is considered listed if and only if its corresponding value is
// equal to 1 (in other words, bit is set). Enabled counter types listed in
// counterTypeFlags are going to be disabled. Already disabled counter types and
// counter types not listed in counterTypeFlags are not affected by this call.
@@ -3890,32 +3956,72 @@ NV_STATUS UvmToolsWriteProcessMemory(UvmToolsSessionHandle session,
// UvmToolsGetProcessorUuidTable
//
// Populate a table with the UUIDs of all the currently registered processors
// in the target process. When a GPU is registered, it is added to the table.
// When a GPU is unregistered, it is removed. As long as a GPU remains registered,
// its index in the table does not change. New registrations obtain the first
// unused index.
// in the target process. When a GPU is registered, it is added to the table.
// When a GPU is unregistered, it is removed. As long as a GPU remains
// registered, its index in the table does not change.
// Note that the index in the table corresponds to the processor ID reported
// in UvmEventEntry event records and that the table is not contiguously packed
// with non-zero UUIDs even with no GPU unregistrations.
//
// Arguments:
// session: (INPUT)
// Handle to the tools session.
//
// version: (INPUT)
// Requested version for the UUID table returned. The version must
// match the requested version of the event queue created with
// UvmToolsCreateEventQueue().
// See UvmEventEntry_V1 and UvmEventEntry_V2.
//
// table: (OUTPUT)
// Array of processor UUIDs, including the CPU's UUID which is always
// at index zero. The srcIndex and dstIndex fields of the
// UvmEventMigrationInfo struct index this array. Unused indices will
// have a UUID of zero.
// have a UUID of zero. Version UvmEventEntry_V1 only uses GPU UUIDs
// for the UUID of the physical GPU and only supports a single SMC
// partition registered per process. Version UvmEventEntry_V2 supports
// multiple SMC partitions registered per process and uses physical GPU
// UUIDs if the GPU is not SMC capable or SMC enabled and GPU instance
// UUIDs for SMC partitions.
// The table pointer can be NULL, in which case the size of the table
// needed to hold all the UUIDs is returned in 'count'.
//
// table_size: (INPUT)
// The size of the table in number of array elements. This can be
// zero if the table pointer is NULL.
//
// count: (OUTPUT)
// Set by UVM to the number of UUIDs written, including any gaps in
// the table due to unregistered GPUs.
// On output, it is set by UVM to the number of UUIDs needed to hold
// all the UUIDs, including any gaps in the table due to unregistered
// GPUs.
//
// Error codes:
// NV_ERR_INVALID_ADDRESS:
// writing to table failed.
// writing to table failed or the count pointer was invalid.
//
// NV_ERR_INVALID_ARGUMENT:
// The version is not UvmEventEntry_V1 or UvmEventEntry_V2.
// The count pointer is NULL.
// See UvmToolsEventQueueVersion.
//
// NV_WARN_MISMATCHED_TARGET:
// The kernel returned a table suitable for UvmEventEntry_V1 events.
// (i.e., the kernel is older and doesn't support UvmEventEntry_V2).
//
// NV_ERR_NO_MEMORY:
// Internal memory allocation failed.
//------------------------------------------------------------------------------
#if UVM_API_REV_IS_AT_MOST(10)
NV_STATUS UvmToolsGetProcessorUuidTable(UvmToolsSessionHandle session,
NvProcessorUuid *table,
NvLength *count);
#else
NV_STATUS UvmToolsGetProcessorUuidTable(UvmToolsSessionHandle session,
UvmToolsEventQueueVersion version,
NvProcessorUuid *table,
NvLength table_size,
NvLength *count);
#endif
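A hedged sketch of the two-call pattern described above; the UvmToolsEventQueueVersion_V2 enumerator name is an assumption, and the size-only query is assumed to return NV_OK:

    #include <stdlib.h>   // calloc

    NvLength count = 0;
    NV_STATUS status = UvmToolsGetProcessorUuidTable(session,
                                                     UvmToolsEventQueueVersion_V2,   // assumed enumerator
                                                     NULL,
                                                     0,
                                                     &count);
    if (status == NV_OK && count > 0) {
        NvProcessorUuid *table = calloc(count, sizeof(*table));
        if (table != NULL)
            status = UvmToolsGetProcessorUuidTable(session,
                                                   UvmToolsEventQueueVersion_V2,
                                                   table,
                                                   count,
                                                   &count);
    }
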
//------------------------------------------------------------------------------
// UvmToolsFlushEvents

View File

@@ -34,16 +34,6 @@
#define UVM_ATS_SUPPORTED() (UVM_ATS_IBM_SUPPORTED() || UVM_ATS_SVA_SUPPORTED())
// ATS prefetcher uses hmm_range_fault() to query residency information.
// hmm_range_fault() needs CONFIG_HMM_MIRROR. To detect racing CPU invalidates
// of memory regions while hmm_range_fault() is being called, MMU interval
// notifiers are needed.
#if defined(CONFIG_HMM_MIRROR) && defined(NV_MMU_INTERVAL_NOTIFIER)
#define UVM_ATS_PREFETCH_SUPPORTED() 1
#else
#define UVM_ATS_PREFETCH_SUPPORTED() 0
#endif
typedef struct
{
// Mask of gpu_va_spaces which are registered for ATS access. The mask is

View File

@@ -30,7 +30,7 @@
#include <linux/mempolicy.h>
#include <linux/mmu_notifier.h>
#if UVM_ATS_PREFETCH_SUPPORTED()
#if UVM_HMM_RANGE_FAULT_SUPPORTED()
#include <linux/hmm.h>
#endif
@@ -246,7 +246,7 @@ static uvm_va_block_region_t uvm_ats_region_from_vma(struct vm_area_struct *vma,
return uvm_ats_region_from_start_end(start, end);
}
#if UVM_ATS_PREFETCH_SUPPORTED()
#if UVM_HMM_RANGE_FAULT_SUPPORTED()
static bool uvm_ats_invalidate_notifier(struct mmu_interval_notifier *mni, unsigned long cur_seq)
{
@@ -284,12 +284,12 @@ static NV_STATUS ats_compute_residency_mask(uvm_gpu_va_space_t *gpu_va_space,
uvm_ats_fault_context_t *ats_context)
{
NV_STATUS status = NV_OK;
uvm_page_mask_t *residency_mask = &ats_context->prefetch_state.residency_mask;
#if UVM_ATS_PREFETCH_SUPPORTED()
#if UVM_HMM_RANGE_FAULT_SUPPORTED()
int ret;
NvU64 start;
NvU64 end;
uvm_page_mask_t *residency_mask = &ats_context->prefetch_state.residency_mask;
struct hmm_range range;
uvm_page_index_t page_index;
uvm_va_block_region_t vma_region;
@@ -370,6 +370,8 @@ static NV_STATUS ats_compute_residency_mask(uvm_gpu_va_space_t *gpu_va_space,
mmu_interval_notifier_remove(range.notifier);
#else
uvm_page_mask_zero(residency_mask);
#endif
return status;
@@ -403,21 +405,24 @@ static NV_STATUS ats_compute_prefetch(uvm_gpu_va_space_t *gpu_va_space,
uvm_ats_service_type_t service_type,
uvm_ats_fault_context_t *ats_context)
{
NV_STATUS status = NV_OK;
NV_STATUS status;
uvm_page_mask_t *accessed_mask = &ats_context->accessed_mask;
uvm_page_mask_t *prefetch_mask = &ats_context->prefetch_state.prefetch_pages_mask;
uvm_va_block_region_t max_prefetch_region = uvm_ats_region_from_vma(vma, base);
// Residency mask needs to be computed even if prefetching is disabled since
// the residency information is also needed by access counters servicing in
// uvm_ats_service_access_counters()
status = ats_compute_residency_mask(gpu_va_space, vma, base, ats_context);
if (status != NV_OK)
return status;
if (!uvm_perf_prefetch_enabled(gpu_va_space->va_space))
return status;
if (uvm_page_mask_empty(accessed_mask))
return status;
status = ats_compute_residency_mask(gpu_va_space, vma, base, ats_context);
if (status != NV_OK)
return status;
// Prefetch the entire region if none of the pages are resident on any node
// and if preferred_location is the faulting GPU.
if (ats_context->prefetch_state.has_preferred_location &&
@@ -637,8 +642,18 @@ NV_STATUS uvm_ats_service_access_counters(uvm_gpu_va_space_t *gpu_va_space,
ats_batch_select_residency(gpu_va_space, vma, ats_context);
// Ignoring the return value of ats_compute_prefetch is ok since prefetching
// is just an optimization and servicing access counter migrations is still
// worthwhile even without any prefetching added. So, let servicing continue
// instead of returning early even if the prefetch computation fails.
ats_compute_prefetch(gpu_va_space, vma, base, service_type, ats_context);
// Remove pages which are already resident at the intended destination from
// the accessed_mask.
uvm_page_mask_andnot(&ats_context->accessed_mask,
&ats_context->accessed_mask,
&ats_context->prefetch_state.residency_mask);
for_each_va_block_subregion_in_mask(subregion, &ats_context->accessed_mask, region) {
NV_STATUS status;
NvU64 start = base + (subregion.first * PAGE_SIZE);

View File

@@ -318,10 +318,11 @@ int format_uuid_to_buffer(char *buffer, unsigned bufferLength, const NvProcessor
unsigned i;
unsigned dashMask = 1 << 4 | 1 << 6 | 1 << 8 | 1 << 10;
memcpy(buffer, "UVM-GPU-", 8);
if (bufferLength < (8 /*prefix*/+ 16 * 2 /*digits*/ + 4 * 1 /*dashes*/ + 1 /*null*/))
return *buffer = 0;
memcpy(buffer, "UVM-GPU-", 8);
for (i = 0; i < 16; i++) {
*str++ = uvm_digit_to_hex(pUuidStruct->uuid[i] >> 4);
*str++ = uvm_digit_to_hex(pUuidStruct->uuid[i] & 0xF);

View File

@@ -151,22 +151,6 @@ static NV_STATUS verify_mapping_info(uvm_va_space_t *va_space,
return NV_OK;
}
static void fix_memory_info_uuid(uvm_va_space_t *va_space, UvmGpuMemoryInfo *mem_info)
{
uvm_gpu_t *gpu;
// TODO: Bug 4351121: RM will return the GI UUID, but
// uvm_va_space_get_gpu_by_uuid() currently matches on physical GPU UUIDs.
// Match on GI UUID until the UVM user level API has been updated to use
// the GI UUID.
for_each_va_space_gpu(gpu, va_space) {
if (uvm_uuid_eq(&gpu->uuid, &mem_info->uuid)) {
mem_info->uuid = gpu->parent->uuid;
break;
}
}
}
static NV_STATUS test_get_rm_ptes_single_gpu(uvm_va_space_t *va_space, UVM_TEST_GET_RM_PTES_PARAMS *params)
{
NV_STATUS status = NV_OK;
@@ -197,11 +181,6 @@ static NV_STATUS test_get_rm_ptes_single_gpu(uvm_va_space_t *va_space, UVM_TEST_
if (status != NV_OK)
return status;
// TODO: Bug 4351121: RM will return the GI UUID. Replace it with the
// physical GPU UUID until the UVM user level has been updated to use
// the GI UUID.
fix_memory_info_uuid(va_space, &memory_info);
TEST_CHECK_GOTO(uvm_uuid_eq(&memory_info.uuid, &params->gpu_uuid), done);
TEST_CHECK_GOTO((memory_info.size == params->size), done);
@@ -309,11 +288,6 @@ static NV_STATUS test_get_rm_ptes_multi_gpu(uvm_va_space_t *va_space, UVM_TEST_G
if (status != NV_OK)
return status;
// TODO: Bug 4351121: RM will return the GI UUID. Replace it with the
// physical GPU UUID until the UVM user level has been updated to use
// the GI UUID.
fix_memory_info_uuid(va_space, &memory_info);
memset(&ext_mapping_info, 0, sizeof(ext_mapping_info));
memset(pte_buffer, 0, sizeof(pte_buffer));

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -314,7 +314,7 @@ static NV_STATUS uvm_suspend(void)
// interrupts in the bottom half in the future, the bottom half flush
// below will no longer be able to guarantee that all outstanding
// notifications have been handled.
uvm_gpu_access_counters_set_ignore(gpu, true);
uvm_parent_gpu_access_counters_set_ignore(gpu->parent, true);
uvm_parent_gpu_set_isr_suspended(gpu->parent, true);
@@ -373,13 +373,13 @@ static NV_STATUS uvm_resume(void)
// Bring the fault buffer software state back in sync with the
// hardware state.
uvm_gpu_fault_buffer_resume(gpu->parent);
uvm_parent_gpu_fault_buffer_resume(gpu->parent);
uvm_parent_gpu_set_isr_suspended(gpu->parent, false);
// Reenable access counter interrupt processing unless notifications
// have been set to be suppressed.
uvm_gpu_access_counters_set_ignore(gpu, false);
uvm_parent_gpu_access_counters_set_ignore(gpu->parent, false);
}
uvm_up_write(&g_uvm_global.pm.lock);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -59,6 +59,7 @@ MODULE_PARM_DESC(uvm_peer_copy, "Choose the addressing mode for peer copying, op
static void remove_gpu(uvm_gpu_t *gpu);
static void disable_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
static NV_STATUS discover_smc_peers(uvm_gpu_t *gpu);
static NV_STATUS discover_nvlink_peers(uvm_gpu_t *gpu);
static void destroy_nvlink_peers(uvm_gpu_t *gpu);
@@ -241,6 +242,8 @@ static NV_STATUS get_gpu_fb_info(uvm_gpu_t *gpu)
gpu->mem_info.max_allocatable_address = fb_info.maxAllocatableAddress;
}
gpu->mem_info.max_vidmem_page_size = fb_info.maxVidmemPageSize;
return NV_OK;
}
@@ -843,11 +846,11 @@ static NV_STATUS init_procfs_dirs(uvm_gpu_t *gpu)
if (!uvm_procfs_is_enabled())
return NV_OK;
format_uuid_to_buffer(uuid_text_buffer, sizeof(uuid_text_buffer), uvm_gpu_uuid(gpu));
format_uuid_to_buffer(uuid_text_buffer, sizeof(uuid_text_buffer), &gpu->parent->uuid);
gpu_base_dir_entry = uvm_procfs_get_gpu_base_dir();
// Create UVM-GPU-${UUID}/${sub_processor_index} directory
// Create UVM-GPU-${physical-UUID}/${sub_processor_index} directory
snprintf(gpu_dir_name, sizeof(gpu_dir_name), "%u", uvm_id_sub_processor_index(gpu->id));
gpu->procfs.dir = NV_CREATE_PROC_DIR(gpu_dir_name, gpu->parent->procfs.dir);
@@ -855,7 +858,7 @@ static NV_STATUS init_procfs_dirs(uvm_gpu_t *gpu)
return NV_ERR_OPERATING_SYSTEM;
// Create symlink from ${gpu_id} to
// gpus/UVM-GPU-${UUID}/${sub_processor_index}
// UVM-GPU-${physical-UUID}/${sub_processor_index}
snprintf(symlink_name, sizeof(symlink_name), "%u", uvm_id_value(gpu->id));
snprintf(gpu_dir_name,
sizeof(gpu_dir_name),
@@ -867,6 +870,16 @@ static NV_STATUS init_procfs_dirs(uvm_gpu_t *gpu)
if (gpu->procfs.dir_symlink == NULL)
return NV_ERR_OPERATING_SYSTEM;
if (gpu->parent->smc.enabled) {
// Create symlink from UVM-GPU-${GI-UUID} to
// UVM-GPU-${physical-UUID}/${sub_processor_index}
format_uuid_to_buffer(uuid_text_buffer, sizeof(uuid_text_buffer), &gpu->uuid);
gpu->procfs.gpu_instance_uuid_symlink = proc_symlink(uuid_text_buffer, gpu_base_dir_entry, gpu_dir_name);
if (gpu->procfs.gpu_instance_uuid_symlink == NULL)
return NV_ERR_OPERATING_SYSTEM;
}
// GPU peer files are debug only
if (!uvm_procfs_is_debug_enabled())
return NV_OK;
@@ -882,6 +895,7 @@ static NV_STATUS init_procfs_dirs(uvm_gpu_t *gpu)
static void deinit_procfs_dirs(uvm_gpu_t *gpu)
{
proc_remove(gpu->procfs.dir_peers);
proc_remove(gpu->procfs.gpu_instance_uuid_symlink);
proc_remove(gpu->procfs.dir_symlink);
proc_remove(gpu->procfs.dir);
}
@@ -1038,6 +1052,7 @@ static NV_STATUS configure_address_space(uvm_gpu_t *gpu)
NvU32 num_entries;
NvU64 va_size;
NvU64 va_per_entry;
uvm_mmu_page_table_alloc_t *tree_alloc;
status = uvm_page_tree_init(gpu,
NULL,
@@ -1059,20 +1074,30 @@ static NV_STATUS configure_address_space(uvm_gpu_t *gpu)
// Make sure that RM's part of the VA is aligned to the VA covered by a
// single top level PDE.
UVM_ASSERT_MSG(gpu->parent->rm_va_base % va_per_entry == 0,
"va_base 0x%llx va_per_entry 0x%llx\n", gpu->parent->rm_va_base, va_per_entry);
"va_base 0x%llx va_per_entry 0x%llx\n",
gpu->parent->rm_va_base,
va_per_entry);
UVM_ASSERT_MSG(gpu->parent->rm_va_size % va_per_entry == 0,
"va_size 0x%llx va_per_entry 0x%llx\n", gpu->parent->rm_va_size, va_per_entry);
"va_size 0x%llx va_per_entry 0x%llx\n",
gpu->parent->rm_va_size,
va_per_entry);
UVM_ASSERT(uvm_mmu_page_size_supported(&gpu->address_space_tree, gpu->big_page.internal_size));
UVM_ASSERT(uvm_mmu_page_size_supported(&gpu->address_space_tree, gpu->mem_info.max_vidmem_page_size));
tree_alloc = uvm_page_tree_pdb(&gpu->address_space_tree);
status = uvm_rm_locked_call(nvUvmInterfaceSetPageDirectory(gpu->rm_address_space,
uvm_page_tree_pdb(&gpu->address_space_tree)->addr.address, num_entries,
uvm_page_tree_pdb(&gpu->address_space_tree)->addr.aperture == UVM_APERTURE_VID,
gpu_get_internal_pasid(gpu)));
tree_alloc->addr.address,
num_entries,
tree_alloc->addr.aperture == UVM_APERTURE_VID,
gpu_get_internal_pasid(gpu)));
if (status != NV_OK) {
UVM_ERR_PRINT("nvUvmInterfaceSetPageDirectory() failed: %s, GPU %s\n",
nvstatusToString(status),
uvm_gpu_name(gpu));
return status;
}
gpu->rm_address_space_moved_to_page_tree = true;
return NV_OK;
@@ -1212,6 +1237,8 @@ static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu,
static NV_STATUS init_gpu(uvm_gpu_t *gpu, const UvmGpuInfo *gpu_info)
{
char uuid_buffer[UVM_GPU_UUID_TEXT_BUFFER_LENGTH];
size_t len;
NV_STATUS status;
if (gpu->parent->smc.enabled) {
@@ -1229,6 +1256,20 @@ static NV_STATUS init_gpu(uvm_gpu_t *gpu, const UvmGpuInfo *gpu_info)
uvm_uuid_copy(&gpu->uuid, &gpu_info->uuid);
gpu->smc.swizz_id = gpu_info->smcSwizzId;
format_uuid_to_buffer(uuid_buffer, sizeof(uuid_buffer), &gpu->parent->uuid);
snprintf(gpu->name,
sizeof(gpu->name),
"ID %u: %s",
uvm_id_value(gpu->id),
uuid_buffer + 4);
format_uuid_to_buffer(uuid_buffer, sizeof(uuid_buffer), &gpu->uuid);
len = strlen(gpu->name);
snprintf(gpu->name + len,
sizeof(gpu->name) - len,
" UVM-GI-%s",
uuid_buffer + 8);
// Initialize the per-GPU procfs dirs as early as possible so that other
// parts of the driver can add files in them as part of their per-GPU init.
status = init_procfs_dirs(gpu);
@@ -1338,7 +1379,6 @@ static NV_STATUS add_gpu(const NvProcessorUuid *gpu_uuid,
uvm_parent_gpu_t *parent_gpu,
uvm_gpu_t **gpu_out)
{
char uuid_buffer[UVM_GPU_UUID_TEXT_BUFFER_LENGTH];
NV_STATUS status;
bool alloc_parent = (parent_gpu == NULL);
uvm_gpu_t *gpu = NULL;
@@ -1364,13 +1404,6 @@ static NV_STATUS add_gpu(const NvProcessorUuid *gpu_uuid,
if (alloc_parent)
fill_parent_gpu_info(parent_gpu, gpu_info);
format_uuid_to_buffer(uuid_buffer, sizeof(uuid_buffer), &parent_gpu->uuid);
snprintf(gpu->name,
sizeof(gpu->name),
"ID %u: %s",
uvm_id_value(gpu->id),
uuid_buffer);
// After this point all error clean up should be handled by remove_gpu()
if (!gpu_supports_uvm(parent_gpu)) {
@@ -1432,13 +1465,25 @@ static NV_STATUS add_gpu(const NvProcessorUuid *gpu_uuid,
uvm_spin_unlock_irqrestore(&g_uvm_global.gpu_table_lock);
if (alloc_parent) {
if (gpu->parent->smc.enabled) {
status = discover_smc_peers(gpu);
if (status != NV_OK) {
// Nobody can have retained the GPU yet, since we still hold the
// global lock.
UVM_ASSERT(uvm_gpu_retained_count(gpu) == 1);
atomic64_set(&gpu->retained_count, 0);
goto error;
}
}
else if (alloc_parent) {
status = discover_nvlink_peers(gpu);
if (status != NV_OK) {
UVM_ERR_PRINT("Failed to discover NVLINK peers: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
UVM_ERR_PRINT("Failed to discover NVLINK peers: %s, GPU %s\n",
nvstatusToString(status),
uvm_gpu_name(gpu));
// Nobody can have retained the GPU yet, since we still hold the global
// lock.
// Nobody can have retained the GPU yet, since we still hold the
// global lock.
UVM_ASSERT(uvm_gpu_retained_count(gpu) == 1);
atomic64_set(&gpu->retained_count, 0);
goto error;
@@ -1686,7 +1731,7 @@ static void uvm_parent_gpu_destroy(nv_kref_t *nv_kref)
nv_kthread_q_stop(&parent_gpu->lazy_free_q);
for (sub_processor_index = 0; sub_processor_index < UVM_PARENT_ID_MAX_SUB_PROCESSORS; sub_processor_index++)
for_each_sub_processor_index(sub_processor_index)
UVM_ASSERT(!parent_gpu->gpus[sub_processor_index]);
uvm_kvfree(parent_gpu);
@@ -1915,32 +1960,25 @@ uvm_parent_gpu_t *uvm_parent_gpu_get_by_uuid(const NvProcessorUuid *gpu_uuid)
return uvm_parent_gpu_get_by_uuid_locked(gpu_uuid);
}
static uvm_gpu_t *gpu_get_by_uuid_locked(const NvProcessorUuid *gpu_uuid)
uvm_gpu_t *uvm_gpu_get_by_uuid(const NvProcessorUuid *gpu_uuid)
{
uvm_gpu_id_t gpu_id;
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
for_each_gpu_id(gpu_id) {
uvm_gpu_t *gpu = uvm_gpu_get(gpu_id);
if (gpu) {
if (uvm_uuid_eq(uvm_gpu_uuid(gpu), gpu_uuid)) {
UVM_ASSERT(!gpu->parent->smc.enabled);
if (uvm_uuid_eq(&gpu->uuid, gpu_uuid))
return gpu;
}
}
}
return NULL;
}
uvm_gpu_t *uvm_gpu_get_by_uuid(const NvProcessorUuid *gpu_uuid)
{
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
return gpu_get_by_uuid_locked(gpu_uuid);
}
uvm_gpu_t *uvm_gpu_get_by_parent_and_swizz_id(uvm_parent_gpu_t *parent_gpu, NvU32 swizz_id)
static uvm_gpu_t *uvm_gpu_get_by_parent_and_swizz_id(uvm_parent_gpu_t *parent_gpu, NvU32 swizz_id)
{
uvm_gpu_t *gpu;
@@ -1998,7 +2036,7 @@ static NV_STATUS gpu_retain_by_uuid_locked(const NvProcessorUuid *gpu_uuid,
if (parent_gpu != NULL) {
// If the UUID has been seen before, and if SMC is enabled, then check
// if this specific partition has been seen previously. The UUID-based
// if this specific partition has been seen previously. The UUID-based
// look-up above may have succeeded for a different partition with the
// same parent GPU.
if (gpu_info->smcEnabled) {
@@ -2287,7 +2325,7 @@ static NV_STATUS init_procfs_peer_cap_files(uvm_gpu_t *local, uvm_gpu_t *remote,
return NV_ERR_OPERATING_SYSTEM;
// Create a symlink from UVM GPU UUID (UVM-GPU-...) to the UVM GPU ID gpuB
format_uuid_to_buffer(symlink_name, sizeof(symlink_name), uvm_gpu_uuid(remote));
format_uuid_to_buffer(symlink_name, sizeof(symlink_name), &remote->uuid);
peer_caps->procfs.peer_symlink_file[local_idx] = proc_symlink(symlink_name,
local->procfs.dir_peers,
gpu_dir_name);
@@ -2297,6 +2335,24 @@ static NV_STATUS init_procfs_peer_cap_files(uvm_gpu_t *local, uvm_gpu_t *remote,
return NV_OK;
}
static NV_STATUS init_procfs_peer_files(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
NV_STATUS status;
if (!uvm_procfs_is_debug_enabled())
return NV_OK;
status = init_procfs_peer_cap_files(gpu0, gpu1, 0);
if (status != NV_OK)
return status;
status = init_procfs_peer_cap_files(gpu1, gpu0, 1);
if (status != NV_OK)
return status;
return NV_OK;
}
static NV_STATUS init_peer_access(uvm_gpu_t *gpu0,
uvm_gpu_t *gpu1,
const UvmGpuP2PCapsParams *p2p_caps_params,
@@ -2377,16 +2433,41 @@ static NV_STATUS init_peer_access(uvm_gpu_t *gpu0,
uvm_spin_unlock(&gpu1->peer_info.peer_gpus_lock);
}
if (!uvm_procfs_is_debug_enabled())
return NV_OK;
return init_procfs_peer_files(gpu0, gpu1);
}
status = init_procfs_peer_cap_files(gpu0, gpu1, 0);
if (status != NV_OK)
return status;
static NV_STATUS discover_smc_peers(uvm_gpu_t *gpu)
{
NvU32 sub_processor_index;
uvm_gpu_t *other_gpu;
NV_STATUS status;
status = init_procfs_peer_cap_files(gpu1, gpu0, 1);
if (status != NV_OK)
return status;
UVM_ASSERT(gpu);
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
UVM_ASSERT(gpu->parent->smc.enabled);
for_each_sub_processor_index(sub_processor_index) {
uvm_gpu_peer_t *peer_caps;
other_gpu = gpu->parent->gpus[sub_processor_index];
if (!other_gpu || other_gpu == gpu)
continue;
peer_caps = uvm_gpu_peer_caps(gpu, other_gpu);
if (peer_caps->ref_count == 1)
continue;
UVM_ASSERT(peer_caps->ref_count == 0);
memset(peer_caps, 0, sizeof(*peer_caps));
peer_caps->ref_count = 1;
status = init_procfs_peer_files(gpu, other_gpu);
if (status != NV_OK) {
peer_caps->ref_count = 0;
return status;
}
}
return NV_OK;
}
@@ -2489,9 +2570,7 @@ static NV_STATUS discover_nvlink_peers(uvm_gpu_t *gpu)
UVM_ASSERT(gpu);
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
if (gpu->parent->smc.enabled)
return NV_OK;
UVM_ASSERT(!gpu->parent->smc.enabled);
for_each_gpu(other_gpu) {
UvmGpuP2PCapsParams p2p_caps_params;
@@ -2592,10 +2671,6 @@ static void disable_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
UVM_ASSERT(gpu0);
UVM_ASSERT(gpu1);
// P2P is not supported under SMC partitioning
UVM_ASSERT(!gpu0->parent->smc.enabled);
UVM_ASSERT(!gpu1->parent->smc.enabled);
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
peer_caps = uvm_gpu_peer_caps(gpu0, gpu1);
@@ -2638,9 +2713,9 @@ static void disable_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
// IDs queried from the peer table above which are about to be removed from
// the global table.
if (gpu0->parent->access_counters_supported)
uvm_gpu_access_counter_buffer_flush(gpu0);
uvm_parent_gpu_access_counter_buffer_flush(gpu0->parent);
if (gpu1->parent->access_counters_supported)
uvm_gpu_access_counter_buffer_flush(gpu1);
uvm_parent_gpu_access_counter_buffer_flush(gpu1->parent);
memset(peer_caps, 0, sizeof(*peer_caps));
}
@@ -2668,12 +2743,17 @@ void uvm_gpu_release_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
static uvm_aperture_t uvm_gpu_peer_caps_aperture(uvm_gpu_peer_t *peer_caps, uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu)
{
size_t peer_index;
UVM_ASSERT(peer_caps->link_type != UVM_GPU_LINK_INVALID);
// Indirect peers are accessed as sysmem addresses
if (peer_caps->is_indirect_peer)
return UVM_APERTURE_SYS;
// MIG instances in the same physical GPU have vidmem addresses
if (local_gpu->parent == remote_gpu->parent)
return UVM_APERTURE_VID;
UVM_ASSERT(peer_caps->link_type != UVM_GPU_LINK_INVALID);
if (uvm_id_value(local_gpu->id) < uvm_id_value(remote_gpu->id))
peer_index = 0;
else
@@ -3285,12 +3365,19 @@ NV_STATUS uvm_api_register_gpu(UVM_REGISTER_GPU_PARAMS *params, struct file *fil
.user_client = params->hClient,
.user_object = params->hSmcPartRef,
};
NvProcessorUuid gpu_instance_uuid;
NV_STATUS status;
return uvm_va_space_register_gpu(va_space,
&params->gpu_uuid,
&user_rm_va_space,
&params->numaEnabled,
&params->numaNodeId);
status = uvm_va_space_register_gpu(va_space,
&params->gpu_uuid,
&user_rm_va_space,
&params->numaEnabled,
&params->numaNodeId,
&gpu_instance_uuid);
if (status == NV_OK)
uvm_uuid_copy(&params->gpu_uuid, &gpu_instance_uuid);
return status;
}
NV_STATUS uvm_api_unregister_gpu(UVM_UNREGISTER_GPU_PARAMS *params, struct file *filp)
@@ -3363,10 +3450,10 @@ NV_STATUS uvm_test_set_prefetch_filtering(UVM_TEST_SET_PREFETCH_FILTERING_PARAMS
switch (params->filtering_mode) {
case UVM_TEST_PREFETCH_FILTERING_MODE_FILTER_ALL:
uvm_gpu_disable_prefetch_faults(gpu->parent);
uvm_parent_gpu_disable_prefetch_faults(gpu->parent);
break;
case UVM_TEST_PREFETCH_FILTERING_MODE_FILTER_NONE:
uvm_gpu_enable_prefetch_faults(gpu->parent);
uvm_parent_gpu_enable_prefetch_faults(gpu->parent);
break;
default:
status = NV_ERR_INVALID_ARGUMENT;

View File

@@ -618,9 +618,10 @@ struct uvm_gpu_struct
// The gpu's GI uuid if SMC is enabled; otherwise, a copy of parent->uuid.
NvProcessorUuid uuid;
// Nice printable name in the format: ID: 999: UVM-GPU-<parent_uuid>.
// Nice printable name in the format:
// ID: 999: GPU-<parent_uuid> UVM-GI-<gi_uuid>.
// UVM_GPU_UUID_TEXT_BUFFER_LENGTH includes the null character.
char name[9 + UVM_GPU_UUID_TEXT_BUFFER_LENGTH];
char name[9 + 2 * UVM_GPU_UUID_TEXT_BUFFER_LENGTH];
// Refcount of the gpu, i.e. how many times it has been retained. This is
// roughly a count of how many times it has been registered with a VA space,
@@ -656,6 +657,10 @@ struct uvm_gpu_struct
// can allocate through PMM (PMA).
NvU64 max_allocatable_address;
// Max supported vidmem page size may be smaller than the max GMMU page
// size, because of the vMMU supported page sizes.
NvU64 max_vidmem_page_size;
struct
{
// True if the platform supports HW coherence and the GPU's memory
@@ -844,6 +849,9 @@ struct uvm_gpu_struct
struct proc_dir_entry *dir_symlink;
// The GPU instance UUID symlink if SMC is enabled.
struct proc_dir_entry *gpu_instance_uuid_symlink;
struct proc_dir_entry *info_file;
struct proc_dir_entry *dir_peers;
@@ -1210,11 +1218,6 @@ static const char *uvm_gpu_name(uvm_gpu_t *gpu)
return gpu->name;
}
static const NvProcessorUuid *uvm_gpu_uuid(uvm_gpu_t *gpu)
{
return &gpu->parent->uuid;
}
static uvmGpuDeviceHandle uvm_gpu_device_handle(uvm_gpu_t *gpu)
{
if (gpu->parent->smc.enabled)
@@ -1234,6 +1237,9 @@ struct uvm_gpu_peer_struct
// - The global lock is held.
//
// - While the global lock was held in the past, the two GPUs were detected
// to be SMC peers and were both retained.
//
// - While the global lock was held in the past, the two GPUs were detected
// to be NVLINK peers and were both retained.
//
// - While the global lock was held in the past, the two GPUs were detected
@@ -1319,17 +1325,17 @@ static uvm_gpu_phys_address_t uvm_gpu_page_to_phys_address(uvm_gpu_t *gpu, struc
// Note that there is a uvm_gpu_get() function defined in uvm_global.h to break
// a circular dep between global and gpu modules.
// Get a uvm_gpu_t by UUID. This returns NULL if the GPU is not present. This
// is the general purpose call that should be used normally.
// That is, unless a uvm_gpu_t for a specific SMC partition needs to be
// retrieved, in which case uvm_gpu_get_by_parent_and_swizz_id() must be used
// instead.
// Get a uvm_gpu_t by UUID (physical GPU UUID if SMC is not enabled, otherwise
// GPU instance UUID).
// This returns NULL if the GPU is not present.
// This is the general purpose call that should be used normally.
//
// LOCKING: requires the global lock to be held
uvm_gpu_t *uvm_gpu_get_by_uuid(const NvProcessorUuid *gpu_uuid);
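A hedged kernel-side sketch of the locking requirement above; the uvm_mutex_lock/uvm_mutex_unlock idiom on g_uvm_global.global_lock is assumed from the rest of the driver:

    uvm_gpu_t *gpu;

    uvm_mutex_lock(&g_uvm_global.global_lock);
    gpu = uvm_gpu_get_by_uuid(gpu_uuid);   // GI UUID when SMC is enabled, physical UUID otherwise
    if (gpu) {
        // ... use or retain the GPU while the global lock is held ...
    }
    uvm_mutex_unlock(&g_uvm_global.global_lock);
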
// Get a uvm_parent_gpu_t by UUID. Like uvm_gpu_get_by_uuid(), this function
// returns NULL if the GPU has not been registered.
// Get a uvm_parent_gpu_t by UUID (physical GPU UUID).
// Like uvm_gpu_get_by_uuid(), this function returns NULL if the GPU has not
// been registered.
//
// LOCKING: requires the global lock to be held
uvm_parent_gpu_t *uvm_parent_gpu_get_by_uuid(const NvProcessorUuid *gpu_uuid);
@@ -1340,13 +1346,6 @@ uvm_parent_gpu_t *uvm_parent_gpu_get_by_uuid(const NvProcessorUuid *gpu_uuid);
// limited cases.
uvm_parent_gpu_t *uvm_parent_gpu_get_by_uuid_locked(const NvProcessorUuid *gpu_uuid);
// Get the uvm_gpu_t for a partition by parent and swizzId. This returns NULL if
// the partition hasn't been registered. This call needs to be used instead of
// uvm_gpu_get_by_uuid() when a specific partition is targeted.
//
// LOCKING: requires the global lock to be held
uvm_gpu_t *uvm_gpu_get_by_parent_and_swizz_id(uvm_parent_gpu_t *parent_gpu, NvU32 swizz_id);
// Retain a gpu by uuid
// Returns the retained uvm_gpu_t in gpu_out on success
//

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2023 NVIDIA Corporation
Copyright (c) 2017-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -33,7 +33,7 @@
#include "uvm_va_space_mm.h"
#include "uvm_pmm_sysmem.h"
#include "uvm_perf_module.h"
#include "uvm_ats_ibm.h"
#include "uvm_ats.h"
#include "uvm_ats_faults.h"
#define UVM_PERF_ACCESS_COUNTER_BATCH_COUNT_MIN 1
@@ -99,7 +99,8 @@ MODULE_PARM_DESC(uvm_perf_access_counter_threshold,
"Number of remote accesses on a region required to trigger a notification."
"Valid values: [1, 65535]");
static void access_counter_buffer_flush_locked(uvm_gpu_t *gpu, uvm_gpu_buffer_flush_mode_t flush_mode);
static void access_counter_buffer_flush_locked(uvm_parent_gpu_t *parent_gpu,
uvm_gpu_buffer_flush_mode_t flush_mode);
static uvm_perf_module_event_callback_desc_t g_callbacks_access_counters[] = {};
@@ -126,7 +127,7 @@ static va_space_access_counters_info_t *va_space_access_counters_info_get(uvm_va
// Whether access counter migrations are enabled or not. The policy is as
// follows:
// - MIMC migrations are disabled by default on all systems except P9.
// - MIMC migrations are disabled by default on all non-ATS systems.
// - MOMC migrations are disabled by default on all systems
// - Users can override this policy by specifying on/off
static bool is_migration_enabled(uvm_access_counter_type_t type)
@@ -149,7 +150,7 @@ static bool is_migration_enabled(uvm_access_counter_type_t type)
if (type == UVM_ACCESS_COUNTER_TYPE_MOMC)
return false;
if (UVM_ATS_IBM_SUPPORTED())
if (UVM_ATS_SUPPORTED())
return g_uvm_global.ats.supported;
return false;
@@ -281,7 +282,7 @@ get_config_for_type(const uvm_access_counter_buffer_info_t *access_counters, uvm
&(access_counters)->current_config.momc;
}
bool uvm_gpu_access_counters_pending(uvm_parent_gpu_t *parent_gpu)
bool uvm_parent_gpu_access_counters_pending(uvm_parent_gpu_t *parent_gpu)
{
UVM_ASSERT(parent_gpu->access_counters_supported);
@@ -340,7 +341,7 @@ static void init_access_counter_types_config(const UvmGpuAccessCntrConfig *confi
UVM_ASSERT(counter_type_config->sub_granularity_regions_per_translation <= UVM_SUB_GRANULARITY_REGIONS);
}
NV_STATUS uvm_gpu_init_access_counters(uvm_parent_gpu_t *parent_gpu)
NV_STATUS uvm_parent_gpu_init_access_counters(uvm_parent_gpu_t *parent_gpu)
{
NV_STATUS status = NV_OK;
uvm_access_counter_buffer_info_t *access_counters = &parent_gpu->access_counter_buffer_info;
@@ -444,12 +445,12 @@ NV_STATUS uvm_gpu_init_access_counters(uvm_parent_gpu_t *parent_gpu)
return NV_OK;
fail:
uvm_gpu_deinit_access_counters(parent_gpu);
uvm_parent_gpu_deinit_access_counters(parent_gpu);
return status;
}
void uvm_gpu_deinit_access_counters(uvm_parent_gpu_t *parent_gpu)
void uvm_parent_gpu_deinit_access_counters(uvm_parent_gpu_t *parent_gpu)
{
uvm_access_counter_buffer_info_t *access_counters = &parent_gpu->access_counter_buffer_info;
uvm_access_counter_service_batch_context_t *batch_context = &access_counters->batch_service_context;
@@ -475,7 +476,7 @@ void uvm_gpu_deinit_access_counters(uvm_parent_gpu_t *parent_gpu)
batch_context->phys.translations = NULL;
}
bool uvm_gpu_access_counters_required(const uvm_parent_gpu_t *parent_gpu)
bool uvm_parent_gpu_access_counters_required(const uvm_parent_gpu_t *parent_gpu)
{
if (!parent_gpu->access_counters_supported)
return false;
@@ -518,7 +519,7 @@ static NV_STATUS access_counters_take_ownership(uvm_gpu_t *gpu, UvmGpuAccessCntr
// taken control of the notify buffer since the GPU was initialized. Then
// flush old notifications. This will update the cached_put pointer.
access_counters->cached_get = UVM_GPU_READ_ONCE(*access_counters->rm_info.pAccessCntrBufferGet);
access_counter_buffer_flush_locked(gpu, UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT);
access_counter_buffer_flush_locked(gpu->parent, UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT);
access_counters->current_config.threshold = config->threshold;
@@ -537,20 +538,20 @@ error:
// If ownership is yielded as part of reconfiguration, the access counters
// handling refcount may not be 0
static void access_counters_yield_ownership(uvm_gpu_t *gpu)
static void access_counters_yield_ownership(uvm_parent_gpu_t *parent_gpu)
{
NV_STATUS status;
uvm_access_counter_buffer_info_t *access_counters = &gpu->parent->access_counter_buffer_info;
uvm_access_counter_buffer_info_t *access_counters = &parent_gpu->access_counter_buffer_info;
UVM_ASSERT(gpu->parent->access_counters_supported);
UVM_ASSERT(uvm_sem_is_locked(&gpu->parent->isr.access_counters.service_lock));
UVM_ASSERT(parent_gpu->access_counters_supported);
UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.access_counters.service_lock));
// Wait for any pending clear operation before releasing ownership
status = uvm_tracker_wait(&access_counters->clear_tracker);
if (status != NV_OK)
UVM_ASSERT(status == uvm_global_get_status());
status = uvm_rm_locked_call(nvUvmInterfaceDisableAccessCntr(gpu->parent->rm_device,
status = uvm_rm_locked_call(nvUvmInterfaceDisableAccessCntr(parent_gpu->rm_device,
&access_counters->rm_info));
UVM_ASSERT(status == NV_OK);
}
@@ -579,14 +580,14 @@ static NV_STATUS gpu_access_counters_enable(uvm_gpu_t *gpu, UvmGpuAccessCntrConf
// Decrement the refcount of access counter enablement. If this is the last
// reference, disable the HW feature.
static void gpu_access_counters_disable(uvm_gpu_t *gpu)
static void parent_gpu_access_counters_disable(uvm_parent_gpu_t *parent_gpu)
{
UVM_ASSERT(uvm_sem_is_locked(&gpu->parent->isr.access_counters.service_lock));
UVM_ASSERT(gpu->parent->access_counters_supported);
UVM_ASSERT(gpu->parent->isr.access_counters.handling_ref_count > 0);
UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.access_counters.service_lock));
UVM_ASSERT(parent_gpu->access_counters_supported);
UVM_ASSERT(parent_gpu->isr.access_counters.handling_ref_count > 0);
if (--gpu->parent->isr.access_counters.handling_ref_count == 0)
access_counters_yield_ownership(gpu);
if (--parent_gpu->isr.access_counters.handling_ref_count == 0)
access_counters_yield_ownership(parent_gpu);
}
// Invoked during registration of the GPU in the VA space
@@ -598,7 +599,7 @@ NV_STATUS uvm_gpu_access_counters_enable(uvm_gpu_t *gpu, uvm_va_space_t *va_spac
uvm_parent_gpu_access_counters_isr_lock(gpu->parent);
if (uvm_processor_mask_test(&va_space->access_counters_enabled_processors, gpu->id)) {
if (uvm_parent_processor_mask_test(&va_space->access_counters_enabled_processors, gpu->parent->id)) {
status = NV_ERR_INVALID_DEVICE;
}
else {
@@ -616,7 +617,7 @@ NV_STATUS uvm_gpu_access_counters_enable(uvm_gpu_t *gpu, uvm_va_space_t *va_spac
// modified to protect from concurrent enablement of access counters in
// another GPU
if (status == NV_OK)
uvm_processor_mask_set_atomic(&va_space->access_counters_enabled_processors, gpu->id);
uvm_parent_processor_mask_set_atomic(&va_space->access_counters_enabled_processors, gpu->parent->id);
}
// If this is the first reference taken on access counters, dropping the
@@ -626,22 +627,24 @@ NV_STATUS uvm_gpu_access_counters_enable(uvm_gpu_t *gpu, uvm_va_space_t *va_spac
return status;
}
void uvm_gpu_access_counters_disable(uvm_gpu_t *gpu, uvm_va_space_t *va_space)
void uvm_parent_gpu_access_counters_disable(uvm_parent_gpu_t *parent_gpu,
uvm_va_space_t *va_space)
{
UVM_ASSERT(gpu->parent->access_counters_supported);
UVM_ASSERT(parent_gpu->access_counters_supported);
uvm_parent_gpu_access_counters_isr_lock(gpu->parent);
uvm_parent_gpu_access_counters_isr_lock(parent_gpu);
if (uvm_processor_mask_test_and_clear_atomic(&va_space->access_counters_enabled_processors, gpu->id)) {
gpu_access_counters_disable(gpu);
if (uvm_parent_processor_mask_test_and_clear_atomic(&va_space->access_counters_enabled_processors,
parent_gpu->id)) {
parent_gpu_access_counters_disable(parent_gpu);
        // If this is the VA space that reconfigured access counters, clear the
        // ownership to allow other processes to invoke the reconfiguration
if (gpu->parent->access_counter_buffer_info.reconfiguration_owner == va_space)
gpu->parent->access_counter_buffer_info.reconfiguration_owner = NULL;
if (parent_gpu->access_counter_buffer_info.reconfiguration_owner == va_space)
parent_gpu->access_counter_buffer_info.reconfiguration_owner = NULL;
}
uvm_parent_gpu_access_counters_isr_unlock(gpu->parent);
uvm_parent_gpu_access_counters_isr_unlock(parent_gpu);
}
static void write_get(uvm_parent_gpu_t *parent_gpu, NvU32 get)
@@ -660,15 +663,16 @@ static void write_get(uvm_parent_gpu_t *parent_gpu, NvU32 get)
UVM_GPU_WRITE_ONCE(*access_counters->rm_info.pAccessCntrBufferGet, get);
}
static void access_counter_buffer_flush_locked(uvm_gpu_t *gpu, uvm_gpu_buffer_flush_mode_t flush_mode)
static void access_counter_buffer_flush_locked(uvm_parent_gpu_t *parent_gpu,
uvm_gpu_buffer_flush_mode_t flush_mode)
{
NvU32 get;
NvU32 put;
uvm_spin_loop_t spin;
uvm_access_counter_buffer_info_t *access_counters = &gpu->parent->access_counter_buffer_info;
uvm_access_counter_buffer_info_t *access_counters = &parent_gpu->access_counter_buffer_info;
UVM_ASSERT(uvm_sem_is_locked(&gpu->parent->isr.access_counters.service_lock));
UVM_ASSERT(gpu->parent->access_counters_supported);
UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.access_counters.service_lock));
UVM_ASSERT(parent_gpu->access_counters_supported);
// Read PUT pointer from the GPU if requested
UVM_ASSERT(flush_mode != UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT);
@@ -680,28 +684,28 @@ static void access_counter_buffer_flush_locked(uvm_gpu_t *gpu, uvm_gpu_buffer_fl
while (get != put) {
// Wait until valid bit is set
UVM_SPIN_WHILE(!gpu->parent->access_counter_buffer_hal->entry_is_valid(gpu->parent, get), &spin);
UVM_SPIN_WHILE(!parent_gpu->access_counter_buffer_hal->entry_is_valid(parent_gpu, get), &spin);
gpu->parent->access_counter_buffer_hal->entry_clear_valid(gpu->parent, get);
parent_gpu->access_counter_buffer_hal->entry_clear_valid(parent_gpu, get);
++get;
if (get == access_counters->max_notifications)
get = 0;
}
write_get(gpu->parent, get);
write_get(parent_gpu, get);
}
void uvm_gpu_access_counter_buffer_flush(uvm_gpu_t *gpu)
void uvm_parent_gpu_access_counter_buffer_flush(uvm_parent_gpu_t *parent_gpu)
{
UVM_ASSERT(gpu->parent->access_counters_supported);
UVM_ASSERT(parent_gpu->access_counters_supported);
// Disables access counter interrupts and notification servicing
uvm_parent_gpu_access_counters_isr_lock(gpu->parent);
uvm_parent_gpu_access_counters_isr_lock(parent_gpu);
if (gpu->parent->isr.access_counters.handling_ref_count > 0)
access_counter_buffer_flush_locked(gpu, UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT);
if (parent_gpu->isr.access_counters.handling_ref_count > 0)
access_counter_buffer_flush_locked(parent_gpu, UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT);
uvm_parent_gpu_access_counters_isr_unlock(gpu->parent);
uvm_parent_gpu_access_counters_isr_unlock(parent_gpu);
}
static inline int cmp_access_counter_instance_ptr(const uvm_access_counter_buffer_entry_t *a,
@@ -1027,7 +1031,7 @@ static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
if (!iter.migratable)
continue;
thrashing_hint = uvm_perf_thrashing_get_hint(va_block, address, processor);
thrashing_hint = uvm_perf_thrashing_get_hint(va_block, service_context->block_context, address, processor);
if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_THROTTLE) {
// If the page is throttling, ignore the access counter
// notification
@@ -1212,7 +1216,8 @@ static NV_STATUS service_phys_single_va_block(uvm_gpu_t *gpu,
service_context->operation = UVM_SERVICE_OPERATION_ACCESS_COUNTERS;
service_context->num_retries = 0;
service_context->block_context->mm = mm;
uvm_va_block_context_init(service_context->block_context, mm);
if (uvm_va_block_is_hmm(va_block))
uvm_hmm_migrate_begin_wait(va_block);
@@ -1221,7 +1226,8 @@ static NV_STATUS service_phys_single_va_block(uvm_gpu_t *gpu,
reverse_mappings_to_va_block_page_mask(va_block, reverse_mappings, num_reverse_mappings, accessed_pages);
status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, &va_block_retry,
status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
&va_block_retry,
service_va_block_locked(processor,
va_block,
&va_block_retry,
@@ -1506,8 +1512,6 @@ static NV_STATUS service_notification_va_block_helper(struct mm_struct *mm,
service_context->operation = UVM_SERVICE_OPERATION_ACCESS_COUNTERS;
service_context->num_retries = 0;
uvm_va_block_context_init(service_context->block_context, mm);
return UVM_VA_BLOCK_RETRY_LOCKED(va_block,
&va_block_retry,
service_va_block_locked(processor,
@@ -1519,6 +1523,7 @@ static NV_STATUS service_notification_va_block_helper(struct mm_struct *mm,
static void expand_notification_block(uvm_gpu_va_space_t *gpu_va_space,
uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_page_mask_t *accessed_pages,
const uvm_access_counter_buffer_entry_t *current_entry)
{
@@ -1546,7 +1551,7 @@ static void expand_notification_block(uvm_gpu_va_space_t *gpu_va_space,
page_index = uvm_va_block_cpu_page_index(va_block, addr);
resident_id = uvm_va_block_page_get_closest_resident(va_block, page_index, gpu->id);
resident_id = uvm_va_block_page_get_closest_resident(va_block, va_block_context, page_index, gpu->id);
// resident_id might be invalid or might already be the same as the GPU
// which received the notification if the memory was already migrated before
@@ -1602,6 +1607,7 @@ static NV_STATUS service_virt_notifications_in_block(uvm_gpu_va_space_t *gpu_va_
uvm_va_space_t *va_space = gpu_va_space->va_space;
uvm_page_mask_t *accessed_pages = &batch_context->accessed_pages;
uvm_access_counter_buffer_entry_t **notifications = batch_context->virt.notifications;
uvm_service_block_context_t *service_context = &batch_context->block_service_context;
UVM_ASSERT(va_block);
UVM_ASSERT(index < batch_context->virt.num_notifications);
@@ -1610,16 +1616,24 @@ static NV_STATUS service_virt_notifications_in_block(uvm_gpu_va_space_t *gpu_va_
uvm_page_mask_zero(accessed_pages);
uvm_va_block_context_init(service_context->block_context, mm);
uvm_mutex_lock(&va_block->lock);
for (i = index; i < batch_context->virt.num_notifications; i++) {
uvm_access_counter_buffer_entry_t *current_entry = notifications[i];
NvU64 address = current_entry->address.address;
if ((current_entry->virtual_info.va_space == va_space) && (address <= va_block->end))
expand_notification_block(gpu_va_space, va_block, accessed_pages, current_entry);
else
if ((current_entry->virtual_info.va_space == va_space) && (address <= va_block->end)) {
expand_notification_block(gpu_va_space,
va_block,
batch_context->block_service_context.block_context,
accessed_pages,
current_entry);
}
else {
break;
}
}
*out_index = i;
@@ -1698,6 +1712,9 @@ static NV_STATUS service_virt_notification_ats(uvm_gpu_va_space_t *gpu_va_space,
    // At least one notification should have been processed.
UVM_ASSERT(index < *out_index);
// TODO: Bug 2113632: [UVM] Don't clear access counters when the preferred
// location is set
// If no pages were actually migrated, don't clear the access counters.
status = uvm_ats_service_access_counters(gpu_va_space, vma, base, ats_context);
if (status != NV_OK)
flags &= ~UVM_ACCESS_COUNTER_ACTION_CLEAR;
@@ -1985,7 +2002,7 @@ NV_STATUS uvm_test_access_counters_enabled_by_default(UVM_TEST_ACCESS_COUNTERS_E
if (!gpu)
return NV_ERR_INVALID_DEVICE;
params->enabled = uvm_gpu_access_counters_required(gpu->parent);
params->enabled = uvm_parent_gpu_access_counters_required(gpu->parent);
uvm_gpu_release(gpu);
@@ -2050,11 +2067,11 @@ NV_STATUS uvm_test_reconfigure_access_counters(UVM_TEST_RECONFIGURE_ACCESS_COUNT
goto exit_isr_unlock;
}
if (!uvm_processor_mask_test(&va_space->access_counters_enabled_processors, gpu->id)) {
if (!uvm_parent_processor_mask_test(&va_space->access_counters_enabled_processors, gpu->parent->id)) {
status = gpu_access_counters_enable(gpu, &config);
if (status == NV_OK)
uvm_processor_mask_set_atomic(&va_space->access_counters_enabled_processors, gpu->id);
uvm_parent_processor_mask_set_atomic(&va_space->access_counters_enabled_processors, gpu->parent->id);
else
goto exit_isr_unlock;
}
@@ -2066,7 +2083,7 @@ NV_STATUS uvm_test_reconfigure_access_counters(UVM_TEST_RECONFIGURE_ACCESS_COUNT
    // enabled in at least this GPU. This inconsistent state is not visible to other
// threads or VA spaces because of the ISR lock, and it is immediately
// rectified by retaking ownership.
access_counters_yield_ownership(gpu);
access_counters_yield_ownership(gpu->parent);
status = access_counters_take_ownership(gpu, &config);
// Retaking ownership failed, so RM owns the interrupt.
@@ -2080,8 +2097,8 @@ NV_STATUS uvm_test_reconfigure_access_counters(UVM_TEST_RECONFIGURE_ACCESS_COUNT
"Access counters interrupt still owned by RM, other VA spaces may experience failures");
}
uvm_processor_mask_clear_atomic(&va_space->access_counters_enabled_processors, gpu->id);
gpu_access_counters_disable(gpu);
uvm_parent_processor_mask_clear_atomic(&va_space->access_counters_enabled_processors, gpu->parent->id);
parent_gpu_access_counters_disable(gpu->parent);
goto exit_isr_unlock;
}
@@ -2167,42 +2184,42 @@ exit_release_gpu:
return status;
}
void uvm_gpu_access_counters_set_ignore(uvm_gpu_t *gpu, bool do_ignore)
void uvm_parent_gpu_access_counters_set_ignore(uvm_parent_gpu_t *parent_gpu, bool do_ignore)
{
bool change_intr_state = false;
if (!gpu->parent->access_counters_supported)
if (!parent_gpu->access_counters_supported)
return;
uvm_parent_gpu_access_counters_isr_lock(gpu->parent);
uvm_parent_gpu_access_counters_isr_lock(parent_gpu);
if (do_ignore) {
if (gpu->parent->access_counter_buffer_info.notifications_ignored_count++ == 0)
if (parent_gpu->access_counter_buffer_info.notifications_ignored_count++ == 0)
change_intr_state = true;
}
else {
UVM_ASSERT(gpu->parent->access_counter_buffer_info.notifications_ignored_count >= 1);
if (--gpu->parent->access_counter_buffer_info.notifications_ignored_count == 0)
UVM_ASSERT(parent_gpu->access_counter_buffer_info.notifications_ignored_count >= 1);
if (--parent_gpu->access_counter_buffer_info.notifications_ignored_count == 0)
change_intr_state = true;
}
if (change_intr_state) {
// We need to avoid an interrupt storm while ignoring notifications. We
// just disable the interrupt.
uvm_spin_lock_irqsave(&gpu->parent->isr.interrupts_lock);
uvm_spin_lock_irqsave(&parent_gpu->isr.interrupts_lock);
if (do_ignore)
uvm_parent_gpu_access_counters_intr_disable(gpu->parent);
uvm_parent_gpu_access_counters_intr_disable(parent_gpu);
else
uvm_parent_gpu_access_counters_intr_enable(gpu->parent);
uvm_parent_gpu_access_counters_intr_enable(parent_gpu);
uvm_spin_unlock_irqrestore(&gpu->parent->isr.interrupts_lock);
uvm_spin_unlock_irqrestore(&parent_gpu->isr.interrupts_lock);
if (!do_ignore)
access_counter_buffer_flush_locked(gpu, UVM_GPU_BUFFER_FLUSH_MODE_CACHED_PUT);
access_counter_buffer_flush_locked(parent_gpu, UVM_GPU_BUFFER_FLUSH_MODE_CACHED_PUT);
}
uvm_parent_gpu_access_counters_isr_unlock(gpu->parent);
uvm_parent_gpu_access_counters_isr_unlock(parent_gpu);
}
NV_STATUS uvm_test_set_ignore_access_counters(UVM_TEST_SET_IGNORE_ACCESS_COUNTERS_PARAMS *params, struct file *filp)
@@ -2216,7 +2233,7 @@ NV_STATUS uvm_test_set_ignore_access_counters(UVM_TEST_SET_IGNORE_ACCESS_COUNTER
return NV_ERR_INVALID_DEVICE;
if (gpu->parent->access_counters_supported)
uvm_gpu_access_counters_set_ignore(gpu, params->ignore);
uvm_parent_gpu_access_counters_set_ignore(gpu->parent, params->ignore);
else
status = NV_ERR_NOT_SUPPORTED;

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017 NVIDIA Corporation
Copyright (c) 2017-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -27,13 +27,13 @@
#include "uvm_forward_decl.h"
#include "uvm_test_ioctl.h"
NV_STATUS uvm_gpu_init_access_counters(uvm_parent_gpu_t *parent_gpu);
void uvm_gpu_deinit_access_counters(uvm_parent_gpu_t *parent_gpu);
bool uvm_gpu_access_counters_pending(uvm_parent_gpu_t *parent_gpu);
NV_STATUS uvm_parent_gpu_init_access_counters(uvm_parent_gpu_t *parent_gpu);
void uvm_parent_gpu_deinit_access_counters(uvm_parent_gpu_t *parent_gpu);
bool uvm_parent_gpu_access_counters_pending(uvm_parent_gpu_t *parent_gpu);
void uvm_gpu_service_access_counters(uvm_gpu_t *gpu);
void uvm_gpu_access_counter_buffer_flush(uvm_gpu_t *gpu);
void uvm_parent_gpu_access_counter_buffer_flush(uvm_parent_gpu_t *parent_gpu);
// Ignore or unignore access counters notifications. Ignoring means that the
// bottom half is a no-op which just leaves notifications in the HW buffer
@@ -46,7 +46,7 @@ void uvm_gpu_access_counter_buffer_flush(uvm_gpu_t *gpu);
//
// When unignoring, the interrupt conditions will be re-evaluated to trigger
// processing of buffered notifications, if any exist.
void uvm_gpu_access_counters_set_ignore(uvm_gpu_t *gpu, bool do_ignore);
void uvm_parent_gpu_access_counters_set_ignore(uvm_parent_gpu_t *parent_gpu, bool do_ignore);
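As a hedged illustration of the ignore/unignore contract described above (this snippet is not part of the change, and the helper name is an assumption), a caller that must keep notifications buffered around some operation would bracket it like this:

static void do_work_with_notifications_held(uvm_parent_gpu_t *parent_gpu)
{
    // The bottom half becomes a no-op; notifications stay in the HW buffer.
    uvm_parent_gpu_access_counters_set_ignore(parent_gpu, true);

    // ... operation that must not race with notification servicing ...

    // Re-evaluates the interrupt conditions so that any buffered
    // notifications get handled again.
    uvm_parent_gpu_access_counters_set_ignore(parent_gpu, false);
}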
// Return whether the VA space has access counter migrations enabled. The
// caller must ensure that the VA space cannot go away.
@@ -63,7 +63,7 @@ void uvm_perf_access_counters_unload(uvm_va_space_t *va_space);
// Check whether access counters should be enabled when the given GPU is
// registered on any VA space.
bool uvm_gpu_access_counters_required(const uvm_parent_gpu_t *parent_gpu);
bool uvm_parent_gpu_access_counters_required(const uvm_parent_gpu_t *parent_gpu);
// Functions used to enable/disable access counters on a GPU in the given VA
// space.
@@ -72,12 +72,12 @@ bool uvm_gpu_access_counters_required(const uvm_parent_gpu_t *parent_gpu);
// counters are currently enabled. The hardware notifications and interrupts on
// the GPU are enabled the first time any VA space invokes
// uvm_gpu_access_counters_enable, and disabled when the last VA space invokes
// uvm_gpu_access_counters_disable
// uvm_parent_gpu_access_counters_disable().
//
// Locking: the VA space lock must not be held by the caller since these
// functions may take the access counters ISR lock.
NV_STATUS uvm_gpu_access_counters_enable(uvm_gpu_t *gpu, uvm_va_space_t *va_space);
void uvm_gpu_access_counters_disable(uvm_gpu_t *gpu, uvm_va_space_t *va_space);
void uvm_parent_gpu_access_counters_disable(uvm_parent_gpu_t *parent_gpu, uvm_va_space_t *va_space);
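A minimal sketch of the per-VA-space pairing implied by the refcounting comment above; the wrapper names are assumptions and the real registration/unregistration paths also handle errors and the ISR-lock rules noted above:

// Assumed wrappers, for illustration only.
static NV_STATUS example_register(uvm_gpu_t *gpu, uvm_va_space_t *va_space)
{
    // The first enablement across all VA spaces takes HW ownership of the
    // notification buffer; later calls only bump the refcount.
    return uvm_gpu_access_counters_enable(gpu, va_space);
}

static void example_unregister(uvm_gpu_t *gpu, uvm_va_space_t *va_space)
{
    // The last disablement across all VA spaces yields ownership back to RM.
    uvm_parent_gpu_access_counters_disable(gpu->parent, va_space);
}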
NV_STATUS uvm_test_access_counters_enabled_by_default(UVM_TEST_ACCESS_COUNTERS_ENABLED_BY_DEFAULT_PARAMS *params,
struct file *filp);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -100,7 +100,7 @@ static unsigned schedule_replayable_faults_handler(uvm_parent_gpu_t *parent_gpu)
if (down_trylock(&parent_gpu->isr.replayable_faults.service_lock.sem) != 0)
return 0;
if (!uvm_gpu_replayable_faults_pending(parent_gpu)) {
if (!uvm_parent_gpu_replayable_faults_pending(parent_gpu)) {
up(&parent_gpu->isr.replayable_faults.service_lock.sem);
return 0;
}
@@ -137,7 +137,7 @@ static unsigned schedule_non_replayable_faults_handler(uvm_parent_gpu_t *parent_
// interrupts will be triggered by the gpu and faults may stay
// unserviced. Therefore, if there is a fault in the queue, we schedule
// a bottom half unconditionally.
if (!uvm_gpu_non_replayable_faults_pending(parent_gpu))
if (!uvm_parent_gpu_non_replayable_faults_pending(parent_gpu))
return 0;
nv_kref_get(&parent_gpu->gpu_kref);
@@ -167,7 +167,7 @@ static unsigned schedule_access_counters_handler(uvm_parent_gpu_t *parent_gpu)
if (down_trylock(&parent_gpu->isr.access_counters.service_lock.sem) != 0)
return 0;
if (!uvm_gpu_access_counters_pending(parent_gpu)) {
if (!uvm_parent_gpu_access_counters_pending(parent_gpu)) {
up(&parent_gpu->isr.access_counters.service_lock.sem);
return 0;
}
@@ -295,7 +295,7 @@ NV_STATUS uvm_parent_gpu_init_isr(uvm_parent_gpu_t *parent_gpu)
uvm_va_block_context_t *block_context;
if (parent_gpu->replayable_faults_supported) {
status = uvm_gpu_fault_buffer_init(parent_gpu);
status = uvm_parent_gpu_fault_buffer_init(parent_gpu);
if (status != NV_OK) {
UVM_ERR_PRINT("Failed to initialize GPU fault buffer: %s, GPU: %s\n",
nvstatusToString(status),
@@ -361,7 +361,7 @@ NV_STATUS uvm_parent_gpu_init_isr(uvm_parent_gpu_t *parent_gpu)
}
if (parent_gpu->access_counters_supported) {
status = uvm_gpu_init_access_counters(parent_gpu);
status = uvm_parent_gpu_init_access_counters(parent_gpu);
if (status != NV_OK) {
UVM_ERR_PRINT("Failed to initialize GPU access counters: %s, GPU: %s\n",
nvstatusToString(status),
@@ -423,7 +423,7 @@ void uvm_parent_gpu_disable_isr(uvm_parent_gpu_t *parent_gpu)
// bottom half never take the global lock, since we're holding it here.
//
// Note that it's safe to call nv_kthread_q_stop() even if
// nv_kthread_q_init() failed in uvm_gpu_init_isr().
// nv_kthread_q_init() failed in uvm_parent_gpu_init_isr().
nv_kthread_q_stop(&parent_gpu->isr.bottom_half_q);
nv_kthread_q_stop(&parent_gpu->isr.kill_channel_q);
}
@@ -438,8 +438,8 @@ void uvm_parent_gpu_deinit_isr(uvm_parent_gpu_t *parent_gpu)
// replayable_faults.disable_intr_ref_count since they must retain the
// GPU across uvm_parent_gpu_replayable_faults_isr_lock/
// uvm_parent_gpu_replayable_faults_isr_unlock. This means the
// uvm_gpu_replayable_faults_disable_intr above could only have raced
// with bottom halves.
// uvm_parent_gpu_replayable_faults_disable_intr above could only have
// raced with bottom halves.
//
// If we cleared replayable_faults.handling before the bottom half got
// to its uvm_parent_gpu_replayable_faults_isr_unlock, when it
@@ -455,13 +455,13 @@ void uvm_parent_gpu_deinit_isr(uvm_parent_gpu_t *parent_gpu)
uvm_parent_gpu_name(parent_gpu),
parent_gpu->isr.replayable_faults.disable_intr_ref_count);
uvm_gpu_fault_buffer_deinit(parent_gpu);
uvm_parent_gpu_fault_buffer_deinit(parent_gpu);
}
if (parent_gpu->access_counters_supported) {
// It is safe to deinitialize access counters even if they have not been
// successfully initialized.
uvm_gpu_deinit_access_counters(parent_gpu);
uvm_parent_gpu_deinit_access_counters(parent_gpu);
block_context =
parent_gpu->access_counter_buffer_info.batch_service_context.block_service_context.block_context;
uvm_va_block_context_free(block_context);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2023 NVIDIA Corporation
Copyright (c) 2017-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -116,8 +116,8 @@
// There is no error handling in this function. The caller is in charge of
// calling uvm_gpu_fault_buffer_deinit_non_replayable_faults on failure.
NV_STATUS uvm_gpu_fault_buffer_init_non_replayable_faults(uvm_parent_gpu_t *parent_gpu)
// calling uvm_parent_gpu_fault_buffer_deinit_non_replayable_faults on failure.
NV_STATUS uvm_parent_gpu_fault_buffer_init_non_replayable_faults(uvm_parent_gpu_t *parent_gpu)
{
uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &parent_gpu->fault_buffer_info.non_replayable;
@@ -145,7 +145,7 @@ NV_STATUS uvm_gpu_fault_buffer_init_non_replayable_faults(uvm_parent_gpu_t *pare
return NV_OK;
}
void uvm_gpu_fault_buffer_deinit_non_replayable_faults(uvm_parent_gpu_t *parent_gpu)
void uvm_parent_gpu_fault_buffer_deinit_non_replayable_faults(uvm_parent_gpu_t *parent_gpu)
{
uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &parent_gpu->fault_buffer_info.non_replayable;
@@ -163,7 +163,7 @@ void uvm_gpu_fault_buffer_deinit_non_replayable_faults(uvm_parent_gpu_t *parent_
non_replayable_faults->fault_cache = NULL;
}
bool uvm_gpu_non_replayable_faults_pending(uvm_parent_gpu_t *parent_gpu)
bool uvm_parent_gpu_non_replayable_faults_pending(uvm_parent_gpu_t *parent_gpu)
{
NV_STATUS status;
NvBool has_pending_faults;

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017 NVIDIA Corporation
Copyright (c) 2017-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -26,12 +26,12 @@
#include <nvstatus.h>
#include "uvm_forward_decl.h"
bool uvm_gpu_non_replayable_faults_pending(uvm_parent_gpu_t *parent_gpu);
bool uvm_parent_gpu_non_replayable_faults_pending(uvm_parent_gpu_t *parent_gpu);
void uvm_gpu_service_non_replayable_fault_buffer(uvm_gpu_t *gpu);
NV_STATUS uvm_gpu_fault_buffer_init_non_replayable_faults(uvm_parent_gpu_t *parent_gpu);
NV_STATUS uvm_parent_gpu_fault_buffer_init_non_replayable_faults(uvm_parent_gpu_t *parent_gpu);
void uvm_gpu_fault_buffer_deinit_non_replayable_faults(uvm_parent_gpu_t *parent_gpu);
void uvm_parent_gpu_fault_buffer_deinit_non_replayable_faults(uvm_parent_gpu_t *parent_gpu);
#endif // __UVM_GPU_NON_REPLAYABLE_FAULTS_H__

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -44,6 +44,24 @@
// provides some background for understanding replayable faults, non-replayable
// faults, and how UVM services each fault type.
// The HW fault buffer flush mode instructs RM on how to flush the hardware
// replayable fault buffer; it is only used in Confidential Computing.
//
// Unless HW_FAULT_BUFFER_FLUSH_MODE_MOVE is functionally required (because UVM
// needs to inspect the faults currently present in the HW fault buffer) it is
// recommended to use HW_FAULT_BUFFER_FLUSH_MODE_DISCARD for performance
// reasons.
typedef enum
{
// Flush the HW fault buffer, discarding all the resulting faults. UVM never
// gets to see these faults.
HW_FAULT_BUFFER_FLUSH_MODE_DISCARD,
// Flush the HW fault buffer, and move all the resulting faults to the SW
// fault ("shadow") buffer.
HW_FAULT_BUFFER_FLUSH_MODE_MOVE,
} hw_fault_buffer_flush_mode_t;
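For illustration only (not part of this change), a caller would pick the mode based on whether it needs to inspect the flushed faults afterwards; the wrapper name is an assumption and the required locking is omitted:

static NV_STATUS example_flush_hw_buffer(uvm_parent_gpu_t *parent_gpu, bool need_faults_in_shadow_buffer)
{
    // MOVE is only needed when UVM will look at the flushed faults via the
    // shadow buffer; otherwise DISCARD avoids the extra copy.
    hw_fault_buffer_flush_mode_t mode = need_faults_in_shadow_buffer ?
                                        HW_FAULT_BUFFER_FLUSH_MODE_MOVE :
                                        HW_FAULT_BUFFER_FLUSH_MODE_DISCARD;

    return hw_fault_buffer_flush_locked(parent_gpu, mode);
}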
#define UVM_PERF_REENABLE_PREFETCH_FAULTS_LAPSE_MSEC_DEFAULT 1000
// Lapse of time in milliseconds after which prefetch faults can be re-enabled.
@@ -226,7 +244,7 @@ static void fault_buffer_deinit_replayable_faults(uvm_parent_gpu_t *parent_gpu)
batch_context->utlbs = NULL;
}
NV_STATUS uvm_gpu_fault_buffer_init(uvm_parent_gpu_t *parent_gpu)
NV_STATUS uvm_parent_gpu_fault_buffer_init(uvm_parent_gpu_t *parent_gpu)
{
NV_STATUS status = NV_OK;
@@ -253,7 +271,7 @@ NV_STATUS uvm_gpu_fault_buffer_init(uvm_parent_gpu_t *parent_gpu)
goto fail;
if (parent_gpu->non_replayable_faults_supported) {
status = uvm_gpu_fault_buffer_init_non_replayable_faults(parent_gpu);
status = uvm_parent_gpu_fault_buffer_init_non_replayable_faults(parent_gpu);
if (status != NV_OK)
goto fail;
}
@@ -261,28 +279,28 @@ NV_STATUS uvm_gpu_fault_buffer_init(uvm_parent_gpu_t *parent_gpu)
return NV_OK;
fail:
uvm_gpu_fault_buffer_deinit(parent_gpu);
uvm_parent_gpu_fault_buffer_deinit(parent_gpu);
return status;
}
// Reinitialize state relevant to replayable fault handling after returning
// from a power management cycle.
void uvm_gpu_fault_buffer_resume(uvm_parent_gpu_t *parent_gpu)
void uvm_parent_gpu_fault_buffer_resume(uvm_parent_gpu_t *parent_gpu)
{
UVM_ASSERT(parent_gpu->replayable_faults_supported);
fault_buffer_reinit_replayable_faults(parent_gpu);
}
void uvm_gpu_fault_buffer_deinit(uvm_parent_gpu_t *parent_gpu)
void uvm_parent_gpu_fault_buffer_deinit(uvm_parent_gpu_t *parent_gpu)
{
NV_STATUS status = NV_OK;
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
if (parent_gpu->non_replayable_faults_supported)
uvm_gpu_fault_buffer_deinit_non_replayable_faults(parent_gpu);
uvm_parent_gpu_fault_buffer_deinit_non_replayable_faults(parent_gpu);
fault_buffer_deinit_replayable_faults(parent_gpu);
@@ -297,7 +315,7 @@ void uvm_gpu_fault_buffer_deinit(uvm_parent_gpu_t *parent_gpu)
}
}
bool uvm_gpu_replayable_faults_pending(uvm_parent_gpu_t *parent_gpu)
bool uvm_parent_gpu_replayable_faults_pending(uvm_parent_gpu_t *parent_gpu)
{
uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;
@@ -533,25 +551,26 @@ static void write_get(uvm_parent_gpu_t *parent_gpu, NvU32 get)
parent_gpu->fault_buffer_hal->write_get(parent_gpu, get);
}
static NV_STATUS hw_fault_buffer_flush_locked(uvm_parent_gpu_t *parent_gpu)
// In Confidential Computing GSP-RM owns the HW replayable fault buffer.
// Flushing the fault buffer implies flushing both the HW buffer (using a RM
// API), and the SW buffer accessible by UVM ("shadow" buffer).
//
// The HW buffer needs to be flushed first. This is because, once that flush
// completes, any faults that were present in the HW buffer have been moved to
// the shadow buffer, or have been discarded by RM.
static NV_STATUS hw_fault_buffer_flush_locked(uvm_parent_gpu_t *parent_gpu, hw_fault_buffer_flush_mode_t flush_mode)
{
NV_STATUS status = NV_OK;
NV_STATUS status;
NvBool is_flush_mode_move;
UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.replayable_faults.service_lock));
UVM_ASSERT((flush_mode == HW_FAULT_BUFFER_FLUSH_MODE_MOVE) || (flush_mode == HW_FAULT_BUFFER_FLUSH_MODE_DISCARD));
// When Confidential Computing is enabled, GSP-RM owns the HW replayable
// fault buffer. Flushing the fault buffer implies flushing both the HW
// buffer (using a RM API), and the SW buffer accessible by UVM ("shadow"
// buffer).
//
// The HW buffer needs to be flushed first. This is because, once that
// flush completes, any faults that were present in the HW buffer when
// fault_buffer_flush_locked is called, are now either flushed from the HW
// buffer, or are present in the shadow buffer and are about to be discarded
// too.
if (!g_uvm_global.conf_computing_enabled)
return NV_OK;
// Flush the HW replayable buffer owned by GSP-RM.
status = nvUvmInterfaceFlushReplayableFaultBuffer(parent_gpu->rm_device);
is_flush_mode_move = (NvBool) (flush_mode == HW_FAULT_BUFFER_FLUSH_MODE_MOVE);
status = nvUvmInterfaceFlushReplayableFaultBuffer(&parent_gpu->fault_buffer_info.rm_info, is_flush_mode_move);
UVM_ASSERT(status == NV_OK);
@@ -595,10 +614,9 @@ static NV_STATUS fault_buffer_flush_locked(uvm_gpu_t *gpu,
// Read PUT pointer from the GPU if requested
if (flush_mode == UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT || flush_mode == UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT) {
status = hw_fault_buffer_flush_locked(parent_gpu);
status = hw_fault_buffer_flush_locked(parent_gpu, HW_FAULT_BUFFER_FLUSH_MODE_DISCARD);
if (status != NV_OK)
return status;
replayable_faults->cached_put = parent_gpu->fault_buffer_hal->read_put(parent_gpu);
}
@@ -1435,7 +1453,10 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
uvm_fault_access_type_to_prot(service_access_type)))
continue;
thrashing_hint = uvm_perf_thrashing_get_hint(va_block, current_entry->fault_address, gpu->id);
thrashing_hint = uvm_perf_thrashing_get_hint(va_block,
block_context->block_context,
current_entry->fault_address,
gpu->id);
if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_THROTTLE) {
// Throttling is implemented by sleeping in the fault handler on
// the CPU and by continuing to process faults on other pages on
@@ -1981,7 +2002,7 @@ static NV_STATUS service_fault_batch_for_cancel(uvm_gpu_t *gpu, uvm_fault_servic
// in the HW buffer. When GSP owns the HW buffer, we also have to wait for
// GSP to copy all available faults from the HW buffer into the shadow
// buffer.
status = hw_fault_buffer_flush_locked(gpu->parent);
status = hw_fault_buffer_flush_locked(gpu->parent, HW_FAULT_BUFFER_FLUSH_MODE_MOVE);
if (status != NV_OK)
goto done;
@@ -2738,14 +2759,14 @@ static void enable_disable_prefetch_faults(uvm_parent_gpu_t *parent_gpu, uvm_fau
(uvm_enable_builtin_tests &&
parent_gpu->rm_info.isSimulated &&
batch_context->num_invalid_prefetch_faults > 5))) {
uvm_gpu_disable_prefetch_faults(parent_gpu);
uvm_parent_gpu_disable_prefetch_faults(parent_gpu);
}
else if (!parent_gpu->fault_buffer_info.prefetch_faults_enabled) {
NvU64 lapse = NV_GETTIME() - parent_gpu->fault_buffer_info.disable_prefetch_faults_timestamp;
// Reenable prefetch faults after some time
if (lapse > ((NvU64)uvm_perf_reenable_prefetch_faults_lapse_msec * (1000 * 1000)))
uvm_gpu_enable_prefetch_faults(parent_gpu);
uvm_parent_gpu_enable_prefetch_faults(parent_gpu);
}
}
@@ -2872,7 +2893,7 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
UVM_DBG_PRINT("Error servicing replayable faults on GPU: %s\n", uvm_gpu_name(gpu));
}
void uvm_gpu_enable_prefetch_faults(uvm_parent_gpu_t *parent_gpu)
void uvm_parent_gpu_enable_prefetch_faults(uvm_parent_gpu_t *parent_gpu)
{
UVM_ASSERT(parent_gpu->isr.replayable_faults.handling);
UVM_ASSERT(parent_gpu->prefetch_fault_supported);
@@ -2883,7 +2904,7 @@ void uvm_gpu_enable_prefetch_faults(uvm_parent_gpu_t *parent_gpu)
}
}
void uvm_gpu_disable_prefetch_faults(uvm_parent_gpu_t *parent_gpu)
void uvm_parent_gpu_disable_prefetch_faults(uvm_parent_gpu_t *parent_gpu)
{
UVM_ASSERT(parent_gpu->isr.replayable_faults.handling);
UVM_ASSERT(parent_gpu->prefetch_fault_supported);
@@ -2940,7 +2961,7 @@ NV_STATUS uvm_test_drain_replayable_faults(UVM_TEST_DRAIN_REPLAYABLE_FAULTS_PARA
do {
uvm_parent_gpu_replayable_faults_isr_lock(gpu->parent);
pending = uvm_gpu_replayable_faults_pending(gpu->parent);
pending = uvm_parent_gpu_replayable_faults_pending(gpu->parent);
uvm_parent_gpu_replayable_faults_isr_unlock(gpu->parent);
if (!pending)

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -52,12 +52,12 @@ typedef enum
const char *uvm_perf_fault_replay_policy_string(uvm_perf_fault_replay_policy_t fault_replay);
NV_STATUS uvm_gpu_fault_buffer_init(uvm_parent_gpu_t *parent_gpu);
void uvm_gpu_fault_buffer_deinit(uvm_parent_gpu_t *parent_gpu);
NV_STATUS uvm_parent_gpu_fault_buffer_init(uvm_parent_gpu_t *parent_gpu);
void uvm_parent_gpu_fault_buffer_deinit(uvm_parent_gpu_t *parent_gpu);
void uvm_gpu_fault_buffer_resume(uvm_parent_gpu_t *parent_gpu);
void uvm_parent_gpu_fault_buffer_resume(uvm_parent_gpu_t *parent_gpu);
bool uvm_gpu_replayable_faults_pending(uvm_parent_gpu_t *parent_gpu);
bool uvm_parent_gpu_replayable_faults_pending(uvm_parent_gpu_t *parent_gpu);
// Clear valid bit for all remaining unserviced faults in the buffer, set GET to
// PUT, and push a fault replay of type UVM_FAULT_REPLAY_TYPE_START. It does not
@@ -68,8 +68,8 @@ bool uvm_gpu_replayable_faults_pending(uvm_parent_gpu_t *parent_gpu);
NV_STATUS uvm_gpu_fault_buffer_flush(uvm_gpu_t *gpu);
// Enable/disable HW support for prefetch-initiated faults
void uvm_gpu_enable_prefetch_faults(uvm_parent_gpu_t *parent_gpu);
void uvm_gpu_disable_prefetch_faults(uvm_parent_gpu_t *parent_gpu);
void uvm_parent_gpu_enable_prefetch_faults(uvm_parent_gpu_t *parent_gpu);
void uvm_parent_gpu_disable_prefetch_faults(uvm_parent_gpu_t *parent_gpu);
// Service pending replayable faults on the given GPU. This function must be
// only called from the ISR bottom half

View File

@@ -1306,7 +1306,7 @@ void uvm_hmm_block_add_eviction_mappings(uvm_va_space_t *va_space,
uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
uvm_va_policy_node_t *node;
uvm_va_block_region_t region;
uvm_processor_mask_t map_processors;
uvm_processor_mask_t *map_processors = &block_context->hmm.map_processors_eviction;
uvm_processor_id_t id;
NV_STATUS tracker_status;
NV_STATUS status = NV_OK;
@@ -1333,9 +1333,9 @@ void uvm_hmm_block_add_eviction_mappings(uvm_va_space_t *va_space,
// Exclude the processors that have been already mapped due to
// AccessedBy.
uvm_processor_mask_andnot(&map_processors, &va_block->evicted_gpus, &node->policy.accessed_by);
uvm_processor_mask_andnot(map_processors, &va_block->evicted_gpus, &node->policy.accessed_by);
for_each_gpu_id_in_mask(id, &map_processors) {
for_each_gpu_id_in_mask(id, map_processors) {
uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
uvm_va_block_gpu_state_t *gpu_state;
@@ -1866,7 +1866,7 @@ static void lock_block_cpu_page(uvm_va_block_t *va_block,
unsigned long *dst_pfns,
uvm_page_mask_t *same_devmem_page_mask)
{
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(src_page), page_index);
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_any_chunk_for_page(va_block, page_index);
uvm_va_block_region_t chunk_region;
struct page *dst_page;
@@ -2708,7 +2708,9 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
// Since there is a CPU resident page, there shouldn't be one
// anywhere else. TODO: Bug 3660922: Need to handle read
// duplication at some point.
UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block, page_index));
UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block,
service_context->block_context,
page_index));
// migrate_vma_setup() was able to isolate and lock the page;
// therefore, it is CPU resident and not mapped.
@@ -2725,8 +2727,9 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
// used for GPU to GPU copies. It can't be an evicted page because
// migrate_vma_setup() would have found a source page.
if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block, page_index));
UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block,
service_context->block_context,
page_index));
hmm_va_block_cpu_page_unpopulate(va_block, page_index, NULL);
}
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2013-2019 NVidia Corporation
Copyright (c) 2013-2023 NVidia Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -320,7 +320,7 @@ typedef struct
typedef struct
{
NvProcessorUuid gpuUuidArray[UVM_MAX_GPUS]; // IN
NvProcessorUuid gpuUuidArray[UVM_MAX_GPUS_V1]; // IN
NvU32 numGpus; // IN
NvU64 serverId NV_ALIGN_BYTES(8); // OUT
NV_STATUS rmStatus; // OUT
@@ -344,9 +344,9 @@ typedef struct
typedef struct
{
NvProcessorUuid gpuUuidArray[UVM_MAX_GPUS]; // OUT
NvU32 validCount; // OUT
NV_STATUS rmStatus; // OUT
NvProcessorUuid gpuUuidArray[UVM_MAX_GPUS_V1]; // OUT
NvU32 validCount; // OUT
NV_STATUS rmStatus; // OUT
} UVM_GET_GPU_UUID_TABLE_PARAMS;
#if defined(WIN32) || defined(WIN64)
@@ -494,7 +494,7 @@ typedef struct
NvU64 base NV_ALIGN_BYTES(8); // IN
NvU64 length NV_ALIGN_BYTES(8); // IN
NvU64 offset NV_ALIGN_BYTES(8); // IN
UvmGpuMappingAttributes perGpuAttributes[UVM_MAX_GPUS]; // IN
UvmGpuMappingAttributes perGpuAttributes[UVM_MAX_GPUS_V2]; // IN
NvU64 gpuAttributesCount NV_ALIGN_BYTES(8); // IN
NvS32 rmCtrlFd; // IN
NvU32 hClient; // IN
@@ -552,7 +552,7 @@ typedef struct
typedef struct
{
NvProcessorUuid gpu_uuid; // IN
NvProcessorUuid gpu_uuid; // IN/OUT
NvBool numaEnabled; // OUT
NvS32 numaNodeId; // OUT
NvS32 rmCtrlFd; // IN
@@ -835,7 +835,14 @@ typedef struct
//
// Initialize any tracker object such as a queue or counter
// UvmToolsCreateEventQueue, UvmToolsCreateProcessAggregateCounters, UvmToolsCreateProcessorCounters
// UvmToolsCreateEventQueue, UvmToolsCreateProcessAggregateCounters,
// UvmToolsCreateProcessorCounters.
// Note that the order of structure elements has the version as the last field.
// This is used to tell whether the kernel supports V2 events or not because
// the V1 UVM_TOOLS_INIT_EVENT_TRACKER ioctl would not read or update that
// field but V2 will. This is needed because it is possible to create an event
// queue before CUDA is initialized which means UvmSetDriverVersion() hasn't
// been called yet and the kernel version is unknown.
//
#define UVM_TOOLS_INIT_EVENT_TRACKER UVM_IOCTL_BASE(56)
typedef struct
@@ -847,6 +854,8 @@ typedef struct
NvU32 allProcessors; // IN
NvU32 uvmFd; // IN
NV_STATUS rmStatus; // OUT
NvU32 requestedVersion; // IN
NvU32 grantedVersion; // OUT
} UVM_TOOLS_INIT_EVENT_TRACKER_PARAMS;
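A hedged user-space sketch of the version negotiation described above. The header name, the file-descriptor and queue-buffer setup, the exact version constant, and the rmStatus handling are assumptions and simplifications, not part of this change:

#include <string.h>
#include <sys/ioctl.h>
#include "uvm_ioctl.h"   // header name assumed

// Returns the event entry version granted by the kernel, or 0 if the kernel
// predates the version fields (i.e. only V1 entries are available).
static NvU32 query_event_entry_version(int uvm_tools_fd)
{
    UVM_TOOLS_INIT_EVENT_TRACKER_PARAMS params;

    memset(&params, 0, sizeof(params));
    // Queue/control buffer and processor fields would be filled in here.
    params.requestedVersion = 2;    // V2; the exact constant name is assumed

    // An older kernel never reads or writes the trailing version fields, so
    // grantedVersion stays 0 and the caller falls back to V1 event entries.
    if (ioctl(uvm_tools_fd, UVM_TOOLS_INIT_EVENT_TRACKER, &params) != 0)
        return 0;

    return params.grantedVersion;
}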
//
@@ -927,6 +936,12 @@ typedef struct
//
// UvmToolsGetProcessorUuidTable
// Note that tablePtr != 0 and count == 0 means that tablePtr is assumed to be
// an array of size UVM_MAX_PROCESSORS_V1 and that only UvmEventEntry_V1
// processor IDs (physical GPU UUIDs) will be reported.
// tablePtr == 0 and count == 0 can be used to query how many processors are
// present in order to dynamically allocate the correct size array since the
// total number of processors is returned in 'count'.
//
#define UVM_TOOLS_GET_PROCESSOR_UUID_TABLE UVM_IOCTL_BASE(64)
typedef struct
@@ -934,6 +949,7 @@ typedef struct
NvU64 tablePtr NV_ALIGN_BYTES(8); // IN
NvU32 count; // IN/OUT
NV_STATUS rmStatus; // OUT
NvU32 version; // OUT
} UVM_TOOLS_GET_PROCESSOR_UUID_TABLE_PARAMS;
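A hedged user-space sketch of the two-step query described in the comment above; the header name, the ioctl calling convention and the error handling (including rmStatus and the OUT version field) are simplified assumptions:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include "uvm_ioctl.h"   // header name assumed

static NvProcessorUuid *get_processor_uuid_table(int uvm_tools_fd, NvU32 *out_count)
{
    UVM_TOOLS_GET_PROCESSOR_UUID_TABLE_PARAMS params;
    NvProcessorUuid *table;

    // Step 1: tablePtr == 0 and count == 0 only queries the processor count.
    memset(&params, 0, sizeof(params));
    if (ioctl(uvm_tools_fd, UVM_TOOLS_GET_PROCESSOR_UUID_TABLE, &params) != 0)
        return NULL;

    table = calloc(params.count, sizeof(*table));
    if (!table)
        return NULL;

    // Step 2: pass the dynamically sized array; count is now non-zero, so
    // the kernel does not fall back to the UVM_MAX_PROCESSORS_V1
    // compatibility layout.
    params.tablePtr = (NvU64)(uintptr_t)table;
    if (ioctl(uvm_tools_fd, UVM_TOOLS_GET_PROCESSOR_UUID_TABLE, &params) != 0) {
        free(table);
        return NULL;
    }

    *out_count = params.count;
    return table;
}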
@@ -979,7 +995,7 @@ typedef struct
{
NvU64 base NV_ALIGN_BYTES(8); // IN
NvU64 length NV_ALIGN_BYTES(8); // IN
UvmGpuMappingAttributes perGpuAttributes[UVM_MAX_GPUS]; // IN
UvmGpuMappingAttributes perGpuAttributes[UVM_MAX_GPUS_V2]; // IN
NvU64 gpuAttributesCount NV_ALIGN_BYTES(8); // IN
NV_STATUS rmStatus; // OUT
} UVM_ALLOC_SEMAPHORE_POOL_PARAMS;

View File

@@ -114,6 +114,16 @@ static inline const struct cpumask *uvm_cpumask_of_node(int node)
#define UVM_IS_CONFIG_HMM() 0
#endif
// ATS prefetcher uses hmm_range_fault() to query residency information.
// hmm_range_fault() needs CONFIG_HMM_MIRROR. To detect racing CPU invalidates
// of memory regions while hmm_range_fault() is being called, MMU interval
// notifiers are needed.
#if defined(CONFIG_HMM_MIRROR) && defined(NV_MMU_INTERVAL_NOTIFIER)
#define UVM_HMM_RANGE_FAULT_SUPPORTED() 1
#else
#define UVM_HMM_RANGE_FAULT_SUPPORTED() 0
#endif
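As a minimal illustration (assumed, not part of this change) of how such a feature macro is typically consumed, a caller can provide a real query when hmm_range_fault() and MMU interval notifiers are available and a stub otherwise:

#if UVM_HMM_RANGE_FAULT_SUPPORTED()
static bool ats_prefetch_residency_supported(void)
{
    // hmm_range_fault() can be used to query residency information.
    return true;
}
#else
static bool ats_prefetch_residency_supported(void)
{
    // No safe way to query residency; the ATS prefetcher skips it.
    return false;
}
#endif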
// Various issues prevent us from using mmu_notifiers in older kernels. These
// include:
// - ->release being called under RCU instead of SRCU: fixed by commit

View File

@@ -633,8 +633,7 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
uvm_gpu_t *mapping_gpu,
const UvmGpuMemoryInfo *mem_info)
{
uvm_gpu_t *owning_gpu = NULL;
uvm_gpu_t *gpu;
uvm_gpu_t *owning_gpu;
if (mem_info->egm)
UVM_ASSERT(mem_info->sysmem);
@@ -653,16 +652,7 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
// registered.
    // This also checks whether the EGM owning GPU is registered.
// TODO: Bug 4351121: RM will return the GI UUID, but
// uvm_va_space_get_gpu_by_uuid() currently matches on physical GPU UUIDs.
// Match on GI UUID until the UVM user level API has been updated to use
// the GI UUID.
for_each_va_space_gpu(gpu, va_space) {
if (uvm_uuid_eq(&gpu->uuid, &mem_info->uuid)) {
owning_gpu = gpu;
break;
}
}
owning_gpu = uvm_va_space_get_gpu_by_uuid(va_space, &mem_info->uuid);
if (!owning_gpu)
return NV_ERR_INVALID_DEVICE;
@@ -954,6 +944,12 @@ static NV_STATUS uvm_map_external_allocation_on_gpu(uvm_va_range_t *va_range,
goto error;
}
    // Check the maximum page size for mappings of vidmem allocations; the
    // vMMU segment size may limit the range of usable page sizes.
if (!ext_gpu_map->is_sysmem && (ext_gpu_map->gpu == ext_gpu_map->owning_gpu) &&
(mapping_page_size > mapping_gpu->mem_info.max_vidmem_page_size))
mapping_page_size = mapping_gpu->mem_info.max_vidmem_page_size;
mem_info.pageSize = mapping_page_size;
status = uvm_va_range_map_rm_allocation(va_range, mapping_gpu, &mem_info, map_rm_params, ext_gpu_map, out_tracker);
@@ -989,7 +985,7 @@ static NV_STATUS uvm_map_external_allocation(uvm_va_space_t *va_space, UVM_MAP_E
if (uvm_api_range_invalid_4k(params->base, params->length))
return NV_ERR_INVALID_ADDRESS;
if (params->gpuAttributesCount == 0 || params->gpuAttributesCount > UVM_MAX_GPUS)
if (params->gpuAttributesCount == 0 || params->gpuAttributesCount > UVM_MAX_GPUS_V2)
return NV_ERR_INVALID_ARGUMENT;
uvm_va_space_down_read_rm(va_space);

View File

@@ -86,7 +86,7 @@ static NV_STATUS block_migrate_map_mapped_pages(uvm_va_block_t *va_block,
// Only map those pages that are not already mapped on destination
for_each_va_block_unset_page_in_region_mask(page_index, pages_mapped_on_destination, region) {
prot = uvm_va_block_page_compute_highest_permission(va_block, dest_id, page_index);
prot = uvm_va_block_page_compute_highest_permission(va_block, va_block_context, dest_id, page_index);
if (prot == UVM_PROT_NONE)
continue;

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -149,6 +149,26 @@ static NV_STATUS phys_mem_allocate_sysmem(uvm_page_tree_t *tree, NvLength size,
return NV_OK;
}
// The aperture may filter the biggest page size:
// - UVM_APERTURE_VID biggest page size on vidmem mappings
// - UVM_APERTURE_SYS biggest page size on sysmem mappings
// - UVM_APERTURE_PEER_0-7 biggest page size on peer mappings
static NvU32 mmu_biggest_page_size(uvm_page_tree_t *tree, uvm_aperture_t aperture)
{
UVM_ASSERT(aperture < UVM_APERTURE_DEFAULT);
// There may be scenarios where the GMMU must use a subset of the supported
// page sizes, e.g., to comply with the vMMU supported page sizes due to
// segmentation sizes.
if (aperture == UVM_APERTURE_VID) {
UVM_ASSERT(tree->gpu->mem_info.max_vidmem_page_size <= NV_U32_MAX);
return (NvU32) tree->gpu->mem_info.max_vidmem_page_size;
}
else {
return 1 << __fls(tree->hal->page_sizes());
}
}
static NV_STATUS phys_mem_allocate_vidmem(uvm_page_tree_t *tree,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
@@ -856,7 +876,7 @@ static NV_STATUS page_tree_ats_init(uvm_page_tree_t *tree)
if (!page_tree_ats_init_required(tree))
return NV_OK;
page_size = uvm_mmu_biggest_page_size(tree);
page_size = mmu_biggest_page_size(tree, UVM_APERTURE_VID);
uvm_cpu_get_unaddressable_range(&max_va_lower, &min_va_upper);
@@ -1090,6 +1110,8 @@ NV_STATUS uvm_page_tree_init(uvm_gpu_t *gpu,
tree->gpu_va_space = gpu_va_space;
tree->big_page_size = big_page_size;
UVM_ASSERT(gpu->mem_info.max_vidmem_page_size & tree->hal->page_sizes());
page_tree_set_location(tree, location);
uvm_tracker_init(&tree->tracker);
@@ -2301,7 +2323,7 @@ NV_STATUS create_static_vidmem_mapping(uvm_gpu_t *gpu)
UVM_ASSERT(!uvm_mmu_parent_gpu_needs_dynamic_vidmem_mapping(gpu->parent));
page_size = uvm_mmu_biggest_page_size(&gpu->address_space_tree);
page_size = mmu_biggest_page_size(&gpu->address_space_tree, UVM_APERTURE_VID);
size = UVM_ALIGN_UP(gpu->mem_info.max_allocatable_address + 1, page_size);
UVM_ASSERT(page_size);
@@ -2338,9 +2360,9 @@ NV_STATUS uvm_mmu_create_peer_identity_mappings(uvm_gpu_t *gpu, uvm_gpu_t *peer)
if (gpu->parent->peer_copy_mode != UVM_GPU_PEER_COPY_MODE_VIRTUAL || peer->mem_info.size == 0)
return NV_OK;
page_size = uvm_mmu_biggest_page_size(&gpu->address_space_tree);
size = UVM_ALIGN_UP(peer->mem_info.max_allocatable_address + 1, page_size);
aperture = uvm_gpu_peer_aperture(gpu, peer);
page_size = mmu_biggest_page_size(&gpu->address_space_tree, aperture);
size = UVM_ALIGN_UP(peer->mem_info.max_allocatable_address + 1, page_size);
peer_mapping = uvm_gpu_get_peer_mapping(gpu, peer->id);
phys_offset = 0ULL;
@@ -2783,7 +2805,7 @@ static NV_STATUS create_dynamic_sysmem_mapping(uvm_gpu_t *gpu)
// sysmem mappings with 128K entries.
UVM_ASSERT(is_power_of_2(mapping_size));
UVM_ASSERT(mapping_size >= UVM_SIZE_1GB);
UVM_ASSERT(mapping_size >= uvm_mmu_biggest_page_size(&gpu->address_space_tree));
UVM_ASSERT(mapping_size >= mmu_biggest_page_size(&gpu->address_space_tree, UVM_APERTURE_SYS));
UVM_ASSERT(mapping_size <= flat_sysmem_va_size);
flat_sysmem_va_size = UVM_ALIGN_UP(flat_sysmem_va_size, mapping_size);
@@ -2828,7 +2850,7 @@ NV_STATUS uvm_mmu_sysmem_map(uvm_gpu_t *gpu, NvU64 pa, NvU64 size)
if (sysmem_mapping->range_vec == NULL) {
uvm_gpu_address_t virtual_address = uvm_parent_gpu_address_virtual_from_sysmem_phys(gpu->parent, curr_pa);
NvU64 phys_offset = curr_pa;
NvU32 page_size = uvm_mmu_biggest_page_size(&gpu->address_space_tree);
NvU32 page_size = mmu_biggest_page_size(&gpu->address_space_tree, UVM_APERTURE_SYS);
uvm_pmm_alloc_flags_t pmm_flags;
// No eviction is requested when allocating the page tree storage,

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -612,6 +612,9 @@ static NvU64 uvm_mmu_pde_coverage(uvm_page_tree_t *tree, NvU32 page_size)
return uvm_mmu_page_tree_entries(tree, depth, page_size) * page_size;
}
// Page sizes supported by the GPU. Use uvm_mmu_biggest_page_size() to retrieve
// the largest page size supported in a given system, which considers the GMMU
// and vMMU page sizes and segment sizes.
static bool uvm_mmu_page_size_supported(uvm_page_tree_t *tree, NvU32 page_size)
{
UVM_ASSERT_MSG(is_power_of_2(page_size), "0x%x\n", page_size);
@@ -642,11 +645,6 @@ static NvU32 uvm_mmu_biggest_page_size_up_to(uvm_page_tree_t *tree, NvU32 max_pa
return page_size;
}
static NvU32 uvm_mmu_biggest_page_size(uvm_page_tree_t *tree)
{
return 1 << __fls(tree->hal->page_sizes());
}
static NvU32 uvm_mmu_pte_size(uvm_page_tree_t *tree, NvU32 page_size)
{
return tree->hal->entry_size(tree->hal->page_table_depth(page_size));

View File

@@ -1442,6 +1442,7 @@ static bool preferred_location_is_thrashing(uvm_processor_id_t preferred_locatio
static uvm_perf_thrashing_hint_t get_hint_for_migration_thrashing(va_space_thrashing_info_t *va_space_thrashing,
uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_page_index_t page_index,
page_thrashing_info_t *page_thrashing,
uvm_processor_id_t requester)
@@ -1460,7 +1461,7 @@ static uvm_perf_thrashing_hint_t get_hint_for_migration_thrashing(va_space_thras
hint.type = UVM_PERF_THRASHING_HINT_TYPE_NONE;
closest_resident_id = uvm_va_block_page_get_closest_resident(va_block, page_index, requester);
closest_resident_id = uvm_va_block_page_get_closest_resident(va_block, va_block_context, page_index, requester);
if (uvm_va_block_is_hmm(va_block)) {
// HMM pages always start out resident on the CPU but may not be
// recorded in the va_block state because hmm_range_fault() or
@@ -1601,6 +1602,7 @@ static uvm_perf_thrashing_hint_t get_hint_for_migration_thrashing(va_space_thras
// that case we keep the page pinned while applying the same algorithm as in
// Phase1.
uvm_perf_thrashing_hint_t uvm_perf_thrashing_get_hint(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
NvU64 address,
uvm_processor_id_t requester)
{
@@ -1713,6 +1715,7 @@ uvm_perf_thrashing_hint_t uvm_perf_thrashing_get_hint(uvm_va_block_t *va_block,
else {
hint = get_hint_for_migration_thrashing(va_space_thrashing,
va_block,
va_block_context,
page_index,
page_thrashing,
requester);

View File

@@ -74,7 +74,9 @@ typedef struct
} uvm_perf_thrashing_hint_t;
// Obtain a hint to prevent thrashing on the page with given address
uvm_perf_thrashing_hint_t uvm_perf_thrashing_get_hint(uvm_va_block_t *va_block, NvU64 address,
uvm_perf_thrashing_hint_t uvm_perf_thrashing_get_hint(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
NvU64 address,
uvm_processor_id_t requester);
// Obtain a pointer to a mask with the processors that are thrashing on the

View File

@@ -1408,8 +1408,6 @@ uvm_gpu_address_t uvm_pmm_gpu_peer_copy_address(uvm_pmm_gpu_t *pmm,
uvm_gpu_peer_t *peer_caps = uvm_gpu_peer_caps(accessing_gpu, gpu);
uvm_gpu_identity_mapping_t *gpu_peer_mapping;
UVM_ASSERT(peer_caps->link_type != UVM_GPU_LINK_INVALID);
if (peer_caps->is_indirect_peer ||
(accessing_gpu->parent->peer_copy_mode == UVM_GPU_PEER_COPY_MODE_PHYSICAL)) {
// Indirect peers are accessed as sysmem addresses, so they don't need

View File

@@ -1082,6 +1082,7 @@ static NV_STATUS test_pmm_reverse_map_many_blocks(uvm_gpu_t *gpu, uvm_va_space_t
{
uvm_va_range_t *va_range;
uvm_va_block_t *va_block = NULL;
uvm_va_block_context_t *va_block_context = NULL;
NvU32 num_blocks;
NvU32 index = 0;
uvm_gpu_phys_address_t phys_addr = {0};
@@ -1099,9 +1100,12 @@ static NV_STATUS test_pmm_reverse_map_many_blocks(uvm_gpu_t *gpu, uvm_va_space_t
}
TEST_CHECK_RET(va_block);
va_block_context = uvm_va_block_context_alloc(NULL);
TEST_CHECK_RET(va_block_context);
uvm_mutex_lock(&va_block->lock);
is_resident = uvm_id_equal(uvm_va_block_page_get_closest_resident(va_block, 0, gpu->id), gpu->id);
is_resident = uvm_id_equal(uvm_va_block_page_get_closest_resident(va_block, va_block_context, 0, gpu->id), gpu->id);
if (is_resident) {
phys_addr = uvm_va_block_gpu_phys_page_address(va_block, 0, gpu);
phys_addr.address = UVM_ALIGN_DOWN(phys_addr.address, UVM_VA_BLOCK_SIZE);
@@ -1109,6 +1113,8 @@ static NV_STATUS test_pmm_reverse_map_many_blocks(uvm_gpu_t *gpu, uvm_va_space_t
uvm_mutex_unlock(&va_block->lock);
uvm_va_block_context_free(va_block_context);
TEST_CHECK_RET(is_resident);
// Perform the lookup for the whole root chunk

View File

@@ -25,6 +25,8 @@
#include "uvm_processors.h"
static struct kmem_cache *g_uvm_processor_mask_cache __read_mostly;
const uvm_processor_mask_t g_uvm_processor_mask_cpu = { .bitmap = { 1 << UVM_PARENT_ID_CPU_VALUE }};
const uvm_processor_mask_t g_uvm_processor_mask_empty = { };
NV_STATUS uvm_processor_mask_cache_init(void)
{

View File

@@ -522,6 +522,9 @@ UVM_PROCESSOR_MASK(uvm_processor_mask_t, \
uvm_processor_id_t, \
uvm_id_from_value)
extern const uvm_processor_mask_t g_uvm_processor_mask_cpu;
extern const uvm_processor_mask_t g_uvm_processor_mask_empty;
// Like uvm_processor_mask_subset() but ignores the CPU in the subset mask.
// Returns whether the GPUs in subset are a subset of the GPUs in mask.
bool uvm_processor_mask_gpu_subset(const uvm_processor_mask_t *subset,
@@ -567,6 +570,10 @@ void uvm_parent_gpus_from_processor_mask(uvm_parent_processor_mask_t *parent_mas
(uvm_id_value(i) < uvm_id_value(uvm_gpu_id_from_parent_gpu_id(id)) + UVM_PARENT_ID_MAX_SUB_PROCESSORS); \
i = uvm_gpu_id_next(i))
// Helper to iterate over all sub processor indexes.
#define for_each_sub_processor_index(i) \
for (i = 0; i < UVM_PARENT_ID_MAX_SUB_PROCESSORS; i++)
// Helper to iterate over all valid processor ids.
#define for_each_id(i) for (i = UVM_ID_CPU; UVM_ID_IS_VALID(i); i = uvm_id_next(i))

View File

@@ -41,15 +41,11 @@
static NV_STATUS uvm_test_get_gpu_ref_count(UVM_TEST_GET_GPU_REF_COUNT_PARAMS *params, struct file *filp)
{
NvU64 retained_count = 0;
uvm_parent_gpu_t *parent_gpu;
uvm_gpu_t *gpu = NULL;
uvm_mutex_lock(&g_uvm_global.global_lock);
parent_gpu = uvm_parent_gpu_get_by_uuid(&params->gpu_uuid);
if (parent_gpu)
gpu = uvm_gpu_get_by_parent_and_swizz_id(parent_gpu, params->swizz_id);
gpu = uvm_gpu_get_by_uuid(&params->gpu_uuid);
if (gpu != NULL)
retained_count = uvm_gpu_retained_count(gpu);

View File

@@ -40,7 +40,6 @@ typedef struct
{
// In params
NvProcessorUuid gpu_uuid;
NvU32 swizz_id;
// Out params
NvU64 ref_count NV_ALIGN_BYTES(8);
NV_STATUS rmStatus;
@@ -192,7 +191,7 @@ typedef struct
NvU32 read_duplication; // Out (UVM_TEST_READ_DUPLICATION_POLICY)
NvProcessorUuid preferred_location; // Out
NvS32 preferred_cpu_nid; // Out
NvProcessorUuid accessed_by[UVM_MAX_PROCESSORS]; // Out
NvProcessorUuid accessed_by[UVM_MAX_PROCESSORS_V2]; // Out
NvU32 accessed_by_count; // Out
NvU32 type; // Out (UVM_TEST_VA_RANGE_TYPE)
union
@@ -505,7 +504,12 @@ typedef struct
typedef struct
{
// In params
UvmEventEntry entry; // contains only NvUxx types
union
{
UvmEventEntry_V1 entry_v1; // contains only NvUxx types
UvmEventEntry_V2 entry_v2; // contains only NvUxx types
};
NvU32 version;
NvU32 count;
// Out param
@@ -620,7 +624,7 @@ typedef struct
// Array of processors which have a resident copy of the page containing
// lookup_address.
NvProcessorUuid resident_on[UVM_MAX_PROCESSORS]; // Out
NvProcessorUuid resident_on[UVM_MAX_PROCESSORS_V2]; // Out
NvU32 resident_on_count; // Out
// If the memory is resident on the CPU, the NUMA node on which the page
@@ -631,24 +635,24 @@ typedef struct
// system-page-sized portion of this allocation which contains
// lookup_address is guaranteed to be resident on the corresponding
// processor.
NvU32 resident_physical_size[UVM_MAX_PROCESSORS]; // Out
NvU32 resident_physical_size[UVM_MAX_PROCESSORS_V2]; // Out
// The physical address of the physical allocation backing lookup_address.
NvU64 resident_physical_address[UVM_MAX_PROCESSORS] NV_ALIGN_BYTES(8); // Out
NvU64 resident_physical_address[UVM_MAX_PROCESSORS_V2] NV_ALIGN_BYTES(8); // Out
// Array of processors which have a virtual mapping covering lookup_address.
NvProcessorUuid mapped_on[UVM_MAX_PROCESSORS]; // Out
NvU32 mapping_type[UVM_MAX_PROCESSORS]; // Out
NvU64 mapping_physical_address[UVM_MAX_PROCESSORS] NV_ALIGN_BYTES(8); // Out
NvProcessorUuid mapped_on[UVM_MAX_PROCESSORS_V2]; // Out
NvU32 mapping_type[UVM_MAX_PROCESSORS_V2]; // Out
NvU64 mapping_physical_address[UVM_MAX_PROCESSORS_V2] NV_ALIGN_BYTES(8); // Out
NvU32 mapped_on_count; // Out
// The size of the virtual mapping covering lookup_address on each
// mapped_on processor.
NvU32 page_size[UVM_MAX_PROCESSORS]; // Out
NvU32 page_size[UVM_MAX_PROCESSORS_V2]; // Out
// Array of processors which have physical memory populated that would back
// lookup_address if it was resident.
NvProcessorUuid populated_on[UVM_MAX_PROCESSORS]; // Out
NvProcessorUuid populated_on[UVM_MAX_PROCESSORS_V2]; // Out
NvU32 populated_on_count; // Out
NV_STATUS rmStatus; // Out

File diff suppressed because it is too large

View File

@@ -52,8 +52,19 @@ typedef enum
typedef unsigned long long UvmStream;
#define UVM_MAX_GPUS NV_MAX_DEVICES
#define UVM_MAX_PROCESSORS (UVM_MAX_GPUS + 1)
// The maximum number of GPUs changed when multiple MIG instances per
// uvm_parent_gpu_t were added. See UvmEventQueueCreate().
#define UVM_MAX_GPUS_V1 NV_MAX_DEVICES
#define UVM_MAX_PROCESSORS_V1 (UVM_MAX_GPUS_V1 + 1)
#define UVM_MAX_GPUS_V2 (NV_MAX_DEVICES * NV_MAX_SUBDEVICES)
#define UVM_MAX_PROCESSORS_V2 (UVM_MAX_GPUS_V2 + 1)
// For backward compatibility:
// TODO: Bug 4465348: remove these after replacing old references.
#define UVM_MAX_GPUS UVM_MAX_GPUS_V1
#define UVM_MAX_PROCESSORS UVM_MAX_PROCESSORS_V1
#define UVM_PROCESSOR_MASK_SIZE ((UVM_MAX_PROCESSORS_V2 + (sizeof(NvU64) * 8) - 1) / (sizeof(NvU64) * 8))
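As a worked expansion of the definitions above (hedged on the current nvlimits.h values; the numbers below are not defined by this header):

// Worked example, assuming NV_MAX_DEVICES == 32 and NV_MAX_SUBDEVICES == 8:
//   UVM_MAX_GPUS_V2         == 32 * 8          == 256
//   UVM_MAX_PROCESSORS_V2   == 256 + 1         == 257
//   UVM_PROCESSOR_MASK_SIZE == (257 + 63) / 64 == 5 NvU64 words per mask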
#define UVM_INIT_FLAGS_DISABLE_HMM ((NvU64)0x1)
#define UVM_INIT_FLAGS_MULTI_PROCESS_SHARING_MODE ((NvU64)0x2)
@@ -152,6 +163,8 @@ typedef enum {
typedef struct
{
// UUID of the physical GPU if the GPU is not SMC capable or SMC enabled,
// or the GPU instance UUID of the partition.
NvProcessorUuid gpuUuid;
NvU32 gpuMappingType; // UvmGpuMappingType
NvU32 gpuCachingType; // UvmGpuCachingType
@@ -410,7 +423,29 @@ typedef struct
NvU32 pid; // process id causing the fault
NvU32 threadId; // thread id causing the fault
NvU64 pc; // address of the instruction causing the fault
} UvmEventCpuFaultInfo;
} UvmEventCpuFaultInfo_V1;
typedef struct
{
//
// eventType has to be 1st argument of this structure. Setting eventType to
// UvmEventTypeMemoryViolation helps to identify event data in a queue.
//
NvU8 eventType;
NvU8 accessType; // read/write violation (UvmEventMemoryAccessType)
//
// This structure is shared between UVM kernel and tools.
// Manually padding the structure so that compiler options like pragma pack
// or malign-double will have no effect on the field offsets.
//
NvU16 padding16Bits;
NvS32 nid; // NUMA node ID of faulting CPU
NvU64 address; // faulting address
NvU64 timeStamp; // cpu time when the fault occurred
NvU32 pid; // process id causing the fault
NvU32 threadId; // thread id causing the fault
NvU64 pc; // address of the instruction causing the fault
} UvmEventCpuFaultInfo_V2;
typedef enum
{
@@ -567,7 +602,49 @@ typedef struct
// on the gpu
NvU64 endTimeStampGpu; // time stamp when the migration finished
// on the gpu
} UvmEventMigrationInfo;
} UvmEventMigrationInfo_V1;
typedef struct
{
//
// eventType has to be the 1st argument of this structure. Setting eventType
// to UvmEventTypeMigration helps to identify event data in a queue.
//
NvU8 eventType;
//
// Cause that triggered the migration
//
NvU8 migrationCause;
//
// This structure is shared between UVM kernel and tools.
// Manually padding the structure so that compiler options like pragma pack
// or malign-double will have no effect on the field offsets
//
NvU16 padding16Bits;
//
// Indices are used for the source and destination of migration instead of
// using gpu uuid/cpu id. This reduces the size of each event. The index to
// gpuUuid relation can be obtained from UvmToolsGetProcessorUuidTable.
// Currently we do not distinguish between CPUs so they all use index 0.
//
NvU16 srcIndex; // source CPU/GPU index
NvU16 dstIndex; // destination CPU/GPU index
NvS32 srcNid; // source CPU NUMA node ID
NvS32 dstNid; // destination CPU NUMA node ID
NvU64 address; // base virtual addr used for migration
NvU64 migratedBytes; // number of bytes migrated
NvU64 beginTimeStamp; // cpu time stamp when the memory transfer
// was queued on the gpu
NvU64 endTimeStamp; // cpu time stamp when the memory transfer
// finalization was communicated to the cpu
// For asynchronous operations this field
// will be zero
NvU64 rangeGroupId; // range group tied with this migration
NvU64 beginTimeStampGpu; // time stamp when the migration started
// on the gpu
NvU64 endTimeStampGpu; // time stamp when the migration finished
// on the gpu
} UvmEventMigrationInfo_V2;
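A hedged sketch of how a tools client might resolve these indices; uuid_table stands in for a table previously obtained from UvmToolsGetProcessorUuidTable() and indexed by CPU/GPU index (index 0 being the CPU), and the helper name is illustrative:

// Illustrative only: map the destination index of a migration event back to a
// processor UUID using a caller-provided table.
static const NvProcessorUuid *
example_migration_dst_uuid(const UvmEventMigrationInfo_V2 *info,
                           const NvProcessorUuid *uuid_table)
{
    return &uuid_table[info->dstIndex];
}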
typedef enum
{
@@ -633,7 +710,64 @@ typedef struct
//
NvU8 padding8Bits;
NvU16 padding16Bits;
} UvmEventGpuFaultInfo;
} UvmEventGpuFaultInfo_V1;
typedef struct
{
//
// eventType has to be the 1st argument of this structure.
// Setting eventType = UvmEventTypeGpuFault helps to identify event data in
// a queue.
//
NvU8 eventType;
NvU8 faultType; // type of gpu fault, refer UvmEventFaultType
NvU8 accessType; // memory access type, refer UvmEventMemoryAccessType
//
// This structure is shared between UVM kernel and tools.
// Manually padding the structure so that compiler options like pragma pack
// or malign-double will have no effect on the field offsets
//
NvU8 padding8Bits_1;
union
{
NvU16 gpcId; // If this is a replayable fault, this field contains
// the physical GPC index where the fault was
// triggered
NvU16 channelId; // If this is a non-replayable fault, this field
// contains the id of the channel that launched the
// operation that caused the fault.
//
// TODO: Bug 3283289: this field is ambiguous for
// Ampere+ GPUs, but it is never consumed by clients.
};
NvU16 clientId; // Id of the MMU client that triggered the fault. This
// is the value provided by HW and is architecture-
// specific. There are separate client ids for
// different client types (See dev_fault.h).
NvU64 address; // virtual address at which gpu faulted
NvU64 timeStamp; // time stamp when the cpu started processing the
// fault
NvU64 timeStampGpu; // gpu time stamp when the fault entry was written
// in the fault buffer
NvU32 batchId; // Per-GPU unique id to identify the faults serviced
// in batch before:
// - Issuing a replay for replayable faults
// - Re-scheduling the channel for non-replayable
// faults.
NvU8 clientType; // Volta+ GPUs can fault on clients other than GR.
// UvmEventFaultClientTypeGpc indicates replayable
// fault, while UvmEventFaultClientTypeHub indicates
// non-replayable fault.
//
// This structure is shared between UVM kernel and tools.
// Manually padding the structure so that compiler options like pragma pack
// or malign-double will have no effect on the field offsets
//
NvU8 padding8Bits_2;
NvU16 gpuIndex; // GPU that experienced the fault
} UvmEventGpuFaultInfo_V2;
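A hedged sketch of consuming the union above; it relies only on the clientType convention described in the comments (UvmEventFaultClientTypeGpc for replayable faults, UvmEventFaultClientTypeHub for non-replayable ones), and the helper name is illustrative:

// Illustrative only: replayable faults report the GPC index, non-replayable
// faults report the id of the launching channel.
static NvU16 example_fault_unit_id(const UvmEventGpuFaultInfo_V2 *fault)
{
    if (fault->clientType == UvmEventFaultClientTypeGpc)
        return fault->gpcId;     // replayable fault
    return fault->channelId;     // non-replayable fault
}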
//------------------------------------------------------------------------------
// This info is provided when a gpu fault is replayed (for replayable faults)
@@ -666,7 +800,25 @@ typedef struct
// accesses is queued on the gpu
NvU64 timeStampGpu; // gpu time stamp when the replay operation finished
// executing on the gpu
} UvmEventGpuFaultReplayInfo;
} UvmEventGpuFaultReplayInfo_V1;
typedef struct
{
//
// eventType has to be the 1st argument of this structure.
// Setting eventType = UvmEventTypeGpuFaultReplay helps to identify event
// data in a queue.
//
NvU8 eventType;
NvU8 clientType; // See clientType in UvmEventGpuFaultInfo
NvU16 gpuIndex; // GPU that experienced the fault
NvU32 batchId; // Per-GPU unique id to identify the faults that
// have been serviced in batch
NvU64 timeStamp; // cpu time when the replay of the faulting memory
// accesses is queued on the gpu
NvU64 timeStampGpu; // gpu time stamp when the replay operation finished
// executing on the gpu
} UvmEventGpuFaultReplayInfo_V2;
//------------------------------------------------------------------------------
// This info is provided per fatal fault
@@ -689,7 +841,26 @@ typedef struct
NvU16 padding16bits;
NvU64 address; // virtual address at which the processor faulted
NvU64 timeStamp; // CPU time when the fault is detected to be fatal
} UvmEventFatalFaultInfo;
} UvmEventFatalFaultInfo_V1;
typedef struct
{
//
// eventType has to be the 1st argument of this structure.
// Setting eventType = UvmEventTypeFatalFault helps to identify event data
// in a queue.
//
NvU8 eventType;
NvU8 faultType; // type of gpu fault, refer UvmEventFaultType. Only
// valid if processorIndex is a GPU
NvU8 accessType; // memory access type, refer UvmEventMemoryAccessType
NvU8 reason; // reason why the fault is fatal, refer
// UvmEventFatalReason
NvU16 processorIndex; // processor that experienced the fault
NvU16 padding16bits;
NvU64 address; // virtual address at which the processor faulted
NvU64 timeStamp; // CPU time when the fault is detected to be fatal
} UvmEventFatalFaultInfo_V2;
typedef struct
{
@@ -718,7 +889,38 @@ typedef struct
// participate in read-duplicate this is time stamp
// when all the operations have been pushed to all
// the processors.
} UvmEventReadDuplicateInfo;
} UvmEventReadDuplicateInfo_V1;
typedef struct
{
//
// eventType has to be the 1st argument of this structure.
// Setting eventType = UvmEventTypeReadDuplicate helps to identify event
// data in a queue.
//
NvU8 eventType;
//
// This structure is shared between UVM kernel and tools.
// Manually padding the structure so that compiler options like pragma pack
// or malign-double will have no effect on the field offsets
//
NvU8 padding8bits;
NvU16 padding16bits;
NvU32 padding32bits;
NvU64 address; // virtual address of the memory region that is
// read-duplicated
NvU64 size; // size in bytes of the memory region that is
// read-duplicated
NvU64 timeStamp; // cpu time stamp when the memory region becomes
// read-duplicate. Since many processors can
// participate in read-duplicate this is time stamp
// when all the operations have been pushed to all
// the processors.
NvU64 processors[UVM_PROCESSOR_MASK_SIZE];
// mask that specifies in which processors this
// memory region is read-duplicated. This is last
// so UVM_PROCESSOR_MASK_SIZE can grow.
} UvmEventReadDuplicateInfo_V2;
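A hedged sketch of walking the processors[] mask above; the 64-bit word layout follows the UVM_PROCESSOR_MASK_SIZE definition earlier in this header, and the callback type and helper name are illustrative:

// Illustrative only: invoke cb for every CPU/GPU index whose bit is set in the
// read-duplicate processor mask.
typedef void (*example_processor_cb)(NvU16 procIndex, void *ctx);

static void example_for_each_duplicated_processor(const UvmEventReadDuplicateInfo_V2 *info,
                                                   example_processor_cb cb,
                                                   void *ctx)
{
    NvU32 word, bit;

    for (word = 0; word < UVM_PROCESSOR_MASK_SIZE; word++) {
        for (bit = 0; bit < 64; bit++) {
            if (info->processors[word] & (1ULL << bit))
                cb((NvU16)(word * 64 + bit), ctx);
        }
    }
}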
typedef struct
{
@@ -728,13 +930,13 @@ typedef struct
// identify event data in a queue.
//
NvU8 eventType;
NvU8 residentIndex; // index of the cpu/gpu that now contains the only
// valid copy of the memory region
//
// This structure is shared between UVM kernel and tools.
// Manually padding the structure so that compiler options like pragma pack
// or malign-double will have no effect on the field offsets
//
NvU8 residentIndex; // index of the cpu/gpu that now contains the only
// valid copy of the memory region
NvU16 padding16bits;
NvU32 padding32bits;
NvU64 address; // virtual address of the memory region that is
@@ -746,8 +948,34 @@ typedef struct
// participate in read-duplicate this is time stamp
// when all the operations have been pushed to all
// the processors.
} UvmEventReadDuplicateInvalidateInfo;
} UvmEventReadDuplicateInvalidateInfo_V1;
typedef struct
{
//
// eventType has to be the 1st argument of this structure.
// Setting eventType = UvmEventTypeReadDuplicateInvalidate helps to
// identify event data in a queue.
//
NvU8 eventType;
NvU8 padding8bits;
NvU16 residentIndex; // index of the cpu/gpu that now contains the only
// valid copy of the memory region
//
// This structure is shared between UVM kernel and tools.
// Manually padding the structure so that compiler options like pragma pack
// or malign-double will have no effect on the field offsets
//
NvU32 padding32bits;
NvU64 address; // virtual address of the memory region that is
// read-duplicated
NvU64 size; // size of the memory region that is
// read-duplicated
NvU64 timeStamp; // cpu time stamp when the memory region is no
// longer read-duplicate. Since many processors can
// participate in read-duplicate this is time stamp
// when all the operations have been pushed to all
// the processors.
} UvmEventReadDuplicateInvalidateInfo_V2;
typedef struct
{
@@ -770,7 +998,30 @@ typedef struct
// changed
NvU64 timeStamp; // cpu time stamp when the new page size is
// queued on the gpu
} UvmEventPageSizeChangeInfo;
} UvmEventPageSizeChangeInfo_V1;
typedef struct
{
//
// eventType has to be the 1st argument of this structure.
// Setting eventType = UvmEventTypePageSizeChange helps to identify event
// data in a queue.
//
NvU8 eventType;
//
// This structure is shared between UVM kernel and tools.
// Manually padding the structure so that compiler options like pragma pack
// or malign-double will have no effect on the field offsets
//
NvU8 padding8bits;
NvU16 processorIndex; // cpu/gpu processor index for which the page size
// changed
NvU32 size; // new page size
NvU64 address; // virtual address of the page whose size has
// changed
NvU64 timeStamp; // cpu time stamp when the new page size is
// queued on the gpu
} UvmEventPageSizeChangeInfo_V2;
typedef struct
{
@@ -794,7 +1045,33 @@ typedef struct
// thrashing
NvU64 size; // size of the memory region that is thrashing
NvU64 timeStamp; // cpu time stamp when thrashing is detected
} UvmEventThrashingDetectedInfo;
} UvmEventThrashingDetectedInfo_V1;
typedef struct
{
//
// eventType has to be the 1st argument of this structure.
// Setting eventType = UvmEventTypeThrashingDetected helps to identify event
// data in a queue.
//
NvU8 eventType;
//
// This structure is shared between UVM kernel and tools.
// Manually padding the structure so that compiler options like pragma pack
// or malign-double will have no effect on the field offsets
//
NvU8 padding8bits;
NvU16 padding16bits;
NvU32 padding32bits;
NvU64 address; // virtual address of the memory region that is
// thrashing
NvU64 size; // size of the memory region that is thrashing
NvU64 timeStamp; // cpu time stamp when thrashing is detected
NvU64 processors[UVM_PROCESSOR_MASK_SIZE];
// mask that specifies which processors are
// fighting for this memory region. This is last
// so UVM_PROCESSOR_MASK_SIZE can grow.
} UvmEventThrashingDetectedInfo_V2;
typedef struct
{
@@ -815,7 +1092,28 @@ typedef struct
NvU64 address; // address of the page whose servicing is being
// throttled
NvU64 timeStamp; // cpu start time stamp for the throttling operation
} UvmEventThrottlingStartInfo;
} UvmEventThrottlingStartInfo_V1;
typedef struct
{
//
// eventType has to be the 1st argument of this structure.
// Setting eventType = UvmEventTypeThrottlingStart helps to identify event
// data in a queue.
//
NvU8 eventType;
//
// This structure is shared between UVM kernel and tools.
// Manually padding the structure so that compiler options like pragma pack
// or malign-double will have no effect on the field offsets
//
NvU8 padding8bits;
NvU16 padding16bits[2];
NvU16 processorIndex; // index of the cpu/gpu that was throttled
NvU64 address; // address of the page whose servicing is being
// throttled
NvU64 timeStamp; // cpu start time stamp for the throttling operation
} UvmEventThrottlingStartInfo_V2;
typedef struct
{
@@ -836,7 +1134,28 @@ typedef struct
NvU64 address; // address of the page whose servicing is being
// throttled
NvU64 timeStamp; // cpu end time stamp for the throttling operation
} UvmEventThrottlingEndInfo;
} UvmEventThrottlingEndInfo_V1;
typedef struct
{
//
// eventType has to be the 1st argument of this structure.
// Setting eventType = UvmEventTypeThrottlingEnd helps to identify event
// data in a queue.
//
NvU8 eventType;
//
// This structure is shared between UVM kernel and tools.
// Manually padding the structure so that compiler options like pragma pack
// or malign-double will have no effect on the field offsets
//
NvU8 padding8bits;
NvU16 padding16bits[2];
NvU16 processorIndex; // index of the cpu/gpu that was throttled
NvU64 address; // address of the page whose servicing is being
// throttled
NvU64 timeStamp; // cpu end time stamp for the throttling operation
} UvmEventThrottlingEndInfo_V2;
typedef enum
{
@@ -892,7 +1211,36 @@ typedef struct
NvU64 timeStampGpu; // time stamp when the new mapping is effective in
// the processor specified by srcIndex. If srcIndex
// is a cpu, this field will be zero.
} UvmEventMapRemoteInfo;
} UvmEventMapRemoteInfo_V1;
typedef struct
{
//
// eventType has to be the 1st argument of this structure.
// Setting eventType = UvmEventTypeMapRemote helps to identify event data
// in a queue.
//
NvU8 eventType;
NvU8 mapRemoteCause; // field to type UvmEventMapRemoteCause that tells
// the cause for the page to be mapped remotely
//
// This structure is shared between UVM kernel and tools.
// Manually padding the structure so that compiler options like pragma pack
// or malign-double will have no effect on the field offsets
//
NvU16 padding16bits;
NvU16 srcIndex; // index of the cpu/gpu being remapped
NvU16 dstIndex; // index of the cpu/gpu memory that contains the
// memory region data
NvU64 address; // virtual address of the memory region that is
// mapped remotely
NvU64 size; // size of the memory region that is mapped remotely
NvU64 timeStamp; // cpu time stamp when all the required operations
// have been pushed to the processor
NvU64 timeStampGpu; // time stamp when the new mapping is effective in
// the processor specified by srcIndex. If srcIndex
// is a cpu, this field will be zero.
} UvmEventMapRemoteInfo_V2;
typedef struct
{
@@ -918,7 +1266,33 @@ typedef struct
NvU64 addressIn; // virtual address that caused the eviction
NvU64 size; // size of the memory region that is being evicted
NvU64 timeStamp; // cpu time stamp when eviction starts on the cpu
} UvmEventEvictionInfo;
} UvmEventEvictionInfo_V1;
typedef struct
{
//
// eventType has to be the 1st argument of this structure.
// Setting eventType = UvmEventTypeEviction helps to identify event data
// in a queue.
//
NvU8 eventType;
//
// This structure is shared between UVM kernel and tools.
// Manually padding the structure so that compiler options like pragma pack
// or malign-double will have no effect on the field offsets
//
NvU8 padding8bits;
NvU16 padding16bits;
NvU16 srcIndex; // index of the cpu/gpu from which data is being
// evicted
NvU16 dstIndex; // index of the cpu/gpu memory to which data is
// going to be stored
NvU64 addressOut; // virtual address of the memory region that is
// being evicted
NvU64 addressIn; // virtual address that caused the eviction
NvU64 size; // size of the memory region that is being evicted
NvU64 timeStamp; // cpu time stamp when eviction starts on the cpu
} UvmEventEvictionInfo_V2;
// TODO: Bug 1870362: [uvm] Provide virtual address and processor index in
// AccessCounter events
@@ -978,7 +1352,44 @@ typedef struct
NvU32 bank;
NvU64 address;
NvU64 instancePtr;
} UvmEventTestAccessCounterInfo;
} UvmEventTestAccessCounterInfo_V1;
typedef struct
{
//
// eventType has to be the 1st argument of this structure.
// Setting eventType = UvmEventTypeAccessCounter helps to identify event
// data in a queue.
//
NvU8 eventType;
// See uvm_access_counter_buffer_entry_t for details
NvU8 aperture;
NvU8 instancePtrAperture;
NvU8 isVirtual;
NvU8 isFromCpu;
NvU8 veId;
// The physical access counter notification was triggered on a managed
// memory region. This is not set for virtual access counter notifications.
NvU8 physOnManaged;
//
// This structure is shared between UVM kernel and tools.
// Manually padding the structure so that compiler options like pragma pack
// or malign-double will have no effect on the field offsets
//
NvU8 padding8bits;
NvU16 srcIndex; // index of the gpu that received the access counter
// notification
NvU16 padding16bits;
NvU32 value;
NvU32 subGranularity;
NvU32 tag;
NvU32 bank;
NvU32 padding32bits;
NvU64 address;
NvU64 instancePtr;
} UvmEventTestAccessCounterInfo_V2;
typedef struct
{
@@ -998,30 +1409,64 @@ typedef struct
NvU8 eventType;
UvmEventMigrationInfo_Lite migration_Lite;
UvmEventCpuFaultInfo cpuFault;
UvmEventMigrationInfo migration;
UvmEventGpuFaultInfo gpuFault;
UvmEventGpuFaultReplayInfo gpuFaultReplay;
UvmEventFatalFaultInfo fatalFault;
UvmEventReadDuplicateInfo readDuplicate;
UvmEventReadDuplicateInvalidateInfo readDuplicateInvalidate;
UvmEventPageSizeChangeInfo pageSizeChange;
UvmEventThrashingDetectedInfo thrashing;
UvmEventThrottlingStartInfo throttlingStart;
UvmEventThrottlingEndInfo throttlingEnd;
UvmEventMapRemoteInfo mapRemote;
UvmEventEvictionInfo eviction;
UvmEventCpuFaultInfo_V1 cpuFault;
UvmEventMigrationInfo_V1 migration;
UvmEventGpuFaultInfo_V1 gpuFault;
UvmEventGpuFaultReplayInfo_V1 gpuFaultReplay;
UvmEventFatalFaultInfo_V1 fatalFault;
UvmEventReadDuplicateInfo_V1 readDuplicate;
UvmEventReadDuplicateInvalidateInfo_V1 readDuplicateInvalidate;
UvmEventPageSizeChangeInfo_V1 pageSizeChange;
UvmEventThrashingDetectedInfo_V1 thrashing;
UvmEventThrottlingStartInfo_V1 throttlingStart;
UvmEventThrottlingEndInfo_V1 throttlingEnd;
UvmEventMapRemoteInfo_V1 mapRemote;
UvmEventEvictionInfo_V1 eviction;
} eventData;
union
{
NvU8 eventType;
UvmEventTestAccessCounterInfo accessCounter;
UvmEventTestAccessCounterInfo_V1 accessCounter;
UvmEventTestSplitInvalidateInfo splitInvalidate;
} testEventData;
};
} UvmEventEntry;
} UvmEventEntry_V1;
typedef struct
{
union
{
union
{
NvU8 eventType;
UvmEventMigrationInfo_Lite migration_Lite;
UvmEventCpuFaultInfo_V2 cpuFault;
UvmEventMigrationInfo_V2 migration;
UvmEventGpuFaultInfo_V2 gpuFault;
UvmEventGpuFaultReplayInfo_V2 gpuFaultReplay;
UvmEventFatalFaultInfo_V2 fatalFault;
UvmEventReadDuplicateInfo_V2 readDuplicate;
UvmEventReadDuplicateInvalidateInfo_V2 readDuplicateInvalidate;
UvmEventPageSizeChangeInfo_V2 pageSizeChange;
UvmEventThrashingDetectedInfo_V2 thrashing;
UvmEventThrottlingStartInfo_V2 throttlingStart;
UvmEventThrottlingEndInfo_V2 throttlingEnd;
UvmEventMapRemoteInfo_V2 mapRemote;
UvmEventEvictionInfo_V2 eviction;
} eventData;
union
{
NvU8 eventType;
UvmEventTestAccessCounterInfo_V2 accessCounter;
UvmEventTestSplitInvalidateInfo splitInvalidate;
} testEventData;
};
} UvmEventEntry_V2;
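A hedged sketch of how a consumer might size queue records by version; UvmToolsEventQueueVersion is defined a little further below, and the helper name is illustrative:

// Illustrative only: record size depends on the queue version negotiated when
// the event queue was created.
static NvU32 example_event_entry_size(NvU32 version)
{
    if (version == UvmToolsEventQueueVersion_V2)
        return (NvU32)sizeof(UvmEventEntry_V2);
    return (NvU32)sizeof(UvmEventEntry_V1);
}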
//------------------------------------------------------------------------------
// Type of time stamp used in the event entry:
@@ -1060,7 +1505,12 @@ typedef enum
UvmDebugAccessTypeWrite = 1,
} UvmDebugAccessType;
typedef struct UvmEventControlData_tag {
typedef enum {
UvmToolsEventQueueVersion_V1 = 1,
UvmToolsEventQueueVersion_V2 = 2,
} UvmToolsEventQueueVersion;
typedef struct UvmEventControlData_V1_tag {
// entries between get_ahead and get_behind are currently being read
volatile NvU32 get_ahead;
volatile NvU32 get_behind;
@@ -1070,7 +1520,30 @@ typedef struct UvmEventControlData_tag {
// counter of dropped events
NvU64 dropped[UvmEventNumTypesAll];
} UvmToolsEventControlData;
} UvmToolsEventControlData_V1;
typedef struct UvmEventControlData_V2_tag {
// entries between get_ahead and get_behind are currently being read
volatile NvU32 get_ahead;
volatile NvU32 get_behind;
// entries between put_ahead and put_behind are currently being written
volatile NvU32 put_ahead;
volatile NvU32 put_behind;
// The version values are limited to UvmToolsEventQueueVersion and
// initialized by UvmToolsCreateEventQueue().
NvU32 version;
NvU32 padding32Bits;
// counter of dropped events
NvU64 dropped[UvmEventNumTypesAll];
} UvmToolsEventControlData_V2;
// For backward compatibility:
// TODO: Bug 4465348: remove these after replacing old references.
typedef UvmToolsEventControlData_V1 UvmToolsEventControlData;
typedef UvmEventEntry_V1 UvmEventEntry;
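A hedged, consumer-side sketch of the get/put ring these counters describe; the exact producer/consumer protocol is owned by the UVM tools ABI, and queue_size (the entry capacity of the buffer) as well as the helper name are assumptions of this example:

// Illustrative only: number of entries published by the producer and not yet
// consumed, treating the buffer as a ring of queue_size entries.
static NvU32 example_events_available(const UvmToolsEventControlData_V2 *ctrl,
                                      NvU32 queue_size)
{
    NvU32 put = ctrl->put_behind; // assumed: entries below this index are fully written
    NvU32 get = ctrl->get_behind; // assumed: entries below this index are fully read

    if (put >= get)
        return put - get;
    return queue_size - get + put;
}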
//------------------------------------------------------------------------------
// UVM Tools forward types (handles) definitions

File diff suppressed because it is too large

View File

@@ -706,11 +706,6 @@ void uvm_va_block_context_free(uvm_va_block_context_t *va_block_context);
// mm is used to initialize the value of va_block_context->mm. NULL is allowed.
void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context, struct mm_struct *mm);
// Return the preferred NUMA node ID for the block's policy.
// If the preferred node ID is NUMA_NO_NODE, the current NUMA node ID
// is returned.
int uvm_va_block_context_get_node(uvm_va_block_context_t *va_block_context);
// TODO: Bug 1766480: Using only page masks instead of a combination of regions
// and page masks could simplify the below APIs and their implementations
// at the cost of having to scan the whole mask for small regions.
@@ -1546,7 +1541,11 @@ NV_STATUS uvm_va_block_write_from_cpu(uvm_va_block_t *va_block,
// The [src, src + size) range has to fit within a single PAGE_SIZE page.
//
// LOCKING: The caller must hold the va_block lock
NV_STATUS uvm_va_block_read_to_cpu(uvm_va_block_t *va_block, uvm_mem_t *dst, NvU64 src, size_t size);
NV_STATUS uvm_va_block_read_to_cpu(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_mem_t *dst,
NvU64 src,
size_t size);
// Initialize va block retry tracking
void uvm_va_block_retry_init(uvm_va_block_retry_t *uvm_va_block_retry);
@@ -2090,11 +2089,14 @@ void uvm_va_block_page_resident_processors(uvm_va_block_t *va_block,
// Count how many processors have a copy of the given page resident in their
// memory.
NvU32 uvm_va_block_page_resident_processors_count(uvm_va_block_t *va_block, uvm_page_index_t page_index);
NvU32 uvm_va_block_page_resident_processors_count(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_page_index_t page_index);
// Get the processor with a resident copy of a page closest to the given
// processor.
uvm_processor_id_t uvm_va_block_page_get_closest_resident(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_page_index_t page_index,
uvm_processor_id_t processor);
@@ -2127,6 +2129,11 @@ uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *va_block,
int nid,
uvm_page_index_t page_index);
// Return the CPU chunk for the given page_index from the first available NUMA
// node from the va_block. Should only be called for HMM va_blocks.
// Locking: The va_block lock must be held.
uvm_cpu_chunk_t *uvm_cpu_chunk_get_any_chunk_for_page(uvm_va_block_t *va_block, uvm_page_index_t page_index);
// Return the struct page * from the chunk corresponding to the given page_index
// Locking: The va_block lock must be held.
struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);
@@ -2241,6 +2248,7 @@ uvm_processor_id_t uvm_va_block_select_residency(uvm_va_block_t *va_block,
// Return the maximum mapping protection for processor_id that will not require
// any permission revocation on the rest of the processors.
uvm_prot_t uvm_va_block_page_compute_highest_permission(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t processor_id,
uvm_page_index_t page_index);

View File

@@ -175,6 +175,14 @@ typedef struct
// Scratch node mask. This follows the same rules as scratch_page_mask;
nodemask_t scratch_node_mask;
// Available as scratch space for the internal APIs. This is like a caller-
// save register: it shouldn't be used across function calls which also take
// this va_block_context.
uvm_processor_mask_t scratch_processor_mask;
// Temporary mask in block_add_eviction_mappings().
uvm_processor_mask_t map_processors_eviction;
// State used by uvm_va_block_make_resident
struct uvm_make_resident_context_struct
{
@@ -233,6 +241,16 @@ typedef struct
// are removed as the operation progresses.
uvm_page_mask_t revoke_running_page_mask;
// Mask used by block_gpu_split_2m and block_gpu_split_big to track
// splitting of big PTEs but they are never called concurrently. This
// mask can be used concurrently with other page masks.
uvm_page_mask_t big_split_page_mask;
// Mask used by block_unmap_gpu to track non_uvm_lite_gpus which have
// this block mapped. This mask can be used concurrently with other page
// masks.
uvm_processor_mask_t non_uvm_lite_gpus;
uvm_page_mask_t page_mask;
uvm_page_mask_t filtered_page_mask;
uvm_page_mask_t migratable_mask;
@@ -276,6 +294,10 @@ typedef struct
struct vm_area_struct *vma;
#if UVM_IS_CONFIG_HMM()
// Temporary mask used in uvm_hmm_block_add_eviction_mappings().
uvm_processor_mask_t map_processors_eviction;
// Used for migrate_vma_*() to migrate pages to/from GPU/CPU.
struct migrate_vma migrate_vma_args;
#endif

View File

@@ -1799,7 +1799,7 @@ NV_STATUS uvm_api_alloc_semaphore_pool(UVM_ALLOC_SEMAPHORE_POOL_PARAMS *params,
if (uvm_api_range_invalid(params->base, params->length))
return NV_ERR_INVALID_ADDRESS;
if (params->gpuAttributesCount > UVM_MAX_GPUS)
if (params->gpuAttributesCount > UVM_MAX_GPUS_V2)
return NV_ERR_INVALID_ARGUMENT;
if (g_uvm_global.conf_computing_enabled && params->gpuAttributesCount == 0)

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -86,11 +86,13 @@ static void init_tools_data(uvm_va_space_t *va_space)
for (i = 0; i < ARRAY_SIZE(va_space->tools.counters); i++)
INIT_LIST_HEAD(va_space->tools.counters + i);
for (i = 0; i < ARRAY_SIZE(va_space->tools.queues); i++)
INIT_LIST_HEAD(va_space->tools.queues + i);
for (i = 0; i < ARRAY_SIZE(va_space->tools.queues_v1); i++)
INIT_LIST_HEAD(va_space->tools.queues_v1 + i);
for (i = 0; i < ARRAY_SIZE(va_space->tools.queues_v2); i++)
INIT_LIST_HEAD(va_space->tools.queues_v2 + i);
}
static NV_STATUS register_gpu_nvlink_peers(uvm_va_space_t *va_space, uvm_gpu_t *gpu)
static NV_STATUS register_gpu_peers(uvm_va_space_t *va_space, uvm_gpu_t *gpu)
{
uvm_gpu_t *other_gpu;
@@ -104,7 +106,7 @@ static NV_STATUS register_gpu_nvlink_peers(uvm_va_space_t *va_space, uvm_gpu_t *
peer_caps = uvm_gpu_peer_caps(gpu, other_gpu);
if (peer_caps->link_type >= UVM_GPU_LINK_NVLINK_1) {
if (peer_caps->link_type >= UVM_GPU_LINK_NVLINK_1 || gpu->parent == other_gpu->parent) {
NV_STATUS status = enable_peers(va_space, gpu, other_gpu);
if (status != NV_OK)
return status;
@@ -324,10 +326,16 @@ static void unregister_gpu(uvm_va_space_t *va_space,
}
}
if (gpu->parent->isr.replayable_faults.handling)
if (gpu->parent->isr.replayable_faults.handling) {
UVM_ASSERT(uvm_processor_mask_test(&va_space->faultable_processors, gpu->id));
uvm_processor_mask_clear(&va_space->faultable_processors, gpu->id);
uvm_processor_mask_clear(&va_space->system_wide_atomics_enabled_processors, gpu->id);
UVM_ASSERT(uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, gpu->id));
uvm_processor_mask_clear(&va_space->system_wide_atomics_enabled_processors, gpu->id);
}
else {
UVM_ASSERT(uvm_processor_mask_test(&va_space->non_faultable_processors, gpu->id));
uvm_processor_mask_clear(&va_space->non_faultable_processors, gpu->id);
}
processor_mask_array_clear(va_space->can_access, gpu->id, gpu->id);
processor_mask_array_clear(va_space->can_access, gpu->id, UVM_ID_CPU);
@@ -514,7 +522,7 @@ void uvm_va_space_destroy(uvm_va_space_t *va_space)
nv_kthread_q_flush(&gpu->parent->isr.kill_channel_q);
if (gpu->parent->access_counters_supported)
uvm_gpu_access_counters_disable(gpu, va_space);
uvm_parent_gpu_access_counters_disable(gpu->parent, va_space);
}
// Check that all CPU/GPU affinity masks are empty
@@ -604,7 +612,7 @@ uvm_gpu_t *uvm_va_space_get_gpu_by_uuid(uvm_va_space_t *va_space, const NvProces
uvm_gpu_t *gpu;
for_each_va_space_gpu(gpu, va_space) {
if (uvm_uuid_eq(uvm_gpu_uuid(gpu), gpu_uuid))
if (uvm_uuid_eq(&gpu->uuid, gpu_uuid))
return gpu;
}
@@ -663,7 +671,8 @@ NV_STATUS uvm_va_space_register_gpu(uvm_va_space_t *va_space,
const NvProcessorUuid *gpu_uuid,
const uvm_rm_user_object_t *user_rm_device,
NvBool *numa_enabled,
NvS32 *numa_node_id)
NvS32 *numa_node_id,
NvProcessorUuid *uuid_out)
{
NV_STATUS status;
uvm_va_range_t *va_range;
@@ -675,13 +684,15 @@ NV_STATUS uvm_va_space_register_gpu(uvm_va_space_t *va_space,
if (status != NV_OK)
return status;
uvm_uuid_copy(uuid_out, &gpu->uuid);
// Enabling access counters requires taking the ISR lock, so it is done
// without holding the (deeper order) VA space lock. Enabling the counters
// after dropping the VA space lock would create a window of time in which
// another thread could see the GPU as registered, but access counters would
// be disabled. Therefore, the counters are enabled before taking the VA
// space lock.
if (uvm_gpu_access_counters_required(gpu->parent)) {
if (uvm_parent_gpu_access_counters_required(gpu->parent)) {
status = uvm_gpu_access_counters_enable(gpu, va_space);
if (status != NV_OK) {
uvm_gpu_release(gpu);
@@ -726,10 +737,17 @@ NV_STATUS uvm_va_space_register_gpu(uvm_va_space_t *va_space,
va_space->registered_gpus_table[uvm_id_gpu_index(gpu->id)] = gpu;
if (gpu->parent->isr.replayable_faults.handling) {
UVM_ASSERT(!uvm_processor_mask_test(&va_space->faultable_processors, gpu->id));
uvm_processor_mask_set(&va_space->faultable_processors, gpu->id);
UVM_ASSERT(!uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, gpu->id));
// System-wide atomics are enabled by default
uvm_processor_mask_set(&va_space->system_wide_atomics_enabled_processors, gpu->id);
}
else {
UVM_ASSERT(!uvm_processor_mask_test(&va_space->non_faultable_processors, gpu->id));
uvm_processor_mask_set(&va_space->non_faultable_processors, gpu->id);
}
// All GPUs have native atomics on their own memory
processor_mask_array_set(va_space->has_native_atomics, gpu->id, gpu->id);
@@ -785,7 +803,7 @@ NV_STATUS uvm_va_space_register_gpu(uvm_va_space_t *va_space,
}
}
status = register_gpu_nvlink_peers(va_space, gpu);
status = register_gpu_peers(va_space, gpu);
if (status != NV_OK)
goto cleanup;
@@ -822,9 +840,9 @@ done:
if (status != NV_OK) {
// There is no risk of disabling access counters on a previously
// registered GPU: the enablement step would have failed before even
// discovering that the GPU is already registed.
if (uvm_gpu_access_counters_required(gpu->parent))
uvm_gpu_access_counters_disable(gpu, va_space);
// discovering that the GPU is already registered.
if (uvm_parent_gpu_access_counters_required(gpu->parent))
uvm_parent_gpu_access_counters_disable(gpu->parent, va_space);
uvm_gpu_release(gpu);
}
@@ -876,15 +894,16 @@ NV_STATUS uvm_va_space_unregister_gpu(uvm_va_space_t *va_space, const NvProcesso
// it from the VA space until we're done.
uvm_va_space_up_read_rm(va_space);
// If uvm_gpu_access_counters_required(gpu->parent) is true, a concurrent
// registration could enable access counters after they are disabled here.
// If uvm_parent_gpu_access_counters_required(gpu->parent) is true, a
// concurrent registration could enable access counters after they are
// disabled here.
// The concurrent registration will fail later on if it acquires the VA
// space lock before the unregistration does (because the GPU is still
// registered) and undo the access counters enablement, or succeed if it
// acquires the VA space lock after the unregistration does. Both outcomes
// result on valid states.
if (gpu->parent->access_counters_supported)
uvm_gpu_access_counters_disable(gpu, va_space);
uvm_parent_gpu_access_counters_disable(gpu->parent, va_space);
// mmap_lock is needed to establish CPU mappings to any pages evicted from
// the GPU if accessed by CPU is set for them.
@@ -1040,6 +1059,10 @@ static NV_STATUS enable_peers(uvm_va_space_t *va_space, uvm_gpu_t *gpu0, uvm_gpu
processor_mask_array_set(va_space->indirect_peers, gpu1->id, gpu0->id);
}
}
else if (gpu0->parent == gpu1->parent) {
processor_mask_array_set(va_space->has_native_atomics, gpu0->id, gpu1->id);
processor_mask_array_set(va_space->has_native_atomics, gpu1->id, gpu0->id);
}
UVM_ASSERT(va_space_check_processors_masks(va_space));
__set_bit(table_index, va_space->enabled_peers);
@@ -1091,6 +1114,7 @@ static NV_STATUS retain_pcie_peers_from_uuids(uvm_va_space_t *va_space,
static bool uvm_va_space_pcie_peer_enabled(uvm_va_space_t *va_space, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
return !processor_mask_array_test(va_space->has_nvlink, gpu0->id, gpu1->id) &&
gpu0->parent != gpu1->parent &&
uvm_va_space_peer_enabled(va_space, gpu0, gpu1);
}

View File

@@ -163,6 +163,10 @@ struct uvm_va_space_struct
// faults.
uvm_processor_mask_t faultable_processors;
// Mask of processors registered with the va space that don't support
// faulting.
uvm_processor_mask_t non_faultable_processors;
// This is a count of non fault capable processors with a GPU VA space
// registered.
NvU32 num_non_faultable_gpu_va_spaces;
@@ -261,8 +265,8 @@ struct uvm_va_space_struct
// Mask of processors that are participating in system-wide atomics
uvm_processor_mask_t system_wide_atomics_enabled_processors;
// Mask of GPUs where access counters are enabled on this VA space
uvm_processor_mask_t access_counters_enabled_processors;
// Mask of physical GPUs where access counters are enabled on this VA space
uvm_parent_processor_mask_t access_counters_enabled_processors;
// Array with information regarding CPU/GPU NUMA affinity. There is one
// entry per CPU NUMA node. Entries in the array are populated sequentially
@@ -308,7 +312,8 @@ struct uvm_va_space_struct
// Lists of counters listening for events on this VA space
struct list_head counters[UVM_TOTAL_COUNTERS];
struct list_head queues[UvmEventNumTypesAll];
struct list_head queues_v1[UvmEventNumTypesAll];
struct list_head queues_v2[UvmEventNumTypesAll];
// Node for this va_space in global subscribers list
struct list_head node;
@@ -399,7 +404,7 @@ static void uvm_va_space_processor_uuid(uvm_va_space_t *va_space, NvProcessorUui
else {
uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
UVM_ASSERT(gpu);
memcpy(uuid, uvm_gpu_uuid(gpu), sizeof(*uuid));
memcpy(uuid, &gpu->uuid, sizeof(*uuid));
}
}
@@ -472,9 +477,9 @@ void uvm_va_space_destroy(uvm_va_space_t *va_space);
uvm_mutex_unlock(&(__va_space)->serialize_writers_lock); \
} while (0)
// Get a registered gpu by uuid. This restricts the search for GPUs, to those that
// have been registered with a va_space. This returns NULL if the GPU is not present, or not
// registered with the va_space.
// Get a registered gpu by uuid. This restricts the search for GPUs, to those
// that have been registered with a va_space. This returns NULL if the GPU is
// not present, or not registered with the va_space.
//
// LOCKING: The VA space lock must be held.
uvm_gpu_t *uvm_va_space_get_gpu_by_uuid(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid);
@@ -501,13 +506,19 @@ bool uvm_va_space_can_read_duplicate(uvm_va_space_t *va_space, uvm_gpu_t *changi
// Register a gpu in the va space
// Note that each gpu can be only registered once in a va space
//
// The input gpu_uuid is for the physical GPU. The user_rm_va_space argument
// identifies the SMC partition if provided and SMC is enabled.
//
// This call returns whether the GPU memory is a NUMA node in the kernel and the
// corresponding node id.
// It also returns the GI UUID (if gpu_uuid is an SMC partition) or a copy of
// gpu_uuid if the GPU is not SMC capable or SMC is not enabled.
NV_STATUS uvm_va_space_register_gpu(uvm_va_space_t *va_space,
const NvProcessorUuid *gpu_uuid,
const uvm_rm_user_object_t *user_rm_va_space,
NvBool *numa_enabled,
NvS32 *numa_node_id);
NvS32 *numa_node_id,
NvProcessorUuid *uuid_out);
// Unregister a gpu from the va space
NV_STATUS uvm_va_space_unregister_gpu(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid);

View File

@@ -280,7 +280,9 @@ NV_STATUS uvm_va_space_mm_register(uvm_va_space_t *va_space)
}
}
if ((UVM_IS_CONFIG_HMM() || UVM_ATS_PREFETCH_SUPPORTED()) && uvm_va_space_pageable_mem_access_supported(va_space)) {
if ((UVM_IS_CONFIG_HMM() || UVM_HMM_RANGE_FAULT_SUPPORTED()) &&
uvm_va_space_pageable_mem_access_supported(va_space)) {
#if UVM_CAN_USE_MMU_NOTIFIERS()
// Initialize MMU interval notifiers for this process. This allows
// mmu_interval_notifier_insert() to be called without holding the