535.113.01

This commit is contained in:
Maneet Singh
2023-09-21 10:43:43 -07:00
parent a8e01be6b2
commit f59818b751
94 changed files with 2414 additions and 800 deletions

View File

@@ -186,6 +186,7 @@ CSINFO chipsetInfo[] =
{PCI_VENDOR_ID_INTEL, 0x7A82, CS_INTEL_7A82, "Intel-AlderLake", Intel_7A82_setupFunc},
{PCI_VENDOR_ID_INTEL, 0x7A84, CS_INTEL_7A82, "Intel-AlderLake", Intel_7A82_setupFunc},
{PCI_VENDOR_ID_INTEL, 0x1B81, CS_INTEL_1B81, "Intel-SapphireRapids", NULL},
{PCI_VENDOR_ID_INTEL, 0x7A8A, CS_INTEL_1B81, "Intel-SapphireRapids", NULL},
{PCI_VENDOR_ID_INTEL, 0x18DC, CS_INTEL_18DC, "Intel-IceLake", NULL},
{PCI_VENDOR_ID_INTEL, 0x7A04, CS_INTEL_7A04, "Intel-RaptorLake", Intel_7A04_setupFunc},

View File

@@ -207,9 +207,13 @@ enum os_pci_req_atomics_type {
OS_INTF_PCIE_REQ_ATOMICS_128BIT
};
NV_STATUS NV_API_CALL os_enable_pci_req_atomics (void *, enum os_pci_req_atomics_type);
NV_STATUS NV_API_CALL os_get_numa_node_memory_usage (NvS32, NvU64 *, NvU64 *);
NV_STATUS NV_API_CALL os_numa_add_gpu_memory (void *, NvU64, NvU64, NvU32 *);
NV_STATUS NV_API_CALL os_numa_remove_gpu_memory (void *, NvU64, NvU64, NvU32);
NV_STATUS NV_API_CALL os_offline_page_at_address(NvU64 address);
void* NV_API_CALL os_get_pid_info(void);
void NV_API_CALL os_put_pid_info(void *pid_info);
NV_STATUS NV_API_CALL os_find_ns_pid(void *pid_info, NvU32 *ns_pid);
extern NvU32 os_page_size;
extern NvU64 os_page_mask;

View File

@@ -684,6 +684,21 @@ NV_STATUS osGetCurrentThread(OS_THREAD_HANDLE *pThreadId)
return rmStatus;
}
/*!
 * @brief Obtain an opaque handle to the current process's PID info
 *        from the OS interface layer.
 *
 * @returns Opaque pid-info pointer; pair with osPutPidInfo() to release.
 */
void* osGetPidInfo(void)
{
    void *pPidInfo = os_get_pid_info();

    return pPidInfo;
}
/*!
 * @brief Release a pid-info handle previously obtained via osGetPidInfo().
 *
 * @param[in] pOsPidInfo  Opaque pid-info pointer to release.
 */
void osPutPidInfo(void *pOsPidInfo)
{
    // Forward straight to the OS interface layer; no RM-side state to clean up.
    os_put_pid_info(pOsPidInfo);
}
/*!
 * @brief Look up the namespace-local PID for a pid-info handle.
 *
 * @param[in]  pOsPidInfo  Opaque pid-info pointer (from osGetPidInfo()).
 * @param[out] pNsPid      Receives the namespace PID on success.
 *
 * @returns Status propagated from the OS interface layer.
 */
NV_STATUS osFindNsPid(void *pOsPidInfo, NvU32 *pNsPid)
{
    NV_STATUS status = os_find_ns_pid(pOsPidInfo, pNsPid);

    return status;
}
NV_STATUS osAttachToProcess(void** ppProcessInfo, NvU32 ProcessId)
{
//
@@ -5371,6 +5386,28 @@ osReleaseGpuOsInfo
nv_put_file_private(pOsInfo);
}
/*!
 * @brief Query the kernel for the free and total memory of a NUMA node.
 *
 * Asserts (but does not propagate) failure from the OS interface layer;
 * callers receive whatever the output parameters were set to.
 *
 * @param[in]  numaId              NUMA node ID to query.
 * @param[out] free_memory_bytes   Receives free memory in bytes.
 * @param[out] total_memory_bytes  Receives total memory in bytes.
 */
void
osGetNumaMemoryUsage
(
    NvS32 numaId,
    NvU64 *free_memory_bytes,
    NvU64 *total_memory_bytes
)
{
    NV_STATUS memUsageStatus;

    memUsageStatus = os_get_numa_node_memory_usage(numaId,
                                                   free_memory_bytes,
                                                   total_memory_bytes);

    NV_ASSERT(memUsageStatus == NV_OK);
}
/*!
* @brief Add GPU memory as a NUMA node.
*

View File

@@ -140,6 +140,7 @@ struct RmClient {
NvU32 Flags;
NvU32 ClientDebuggerState;
void *pOSInfo;
void *pOsPidInfo;
char name[100];
CLI_SYSTEM_EVENT_INFO CliSysEventInfo;
PSECURITY_TOKEN pSecurityToken;

View File

@@ -492,6 +492,17 @@ static void __nvoc_init_funcTable_OBJGPU_1(OBJGPU *pThis) {
pThis->__gpuWriteFunctionConfigRegEx__ = &gpuWriteFunctionConfigRegEx_GM107;
}
// Hal function -- gpuReadVgpuConfigReg
if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x10000000UL) )) /* ChipHal: GH100 */
{
pThis->__gpuReadVgpuConfigReg__ = &gpuReadVgpuConfigReg_GH100;
}
// default
else
{
pThis->__gpuReadVgpuConfigReg__ = &gpuReadVgpuConfigReg_46f6a7;
}
// Hal function -- gpuGetIdInfo
if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x10000000UL) )) /* ChipHal: GH100 */
{

View File

@@ -877,6 +877,7 @@ struct OBJGPU {
NV_STATUS (*__gpuReadFunctionConfigReg__)(struct OBJGPU *, NvU32, NvU32, NvU32 *);
NV_STATUS (*__gpuWriteFunctionConfigReg__)(struct OBJGPU *, NvU32, NvU32, NvU32);
NV_STATUS (*__gpuWriteFunctionConfigRegEx__)(struct OBJGPU *, NvU32, NvU32, NvU32, THREAD_STATE_NODE *);
NV_STATUS (*__gpuReadVgpuConfigReg__)(struct OBJGPU *, NvU32, NvU32 *);
void (*__gpuGetIdInfo__)(struct OBJGPU *);
void (*__gpuHandleSanityCheckRegReadError__)(struct OBJGPU *, NvU32, NvU32);
void (*__gpuHandleSecFault__)(struct OBJGPU *);
@@ -1007,6 +1008,9 @@ struct OBJGPU {
NvU32 subdeviceInstance;
NvS32 numaNodeId;
_GPU_UUID gpuUuid;
NvU32 gpuPhysicalId;
NvU32 gpuTerminatedLinkMask;
NvBool gpuLinkTerminationEnabled;
NvBool gspRmInitialized;
_GPU_PCIE_PEER_CLIQUE pciePeerClique;
NvU32 i2cPortForExtdev;
@@ -1427,6 +1431,8 @@ NV_STATUS __nvoc_objCreate_OBJGPU(OBJGPU**, Dynamic*, NvU32,
#define gpuWriteFunctionConfigReg_HAL(pGpu, function, reg, data) gpuWriteFunctionConfigReg_DISPATCH(pGpu, function, reg, data)
#define gpuWriteFunctionConfigRegEx(pGpu, function, reg, data, pThreadState) gpuWriteFunctionConfigRegEx_DISPATCH(pGpu, function, reg, data, pThreadState)
#define gpuWriteFunctionConfigRegEx_HAL(pGpu, function, reg, data, pThreadState) gpuWriteFunctionConfigRegEx_DISPATCH(pGpu, function, reg, data, pThreadState)
#define gpuReadVgpuConfigReg(pGpu, index, data) gpuReadVgpuConfigReg_DISPATCH(pGpu, index, data)
#define gpuReadVgpuConfigReg_HAL(pGpu, index, data) gpuReadVgpuConfigReg_DISPATCH(pGpu, index, data)
#define gpuGetIdInfo(pGpu) gpuGetIdInfo_DISPATCH(pGpu)
#define gpuGetIdInfo_HAL(pGpu) gpuGetIdInfo_DISPATCH(pGpu)
#define gpuHandleSanityCheckRegReadError(pGpu, addr, value) gpuHandleSanityCheckRegReadError_DISPATCH(pGpu, addr, value)
@@ -2422,6 +2428,19 @@ static inline void gpuUpdateUserSharedData(struct OBJGPU *pGpu) {
#define gpuUpdateUserSharedData_HAL(pGpu) gpuUpdateUserSharedData(pGpu)
void gpuGetTerminatedLinkMask_GA100(struct OBJGPU *pGpu, NvU32 arg0);
#ifdef __nvoc_gpu_h_disabled
static inline void gpuGetTerminatedLinkMask(struct OBJGPU *pGpu, NvU32 arg0) {
NV_ASSERT_FAILED_PRECOMP("OBJGPU was disabled!");
}
#else //__nvoc_gpu_h_disabled
#define gpuGetTerminatedLinkMask(pGpu, arg0) gpuGetTerminatedLinkMask_GA100(pGpu, arg0)
#endif //__nvoc_gpu_h_disabled
#define gpuGetTerminatedLinkMask_HAL(pGpu, arg0) gpuGetTerminatedLinkMask(pGpu, arg0)
NV_STATUS gpuJtVersionSanityCheck_TU102(struct OBJGPU *pGpu);
@@ -2970,6 +2989,16 @@ static inline NV_STATUS gpuWriteFunctionConfigRegEx_DISPATCH(struct OBJGPU *pGpu
return pGpu->__gpuWriteFunctionConfigRegEx__(pGpu, function, reg, data, pThreadState);
}
NV_STATUS gpuReadVgpuConfigReg_GH100(struct OBJGPU *pGpu, NvU32 index, NvU32 *data);
static inline NV_STATUS gpuReadVgpuConfigReg_46f6a7(struct OBJGPU *pGpu, NvU32 index, NvU32 *data) {
return NV_ERR_NOT_SUPPORTED;
}
static inline NV_STATUS gpuReadVgpuConfigReg_DISPATCH(struct OBJGPU *pGpu, NvU32 index, NvU32 *data) {
return pGpu->__gpuReadVgpuConfigReg__(pGpu, index, data);
}
void gpuGetIdInfo_GM107(struct OBJGPU *pGpu);
void gpuGetIdInfo_GH100(struct OBJGPU *pGpu);

View File

@@ -137,10 +137,14 @@ void __nvoc_dtor_KernelFsp(KernelFsp *pThis) {
void __nvoc_init_dataField_KernelFsp(KernelFsp *pThis, RmHalspecOwner *pRmhalspecowner) {
ChipHal *chipHal = &pRmhalspecowner->chipHal;
const unsigned long chipHal_HalVarIdx = (unsigned long)chipHal->__nvoc_HalVarIdx;
RmVariantHal *rmVariantHal = &pRmhalspecowner->rmVariantHal;
const unsigned long rmVariantHal_HalVarIdx = (unsigned long)rmVariantHal->__nvoc_HalVarIdx;
PORT_UNREFERENCED_VARIABLE(pThis);
PORT_UNREFERENCED_VARIABLE(pRmhalspecowner);
PORT_UNREFERENCED_VARIABLE(chipHal);
PORT_UNREFERENCED_VARIABLE(chipHal_HalVarIdx);
PORT_UNREFERENCED_VARIABLE(rmVariantHal);
PORT_UNREFERENCED_VARIABLE(rmVariantHal_HalVarIdx);
// NVOC Property Hal field -- PDB_PROP_KFSP_IS_MISSING
if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x10000000UL) )) /* ChipHal: GH100 */
@@ -152,6 +156,12 @@ void __nvoc_init_dataField_KernelFsp(KernelFsp *pThis, RmHalspecOwner *pRmhalspe
{
pThis->setProperty(pThis, PDB_PROP_KFSP_IS_MISSING, ((NvBool)(0 == 0)));
}
// NVOC Property Hal field -- PDB_PROP_KFSP_DISABLE_FRTS_SYSMEM
if (( ((rmVariantHal_HalVarIdx >> 5) == 0UL) && ((1UL << (rmVariantHal_HalVarIdx & 0x1f)) & 0x00000002UL) )) /* RmVariantHal: PF_KERNEL_ONLY */
{
pThis->setProperty(pThis, PDB_PROP_KFSP_DISABLE_FRTS_SYSMEM, ((NvBool)(0 == 0)));
}
}
NV_STATUS __nvoc_ctor_OBJENGSTATE(OBJENGSTATE* );
@@ -171,10 +181,14 @@ __nvoc_ctor_KernelFsp_exit:
static void __nvoc_init_funcTable_KernelFsp_1(KernelFsp *pThis, RmHalspecOwner *pRmhalspecowner) {
ChipHal *chipHal = &pRmhalspecowner->chipHal;
const unsigned long chipHal_HalVarIdx = (unsigned long)chipHal->__nvoc_HalVarIdx;
RmVariantHal *rmVariantHal = &pRmhalspecowner->rmVariantHal;
const unsigned long rmVariantHal_HalVarIdx = (unsigned long)rmVariantHal->__nvoc_HalVarIdx;
PORT_UNREFERENCED_VARIABLE(pThis);
PORT_UNREFERENCED_VARIABLE(pRmhalspecowner);
PORT_UNREFERENCED_VARIABLE(chipHal);
PORT_UNREFERENCED_VARIABLE(chipHal_HalVarIdx);
PORT_UNREFERENCED_VARIABLE(rmVariantHal);
PORT_UNREFERENCED_VARIABLE(rmVariantHal_HalVarIdx);
pThis->__kfspConstructEngine__ = &kfspConstructEngine_IMPL;

View File

@@ -425,6 +425,28 @@ static void __nvoc_init_funcTable_KernelMemorySystem_1(KernelMemorySystem *pThis
pThis->__kmemsysRemoveAllAtsPeers__ = &kmemsysRemoveAllAtsPeers_GV100;
}
// Hal function -- kmemsysCheckEccCounts
if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x10000000UL) )) /* ChipHal: GH100 */
{
pThis->__kmemsysCheckEccCounts__ = &kmemsysCheckEccCounts_GH100;
}
// default
else
{
pThis->__kmemsysCheckEccCounts__ = &kmemsysCheckEccCounts_b3696a;
}
// Hal function -- kmemsysClearEccCounts
if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x10000000UL) )) /* ChipHal: GH100 */
{
pThis->__kmemsysClearEccCounts__ = &kmemsysClearEccCounts_GH100;
}
// default
else
{
pThis->__kmemsysClearEccCounts__ = &kmemsysClearEccCounts_56cd7a;
}
pThis->__nvoc_base_OBJENGSTATE.__engstateConstructEngine__ = &__nvoc_thunk_KernelMemorySystem_engstateConstructEngine;
pThis->__nvoc_base_OBJENGSTATE.__engstateStateInitLocked__ = &__nvoc_thunk_KernelMemorySystem_engstateStateInitLocked;

View File

@@ -222,6 +222,8 @@ struct KernelMemorySystem {
void (*__kmemsysNumaRemoveAllMemory__)(OBJGPU *, struct KernelMemorySystem *);
NV_STATUS (*__kmemsysSetupAllAtsPeers__)(OBJGPU *, struct KernelMemorySystem *);
void (*__kmemsysRemoveAllAtsPeers__)(OBJGPU *, struct KernelMemorySystem *);
void (*__kmemsysCheckEccCounts__)(OBJGPU *, struct KernelMemorySystem *);
NV_STATUS (*__kmemsysClearEccCounts__)(OBJGPU *, struct KernelMemorySystem *);
NV_STATUS (*__kmemsysStateLoad__)(POBJGPU, struct KernelMemorySystem *, NvU32);
NV_STATUS (*__kmemsysStateUnload__)(POBJGPU, struct KernelMemorySystem *, NvU32);
NV_STATUS (*__kmemsysStatePostUnload__)(POBJGPU, struct KernelMemorySystem *, NvU32);
@@ -323,6 +325,10 @@ NV_STATUS __nvoc_objCreate_KernelMemorySystem(KernelMemorySystem**, Dynamic*, Nv
#define kmemsysSetupAllAtsPeers_HAL(pGpu, pKernelMemorySystem) kmemsysSetupAllAtsPeers_DISPATCH(pGpu, pKernelMemorySystem)
#define kmemsysRemoveAllAtsPeers(pGpu, pKernelMemorySystem) kmemsysRemoveAllAtsPeers_DISPATCH(pGpu, pKernelMemorySystem)
#define kmemsysRemoveAllAtsPeers_HAL(pGpu, pKernelMemorySystem) kmemsysRemoveAllAtsPeers_DISPATCH(pGpu, pKernelMemorySystem)
#define kmemsysCheckEccCounts(pGpu, pKernelMemorySystem) kmemsysCheckEccCounts_DISPATCH(pGpu, pKernelMemorySystem)
#define kmemsysCheckEccCounts_HAL(pGpu, pKernelMemorySystem) kmemsysCheckEccCounts_DISPATCH(pGpu, pKernelMemorySystem)
#define kmemsysClearEccCounts(pGpu, pKernelMemorySystem) kmemsysClearEccCounts_DISPATCH(pGpu, pKernelMemorySystem)
#define kmemsysClearEccCounts_HAL(pGpu, pKernelMemorySystem) kmemsysClearEccCounts_DISPATCH(pGpu, pKernelMemorySystem)
#define kmemsysStateLoad(pGpu, pEngstate, arg0) kmemsysStateLoad_DISPATCH(pGpu, pEngstate, arg0)
#define kmemsysStateUnload(pGpu, pEngstate, arg0) kmemsysStateUnload_DISPATCH(pGpu, pEngstate, arg0)
#define kmemsysStatePostUnload(pGpu, pEngstate, arg0) kmemsysStatePostUnload_DISPATCH(pGpu, pEngstate, arg0)
@@ -733,6 +739,26 @@ static inline void kmemsysRemoveAllAtsPeers_DISPATCH(OBJGPU *pGpu, struct Kernel
pKernelMemorySystem->__kmemsysRemoveAllAtsPeers__(pGpu, pKernelMemorySystem);
}
void kmemsysCheckEccCounts_GH100(OBJGPU *pGpu, struct KernelMemorySystem *pKernelMemorySystem);
static inline void kmemsysCheckEccCounts_b3696a(OBJGPU *pGpu, struct KernelMemorySystem *pKernelMemorySystem) {
return;
}
static inline void kmemsysCheckEccCounts_DISPATCH(OBJGPU *pGpu, struct KernelMemorySystem *pKernelMemorySystem) {
pKernelMemorySystem->__kmemsysCheckEccCounts__(pGpu, pKernelMemorySystem);
}
NV_STATUS kmemsysClearEccCounts_GH100(OBJGPU *pGpu, struct KernelMemorySystem *pKernelMemorySystem);
static inline NV_STATUS kmemsysClearEccCounts_56cd7a(OBJGPU *pGpu, struct KernelMemorySystem *pKernelMemorySystem) {
return NV_OK;
}
static inline NV_STATUS kmemsysClearEccCounts_DISPATCH(OBJGPU *pGpu, struct KernelMemorySystem *pKernelMemorySystem) {
return pKernelMemorySystem->__kmemsysClearEccCounts__(pGpu, pKernelMemorySystem);
}
static inline NV_STATUS kmemsysStateLoad_DISPATCH(POBJGPU pGpu, struct KernelMemorySystem *pEngstate, NvU32 arg0) {
return pEngstate->__kmemsysStateLoad__(pGpu, pEngstate, arg0);
}

View File

@@ -221,6 +221,16 @@ static void __nvoc_init_funcTable_KernelCE_1(KernelCE *pThis, RmHalspecOwner *pR
pThis->__kceServiceNotificationInterrupt__ = &kceServiceNotificationInterrupt_IMPL;
// Hal function -- kceGetP2PCes
if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x10000000UL) )) /* ChipHal: GH100 */
{
pThis->__kceGetP2PCes__ = &kceGetP2PCes_GH100;
}
else
{
pThis->__kceGetP2PCes__ = &kceGetP2PCes_GV100;
}
// Hal function -- kceGetNvlinkAutoConfigCeValues
if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x000003e0UL) )) /* ChipHal: TU102 | TU104 | TU106 | TU116 | TU117 */
{

View File

@@ -113,6 +113,7 @@ struct KernelCE {
NV_STATUS (*__kceStateUnload__)(OBJGPU *, struct KernelCE *, NvU32);
void (*__kceRegisterIntrService__)(OBJGPU *, struct KernelCE *, IntrServiceRecord *);
NV_STATUS (*__kceServiceNotificationInterrupt__)(OBJGPU *, struct KernelCE *, IntrServiceServiceNotificationInterruptArguments *);
NV_STATUS (*__kceGetP2PCes__)(struct KernelCE *, OBJGPU *, NvU32, NvU32 *);
NV_STATUS (*__kceGetNvlinkAutoConfigCeValues__)(OBJGPU *, struct KernelCE *, NvU32 *, NvU32 *, NvU32 *);
NvBool (*__kceGetNvlinkMaxTopoForTable__)(OBJGPU *, struct KernelCE *, struct NVLINK_TOPOLOGY_PARAMS *, void *, NvU32, NvU32 *);
NvBool (*__kceIsCurrentMaxTopology__)(OBJGPU *, struct KernelCE *, struct NVLINK_TOPOLOGY_PARAMS *, NvU32 *, NvU32 *);
@@ -190,6 +191,8 @@ NV_STATUS __nvoc_objCreate_KernelCE(KernelCE**, Dynamic*, NvU32);
#define kceStateUnload_HAL(pGpu, pKCe, flags) kceStateUnload_DISPATCH(pGpu, pKCe, flags)
#define kceRegisterIntrService(arg0, arg1, arg2) kceRegisterIntrService_DISPATCH(arg0, arg1, arg2)
#define kceServiceNotificationInterrupt(arg0, arg1, arg2) kceServiceNotificationInterrupt_DISPATCH(arg0, arg1, arg2)
#define kceGetP2PCes(arg0, pGpu, gpuMask, nvlinkP2PCeMask) kceGetP2PCes_DISPATCH(arg0, pGpu, gpuMask, nvlinkP2PCeMask)
#define kceGetP2PCes_HAL(arg0, pGpu, gpuMask, nvlinkP2PCeMask) kceGetP2PCes_DISPATCH(arg0, pGpu, gpuMask, nvlinkP2PCeMask)
#define kceGetNvlinkAutoConfigCeValues(pGpu, pKCe, arg0, arg1, arg2) kceGetNvlinkAutoConfigCeValues_DISPATCH(pGpu, pKCe, arg0, arg1, arg2)
#define kceGetNvlinkAutoConfigCeValues_HAL(pGpu, pKCe, arg0, arg1, arg2) kceGetNvlinkAutoConfigCeValues_DISPATCH(pGpu, pKCe, arg0, arg1, arg2)
#define kceGetNvlinkMaxTopoForTable(pGpu, pKCe, arg0, arg1, arg2, arg3) kceGetNvlinkMaxTopoForTable_DISPATCH(pGpu, pKCe, arg0, arg1, arg2, arg3)
@@ -305,20 +308,6 @@ static inline NvBool kceIsCeNvlinkP2P(OBJGPU *pGpu, struct KernelCE *pKCe) {
#define kceIsCeNvlinkP2P_HAL(pGpu, pKCe) kceIsCeNvlinkP2P(pGpu, pKCe)
NV_STATUS kceGetP2PCes_GV100(struct KernelCE *arg0, OBJGPU *pGpu, NvU32 gpuMask, NvU32 *nvlinkP2PCeMask);
#ifdef __nvoc_kernel_ce_h_disabled
static inline NV_STATUS kceGetP2PCes(struct KernelCE *arg0, OBJGPU *pGpu, NvU32 gpuMask, NvU32 *nvlinkP2PCeMask) {
NV_ASSERT_FAILED_PRECOMP("KernelCE was disabled!");
return NV_ERR_NOT_SUPPORTED;
}
#else //__nvoc_kernel_ce_h_disabled
#define kceGetP2PCes(arg0, pGpu, gpuMask, nvlinkP2PCeMask) kceGetP2PCes_GV100(arg0, pGpu, gpuMask, nvlinkP2PCeMask)
#endif //__nvoc_kernel_ce_h_disabled
#define kceGetP2PCes_HAL(arg0, pGpu, gpuMask, nvlinkP2PCeMask) kceGetP2PCes(arg0, pGpu, gpuMask, nvlinkP2PCeMask)
void kceGetSysmemRWLCEs_GV100(struct KernelCE *arg0, NvU32 *rd, NvU32 *wr);
@@ -397,6 +386,14 @@ static inline NV_STATUS kceServiceNotificationInterrupt_DISPATCH(OBJGPU *arg0, s
return arg1->__kceServiceNotificationInterrupt__(arg0, arg1, arg2);
}
NV_STATUS kceGetP2PCes_GV100(struct KernelCE *arg0, OBJGPU *pGpu, NvU32 gpuMask, NvU32 *nvlinkP2PCeMask);
NV_STATUS kceGetP2PCes_GH100(struct KernelCE *arg0, OBJGPU *pGpu, NvU32 gpuMask, NvU32 *nvlinkP2PCeMask);
static inline NV_STATUS kceGetP2PCes_DISPATCH(struct KernelCE *arg0, OBJGPU *pGpu, NvU32 gpuMask, NvU32 *nvlinkP2PCeMask) {
return arg0->__kceGetP2PCes__(arg0, pGpu, gpuMask, nvlinkP2PCeMask);
}
NV_STATUS kceGetNvlinkAutoConfigCeValues_TU102(OBJGPU *pGpu, struct KernelCE *pKCe, NvU32 *arg0, NvU32 *arg1, NvU32 *arg2);
NV_STATUS kceGetNvlinkAutoConfigCeValues_GA100(OBJGPU *pGpu, struct KernelCE *pKCe, NvU32 *arg0, NvU32 *arg1, NvU32 *arg2);

View File

@@ -105,10 +105,6 @@ static NV_STATUS __nvoc_thunk_MemoryMulticastFabric_memControl(struct Memory *pM
return memorymulticastfabricControl((struct MemoryMulticastFabric *)(((unsigned char *)pMemoryMulticastFabric) - __nvoc_rtti_MemoryMulticastFabric_Memory.offset), pCallContext, pParams);
}
static NV_STATUS __nvoc_thunk_MemoryMulticastFabric_rmresControl_Prologue(struct RmResource *pMemoryMulticastFabric, CALL_CONTEXT *pCallContext, struct RS_RES_CONTROL_PARAMS_INTERNAL *pParams) {
return memorymulticastfabricControl_Prologue((struct MemoryMulticastFabric *)(((unsigned char *)pMemoryMulticastFabric) - __nvoc_rtti_MemoryMulticastFabric_RmResource.offset), pCallContext, pParams);
}
static NvBool __nvoc_thunk_MemoryMulticastFabric_memIsGpuMapAllowed(struct Memory *pMemoryMulticastFabric, struct OBJGPU *pGpu) {
return memorymulticastfabricIsGpuMapAllowed((struct MemoryMulticastFabric *)(((unsigned char *)pMemoryMulticastFabric) - __nvoc_rtti_MemoryMulticastFabric_Memory.offset), pGpu);
}
@@ -137,6 +133,10 @@ static void __nvoc_thunk_RsResource_memorymulticastfabricAddAdditionalDependants
resAddAdditionalDependants(pClient, (struct RsResource *)(((unsigned char *)pResource) + __nvoc_rtti_MemoryMulticastFabric_RsResource.offset), pReference);
}
static NV_STATUS __nvoc_thunk_RmResource_memorymulticastfabricControl_Prologue(struct MemoryMulticastFabric *pResource, CALL_CONTEXT *pCallContext, struct RS_RES_CONTROL_PARAMS_INTERNAL *pParams) {
return rmresControl_Prologue((struct RmResource *)(((unsigned char *)pResource) + __nvoc_rtti_MemoryMulticastFabric_RmResource.offset), pCallContext, pParams);
}
static NV_STATUS __nvoc_thunk_RsResource_memorymulticastfabricUnmapFrom(struct MemoryMulticastFabric *pResource, RS_RES_UNMAP_FROM_PARAMS *pParams) {
return resUnmapFrom((struct RsResource *)(((unsigned char *)pResource) + __nvoc_rtti_MemoryMulticastFabric_RsResource.offset), pParams);
}
@@ -324,8 +324,6 @@ static void __nvoc_init_funcTable_MemoryMulticastFabric_1(MemoryMulticastFabric
pThis->__memorymulticastfabricControl__ = &memorymulticastfabricControl_IMPL;
pThis->__memorymulticastfabricControl_Prologue__ = &memorymulticastfabricControl_Prologue_IMPL;
pThis->__memorymulticastfabricIsGpuMapAllowed__ = &memorymulticastfabricIsGpuMapAllowed_IMPL;
pThis->__memorymulticastfabricGetMapAddrSpace__ = &memorymulticastfabricGetMapAddrSpace_IMPL;
@@ -356,8 +354,6 @@ static void __nvoc_init_funcTable_MemoryMulticastFabric_1(MemoryMulticastFabric
pThis->__nvoc_base_Memory.__memControl__ = &__nvoc_thunk_MemoryMulticastFabric_memControl;
pThis->__nvoc_base_Memory.__nvoc_base_RmResource.__rmresControl_Prologue__ = &__nvoc_thunk_MemoryMulticastFabric_rmresControl_Prologue;
pThis->__nvoc_base_Memory.__memIsGpuMapAllowed__ = &__nvoc_thunk_MemoryMulticastFabric_memIsGpuMapAllowed;
pThis->__nvoc_base_Memory.__memGetMapAddrSpace__ = &__nvoc_thunk_MemoryMulticastFabric_memGetMapAddrSpace;
@@ -372,6 +368,8 @@ static void __nvoc_init_funcTable_MemoryMulticastFabric_1(MemoryMulticastFabric
pThis->__memorymulticastfabricAddAdditionalDependants__ = &__nvoc_thunk_RsResource_memorymulticastfabricAddAdditionalDependants;
pThis->__memorymulticastfabricControl_Prologue__ = &__nvoc_thunk_RmResource_memorymulticastfabricControl_Prologue;
pThis->__memorymulticastfabricUnmapFrom__ = &__nvoc_thunk_RsResource_memorymulticastfabricUnmapFrom;
pThis->__memorymulticastfabricControl_Epilogue__ = &__nvoc_thunk_RmResource_memorymulticastfabricControl_Epilogue;

View File

@@ -158,7 +158,6 @@ struct MemoryMulticastFabric {
NV_STATUS (*__memorymulticastfabricCopyConstruct__)(struct MemoryMulticastFabric *, CALL_CONTEXT *, struct RS_RES_ALLOC_PARAMS_INTERNAL *);
NV_STATUS (*__memorymulticastfabricIsReady__)(struct MemoryMulticastFabric *, NvBool);
NV_STATUS (*__memorymulticastfabricControl__)(struct MemoryMulticastFabric *, CALL_CONTEXT *, struct RS_RES_CONTROL_PARAMS_INTERNAL *);
NV_STATUS (*__memorymulticastfabricControl_Prologue__)(struct MemoryMulticastFabric *, CALL_CONTEXT *, struct RS_RES_CONTROL_PARAMS_INTERNAL *);
NvBool (*__memorymulticastfabricIsGpuMapAllowed__)(struct MemoryMulticastFabric *, struct OBJGPU *);
NV_STATUS (*__memorymulticastfabricGetMapAddrSpace__)(struct MemoryMulticastFabric *, CALL_CONTEXT *, NvU32, NV_ADDRESS_SPACE *);
NV_STATUS (*__memorymulticastfabricCtrlGetInfo__)(struct MemoryMulticastFabric *, NV00FD_CTRL_GET_INFO_PARAMS *);
@@ -171,6 +170,7 @@ struct MemoryMulticastFabric {
NV_STATUS (*__memorymulticastfabricMapTo__)(struct MemoryMulticastFabric *, RS_RES_MAP_TO_PARAMS *);
NvU32 (*__memorymulticastfabricGetRefCount__)(struct MemoryMulticastFabric *);
void (*__memorymulticastfabricAddAdditionalDependants__)(struct RsClient *, struct MemoryMulticastFabric *, RsResourceRef *);
NV_STATUS (*__memorymulticastfabricControl_Prologue__)(struct MemoryMulticastFabric *, CALL_CONTEXT *, struct RS_RES_CONTROL_PARAMS_INTERNAL *);
NV_STATUS (*__memorymulticastfabricUnmapFrom__)(struct MemoryMulticastFabric *, RS_RES_UNMAP_FROM_PARAMS *);
void (*__memorymulticastfabricControl_Epilogue__)(struct MemoryMulticastFabric *, CALL_CONTEXT *, struct RS_RES_CONTROL_PARAMS_INTERNAL *);
NV_STATUS (*__memorymulticastfabricControlLookup__)(struct MemoryMulticastFabric *, struct RS_RES_CONTROL_PARAMS_INTERNAL *, const struct NVOC_EXPORTED_METHOD_DEF **);
@@ -220,7 +220,6 @@ NV_STATUS __nvoc_objCreate_MemoryMulticastFabric(MemoryMulticastFabric**, Dynami
#define memorymulticastfabricCopyConstruct(pMemoryMulticastFabric, pCallContext, pParams) memorymulticastfabricCopyConstruct_DISPATCH(pMemoryMulticastFabric, pCallContext, pParams)
#define memorymulticastfabricIsReady(pMemoryMulticastFabric, bCopyConstructorContext) memorymulticastfabricIsReady_DISPATCH(pMemoryMulticastFabric, bCopyConstructorContext)
#define memorymulticastfabricControl(pMemoryMulticastFabric, pCallContext, pParams) memorymulticastfabricControl_DISPATCH(pMemoryMulticastFabric, pCallContext, pParams)
#define memorymulticastfabricControl_Prologue(pMemoryMulticastFabric, pCallContext, pParams) memorymulticastfabricControl_Prologue_DISPATCH(pMemoryMulticastFabric, pCallContext, pParams)
#define memorymulticastfabricIsGpuMapAllowed(pMemoryMulticastFabric, pGpu) memorymulticastfabricIsGpuMapAllowed_DISPATCH(pMemoryMulticastFabric, pGpu)
#define memorymulticastfabricGetMapAddrSpace(pMemoryMulticastFabric, pCallContext, mapFlags, pAddrSpace) memorymulticastfabricGetMapAddrSpace_DISPATCH(pMemoryMulticastFabric, pCallContext, mapFlags, pAddrSpace)
#define memorymulticastfabricCtrlGetInfo(pMemoryMulticastFabric, pParams) memorymulticastfabricCtrlGetInfo_DISPATCH(pMemoryMulticastFabric, pParams)
@@ -233,6 +232,7 @@ NV_STATUS __nvoc_objCreate_MemoryMulticastFabric(MemoryMulticastFabric**, Dynami
#define memorymulticastfabricMapTo(pResource, pParams) memorymulticastfabricMapTo_DISPATCH(pResource, pParams)
#define memorymulticastfabricGetRefCount(pResource) memorymulticastfabricGetRefCount_DISPATCH(pResource)
#define memorymulticastfabricAddAdditionalDependants(pClient, pResource, pReference) memorymulticastfabricAddAdditionalDependants_DISPATCH(pClient, pResource, pReference)
#define memorymulticastfabricControl_Prologue(pResource, pCallContext, pParams) memorymulticastfabricControl_Prologue_DISPATCH(pResource, pCallContext, pParams)
#define memorymulticastfabricUnmapFrom(pResource, pParams) memorymulticastfabricUnmapFrom_DISPATCH(pResource, pParams)
#define memorymulticastfabricControl_Epilogue(pResource, pCallContext, pParams) memorymulticastfabricControl_Epilogue_DISPATCH(pResource, pCallContext, pParams)
#define memorymulticastfabricControlLookup(pResource, pParams, ppEntry) memorymulticastfabricControlLookup_DISPATCH(pResource, pParams, ppEntry)
@@ -271,12 +271,6 @@ static inline NV_STATUS memorymulticastfabricControl_DISPATCH(struct MemoryMulti
return pMemoryMulticastFabric->__memorymulticastfabricControl__(pMemoryMulticastFabric, pCallContext, pParams);
}
NV_STATUS memorymulticastfabricControl_Prologue_IMPL(struct MemoryMulticastFabric *pMemoryMulticastFabric, CALL_CONTEXT *pCallContext, struct RS_RES_CONTROL_PARAMS_INTERNAL *pParams);
static inline NV_STATUS memorymulticastfabricControl_Prologue_DISPATCH(struct MemoryMulticastFabric *pMemoryMulticastFabric, CALL_CONTEXT *pCallContext, struct RS_RES_CONTROL_PARAMS_INTERNAL *pParams) {
return pMemoryMulticastFabric->__memorymulticastfabricControl_Prologue__(pMemoryMulticastFabric, pCallContext, pParams);
}
NvBool memorymulticastfabricIsGpuMapAllowed_IMPL(struct MemoryMulticastFabric *pMemoryMulticastFabric, struct OBJGPU *pGpu);
static inline NvBool memorymulticastfabricIsGpuMapAllowed_DISPATCH(struct MemoryMulticastFabric *pMemoryMulticastFabric, struct OBJGPU *pGpu) {
@@ -339,6 +333,10 @@ static inline void memorymulticastfabricAddAdditionalDependants_DISPATCH(struct
pResource->__memorymulticastfabricAddAdditionalDependants__(pClient, pResource, pReference);
}
static inline NV_STATUS memorymulticastfabricControl_Prologue_DISPATCH(struct MemoryMulticastFabric *pResource, CALL_CONTEXT *pCallContext, struct RS_RES_CONTROL_PARAMS_INTERNAL *pParams) {
return pResource->__memorymulticastfabricControl_Prologue__(pResource, pCallContext, pParams);
}
static inline NV_STATUS memorymulticastfabricUnmapFrom_DISPATCH(struct MemoryMulticastFabric *pResource, RS_RES_UNMAP_FROM_PARAMS *pParams) {
return pResource->__memorymulticastfabricUnmapFrom__(pResource, pParams);
}

View File

@@ -1007,6 +1007,10 @@ static const CHIPS_RELEASED sChipsReleased[] = {
{ 0x27B0, 0x16fa, 0x103c, "NVIDIA RTX 4000 SFF Ada Generation" },
{ 0x27B0, 0x16fa, 0x10de, "NVIDIA RTX 4000 SFF Ada Generation" },
{ 0x27B0, 0x16fa, 0x17aa, "NVIDIA RTX 4000 SFF Ada Generation" },
{ 0x27B1, 0x180c, 0x1028, "NVIDIA RTX 4500 Ada Generation" },
{ 0x27B1, 0x180c, 0x103c, "NVIDIA RTX 4500 Ada Generation" },
{ 0x27B1, 0x180c, 0x10de, "NVIDIA RTX 4500 Ada Generation" },
{ 0x27B1, 0x180c, 0x17aa, "NVIDIA RTX 4500 Ada Generation" },
{ 0x27B2, 0x181b, 0x1028, "NVIDIA RTX 4000 Ada Generation" },
{ 0x27B2, 0x181b, 0x103c, "NVIDIA RTX 4000 Ada Generation" },
{ 0x27B2, 0x181b, 0x10de, "NVIDIA RTX 4000 Ada Generation" },

View File

@@ -880,6 +880,10 @@ NV_STATUS osReserveCpuAddressSpaceUpperBound(void **ppSectionHandle,
NvU64 maxSectionSize);
void osReleaseCpuAddressSpaceUpperBound(void *pSectionHandle);
void* osGetPidInfo(void);
void osPutPidInfo(void *pOsPidInfo);
NV_STATUS osFindNsPid(void *pOsPidInfo, NvU32 *pNsPid);
// OS Tegra IPC functions
NV_STATUS osTegraDceRegisterIpcClient(NvU32 interfaceType, void *usrCtx,
NvU32 *clientId);
@@ -1249,6 +1253,8 @@ static NV_INLINE NV_STATUS isrWrapper(NvBool testIntr, OBJGPU *pGpu)
#define OS_PCIE_CAP_MASK_REQ_ATOMICS_64 NVBIT(1)
#define OS_PCIE_CAP_MASK_REQ_ATOMICS_128 NVBIT(2)
void osGetNumaMemoryUsage(NvS32 numaId, NvU64 *free_memory_bytes, NvU64 *total_memory_bytes);
NV_STATUS osNumaAddGpuMemory(OS_GPU_INFO *pOsGpuInfo, NvU64 offset,
NvU64 size, NvU32 *pNumaNodeId);
void osNumaRemoveGpuMemory(OS_GPU_INFO *pOsGpuInfo, NvU64 offset,

View File

@@ -32,6 +32,7 @@
#include "published/hopper/gh100/dev_pmc.h"
#include "published/hopper/gh100/dev_xtl_ep_pcfg_gpu.h"
#include "published/hopper/gh100/pri_nv_xal_ep.h"
#include "published/hopper/gh100/dev_xtl_ep_pri.h"
#include "ctrl/ctrl2080/ctrl2080mc.h"
@@ -77,6 +78,28 @@ gpuReadBusConfigReg_GH100
return gpuReadBusConfigCycle(pGpu, index, pData);
}
/*!
 * @brief Read a non-private vGPU register through the PCFGM mirror space.
 *
 * @param[in]  pGpu   GPU object pointer
 * @param[in]  index  Register offset in PCIe config space
 * @param[out] pData  Receives the register value
 *
 * @returns NV_OK on success
 */
NV_STATUS
gpuReadVgpuConfigReg_GH100
(
    OBJGPU *pGpu,
    NvU32 index,
    NvU32 *pData
)
{
    // The config-space register is exposed at a fixed offset inside the
    // NV_EP_PCFGM mirror aperture.
    const NvU32 mirrorOffset = DEVICE_BASE(NV_EP_PCFGM) + index;

    *pData = GPU_REG_RD32(pGpu, mirrorOffset);

    return NV_OK;
}
/*!
* @brief Get GPU ID based on PCIE config reads.
* Also determine other properties of the PCIE capabilities.

View File

@@ -45,6 +45,7 @@
#define NV_CE_NUM_FBPCE 4
#define NV_CE_NUM_PCES_NO_LINK_CASE 12
#define NV_CE_MAX_PCE_PER_GRCE 2
#define NV_CE_HSHUBNVL_ID_0 2
/*
* Table for setting the PCE2LCE mapping for WAR configs that cannot be implemented
@@ -931,3 +932,181 @@ kceGetMappings_GH100
NV_PRINTF(LEVEL_INFO, "status = %d, statusC2C = %d\n", status, statusC2C);
return NV_OK;
}
NV_STATUS kceGetP2PCes_GH100(KernelCE *pKCe, OBJGPU *pGpu, NvU32 gpuMask, NvU32 *nvlinkP2PCeMask)
{
//
// Currently Bug 4103154 requires an updated algorithm described below
// to assign the proper LCE. Cases without MODS enabled can default back
// to the previous version.
//
return kceGetP2PCes_GV100(pKCe, pGpu, gpuMask, nvlinkP2PCeMask);
NvU32 gpuCount = gpumgrGetSubDeviceCount(gpuMask);
NvU32 minP2PLce = (NV_CE_EVEN_ASYNC_LCE_MASK | NV_CE_ODD_ASYNC_LCE_MASK) & NV_CE_MAX_LCE_MASK;
NvU32 i;
KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
if (pKernelNvlink == NULL)
{
return NV_WARN_NOTHING_TO_DO;
}
if (knvlinkIsGpuConnectedToNvswitch(pGpu, pKernelNvlink))
{
return kceGetP2PCes_GV100(pKCe, pGpu, gpuMask, nvlinkP2PCeMask);
}
LOWESTBITIDX_32(minP2PLce);
*nvlinkP2PCeMask = 0;
if (gpuCount == 1)
{
*nvlinkP2PCeMask |= NVBIT(minP2PLce);
for (i = minP2PLce; i < gpuGetNumCEs(pGpu); i++)
{
*nvlinkP2PCeMask |= NVBIT(i);
}
}
else if (gpuCount > 2)
{
// if gpuCount > 2, this is an invalid request. Print warning and return NV_OK
NV_PRINTF(LEVEL_INFO, "GPU %d invalid request for gpuCount %d\n", gpuGetInstance(pGpu), gpuCount);
return NV_ERR_INVALID_STATE;
}
else
{
OBJGPU *pRemoteGpu = NULL;
KernelCE *pKCeLoop = NULL;
NvU32 peerLinkMask = 0;
NvU32 gpuInstance = 0;
NvU32 phyLinkId, status, targetPceMask, numPces;
//
// The LCE returned should be the LCE which has the most PCEs mapped
// on the given HSHUB. This HSHUB should be determined by
// tracking where the majority of links are connected.
//
NvU32 linksPerHshub[NV_CE_MAX_HSHUBS] = {0};
NvU32 maxLinksConnectedHshub = 0;
NvU32 maxConnectedHshubId = NV_CE_MAX_HSHUBS;
NvU32 lceAssignedMask = 0;
KernelCE *maxLcePerHshub[NV_CE_MAX_HSHUBS] = {0};
NV2080_CTRL_INTERNAL_HSHUB_GET_HSHUB_ID_FOR_LINKS_PARAMS params;
if (pKernelNvlink != NULL)
{
// Get the remote GPU
while ((pRemoteGpu = gpumgrGetNextGpu(gpuMask, &gpuInstance)) != NULL)
{
if (pRemoteGpu != pGpu)
break;
}
NV_ASSERT_OR_RETURN(pRemoteGpu != NULL, NV_ERR_INVALID_STATE);
gpuInstance = gpuGetInstance(pRemoteGpu);
peerLinkMask = knvlinkGetLinkMaskToPeer(pGpu, pKernelNvlink, pRemoteGpu);
}
portMemSet(&params, 0, sizeof(params));
params.linkMask = peerLinkMask;
status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
NV2080_CTRL_CMD_INTERNAL_HSHUB_GET_HSHUB_ID_FOR_LINKS,
(void *)&params, sizeof(params));
NV_ASSERT_OK_OR_RETURN(status);
FOR_EACH_INDEX_IN_MASK(32, phyLinkId, peerLinkMask)
{
NvU32 hshubId = params.hshubIds[phyLinkId];
linksPerHshub[hshubId]++;
if (linksPerHshub[hshubId] > maxLinksConnectedHshub)
{
maxLinksConnectedHshub = linksPerHshub[hshubId];
maxConnectedHshubId = hshubId;
}
}
FOR_EACH_INDEX_IN_MASK_END;
//
// Iterate through all Async LCEs to track which HSHUB should
// be using which LCE. This is decided based on the majority. If
// there is a tie, then LCE with the lower index is preferred.
//
KCE_ITER_ALL_BEGIN(pGpu, pKCeLoop, minP2PLce)
NvU32 localMaxPcePerHshub = 0;
KernelCE *localMaxLcePerHshub;
NvU32 localMaxHshub = NV_CE_MAX_HSHUBS;
            // Skip this LCE if it is stubbed
if (pKCeLoop->bStubbed)
{
continue;
}
// LCE is already assigned to this peer
if ((pKCeLoop->nvlinkPeerMask & NVBIT(gpuInstance)) != 0)
{
maxLcePerHshub[maxConnectedHshubId] = pKCeLoop;
break;
}
// LCE is already assigned to another peer
else if (pKCeLoop->nvlinkPeerMask != 0)
{
continue;
}
NV2080_CTRL_CE_GET_CE_PCE_MASK_PARAMS params = {0};
params.ceEngineType = NV2080_ENGINE_TYPE_COPY(pKCeLoop->publicID);
status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
NV2080_CTRL_CMD_CE_GET_CE_PCE_MASK,
(void *)&params, sizeof(params));
NV_ASSERT_OK_OR_RETURN(status);
//
// An LCE may be utilized across several HSHUBs. Loop through all HSHUBs
// in order to decide which HSHUB holds the majority of this specific LCE.
// To help with this, create a mask of PCEs only on the HSHUB which the peer
// is most connected to by shifting the HSHUB PCE mask
//
for (i = NV_CE_HSHUBNVL_ID_0; i < NV_CE_MAX_HSHUBS; i++)
{
targetPceMask = params.pceMask & ((NVBIT(NV_CE_PCE_PER_HSHUB) - 1) << ((i - NV_CE_HSHUBNVL_ID_0) * NV_CE_PCE_PER_HSHUB));
numPces = nvPopCount32(targetPceMask);
if (numPces > localMaxPcePerHshub && !(lceAssignedMask & NVBIT(pKCeLoop->publicID)))
{
localMaxPcePerHshub = numPces;
localMaxLcePerHshub = pKCeLoop;
localMaxHshub = i;
}
}
if (localMaxHshub < NV_CE_MAX_HSHUBS)
{
maxLcePerHshub[localMaxHshub] = localMaxLcePerHshub;
lceAssignedMask |= NVBIT(localMaxLcePerHshub->publicID);
}
KCE_ITER_END
if (maxLcePerHshub[maxConnectedHshubId] != NULL)
{
NV_PRINTF(LEVEL_INFO,
"GPU %d Assigning Peer %d to LCE %d\n",
gpuGetInstance(pGpu), gpuInstance,
maxLcePerHshub[maxConnectedHshubId]->publicID);
maxLcePerHshub[maxConnectedHshubId]->nvlinkPeerMask = NVBIT(gpuInstance);
*nvlinkP2PCeMask = NVBIT(maxLcePerHshub[maxConnectedHshubId]->publicID);
}
}
return NV_OK;
}

View File

@@ -51,6 +51,9 @@ confComputeConstructEngine_IMPL(OBJGPU *pGpu,
ConfidentialCompute *pConfCompute,
ENGDESCRIPTOR engDesc)
{
OBJSYS *pSys = SYS_GET_INSTANCE();
NvU32 data = 0;
NvBool bForceEnableCC = 0;
pConfCompute->pSpdm = NULL;
portMemSet(&pConfCompute->ccStaticInfo, 0, sizeof(pConfCompute->ccStaticInfo));
pConfCompute->gspProxyRegkeys = 0;
@@ -74,6 +77,20 @@ confComputeConstructEngine_IMPL(OBJGPU *pGpu,
if (pConfCompute->getProperty(pConfCompute, PDB_PROP_CONFCOMPUTE_ENABLED))
{
bForceEnableCC = (osReadRegistryDword(pGpu, NV_REG_STR_RM_CONFIDENTIAL_COMPUTE, &data) == NV_OK) &&
FLD_TEST_DRF(_REG_STR, _RM_CONFIDENTIAL_COMPUTE, _ENABLED, _YES, data);
if (!RMCFG_FEATURE_PLATFORM_GSP && !RMCFG_FEATURE_PLATFORM_MODS && !bForceEnableCC)
{
if (!(sysGetStaticConfig(pSys)->bOsCCEnabled))
{
NV_PRINTF(LEVEL_ERROR, "CPU does not support confidential compute.\n");
NV_ASSERT(0);
pConfCompute->setProperty(pConfCompute, PDB_PROP_CONFCOMPUTE_ENABLED, NV_FALSE);
return NV_ERR_INVALID_OPERATION;
}
}
NV_CHECK_OR_RETURN(LEVEL_ERROR, confComputeIsGpuCcCapable_HAL(pGpu, pConfCompute), NV_ERR_INVALID_OPERATION);
if (pGpu->getProperty(pGpu, PDB_PROP_GPU_APM_FEATURE_CAPABLE))
@@ -92,7 +109,7 @@ confComputeConstructEngine_IMPL(OBJGPU *pGpu,
}
else
{
NV_PRINTF(LEVEL_ERROR, "GPU does not support confidential compute");
NV_PRINTF(LEVEL_ERROR, "GPU does not support confidential compute.\n");
NV_ASSERT(0);
return NV_ERR_INVALID_OPERATION;
}

View File

@@ -50,6 +50,8 @@
#include "kernel/gpu/intr/engine_idx.h"
#include "gpu/external_device/external_device.h"
#include "ctrl/ctrl2080.h"
#include "class/cl5070.h"
@@ -490,6 +492,8 @@ void
kdispStateDestroy_IMPL(OBJGPU *pGpu,
KernelDisplay *pKernelDisplay)
{
extdevDestroy(pGpu);
if (pKernelDisplay->pInst != NULL)
{
instmemStateDestroy(pGpu, pKernelDisplay->pInst);

View File

@@ -264,7 +264,7 @@ void *kcrashcatEngineMapBufferDescriptor_IMPL
memdescMap(pMemDesc, 0, memdescGetSize(pMemDesc), NV_TRUE,
NV_PROTECT_READABLE, &pBuf, &pPriv),
{
if (pBufDesc->pEngPriv == NULL)
if (!pBufDesc->bRegistered)
memdescDestroy(pMemDesc);
return NULL;
});

View File

@@ -4941,12 +4941,19 @@ gpuReadBusConfigCycle_IMPL
NvU8 device = gpuGetDevice(pGpu);
NvU8 function = 0;
if (pGpu->hPci == NULL)
if (IS_PASSTHRU(pGpu))
{
pGpu->hPci = osPciInitHandle(domain, bus, device, function, NULL, NULL);
gpuReadVgpuConfigReg_HAL(pGpu, index, pData);
}
else
{
if (pGpu->hPci == NULL)
{
pGpu->hPci = osPciInitHandle(domain, bus, device, function, NULL, NULL);
}
*pData = osPciReadDword(pGpu->hPci, index);
*pData = osPciReadDword(pGpu->hPci, index);
}
return NV_OK;
}

View File

@@ -647,6 +647,20 @@ _gpuiIsPidSavedAlready
return NV_FALSE;
}
static NV_STATUS
_gpuConvertPid
(
RmClient *pClient,
NvU32 *pNsPid
)
{
if (pClient->pOsPidInfo != NULL)
return osFindNsPid(pClient->pOsPidInfo, pNsPid);
*pNsPid = pClient->ProcID;
return NV_OK;
}
//
// Searches through clients to find processes with clients that have
// allocated an ElementType of class, defined by elementID. The return values
@@ -673,6 +687,7 @@ gpuGetProcWithObject_IMPL
RmClient *pClient;
RsClient *pRsClient;
RsResourceRef *pResourceRef;
NV_STATUS status;
NV_ASSERT_OR_RETURN((pPidArray != NULL), NV_ERR_INVALID_ARGUMENT);
NV_ASSERT_OR_RETURN((pPidArrayCount != NULL), NV_ERR_INVALID_ARGUMENT);
@@ -782,8 +797,15 @@ gpuGetProcWithObject_IMPL
}
if (elementInClient)
{
pPidArray[pidcount] = pClient->ProcID;
pidcount++;
status = _gpuConvertPid(pClient, &pPidArray[pidcount]);
if (status == NV_OK)
{
pidcount++;
}
else if (status != NV_ERR_OBJECT_NOT_FOUND)
{
return status;
}
if (pidcount == NV2080_CTRL_GPU_GET_PIDS_MAX_COUNT)
{

View File

@@ -29,6 +29,7 @@
#include "gpu/conf_compute/conf_compute.h"
#include "gpu/fsp/kern_fsp.h"
#include "gpu/gsp/kernel_gsp.h"
#include "gpu/mem_sys/kern_mem_sys.h"
#include "gsp/gspifpub.h"
#include "vgpu/rpc.h"
@@ -523,6 +524,7 @@ kgspBootstrapRiscvOSEarly_GH100
{
KernelFalcon *pKernelFalcon = staticCast(pKernelGsp, KernelFalcon);
KernelFsp *pKernelFsp = GPU_GET_KERNEL_FSP(pGpu);
KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
NV_STATUS status = NV_OK;
// Only for GSP client builds
@@ -532,8 +534,16 @@ kgspBootstrapRiscvOSEarly_GH100
return NV_ERR_NOT_SUPPORTED;
}
// Clear ECC errors before attempting to load GSP
status = kmemsysClearEccCounts_HAL(pGpu, pKernelMemorySystem);
if (status != NV_OK)
{
NV_PRINTF(LEVEL_ERROR, "Issue clearing ECC counts! Status:0x%x\n", status);
}
// Setup the descriptors that GSP-FMC needs to boot GSP-RM
NV_ASSERT_OK_OR_RETURN(kgspSetupGspFmcArgs_HAL(pGpu, pKernelGsp, pGspFw));
NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
kgspSetupGspFmcArgs_HAL(pGpu, pKernelGsp, pGspFw), exit);
kgspSetupLibosInitArgs(pGpu, pKernelGsp);
@@ -562,7 +572,8 @@ kgspBootstrapRiscvOSEarly_GH100
{
NV_PRINTF(LEVEL_NOTICE, "Starting to boot GSP via FSP.\n");
pKernelFsp->setProperty(pKernelFsp, PDB_PROP_KFSP_GSP_MODE_GSPRM, NV_TRUE);
NV_ASSERT_OK_OR_RETURN(kfspSendBootCommands_HAL(pGpu, pKernelFsp));
NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
kfspSendBootCommands_HAL(pGpu, pKernelFsp), exit);
}
else
{
@@ -585,7 +596,7 @@ kgspBootstrapRiscvOSEarly_GH100
kfspDumpDebugState_HAL(pGpu, pKernelFsp);
}
return status;
goto exit;
}
}
@@ -606,7 +617,7 @@ kgspBootstrapRiscvOSEarly_GH100
kflcnRegRead_HAL(pGpu, pKernelFalcon, NV_PFALCON_FALCON_MAILBOX0));
NV_PRINTF(LEVEL_ERROR, "NV_PGSP_FALCON_MAILBOX1 = 0x%x\n",
kflcnRegRead_HAL(pGpu, pKernelFalcon, NV_PFALCON_FALCON_MAILBOX1));
return status;
goto exit;
}
// Start polling for libos logs now that lockdown is released
@@ -640,6 +651,11 @@ kgspBootstrapRiscvOSEarly_GH100
NV_PRINTF(LEVEL_INFO, "GSP FW RM ready.\n");
exit:
// If GSP fails to boot, check if there's any DED error.
if (status != NV_OK)
{
kmemsysCheckEccCounts_HAL(pGpu, pKernelMemorySystem);
}
NV_ASSERT(status == NV_OK);
return status;

View File

@@ -799,7 +799,7 @@ kgspHealthCheck_TU102
objDelete(pReport);
}
return bHealthy;
goto exit_health_check;
}
NvU32 mb0 = GPU_REG_RD32(pGpu, NV_PGSP_MAILBOX(0));
@@ -845,6 +845,12 @@ kgspHealthCheck_TU102
"********************************************************************************\n");
}
exit_health_check:
if (!bHealthy)
{
KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
kmemsysCheckEccCounts_HAL(pGpu, pKernelMemorySystem);
}
return bHealthy;
}

View File

@@ -2438,7 +2438,8 @@ kgspInitRm_IMPL
if (pKernelGsp->pLogElf == NULL)
NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, nvlogRegisterFlushCb(kgspNvlogFlushCb, pKernelGsp), done);
// Wait for GFW_BOOT OK status
// Reset thread state timeout and wait for GFW_BOOT OK status
threadStateResetTimeout(pGpu);
NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, kgspWaitForGfwBootOk_HAL(pGpu, pKernelGsp), done);
// Fail early if WPR2 is up

View File

@@ -494,19 +494,6 @@ memmgrStateLoad_IMPL
memmgrScrubInit_HAL(pGpu, pMemoryManager);
}
if (osNumaOnliningEnabled(pGpu->pOsGpuInfo))
{
//
// NUMA onlined memory size should not exceed memory size assigned to PMA.
// TODO : Currently in selfhosted and P9+GV100 systems numaOnlined size is less
// than PMA Memory Size. Ideally both of them should be identical. Bug 4051320.
//
NvU64 pmaTotalMemorySize;
NvU64 numaOnlineSize = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu)->numaOnlineSize;
pmaGetTotalMemory(&GPU_GET_HEAP(pGpu)->pmaObject, &pmaTotalMemorySize);
NV_ASSERT_OR_RETURN(pmaTotalMemorySize >= numaOnlineSize, NV_ERR_INVALID_STATE);
}
// Dump FB regions
memmgrDumpFbRegions(pGpu, pMemoryManager);
@@ -1978,6 +1965,7 @@ memmgrSetPartitionableMem_IMPL
{
PMA_REGION_DESCRIPTOR *pFirstPmaRegionDesc = NULL;
NvU32 numPmaRegions;
NvU32 pmaConfig = PMA_QUERY_NUMA_ONLINED;
NV_ASSERT_OK_OR_RETURN(pmaGetRegionInfo(&pHeap->pmaObject,
&numPmaRegions, &pFirstPmaRegionDesc));
@@ -1986,6 +1974,8 @@ memmgrSetPartitionableMem_IMPL
pmaGetFreeMemory(&pHeap->pmaObject, &freeMem);
pmaGetTotalMemory(&pHeap->pmaObject, &size);
NV_ASSERT_OK(pmaQueryConfigs(&pHeap->pmaObject, &pmaConfig));
//
// MIG won't be used alongside APM and hence the check below is of no use
// Even if we enable the check for APM the check will fail given that after
@@ -1996,8 +1986,11 @@ memmgrSetPartitionableMem_IMPL
// channels are required to be in CPR vidmem. This changes the calculation below
// We can ignore this for the non-MIG case.
//
if (!gpuIsCCorApmFeatureEnabled(pGpu) ||
IS_MIG_ENABLED(pGpu))
// When FB memory is onlined as NUMA node, kernel can directly alloc FB memory
// and hence free memory can not be expected to be same as total memory.
//
if ((!gpuIsCCorApmFeatureEnabled(pGpu) || IS_MIG_ENABLED(pGpu)) &&
!(pmaConfig & PMA_QUERY_NUMA_ONLINED))
{
//
// PMA should be completely free at this point, otherwise we risk
@@ -2891,6 +2884,7 @@ memmgrPmaRegisterRegions_IMPL
NvU32 blPageIndex;
NvU32 blackListCount;
NvU64 base, size;
NvU64 pmaTotalMemorySize = 0;
NV_STATUS status = NV_OK;
const MEMORY_SYSTEM_STATIC_CONFIG *pMemsysConfig =
kmemsysGetStaticConfig(pGpu, GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu));
@@ -2983,6 +2977,7 @@ memmgrPmaRegisterRegions_IMPL
}
}
pmaTotalMemorySize += (pmaRegion.limit - pmaRegion.base + 1);
NV_PRINTF(LEVEL_INFO,
"Register FB region %llx..%llx of size %llx with PMA\n",
pmaRegion.base, pmaRegion.limit,
@@ -3008,6 +3003,18 @@ memmgrPmaRegisterRegions_IMPL
pmaRegionIdx++;
}
if (gpuIsSelfHosted(pGpu) && osNumaOnliningEnabled(pGpu->pOsGpuInfo))
{
//
// NUMA onlined memory size should not exceed memory size assigned to PMA.
// TODO : Currently in selfhosted and P9+GV100 systems numaOnlined size is less
// than PMA Memory Size. Ideally both of them should be identical. Bug 4051320.
//
NvU64 numaTotalSize = 0;
NvU64 numaFreeSize = 0;
osGetNumaMemoryUsage(pPma->numaNodeId, &numaFreeSize, &numaTotalSize);
NV_ASSERT_OR_RETURN(pmaTotalMemorySize >= numaTotalSize, NV_ERR_INVALID_STATE);
}
//
// bug #200354346, make sure the RM reserved region(s) are
// scrubbed during the region creation itself. Top Down scrubber,

View File

@@ -69,11 +69,13 @@ static NvU32 _scrubMemory(OBJMEMSCRUB *pScrubber, RmPhysAddr base, NvU64 size,
NvU32 dstCpuCacheAttrib, NvU32 freeToken);
static void _scrubWaitAndSave(OBJMEMSCRUB *pScrubber, PSCRUB_NODE pList, NvLength itemsToSave);
static NvU64 _scrubGetFreeEntries(OBJMEMSCRUB *pScrubber);
static NvU64 _scrubCheckAndSubmit(OBJMEMSCRUB *pScrubber, NvU64 chunkSize, NvU64 *pPages,
NvU64 pageCount, PSCRUB_NODE pList, NvLength pagesToScrubCheck);
static NvU64 _scrubCheckAndSubmit(OBJMEMSCRUB *pScrubber, NvU64 pageCount, PSCRUB_NODE pList,
PSCRUB_NODE pScrubListCopy, NvLength pagesToScrubCheck);
static void _scrubCopyListItems(OBJMEMSCRUB *pScrubber, PSCRUB_NODE pList, NvLength itemsToSave);
static NV_STATUS _scrubCheckLocked(OBJMEMSCRUB *pScrubber, PSCRUB_NODE *ppList, NvU64 *pSize);
static NV_STATUS _scrubCombinePages(NvU64 *pPages, NvU64 pageSize, NvU64 pageCount,
PSCRUB_NODE *ppScrubList, NvU64 *pSize);
/**
* Constructs the memory scrubber object and signals
@@ -403,63 +405,78 @@ scrubSubmitPages
{
NvU64 curPagesSaved = 0;
PSCRUB_NODE pScrubList = NULL;
PSCRUB_NODE pScrubListCopy = NULL;
NvU64 scrubListSize = 0;
NvLength pagesToScrubCheck = 0;
NvU64 totalSubmitted = 0;
NvU64 numFinished = 0;
NvU64 freeEntriesInList = 0;
NvU64 scrubCount = 0;
NvU64 numPagesToScrub = pageCount;
NvU64 numPagesToScrub = 0;
NV_STATUS status = NV_OK;
portSyncMutexAcquire(pScrubber->pScrubberMutex);
*pSize = 0;
*ppList = pScrubList;
NV_CHECK_OR_GOTO(LEVEL_INFO, pageCount > 0, cleanup);
NV_PRINTF(LEVEL_INFO, "submitting pages, pageCount = 0x%llx chunkSize = 0x%llx\n", pageCount, chunkSize);
freeEntriesInList = _scrubGetFreeEntries(pScrubber);
if (freeEntriesInList < pageCount)
{
pScrubList = (PSCRUB_NODE)
portMemAllocNonPaged((NvLength)(sizeof(SCRUB_NODE) * (pageCount - freeEntriesInList)));
if (pScrubList == NULL)
NV_ASSERT_OK_OR_GOTO(status,
_scrubCombinePages(pPages,
chunkSize,
pageCount,
&pScrubList,
&scrubListSize),
cleanup);
numPagesToScrub = scrubListSize;
if (freeEntriesInList < scrubListSize)
{
pScrubListCopy = (PSCRUB_NODE)
portMemAllocNonPaged((NvLength)(sizeof(SCRUB_NODE) * (scrubListSize - freeEntriesInList)));
if (pScrubListCopy == NULL)
{
status = NV_ERR_NO_MEMORY;
goto cleanup;
}
while (freeEntriesInList < pageCount)
while (freeEntriesInList < scrubListSize)
{
if (pageCount > MAX_SCRUB_ITEMS)
if (scrubListSize > MAX_SCRUB_ITEMS)
{
pagesToScrubCheck = (NvLength)(MAX_SCRUB_ITEMS - freeEntriesInList);
scrubCount = MAX_SCRUB_ITEMS;
}
else
{
pagesToScrubCheck = (NvLength)(pageCount - freeEntriesInList);
scrubCount = pageCount;
pagesToScrubCheck = (NvLength)(scrubListSize - freeEntriesInList);
scrubCount = scrubListSize;
}
numFinished = _scrubCheckAndSubmit(pScrubber, chunkSize, &pPages[totalSubmitted],
scrubCount, &pScrubList[curPagesSaved],
numFinished = _scrubCheckAndSubmit(pScrubber, scrubCount,
&pScrubList[totalSubmitted],
&pScrubListCopy[curPagesSaved],
pagesToScrubCheck);
pageCount -= numFinished;
scrubListSize -= numFinished;
curPagesSaved += pagesToScrubCheck;
totalSubmitted += numFinished;
freeEntriesInList = _scrubGetFreeEntries(pScrubber);
}
*ppList = pScrubList;
*ppList = pScrubListCopy;
*pSize = curPagesSaved;
}
else
{
totalSubmitted = _scrubCheckAndSubmit(pScrubber, chunkSize, pPages,
pageCount, NULL,
0);
totalSubmitted = _scrubCheckAndSubmit(pScrubber, scrubListSize,
pScrubList, NULL, 0);
*ppList = NULL;
*pSize = 0;
}
@@ -467,6 +484,12 @@ scrubSubmitPages
cleanup:
portSyncMutexRelease(pScrubber->pScrubberMutex);
if (pScrubList != NULL)
{
portMemFree(pScrubList);
pScrubList = NULL;
}
NV_CHECK_OK_OR_RETURN(LEVEL_INFO, status);
if (totalSubmitted == numPagesToScrub)
@@ -507,15 +530,33 @@ scrubWaitPages
)
{
NvU32 iter = 0;
NV_STATUS status = NV_OK;
NvU32 iter = 0;
NV_STATUS status = NV_OK;
PSCRUB_NODE pScrubList = NULL;
NvU64 scrubListSize = 0;
NV_ASSERT_OK_OR_RETURN(_scrubCombinePages(pPages,
chunkSize,
pageCount,
&pScrubList,
&scrubListSize));
portSyncMutexAcquire(pScrubber->pScrubberMutex);
for (iter = 0; iter < pageCount; iter++)
for (iter = 0; iter < scrubListSize; iter++)
{
_waitForPayload(pScrubber, pPages[iter], (pPages[iter] + chunkSize - 1));
_waitForPayload(pScrubber,
pScrubList[iter].base,
(pScrubList[iter].base + pScrubList[iter].size - 1));
}
portSyncMutexRelease(pScrubber->pScrubberMutex);
if (pScrubList != NULL)
{
portMemFree(pScrubList);
pScrubList = NULL;
}
return status;
}
@@ -644,29 +685,28 @@ _scrubCopyListItems
/* This function is used to check and submit work items always within the
* available / maximum scrub list size.
*
* @param[in] pScrubber OBJMEMSCRUB pointer
* @param[in] chunkSize size of each page
* @param[in] pPages Array of base address
* @param[in] pageCount number of pages in the array
* @param[in] pList pointer will store the return check array
* @param[in] pScrubber OBJMEMSCRUB pointer
* @param[in] pageCount number of pages in the array
* @param[in] pList pointer will store the return check array
* @param[in] pScrubListCopy List where pages are saved
* @param[in] pagesToScrubCheck How many pages will need to be saved
* @returns the number of work successfully submitted, else 0
*/
static NvU64
_scrubCheckAndSubmit
(
OBJMEMSCRUB *pScrubber,
NvU64 chunkSize,
NvU64 *pPages,
NvU64 pageCount,
PSCRUB_NODE pList,
PSCRUB_NODE pScrubListCopy,
NvLength pagesToScrubCheck
)
{
NvU64 iter = 0;
NvU64 newId;
NV_STATUS status;
NvU64 iter = 0;
NvU64 newId;
NV_STATUS status;
if (pList == NULL && pagesToScrubCheck != 0)
if (pScrubListCopy == NULL && pagesToScrubCheck != 0)
{
NV_PRINTF(LEVEL_ERROR,
"pages need to be saved off, but stash list is invalid\n");
@@ -681,19 +721,19 @@ _scrubCheckAndSubmit
NV_PRINTF(LEVEL_INFO,
"Submitting work, Id: %llx, base: %llx, size: %llx\n",
newId, pPages[iter], chunkSize);
newId, pList[iter].base, pList[iter].size);
{
status =_scrubMemory(pScrubber, pPages[iter], chunkSize, NV_MEMORY_DEFAULT,
status =_scrubMemory(pScrubber, pList[iter].base, pList[iter].size, NV_MEMORY_DEFAULT,
(NvU32)newId);
}
if(status != NV_OK)
{
NV_PRINTF(LEVEL_ERROR, "Failing because the work dint submit.\n");
NV_PRINTF(LEVEL_ERROR, "Failing because the work didn't submit.\n");
goto exit;
}
_scrubAddWorkToList(pScrubber, pPages[iter], chunkSize, newId);
_scrubAddWorkToList(pScrubber, pList[iter].base, pList[iter].size, newId);
_scrubCheckProgress(pScrubber);
}
@@ -897,7 +937,7 @@ _scrubCheckProgress
else
lastSWSemaphoreDone = ceutilsUpdateProgress(pScrubber->pCeUtils);
}
pScrubber->lastSWSemaphoreDone = lastSWSemaphoreDone;
return lastSWSemaphoreDone;
@@ -949,3 +989,42 @@ cleanup:
memdescDestroy(pMemDesc);
return status;
}
static NV_STATUS
_scrubCombinePages
(
NvU64 *pPages,
NvU64 pageSize,
NvU64 pageCount,
PSCRUB_NODE *ppScrubList,
NvU64 *pSize
)
{
NvU64 i, j;
*ppScrubList = (PSCRUB_NODE)portMemAllocNonPaged(sizeof(SCRUB_NODE) * pageCount);
NV_ASSERT_OR_RETURN(*ppScrubList != NULL, NV_ERR_NO_MEMORY);
// Copy first element from original list to new list
(*ppScrubList)[0].base = pPages[0];
(*ppScrubList)[0].size = pageSize;
for (i = 0, j = 0; i < (pageCount - 1); i++)
{
if ((((*ppScrubList)[j].size + pageSize) > SCRUB_MAX_BYTES_PER_LINE) ||
((pPages[i] + pageSize) != pPages[i+1]))
{
j++;
(*ppScrubList)[j].base = pPages[i+1];
(*ppScrubList)[j].size = pageSize;
}
else
{
(*ppScrubList)[j].size += pageSize;
}
}
*pSize = j + 1;
return NV_OK;
}

View File

@@ -363,7 +363,7 @@ static NV_STATUS _pmaNumaAllocatePages
osAllocAcquirePage(sysPhysAddr + (1 << osPageShift), (pageSize >> osPageShift) - 1);
}
if (bScrubOnAlloc)
if (bScrubOnAlloc && (i > 0))
{
PSCRUB_NODE pPmaScrubList = NULL;
NvU64 count;

View File

@@ -1618,6 +1618,24 @@ pmaGetFreeMemory
NvU64 *pBytesFree
)
{
#if !defined(SRT_BUILD)
NvU64 val;
portSyncSpinlockAcquire(pPma->pPmaLock);
NvBool nodeOnlined = pPma->nodeOnlined;
portSyncSpinlockRelease(pPma->pPmaLock);
if (nodeOnlined)
{
osGetNumaMemoryUsage(pPma->numaNodeId, pBytesFree, &val);
return;
}
//
// what to return when bNUMA == NV_TRUE and nodeOnlined==NV_FALSE?
// TODO : BUG 4199482.
//
#endif
portSyncSpinlockAcquire(pPma->pPmaLock);
*pBytesFree = pPma->pmaStats.numFreeFrames << PMA_PAGE_SHIFT;
@@ -1638,6 +1656,24 @@ pmaGetTotalMemory
*pBytesTotal = 0;
#if !defined(SRT_BUILD)
NvU64 val;
portSyncSpinlockAcquire(pPma->pPmaLock);
NvBool nodeOnlined = pPma->nodeOnlined;
portSyncSpinlockRelease(pPma->pPmaLock);
if (nodeOnlined)
{
osGetNumaMemoryUsage(pPma->numaNodeId, &val, pBytesTotal);
return;
}
//
// what to return when bNUMA == NV_TRUE and nodeOnlined==NV_FALSE?
// TODO : BUG 4199482.
//
#endif
for (i = 0; i < pPma->regSize; i++)
{
pMap = pPma->pRegions[i];

View File

@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -23,15 +23,24 @@
#include "core/core.h"
#include "gpu/gpu.h"
#include "nvtypes.h"
#include "os/os.h"
#include "kernel/gpu/mem_sys/kern_mem_sys.h"
#include "gpu/mem_mgr/mem_desc.h"
#include "gpu/bus/kern_bus.h"
#include "kernel/gpu/intr/intr.h"
#include "nverror.h"
#include "published/hopper/gh100/dev_fb.h"
#include "published/hopper/gh100/dev_ltc.h"
#include "published/hopper/gh100/dev_fbpa.h"
#include "published/hopper/gh100/dev_vm.h"
#include "published/hopper/gh100/pri_nv_xal_ep.h"
#include "published/hopper/gh100/dev_nv_xal_addendum.h"
#include "published/hopper/gh100/dev_nv_xpl.h"
#include "published/hopper/gh100/dev_xtl_ep_pri.h"
#include "published/hopper/gh100/hwproject.h"
#include "published/ampere/ga100/dev_fb.h"
NV_STATUS
kmemsysDoCacheOp_GH100
@@ -566,3 +575,168 @@ kmemsysSwizzIdToVmmuSegmentsRange_GH100
return NV_OK;
}
/*!
* Utility function used to read registers and ignore PRI errors
*/
static NvU32
_kmemsysReadRegAndMaskPriError
(
OBJGPU *pGpu,
NvU32 regAddr
)
{
NvU32 regVal;
regVal = osGpuReadReg032(pGpu, regAddr);
if ((regVal & GPU_READ_PRI_ERROR_MASK) == GPU_READ_PRI_ERROR_CODE)
{
return 0;
}
return regVal;
}
/*
* @brief Function that checks if ECC error occurred by reading various count
* registers/interrupt registers. This function is not floorsweeping-aware so
* PRI errors are ignored
*/
void
kmemsysCheckEccCounts_GH100
(
OBJGPU *pGpu,
KernelMemorySystem *pKernelMemorySystem
)
{
NvU32 dramCount = 0;
NvU32 mmuCount = 0;
NvU32 ltcCount = 0;
NvU32 pcieCount = 0;
NvU32 regVal;
for (NvU32 i = 0; i < NV_SCAL_LITTER_NUM_FBPAS; i++)
{
for (NvU32 j = 0; j < NV_PFB_FBPA_0_ECC_DED_COUNT__SIZE_1; j++)
{
// DRAM count read
dramCount += _kmemsysReadRegAndMaskPriError(pGpu, NV_PFB_FBPA_0_ECC_DED_COUNT(j) + (i * NV_FBPA_PRI_STRIDE));
// LTC count read
regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT +
(i * NV_LTC_PRI_STRIDE) + (j * NV_LTS_PRI_STRIDE));
ltcCount += DRF_VAL(_PLTCG_LTC0_LTS0, _L2_CACHE_ECC, _UNCORRECTED_ERR_COUNT_UNIQUE, regVal);
}
}
// L2TLB
regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT);
mmuCount += DRF_VAL(_PFB_PRI_MMU, _L2TLB_ECC, _UNCORRECTED_ERR_COUNT_UNIQUE, regVal);
// HUBTLB
regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT);
mmuCount += DRF_VAL(_PFB_PRI_MMU, _HUBTLB_ECC, _UNCORRECTED_ERR_COUNT_UNIQUE, regVal);
// FILLUNIT
regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT);
mmuCount += DRF_VAL(_PFB_PRI_MMU, _FILLUNIT_ECC, _UNCORRECTED_ERR_COUNT_UNIQUE, regVal);
// PCIE RBUF
regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_XPL_BASE_ADDRESS + NV_XPL_DL_ERR_COUNT_RBUF);
pcieCount += DRF_VAL(_XPL_DL, _ERR_COUNT_RBUF, _UNCORR_ERR, regVal);
// PCIE SEQ_LUT
regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_XPL_BASE_ADDRESS + NV_XPL_DL_ERR_COUNT_SEQ_LUT);
pcieCount += DRF_VAL(_XPL_DL, _ERR_COUNT_SEQ_LUT, _UNCORR_ERR, regVal);
// PCIE RE ORDER
regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT);
pcieCount += DRF_VAL(_XAL_EP, _REORDER_ECC, _UNCORRECTED_ERR_COUNT_UNIQUE, regVal);
// PCIE P2PREQ
regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT);
pcieCount += DRF_VAL(_XAL_EP, _P2PREQ_ECC, _UNCORRECTED_ERR_COUNT_UNIQUE, regVal);
// PCIE XTL
regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_XTL_BASE_ADDRESS + NV_XTL_EP_PRI_DED_ERROR_STATUS);
if (regVal != 0)
{
pcieCount += 1;
}
// PCIE XTL
regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_XTL_BASE_ADDRESS + NV_XTL_EP_PRI_RAM_ERROR_INTR_STATUS);
if (regVal != 0)
{
pcieCount += 1;
}
// If counts > 0 or if poison interrupt pending, ECC error has occurred.
if (((dramCount + ltcCount + mmuCount + pcieCount) != 0) ||
intrIsVectorPending_HAL(pGpu, GPU_GET_INTR(pGpu), NV_PFB_FBHUB_POISON_INTR_VECTOR_HW_INIT, NULL))
{
nvErrorLog_va((void *)pGpu, UNRECOVERABLE_ECC_ERROR_ESCAPE,
"An uncorrectable ECC error detected "
"(possible firmware handling failure) "
"DRAM:%d, LTC:%d, MMU:%d, PCIE:%d", dramCount, ltcCount, mmuCount, pcieCount);
}
}
/*
* @brief Function that clears ECC error count registers.
*/
NV_STATUS
kmemsysClearEccCounts_GH100
(
OBJGPU *pGpu,
KernelMemorySystem *pKernelMemorySystem
)
{
NvU32 regVal = 0;
RMTIMEOUT timeout;
NV_STATUS status = NV_OK;
gpuClearFbhubPoisonIntrForBug2924523_HAL(pGpu);
for (NvU32 i = 0; i < NV_SCAL_LITTER_NUM_FBPAS; i++)
{
for (NvU32 j = 0; j < NV_PFB_FBPA_0_ECC_DED_COUNT__SIZE_1; j++)
{
osGpuWriteReg032(pGpu, NV_PFB_FBPA_0_ECC_DED_COUNT(j) + (i * NV_FBPA_PRI_STRIDE), 0);
osGpuWriteReg032(pGpu, NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT + (i * NV_LTC_PRI_STRIDE) + (j * NV_LTS_PRI_STRIDE), 0);
}
}
// Reset MMU counts
osGpuWriteReg032(pGpu, NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT, 0);
osGpuWriteReg032(pGpu, NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT, 0);
osGpuWriteReg032(pGpu, NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT, 0);
// Reset XAL-EP counts
osGpuWriteReg032(pGpu, NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT, 0);
osGpuWriteReg032(pGpu, NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT, 0);
// Reset XTL-EP status registers
osGpuWriteReg032(pGpu, NV_XTL_BASE_ADDRESS + NV_XTL_EP_PRI_DED_ERROR_STATUS, ~0);
osGpuWriteReg032(pGpu, NV_XTL_BASE_ADDRESS + NV_XTL_EP_PRI_RAM_ERROR_INTR_STATUS, ~0);
// Reset XPL-EP error counters
regVal = DRF_DEF(_XPL, _DL_ERR_RESET, _RBUF_UNCORR_ERR_COUNT, _PENDING) |
DRF_DEF(_XPL, _DL_ERR_RESET, _SEQ_LUT_UNCORR_ERR_COUNT, _PENDING);
osGpuWriteReg032(pGpu, NV_XPL_BASE_ADDRESS + NV_XPL_DL_ERR_RESET, regVal);
// Wait for the error counter reset to complete
gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);
for (;;)
{
status = gpuCheckTimeout(pGpu, &timeout);
regVal = osGpuReadReg032(pGpu, NV_XPL_BASE_ADDRESS + NV_XPL_DL_ERR_RESET);
if (FLD_TEST_DRF(_XPL, _DL_ERR_RESET, _RBUF_UNCORR_ERR_COUNT, _DONE, regVal) &&
FLD_TEST_DRF(_XPL, _DL_ERR_RESET, _SEQ_LUT_UNCORR_ERR_COUNT, _DONE, regVal))
break;
if (status != NV_OK)
return status;
}
return NV_OK;
}

View File

@@ -5800,6 +5800,7 @@ kmigmgrInitGPUInstanceBufPools_IMPL
{
Heap *pHeap;
MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
NvU32 pmaConfig = PMA_QUERY_NUMA_ONLINED;
NV_ASSERT_OR_RETURN(pKernelMIGGpuInstance != NULL, NV_ERR_INVALID_ARGUMENT);
pHeap = pKernelMIGGpuInstance->pMemoryPartitionHeap;
NV_ASSERT_OR_RETURN(pHeap != NULL, NV_ERR_INVALID_STATE);
@@ -5815,7 +5816,12 @@ kmigmgrInitGPUInstanceBufPools_IMPL
// This is just a sanity check to make sure this assumption is correct and
// allocation from PMA cannot trigger UVM evictions.
//
if (memmgrIsPmaInitialized(pMemoryManager))
// When FB memory is onlined as NUMA node, kernel can directly alloc FB memory
// and hence free memory can not be expected to be same as total memory.
//
if (memmgrIsPmaInitialized(pMemoryManager) &&
(pmaQueryConfigs(&pHeap->pmaObject, &pmaConfig) == NV_OK) &&
!(pmaConfig & PMA_QUERY_NUMA_ONLINED))
{
NvU64 freeSpace, totalSpace;
pmaGetFreeMemory(&pHeap->pmaObject, &freeSpace);

View File

@@ -93,6 +93,7 @@ _memoryfabricValidatePhysMem
MEMORY_DESCRIPTOR *pPhysMemDesc;
NvU64 physPageSize;
NV_STATUS status;
Memory *pMemory;
if (hPhysMem == 0)
{
@@ -110,7 +111,19 @@ _memoryfabricValidatePhysMem
return status;
}
pPhysMemDesc = (dynamicCast(pPhysmemRef->pResource, Memory))->pMemDesc;
pMemory = dynamicCast(pPhysmemRef->pResource, Memory);
if (pMemory == NULL)
{
NV_PRINTF(LEVEL_ERROR, "Invalid memory handle\n");
return NV_ERR_INVALID_OBJECT_HANDLE;
}
pPhysMemDesc = pMemory->pMemDesc;
if (pPhysMemDesc == NULL)
{
NV_PRINTF(LEVEL_ERROR, "Invalid memory handle\n");
return NV_ERR_INVALID_OBJECT_HANDLE;
}
if ((pOwnerGpu != pPhysMemDesc->pGpu) ||
!memmgrIsApertureSupportedByFla_HAL(pOwnerGpu, pMemoryManager,

View File

@@ -218,7 +218,7 @@ _memMulticastFabricDescriptorDequeueWaitUnderLock
}
}
NV_STATUS
static NV_STATUS
_memMulticastFabricGpuInfoAddUnderLock
(
MemoryMulticastFabric *pMemoryMulticastFabric,
@@ -1027,8 +1027,8 @@ memorymulticastfabricConstruct_IMPL
return status;
}
NV_STATUS
memorymulticastfabricCtrlAttachGpu_IMPL
static NV_STATUS
_memorymulticastfabricCtrlAttachGpu
(
MemoryMulticastFabric *pMemoryMulticastFabric,
NV00FD_CTRL_ATTACH_GPU_PARAMS *pParams
@@ -1041,14 +1041,13 @@ memorymulticastfabricCtrlAttachGpu_IMPL
OBJGPU *pGpu;
FABRIC_VASPACE *pFabricVAS;
NvU64 gpuProbeHandle;
MEM_MULTICAST_FABRIC_GPU_INFO *pNode = \
listTail(&pMulticastFabricDesc->gpuInfoList);
MEM_MULTICAST_FABRIC_GPU_INFO *pNode = NULL;
CALL_CONTEXT *pCallContext = resservGetTlsCallContext();
if (pParams->flags != 0)
{
NV_PRINTF(LEVEL_ERROR, "flags passed for attach mem must be zero\n");
status = NV_ERR_INVALID_ARGUMENT;
goto fail;
return NV_ERR_INVALID_ARGUMENT;
}
// Check if the Multicast FLA object has any additional slots for GPUs
@@ -1070,10 +1069,19 @@ memorymulticastfabricCtrlAttachGpu_IMPL
{
NV_PRINTF(LEVEL_ERROR,
"Multicast attach not supported on Windows/CC/vGPU modes\n");
status = NV_ERR_NOT_SUPPORTED;
goto fail;
return NV_ERR_NOT_SUPPORTED;
}
status = _memMulticastFabricGpuInfoAddUnderLock(pMemoryMulticastFabric,
pCallContext->pControlParams);
if (status != NV_OK)
{
NV_PRINTF(LEVEL_ERROR, "Failed to populate GPU info\n");
return status;
}
pNode = listTail(&pMulticastFabricDesc->gpuInfoList);
status = gpuFabricProbeGetGpuFabricHandle(pGpu->pGpuFabricProbeInfoKernel,
&gpuProbeHandle);
if (status != NV_OK)
@@ -1119,6 +1127,26 @@ fail:
return status;
}
NV_STATUS
memorymulticastfabricCtrlAttachGpu_IMPL
(
MemoryMulticastFabric *pMemoryMulticastFabric,
NV00FD_CTRL_ATTACH_GPU_PARAMS *pParams
)
{
Fabric *pFabric = SYS_GET_FABRIC(SYS_GET_INSTANCE());
NV_STATUS status = NV_OK;
fabricMulticastFabricOpsMutexAcquire(pFabric);
status = _memorymulticastfabricCtrlAttachGpu(pMemoryMulticastFabric,
pParams);
fabricMulticastFabricOpsMutexRelease(pFabric);
return status;
}
static MEM_MULTICAST_FABRIC_GPU_INFO*
_memorymulticastfabricGetAttchedGpuInfo
(
@@ -1148,8 +1176,8 @@ _memorymulticastfabricGetAttchedGpuInfo
return NULL;
}
NV_STATUS
memorymulticastfabricCtrlDetachMem_IMPL
static NV_STATUS
_memorymulticastfabricCtrlDetachMem
(
MemoryMulticastFabric *pMemoryMulticastFabric,
NV00FD_CTRL_DETACH_MEM_PARAMS *pParams
@@ -1189,6 +1217,26 @@ memorymulticastfabricCtrlDetachMem_IMPL
return NV_OK;
}
NV_STATUS
memorymulticastfabricCtrlDetachMem_IMPL
(
MemoryMulticastFabric *pMemoryMulticastFabric,
NV00FD_CTRL_DETACH_MEM_PARAMS *pParams
)
{
Fabric *pFabric = SYS_GET_FABRIC(SYS_GET_INSTANCE());
NV_STATUS status = NV_OK;
fabricMulticastFabricOpsMutexAcquire(pFabric);
status = _memorymulticastfabricCtrlDetachMem(pMemoryMulticastFabric,
pParams);
fabricMulticastFabricOpsMutexRelease(pFabric);
return status;
}
static NV_STATUS
_memorymulticastfabricValidatePhysMem
(
@@ -1202,6 +1250,7 @@ _memorymulticastfabricValidatePhysMem
MEMORY_DESCRIPTOR *pPhysMemDesc;
NvU64 physPageSize;
NV_STATUS status;
Memory *pMemory;
status = serverutilGetResourceRef(RES_GET_CLIENT_HANDLE(pMemoryMulticastFabric),
hPhysMem, &pPhysmemRef);
@@ -1213,7 +1262,19 @@ _memorymulticastfabricValidatePhysMem
return status;
}
pPhysMemDesc = (dynamicCast(pPhysmemRef->pResource, Memory))->pMemDesc;
pMemory = dynamicCast(pPhysmemRef->pResource, Memory);
if (pMemory == NULL)
{
NV_PRINTF(LEVEL_ERROR, "Invalid memory handle\n");
return NV_ERR_INVALID_OBJECT_HANDLE;
}
pPhysMemDesc = pMemory->pMemDesc;
if (pPhysMemDesc == NULL)
{
NV_PRINTF(LEVEL_ERROR, "Invalid memory handle\n");
return NV_ERR_INVALID_OBJECT_HANDLE;
}
if (memdescGetAddressSpace(pPhysMemDesc) != ADDR_FBMEM ||
(pAttachedGpu != pPhysMemDesc->pGpu))
@@ -1237,8 +1298,8 @@ _memorymulticastfabricValidatePhysMem
return NV_OK;
}
NV_STATUS
memorymulticastfabricCtrlAttachMem_IMPL
static NV_STATUS
_memorymulticastfabricCtrlAttachMem
(
MemoryMulticastFabric *pMemoryMulticastFabric,
NV00FD_CTRL_ATTACH_MEM_PARAMS *pParams
@@ -1342,6 +1403,26 @@ freeDupedMem:
return status;
}
/*!
 * @brief Serialized entry point for the ATTACH_MEM control call.
 *
 * Takes the fabric multicast ops mutex for the duration of the worker
 * call, serializing it against the other multicast fabric controls.
 *
 * @param[in] pMemoryMulticastFabric  Multicast fabric memory object
 * @param[in] pParams                 NV00FD_CTRL_ATTACH_MEM parameters
 *
 * @return Status propagated from _memorymulticastfabricCtrlAttachMem().
 */
NV_STATUS
memorymulticastfabricCtrlAttachMem_IMPL
(
    MemoryMulticastFabric *pMemoryMulticastFabric,
    NV00FD_CTRL_ATTACH_MEM_PARAMS *pParams
)
{
    Fabric    *pFabric = SYS_GET_FABRIC(SYS_GET_INSTANCE());
    NV_STATUS  rmStatus;

    fabricMulticastFabricOpsMutexAcquire(pFabric);
    rmStatus = _memorymulticastfabricCtrlAttachMem(pMemoryMulticastFabric,
                                                   pParams);
    fabricMulticastFabricOpsMutexRelease(pFabric);

    return rmStatus;
}
void
memorymulticastfabricDestruct_IMPL
(
@@ -1393,8 +1474,8 @@ memorymulticastfabricCopyConstruct_IMPL
return NV_OK;
}
NV_STATUS
memorymulticastfabricCtrlGetInfo_IMPL
static NV_STATUS
_memorymulticastfabricCtrlGetInfo
(
MemoryMulticastFabric *pMemoryMulticastFabric,
NV00FD_CTRL_GET_INFO_PARAMS *pParams
@@ -1413,6 +1494,26 @@ memorymulticastfabricCtrlGetInfo_IMPL
return NV_OK;
}
/*!
 * @brief Serialized entry point for the GET_INFO control call.
 *
 * Holds the fabric multicast ops mutex while the unlocked worker reads
 * back the multicast fabric info, keeping it consistent with concurrent
 * attach/detach controls.
 *
 * @param[in]  pMemoryMulticastFabric  Multicast fabric memory object
 * @param[out] pParams                 NV00FD_CTRL_GET_INFO parameters
 *
 * @return Status propagated from _memorymulticastfabricCtrlGetInfo().
 */
NV_STATUS
memorymulticastfabricCtrlGetInfo_IMPL
(
    MemoryMulticastFabric *pMemoryMulticastFabric,
    NV00FD_CTRL_GET_INFO_PARAMS *pParams
)
{
    Fabric    *pFabric = SYS_GET_FABRIC(SYS_GET_INSTANCE());
    NV_STATUS  rmStatus;

    fabricMulticastFabricOpsMutexAcquire(pFabric);
    rmStatus = _memorymulticastfabricCtrlGetInfo(pMemoryMulticastFabric,
                                                 pParams);
    fabricMulticastFabricOpsMutexRelease(pFabric);

    return rmStatus;
}
NV_STATUS
memorymulticastfabricIsReady_IMPL
(
@@ -1451,8 +1552,8 @@ memorymulticastfabricIsReady_IMPL
return mcTeamStatus;
}
NV_STATUS
memorymulticastfabricCtrlRegisterEvent_IMPL
static NV_STATUS
_memorymulticastfabricCtrlRegisterEvent
(
MemoryMulticastFabric *pMemoryMulticastFabric,
NV00FD_CTRL_REGISTER_EVENT_PARAMS *pParams
@@ -1467,20 +1568,23 @@ memorymulticastfabricCtrlRegisterEvent_IMPL
}
NV_STATUS
memorymulticastfabricControl_Prologue_IMPL
memorymulticastfabricCtrlRegisterEvent_IMPL
(
MemoryMulticastFabric *pMemoryMulticastFabric,
CALL_CONTEXT *pCallContext,
RS_RES_CONTROL_PARAMS_INTERNAL *pParams
MemoryMulticastFabric *pMemoryMulticastFabric,
NV00FD_CTRL_REGISTER_EVENT_PARAMS *pParams
)
{
RmResource *pResource = staticCast(pMemoryMulticastFabric, RmResource);
Fabric *pFabric = SYS_GET_FABRIC(SYS_GET_INSTANCE());
NV_STATUS status = NV_OK;
// Other control calls, nothing to be validated.
if (pParams->cmd != NV00FD_CTRL_CMD_ATTACH_GPU)
return rmresControl_Prologue_IMPL(pResource, pCallContext, pParams);
fabricMulticastFabricOpsMutexAcquire(pFabric);
return _memMulticastFabricGpuInfoAddUnderLock(pMemoryMulticastFabric, pParams);
status = _memorymulticastfabricCtrlRegisterEvent(pMemoryMulticastFabric,
pParams);
fabricMulticastFabricOpsMutexRelease(pFabric);
return status;
}
NV_STATUS
@@ -1491,7 +1595,6 @@ memorymulticastfabricControl_IMPL
RS_RES_CONTROL_PARAMS_INTERNAL *pParams
)
{
Fabric *pFabric = SYS_GET_FABRIC(SYS_GET_INSTANCE());
NV_STATUS status = NV_OK;
if (pParams->cmd != NV00FD_CTRL_CMD_ATTACH_GPU)
@@ -1522,14 +1625,13 @@ memorymulticastfabricControl_IMPL
NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, status);
}
fabricMulticastFabricOpsMutexAcquire(pFabric);
status = resControl_IMPL(staticCast(pMemoryMulticastFabric, RsResource),
pCallContext, pParams);
fabricMulticastFabricOpsMutexRelease(pFabric);
return status;
//
// Note: GPU locks are required for some control calls. Thus, it is
// incorrect to take the leaf lock here. resControl_IMPL() attempts to
// acquire the GPU locks before it calls the control call body.
//
return resControl_IMPL(staticCast(pMemoryMulticastFabric, RsResource),
pCallContext, pParams);
}
NvBool

View File

@@ -82,6 +82,8 @@ rmclientConstruct_IMPL
pClient->pSecurityToken = NULL;
pClient->pOSInfo = pSecInfo->clientOSInfo;
pClient->cachedPrivilege = pSecInfo->privLevel;
// TODO: Revisit in M2, see GPUSWSEC-1176
if (RMCFG_FEATURE_PLATFORM_GSP && IS_VGPU_GSP_PLUGIN_OFFLOAD_ENABLED(pGpu))
{
@@ -96,10 +98,10 @@ rmclientConstruct_IMPL
else
{
pClient->ProcID = osGetCurrentProcess();
if (pClient->cachedPrivilege <= RS_PRIV_LEVEL_USER_ROOT)
pClient->pOsPidInfo = osGetPidInfo();
}
pClient->cachedPrivilege = pSecInfo->privLevel;
// Set user-friendly client name from current process
osGetCurrentProcessName(pClient->name, NV_PROC_NAME_MAX_LENGTH);
@@ -128,7 +130,7 @@ rmclientConstruct_IMPL
{
NV_PRINTF(LEVEL_WARNING,
"NVRM_RPC: Failed to set host client resource handle range %x\n", status);
return status;
goto out;
}
}
@@ -139,7 +141,7 @@ rmclientConstruct_IMPL
{
NV_PRINTF(LEVEL_WARNING,
"Failed to set host client restricted resource handle range. Status=%x\n", status);
return status;
goto out;
}
if (!rmGpuLockIsOwner())
@@ -148,7 +150,7 @@ rmclientConstruct_IMPL
if ((status = rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_CLIENT)) != NV_OK)
{
NV_ASSERT(0);
return status;
goto out;
}
bReleaseLock = NV_TRUE;
}
@@ -206,6 +208,13 @@ rmclientConstruct_IMPL
if (status == NV_OK && pParams->pAllocParams != NULL)
*(NvHandle*)(pParams->pAllocParams) = pParams->hClient;
out:
if (status != NV_OK)
{
osPutPidInfo(pClient->pOsPidInfo);
pClient->pOsPidInfo = NULL;
}
return status;
}
@@ -230,6 +239,8 @@ rmclientDestruct_IMPL
// Free any association of the client with existing third-party p2p object
CliUnregisterFromThirdPartyP2P(pClient);
osPutPidInfo(pClient->pOsPidInfo);
//
// Free all of the devices of the client (do it in reverse order to
// facilitate tear down of things like ctxdmas, etc)

View File

@@ -1013,7 +1013,7 @@ _rmapiControlWithSecInfoTlsIRQL
NV_STATUS status;
THREAD_STATE_NODE threadState;
NvU8 stackAllocator[TLS_ISR_ALLOCATOR_SIZE];
NvU8 stackAllocator[2*TLS_ISR_ALLOCATOR_SIZE];
PORT_MEM_ALLOCATOR* pIsrAllocator = portMemAllocatorCreateOnExistingBlock(stackAllocator, sizeof(stackAllocator));
tlsIsrInit(pIsrAllocator);

View File

@@ -142,6 +142,7 @@ static CrashCatBufferDescriptor *_crashcatEngineCreateBufferDescriptor
portMemSet(pBufDesc, 0, sizeof(*pBufDesc));
pBufDesc->bRegistered = NV_FALSE;
pBufDesc->aperture = aperture;
pBufDesc->physOffset = offset;
pBufDesc->size = size;
@@ -315,6 +316,8 @@ void *crashcatEngineMapCrashBuffer_IMPL
//
if (!pBufDesc->bRegistered)
_crashcatEngineDestroyBufferDescriptor(pCrashCatEng, pBufDesc);
return NULL;
}
return pBufDesc->pMapping;

View File

@@ -69,7 +69,11 @@ void crashcatReportLogReporter_V1_GENERIC(CrashCatReport *pReport)
NvCrashCatNvriscvUcodeId ucodeId = crashcatReportV1ReporterUcodeId(pReportV1);
NV_CRASHCAT_RISCV_MODE riscvMode = crashcatReportV1ReporterMode(pReportV1);
crashcatEnginePrintf(pReport->pEngine, NV_FALSE,
//
// Though this is technically not a separate packet, we use the CRASHCAT_REPORT_LOG_PACKET_TYPE
// macro to get the correct prefix/indentation for the reporter information.
//
CRASHCAT_REPORT_LOG_PACKET_TYPE(pReport,
"Reported by partition:%u ucode:%u [%c-mode] version:%u @ %u",
partition, ucodeId, crashcatReportModeToChar_GENERIC(riscvMode),
crashcatReportV1ReporterVersion(pReportV1),

View File

@@ -114,10 +114,13 @@ void crashcatReportLogReporter_V1_LIBOS2(CrashCatReport *pReport)
NvCrashCatReport_V1 *pReportV1 = &pReport->v1.report;
NvU8 taskId = crashcatReportV1ReporterLibos2TaskId(pReportV1);
//
// Though this is technically not a separate packet, we use the CRASHCAT_REPORT_LOG_PACKET_TYPE
// macro to get the correct prefix/indentation for the reporter information.
//
if (taskId == NV_CRASHCAT_REPORT_V1_REPORTER_ID_LIBOS2_TASK_ID_UNSPECIFIED)
{
crashcatEnginePrintf(pReport->pEngine, NV_FALSE,
"Reported by libos kernel v%u.%u [%u] @ %u",
CRASHCAT_REPORT_LOG_PACKET_TYPE(pReport, "Reported by libos kernel v%u.%u [%u] @ %u",
crashcatReportV1ReporterVersionLibos2Major(pReportV1),
crashcatReportV1ReporterVersionLibos2Minor(pReportV1),
crashcatReportV1ReporterVersionLibos2Cl(pReportV1),
@@ -125,8 +128,7 @@ void crashcatReportLogReporter_V1_LIBOS2(CrashCatReport *pReport)
}
else
{
crashcatEnginePrintf(pReport->pEngine, NV_FALSE,
"Reported by libos task:%u v%u.%u [%u] @ ts:%u",
CRASHCAT_REPORT_LOG_PACKET_TYPE(pReport, "Reported by libos task:%u v%u.%u [%u] @ ts:%u",
taskId, crashcatReportV1ReporterVersionLibos2Major(pReportV1),
crashcatReportV1ReporterVersionLibos2Minor(pReportV1),
crashcatReportV1ReporterVersionLibos2Cl(pReportV1),

View File

@@ -223,9 +223,12 @@ mmuWalkFindLevel
)
{
const MMU_WALK_LEVEL *pLevel = &pWalk->root;
while (pLevel->pFmt != pLevelFmt)
while (pLevel != NULL && pLevel->pFmt != pLevelFmt)
{
NvU32 subLevel;
NV_ASSERT_OR_RETURN(pLevel->pFmt != NULL, NULL);
// Single sub-level always continues.
if (1 == pLevel->pFmt->numSubLevels)
{

View File

@@ -1444,6 +1444,14 @@ _portMemAllocatorCreateOnExistingBlock
pAllocator->pTracking = NULL; // No tracking for this allocator
pAllocator->pImpl = (PORT_MEM_ALLOCATOR_IMPL*)(pAllocator + 1);
//
// PORT_MEM_BITVECTOR (pAllocator->pImpl) and PORT_MEM_ALLOCATOR_TRACKING (pAllocator->pImpl->tracking)
// are mutually exclusively used.
// When pAllocator->pTracking == NULL, the data in pAllocator->pImpl->tracking is not used and
// pBitVector instead occupies the same memory location.
// When pAllocator->pImpl->tracking is in use, PORT_MEM_BITVECTOR is not used at all.
//
pBitVector = (PORT_MEM_BITVECTOR*)(pAllocator->pImpl);
pBitVector->pSpinlock = pSpinlock;
@@ -1544,6 +1552,10 @@ _portMemAllocatorAllocExistingWrapper
{
portSyncSpinlockRelease(pSpinlock);
}
if (pMem == NULL)
{
PORT_MEM_PRINT_ERROR("Memory allocation failed.\n");
}
return pMem;
}