Mirror of https://github.com/NVIDIA/open-gpu-kernel-modules.git, synced 2026-01-27 03:29:47 +00:00

Commit: 570.195.03
@@ -1,7 +1,7 @@
# NVIDIA Linux Open GPU Kernel Module Source

This is the source release of the NVIDIA Linux open GPU kernel modules,
version 570.190.
version 570.195.03.

## How to Build

@@ -17,7 +17,7 @@ as root:

Note that the kernel modules built here must be used with GSP
firmware and user-space NVIDIA GPU driver components from a corresponding
570.190 driver release. This can be achieved by installing
570.195.03 driver release. This can be achieved by installing
the NVIDIA GPU driver from the .run file using the `--no-kernel-modules`
option. E.g.,

@@ -185,7 +185,7 @@ table below).
For details on feature support and limitations, see the NVIDIA GPU driver
end user README here:

https://us.download.nvidia.com/XFree86/Linux-x86_64/570.190/README/kernel_open.html
https://us.download.nvidia.com/XFree86/Linux-x86_64/570.195.03/README/kernel_open.html

For vGPU support, please refer to the README.vgpu packaged in the vGPU Host
Package for more details.

@@ -79,7 +79,7 @@ ccflags-y += -I$(src)/common/inc
ccflags-y += -I$(src)
ccflags-y += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-format-extra-args
ccflags-y += -D__KERNEL__ -DMODULE -DNVRM
ccflags-y += -DNV_VERSION_STRING=\"570.190\"
ccflags-y += -DNV_VERSION_STRING=\"570.195.03\"

ifneq ($(SYSSRCHOST1X),)
ccflags-y += -I$(SYSSRCHOST1X)

@@ -2423,6 +2423,12 @@ nvidia_ioctl(
{
nv_ioctl_wait_open_complete_t *params = arg_copy;

if (arg_size != sizeof(nv_ioctl_wait_open_complete_t))
{
status = -EINVAL;
goto done_early;
}

params->rc = nvlfp->open_rc;
params->adapterStatus = nvlfp->adapter_status;
goto done_early;

@@ -2503,8 +2509,12 @@ nvidia_ioctl(
goto done;
}

/* atomically check and alloc attached_gpus */
down(&nvl->ldata_lock);

if (nvlfp->num_attached_gpus != 0)
{
up(&nvl->ldata_lock);
status = -EINVAL;
goto done;
}

@@ -2512,12 +2522,15 @@ nvidia_ioctl(
NV_KMALLOC(nvlfp->attached_gpus, arg_size);
if (nvlfp->attached_gpus == NULL)
{
up(&nvl->ldata_lock);
status = -ENOMEM;
goto done;
}
memcpy(nvlfp->attached_gpus, arg_copy, arg_size);
nvlfp->num_attached_gpus = num_arg_gpus;

up(&nvl->ldata_lock);

for (i = 0; i < nvlfp->num_attached_gpus; i++)
{
if (nvlfp->attached_gpus[i] == 0)

@@ -2533,9 +2546,14 @@ nvidia_ioctl(
nvidia_dev_put(nvlfp->attached_gpus[i], sp);
}

/* atomically free attached_gpus */
down(&nvl->ldata_lock);

NV_KFREE(nvlfp->attached_gpus, arg_size);
nvlfp->num_attached_gpus = 0;

up(&nvl->ldata_lock);

status = -EINVAL;
break;
}

@@ -36,25 +36,25 @@
// and then checked back in. You cannot make changes to these sections without
// corresponding changes to the buildmeister script
#ifndef NV_BUILD_BRANCH
#define NV_BUILD_BRANCH r570_00
#define NV_BUILD_BRANCH r573_76
#endif
#ifndef NV_PUBLIC_BRANCH
#define NV_PUBLIC_BRANCH r570_00
#define NV_PUBLIC_BRANCH r573_76
#endif

#if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS)
#define NV_BUILD_BRANCH_VERSION "rel/gpu_drv/r570/r570_00-575"
#define NV_BUILD_CHANGELIST_NUM (36467544)
#define NV_BUILD_BRANCH_VERSION "rel/gpu_drv/r570/r573_76-590"
#define NV_BUILD_CHANGELIST_NUM (36569223)
#define NV_BUILD_TYPE "Official"
#define NV_BUILD_NAME "rel/gpu_drv/r570/r570_00-575"
#define NV_LAST_OFFICIAL_CHANGELIST_NUM (36467544)
#define NV_BUILD_NAME "rel/gpu_drv/r570/r573_76-590"
#define NV_LAST_OFFICIAL_CHANGELIST_NUM (36569223)

#else /* Windows builds */
#define NV_BUILD_BRANCH_VERSION "r570_00-569"
#define NV_BUILD_CHANGELIST_NUM (36467544)
#define NV_BUILD_TYPE "Official"
#define NV_BUILD_NAME "573.73"
#define NV_LAST_OFFICIAL_CHANGELIST_NUM (36467544)
#define NV_BUILD_BRANCH_VERSION "r573_76-1"
#define NV_BUILD_CHANGELIST_NUM (36518415)
#define NV_BUILD_TYPE "Nightly"
#define NV_BUILD_NAME "r573_76-250909"
#define NV_LAST_OFFICIAL_CHANGELIST_NUM (36506718)
#define NV_BUILD_BRANCH_BASE_VERSION R570
#endif
// End buildmeister python edited section

@@ -4,7 +4,7 @@
#if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS) || defined(NV_VMWARE) || defined(NV_QNX) || defined(NV_INTEGRITY) || \
(defined(RMCFG_FEATURE_PLATFORM_GSP) && RMCFG_FEATURE_PLATFORM_GSP == 1)

#define NV_VERSION_STRING "570.190"
#define NV_VERSION_STRING "570.195.03"

#else

@@ -109,7 +109,7 @@
#define ROBUST_CHANNEL_NVJPG5_ERROR (103)
#define ROBUST_CHANNEL_NVJPG6_ERROR (104)
#define ROBUST_CHANNEL_NVJPG7_ERROR (105)
#define DESTINATION_FLA_TRANSLATION_ERROR (108)
#define NVLINK_REMOTE_TRANSLATION_ERROR (108)
#define SEC_FAULT_ERROR (110)
#define GSP_RPC_TIMEOUT (119)
#define GSP_ERROR (120)

@@ -129,7 +129,7 @@
#define ROBUST_CHANNEL_CE18_ERROR (134)
#define ROBUST_CHANNEL_CE19_ERROR (135)
#define ALI_TRAINING_FAIL (136)
#define NVLINK_FLA_PRIV_ERR (137)
#define NVLINK_PRIV_ERR (137)
#define ROBUST_CHANNEL_DLA_ERROR (138)
#define ROBUST_CHANNEL_OFA1_ERROR (139)
#define UNRECOVERABLE_ECC_ERROR_ESCAPE (140)

@@ -500,7 +500,6 @@ struct KernelGmmu {
NvBool PDB_PROP_KGMMU_REDUCE_NR_FAULT_BUFFER_SIZE;

// Data members
NvBool bReportFlaTranslationXid;
MEMORY_DESCRIPTOR *pFakeSparseBuffer;
NvU64 fakeSparseEntry[3];
NV2080_CTRL_INTERNAL_GMMU_GET_STATIC_INFO_PARAMS *pStaticInfo;

@@ -636,7 +635,6 @@ struct KernelGmmu_PRIVATE {
NvBool PDB_PROP_KGMMU_REDUCE_NR_FAULT_BUFFER_SIZE;

// Data members
NvBool bReportFlaTranslationXid;
MEMORY_DESCRIPTOR *pFakeSparseBuffer;
NvU64 fakeSparseEntry[3];
NV2080_CTRL_INTERNAL_GMMU_GET_STATIC_INFO_PARAMS *pStaticInfo;

@@ -76,6 +76,7 @@ struct THREAD_STATE_NODE
*/
NvU32 threadSeqId;
NvBool bValid;
NvBool bUsingHeap;
THREAD_TIMEOUT_STATE timeout;
NvU32 cpuNum;
NvU32 flags;

@@ -208,6 +209,7 @@ void threadStateOnlyProcessWorkISRAndDeferredIntHandler(THREAD_STATE_NODE
void threadStateOnlyFreeISRAndDeferredIntHandler(THREAD_STATE_NODE *, OBJGPU*, NvU32);
void threadStateFreeISRAndDeferredIntHandler(THREAD_STATE_NODE *, OBJGPU*, NvU32);
void threadStateInit(THREAD_STATE_NODE *pThreadNode, NvU32 flags);
THREAD_STATE_NODE* threadStateAlloc(NvU32 flags);
void threadStateFree(THREAD_STATE_NODE *pThreadNode, NvU32 flags);

NV_STATUS threadStateGetCurrent(THREAD_STATE_NODE **ppThreadNode, OBJGPU *pGpu);
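
For context, here is a minimal usage sketch (not part of this commit) of how a caller migrates from the stack-based threadStateInit() to the heap-based, UAF-safe threadStateAlloc()/threadStateFree() pair declared above, mirroring the dupMemory() change later in this commit; exampleRmEntryPoint() and doWork() are hypothetical stand-ins.

// Sketch only: migrating a caller to the heap-based thread state API.
static NV_STATUS exampleRmEntryPoint(void)   // hypothetical caller
{
    THREAD_STATE_NODE *pThreadState;
    NV_STATUS status;

    // Old pattern: a THREAD_STATE_NODE on the stack, set up with
    // threadStateInit(&threadState, THREAD_STATE_FLAGS_NONE), which cannot
    // report failure to the caller.

    // New pattern: the node lives on the heap and allocation failure is visible.
    pThreadState = threadStateAlloc(THREAD_STATE_FLAGS_NONE);
    if (pThreadState == NULL)
        return NV_ERR_NO_MEMORY;

    status = doWork();   // hypothetical work performed under thread state

    // threadStateFree() also releases the node when it was heap-allocated
    // (threadStateAlloc() sets bUsingHeap).
    threadStateFree(pThreadState, THREAD_STATE_FLAGS_NONE);
    return status;
}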

@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2004-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2004-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a

@@ -50,8 +50,8 @@ TYPEDEF_BITVECTOR(MC_ENGINE_BITVECTOR);
#include "g_rpc_hal.h" // For RPC_HAL_IFACES
#include "g_rpc_odb.h" // For RPC_HAL_IFACES

#define RPC_TIMEOUT_LIMIT_PRINT_RATE_THRESH 3 // rate limit after 3 prints
#define RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP 29 // skip 29 of 30 prints
#define RPC_TIMEOUT_GPU_RESET_THRESHOLD 3 // Reset GPU after 3 back to back GSP RPC timeout
#define RPC_TIMEOUT_PRINT_RATE_SKIP 29 // skip 29 of 30 prints

#define RPC_HISTORY_DEPTH 128

@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2016-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2016-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a

@@ -44,10 +44,9 @@ struct CTX_BUF_POOL_INFO
{
//
// Each array index corresponds to a pointer to memory pool with
// page size corresponding to RM_ATTR_PAGE_SIZE_*
// Pool corresponding to RM_ATTR_PAGE_SIZE_DEFAULT will be left unused
// page size corresponding to POOL_CONFIG_MODE
//
RM_POOL_ALLOC_MEM_RESERVE_INFO *pMemPool[RM_ATTR_PAGE_SIZE_INVALID];
RM_POOL_ALLOC_MEM_RESERVE_INFO *pMemPool[POOL_CONFIG_MAX_SUPPORTED];
};

// List of all context buffers supported by memory pools

@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2016-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2016-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a

@@ -51,6 +51,7 @@ typedef enum
POOL_CONFIG_CTXBUF_256G, // configure pool for RM internal allocations like ctx buffers with 256GB page size
POOL_CONFIG_CTXBUF_512M, // configure pool for RM internal allocations like ctx buffers with 512MB page size
POOL_CONFIG_CTXBUF_2M, // configure pool for RM internal allocations like ctx buffers with 2MB page size
POOL_CONFIG_CTXBUF_128K, // configure pool for RM internal allocations like ctx buffers with 128KB page size
POOL_CONFIG_CTXBUF_64K, // configure pool for RM internal allocations like ctx buffers with 64KB page size
POOL_CONFIG_CTXBUF_4K, // configure pool for RM internal allocations like ctx buffers with 4KB page size
POOL_CONFIG_MAX_SUPPORTED

@@ -508,27 +508,21 @@ static void _threadStateLogInitCaller(THREAD_STATE_NODE *pThreadNode, NvU64 func
}

/**
* @brief Initialize a threadState for regular threads (non-interrupt context)
*
* @param[in/out] pThreadNode
* @param[in] flags
*
* @brief Common initialization logic for both stack and heap thread state nodes
*
* @param[in/out] pThreadNode The node to initialize
* @param[in] flags Thread state flags
* @param[in] bUsingHeap NV_TRUE if heap-allocated, NV_FALSE if stack-allocated
*
* @return NV_OK on success, error code on failure
*/
void threadStateInit(THREAD_STATE_NODE *pThreadNode, NvU32 flags)
static NV_STATUS _threadStateInitCommon(THREAD_STATE_NODE *pThreadNode, NvU32 flags, NvBool bUsingHeap)
{
NV_STATUS rmStatus;
NvU64 funcAddr;

// Isrs should be using threadStateIsrInit().
NV_ASSERT((flags & (THREAD_STATE_FLAGS_IS_ISR_LOCKLESS |
THREAD_STATE_FLAGS_IS_ISR |
THREAD_STATE_FLAGS_DEFERRED_INT_HANDLER_RUNNING)) == 0);

// Check to see if ThreadState is enabled
if (!(threadStateDatabase.setupFlags & THREAD_STATE_SETUP_FLAGS_ENABLED))
return;

portMemSet(pThreadNode, 0, sizeof(*pThreadNode));
pThreadNode->bUsingHeap = bUsingHeap;
pThreadNode->threadSeqId = portAtomicIncrementU32(&threadStateDatabase.threadSeqCntr);
pThreadNode->cpuNum = osGetCurrentProcessorNumber();
pThreadNode->flags = flags;

@@ -546,9 +540,10 @@ void threadStateInit(THREAD_STATE_NODE *pThreadNode, NvU32 flags)

rmStatus = osGetCurrentThread(&pThreadNode->threadId);
if (rmStatus != NV_OK)
return;
return rmStatus;

NV_ASSERT_OR_RETURN_VOID(pThreadNode->cpuNum < threadStateDatabase.maxCPUs);
NV_ASSERT_OR_RETURN(pThreadNode->cpuNum < threadStateDatabase.maxCPUs,
NV_ERR_INVALID_STATE);

funcAddr = (NvU64) (NV_RETURN_ADDRESS());

@@ -558,27 +553,23 @@ void threadStateInit(THREAD_STATE_NODE *pThreadNode, NvU32 flags)
// Reset the threadId as insertion failed. bValid is already NV_FALSE
pThreadNode->threadId = 0;
portSyncSpinlockRelease(threadStateDatabase.spinlock);
return;
}
else
{
pThreadNode->bValid = NV_TRUE;
rmStatus = NV_OK;
return NV_ERR_GENERIC;
}

pThreadNode->bValid = NV_TRUE;
_threadStateLogInitCaller(pThreadNode, funcAddr);

portSyncSpinlockRelease(threadStateDatabase.spinlock);

_threadStatePrintInfo(pThreadNode);

NV_ASSERT(rmStatus == NV_OK);
threadPriorityStateAlloc();

if (TLS_MIRROR_THREADSTATE)
{
THREAD_STATE_NODE **pTls = (THREAD_STATE_NODE **)tlsEntryAcquire(TLS_ENTRY_ID_THREADSTATE);
NV_ASSERT_OR_RETURN_VOID(pTls != NULL);
NV_ASSERT_OR_RETURN(pTls != NULL, NV_ERR_INVALID_STATE);

if (*pTls != NULL)
{
NV_PRINTF(LEVEL_WARNING,

@@ -587,6 +578,66 @@ void threadStateInit(THREAD_STATE_NODE *pThreadNode, NvU32 flags)
}
*pTls = pThreadNode;
}
return NV_OK;
}

/**
* @brief Initialize a threadState for regular threads (non-interrupt context)
* Use the new UAF-safe API for new code, threadStateAlloc().
* @param[in/out] pThreadNode
* @param[in] flags
*
*/
void threadStateInit(THREAD_STATE_NODE *pThreadNode, NvU32 flags)
{
// Isrs should be using threadStateIsrInit().
NV_ASSERT_OR_RETURN_VOID((flags & (THREAD_STATE_FLAGS_IS_ISR_LOCKLESS |
THREAD_STATE_FLAGS_IS_ISR |
THREAD_STATE_FLAGS_DEFERRED_INT_HANDLER_RUNNING)) == 0);

// Check to see if ThreadState is enabled
if (!(threadStateDatabase.setupFlags & THREAD_STATE_SETUP_FLAGS_ENABLED))
return;

// Use common initialization logic (stack-allocated)
// Note: Legacy void API ignores errors for backward compatibility
_threadStateInitCommon(pThreadNode, flags, NV_FALSE);
}

/**
* @brief Allocate a heap-based threadState
* @param[in] flags Thread state flags
*
* @return Heap-allocated THREAD_STATE_NODE* on success, NULL on failure
*/
THREAD_STATE_NODE* threadStateAlloc(NvU32 flags)
{
THREAD_STATE_NODE *pHeapNode;
NV_STATUS rmStatus;

// Isrs should be using threadStateIsrInit().
NV_ASSERT_OR_RETURN((flags & (THREAD_STATE_FLAGS_IS_ISR_LOCKLESS |
THREAD_STATE_FLAGS_IS_ISR |
THREAD_STATE_FLAGS_DEFERRED_INT_HANDLER_RUNNING)) == 0, NULL);

// Check to see if ThreadState is enabled
if (!(threadStateDatabase.setupFlags & THREAD_STATE_SETUP_FLAGS_ENABLED))
return NULL;

// Allocate heap node directly
pHeapNode = portMemAllocNonPaged(sizeof(THREAD_STATE_NODE));
if (pHeapNode == NULL)
return NULL;

rmStatus = _threadStateInitCommon(pHeapNode, flags, NV_TRUE);
if (rmStatus != NV_OK)
goto cleanup_heap;

return pHeapNode;

cleanup_heap:
portMemFree(pHeapNode);
return NULL;
}

/**

@@ -870,6 +921,12 @@ void threadStateFree(THREAD_STATE_NODE *pThreadNode, NvU32 flags)
r);
}
}

// Free heap memory if this node was heap-allocated
if (pThreadNode->bUsingHeap)
{
portMemFree(pThreadNode);
}
}

/**

@@ -3117,10 +3117,12 @@ kchannelCtrlCmdResetIsolatedChannel_IMPL
OBJGPU *pGpu = GPU_RES_GET_GPU(pKernelChannel);
RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);

// This ctrl sets bIsRcPending in the KernelChannel object. Because Kernel-RM is
// the source of truth on this, it's important that this ctrl is called from CPU-RM
NV_ASSERT_OR_RETURN(!RMCFG_FEATURE_PLATFORM_GSP, NV_ERR_INVALID_OPERATION);

// In case of vGPU this Rmctrl gets called in GSP-RM only,
// this RmCtrl is issued from guest kernel RM and then called by the GSP plugin directly to GSP RM
// Since bIsRcPending is handled in guest, so we need to allow the call in GSP RM.

// Call internal RMCTRL on physical-RM, kchannelFwdToInternalCtrl() is not
// used because no conversion from KernelChannel to Channel is required

@@ -245,11 +245,14 @@ kfspCanSendPacket_GH100
{
NvU32 cmdqHead;
NvU32 cmdqTail;
NvU32 msgqHead;
NvU32 msgqTail;

_kfspGetQueueHeadTail_GH100(pGpu, pKernelFsp, &cmdqHead, &cmdqTail);
_kfspGetMsgQueueHeadTail_GH100(pGpu, pKernelFsp, &msgqHead, &msgqTail);

// FSP will set QUEUE_HEAD = TAIL after each packet is received
return (cmdqHead == cmdqTail);
return (cmdqHead == cmdqTail) && (msgqHead == msgqTail);
}

/*!

@@ -269,6 +269,27 @@ kfspStateDestroy_IMPL

}

/*
* @brief GpuWaitConditionFunc for FSP ready
*
* @param[in] pGpu GPU object pointer
* @param[in] pCondData KernelFsp object pointer
*
* @returns NvBool NV_TRUE if command and message fsp
* queues are empty
*/
static NvBool
_kfspWaitForCanSend
(
OBJGPU *pGpu,
void *pCondData
)
{
KernelFsp *pKernelFsp = (KernelFsp*) pCondData;

return kfspCanSendPacket_HAL(pGpu, pKernelFsp);
}

/*!
* @brief Wait until RM can send to FSP
*

@@ -290,40 +311,11 @@ kfspPollForCanSend_IMPL
gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout,
GPU_TIMEOUT_FLAGS_OSTIMER);

while (!kfspCanSendPacket_HAL(pGpu, pKernelFsp))
status = gpuTimeoutCondWait(pGpu, _kfspWaitForCanSend, pKernelFsp, &timeout);
if (status != NV_OK)
{
//
// For now we assume that any response from FSP before RM message
// send is complete indicates an error and we should abort.
//
// Ongoing dicussion on usefullness of this check. Bug to be filed.
//
if (kfspIsResponseAvailable_HAL(pGpu, pKernelFsp))
{
kfspReadMessage(pGpu, pKernelFsp, NULL, 0);
NV_PRINTF(LEVEL_ERROR,
"Received error message from FSP while waiting to send.\n");
status = NV_ERR_GENERIC;
break;
}

osSpinLoop();

status = gpuCheckTimeout(pGpu, &timeout);
if (status != NV_OK)
{
if ((status == NV_ERR_TIMEOUT) &&
kfspCanSendPacket_HAL(pGpu, pKernelFsp))
{
status = NV_OK;
}
else
{
NV_PRINTF(LEVEL_ERROR,
"Timed out waiting for FSP command queue to be empty.\n");
}
break;
}
NV_PRINTF(LEVEL_ERROR,
"Timed out waiting for FSP queues to be empty.\n");
}

return status;
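
Because the hunk above interleaves the removed busy-wait loop with its replacement, here is a hedged sketch (not a verbatim copy of kfspPollForCanSend_IMPL) of the resulting condition-callback pattern; it assumes the RMTIMEOUT type and that gpuTimeoutCondWait() re-evaluates the callback until it returns NV_TRUE or the timeout expires, and the helper names are hypothetical.

// Sketch only: the GpuWaitConditionFunc pattern used by the change above.
static NvBool _exampleFspReadyCond(OBJGPU *pGpu, void *pCondData)   // hypothetical
{
    KernelFsp *pKernelFsp = (KernelFsp *)pCondData;

    // NV_TRUE once both the FSP command and message queues are empty.
    return kfspCanSendPacket_HAL(pGpu, pKernelFsp);
}

static NV_STATUS _exampleWaitForFsp(OBJGPU *pGpu, KernelFsp *pKernelFsp)   // hypothetical
{
    RMTIMEOUT timeout;
    NV_STATUS status;

    gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, GPU_TIMEOUT_FLAGS_OSTIMER);
    status = gpuTimeoutCondWait(pGpu, _exampleFspReadyCond, pKernelFsp, &timeout);
    if (status != NV_OK)
    {
        // kfspPollForCanSend_IMPL additionally drains an unexpected FSP
        // response via kfspReadMessage() before reporting the timeout.
        NV_PRINTF(LEVEL_ERROR, "Timed out waiting for FSP queues to be empty.\n");
    }
    return status;
}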

@@ -2061,8 +2061,8 @@ _kgspRpcIncrementTimeoutCountAndRateLimitPrints
{
pRpc->timeoutCount++;

if ((pRpc->timeoutCount == (RPC_TIMEOUT_LIMIT_PRINT_RATE_THRESH + 1)) &&
(RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP > 0))
if ((pRpc->timeoutCount == (RPC_TIMEOUT_GPU_RESET_THRESHOLD + 1)) &&
(RPC_TIMEOUT_PRINT_RATE_SKIP > 0))
{
// make sure we warn Xid and NV_PRINTF/NVLOG consumers that we are rate limiting prints
if (GPU_GET_KERNEL_RC(pGpu)->bLogEvents)

@@ -2072,15 +2072,15 @@ _kgspRpcIncrementTimeoutCountAndRateLimitPrints
gpuGetDomain(pGpu),
gpuGetBus(pGpu),
gpuGetDevice(pGpu),
RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP + 1);
RPC_TIMEOUT_PRINT_RATE_SKIP + 1);
}
NV_PRINTF(LEVEL_WARNING,
"Rate limiting GSP RPC error prints (printing 1 of every %d)\n",
RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP + 1);
RPC_TIMEOUT_PRINT_RATE_SKIP + 1);
}

pRpc->bQuietPrints = ((pRpc->timeoutCount > RPC_TIMEOUT_LIMIT_PRINT_RATE_THRESH) &&
((pRpc->timeoutCount % (RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP + 1)) != 0));
pRpc->bQuietPrints = ((pRpc->timeoutCount > RPC_TIMEOUT_GPU_RESET_THRESHOLD) &&
((pRpc->timeoutCount % (RPC_TIMEOUT_PRINT_RATE_SKIP + 1)) != 0));
}

/*!

@@ -2228,6 +2228,22 @@ _kgspRpcRecvPoll
_kgspLogXid119(pGpu, pRpc, expectedFunc, expectedSequence);
}

// Detect for 3 back to back GSP RPC timeout
if (pRpc->timeoutCount == RPC_TIMEOUT_GPU_RESET_THRESHOLD)
{
// GSP is completely stalled and cannot be recovered. Mark the GPU for reset.
NV_ASSERT_FAILED("Back to back GSP RPC timeout detected! GPU marked for reset");
gpuMarkDeviceForReset(pGpu);
pKernelGsp->bFatalError = NV_TRUE;

// For Windows, if TDR is supported, trigger TDR to recover the system.
if (pGpu->getProperty(pGpu, PDB_PROP_GPU_SUPPORTS_TDR_EVENT))
{
NV_ASSERT_FAILED("Triggering TDR to recover from GSP hang");
gpuNotifySubDeviceEvent(pGpu, NV2080_NOTIFIERS_UCODE_RESET, NULL, 0, 0, 0);
}
}

goto done;
}
else if (timeoutStatus != NV_OK)

@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2019-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a

@@ -720,7 +720,7 @@ NV_STATUS GspMsgQueueReceiveStatus(MESSAGE_QUEUE_INFO *pMQI, OBJGPU *pGpu)
else
{
NV_PRINTF(LEVEL_ERROR, "Read failed after %d retries.\n", nRetries);
return nvStatus;
goto exit;
}
}

@@ -758,16 +758,14 @@ NV_STATUS GspMsgQueueReceiveStatus(MESSAGE_QUEUE_INFO *pMQI, OBJGPU *pGpu)
nvStatus = NV_ERR_INVALID_PARAM_STRUCT;
}

if (nvStatus == NV_OK)
{
pMQI->rxSeqNum++;
exit:
pMQI->rxSeqNum++;

nRet = msgqRxMarkConsumed(pMQI->hQueue, nElements);
if (nRet < 0)
{
NV_PRINTF(LEVEL_ERROR, "msgqRxMarkConsumed failed: %d\n", nRet);
nvStatus = NV_ERR_GENERIC;
}
nRet = msgqRxMarkConsumed(pMQI->hQueue, nElements);
if (nRet < 0)
{
NV_PRINTF(LEVEL_ERROR, "msgqRxMarkConsumed failed: %d\n", nRet);
nvStatus = NV_ERR_GENERIC;
}

return nvStatus;

@@ -236,6 +236,11 @@ memdescCreate

allocSize = Size;

if (allocSize == 0)
{
return NV_ERR_INVALID_ARGUMENT;
}

//
// this memdesc may have gotten forced to sysmem if no carveout,
// but for VPR it needs to be in vidmem, so check and re-direct here,

@@ -306,14 +311,7 @@ memdescCreate
// (4k >> 12 = 1). This modification helps us to avoid overflow of variable
// allocSize, in case caller of this function passes highest value of NvU64.
//
if (allocSize == 0)
{
PageCount = 0;
}
else
{
PageCount = ((allocSize - 1) >> RM_PAGE_SHIFT) + 1;
}
PageCount = ((allocSize - 1) >> RM_PAGE_SHIFT) + 1;

if (PhysicallyContiguous)
{

@@ -166,7 +166,7 @@ kgmmuSetupWarForBug2720120FmtFamily_GA100
kgmmuGetPTEAperture(pKernelGmmu),
kgmmuGetPTEAttr(pKernelGmmu), 0));

memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_WAR_PT,
memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_WAR_PT,
pKernelGmmu->pWarSmallPageTable);
NV_ASSERT_OK_OR_GOTO(status, status, failed);

@@ -201,7 +201,7 @@ kgmmuSetupWarForBug2720120FmtFamily_GA100
kgmmuGetPTEAperture(pKernelGmmu),
kgmmuGetPTEAttr(pKernelGmmu), 0), failed);

memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_WAR_PD,
memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_WAR_PD,
pKernelGmmu->pWarPageDirectory0);
NV_ASSERT_OK_OR_GOTO(status, status, failed);

@@ -376,30 +376,26 @@ kgmmuServiceMmuFault_GA100
FIFO_MMU_EXCEPTION_DATA *pMmuExceptionData
)
{
NV_STATUS status = NV_OK;

MMU_FAULT_BUFFER_ENTRY *pParsedFaultEntry = KERNEL_POINTER_FROM_NvP64(MMU_FAULT_BUFFER_ENTRY *, pParsedFaultInfo);

// If FLA fault do not reset channel
if (pParsedFaultEntry->mmuFaultEngineId == NV_PFAULT_MMU_ENG_ID_FLA)
{
if (pKernelGmmu->bReportFlaTranslationXid)
{
nvErrorLog_va((void *)pGpu,
DESTINATION_FLA_TRANSLATION_ERROR,
"FLA Fault: inst:0x%x dev:0x%x subdev:0x%x, faulted @ 0x%x_%08x. Fault is of type %s %s",
gpuGetInstance(pGpu),
gpuGetDeviceInstance(pGpu),
pGpu->subdeviceInstance,
pMmuExceptionData->addrHi,
pMmuExceptionData->addrLo,
kgmmuGetFaultTypeString_HAL(pKernelGmmu, pMmuExceptionData->faultType),
kfifoGetFaultAccessTypeString_HAL(pGpu, GPU_GET_KERNEL_FIFO(pGpu),
pMmuExceptionData->accessType));
}

return NV_OK;
nvErrorLog_va((void *)pGpu,
NVLINK_REMOTE_TRANSLATION_ERROR,
"NVLink remote translation error: faulted @ 0x%x_%08x. Fault is of type %s %s",
pMmuExceptionData->addrHi,
pMmuExceptionData->addrLo,
kgmmuGetFaultTypeString_HAL(pKernelGmmu, pMmuExceptionData->faultType),
kfifoGetFaultAccessTypeString_HAL(pGpu, GPU_GET_KERNEL_FIFO(pGpu),
pMmuExceptionData->accessType));
}
else
{
return kgmmuServiceMmuFault_GV100(pGpu, pKernelGmmu, pParsedFaultInfo, pMmuExceptionData);
status = kgmmuServiceMmuFault_GV100(pGpu, pKernelGmmu, pParsedFaultInfo, pMmuExceptionData);
}

return status;
}

@@ -61,12 +61,27 @@ _vgpuRcResetCallback
{
THREAD_STATE_NODE threadState;
NV506F_CTRL_CMD_RESET_ISOLATED_CHANNEL_PARAMS params = {0};
RsClient *pClient;
KernelChannel *pKernelChannel = NULL;

threadStateInitISRAndDeferredIntHandler(
&threadState,
pRcErrorContext->pGpu,
THREAD_STATE_FLAGS_IS_DEFERRED_INT_HANDLER);

NV_ASSERT_OK_OR_GOTO(
status,
serverGetClientUnderLock(&g_resServ, hClient, &pClient),
error_cleanup);
NV_ASSERT_OK_OR_GOTO(
status,
CliGetKernelChannel(pClient, hChannel, &pKernelChannel),
error_cleanup);

NV_ASSERT_OR_ELSE(pKernelChannel != NULL,
status = NV_ERR_INVALID_STATE;
goto error_cleanup);

params.engineID = pRcErrorContext->EngineId;
params.exceptType = pRcErrorContext->exceptType;

@@ -99,6 +114,11 @@ _vgpuRcResetCallback
}

return status;

error_cleanup:
rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);
osReleaseRmSema(pSys->pSema, NULL);
return status;
}

@@ -121,7 +121,7 @@ ctxBufPoolInit
{
NV_STATUS status = NV_OK;
CTX_BUF_POOL_INFO *pCtxBufPool = NULL;
NvU32 i, poolConfig;
NvU32 i;

NV_ASSERT_OR_RETURN(ppCtxBufPool != NULL, NV_ERR_INVALID_ARGUMENT);

@@ -136,35 +136,13 @@ ctxBufPoolInit

//
// create a mem pool for each page size supported by RM
// pool corresponding to RM_ATTR_PAGE_SIZE_DEFAULT remains unused
//
for (i = 0; i < RM_ATTR_PAGE_SIZE_INVALID; i++)
for (i = 0; i < POOL_CONFIG_MAX_SUPPORTED; i++)
{
switch (i)
{
case RM_ATTR_PAGE_SIZE_DEFAULT:
case RM_ATTR_PAGE_SIZE_4KB:
poolConfig = POOL_CONFIG_CTXBUF_4K;
break;
case RM_ATTR_PAGE_SIZE_BIG:
poolConfig = POOL_CONFIG_CTXBUF_64K;
break;
case RM_ATTR_PAGE_SIZE_HUGE:
poolConfig = POOL_CONFIG_CTXBUF_2M;
break;
case RM_ATTR_PAGE_SIZE_512MB:
poolConfig = POOL_CONFIG_CTXBUF_512M;
break;
case RM_ATTR_PAGE_SIZE_256GB:
poolConfig = POOL_CONFIG_CTXBUF_256G;
break;
default:
NV_PRINTF(LEVEL_ERROR, "Unsupported page size attr %d\n", i);
return NV_ERR_INVALID_STATE;
}
// Pool Config starts from POOL_CONFIG_CTXBUF_256G
NV_ASSERT_OK_OR_GOTO(status,
rmMemPoolSetup((void*)&pHeap->pmaObject, &pCtxBufPool->pMemPool[i],
poolConfig),
(POOL_CONFIG_MODE) i),
cleanup);

// Allocate the pool in CPR in case of Confidential Compute

@@ -211,7 +189,7 @@ ctxBufPoolDestroy

pCtxBufPool = *ppCtxBufPool;

for (i = 0; i < RM_ATTR_PAGE_SIZE_INVALID; i++)
for (i = 0; i < POOL_CONFIG_MAX_SUPPORTED; i++)
{
if (pCtxBufPool->pMemPool[i] != NULL)
{

@@ -224,6 +202,29 @@ ctxBufPoolDestroy
NV_PRINTF(LEVEL_INFO, "Ctx buf pool destroyed\n");
}

static NvU32 NV_FORCEINLINE
ctxBufPoolPageSizeToPoolIndex(NvU64 pageSize)
{
switch (pageSize)
{
case RM_PAGE_SIZE:
return POOL_CONFIG_CTXBUF_4K;
case RM_PAGE_SIZE_64K:
return POOL_CONFIG_CTXBUF_64K;
case RM_PAGE_SIZE_128K:
return POOL_CONFIG_CTXBUF_128K;
case RM_PAGE_SIZE_HUGE:
return POOL_CONFIG_CTXBUF_2M;
case RM_PAGE_SIZE_512M:
return POOL_CONFIG_CTXBUF_512M;
case RM_PAGE_SIZE_256G:
return POOL_CONFIG_CTXBUF_256G;
default:
NV_PRINTF(LEVEL_ERROR, "Unrecognized/unsupported page size = 0x%llx\n", pageSize);
NV_ASSERT_OR_RETURN(0, POOL_CONFIG_MAX_SUPPORTED);
}
}

/*
* @brief Calculates total amount of memory required for all buffers in each pool and reserves the memory
*

@@ -263,7 +264,7 @@ ctxBufPoolReserve
NV_STATUS status = NV_OK;
NvU64 pageSize;
NvU32 i;
NvU64 totalSize[RM_ATTR_PAGE_SIZE_INVALID] = {0};
NvU64 totalSize[POOL_CONFIG_MAX_SUPPORTED] = {0};
NvU64 size;

NV_ASSERT_OR_RETURN(pCtxBufPool != NULL, NV_ERR_INVALID_ARGUMENT);

@@ -282,32 +283,13 @@ ctxBufPoolReserve
// Determine the pool(4K/64K/2M) from where this buffer will eventually
// get allocated and mark that pool to reserve this memory.
//
switch(pageSize)
{
case RM_PAGE_SIZE:
totalSize[RM_ATTR_PAGE_SIZE_4KB] += size;
break;
case RM_PAGE_SIZE_64K:
case RM_PAGE_SIZE_128K:
totalSize[RM_ATTR_PAGE_SIZE_BIG] += size;
break;
case RM_PAGE_SIZE_HUGE:
totalSize[RM_ATTR_PAGE_SIZE_HUGE] += size;
break;
case RM_PAGE_SIZE_512M:
totalSize[RM_ATTR_PAGE_SIZE_512MB] += size;
break;
case RM_PAGE_SIZE_256G:
totalSize[RM_ATTR_PAGE_SIZE_256GB] += size;
break;
default:
NV_PRINTF(LEVEL_ERROR, "Unrecognized/unsupported page size = 0x%llx\n", pageSize);
NV_ASSERT_OR_RETURN(0, NV_ERR_INVALID_ARGUMENT);
}
NvU32 poolIndex = ctxBufPoolPageSizeToPoolIndex(pageSize);
NV_ASSERT_OR_RETURN(poolIndex < POOL_CONFIG_MAX_SUPPORTED, NV_ERR_INVALID_ARGUMENT);
totalSize[poolIndex] += size;
NV_PRINTF(LEVEL_INFO, "Reserving 0x%llx bytes for buf Id = 0x%x in pool with page size = 0x%llx\n", size, i, pageSize);
}

for (i = 0; i < RM_ATTR_PAGE_SIZE_INVALID; i++)
for (i = 0; i < POOL_CONFIG_MAX_SUPPORTED; i++)
{
if (totalSize[i] > 0)
{

@@ -342,7 +324,7 @@ ctxBufPoolTrim
NvU32 i;
NV_ASSERT_OR_RETURN(pCtxBufPool != NULL, NV_ERR_INVALID_ARGUMENT);

for (i = 0; i < RM_ATTR_PAGE_SIZE_INVALID; i++)
for (i = 0; i < POOL_CONFIG_MAX_SUPPORTED; i++)
{
rmMemPoolTrim(pCtxBufPool->pMemPool[i], 0, 0);
NV_PRINTF(LEVEL_INFO, "Trimmed pool with RM_ATTR_PAGE_SIZE_* = 0x%x\n", i);

@@ -369,7 +351,7 @@ ctxBufPoolRelease
NvU32 i;
NV_ASSERT(pCtxBufPool != NULL);

for (i = 0; i < RM_ATTR_PAGE_SIZE_INVALID; i++)
for (i = 0; i < POOL_CONFIG_MAX_SUPPORTED; i++)
{
rmMemPoolRelease(pCtxBufPool->pMemPool[i], 0);
}

@@ -426,29 +408,10 @@ ctxBufPoolAllocate
pageSize = newPageSize;
}

// Determine the pool(4K/64K/2M) from where this buffer is to be allocated
switch(pageSize)
{
case RM_PAGE_SIZE:
pPool = pCtxBufPool->pMemPool[RM_ATTR_PAGE_SIZE_4KB];
break;
case RM_PAGE_SIZE_64K:
case RM_PAGE_SIZE_128K:
pPool = pCtxBufPool->pMemPool[RM_ATTR_PAGE_SIZE_BIG];
break;
case RM_PAGE_SIZE_HUGE:
pPool = pCtxBufPool->pMemPool[RM_ATTR_PAGE_SIZE_HUGE];
break;
case RM_PAGE_SIZE_512M:
pPool = pCtxBufPool->pMemPool[RM_ATTR_PAGE_SIZE_512MB];
break;
case RM_PAGE_SIZE_256G:
pPool = pCtxBufPool->pMemPool[RM_ATTR_PAGE_SIZE_256GB];
break;
default:
NV_PRINTF(LEVEL_ERROR, "Unsupported page size = 0x%llx set for context buffer\n", pageSize);
NV_ASSERT_OR_RETURN(0, NV_ERR_INVALID_ARGUMENT);
}
NvU32 poolIndex = ctxBufPoolPageSizeToPoolIndex(pageSize);
NV_ASSERT_OR_RETURN(poolIndex < POOL_CONFIG_MAX_SUPPORTED, NV_ERR_INVALID_ARGUMENT);
pPool = pCtxBufPool->pMemPool[poolIndex];

NV_ASSERT_OK_OR_RETURN(rmMemPoolAllocate(pPool, (RM_POOL_ALLOC_MEMDESC*)pMemDesc));
NV_PRINTF(LEVEL_INFO, "Buffer allocated from ctx buf pool with page size = 0x%llx\n", pageSize);
return NV_OK;

@@ -488,28 +451,9 @@ ctxBufPoolFree
pMemDesc->Alignment, RM_ATTR_PAGE_SIZE_DEFAULT, NV_TRUE, &size, &pageSize));
}

switch(pageSize)
{
case RM_PAGE_SIZE:
pPool = pCtxBufPool->pMemPool[RM_ATTR_PAGE_SIZE_4KB];
break;
case RM_PAGE_SIZE_64K:
case RM_PAGE_SIZE_128K:
pPool = pCtxBufPool->pMemPool[RM_ATTR_PAGE_SIZE_BIG];
break;
case RM_PAGE_SIZE_HUGE:
pPool = pCtxBufPool->pMemPool[RM_ATTR_PAGE_SIZE_HUGE];
break;
case RM_PAGE_SIZE_512M:
pPool = pCtxBufPool->pMemPool[RM_ATTR_PAGE_SIZE_512MB];
break;
case RM_PAGE_SIZE_256G:
pPool = pCtxBufPool->pMemPool[RM_ATTR_PAGE_SIZE_256GB];
break;
default:
NV_PRINTF(LEVEL_ERROR, "Unsupported page size detected for context buffer\n");
NV_ASSERT_OR_RETURN(0, NV_ERR_INVALID_STATE);
}
NvU32 poolIndex = ctxBufPoolPageSizeToPoolIndex(pageSize);
NV_ASSERT_OR_RETURN(poolIndex < POOL_CONFIG_MAX_SUPPORTED, NV_ERR_INVALID_ARGUMENT);
pPool = pCtxBufPool->pMemPool[poolIndex];

// If scrubber is being skipped by PMA we need to manually scrub this memory
if (rmMemPoolIsScrubSkipped(pPool))

@@ -665,16 +609,19 @@ ctxBufPoolGetSizeAndPageSize
{
NvU64 chunkSize = 0;
NvU32 i;
for (i = 0; i < RM_ATTR_PAGE_SIZE_INVALID; i++)
//
// pools are sorted in descending order of chunk size. So, start from the pool with the smallest chunk size.
//
for (i = POOL_CONFIG_MAX_SUPPORTED; i; i--)
{
NV_ASSERT_OK_OR_RETURN(rmMemPoolGetChunkAndPageSize(pCtxBufPool->pMemPool[i], &chunkSize, &pageSize));
NV_ASSERT_OK_OR_RETURN(rmMemPoolGetChunkAndPageSize(pCtxBufPool->pMemPool[i - 1], &chunkSize, &pageSize));
if (chunkSize >= size)
{
size = chunkSize;
break;
}
}
if (i == RM_ATTR_PAGE_SIZE_INVALID)
if (i == 0)
{
NV_PRINTF(LEVEL_ERROR, "couldn't find pool with chunksize >= 0x%llx\n", size);
DBG_BREAKPOINT();

@@ -722,7 +669,7 @@ ctxBufPoolIsScrubSkipped
{
NvU32 i;
NV_ASSERT_OR_RETURN(pCtxBufPool != NULL, NV_ERR_INVALID_ARGUMENT);
for (i = 0; i < RM_ATTR_PAGE_SIZE_INVALID; i++)
for (i = 0; i < POOL_CONFIG_MAX_SUPPORTED; i++)
{
if (!rmMemPoolIsScrubSkipped(pCtxBufPool->pMemPool[i]))
return NV_FALSE;

@@ -747,7 +694,7 @@ ctxBufPoolSetScrubSkip
{
NvU32 i;
NV_ASSERT_OR_RETURN_VOID(pCtxBufPool != NULL);
for (i = 0; i < RM_ATTR_PAGE_SIZE_INVALID; i++)
for (i = 0; i < POOL_CONFIG_MAX_SUPPORTED; i++)
{
rmMemPoolSkipScrub(pCtxBufPool->pMemPool[i], bSkipScrub);
}

@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2016-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2016-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a

@@ -99,8 +99,7 @@ typedef enum
* This array contains the alloction sizes (in bytes) of each pool.
*/
static const NvU64 poolAllocSizes[] = {
0x4000000000,
0x20000000, 0x200000, 0x40000, 0x20000, 0x10000, 0x2000, 0x1000, 0x100
0x4000000000, 0x20000000, 0x200000, 0x40000, 0x20000, 0x10000, 0x2000, 0x1000, 0x100
};

#define POOL_CONFIG_POOL_IDX 0

@@ -112,7 +111,8 @@ static const NvU64 poolConfig[POOL_CONFIG_MAX_SUPPORTED][POOL_CONFIG_CHUNKSIZE_I
{ RM_POOL_IDX_4K, PMA_CHUNK_SIZE_64K }, // pool with pageSize = 4K for GMMU_FMT_VERSION_2
{ RM_POOL_IDX_256G, PMA_CHUNK_SIZE_256G }, // pool with pageSize = 256G for RM allocated buffers (unused as of blackwell)
{ RM_POOL_IDX_512M, PMA_CHUNK_SIZE_512M }, // pool with pageSize = 512MB for RM allocated buffers (unused as of ampere)
{ RM_POOL_IDX_2M, PMA_CHUNK_SIZE_4M }, // pool with pageSize = 2MB for RM allocated buffers
{ RM_POOL_IDX_2M, PMA_CHUNK_SIZE_4M }, // pool with pageSize = 4MB for RM allocated buffers
{ RM_POOL_IDX_128K, PMA_CHUNK_SIZE_2M}, // pool with pageSize = 2MB for RM allocated buffers
{ RM_POOL_IDX_64K, PMA_CHUNK_SIZE_256K }, // pool with pageSize = 64K for RM allocated buffers
{ RM_POOL_IDX_4K, PMA_CHUNK_SIZE_64K } // pool with pageSize = 4K for RM allocated buffers
};

@@ -57,6 +57,11 @@ NV_STATUS stdmemValidateParams
return NV_ERR_INVALID_ARGUMENT;
}

if (pAllocData->size == 0)
{
return NV_ERR_INVALID_ARGUMENT;
}

//
// These flags don't do anything in this path. No mapping on alloc and
// kernel map is controlled by TYPE

@@ -7616,7 +7616,7 @@ static NV_STATUS dupMemory(struct gpuDevice *device,
{
NV_STATUS status = NV_OK;
nvGpuOpsLockSet acquiredLocks;
THREAD_STATE_NODE threadState;
THREAD_STATE_NODE *pThreadState;
NvHandle dupedMemHandle;
Memory *pMemory = NULL;
PMEMORY_DESCRIPTOR pMemDesc = NULL;

@@ -7637,14 +7637,15 @@ static NV_STATUS dupMemory(struct gpuDevice *device,

NV_ASSERT((flags == NV04_DUP_HANDLE_FLAGS_REJECT_KERNEL_DUP_PRIVILEGE) || (flags == NV04_DUP_HANDLE_FLAGS_NONE));

threadStateInit(&threadState, THREAD_STATE_FLAGS_NONE);

pThreadState = threadStateAlloc(THREAD_STATE_FLAGS_NONE);
if (!pThreadState)
return NV_ERR_NO_MEMORY;
// RS-TODO use dual client locking
status = _nvGpuOpsLocksAcquireAll(RMAPI_LOCK_FLAGS_NONE, device->session->handle,
&pSessionClient, &acquiredLocks);
if (status != NV_OK)
{
threadStateFree(&threadState, THREAD_STATE_FLAGS_NONE);
threadStateFree(pThreadState, THREAD_STATE_FLAGS_NONE);
return status;
}

@@ -7686,10 +7687,18 @@ static NV_STATUS dupMemory(struct gpuDevice *device,
}

// For SYSMEM or indirect peer mappings
bIsIndirectPeer = gpumgrCheckIndirectPeer(pMappingGpu, pAdjustedMemDesc->pGpu);
// Deviceless memory (NV01_MEMORY_DEVICELESS) can have a NULL pGpu. Perform targeted
// null checks before IOMMU operations that require valid GPU contexts.
bIsIndirectPeer = (pAdjustedMemDesc->pGpu != NULL) ?
gpumgrCheckIndirectPeer(pMappingGpu, pAdjustedMemDesc->pGpu) : NV_FALSE;
if (bIsIndirectPeer ||
memdescRequiresIommuMapping(pAdjustedMemDesc))
{
if (NV_UNLIKELY(pAdjustedMemDesc->pGpu == NULL))
{
status = NV_ERR_INVALID_STATE;
goto freeGpaMemdesc;
}
// For sysmem allocations, the dup done below is very shallow and in
// particular doesn't create IOMMU mappings required for the mapped GPU
// to access the memory. That's a problem if the mapped GPU is different

@@ -7778,7 +7787,7 @@ freeGpaMemdesc:

done:
_nvGpuOpsLocksRelease(&acquiredLocks);
threadStateFree(&threadState, THREAD_STATE_FLAGS_NONE);
threadStateFree(pThreadState, THREAD_STATE_FLAGS_NONE);
return status;
}

@@ -1856,7 +1856,7 @@ static NV_STATUS _issueRpcLarge
// Set the correct length for this queue entry.
vgpu_rpc_message_header_v->length = entryLength;

nvStatus = rpcSendMessage(pGpu, pRpc, &firstSequence);
nvStatus = rpcSendMessage(pGpu, pRpc, &lastSequence);
if (nvStatus != NV_OK)
{
NV_PRINTF(LEVEL_ERROR, "rpcSendMessage failed with status 0x%08x for fn %d!\n",

@@ -1,4 +1,4 @@
NVIDIA_VERSION = 570.190
NVIDIA_VERSION = 570.195.03

# This file.
VERSION_MK_FILE := $(lastword $(MAKEFILE_LIST))