550.90.07

2026-02-02 06:29:47 +00:00 · 2024-06-04 13:48:03 +02:00
parent 083cd9cf17
commit e45d91de02
180 changed files with 43467 additions and 38127 deletions
--- a/kernel-open/Kbuild
+++ b/kernel-open/Kbuild
@@ -72,7 +72,7 @@ EXTRA_CFLAGS += -I$(src)/common/inc
 EXTRA_CFLAGS += -I$(src)
 EXTRA_CFLAGS += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-format-extra-args
 EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM
-EXTRA_CFLAGS += -DNV_VERSION_STRING=\"550.78\"
+EXTRA_CFLAGS += -DNV_VERSION_STRING=\"550.90.07\"

 ifneq ($(SYSSRCHOST1X),)
 EXTRA_CFLAGS += -I$(SYSSRCHOST1X)
--- a/kernel-open/common/inc/nv-hypervisor.h
+++ b/kernel-open/common/inc/nv-hypervisor.h
@@ -37,13 +37,11 @@ typedef enum _HYPERVISOR_TYPE
    OS_HYPERVISOR_UNKNOWN
 } HYPERVISOR_TYPE;

-#define CMD_VGPU_VFIO_WAKE_WAIT_QUEUE         0
-#define CMD_VGPU_VFIO_INJECT_INTERRUPT        1
-#define CMD_VGPU_VFIO_REGISTER_MDEV           2
-#define CMD_VGPU_VFIO_PRESENT                 3
-#define CMD_VFIO_PCI_CORE_PRESENT             4
+#define CMD_VFIO_WAKE_REMOVE_GPU              1
+#define CMD_VGPU_VFIO_PRESENT                 2
+#define CMD_VFIO_PCI_CORE_PRESENT             3

-#define MAX_VF_COUNT_PER_GPU 64
+#define MAX_VF_COUNT_PER_GPU                  64

 typedef enum _VGPU_TYPE_INFO
 {
@@ -54,17 +52,11 @@ typedef enum _VGPU_TYPE_INFO

 typedef struct
 {
-    void  *vgpuVfioRef;
-    void  *waitQueue;
    void  *nv;
-    NvU32 *vgpuTypeIds;
-    NvU8 **vgpuNames;
-    NvU32  numVgpuTypes;
-    NvU32  domain;
-    NvU8   bus;
-    NvU8   slot;
-    NvU8   function;
-    NvBool is_virtfn;
+    NvU32 domain;
+    NvU32 bus;
+    NvU32 device;
+    NvU32 return_status;
 } vgpu_vfio_info;

 typedef struct
--- a/kernel-open/common/inc/nv-linux.h
+++ b/kernel-open/common/inc/nv-linux.h
@@ -1614,6 +1614,10 @@ typedef struct nv_linux_state_s {
    nv_kthread_q_t open_q;
    NvBool is_accepting_opens;
    struct semaphore open_q_lock;
+#if defined(NV_VGPU_KVM_BUILD)
+    wait_queue_head_t wait;
+    NvS32 return_status;
+#endif
 } nv_linux_state_t;

 extern nv_linux_state_t *nv_linux_devices;
--- a/kernel-open/common/inc/nv.h
+++ b/kernel-open/common/inc/nv.h
@@ -1041,13 +1041,12 @@ NV_STATUS  NV_API_CALL  nv_vgpu_create_request(nvidia_stack_t *, nv_state_t *, c
 NV_STATUS  NV_API_CALL  nv_vgpu_delete(nvidia_stack_t *, const NvU8 *, NvU16);
 NV_STATUS  NV_API_CALL  nv_vgpu_get_type_ids(nvidia_stack_t *, nv_state_t *, NvU32 *, NvU32 *, NvBool, NvU8, NvBool);
 NV_STATUS  NV_API_CALL  nv_vgpu_get_type_info(nvidia_stack_t *, nv_state_t *, NvU32, char *, int, NvU8);
-NV_STATUS  NV_API_CALL  nv_vgpu_get_bar_info(nvidia_stack_t *, nv_state_t *, const NvU8 *, NvU64 *, NvU32, void *, NvBool *);
+NV_STATUS  NV_API_CALL  nv_vgpu_get_bar_info(nvidia_stack_t *, nv_state_t *, const NvU8 *, NvU64 *,
+                                             NvU64 *, NvU64 *, NvU32 *, NvBool *, NvU8 *);
 NV_STATUS  NV_API_CALL  nv_vgpu_get_hbm_info(nvidia_stack_t *, nv_state_t *, const NvU8 *, NvU64 *, NvU64 *);
-NV_STATUS  NV_API_CALL  nv_vgpu_start(nvidia_stack_t *, const NvU8 *, void *, NvS32 *, NvU8 *, NvU32);
-NV_STATUS  NV_API_CALL  nv_vgpu_get_sparse_mmap(nvidia_stack_t *, nv_state_t *, const NvU8 *, NvU64 **, NvU64 **, NvU32 *);
 NV_STATUS  NV_API_CALL  nv_vgpu_process_vf_info(nvidia_stack_t *, nv_state_t *, NvU8, NvU32, NvU8, NvU8, NvU8, NvBool, void *);
-NV_STATUS  NV_API_CALL  nv_vgpu_update_request(nvidia_stack_t *, const NvU8 *, NvU32, NvU64 *, NvU64 *, const char *);
 NV_STATUS  NV_API_CALL  nv_gpu_bind_event(nvidia_stack_t *);
+NV_STATUS  NV_API_CALL  nv_gpu_unbind_event(nvidia_stack_t *, NvU32, NvBool *);

 NV_STATUS NV_API_CALL nv_get_usermap_access_params(nv_state_t*, nv_usermap_access_params_t*);
 nv_soc_irq_type_t NV_API_CALL nv_get_current_irq_type(nv_state_t*);
--- a/kernel-open/common/inc/nv_uvm_interface.h
+++ b/kernel-open/common/inc/nv_uvm_interface.h
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2013-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2013-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@@ -1505,23 +1505,35 @@ NV_STATUS nvUvmInterfaceCslInitContext(UvmCslContext *uvmCslContext,
 void nvUvmInterfaceDeinitCslContext(UvmCslContext *uvmCslContext);

 /*******************************************************************************
-    nvUvmInterfaceCslUpdateContext
+    nvUvmInterfaceCslRotateKey

-    Updates a context after a key rotation event and can only be called once per
-    key rotation event. Following a key rotation event, and before
-    nvUvmInterfaceCslUpdateContext is called, data encrypted by the GPU with the
-    previous key can be decrypted with nvUvmInterfaceCslDecrypt.
+    Disables channels and rotates keys.

-    Locking: This function acquires an API lock.
-    Memory : This function does not dynamically allocate memory.
+    This function disables channels and rotates associated keys. The channels
+    associated with the given CSL contexts must be idled before this function is
+    called. To trigger key rotation all allocated channels for a given key must
+    be present in the list. If the function returns successfully then the CSL
+    contexts have been updated with the new key.
+
+    Locking: This function attempts to acquire the GPU lock. In case of failure
+             to acquire the return code is NV_ERR_STATE_IN_USE. The caller must
+             guarantee that no CSL function, including this one, is invoked
+             concurrently with the CSL contexts in contextList.
+    Memory : This function dynamically allocates memory.

    Arguments:
-        uvmCslContext[IN] - The CSL context associated with a channel.
-
+        contextList[IN/OUT]  - An array of pointers to CSL contexts.
+        contextListCount[IN] - Number of CSL contexts in contextList. Its value
+                               must be greater than 0.
    Error codes:
-        NV_ERR_INVALID_ARGUMENT - The CSL context is not associated with a channel.
+        NV_ERR_INVALID_ARGUMENT - contextList is NULL or contextListCount is 0.
+        NV_ERR_STATE_IN_USE     - Unable to acquire lock / resource. Caller
+                                  can retry at a later time.
+        NV_ERR_GENERIC          - A failure other than _STATE_IN_USE occurred
+                                  when attempting to acquire a lock.
 */
-NV_STATUS nvUvmInterfaceCslUpdateContext(UvmCslContext *uvmCslContext);
+NV_STATUS nvUvmInterfaceCslRotateKey(UvmCslContext *contextList[],
+                                     NvU32 contextListCount);

 /*******************************************************************************
    nvUvmInterfaceCslRotateIv
@@ -1529,17 +1541,13 @@ NV_STATUS nvUvmInterfaceCslUpdateContext(UvmCslContext *uvmCslContext);
    Rotates the IV for a given channel and operation.

    This function will rotate the IV on both the CPU and the GPU.
-    Outstanding messages that have been encrypted by the GPU should first be
-    decrypted before calling this function with operation equal to
-    UVM_CSL_OPERATION_DECRYPT. Similarly, outstanding messages that have been
-    encrypted by the CPU should first be decrypted before calling this function
-    with operation equal to UVM_CSL_OPERATION_ENCRYPT. For a given operation
-    the channel must be idle before calling this function. This function can be
-    called regardless of the value of the IV's message counter.
+    For a given operation the channel must be idle before calling this function.
+    This function can be called regardless of the value of the IV's message counter.

-    Locking: This function attempts to acquire the GPU lock.
-             In case of failure to acquire the return code
-             is NV_ERR_STATE_IN_USE.
+    Locking: This function attempts to acquire the GPU lock. In case of failure to
+             acquire the return code is NV_ERR_STATE_IN_USE. The caller must guarantee
+             that no CSL function, including this one, is invoked concurrently with
+             the same CSL context.
    Memory : This function does not dynamically allocate memory.

 Arguments:
@@ -1573,8 +1581,8 @@ NV_STATUS nvUvmInterfaceCslRotateIv(UvmCslContext *uvmCslContext,
    However, it is optional. If it is NULL, the next IV in line will be used.

    Locking: This function does not acquire an API or GPU lock.
-             If called concurrently in different threads with the same UvmCslContext
-             the caller must guarantee exclusion.
+             The caller must guarantee that no CSL function, including this one,
+             is invoked concurrently with the same CSL context.
    Memory : This function does not dynamically allocate memory.

 Arguments:
@@ -1610,9 +1618,14 @@ NV_STATUS nvUvmInterfaceCslEncrypt(UvmCslContext *uvmCslContext,
    maximized when the input and output buffers are 16-byte aligned. This is
    natural alignment for AES block.

+    During a key rotation event the previous key is stored in the CSL context.
+    This allows data encrypted by the GPU to be decrypted with the previous key.
+    The keyRotationId parameter identifies which key is used. The first key rotation
+    ID has a value of 0 that increments by one for each key rotation event.
+
    Locking: This function does not acquire an API or GPU lock.
-             If called concurrently in different threads with the same UvmCslContext
-             the caller must guarantee exclusion.
+             The caller must guarantee that no CSL function, including this one,
+             is invoked concurrently with the same CSL context.
    Memory : This function does not dynamically allocate memory.

    Arguments:
@@ -1622,6 +1635,8 @@ NV_STATUS nvUvmInterfaceCslEncrypt(UvmCslContext *uvmCslContext,
        decryptIv[IN]         - IV used to decrypt the ciphertext. Its value can either be given by
                                nvUvmInterfaceCslIncrementIv, or, if NULL, the CSL context's
                                internal counter is used.
+        keyRotationId[IN]     - Specifies the key that is used for decryption.
+                                A value of NV_U32_MAX specifies the current key.
        inputBuffer[IN]       - Address of ciphertext input buffer.
        outputBuffer[OUT]     - Address of plaintext output buffer.
        addAuthData[IN]       - Address of the plaintext additional authenticated data used to
@@ -1642,6 +1657,7 @@ NV_STATUS nvUvmInterfaceCslDecrypt(UvmCslContext *uvmCslContext,
                                   NvU32 bufferSize,
                                   NvU8 const *inputBuffer,
                                   UvmCslIv const *decryptIv,
+                                   NvU32 keyRotationId,
                                   NvU8 *outputBuffer,
                                   NvU8 const *addAuthData,
                                   NvU32 addAuthDataSize,
@@ -1656,8 +1672,8 @@ NV_STATUS nvUvmInterfaceCslDecrypt(UvmCslContext *uvmCslContext,
    undefined behavior.

    Locking: This function does not acquire an API or GPU lock.
-             If called concurrently in different threads with the same UvmCslContext
-             the caller must guarantee exclusion.
+             The caller must guarantee that no CSL function, including this one,
+             is invoked concurrently with the same CSL context.
    Memory : This function does not dynamically allocate memory.

    Arguments:
@@ -1685,8 +1701,8 @@ NV_STATUS nvUvmInterfaceCslSign(UvmCslContext *uvmCslContext,

    Locking: This function does not acquire an API or GPU lock.
    Memory : This function does not dynamically allocate memory.
-             If called concurrently in different threads with the same UvmCslContext
-             the caller must guarantee exclusion.
+             The caller must guarantee that no CSL function, including this one,
+             is invoked concurrently with the same CSL context.

    Arguments:
        uvmCslContext[IN/OUT] - The CSL context.
@@ -1711,8 +1727,8 @@ NV_STATUS nvUvmInterfaceCslQueryMessagePool(UvmCslContext *uvmCslContext,
    the returned IV can be used in nvUvmInterfaceCslDecrypt.

    Locking: This function does not acquire an API or GPU lock.
-             If called concurrently in different threads with the same UvmCslContext
-             the caller must guarantee exclusion.
+             The caller must guarantee that no CSL function, including this one,
+             is invoked concurrently with the same CSL context.
    Memory : This function does not dynamically allocate memory.

 Arguments:
@@ -1734,28 +1750,41 @@ NV_STATUS nvUvmInterfaceCslIncrementIv(UvmCslContext *uvmCslContext,
                                       UvmCslIv *iv);

 /*******************************************************************************
-    nvUvmInterfaceCslLogExternalEncryption
+    nvUvmInterfaceCslLogEncryption

-    Checks and logs information about non-CSL encryptions, such as those that
-    originate from the GPU.
+    Checks and logs information about encryptions associated with the given
+    CSL context.

-    This function does not modify elements of the UvmCslContext.
+    For contexts associated with channels, this function does not modify elements of
+    the UvmCslContext, and must be called for every CPU/GPU encryption.
+
+    For the context associated with fault buffers, bufferSize can encompass multiple
+    encryption invocations, and the UvmCslContext will be updated following a key
+    rotation event.
+
+    In either case the IV remains unmodified after this function is called.

    Locking: This function does not acquire an API or GPU lock.
    Memory : This function does not dynamically allocate memory.
-             If called concurrently in different threads with the same UvmCslContext
-             the caller must guarantee exclusion.
+             The caller must guarantee that no CSL function, including this one,
+             is invoked concurrently with the same CSL context.

    Arguments:
        uvmCslContext[IN/OUT] - The CSL context.
-        bufferSize[OUT]       - The size of the buffer encrypted by the
+        operation[IN]         - If the CSL context is associated with a fault
+                                buffer, this argument is ignored. If it is
+                                associated with a channel, it must be either
+                                - UVM_CSL_OPERATION_ENCRYPT
+                                - UVM_CSL_OPERATION_DECRYPT
+        bufferSize[IN]        - The size of the buffer(s) encrypted by the
                                external entity in units of bytes.

    Error codes:
-      NV_ERR_INSUFFICIENT_RESOURCES - The device encryption would cause a counter
+      NV_ERR_INSUFFICIENT_RESOURCES - The encryption would cause a counter
                                      to overflow.
 */
-NV_STATUS nvUvmInterfaceCslLogExternalEncryption(UvmCslContext *uvmCslContext,
-                                                 NvU32 bufferSize);
+NV_STATUS nvUvmInterfaceCslLogEncryption(UvmCslContext *uvmCslContext,
+                                         UvmCslOperation operation,
+                                         NvU32 bufferSize);

 #endif // _NV_UVM_INTERFACE_H_
--- a/kernel-open/common/inc/nv_uvm_types.h
+++ b/kernel-open/common/inc/nv_uvm_types.h
@@ -267,6 +267,7 @@ typedef struct UvmGpuChannelInfo_tag

    // The errorNotifier is filled out when the channel hits an RC error.
    NvNotification    *errorNotifier;
+    NvNotification    *keyRotationNotifier;

    NvU32              hwRunlistId;
    NvU32              hwChannelId;
@@ -292,13 +293,13 @@ typedef struct UvmGpuChannelInfo_tag

    // GPU VAs of both GPFIFO and GPPUT are needed in Confidential Computing
    // so a channel can be controlled via another channel (SEC2 or WLC/LCIC)
-    NvU64             gpFifoGpuVa;
-    NvU64             gpPutGpuVa;
-    NvU64             gpGetGpuVa;
+    NvU64              gpFifoGpuVa;
+    NvU64              gpPutGpuVa;
+    NvU64              gpGetGpuVa;
    // GPU VA of work submission offset is needed in Confidential Computing
    // so CE channels can ring doorbell of other channels as required for
    // WLC/LCIC work submission
-    NvU64             workSubmissionOffsetGpuVa;
+    NvU64              workSubmissionOffsetGpuVa;
 } UvmGpuChannelInfo;

 typedef enum
@@ -604,6 +605,8 @@ typedef struct UvmGpuConfComputeCaps_tag
 {
    // Out: GPU's confidential compute mode
    UvmGpuConfComputeMode mode;
+    // Whether key rotation is enabled for UVM keys
+    NvBool bKeyRotationEnabled;
 } UvmGpuConfComputeCaps;

 #define UVM_GPU_NAME_LENGTH 0x40
@@ -1086,4 +1089,21 @@ typedef enum UvmCslOperation
    UVM_CSL_OPERATION_DECRYPT
 } UvmCslOperation;

+typedef enum UVM_KEY_ROTATION_STATUS {
+    // Key rotation complete/not in progress
+    UVM_KEY_ROTATION_STATUS_IDLE = 0,
+    // RM is waiting for clients to report their channels are idle for key rotation
+    UVM_KEY_ROTATION_STATUS_PENDING = 1,
+    // Key rotation is in progress
+    UVM_KEY_ROTATION_STATUS_IN_PROGRESS = 2,
+    // Key rotation timeout failure, RM will RC non-idle channels.
+    // UVM should never see this status value.
+    UVM_KEY_ROTATION_STATUS_FAILED_TIMEOUT = 3,
+    // Key rotation failed because upper threshold was crossed, RM will RC non-idle channels
+    UVM_KEY_ROTATION_STATUS_FAILED_THRESHOLD = 4,
+    // Internal RM failure while rotating keys for a certain channel, RM will RC the channel.
+    UVM_KEY_ROTATION_STATUS_FAILED_ROTATION = 5,
+    UVM_KEY_ROTATION_STATUS_MAX_COUNT = 6,
+} UVM_KEY_ROTATION_STATUS;
+
 #endif // _NV_UVM_TYPES_H_
--- a/kernel-open/common/inc/rm-gpu-ops.h
+++ b/kernel-open/common/inc/rm-gpu-ops.h
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 1999-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 1999-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@@ -103,14 +103,14 @@ NV_STATUS  NV_API_CALL rm_gpu_ops_paging_channel_push_stream(nvidia_stack_t *, n

 NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_context_init(nvidia_stack_t *, struct ccslContext_t **, nvgpuChannelHandle_t);
 NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_context_clear(nvidia_stack_t *, struct ccslContext_t *);
-NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_context_update(nvidia_stack_t *, struct ccslContext_t *);
+NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_rotate_key(nvidia_stack_t *, UvmCslContext *[], NvU32);
 NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_rotate_iv(nvidia_stack_t *, struct ccslContext_t *, NvU8);
 NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_encrypt(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8 *, NvU8 *);
 NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_encrypt_with_iv(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8*, NvU8 *, NvU8 *);
-NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_decrypt(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8 const *, NvU8 *, NvU8 const *, NvU32, NvU8 const *);
+NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_decrypt(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8 const *, NvU32, NvU8 *, NvU8 const *, NvU32, NvU8 const *);
 NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_sign(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8 *);
 NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_query_message_pool(nvidia_stack_t *, struct ccslContext_t *, NvU8, NvU64 *);
 NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_increment_iv(nvidia_stack_t *, struct ccslContext_t *, NvU8, NvU64, NvU8 *);
-NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_log_device_encryption(nvidia_stack_t *, struct ccslContext_t *, NvU32);
+NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_log_encryption(nvidia_stack_t *, struct ccslContext_t *, NvU8, NvU32);

 #endif
--- a/kernel-open/conftest.sh
+++ b/kernel-open/conftest.sh
@@ -1416,6 +1416,42 @@ compile_test() {
            compile_check_conftest "$CODE" "NV_VFIO_REGISTER_EMULATED_IOMMU_DEV_PRESENT" "" "functions"
        ;;

+        bus_type_has_iommu_ops)
+            #
+            # Determine if 'bus_type' structure has an 'iommu_ops' field.
+            #
+            # This field was removed by commit 17de3f5fdd35 (iommu: Retire bus ops)
+            # in v6.8
+            #
+            CODE="
+            #include <linux/device.h>
+
+            int conftest_bus_type_has_iommu_ops(void) {
+                return offsetof(struct bus_type, iommu_ops);
+            }"
+
+            compile_check_conftest "$CODE" "NV_BUS_TYPE_HAS_IOMMU_OPS" "" "types"
+        ;;
+
+        eventfd_signal_has_counter_arg)
+            #
+            # Determine if eventfd_signal() function has an additional 'counter' argument.
+            #
+            # This argument was removed by commit 3652117f8548 (eventfd: simplify
+            # eventfd_signal()) in v6.8
+            #
+            CODE="
+            #include <linux/eventfd.h>
+
+            void conftest_eventfd_signal_has_counter_arg(void) {
+                struct eventfd_ctx *ctx;
+
+                eventfd_signal(ctx, 1);
+            }"
+
+            compile_check_conftest "$CODE" "NV_EVENTFD_SIGNAL_HAS_COUNTER_ARG" "" "types"
+        ;;
+
        drm_available)
            # Determine if the DRM subsystem is usable
            CODE="
@@ -5216,25 +5252,23 @@ compile_test() {
            compile_check_conftest "$CODE" "NV_PCI_CLASS_MULTIMEDIA_HD_AUDIO_PRESENT" "" "generic"
        ;;

-        unsafe_follow_pfn)
+        follow_pfn)
            #
-            # Determine if unsafe_follow_pfn() is present.
+            # Determine if follow_pfn() is present.
            #
-            # unsafe_follow_pfn() was added by commit 69bacee7f9ad
-            # ("mm: Add unsafe_follow_pfn") in v5.13-rc1.
-            #
-            # Note: this commit never made it to the linux kernel, so
-            # unsafe_follow_pfn() never existed.
+            # follow_pfn() was added by commit 3b6748e2dd69
+            # ("mm: introduce follow_pfn()") in v2.6.31-rc1, and removed
+            # by commit 233eb0bf3b94 ("mm: remove follow_pfn")
+            # in linux-next.
            #
            CODE="
            #include <linux/mm.h>
-            void conftest_unsafe_follow_pfn(void) {
-                unsafe_follow_pfn();
+            void conftest_follow_pfn(void) {
+                follow_pfn();
            }"

-            compile_check_conftest "$CODE" "NV_UNSAFE_FOLLOW_PFN_PRESENT" "" "functions"
+            compile_check_conftest "$CODE" "NV_FOLLOW_PFN_PRESENT" "" "functions"
        ;;
-
        drm_plane_atomic_check_has_atomic_state_arg)
            #
            # Determine if drm_plane_helper_funcs::atomic_check takes 'state'
--- a/kernel-open/nvidia-drm/nv-kthread-q.c
+++ b/kernel-open/nvidia-drm/nv-kthread-q.c
@@ -201,7 +201,7 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),

        // Ran out of attempts - return thread even if its stack may not be
        // allocated on the preferred node
-        if ((i == (attempts - 1)))
+        if (i == (attempts - 1))
            break;

        // Get the NUMA node where the first page of the stack is resident. If
--- a/kernel-open/nvidia-modeset/nv-kthread-q.c
+++ b/kernel-open/nvidia-modeset/nv-kthread-q.c
@@ -201,7 +201,7 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),

        // Ran out of attempts - return thread even if its stack may not be
        // allocated on the preferred node
-        if ((i == (attempts - 1)))
+        if (i == (attempts - 1))
            break;

        // Get the NUMA node where the first page of the stack is resident. If
--- a/kernel-open/nvidia-uvm/nv-kthread-q.c
+++ b/kernel-open/nvidia-uvm/nv-kthread-q.c
@@ -201,7 +201,7 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),

        // Ran out of attempts - return thread even if its stack may not be
        // allocated on the preferred node
-        if ((i == (attempts - 1)))
+        if (i == (attempts - 1))
            break;

        // Get the NUMA node where the first page of the stack is resident. If
--- a/kernel-open/nvidia-uvm/uvm.h
+++ b/kernel-open/nvidia-uvm/uvm.h
@@ -1448,7 +1448,9 @@ NV_STATUS UvmAllocSemaphorePool(void                          *base,
 //
 //     preferredCpuMemoryNode: (INPUT)
 //         Preferred CPU NUMA memory node used if the destination processor is
-//         the CPU.
+//         the CPU. -1 indicates no preference, in which case the pages used
+//         can be on any of the available CPU NUMA nodes. If NUMA is disabled
+//         only 0 and -1 are allowed.
 //
 // Error codes:
 //     NV_ERR_INVALID_ADDRESS:
@@ -1462,6 +1464,11 @@ NV_STATUS UvmAllocSemaphorePool(void                          *base,
 //         The VA range exceeds the largest virtual address supported by the
 //         destination processor.
 //
+//     NV_ERR_INVALID_ARGUMENT:
+//         preferredCpuMemoryNode is not a valid CPU NUMA node or it corresponds
+//         to a NUMA node ID for a registered GPU. If NUMA is disabled, it
+//         indicates that preferredCpuMemoryNode was neither 0 nor -1.
+//
 //     NV_ERR_INVALID_DEVICE:
 //         destinationUuid does not represent a valid processor such as a CPU or
 //         a GPU with a GPU VA space registered for it. Or destinationUuid is a
@@ -1528,8 +1535,9 @@ NV_STATUS UvmMigrate(void                  *base,
 //
 //     preferredCpuMemoryNode: (INPUT)
 //         Preferred CPU NUMA memory node used if the destination processor is
-//         the CPU. This argument is ignored if the given virtual address range
-//         corresponds to managed memory.
+//         the CPU. -1 indicates no preference, in which case the pages used
+//         can be on any of the available CPU NUMA nodes. If NUMA is disabled
+//         only 0 and -1 are allowed.
 //
 //     semaphoreAddress: (INPUT)
 //         Base address of the semaphore.
@@ -1586,8 +1594,8 @@ NV_STATUS UvmMigrateAsync(void                  *base,
 //
 // Migrates the backing of all virtual address ranges associated with the given
 // range group to the specified destination processor. The behavior of this API
-// is equivalent to calling UvmMigrate on each VA range associated with this
-// range group.
+// is equivalent to calling UvmMigrate with preferredCpuMemoryNode = -1 on each
+// VA range associated with this range group.
 //
 // Any errors encountered during migration are returned immediately. No attempt
 // is made to migrate the remaining unmigrated ranges and the ranges that are
@@ -2169,7 +2177,8 @@ NV_STATUS UvmMapDynamicParallelismRegion(void                  *base,
 //
 // If any page in the VA range has a preferred location, then the migration and
 // mapping policies associated with this API take precedence over those related
-// to the preferred location.
+// to the preferred location. If the preferred location is a specific CPU NUMA
+// node, that NUMA node will be used for a CPU-resident copy of the page.
 //
 // If any pages in this VA range have any processors present in their
 // accessed-by list, the migration and mapping policies associated with this
@@ -2300,7 +2309,7 @@ NV_STATUS UvmDisableReadDuplication(void     *base,
 // UvmPreventMigrationRangeGroups has not been called on the range group that
 // those pages are associated with, then the migration and mapping policies
 // associated with UvmEnableReadDuplication override the policies outlined
-// above. Note that enabling read duplication on on any pages in this VA range
+// above. Note that enabling read duplication on any pages in this VA range
 // does not clear the state set by this API for those pages. It merely overrides
 // the policies associated with this state until read duplication is disabled
 // for those pages.
@@ -2333,7 +2342,8 @@ NV_STATUS UvmDisableReadDuplication(void     *base,
 //     preferredCpuMemoryNode: (INPUT)
 //         Preferred CPU NUMA memory node used if preferredLocationUuid is the
 //         UUID of the CPU. -1 is a special value which indicates all CPU nodes
-//         allowed by the global and thread memory policies.
+//         allowed by the global and thread memory policies. If NUMA is disabled
+//         only 0 and -1 are allowed.
 //
 // Errors:
 //     NV_ERR_INVALID_ADDRESS:
--- a/kernel-open/nvidia-uvm/uvm_ce_test.c
+++ b/kernel-open/nvidia-uvm/uvm_ce_test.c
@@ -855,6 +855,7 @@ static NV_STATUS cpu_decrypt_in_order(uvm_channel_t *channel,
                                      uvm_mem_t *dst_mem,
                                      uvm_mem_t *src_mem,
                                      const UvmCslIv *decrypt_iv,
+                                      NvU32 key_version,
                                      uvm_mem_t *auth_tag_mem,
                                      size_t size,
                                      NvU32 copy_size)
@@ -869,6 +870,7 @@ static NV_STATUS cpu_decrypt_in_order(uvm_channel_t *channel,
                                                         dst_plain + i * copy_size,
                                                         src_cipher + i * copy_size,
                                                         decrypt_iv + i,
+                                                         key_version,
                                                         copy_size,
                                                         auth_tag_buffer + i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE));
    }
@@ -879,6 +881,7 @@ static NV_STATUS cpu_decrypt_out_of_order(uvm_channel_t *channel,
                                          uvm_mem_t *dst_mem,
                                          uvm_mem_t *src_mem,
                                          const UvmCslIv *decrypt_iv,
+                                          NvU32 key_version,
                                          uvm_mem_t *auth_tag_mem,
                                          size_t size,
                                          NvU32 copy_size)
@@ -896,6 +899,7 @@ static NV_STATUS cpu_decrypt_out_of_order(uvm_channel_t *channel,
                                                         dst_plain + i * copy_size,
                                                         src_cipher + i * copy_size,
                                                         decrypt_iv + i,
+                                                         key_version,
                                                         copy_size,
                                                         auth_tag_buffer + i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE));
    }
@@ -959,7 +963,7 @@ static void gpu_encrypt(uvm_push_t *push,
                                                          i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
                                                          dst_cipher);

-        uvm_conf_computing_log_gpu_encryption(push->channel, decrypt_iv);
+        uvm_conf_computing_log_gpu_encryption(push->channel, copy_size, decrypt_iv);

        if (i > 0)
            uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
@@ -1020,6 +1024,7 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,
    size_t auth_tag_buffer_size = (size / copy_size) * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
    UvmCslIv *decrypt_iv = NULL;
    UvmCslIv *encrypt_iv = NULL;
+    NvU32 key_version;
    uvm_tracker_t tracker;
    size_t src_plain_size;

@@ -1089,6 +1094,11 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,

    gpu_encrypt(&push, dst_cipher, dst_plain_gpu, auth_tag_mem, decrypt_iv, size, copy_size);

+    // There shouldn't be any key rotation between the end of the push and the
+    // CPU decryption(s), but it is more robust against test changes to force
+    // decryption to use the saved key.
+    key_version = uvm_channel_pool_key_version(push.channel->pool);
+
    TEST_NV_CHECK_GOTO(uvm_push_end_and_wait(&push), out);

    TEST_CHECK_GOTO(!mem_match(src_plain, src_cipher, size), out);
@@ -1101,6 +1111,7 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,
                                                dst_plain,
                                                dst_cipher,
                                                decrypt_iv,
+                                                key_version,
                                                auth_tag_mem,
                                                size,
                                                copy_size),
@@ -1111,6 +1122,7 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,
                                                    dst_plain,
                                                    dst_cipher,
                                                    decrypt_iv,
+                                                    key_version,
                                                    auth_tag_mem,
                                                    size,
                                                    copy_size),
--- a/kernel-open/nvidia-uvm/uvm_channel.c
+++ b/kernel-open/nvidia-uvm/uvm_channel.c
--- a/kernel-open/nvidia-uvm/uvm_channel.h
+++ b/kernel-open/nvidia-uvm/uvm_channel.h
@@ -228,21 +228,65 @@ typedef struct
    // variant is required when the thread holding the pool lock must sleep
    // (ex: acquire another mutex) deeper in the call stack, either in UVM or
    // RM.
-    union {
+    union
+    {
        uvm_spinlock_t spinlock;
        uvm_mutex_t mutex;
    };

-    // Secure operations require that uvm_push_begin order matches
-    // uvm_push_end order, because the engine's state is used in its internal
-    // operation and each push may modify this state. push_locks is protected by
-    // the channel pool lock.
-    DECLARE_BITMAP(push_locks, UVM_CHANNEL_MAX_NUM_CHANNELS_PER_POOL);
+    struct
+    {
+        // Secure operations require that uvm_push_begin order matches
+        // uvm_push_end order, because the engine's state is used in its
+        // internal operation and each push may modify this state.
+        // push_locks is protected by the channel pool lock.
+        DECLARE_BITMAP(push_locks, UVM_CHANNEL_MAX_NUM_CHANNELS_PER_POOL);

-    // Counting semaphore for available and unlocked channels, it must be
-    // acquired before submitting work to a channel when the Confidential
-    // Computing feature is enabled.
-    uvm_semaphore_t push_sem;
+        // Counting semaphore for available and unlocked channels, it must be
+        // acquired before submitting work to a channel when the Confidential
+        // Computing feature is enabled.
+        uvm_semaphore_t push_sem;
+
+        // Per channel buffers in unprotected sysmem.
+        uvm_rm_mem_t *pool_sysmem;
+
+        // Per channel buffers in protected vidmem.
+        uvm_rm_mem_t *pool_vidmem;
+
+       struct
+       {
+            // Current encryption key version, incremented upon key rotation.
+            // While there are separate keys for encryption and decryption, the
+            // two keys are rotated at once, so the versioning applies to both.
+            NvU32 version;
+
+            // Lock used to ensure mutual exclusion during key rotation.
+            uvm_mutex_t mutex;
+
+            // CSL contexts passed to RM for key rotation. This is usually an
+            // array containing the CSL contexts associated with the channels in
+            // the pool. In the case of the WLC pool, the array also includes
+            // CSL contexts associated with LCIC channels.
+            UvmCslContext **csl_contexts;
+
+            // Number of elements in the CSL context array.
+            unsigned num_csl_contexts;
+
+            // Number of bytes encrypted, or decrypted, on the engine associated
+            // with the pool since the last key rotation. Only used during
+            // testing, to force key rotations after a certain encryption size,
+            // see UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD.
+            //
+            // Encryptions on a LCIC pool are accounted for in the paired WLC
+            // pool.
+            //
+            // TODO: Bug 4612912: these accounting variables can be removed once
+            // RM exposes an API to set the key rotation lower threshold.
+            atomic64_t encrypted;
+            atomic64_t decrypted;
+        } key_rotation;
+
+    } conf_computing;
 } uvm_channel_pool_t;

 struct uvm_channel_struct
@@ -322,43 +366,14 @@ struct uvm_channel_struct
        // work launches to match the order of push end-s that triggered them.
        volatile NvU32 gpu_put;

-        // Static pushbuffer for channels with static schedule (WLC/LCIC)
-        uvm_rm_mem_t *static_pb_protected_vidmem;
-
-        // Static pushbuffer staging buffer for WLC
-        uvm_rm_mem_t *static_pb_unprotected_sysmem;
-        void *static_pb_unprotected_sysmem_cpu;
-        void *static_pb_unprotected_sysmem_auth_tag_cpu;
-
-        // The above static locations are required by the WLC (and LCIC)
-        // schedule. Protected sysmem location completes WLC's independence
-        // from the pushbuffer allocator.
+        // Protected sysmem location makes WLC independent from the pushbuffer
+        // allocator. Unprotected sysmem and protected vidmem counterparts
+        // are allocated from the channel pool (sysmem, vidmem).
        void *static_pb_protected_sysmem;

-        // Static tracking semaphore notifier values
-        // Because of LCIC's fixed schedule, the secure semaphore release
-        // mechanism uses two additional static locations for incrementing the
-        // notifier values. See:
-        // . channel_semaphore_secure_release()
-        // . setup_lcic_schedule()
-        // . internal_channel_submit_work_wlc()
-        uvm_rm_mem_t *static_notifier_unprotected_sysmem;
-        NvU32 *static_notifier_entry_unprotected_sysmem_cpu;
-        NvU32 *static_notifier_exit_unprotected_sysmem_cpu;
-        uvm_gpu_address_t static_notifier_entry_unprotected_sysmem_gpu_va;
-        uvm_gpu_address_t static_notifier_exit_unprotected_sysmem_gpu_va;
-
-        // Explicit location for push launch tag used by WLC.
-        // Encryption auth tags have to be located in unprotected sysmem.
-        void *launch_auth_tag_cpu;
-        NvU64 launch_auth_tag_gpu_va;
-
        // Used to decrypt the push back to protected sysmem.
        // This happens when profilers register callbacks for migration data.
        uvm_push_crypto_bundle_t *push_crypto_bundles;
-
-        // Accompanying authentication tags for the crypto bundles
-        uvm_rm_mem_t *push_crypto_bundle_auth_tags;
    } conf_computing;

    // RM channel information
@@ -418,7 +433,7 @@ struct uvm_channel_manager_struct
    unsigned num_channel_pools;

    // Mask containing the indexes of the usable Copy Engines. Each usable CE
-    // has at least one pool associated with it.
+    // has at least one pool of type UVM_CHANNEL_POOL_TYPE_CE associated with it
    DECLARE_BITMAP(ce_mask, UVM_COPY_ENGINE_COUNT_MAX);

    struct
@@ -451,6 +466,16 @@ struct uvm_channel_manager_struct
        UVM_BUFFER_LOCATION gpput_loc;
        UVM_BUFFER_LOCATION pushbuffer_loc;
    } conf;
+
+    struct
+    {
+        // Flag indicating that the WLC/LCIC mechanism is ready/setup; should
+        // only be false during (de)initialization.
+        bool wlc_ready;
+
+        // True indicates that key rotation is enabled (UVM-wise).
+        bool key_rotation_enabled;
+    } conf_computing;
 };

 // Create a channel manager for the GPU
@@ -501,6 +526,14 @@ uvm_channel_t *uvm_channel_lcic_get_paired_wlc(uvm_channel_t *lcic_channel);

 uvm_channel_t *uvm_channel_wlc_get_paired_lcic(uvm_channel_t *wlc_channel);

+NvU64 uvm_channel_get_static_pb_protected_vidmem_gpu_va(uvm_channel_t *channel);
+
+NvU64 uvm_channel_get_static_pb_unprotected_sysmem_gpu_va(uvm_channel_t *channel);
+
+char* uvm_channel_get_static_pb_unprotected_sysmem_cpu(uvm_channel_t *channel);
+
+char *uvm_channel_get_push_crypto_bundle_auth_tags_cpu_va(uvm_channel_t *channel, unsigned tag_index);
+
 static bool uvm_channel_pool_is_proxy(uvm_channel_pool_t *pool)
 {
    UVM_ASSERT(uvm_pool_type_is_valid(pool->pool_type));
@@ -532,6 +565,17 @@ static uvm_channel_type_t uvm_channel_proxy_channel_type(void)
    return UVM_CHANNEL_TYPE_MEMOPS;
 }

+// Force key rotation in the engine associated with the given channel pool.
+// Rotation may still not happen if RM cannot acquire the necessary locks (in
+// which case the function returns NV_ERR_STATE_IN_USE).
+//
+// This function should be only invoked in pools in which key rotation is
+// enabled.
+NV_STATUS uvm_channel_pool_rotate_key(uvm_channel_pool_t *pool);
+
+// Retrieve the current encryption key version associated with the channel pool.
+NvU32 uvm_channel_pool_key_version(uvm_channel_pool_t *pool);
+
 // Privileged channels support all the Host and engine methods, while
 // non-privileged channels don't support privileged methods.
 //
@@ -579,12 +623,9 @@ NvU32 uvm_channel_manager_update_progress(uvm_channel_manager_t *channel_manager
 // beginning.
 NV_STATUS uvm_channel_manager_wait(uvm_channel_manager_t *manager);

-// Check if WLC/LCIC mechanism is ready/setup
-// Should only return false during initialization
 static bool uvm_channel_manager_is_wlc_ready(uvm_channel_manager_t *manager)
 {
-    return (manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_WLC] != NULL) &&
-           (manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_LCIC] != NULL);
+    return manager->conf_computing.wlc_ready;
 }
 // Get the GPU VA of semaphore_channel's tracking semaphore within the VA space
 // associated with access_channel.
--- a/kernel-open/nvidia-uvm/uvm_channel_test.c
+++ b/kernel-open/nvidia-uvm/uvm_channel_test.c
@@ -796,11 +796,8 @@ done:
 NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
 {
    NV_STATUS status = NV_OK;
-    uvm_channel_pool_t *pool;
-    uvm_push_t *pushes;
-    uvm_gpu_t *gpu;
-    NvU32 i;
-    NvU32 num_pushes;
+    uvm_push_t *pushes = NULL;
+    uvm_gpu_t *gpu = NULL;

    if (!g_uvm_global.conf_computing_enabled)
        return NV_OK;
@@ -810,9 +807,19 @@ NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
    for_each_va_space_gpu(gpu, va_space) {
        uvm_channel_type_t channel_type;

+        // Key rotation is disabled because this test relies on nested pushes,
+        // which is illegal. If any push other than the first one triggers key
+        // rotation, the test won't complete. This is because key rotation
+        // depends on waiting for ongoing pushes to end, which doesn't happen
+        // if those pushes are ended after the current one begins.
+        uvm_conf_computing_disable_key_rotation(gpu);
+
        for (channel_type = 0; channel_type < UVM_CHANNEL_TYPE_COUNT; channel_type++) {
-            pool = gpu->channel_manager->pool_to_use.default_for_type[channel_type];
-            TEST_CHECK_RET(pool != NULL);
+            NvU32 i;
+            NvU32 num_pushes;
+            uvm_channel_pool_t *pool = gpu->channel_manager->pool_to_use.default_for_type[channel_type];
+
+            TEST_CHECK_GOTO(pool != NULL, error);

            // Skip LCIC channels as those can't accept any pushes
            if (uvm_channel_pool_is_lcic(pool))
@@ -824,7 +831,7 @@ NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
            num_pushes = min(pool->num_channels, (NvU32)UVM_PUSH_MAX_CONCURRENT_PUSHES);

            pushes = uvm_kvmalloc_zero(sizeof(*pushes) * num_pushes);
-            TEST_CHECK_RET(pushes != NULL);
+            TEST_CHECK_GOTO(pushes != NULL, error);

            for (i = 0; i < num_pushes; i++) {
                uvm_push_t *push = &pushes[i];
@@ -841,12 +848,18 @@ NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)

            uvm_kvfree(pushes);
        }
+
+        uvm_conf_computing_enable_key_rotation(gpu);
    }

    uvm_thread_context_lock_enable_tracking();

    return status;
+
 error:
+    if (gpu != NULL)
+        uvm_conf_computing_enable_key_rotation(gpu);
+
    uvm_thread_context_lock_enable_tracking();
    uvm_kvfree(pushes);

@@ -948,6 +961,318 @@ release:
    return NV_OK;
 }

+static NV_STATUS force_key_rotations(uvm_channel_pool_t *pool, unsigned num_rotations)
+{
+    unsigned num_tries;
+    unsigned max_num_tries = 20;
+    unsigned num_rotations_completed = 0;
+
+    if (num_rotations == 0)
+        return NV_OK;
+
+    // The number of accepted rotations is kept low, so failed rotation
+    // invocations due to RM not acquiring the necessary locks (which imply a
+    // sleep in the test) do not balloon the test execution time.
+    UVM_ASSERT(num_rotations <= 10);
+
+    for (num_tries = 0; (num_tries < max_num_tries) && (num_rotations_completed < num_rotations); num_tries++) {
+        // Force key rotation, irrespective of encryption usage.
+        NV_STATUS status = uvm_channel_pool_rotate_key(pool);
+
+        // Key rotation may not be able to complete due to RM failing to acquire
+        // the necessary locks. Detect the situation, sleep for a bit, and then
+        // try again
+        //
+        // The maximum time spent sleeping in a single rotation call is
+        // (max_num_tries * max_sleep_us)
+        if (status == NV_ERR_STATE_IN_USE) {
+            NvU32 min_sleep_us = 1000;
+            NvU32 max_sleep_us = 10000;
+
+            usleep_range(min_sleep_us, max_sleep_us);
+            continue;
+        }
+
+        TEST_NV_CHECK_RET(status);
+
+        num_rotations_completed++;
+    }
+
+    // If not a single key rotation occurred, the dependent tests still pass,
+    // but there is not much value to them. Instead, return an error so the
+    // maximum number of tries, or the maximum sleep time, are adjusted to
+    // ensure that at least one rotation completes.
+    if (num_rotations_completed > 0)
+        return NV_OK;
+    else
+        return NV_ERR_STATE_IN_USE;
+}
+
+static NV_STATUS force_key_rotation(uvm_channel_pool_t *pool)
+{
+    return force_key_rotations(pool, 1);
+}
+
+// Test key rotation in all pools. This is useful because key rotation may not
+// happen otherwise on certain engines during UVM test execution. For example,
+// if the MEMOPS channel type is mapped to a CE not shared with any other
+// channel type, then the only encryption taking place in the engine is due to
+// semaphore releases (4 bytes each). This small encryption size makes it
+// unlikely to exceed even small rotation thresholds.
+static NV_STATUS test_channel_key_rotation_basic(uvm_gpu_t *gpu)
+{
+    uvm_channel_pool_t *pool;
+
+    uvm_for_each_pool(pool, gpu->channel_manager) {
+        if (!uvm_conf_computing_is_key_rotation_enabled_in_pool(pool))
+            continue;
+
+        TEST_NV_CHECK_RET(force_key_rotation(pool));
+    }
+
+    return NV_OK;
+}
+
+// Interleave GPU encryptions and decryptions, and their CPU counterparts, with
+// key rotations.
+static NV_STATUS test_channel_key_rotation_interleave(uvm_gpu_t *gpu)
+{
+    int i;
+    uvm_channel_pool_t *gpu_to_cpu_pool;
+    uvm_channel_pool_t *cpu_to_gpu_pool;
+    NV_STATUS status = NV_OK;
+    size_t size = UVM_CONF_COMPUTING_DMA_BUFFER_SIZE;
+    void *initial_plain_cpu = NULL;
+    void *final_plain_cpu = NULL;
+    uvm_mem_t *plain_gpu = NULL;
+    uvm_gpu_address_t plain_gpu_address;
+
+    cpu_to_gpu_pool = gpu->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_CPU_TO_GPU];
+    TEST_CHECK_RET(uvm_conf_computing_is_key_rotation_enabled_in_pool(cpu_to_gpu_pool));
+
+    gpu_to_cpu_pool = gpu->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_GPU_TO_CPU];
+    TEST_CHECK_RET(uvm_conf_computing_is_key_rotation_enabled_in_pool(gpu_to_cpu_pool));
+
+    initial_plain_cpu = uvm_kvmalloc_zero(size);
+    if (initial_plain_cpu == NULL) {
+        status = NV_ERR_NO_MEMORY;
+        goto out;
+    }
+
+    final_plain_cpu = uvm_kvmalloc_zero(size);
+    if (final_plain_cpu == NULL) {
+        status = NV_ERR_NO_MEMORY;
+        goto out;
+    }
+
+    TEST_NV_CHECK_GOTO(uvm_mem_alloc_vidmem(size, gpu, &plain_gpu), out);
+    TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(plain_gpu, gpu), out);
+    plain_gpu_address = uvm_mem_gpu_address_virtual_kernel(plain_gpu, gpu);
+
+    memset(initial_plain_cpu, 1, size);
+
+    for (i = 0; i < 5; i++) {
+        TEST_NV_CHECK_GOTO(force_key_rotation(gpu_to_cpu_pool), out);
+        TEST_NV_CHECK_GOTO(force_key_rotation(cpu_to_gpu_pool), out);
+
+        TEST_NV_CHECK_GOTO(uvm_conf_computing_util_memcopy_cpu_to_gpu(gpu,
+                                                                      plain_gpu_address,
+                                                                      initial_plain_cpu,
+                                                                      size,
+                                                                      NULL,
+                                                                      "CPU > GPU"),
+                           out);
+
+        TEST_NV_CHECK_GOTO(force_key_rotation(gpu_to_cpu_pool), out);
+        TEST_NV_CHECK_GOTO(force_key_rotation(cpu_to_gpu_pool), out);
+
+        TEST_NV_CHECK_GOTO(uvm_conf_computing_util_memcopy_gpu_to_cpu(gpu,
+                                                                      final_plain_cpu,
+                                                                      plain_gpu_address,
+                                                                      size,
+                                                                      NULL,
+                                                                      "GPU > CPU"),
+                           out);
+
+        TEST_CHECK_GOTO(!memcmp(initial_plain_cpu, final_plain_cpu, size), out);
+
+        memset(final_plain_cpu, 0, size);
+    }
+
+out:
+    uvm_mem_free(plain_gpu);
+    uvm_kvfree(final_plain_cpu);
+    uvm_kvfree(initial_plain_cpu);
+
+    return status;
+}
+
+static NV_STATUS memset_vidmem(uvm_mem_t *mem, NvU8 val)
+{
+    uvm_push_t push;
+    uvm_gpu_address_t gpu_address;
+    uvm_gpu_t *gpu = mem->backing_gpu;
+
+    UVM_ASSERT(uvm_mem_is_vidmem(mem));
+
+    TEST_NV_CHECK_RET(uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, &push, "zero vidmem"));
+
+    gpu_address = uvm_mem_gpu_address_virtual_kernel(mem, gpu);
+    gpu->parent->ce_hal->memset_1(&push, gpu_address, val, mem->size);
+
+    TEST_NV_CHECK_RET(uvm_push_end_and_wait(&push));
+
+    return NV_OK;
+}
+
+// Custom version of uvm_conf_computing_util_memcopy_gpu_to_cpu that allows
+// testing to insert key rotations in between the push end, and the CPU
+// decryption
+static NV_STATUS encrypted_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
+                                              void *dst_plain,
+                                              uvm_gpu_address_t src_gpu_address,
+                                              size_t size,
+                                              unsigned num_rotations_to_insert)
+{
+    NV_STATUS status;
+    uvm_push_t push;
+    uvm_conf_computing_dma_buffer_t *dma_buffer;
+    uvm_gpu_address_t dst_gpu_address, auth_tag_gpu_address;
+    void *src_cipher, *auth_tag;
+    uvm_channel_t *channel;
+
+    UVM_ASSERT(g_uvm_global.conf_computing_enabled);
+    UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
+
+    status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
+    if (status != NV_OK)
+        return status;
+
+    status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, &push, "Small GPU > CPU encryption");
+    if (status != NV_OK)
+        goto out;
+
+    channel = push.channel;
+    uvm_conf_computing_log_gpu_encryption(channel, size, dma_buffer->decrypt_iv);
+    dma_buffer->key_version[0] = uvm_channel_pool_key_version(channel->pool);
+
+    dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
+    auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
+    gpu->parent->ce_hal->encrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
+
+    status = uvm_push_end_and_wait(&push);
+    if (status != NV_OK)
+        goto out;
+
+    TEST_NV_CHECK_GOTO(force_key_rotations(channel->pool, num_rotations_to_insert), out);
+
+    // If num_rotations_to_insert is not zero, the current encryption key will
+    // be different from the one used during CE encryption.
+
+    src_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
+    auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
+    status = uvm_conf_computing_cpu_decrypt(channel,
+                                            dst_plain,
+                                            src_cipher,
+                                            dma_buffer->decrypt_iv,
+                                            dma_buffer->key_version[0],
+                                            size,
+                                            auth_tag);
+
+ out:
+    uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
+    return status;
+}
+
+static NV_STATUS test_channel_key_rotation_cpu_decryption(uvm_gpu_t *gpu,
+                                                          unsigned num_repetitions,
+                                                          unsigned num_rotations_to_insert)
+{
+    unsigned i;
+    uvm_channel_pool_t *gpu_to_cpu_pool;
+    NV_STATUS status = NV_OK;
+    size_t size = UVM_CONF_COMPUTING_DMA_BUFFER_SIZE;
+    NvU8 *plain_cpu = NULL;
+    uvm_mem_t *plain_gpu = NULL;
+    uvm_gpu_address_t plain_gpu_address;
+
+    if (!uvm_conf_computing_is_key_rotation_enabled(gpu))
+        return NV_OK;
+
+    gpu_to_cpu_pool = gpu->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_GPU_TO_CPU];
+    TEST_CHECK_RET(uvm_conf_computing_is_key_rotation_enabled_in_pool(gpu_to_cpu_pool));
+
+    plain_cpu = (NvU8 *) uvm_kvmalloc_zero(size);
+    if (plain_cpu == NULL) {
+        status = NV_ERR_NO_MEMORY;
+        goto out;
+    }
+
+    TEST_NV_CHECK_GOTO(uvm_mem_alloc_vidmem(size, gpu, &plain_gpu), out);
+    TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(plain_gpu, gpu), out);
+    TEST_NV_CHECK_GOTO(memset_vidmem(plain_gpu, 1), out);
+
+    plain_gpu_address = uvm_mem_gpu_address_virtual_kernel(plain_gpu, gpu);
+
+    for (i = 0; i < num_repetitions; i++) {
+        unsigned j;
+
+        TEST_NV_CHECK_GOTO(encrypted_memcopy_gpu_to_cpu(gpu,
+                                                        plain_cpu,
+                                                        plain_gpu_address,
+                                                        size,
+                                                        num_rotations_to_insert),
+                          out);
+
+        for (j = 0; j < size; j++)
+            TEST_CHECK_GOTO(plain_cpu[j] == 1, out);
+
+        memset(plain_cpu, 0, size);
+
+    }
+out:
+    uvm_mem_free(plain_gpu);
+    uvm_kvfree(plain_cpu);
+
+    return status;
+}
+
+// Test that CPU decryptions can use old keys i.e. previous versions of the keys
+// that are no longer the current key, due to key rotation. Given that SEC2
+// does not expose encryption capabilities, the "decrypt-after-rotation" problem
+// is exclusive to CE encryptions.
+static NV_STATUS test_channel_key_rotation_decrypt_after_key_rotation(uvm_gpu_t *gpu)
+{
+    // Instruct encrypted_memcopy_gpu_to_cpu to insert several key rotations
+    // between the GPU encryption, and the associated CPU decryption.
+    unsigned num_rotations_to_insert = 8;
+
+    TEST_NV_CHECK_RET(test_channel_key_rotation_cpu_decryption(gpu, 1, num_rotations_to_insert));
+
+    return NV_OK;
+}
+
+static NV_STATUS test_channel_key_rotation(uvm_va_space_t *va_space)
+{
+    uvm_gpu_t *gpu;
+
+    if (!g_uvm_global.conf_computing_enabled)
+        return NV_OK;
+
+    for_each_va_space_gpu(gpu, va_space) {
+        if (!uvm_conf_computing_is_key_rotation_enabled(gpu))
+            break;
+
+        TEST_NV_CHECK_RET(test_channel_key_rotation_basic(gpu));
+
+        TEST_NV_CHECK_RET(test_channel_key_rotation_interleave(gpu));
+
+        TEST_NV_CHECK_RET(test_channel_key_rotation_decrypt_after_key_rotation(gpu));
+    }
+
+    return NV_OK;
+}
+
 NV_STATUS test_write_ctrl_gpfifo_noop(uvm_va_space_t *va_space)
 {
    uvm_gpu_t *gpu;
@@ -1203,6 +1528,10 @@ NV_STATUS uvm_test_channel_sanity(UVM_TEST_CHANNEL_SANITY_PARAMS *params, struct
    if (status != NV_OK)
        goto done;

+    status = test_channel_key_rotation(va_space);
+    if (status != NV_OK)
+        goto done;
+
    // The following tests have side effects, they reset the GPU's
    // channel_manager.
    status = test_channel_pushbuffer_extension_base(va_space);
@@ -1338,6 +1667,126 @@ done:
    return status;
 }

+static NV_STATUS channel_stress_key_rotation_cpu_encryption(uvm_gpu_t *gpu, UVM_TEST_CHANNEL_STRESS_PARAMS *params)
+{
+    int i;
+    uvm_channel_pool_t *cpu_to_gpu_pool;
+    NV_STATUS status = NV_OK;
+    size_t size = UVM_CONF_COMPUTING_DMA_BUFFER_SIZE;
+    void *initial_plain_cpu = NULL;
+    uvm_mem_t *plain_gpu = NULL;
+    uvm_gpu_address_t plain_gpu_address;
+
+    UVM_ASSERT(params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_CPU_TO_GPU);
+
+    cpu_to_gpu_pool = gpu->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_CPU_TO_GPU];
+    TEST_CHECK_RET(uvm_conf_computing_is_key_rotation_enabled_in_pool(cpu_to_gpu_pool));
+
+    initial_plain_cpu = uvm_kvmalloc_zero(size);
+    if (initial_plain_cpu == NULL) {
+        status = NV_ERR_NO_MEMORY;
+        goto out;
+    }
+
+    TEST_NV_CHECK_GOTO(uvm_mem_alloc_vidmem(size, gpu, &plain_gpu), out);
+    TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(plain_gpu, gpu), out);
+    plain_gpu_address = uvm_mem_gpu_address_virtual_kernel(plain_gpu, gpu);
+
+    memset(initial_plain_cpu, 1, size);
+
+    for (i = 0; i < params->iterations; i++) {
+        TEST_NV_CHECK_GOTO(uvm_conf_computing_util_memcopy_cpu_to_gpu(gpu,
+                                                                      plain_gpu_address,
+                                                                      initial_plain_cpu,
+                                                                      size,
+                                                                      NULL,
+                                                                      "CPU > GPU"),
+                           out);
+    }
+
+out:
+    uvm_mem_free(plain_gpu);
+    uvm_kvfree(initial_plain_cpu);
+
+    return status;
+}
+
+static NV_STATUS channel_stress_key_rotation_cpu_decryption(uvm_gpu_t *gpu, UVM_TEST_CHANNEL_STRESS_PARAMS *params)
+{
+    unsigned num_rotations_to_insert = 0;
+
+    UVM_ASSERT(params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_GPU_TO_CPU);
+
+    return test_channel_key_rotation_cpu_decryption(gpu, params->iterations, num_rotations_to_insert);
+}
+
+static NV_STATUS channel_stress_key_rotation_rotate(uvm_gpu_t *gpu, UVM_TEST_CHANNEL_STRESS_PARAMS *params)
+{
+    NvU32 i;
+
+    UVM_ASSERT(params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_ROTATE);
+
+    for (i = 0; i < params->iterations; ++i) {
+        NV_STATUS status;
+        uvm_channel_pool_t *pool;
+        uvm_channel_type_t type;
+
+        if ((i % 3) == 0)
+            type = UVM_CHANNEL_TYPE_CPU_TO_GPU;
+        else if ((i % 3) == 1)
+            type = UVM_CHANNEL_TYPE_GPU_TO_CPU;
+        else
+            type = UVM_CHANNEL_TYPE_WLC;
+
+        pool = gpu->channel_manager->pool_to_use.default_for_type[type];
+
+        if (!uvm_conf_computing_is_key_rotation_enabled_in_pool(pool))
+            return NV_ERR_INVALID_STATE;
+
+        status = force_key_rotation(pool);
+        if (status != NV_OK)
+            return status;
+    }
+
+    return NV_OK;
+}
+
+// The objective of this test is documented in the user-level function
+static NV_STATUS uvm_test_channel_stress_key_rotation(uvm_va_space_t *va_space, UVM_TEST_CHANNEL_STRESS_PARAMS *params)
+{
+    uvm_test_rng_t rng;
+    uvm_gpu_t *gpu;
+    NV_STATUS status = NV_OK;
+
+    if (!g_uvm_global.conf_computing_enabled)
+        return NV_OK;
+
+    uvm_test_rng_init(&rng, params->seed);
+
+    uvm_va_space_down_read(va_space);
+
+    // Key rotation should be enabled, or disabled, in all GPUs. Pick a random
+    // one.
+    gpu = random_va_space_gpu(&rng, va_space);
+
+    if (!uvm_conf_computing_is_key_rotation_enabled(gpu))
+        goto out;
+
+    if (params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_CPU_TO_GPU)
+        status = channel_stress_key_rotation_cpu_encryption(gpu, params);
+    else if (params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_GPU_TO_CPU)
+        status = channel_stress_key_rotation_cpu_decryption(gpu, params);
+    else if (params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_ROTATE)
+        status = channel_stress_key_rotation_rotate(gpu, params);
+    else
+        status = NV_ERR_INVALID_PARAMETER;
+
+out:
+    uvm_va_space_up_read(va_space);
+
+    return status;
+}
+
 NV_STATUS uvm_test_channel_stress(UVM_TEST_CHANNEL_STRESS_PARAMS *params, struct file *filp)
 {
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
@@ -1349,6 +1798,8 @@ NV_STATUS uvm_test_channel_stress(UVM_TEST_CHANNEL_STRESS_PARAMS *params, struct
            return uvm_test_channel_stress_update_channels(va_space, params);
        case UVM_TEST_CHANNEL_STRESS_MODE_NOOP_PUSH:
            return uvm_test_channel_noop_push(va_space, params);
+        case UVM_TEST_CHANNEL_STRESS_MODE_KEY_ROTATION:
+            return uvm_test_channel_stress_key_rotation(va_space, params);
        default:
            return NV_ERR_INVALID_PARAMETER;
    }
--- a/kernel-open/nvidia-uvm/uvm_conf_computing.c
+++ b/kernel-open/nvidia-uvm/uvm_conf_computing.c
@@ -33,6 +33,15 @@
 #include "nv_uvm_interface.h"
 #include "uvm_va_block.h"

+// Amount of encrypted data on a given engine that triggers key rotation. This
+// is a UVM internal threshold, different from that of RM, and used only during
+// testing.
+//
+// Key rotation is triggered when the total encryption size, or the total
+// decryption size (whatever comes first) reaches this lower threshold on the
+// engine.
+#define UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD (UVM_SIZE_1MB * 8)
+
 // The maximum number of secure operations per push is:
 // UVM_MAX_PUSH_SIZE / min(CE encryption size, CE decryption size)
 // + 1 (tracking semaphore) =  128 * 1024 / 56 + 1 = 2342
@@ -352,6 +361,19 @@ error:
    return status;
 }

+// The production key rotation defaults are such that key rotations rarely
+// happen. During UVM testing more frequent rotations are triggered by relying
+// on internal encryption usage accounting. When key rotations are triggered by
+// UVM, the driver does not rely on channel key rotation notifiers.
+//
+// TODO: Bug 4612912: UVM should be able to programmatically set the rotation
+// lower threshold. This function, and all the metadata associated with it
+// (per-pool encryption accounting, for example) can be removed at that point.
+static bool key_rotation_is_notifier_driven(void)
+{
+    return !uvm_enable_builtin_tests;
+}
+
 NV_STATUS uvm_conf_computing_gpu_init(uvm_gpu_t *gpu)
 {
    NV_STATUS status;
@@ -394,17 +416,35 @@ void uvm_conf_computing_gpu_deinit(uvm_gpu_t *gpu)
    conf_computing_dma_buffer_pool_deinit(&gpu->conf_computing.dma_buffer_pool);
 }

-void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, UvmCslIv *iv)
+void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, size_t size, UvmCslIv *iv)
 {
    NV_STATUS status;
+    uvm_channel_pool_t *pool;
+
+    if (uvm_channel_is_lcic(channel))
+        pool = uvm_channel_lcic_get_paired_wlc(channel)->pool;
+    else
+        pool = channel->pool;

    uvm_mutex_lock(&channel->csl.ctx_lock);
+
+    if (uvm_conf_computing_is_key_rotation_enabled_in_pool(pool)) {
+        status = nvUvmInterfaceCslLogEncryption(&channel->csl.ctx, UVM_CSL_OPERATION_DECRYPT, size);
+
+        // Informing RM of an encryption/decryption should not fail
+        UVM_ASSERT(status == NV_OK);
+
+        if (!key_rotation_is_notifier_driven())
+            atomic64_add(size, &pool->conf_computing.key_rotation.decrypted);
+    }
+
    status = nvUvmInterfaceCslIncrementIv(&channel->csl.ctx, UVM_CSL_OPERATION_DECRYPT, 1, iv);
-    uvm_mutex_unlock(&channel->csl.ctx_lock);

    // IV rotation is done preemptively as needed, so the above
    // call cannot return failure.
    UVM_ASSERT(status == NV_OK);
+
+    uvm_mutex_unlock(&channel->csl.ctx_lock);
 }

 void uvm_conf_computing_acquire_encryption_iv(uvm_channel_t *channel, UvmCslIv *iv)
@@ -428,27 +468,46 @@ void uvm_conf_computing_cpu_encrypt(uvm_channel_t *channel,
                                     void *auth_tag_buffer)
 {
    NV_STATUS status;
+    uvm_channel_pool_t *pool;

    UVM_ASSERT(size);

+    if (uvm_channel_is_lcic(channel))
+        pool = uvm_channel_lcic_get_paired_wlc(channel)->pool;
+    else
+        pool = channel->pool;
+
    uvm_mutex_lock(&channel->csl.ctx_lock);
+
    status = nvUvmInterfaceCslEncrypt(&channel->csl.ctx,
                                      size,
                                      (NvU8 const *) src_plain,
                                      encrypt_iv,
                                      (NvU8 *) dst_cipher,
                                      (NvU8 *) auth_tag_buffer);
-    uvm_mutex_unlock(&channel->csl.ctx_lock);

    // IV rotation is done preemptively as needed, so the above
    // call cannot return failure.
    UVM_ASSERT(status == NV_OK);
+
+    if (uvm_conf_computing_is_key_rotation_enabled_in_pool(pool)) {
+        status = nvUvmInterfaceCslLogEncryption(&channel->csl.ctx, UVM_CSL_OPERATION_ENCRYPT, size);
+
+        // Informing RM of an encryption/decryption should not fail
+        UVM_ASSERT(status == NV_OK);
+
+        if (!key_rotation_is_notifier_driven())
+            atomic64_add(size, &pool->conf_computing.key_rotation.encrypted);
+    }
+
+    uvm_mutex_unlock(&channel->csl.ctx_lock);
 }

 NV_STATUS uvm_conf_computing_cpu_decrypt(uvm_channel_t *channel,
                                         void *dst_plain,
                                         const void *src_cipher,
                                         const UvmCslIv *src_iv,
+                                         NvU32 key_version,
                                         size_t size,
                                         const void *auth_tag_buffer)
 {
@@ -469,10 +528,19 @@ NV_STATUS uvm_conf_computing_cpu_decrypt(uvm_channel_t *channel,
                                      size,
                                      (const NvU8 *) src_cipher,
                                      src_iv,
+                                      key_version,
                                      (NvU8 *) dst_plain,
                                      NULL,
                                      0,
                                      (const NvU8 *) auth_tag_buffer);
+
+    if (status != NV_OK) {
+        UVM_ERR_PRINT("nvUvmInterfaceCslDecrypt() failed: %s, channel %s, GPU %s\n",
+                      nvstatusToString(status),
+                      channel->name,
+                      uvm_gpu_name(uvm_channel_get_gpu(channel)));
+    }
+
    uvm_mutex_unlock(&channel->csl.ctx_lock);

    return status;
@@ -485,6 +553,8 @@ NV_STATUS uvm_conf_computing_fault_decrypt(uvm_parent_gpu_t *parent_gpu,
                                           NvU8 valid)
 {
    NV_STATUS status;
+    NvU32 fault_entry_size = parent_gpu->fault_buffer_hal->entry_size(parent_gpu);
+    UvmCslContext *csl_context = &parent_gpu->fault_buffer_info.rm_info.replayable.cslCtx;

    // There is no dedicated lock for the CSL context associated with replayable
    // faults. The mutual exclusion required by the RM CSL API is enforced by
@@ -494,36 +564,48 @@ NV_STATUS uvm_conf_computing_fault_decrypt(uvm_parent_gpu_t *parent_gpu,

    UVM_ASSERT(g_uvm_global.conf_computing_enabled);

-    status = nvUvmInterfaceCslDecrypt(&parent_gpu->fault_buffer_info.rm_info.replayable.cslCtx,
-                                      parent_gpu->fault_buffer_hal->entry_size(parent_gpu),
+    status = nvUvmInterfaceCslLogEncryption(csl_context, UVM_CSL_OPERATION_DECRYPT, fault_entry_size);
+
+    // Informing RM of an encryption/decryption should not fail
+    UVM_ASSERT(status == NV_OK);
+
+    status = nvUvmInterfaceCslDecrypt(csl_context,
+                                      fault_entry_size,
                                      (const NvU8 *) src_cipher,
                                      NULL,
+                                      NV_U32_MAX,
                                      (NvU8 *) dst_plain,
                                      &valid,
                                      sizeof(valid),
                                      (const NvU8 *) auth_tag_buffer);

-    if (status != NV_OK)
+    if (status != NV_OK) {
        UVM_ERR_PRINT("nvUvmInterfaceCslDecrypt() failed: %s, GPU %s\n",
                      nvstatusToString(status),
                      uvm_parent_gpu_name(parent_gpu));

+    }
+
    return status;
 }

-void uvm_conf_computing_fault_increment_decrypt_iv(uvm_parent_gpu_t *parent_gpu, NvU64 increment)
+void uvm_conf_computing_fault_increment_decrypt_iv(uvm_parent_gpu_t *parent_gpu)
 {
    NV_STATUS status;
+    NvU32 fault_entry_size = parent_gpu->fault_buffer_hal->entry_size(parent_gpu);
+    UvmCslContext *csl_context = &parent_gpu->fault_buffer_info.rm_info.replayable.cslCtx;

    // See comment in uvm_conf_computing_fault_decrypt
    UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.replayable_faults.service_lock));

    UVM_ASSERT(g_uvm_global.conf_computing_enabled);

-    status = nvUvmInterfaceCslIncrementIv(&parent_gpu->fault_buffer_info.rm_info.replayable.cslCtx,
-                                          UVM_CSL_OPERATION_DECRYPT,
-                                          increment,
-                                          NULL);
+    status = nvUvmInterfaceCslLogEncryption(csl_context, UVM_CSL_OPERATION_DECRYPT, fault_entry_size);
+
+    // Informing RM of an encryption/decryption should not fail
+    UVM_ASSERT(status == NV_OK);
+
+    status = nvUvmInterfaceCslIncrementIv(csl_context, UVM_CSL_OPERATION_DECRYPT, 1, NULL);

    UVM_ASSERT(status == NV_OK);
 }
@@ -625,3 +707,231 @@ NV_STATUS uvm_conf_computing_maybe_rotate_channel_ivs_retry_busy(uvm_channel_t *
 {
    return uvm_conf_computing_rotate_channel_ivs_below_limit(channel, uvm_conf_computing_channel_iv_rotation_limit, true);
 }
+
+void uvm_conf_computing_enable_key_rotation(uvm_gpu_t *gpu)
+{
+    if (!g_uvm_global.conf_computing_enabled)
+        return;
+
+    // Key rotation cannot be enabled on UVM if it is disabled on RM
+    if (!gpu->parent->rm_info.gpuConfComputeCaps.bKeyRotationEnabled)
+        return;
+
+    gpu->channel_manager->conf_computing.key_rotation_enabled = true;
+}
+
+void uvm_conf_computing_disable_key_rotation(uvm_gpu_t *gpu)
+{
+    if (!g_uvm_global.conf_computing_enabled)
+        return;
+
+    gpu->channel_manager->conf_computing.key_rotation_enabled = false;
+}
+
+bool uvm_conf_computing_is_key_rotation_enabled(uvm_gpu_t *gpu)
+{
+    return gpu->channel_manager->conf_computing.key_rotation_enabled;
+}
+
+bool uvm_conf_computing_is_key_rotation_enabled_in_pool(uvm_channel_pool_t *pool)
+{
+    if (!uvm_conf_computing_is_key_rotation_enabled(pool->manager->gpu))
+        return false;
+
+    // TODO: Bug 4586447: key rotation must be disabled in the SEC2 engine,
+    // because currently the encryption key is shared between UVM and RM, but
+    // UVM is not able to idle SEC2 channels owned by RM.
+    if (uvm_channel_pool_is_sec2(pool))
+        return false;
+
+    // Key rotation happens as part of channel reservation, and LCIC channels
+    // are never reserved directly. Rotation of keys in LCIC channels happens
+    // as the result of key rotation in WLC channels.
+    //
+    // Return false even if there is nothing fundamental prohibiting direct key
+    // rotation on LCIC pools
+    if (uvm_channel_pool_is_lcic(pool))
+        return false;
+
+    return true;
+}
+
+static bool conf_computing_is_key_rotation_pending_use_stats(uvm_channel_pool_t *pool)
+{
+    NvU64 decrypted, encrypted;
+
+    UVM_ASSERT(!key_rotation_is_notifier_driven());
+
+    decrypted = atomic64_read(&pool->conf_computing.key_rotation.decrypted);
+
+    if (decrypted > UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD)
+        return true;
+
+    encrypted = atomic64_read(&pool->conf_computing.key_rotation.encrypted);
+
+    if (encrypted > UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD)
+        return true;
+
+    return false;
+}
+
+static bool conf_computing_is_key_rotation_pending_use_notifier(uvm_channel_pool_t *pool)
+{
+    // If key rotation is pending for the pool's engine, then the key rotation
+    // notifier in any of the engine channels can be used by UVM to detect the
+    // situation. Note that RM doesn't update all the notifiers in a single
+    // atomic operation, so it is possible that the channel read by UVM (the
+    // first one in the pool) indicates that a key rotation is pending, but
+    // another channel in the pool (temporarily) indicates the opposite, or vice
+    // versa.
+    uvm_channel_t *first_channel = pool->channels;
+
+    UVM_ASSERT(key_rotation_is_notifier_driven());
+    UVM_ASSERT(first_channel != NULL);
+
+    return first_channel->channel_info.keyRotationNotifier->status == UVM_KEY_ROTATION_STATUS_PENDING;
+}
+
+bool uvm_conf_computing_is_key_rotation_pending_in_pool(uvm_channel_pool_t *pool)
+{
+    if (!uvm_conf_computing_is_key_rotation_enabled_in_pool(pool))
+        return false;
+
+    if (key_rotation_is_notifier_driven())
+        return conf_computing_is_key_rotation_pending_use_notifier(pool);
+    else
+        return conf_computing_is_key_rotation_pending_use_stats(pool);
+}
+
+NV_STATUS uvm_conf_computing_rotate_pool_key(uvm_channel_pool_t *pool)
+{
+    NV_STATUS status;
+
+    UVM_ASSERT(uvm_conf_computing_is_key_rotation_enabled_in_pool(pool));
+    UVM_ASSERT(pool->conf_computing.key_rotation.csl_contexts != NULL);
+    UVM_ASSERT(pool->conf_computing.key_rotation.num_csl_contexts > 0);
+
+    // NV_ERR_STATE_IN_USE indicates that RM was not able to acquire the
+    // required locks at this time. This status is not interpreted as an error,
+    // but as a sign for UVM to try again later. This is the same "protocol"
+    // used in IV rotation.
+    status = nvUvmInterfaceCslRotateKey(pool->conf_computing.key_rotation.csl_contexts,
+                                        pool->conf_computing.key_rotation.num_csl_contexts);
+
+    if (status == NV_OK) {
+        pool->conf_computing.key_rotation.version++;
+
+        if (!key_rotation_is_notifier_driven()) {
+            atomic64_set(&pool->conf_computing.key_rotation.decrypted, 0);
+            atomic64_set(&pool->conf_computing.key_rotation.encrypted, 0);
+        }
+    }
+    else if (status != NV_ERR_STATE_IN_USE) {
+        UVM_DBG_PRINT("nvUvmInterfaceCslRotateKey() failed in engine %u: %s\n",
+                      pool->engine_index,
+                      nvstatusToString(status));
+    }
+
+    return status;
+}
+
+__attribute__ ((format(printf, 6, 7)))
+NV_STATUS uvm_conf_computing_util_memcopy_cpu_to_gpu(uvm_gpu_t *gpu,
+                                                     uvm_gpu_address_t dst_gpu_address,
+                                                     void *src_plain,
+                                                     size_t size,
+                                                     uvm_tracker_t *tracker,
+                                                     const char *format,
+                                                     ...)
+{
+    NV_STATUS status;
+    uvm_push_t push;
+    uvm_conf_computing_dma_buffer_t *dma_buffer;
+    uvm_gpu_address_t src_gpu_address, auth_tag_gpu_address;
+    void *dst_cipher, *auth_tag;
+    va_list args;
+
+    UVM_ASSERT(g_uvm_global.conf_computing_enabled);
+    UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
+
+    status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
+    if (status != NV_OK)
+        return status;
+
+    va_start(args, format);
+    status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, tracker, &push, format, args);
+    va_end(args);
+
+    if (status != NV_OK)
+        goto out;
+
+    dst_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
+    auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
+    uvm_conf_computing_cpu_encrypt(push.channel, dst_cipher, src_plain, NULL, size, auth_tag);
+
+    src_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
+    auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
+    gpu->parent->ce_hal->decrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
+
+    status = uvm_push_end_and_wait(&push);
+
+out:
+    uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
+    return status;
+}
+
+__attribute__ ((format(printf, 6, 7)))
+NV_STATUS uvm_conf_computing_util_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
+                                                     void *dst_plain,
+                                                     uvm_gpu_address_t src_gpu_address,
+                                                     size_t size,
+                                                     uvm_tracker_t *tracker,
+                                                     const char *format,
+                                                     ...)
+{
+    NV_STATUS status;
+    uvm_push_t push;
+    uvm_conf_computing_dma_buffer_t *dma_buffer;
+    uvm_gpu_address_t dst_gpu_address, auth_tag_gpu_address;
+    void *src_cipher, *auth_tag;
+    va_list args;
+
+    UVM_ASSERT(g_uvm_global.conf_computing_enabled);
+    UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
+
+    status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
+    if (status != NV_OK)
+        return status;
+
+    va_start(args, format);
+    status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, tracker, &push, format, args);
+    va_end(args);
+
+    if (status != NV_OK)
+        goto out;
+
+    uvm_conf_computing_log_gpu_encryption(push.channel, size, dma_buffer->decrypt_iv);
+    dma_buffer->key_version[0] = uvm_channel_pool_key_version(push.channel->pool);
+
+    dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
+    auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
+    gpu->parent->ce_hal->encrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
+
+    status = uvm_push_end_and_wait(&push);
+    if (status != NV_OK)
+        goto out;
+
+    src_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
+    auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
+    status = uvm_conf_computing_cpu_decrypt(push.channel,
+                                            dst_plain,
+                                            src_cipher,
+                                            dma_buffer->decrypt_iv,
+                                            dma_buffer->key_version[0],
+                                            size,
+                                            auth_tag);
+
+ out:
+    uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
+    return status;
+}
--- a/kernel-open/nvidia-uvm/uvm_conf_computing.h
+++ b/kernel-open/nvidia-uvm/uvm_conf_computing.h
@@ -87,9 +87,9 @@ typedef struct
    // a free buffer.
    uvm_tracker_t tracker;

-    // When the DMA buffer is used as the destination of a GPU encryption, SEC2
-    // writes the authentication tag here. Later when the buffer is decrypted
-    // on the CPU the authentication tag is used again (read) for CSL to verify
+    // When the DMA buffer is used as the destination of a GPU encryption, the
+    // engine (CE or SEC2) writes the authentication tag here. When the buffer
+    // is decrypted on the CPU the authentication tag is used by CSL to verify
    // the authenticity. The allocation is big enough for one authentication
    // tag per PAGE_SIZE page in the alloc buffer.
    uvm_mem_t *auth_tag;
@@ -98,7 +98,12 @@ typedef struct
    // to the authentication tag. The allocation is big enough for one IV per
    // PAGE_SIZE page in the alloc buffer. The granularity between the decrypt
    // IV and authentication tag must match.
-    UvmCslIv decrypt_iv[(UVM_CONF_COMPUTING_DMA_BUFFER_SIZE / PAGE_SIZE)];
+    UvmCslIv decrypt_iv[UVM_CONF_COMPUTING_DMA_BUFFER_SIZE / PAGE_SIZE];
+
+    // When the DMA buffer is used as the destination of a GPU encryption, the
+    // key version used during GPU encryption of each PAGE_SIZE page can be
+    // saved here, so CPU decryption uses the correct decryption key.
+    NvU32 key_version[UVM_CONF_COMPUTING_DMA_BUFFER_SIZE / PAGE_SIZE];

    // Bitmap of the encrypted pages in the backing allocation
    uvm_page_mask_t encrypted_page_mask;
@@ -147,7 +152,7 @@ NV_STATUS uvm_conf_computing_gpu_init(uvm_gpu_t *gpu);
 void uvm_conf_computing_gpu_deinit(uvm_gpu_t *gpu);

 // Logs encryption information from the GPU and returns the IV.
-void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, UvmCslIv *iv);
+void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, size_t size, UvmCslIv *iv);

 // Acquires next CPU encryption IV and returns it.
 void uvm_conf_computing_acquire_encryption_iv(uvm_channel_t *channel, UvmCslIv *iv);
@@ -167,10 +172,14 @@ void uvm_conf_computing_cpu_encrypt(uvm_channel_t *channel,
 // CPU side decryption helper. Decrypts data from src_cipher and writes the
 // plain text in dst_plain. src_cipher and dst_plain can't overlap. IV obtained
 // from uvm_conf_computing_log_gpu_encryption() needs to be be passed to src_iv.
+//
+// The caller must indicate which key to use for decryption by passing the
+// appropriate key version number.
 NV_STATUS uvm_conf_computing_cpu_decrypt(uvm_channel_t *channel,
                                         void *dst_plain,
                                         const void *src_cipher,
                                         const UvmCslIv *src_iv,
+                                         NvU32 key_version,
                                         size_t size,
                                         const void *auth_tag_buffer);

@@ -191,12 +200,12 @@ NV_STATUS uvm_conf_computing_fault_decrypt(uvm_parent_gpu_t *parent_gpu,
                                           NvU8 valid);

 // Increment the CPU-side decrypt IV of the CSL context associated with
-// replayable faults. The function is a no-op if the given increment is zero.
+// replayable faults.
 //
 // The IV associated with a fault CSL context is a 64-bit counter.
 //
 // Locking: this function must be invoked while holding the replayable ISR lock.
-void uvm_conf_computing_fault_increment_decrypt_iv(uvm_parent_gpu_t *parent_gpu, NvU64 increment);
+void uvm_conf_computing_fault_increment_decrypt_iv(uvm_parent_gpu_t *parent_gpu);

 // Query the number of remaining messages before IV needs to be rotated.
 void uvm_conf_computing_query_message_pools(uvm_channel_t *channel,
@@ -214,4 +223,71 @@ NV_STATUS uvm_conf_computing_maybe_rotate_channel_ivs_retry_busy(uvm_channel_t *
 // Check if there are fewer than 'limit' messages available in either direction
 // and rotate if not.
 NV_STATUS uvm_conf_computing_rotate_channel_ivs_below_limit(uvm_channel_t *channel, NvU64 limit, bool retry_if_busy);
+
+// Rotate the engine key associated with the given channel pool.
+NV_STATUS uvm_conf_computing_rotate_pool_key(uvm_channel_pool_t *pool);
+
+// Returns true if key rotation is allowed in the channel pool.
+bool uvm_conf_computing_is_key_rotation_enabled_in_pool(uvm_channel_pool_t *pool);
+
+// Returns true if key rotation is pending in the channel pool.
+bool uvm_conf_computing_is_key_rotation_pending_in_pool(uvm_channel_pool_t *pool);
+
+// Enable/disable key rotation in the passed GPU. Note that UVM enablement is
+// dependent on RM enablement: key rotation may still be disabled upon calling
+// this function, if it is disabled in RM. On the other hand, key rotation can
+// be disabled in UVM, even if it is enabled in RM.
+//
+// Enablement/Disablement affects only kernel key rotation in keys owned by UVM.
+// It doesn't affect user key rotation (CUDA, Video...), nor does it affect RM
+// kernel key rotation.
+void uvm_conf_computing_enable_key_rotation(uvm_gpu_t *gpu);
+void uvm_conf_computing_disable_key_rotation(uvm_gpu_t *gpu);
+
+// Returns true if key rotation is enabled on UVM in the given GPU. Key rotation
+// can be enabled on the GPU but disabled on some of GPU engines (LCEs or SEC2),
+// see uvm_conf_computing_is_key_rotation_enabled_in_pool.
+bool uvm_conf_computing_is_key_rotation_enabled(uvm_gpu_t *gpu);
+
+// Launch a synchronous, encrypted copy between CPU and GPU.
+//
+// The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
+//
+// The source CPU buffer pointed by src_plain contains the unencrypted (plain
+// text) contents; the function internally performs a CPU-side encryption step
+// before launching the GPU-side CE decryption. The source buffer can be in
+// protected or unprotected sysmem, while the destination buffer must be in
+// protected vidmem.
+//
+// The input tracker, if not NULL, is internally acquired by the push
+// responsible for the encrypted copy.
+__attribute__ ((format(printf, 6, 7)))
+NV_STATUS uvm_conf_computing_util_memcopy_cpu_to_gpu(uvm_gpu_t *gpu,
+                                                     uvm_gpu_address_t dst_gpu_address,
+                                                     void *src_plain,
+                                                     size_t size,
+                                                     uvm_tracker_t *tracker,
+                                                     const char *format,
+                                                     ...);
+
+// Launch a synchronous, encrypted copy between GPU and CPU.
+//
+// The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
+//
+// The destination CPU buffer pointed by dst_plain will contain the unencrypted
+// (plain text) contents; the function internally performs a CPU-side decryption
+// step after launching the GPU-side CE encryption. The source buffer must be in
+// protected vidmem, while the destination buffer can be in protected or
+// unprotected sysmem.
+//
+// The input tracker, if not NULL, is internally acquired by the push
+// responsible for the encrypted copy.
+__attribute__ ((format(printf, 6, 7)))
+NV_STATUS uvm_conf_computing_util_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
+                                                     void *dst_plain,
+                                                     uvm_gpu_address_t src_gpu_address,
+                                                     size_t size,
+                                                     uvm_tracker_t *tracker,
+                                                     const char *format,
+                                                     ...);
 #endif // __UVM_CONF_COMPUTING_H__
--- a/kernel-open/nvidia-uvm/uvm_gpu_replayable_faults.c
+++ b/kernel-open/nvidia-uvm/uvm_gpu_replayable_faults.c
@@ -591,7 +591,7 @@ static void fault_buffer_skip_replayable_entry(uvm_parent_gpu_t *parent_gpu, NvU
    // replayable faults still requires manual adjustment so it is kept in sync
    // with the encryption IV on the GSP-RM's side.
    if (g_uvm_global.conf_computing_enabled)
-        uvm_conf_computing_fault_increment_decrypt_iv(parent_gpu, 1);
+        uvm_conf_computing_fault_increment_decrypt_iv(parent_gpu);

    parent_gpu->fault_buffer_hal->entry_clear_valid(parent_gpu, index);
 }
--- a/kernel-open/nvidia-uvm/uvm_gpu_semaphore.c
+++ b/kernel-open/nvidia-uvm/uvm_gpu_semaphore.c
@@ -60,6 +60,17 @@ struct uvm_gpu_semaphore_pool_page_struct
    // Allocation backing the page
    uvm_rm_mem_t *memory;

+    struct {
+        // Unprotected sysmem storing encrypted value of semaphores
+        uvm_rm_mem_t *encrypted_payload_memory;
+
+        // Unprotected sysmem storing encryption auth tags
+        uvm_rm_mem_t *auth_tag_memory;
+
+        // Unprotected sysmem storing plain text notifier values
+        uvm_rm_mem_t *notifier_memory;
+    } conf_computing;
+
    // Pool the page is part of
    uvm_gpu_semaphore_pool_t *pool;

@@ -80,26 +91,6 @@ static bool gpu_semaphore_is_secure(uvm_gpu_semaphore_t *semaphore)
    return gpu_semaphore_pool_is_secure(semaphore->page->pool);
 }

-static NvU32 get_index(uvm_gpu_semaphore_t *semaphore)
-{
-    NvU32 offset;
-    NvU32 index;
-
-    if (gpu_semaphore_is_secure(semaphore))
-        return semaphore->conf_computing.index;
-
-    UVM_ASSERT(semaphore->payload != NULL);
-    UVM_ASSERT(semaphore->page != NULL);
-
-    offset = (char*)semaphore->payload - (char*)uvm_rm_mem_get_cpu_va(semaphore->page->memory);
-    UVM_ASSERT(offset % UVM_SEMAPHORE_SIZE == 0);
-
-    index = offset / UVM_SEMAPHORE_SIZE;
-    UVM_ASSERT(index < UVM_SEMAPHORE_COUNT_PER_PAGE);
-
-    return index;
-}
-
 // Use canary values on debug builds to catch semaphore use-after-free. We can
 // catch release-after-free by simply setting the payload to a known value at
 // free then checking it on alloc or pool free, but catching acquire-after-free
@@ -150,34 +141,83 @@ static bool gpu_can_access_semaphore_pool(uvm_gpu_t *gpu, uvm_rm_mem_t *rm_mem)
    return ((uvm_rm_mem_get_gpu_uvm_va(rm_mem, gpu) + rm_mem->size - 1) < gpu->parent->max_host_va);
 }

-// Secure semaphore pools are allocated in the CPR of vidmem and only mapped to
-// the owning GPU as no other processor have access to it.
-static NV_STATUS pool_alloc_secure_page(uvm_gpu_semaphore_pool_t *pool,
-                                        uvm_gpu_semaphore_pool_page_t *pool_page,
-                                        uvm_rm_mem_type_t memory_type)
+static void pool_page_free_buffers(uvm_gpu_semaphore_pool_page_t *page)
+{
+    uvm_rm_mem_free(page->memory);
+    page->memory = NULL;
+
+    if (gpu_semaphore_pool_is_secure(page->pool)) {
+        uvm_rm_mem_free(page->conf_computing.encrypted_payload_memory);
+        uvm_rm_mem_free(page->conf_computing.auth_tag_memory);
+        uvm_rm_mem_free(page->conf_computing.notifier_memory);
+
+        page->conf_computing.encrypted_payload_memory = NULL;
+        page->conf_computing.auth_tag_memory = NULL;
+        page->conf_computing.notifier_memory = NULL;
+    }
+    else {
+        UVM_ASSERT(!page->conf_computing.encrypted_payload_memory);
+        UVM_ASSERT(!page->conf_computing.auth_tag_memory);
+        UVM_ASSERT(!page->conf_computing.notifier_memory);
+    }
+}
+
+static NV_STATUS pool_page_alloc_buffers(uvm_gpu_semaphore_pool_page_t *page)
 {
    NV_STATUS status;
+    uvm_gpu_semaphore_pool_t *pool = page->pool;
+    uvm_rm_mem_type_t memory_type = (pool->aperture == UVM_APERTURE_SYS) ? UVM_RM_MEM_TYPE_SYS : UVM_RM_MEM_TYPE_GPU;
+    size_t align = 0;
+    bool map_all = true;
+    align = gpu_semaphore_pool_is_secure(pool) ? UVM_CONF_COMPUTING_BUF_ALIGNMENT : 0;
+    map_all = gpu_semaphore_pool_is_secure(pool) ? false : true;

-    UVM_ASSERT(gpu_semaphore_pool_is_secure(pool));
-    status = uvm_rm_mem_alloc(pool->gpu,
-                              memory_type,
-                              UVM_SEMAPHORE_PAGE_SIZE,
-                              UVM_CONF_COMPUTING_BUF_ALIGNMENT,
-                              &pool_page->memory);
+    if (map_all)
+        status = uvm_rm_mem_alloc_and_map_all(pool->gpu, memory_type, UVM_SEMAPHORE_PAGE_SIZE, align, &page->memory);
+    else
+        status = uvm_rm_mem_alloc(pool->gpu, memory_type, UVM_SEMAPHORE_PAGE_SIZE, align, &page->memory);

    if (status != NV_OK)
-        return status;
+        goto error;
+
+    if (!gpu_semaphore_pool_is_secure(pool))
+        return NV_OK;
+
+    status = uvm_rm_mem_alloc_and_map_cpu(pool->gpu,
+                                          UVM_RM_MEM_TYPE_SYS,
+                                          UVM_SEMAPHORE_PAGE_SIZE,
+                                          UVM_CONF_COMPUTING_BUF_ALIGNMENT,
+                                          &page->conf_computing.encrypted_payload_memory);
+    if (status != NV_OK)
+        goto error;
+
+    BUILD_BUG_ON(UVM_CONF_COMPUTING_AUTH_TAG_SIZE % UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT);
+    status = uvm_rm_mem_alloc_and_map_cpu(pool->gpu,
+                                          UVM_RM_MEM_TYPE_SYS,
+                                          UVM_SEMAPHORE_COUNT_PER_PAGE * UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
+                                          UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
+                                          &page->conf_computing.auth_tag_memory);
+    if (status != NV_OK)
+        goto error;
+
+    status = uvm_rm_mem_alloc_and_map_cpu(pool->gpu,
+                                          UVM_RM_MEM_TYPE_SYS,
+                                          UVM_SEMAPHORE_COUNT_PER_PAGE * sizeof(NvU32),
+                                          0,
+                                          &page->conf_computing.notifier_memory);
+    if (status != NV_OK)
+        goto error;

    return NV_OK;
+error:
+    pool_page_free_buffers(page);
+    return status;
 }

 static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)
 {
    NV_STATUS status;
    uvm_gpu_semaphore_pool_page_t *pool_page;
-    NvU32 *payloads;
-    size_t i;
-    uvm_rm_mem_type_t memory_type = (pool->aperture == UVM_APERTURE_SYS) ? UVM_RM_MEM_TYPE_SYS : UVM_RM_MEM_TYPE_GPU;

    uvm_assert_mutex_locked(&pool->mutex);

@@ -188,24 +228,9 @@ static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)

    pool_page->pool = pool;

-    // Whenever the Confidential Computing feature is enabled, engines can
-    // access semaphores only in the CPR of vidmem. Mapping to other GPUs is
-    // also disabled.
-    if (gpu_semaphore_pool_is_secure(pool)) {
-        status = pool_alloc_secure_page(pool, pool_page, memory_type);
-
-        if (status != NV_OK)
-            goto error;
-    }
-    else {
-    status = uvm_rm_mem_alloc_and_map_all(pool->gpu,
-                                          memory_type,
-                                          UVM_SEMAPHORE_PAGE_SIZE,
-                                          0,
-                                          &pool_page->memory);
+    status = pool_page_alloc_buffers(pool_page);
    if (status != NV_OK)
        goto error;
-    }

    // Verify the GPU can access the semaphore pool.
    UVM_ASSERT(gpu_can_access_semaphore_pool(pool->gpu, pool_page->memory));
@@ -217,7 +242,9 @@ static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)
    pool->free_semaphores_count += UVM_SEMAPHORE_COUNT_PER_PAGE;

    if (semaphore_uses_canary(pool)) {
-        payloads = uvm_rm_mem_get_cpu_va(pool_page->memory);
+        size_t i;
+        NvU32 *payloads = uvm_rm_mem_get_cpu_va(pool_page->memory);
+
        for (i = 0; i < UVM_SEMAPHORE_COUNT_PER_PAGE; i++)
            payloads[i] = make_canary(0);
    }
@@ -253,7 +280,7 @@ static void pool_free_page(uvm_gpu_semaphore_pool_page_t *page)

    pool->free_semaphores_count -= UVM_SEMAPHORE_COUNT_PER_PAGE;
    list_del(&page->all_pages_node);
-    uvm_rm_mem_free(page->memory);
+    pool_page_free_buffers(page);
    uvm_kvfree(page);
 }

@@ -273,19 +300,22 @@ NV_STATUS uvm_gpu_semaphore_alloc(uvm_gpu_semaphore_pool_t *pool, uvm_gpu_semaph
        goto done;

    list_for_each_entry(page, &pool->pages, all_pages_node) {
-        NvU32 semaphore_index = find_first_bit(page->free_semaphores, UVM_SEMAPHORE_COUNT_PER_PAGE);
+        const NvU32 semaphore_index = find_first_bit(page->free_semaphores, UVM_SEMAPHORE_COUNT_PER_PAGE);
+
+        UVM_ASSERT(semaphore_index <= UVM_SEMAPHORE_COUNT_PER_PAGE);
+
        if (semaphore_index == UVM_SEMAPHORE_COUNT_PER_PAGE)
            continue;

-        if (gpu_semaphore_pool_is_secure(pool)) {
-            semaphore->conf_computing.index = semaphore_index;
-        }
-        else {
-            semaphore->payload = (NvU32*)((char*)uvm_rm_mem_get_cpu_va(page->memory) +
-                                                 semaphore_index * UVM_SEMAPHORE_SIZE);
-        }
-
        semaphore->page = page;
+        semaphore->index = semaphore_index;
+
+        if (gpu_semaphore_pool_is_secure(pool)) {
+
+            // Reset the notifier to prevent false detection of an attack when
+            // checking for an updated value.
+            *uvm_gpu_semaphore_get_notifier_cpu_va(semaphore) = semaphore->conf_computing.last_observed_notifier;
+        }

        if (semaphore_uses_canary(pool))
            UVM_ASSERT(is_canary(uvm_gpu_semaphore_get_payload(semaphore)));
@@ -311,7 +341,6 @@ void uvm_gpu_semaphore_free(uvm_gpu_semaphore_t *semaphore)
 {
    uvm_gpu_semaphore_pool_page_t *page;
    uvm_gpu_semaphore_pool_t *pool;
-    NvU32 index;

    UVM_ASSERT(semaphore);

@@ -323,7 +352,6 @@ void uvm_gpu_semaphore_free(uvm_gpu_semaphore_t *semaphore)
        return;

    pool = page->pool;
-    index = get_index(semaphore);

    // Write a known value lower than the current payload in an attempt to catch
    // release-after-free and acquire-after-free.
@@ -333,10 +361,9 @@ void uvm_gpu_semaphore_free(uvm_gpu_semaphore_t *semaphore)
    uvm_mutex_lock(&pool->mutex);

    semaphore->page = NULL;
-    semaphore->payload = NULL;

    ++pool->free_semaphores_count;
-    __set_bit(index, page->free_semaphores);
+    __set_bit(semaphore->index, page->free_semaphores);

    uvm_mutex_unlock(&pool->mutex);
 }
@@ -449,18 +476,72 @@ NvU64 uvm_gpu_semaphore_get_gpu_proxy_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu

 NvU64 uvm_gpu_semaphore_get_gpu_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu_t *gpu, bool is_proxy_va_space)
 {
-    NvU32 index = get_index(semaphore);
    NvU64 base_va = uvm_rm_mem_get_gpu_va(semaphore->page->memory, gpu, is_proxy_va_space).address;

-    return base_va + UVM_SEMAPHORE_SIZE * index;
+    return base_va + semaphore->index * UVM_SEMAPHORE_SIZE;
+}
+
+NvU32 *uvm_gpu_semaphore_get_cpu_va(uvm_gpu_semaphore_t *semaphore)
+{
+    char *base_va;
+
+    if (gpu_semaphore_is_secure(semaphore))
+        return &semaphore->conf_computing.cached_payload;
+
+    base_va = uvm_rm_mem_get_cpu_va(semaphore->page->memory);
+    return (NvU32*)(base_va + semaphore->index * UVM_SEMAPHORE_SIZE);
+}
+
+NvU32 *uvm_gpu_semaphore_get_encrypted_payload_cpu_va(uvm_gpu_semaphore_t *semaphore)
+{
+    char *encrypted_base_va = uvm_rm_mem_get_cpu_va(semaphore->page->conf_computing.encrypted_payload_memory);
+
+    return (NvU32*)(encrypted_base_va + semaphore->index * UVM_SEMAPHORE_SIZE);
+}
+
+uvm_gpu_address_t uvm_gpu_semaphore_get_encrypted_payload_gpu_va(uvm_gpu_semaphore_t *semaphore)
+{
+    NvU64 encrypted_base_va = uvm_rm_mem_get_gpu_uvm_va(semaphore->page->conf_computing.encrypted_payload_memory,
+                                                        semaphore->page->pool->gpu);
+
+    return uvm_gpu_address_virtual_unprotected(encrypted_base_va + semaphore->index * UVM_SEMAPHORE_SIZE);
+}
+
+uvm_gpu_semaphore_notifier_t *uvm_gpu_semaphore_get_notifier_cpu_va(uvm_gpu_semaphore_t *semaphore)
+{
+    uvm_gpu_semaphore_notifier_t *notifier_base_va =
+        uvm_rm_mem_get_cpu_va(semaphore->page->conf_computing.notifier_memory);
+
+    return notifier_base_va + semaphore->index;
+}
+
+uvm_gpu_address_t uvm_gpu_semaphore_get_notifier_gpu_va(uvm_gpu_semaphore_t *semaphore)
+{
+    NvU64 notifier_base_va = uvm_rm_mem_get_gpu_uvm_va(semaphore->page->conf_computing.notifier_memory,
+                                                       semaphore->page->pool->gpu);
+
+    return uvm_gpu_address_virtual_unprotected(notifier_base_va +
+                                               semaphore->index * sizeof(uvm_gpu_semaphore_notifier_t));
+}
+
+void *uvm_gpu_semaphore_get_auth_tag_cpu_va(uvm_gpu_semaphore_t *semaphore)
+{
+    char *auth_tag_base_va = uvm_rm_mem_get_cpu_va(semaphore->page->conf_computing.auth_tag_memory);
+
+    return (void*)(auth_tag_base_va + semaphore->index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
+}
+
+uvm_gpu_address_t uvm_gpu_semaphore_get_auth_tag_gpu_va(uvm_gpu_semaphore_t *semaphore)
+{
+    NvU64 auth_tag_base_va = uvm_rm_mem_get_gpu_uvm_va(semaphore->page->conf_computing.auth_tag_memory,
+                                                       semaphore->page->pool->gpu);
+
+    return uvm_gpu_address_virtual_unprotected(auth_tag_base_va + semaphore->index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
 }

 NvU32 uvm_gpu_semaphore_get_payload(uvm_gpu_semaphore_t *semaphore)
 {
-    if (gpu_semaphore_is_secure(semaphore))
-        return UVM_GPU_READ_ONCE(semaphore->conf_computing.cached_payload);
-
-    return UVM_GPU_READ_ONCE(*semaphore->payload);
+    return UVM_GPU_READ_ONCE(*uvm_gpu_semaphore_get_cpu_va(semaphore));
 }

 void uvm_gpu_semaphore_set_payload(uvm_gpu_semaphore_t *semaphore, NvU32 payload)
@@ -477,10 +558,7 @@ void uvm_gpu_semaphore_set_payload(uvm_gpu_semaphore_t *semaphore, NvU32 payload
    // the GPU correctly even on non-SMP).
    mb();

-    if (gpu_semaphore_is_secure(semaphore))
-            UVM_GPU_WRITE_ONCE(semaphore->conf_computing.cached_payload, payload);
-    else
-    UVM_GPU_WRITE_ONCE(*semaphore->payload, payload);
+    UVM_GPU_WRITE_ONCE(*uvm_gpu_semaphore_get_cpu_va(semaphore), payload);
 }

 // This function is intended to catch channels which have been left dangling in
@@ -546,22 +624,11 @@ void uvm_gpu_tracking_semaphore_free(uvm_gpu_tracking_semaphore_t *tracking_sem)
    uvm_gpu_semaphore_free(&tracking_sem->semaphore);
 }

-static bool should_skip_secure_semaphore_update(NvU32 last_observed_notifier, NvU32 gpu_notifier)
+static void gpu_semaphore_encrypted_payload_update(uvm_channel_t *channel, uvm_gpu_semaphore_t *semaphore)
 {
-    // No new value, or the GPU is currently writing the new encrypted material
-    // and no change in value would still result in corrupted data.
-    return (last_observed_notifier == gpu_notifier) || (gpu_notifier % 2);
-}
-
-static void uvm_gpu_semaphore_encrypted_payload_update(uvm_channel_t *channel, uvm_gpu_semaphore_t *semaphore)
-{
-    UvmCslIv local_iv;
    NvU32 local_payload;
-    NvU32 new_sem_value;
-    NvU32 gpu_notifier;
-    NvU32 last_observed_notifier;
-    NvU32 new_gpu_notifier = 0;
-    NvU32 iv_index = 0;
+    uvm_gpu_semaphore_notifier_t gpu_notifier;
+    uvm_gpu_semaphore_notifier_t new_gpu_notifier = 0;

    // A channel can have multiple entries pending and the tracking semaphore
    // update of each entry can race with this function. Since the semaphore
@@ -570,64 +637,72 @@ static void uvm_gpu_semaphore_encrypted_payload_update(uvm_channel_t *channel, u
    unsigned tries_left = channel->num_gpfifo_entries;
    NV_STATUS status = NV_OK;
    NvU8 local_auth_tag[UVM_CONF_COMPUTING_AUTH_TAG_SIZE];
-    UvmCslIv *ivs_cpu_addr = semaphore->conf_computing.ivs;
-    void *auth_tag_cpu_addr = uvm_rm_mem_get_cpu_va(semaphore->conf_computing.auth_tag);
-    NvU32 *gpu_notifier_cpu_addr = (NvU32 *)uvm_rm_mem_get_cpu_va(semaphore->conf_computing.notifier);
-    NvU32 *payload_cpu_addr = (NvU32 *)uvm_rm_mem_get_cpu_va(semaphore->conf_computing.encrypted_payload);
+    uvm_gpu_semaphore_notifier_t *semaphore_notifier_cpu_addr = uvm_gpu_semaphore_get_notifier_cpu_va(semaphore);

    UVM_ASSERT(g_uvm_global.conf_computing_enabled);
    UVM_ASSERT(uvm_channel_is_ce(channel));

-    last_observed_notifier = semaphore->conf_computing.last_observed_notifier;
-    gpu_notifier = UVM_READ_ONCE(*gpu_notifier_cpu_addr);
-    UVM_ASSERT(last_observed_notifier <= gpu_notifier);
-
-    if (should_skip_secure_semaphore_update(last_observed_notifier, gpu_notifier))
-        return;
-
    do {
-        gpu_notifier = UVM_READ_ONCE(*gpu_notifier_cpu_addr);
+        gpu_notifier = UVM_READ_ONCE(*semaphore_notifier_cpu_addr);
+
+        UVM_ASSERT(gpu_notifier >= semaphore->conf_computing.last_observed_notifier);

        // Odd notifier value means there's an update in progress.
        if (gpu_notifier % 2)
            continue;

+        // There's no change since last time
+        if (gpu_notifier == semaphore->conf_computing.last_observed_notifier)
+            return;
+
        // Make sure no memory accesses happen before we read the notifier
        smp_mb__after_atomic();

-        iv_index = (gpu_notifier / 2) % channel->num_gpfifo_entries;
-        memcpy(local_auth_tag, auth_tag_cpu_addr, sizeof(local_auth_tag));
-        local_payload = UVM_READ_ONCE(*payload_cpu_addr);
-        memcpy(&local_iv, &ivs_cpu_addr[iv_index], sizeof(local_iv));
+        memcpy(local_auth_tag, uvm_gpu_semaphore_get_auth_tag_cpu_va(semaphore), sizeof(local_auth_tag));
+        local_payload = UVM_READ_ONCE(*uvm_gpu_semaphore_get_encrypted_payload_cpu_va(semaphore));

        // Make sure the second read of notifier happens after
        // all memory accesses.
        smp_mb__before_atomic();
-        new_gpu_notifier = UVM_READ_ONCE(*gpu_notifier_cpu_addr);
+        new_gpu_notifier = UVM_READ_ONCE(*semaphore_notifier_cpu_addr);
        tries_left--;
    } while ((tries_left > 0) && ((gpu_notifier != new_gpu_notifier) || (gpu_notifier % 2)));

    if (!tries_left) {
        status = NV_ERR_INVALID_STATE;
-        goto error;
    }
+    else {
+        NvU32 key_version;
+        const NvU32 iv_index = (gpu_notifier / 2) % channel->num_gpfifo_entries;
+        NvU32 new_semaphore_value;
+
+        UVM_ASSERT(gpu_notifier == new_gpu_notifier);
+        UVM_ASSERT(gpu_notifier % 2 == 0);
+
+        // CPU decryption is guaranteed to use the same key version as the
+        // associated GPU encryption, because if there was any key rotation in
+        // between, then key rotation waited for all channels to complete before
+        // proceeding. The wait implies that the semaphore value matches the
+        // last one encrypted on the GPU, so this CPU decryption should happen
+        // before the key is rotated.
+        key_version = uvm_channel_pool_key_version(channel->pool);

-    if (gpu_notifier == new_gpu_notifier) {
        status = uvm_conf_computing_cpu_decrypt(channel,
-                                                &new_sem_value,
+                                                &new_semaphore_value,
                                                &local_payload,
-                                                &local_iv,
-                                                sizeof(new_sem_value),
+                                                &semaphore->conf_computing.ivs[iv_index],
+                                                key_version,
+                                                sizeof(new_semaphore_value),
                                                &local_auth_tag);

        if (status != NV_OK)
            goto error;

-        uvm_gpu_semaphore_set_payload(semaphore, new_sem_value);
+        uvm_gpu_semaphore_set_payload(semaphore, new_semaphore_value);
        UVM_WRITE_ONCE(semaphore->conf_computing.last_observed_notifier, new_gpu_notifier);
-    }

-    return;
+        return;
+    }

 error:
    // Decryption failure is a fatal error as well as running out of try left.
@@ -650,11 +725,11 @@ static NvU64 update_completed_value_locked(uvm_gpu_tracking_semaphore_t *trackin
    else
        uvm_assert_spinlock_locked(&tracking_semaphore->s_lock);

-    if (tracking_semaphore->semaphore.conf_computing.encrypted_payload) {
+    if (gpu_semaphore_is_secure(&tracking_semaphore->semaphore)) {
        // TODO: Bug 4008734: [UVM][HCC] Extend secure tracking semaphore
        //                     mechanism to all semaphore
        uvm_channel_t *channel = container_of(tracking_semaphore, uvm_channel_t, tracking_sem);
-        uvm_gpu_semaphore_encrypted_payload_update(channel, &tracking_semaphore->semaphore);
+        gpu_semaphore_encrypted_payload_update(channel, &tracking_semaphore->semaphore);
    }

    new_sem_value = uvm_gpu_semaphore_get_payload(&tracking_semaphore->semaphore);
@@ -690,7 +765,7 @@ static NvU64 update_completed_value_locked(uvm_gpu_tracking_semaphore_t *trackin
    UVM_ASSERT_MSG_RELEASE(new_value - old_value <= UVM_GPU_SEMAPHORE_MAX_JUMP,
                           "GPU %s unexpected semaphore (CPU VA 0x%llx) jump from 0x%llx to 0x%llx\n",
                           uvm_gpu_name(tracking_semaphore->semaphore.page->pool->gpu),
-                           (NvU64)(uintptr_t)tracking_semaphore->semaphore.payload,
+                           (NvU64)(uintptr_t)uvm_gpu_semaphore_get_cpu_va(&tracking_semaphore->semaphore),
                           old_value, new_value);

    // Use an atomic write even though the lock is held so that the value can
--- a/kernel-open/nvidia-uvm/uvm_gpu_semaphore.h
+++ b/kernel-open/nvidia-uvm/uvm_gpu_semaphore.h
@@ -29,6 +29,8 @@
 #include "uvm_rm_mem.h"
 #include "uvm_linux.h"

+typedef NvU32 uvm_gpu_semaphore_notifier_t;
+
 // A GPU semaphore is a memory location accessible by the GPUs and the CPU
 // that's used for synchronization among them.
 // The GPU has primitives to acquire (wait for) and release (set) 4-byte memory
@@ -45,17 +47,15 @@ struct uvm_gpu_semaphore_struct
    // The semaphore pool page the semaphore came from
    uvm_gpu_semaphore_pool_page_t *page;

-    // Pointer to the memory location
-    NvU32 *payload;
+    // Index of the semaphore within its semaphore pool page
+    NvU16 index;
+
    struct {
-        NvU16 index;
-        NvU32 cached_payload;
-        uvm_rm_mem_t *encrypted_payload;
-        uvm_rm_mem_t *notifier;
-        uvm_rm_mem_t *auth_tag;
        UvmCslIv *ivs;
-        NvU32 last_pushed_notifier;
-        NvU32 last_observed_notifier;
+        NvU32 cached_payload;
+
+        uvm_gpu_semaphore_notifier_t last_pushed_notifier;
+        uvm_gpu_semaphore_notifier_t last_observed_notifier;
    } conf_computing;
 };

@@ -151,6 +151,17 @@ NvU64 uvm_gpu_semaphore_get_gpu_proxy_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu

 NvU64 uvm_gpu_semaphore_get_gpu_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu_t *gpu, bool is_proxy_va_space);

+NvU32 *uvm_gpu_semaphore_get_cpu_va(uvm_gpu_semaphore_t *semaphore);
+
+NvU32 *uvm_gpu_semaphore_get_encrypted_payload_cpu_va(uvm_gpu_semaphore_t *semaphore);
+uvm_gpu_address_t uvm_gpu_semaphore_get_encrypted_payload_gpu_va(uvm_gpu_semaphore_t *semaphore);
+
+uvm_gpu_semaphore_notifier_t *uvm_gpu_semaphore_get_notifier_cpu_va(uvm_gpu_semaphore_t *semaphore);
+uvm_gpu_address_t uvm_gpu_semaphore_get_notifier_gpu_va(uvm_gpu_semaphore_t *semaphore);
+
+void *uvm_gpu_semaphore_get_auth_tag_cpu_va(uvm_gpu_semaphore_t *semaphore);
+uvm_gpu_address_t uvm_gpu_semaphore_get_auth_tag_gpu_va(uvm_gpu_semaphore_t *semaphore);
+
 // Read the 32-bit payload of the semaphore
 // Notably doesn't provide any memory ordering guarantees and needs to be used with
 // care. For an example of what needs to be considered see
--- a/kernel-open/nvidia-uvm/uvm_hmm.c
+++ b/kernel-open/nvidia-uvm/uvm_hmm.c
@@ -284,8 +284,10 @@ static void hmm_va_block_unregister_gpu(uvm_va_block_t *va_block,

    // Reset preferred location and accessed-by of policy nodes if needed.
    uvm_for_each_va_policy_node_in(node, va_block, va_block->start, va_block->end) {
-        if (uvm_id_equal(node->policy.preferred_location, gpu->id))
+        if (uvm_va_policy_preferred_location_equal(&node->policy, gpu->id, NUMA_NO_NODE)) {
            node->policy.preferred_location = UVM_ID_INVALID;
+            node->policy.preferred_nid = NUMA_NO_NODE;
+        }

        uvm_processor_mask_clear(&node->policy.accessed_by, gpu->id);
    }
--- a/kernel-open/nvidia-uvm/uvm_lock.c
+++ b/kernel-open/nvidia-uvm/uvm_lock.c
@@ -27,7 +27,7 @@

 const char *uvm_lock_order_to_string(uvm_lock_order_t lock_order)
 {
-    BUILD_BUG_ON(UVM_LOCK_ORDER_COUNT != 34);
+    BUILD_BUG_ON(UVM_LOCK_ORDER_COUNT != 36);

    switch (lock_order) {
        UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_INVALID);
@@ -48,7 +48,9 @@ const char *uvm_lock_order_to_string(uvm_lock_order_t lock_order)
        UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CONF_COMPUTING_DMA_BUFFER_POOL);
        UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CHUNK_MAPPING);
        UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_PAGE_TREE);
+        UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_KEY_ROTATION);
        UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CSL_PUSH);
+        UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_KEY_ROTATION_WLC);
        UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CSL_WLC_PUSH);
        UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CSL_SEC2_PUSH);
        UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_PUSH);
--- a/kernel-open/nvidia-uvm/uvm_lock.h
+++ b/kernel-open/nvidia-uvm/uvm_lock.h
@@ -322,6 +322,15 @@
 //      Operations not allowed while holding this lock
 //      - GPU memory allocation which can evict
 //
+// - Channel pool key rotation lock
+//      Order: UVM_LOCK_ORDER_KEY_ROTATION
+//      Condition: Confidential Computing is enabled
+//      Mutex per channel pool
+//
+//      The lock ensures mutual exclusion during key rotation affecting all the
+//      channels in the associated pool. Key rotation in WLC pools is handled
+//      using a separate lock order, see UVM_LOCK_ORDER_KEY_ROTATION_WLC below.
+//
 // - CE channel CSL channel pool semaphore
 //      Order: UVM_LOCK_ORDER_CSL_PUSH
 //      Condition: The Confidential Computing feature is enabled
@@ -338,6 +347,15 @@
 //      Operations allowed while holding this lock
 //      - Pushing work to CE channels (except for WLC channels)
 //
+// - WLC channel pool key rotation lock
+//      Order: UVM_LOCK_ORDER_KEY_ROTATION_WLC
+//      Condition: Confidential Computing is enabled
+//      Mutex of WLC channel pool
+//
+//      The lock has the same purpose as the regular channel pool key rotation
+//      lock. Using a different lock order for WLC channels allows key rotation
+//      on those channels during indirect work submission.
+//
 // - WLC CSL channel pool semaphore
 //      Order: UVM_LOCK_ORDER_CSL_WLC_PUSH
 //      Condition: The Confidential Computing feature is enabled
@@ -484,7 +502,9 @@ typedef enum
    UVM_LOCK_ORDER_CONF_COMPUTING_DMA_BUFFER_POOL,
    UVM_LOCK_ORDER_CHUNK_MAPPING,
    UVM_LOCK_ORDER_PAGE_TREE,
+    UVM_LOCK_ORDER_KEY_ROTATION,
    UVM_LOCK_ORDER_CSL_PUSH,
+    UVM_LOCK_ORDER_KEY_ROTATION_WLC,
    UVM_LOCK_ORDER_CSL_WLC_PUSH,
    UVM_LOCK_ORDER_CSL_SEC2_PUSH,
    UVM_LOCK_ORDER_PUSH,
--- a/kernel-open/nvidia-uvm/uvm_map_external.c
+++ b/kernel-open/nvidia-uvm/uvm_map_external.c
@@ -39,6 +39,7 @@
 #include "uvm_pte_batch.h"
 #include "uvm_tlb_batch.h"
 #include "nv_uvm_interface.h"
+#include "nv_uvm_types.h"

 #include "uvm_pushbuffer.h"

@@ -101,11 +102,11 @@ static NV_STATUS uvm_pte_buffer_init(uvm_va_range_t *va_range,

    pte_buffer->va_range = va_range;
    pte_buffer->gpu = gpu;
-    pte_buffer->mapping_info.cachingType = map_rm_params->caching_type;
-    pte_buffer->mapping_info.mappingType = map_rm_params->mapping_type;
-    pte_buffer->mapping_info.formatType = map_rm_params->format_type;
-    pte_buffer->mapping_info.elementBits = map_rm_params->element_bits;
-    pte_buffer->mapping_info.compressionType = map_rm_params->compression_type;
+    pte_buffer->mapping_info.cachingType        = (UvmRmGpuCachingType) map_rm_params->caching_type;
+    pte_buffer->mapping_info.mappingType        = (UvmRmGpuMappingType) map_rm_params->mapping_type;
+    pte_buffer->mapping_info.formatType         = (UvmRmGpuFormatType) map_rm_params->format_type;
+    pte_buffer->mapping_info.elementBits        = (UvmRmGpuFormatElementBits) map_rm_params->element_bits;
+    pte_buffer->mapping_info.compressionType    = (UvmRmGpuCompressionType) map_rm_params->compression_type;
    if (va_range->type == UVM_VA_RANGE_TYPE_EXTERNAL)
        pte_buffer->mapping_info.mappingPageSize = page_size;

--- a/kernel-open/nvidia-uvm/uvm_migrate.c
+++ b/kernel-open/nvidia-uvm/uvm_migrate.c
@@ -589,7 +589,7 @@ static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space,
                    skipped_migrate = true;
            }
            else if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, dest_id) &&
-                     !uvm_id_equal(dest_id, policy->preferred_location)) {
+                     !uvm_va_policy_preferred_location_equal(policy, dest_id, NUMA_NO_NODE)) {
                // Don't migrate to a non-faultable GPU that is in UVM-Lite mode,
                // unless it's the preferred location
                status = NV_ERR_INVALID_DEVICE;
--- a/kernel-open/nvidia-uvm/uvm_pmm_sysmem.c
+++ b/kernel-open/nvidia-uvm/uvm_pmm_sysmem.c
@@ -126,7 +126,7 @@ NV_STATUS uvm_pmm_sysmem_mappings_add_gpu_mapping(uvm_pmm_sysmem_mappings_t *sys
            NvU64 remove_key;

            for (remove_key = base_key; remove_key < key; ++remove_key)
-                (void *)radix_tree_delete(&sysmem_mappings->reverse_map_tree, remove_key);
+                (void)radix_tree_delete(&sysmem_mappings->reverse_map_tree, remove_key);

            kmem_cache_free(g_reverse_page_map_cache, new_reverse_map);
            status = errno_to_nv_status(ret);
--- a/kernel-open/nvidia-uvm/uvm_policy.c
+++ b/kernel-open/nvidia-uvm/uvm_policy.c
@@ -671,6 +671,9 @@ static NV_STATUS va_block_set_read_duplication_locked(uvm_va_block_t *va_block,

    uvm_assert_mutex_locked(&va_block->lock);

+    // Force CPU page residency to be on the preferred NUMA node.
+    va_block_context->make_resident.dest_nid = uvm_va_range_get_policy(va_block->va_range)->preferred_nid;
+
    for_each_id_in_mask(src_id, &va_block->resident) {
        NV_STATUS status;
        uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id, NUMA_NO_NODE);
--- a/kernel-open/nvidia-uvm/uvm_processors.c
+++ b/kernel-open/nvidia-uvm/uvm_processors.c
@@ -100,16 +100,8 @@ void uvm_parent_gpus_from_processor_mask(uvm_parent_processor_mask_t *parent_mas

 bool uvm_numa_id_eq(int nid0, int nid1)
 {
-    UVM_ASSERT(nid0 == -1 || nid0 < MAX_NUMNODES);
-    UVM_ASSERT(nid1 == -1 || nid1 < MAX_NUMNODES);
-
-    if ((nid0 == NUMA_NO_NODE || nid1 == NUMA_NO_NODE) && nodes_weight(node_possible_map) == 1) {
-        if (nid0 == NUMA_NO_NODE)
-            nid0 = first_node(node_possible_map);
-
-        if (nid1 == NUMA_NO_NODE)
-            nid1 = first_node(node_possible_map);
-    }
+    UVM_ASSERT(nid0 >= NUMA_NO_NODE && nid0 < MAX_NUMNODES);
+    UVM_ASSERT(nid1 >= NUMA_NO_NODE && nid1 < MAX_NUMNODES);

    return nid0 == nid1;
 }
--- a/kernel-open/nvidia-uvm/uvm_push.h
+++ b/kernel-open/nvidia-uvm/uvm_push.h
@@ -65,9 +65,12 @@ typedef enum
 } uvm_push_flag_t;

 struct uvm_push_crypto_bundle_struct {
-    // Initialization vector used to decrypt the push
+    // Initialization vector used to decrypt the push on the CPU
    UvmCslIv iv;

+    // Key version used to decrypt the push on the CPU
+    NvU32 key_version;
+
    // Size of the pushbuffer that is encrypted/decrypted
    NvU32 push_size;
 };
--- a/kernel-open/nvidia-uvm/uvm_pushbuffer.c
+++ b/kernel-open/nvidia-uvm/uvm_pushbuffer.c
@@ -451,7 +451,6 @@ static uvm_pushbuffer_chunk_t *gpfifo_to_chunk(uvm_pushbuffer_t *pushbuffer, uvm
 static void decrypt_push(uvm_channel_t *channel, uvm_gpfifo_entry_t *gpfifo)
 {
    NV_STATUS status;
-    NvU32 auth_tag_offset;
    void *auth_tag_cpu_va;
    void *push_protected_cpu_va;
    void *push_unprotected_cpu_va;
@@ -470,16 +469,15 @@ static void decrypt_push(uvm_channel_t *channel, uvm_gpfifo_entry_t *gpfifo)
    UVM_ASSERT(!uvm_channel_is_wlc(channel));
    UVM_ASSERT(!uvm_channel_is_lcic(channel));

-    push_protected_cpu_va = (char *)get_base_cpu_va(pushbuffer) + pushbuffer_offset;
+    push_protected_cpu_va = get_base_cpu_va(pushbuffer) + pushbuffer_offset;
    push_unprotected_cpu_va = (char *)uvm_rm_mem_get_cpu_va(pushbuffer->memory_unprotected_sysmem) + pushbuffer_offset;
-    auth_tag_offset = push_info_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
-    auth_tag_cpu_va = (char *)uvm_rm_mem_get_cpu_va(channel->conf_computing.push_crypto_bundle_auth_tags) +
-                              auth_tag_offset;
+    auth_tag_cpu_va = uvm_channel_get_push_crypto_bundle_auth_tags_cpu_va(channel, push_info_index);

    status = uvm_conf_computing_cpu_decrypt(channel,
                                            push_protected_cpu_va,
                                            push_unprotected_cpu_va,
                                            &crypto_bundle->iv,
+                                            crypto_bundle->key_version,
                                            crypto_bundle->push_size,
                                            auth_tag_cpu_va);

@@ -558,7 +556,7 @@ NvU64 uvm_pushbuffer_get_gpu_va_for_push(uvm_pushbuffer_t *pushbuffer, uvm_push_
    if (uvm_channel_is_wlc(push->channel) || uvm_channel_is_lcic(push->channel)) {
        // We need to use the same static locations for PB as the fixed
        // schedule because that's what the channels are initialized to use.
-        return uvm_rm_mem_get_gpu_uvm_va(push->channel->conf_computing.static_pb_protected_vidmem, gpu);
+        return uvm_channel_get_static_pb_protected_vidmem_gpu_va(push->channel);
    }
    else if (uvm_channel_is_sec2(push->channel)) {
        // SEC2 PBs are in unprotected sysmem
@@ -575,7 +573,7 @@ void *uvm_pushbuffer_get_unprotected_cpu_va_for_push(uvm_pushbuffer_t *pushbuffe
    if (uvm_channel_is_wlc(push->channel)) {
        // Reuse existing WLC static pb for initialization
        UVM_ASSERT(!uvm_channel_manager_is_wlc_ready(push->channel->pool->manager));
-        return push->channel->conf_computing.static_pb_unprotected_sysmem_cpu;
+        return uvm_channel_get_static_pb_unprotected_sysmem_cpu(push->channel);
    }

    pushbuffer_base = uvm_rm_mem_get_cpu_va(pushbuffer->memory_unprotected_sysmem);
@@ -590,8 +588,8 @@ NvU64 uvm_pushbuffer_get_unprotected_gpu_va_for_push(uvm_pushbuffer_t *pushbuffe
    if (uvm_channel_is_wlc(push->channel)) {
        // Reuse existing WLC static pb for initialization
        UVM_ASSERT(!uvm_channel_manager_is_wlc_ready(push->channel->pool->manager));
-        return uvm_rm_mem_get_gpu_uvm_va(push->channel->conf_computing.static_pb_unprotected_sysmem,
-                                         uvm_push_get_gpu(push));
+
+        return uvm_channel_get_static_pb_unprotected_sysmem_gpu_va(push->channel);
    }

    pushbuffer_base = uvm_rm_mem_get_gpu_uvm_va(pushbuffer->memory_unprotected_sysmem, uvm_push_get_gpu(push));
--- a/kernel-open/nvidia-uvm/uvm_sec2_test.c
+++ b/kernel-open/nvidia-uvm/uvm_sec2_test.c
@@ -322,6 +322,7 @@ static NV_STATUS cpu_decrypt(uvm_channel_t *channel,
                             uvm_mem_t *dst_mem,
                             uvm_mem_t *src_mem,
                             UvmCslIv *decrypt_iv,
+                             NvU32 key_version,
                             uvm_mem_t *auth_tag_mem,
                             size_t size,
                             size_t copy_size)
@@ -338,6 +339,7 @@ static NV_STATUS cpu_decrypt(uvm_channel_t *channel,
                                                         dst_plain,
                                                         src_cipher,
                                                         &decrypt_iv[i],
+                                                         key_version,
                                                         copy_size,
                                                         auth_tag_buffer));

@@ -368,7 +370,7 @@ static void gpu_encrypt(uvm_push_t *push,
    uvm_gpu_address_t auth_tag_address = uvm_mem_gpu_address_virtual_kernel(auth_tag_mem, gpu);

    for (i = 0; i < num_iterations; i++) {
-        uvm_conf_computing_log_gpu_encryption(push->channel, decrypt_iv);
+        uvm_conf_computing_log_gpu_encryption(push->channel, copy_size, decrypt_iv);

        if (i > 0)
            uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
@@ -427,6 +429,7 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu, size_t copy_size, siz
    size_t auth_tag_buffer_size = (size / copy_size) * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
    uvm_push_t push;
    UvmCslIv *decrypt_iv;
+    NvU32 key_version;

    decrypt_iv = uvm_kvmalloc_zero((size / copy_size) * sizeof(UvmCslIv));
    if (!decrypt_iv)
@@ -456,6 +459,11 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu, size_t copy_size, siz

    gpu_encrypt(&push, dst_cipher, dst_plain, decrypt_iv, auth_tag_mem, size, copy_size);

+    // There shouldn't be any key rotation between the end of the push and the
+    // CPU decryption(s), but it is more robust against test changes to force
+    // decryption to use the saved key.
+    key_version = uvm_channel_pool_key_version(push.channel->pool);
+
    TEST_NV_CHECK_GOTO(uvm_push_end_and_wait(&push), out);

    TEST_CHECK_GOTO(!mem_match(src_plain, src_cipher), out);
@@ -465,6 +473,7 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu, size_t copy_size, siz
                                   dst_plain_cpu,
                                   dst_cipher,
                                   decrypt_iv,
+                                   key_version,
                                   auth_tag_mem,
                                   size,
                                   copy_size),
--- a/kernel-open/nvidia-uvm/uvm_test.c
+++ b/kernel-open/nvidia-uvm/uvm_test.c
@@ -124,24 +124,23 @@ static NV_STATUS uvm_test_verify_bh_affinity(uvm_intr_handler_t *isr, int node)
 static NV_STATUS uvm_test_numa_check_affinity(UVM_TEST_NUMA_CHECK_AFFINITY_PARAMS *params, struct file *filp)
 {
    uvm_gpu_t *gpu;
-    NV_STATUS status;
-    uvm_rm_user_object_t user_rm_va_space = {
-        .rm_control_fd = -1,
-        .user_client = params->client,
-        .user_object = params->smc_part_ref
-    };
+    NV_STATUS status = NV_OK;

    if (!UVM_THREAD_AFFINITY_SUPPORTED())
        return NV_ERR_NOT_SUPPORTED;

-    status = uvm_gpu_retain_by_uuid(&params->gpu_uuid, &user_rm_va_space, &gpu);
-    if (status != NV_OK)
-        return status;
+    uvm_mutex_lock(&g_uvm_global.global_lock);
+
+    gpu = uvm_gpu_get_by_uuid(&params->gpu_uuid);
+    if (!gpu) {
+        status = NV_ERR_INVALID_DEVICE;
+        goto unlock;
+    }

    // If the GPU is not attached to a NUMA node, there is nothing to do.
    if (gpu->parent->closest_cpu_numa_node == NUMA_NO_NODE) {
        status = NV_ERR_NOT_SUPPORTED;
-        goto release;
+        goto unlock;
    }

    if (gpu->parent->replayable_faults_supported) {
@@ -150,7 +149,7 @@ static NV_STATUS uvm_test_numa_check_affinity(UVM_TEST_NUMA_CHECK_AFFINITY_PARAM
                                              gpu->parent->closest_cpu_numa_node);
        uvm_parent_gpu_replayable_faults_isr_unlock(gpu->parent);
        if (status != NV_OK)
-            goto release;
+            goto unlock;

        if (gpu->parent->non_replayable_faults_supported) {
            uvm_parent_gpu_non_replayable_faults_isr_lock(gpu->parent);
@@ -158,7 +157,7 @@ static NV_STATUS uvm_test_numa_check_affinity(UVM_TEST_NUMA_CHECK_AFFINITY_PARAM
                                                  gpu->parent->closest_cpu_numa_node);
            uvm_parent_gpu_non_replayable_faults_isr_unlock(gpu->parent);
            if (status != NV_OK)
-                goto release;
+                goto unlock;
        }

        if (gpu->parent->access_counters_supported) {
@@ -168,8 +167,9 @@ static NV_STATUS uvm_test_numa_check_affinity(UVM_TEST_NUMA_CHECK_AFFINITY_PARAM
            uvm_parent_gpu_access_counters_isr_unlock(gpu->parent);
        }
    }
-release:
-    uvm_gpu_release(gpu);
+
+unlock:
+    uvm_mutex_unlock(&g_uvm_global.global_lock);
    return status;
 }

--- a/kernel-open/nvidia-uvm/uvm_test_ioctl.h
+++ b/kernel-open/nvidia-uvm/uvm_test_ioctl.h
@@ -347,20 +347,30 @@ typedef enum
    UVM_TEST_CHANNEL_STRESS_MODE_NOOP_PUSH = 0,
    UVM_TEST_CHANNEL_STRESS_MODE_UPDATE_CHANNELS,
    UVM_TEST_CHANNEL_STRESS_MODE_STREAM,
+    UVM_TEST_CHANNEL_STRESS_MODE_KEY_ROTATION,
 } UVM_TEST_CHANNEL_STRESS_MODE;

+typedef enum
+{
+    UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_CPU_TO_GPU,
+    UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_GPU_TO_CPU,
+    UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_ROTATE,
+} UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION;
+
 #define UVM_TEST_CHANNEL_STRESS                          UVM_TEST_IOCTL_BASE(15)
 typedef struct
 {
-    NvU32     mode;                   // In
+    NvU32     mode;                   // In, one of UVM_TEST_CHANNEL_STRESS_MODE

    // Number of iterations:
    //   mode == NOOP_PUSH: number of noop pushes
    //   mode == UPDATE_CHANNELS: number of updates
    //   mode == STREAM: number of iterations per stream
+    //   mode == ROTATION: number of operations
    NvU32     iterations;

-    NvU32     num_streams;            // In, used only for mode == UVM_TEST_CHANNEL_STRESS_MODE_STREAM
+    NvU32     num_streams;            // In, used only if mode == STREAM
+    NvU32     key_rotation_operation; // In, used only if mode == ROTATION
    NvU32     seed;                   // In
    NvU32     verbose;                // In
    NV_STATUS rmStatus;               // Out
@@ -1210,8 +1220,6 @@ typedef struct
 typedef struct
 {
    NvProcessorUuid                 gpu_uuid;                                           // In
-    NvHandle                        client;                                             // In
-    NvHandle                        smc_part_ref;                                       // In

    NV_STATUS                       rmStatus;                                           // Out
 } UVM_TEST_NUMA_CHECK_AFFINITY_PARAMS;
--- a/kernel-open/nvidia-uvm/uvm_va_block.c
+++ b/kernel-open/nvidia-uvm/uvm_va_block.c
@@ -725,8 +725,9 @@ bool uvm_va_block_cpu_is_region_resident_on(uvm_va_block_t *va_block, int nid, u
 }

 // Return the preferred NUMA node ID for the block's policy.
-// If the preferred node ID is NUMA_NO_NODE, the current NUMA node ID
-// is returned.
+// If the preferred node ID is NUMA_NO_NODE, the nearest NUMA node ID
+// with memory is returned. In most cases, this should be the current
+// NUMA node.
 static int uvm_va_block_context_get_node(uvm_va_block_context_t *va_block_context)
 {
    if (va_block_context->make_resident.dest_nid != NUMA_NO_NODE)
@@ -2070,6 +2071,7 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
    uvm_page_mask_t *allocated_mask;
    uvm_cpu_chunk_alloc_flags_t alloc_flags = UVM_CPU_CHUNK_ALLOC_FLAGS_NONE;
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
+    const uvm_va_policy_t *policy = uvm_va_policy_get_region(block, populate_region);
    uvm_page_index_t page_index;
    uvm_gpu_id_t id;
    int preferred_nid = block_context->make_resident.dest_nid;
@@ -2077,6 +2079,10 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
    if (block_test && block_test->cpu_chunk_allocation_target_id != NUMA_NO_NODE)
        preferred_nid = block_test->cpu_chunk_allocation_target_id;

+    // If the VA range has a preferred NUMA node, use it.
+    if (preferred_nid == NUMA_NO_NODE)
+        preferred_nid = policy->preferred_nid;
+
    // TODO: Bug 4158598: Using NUMA_NO_NODE for staging allocations is sub-optimal.
    if (preferred_nid != NUMA_NO_NODE) {
        uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, preferred_nid);
@@ -2127,13 +2133,12 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
        uvm_page_mask_t *node_pages_mask = &block_context->make_resident.node_pages_mask;
        uvm_chunk_sizes_mask_t allocation_sizes;

-        if (uvm_page_mask_test(allocated_mask, page_index)) {
+        if (uvm_page_mask_test(allocated_mask, page_index) ||
+            uvm_va_block_cpu_is_page_resident_on(block, preferred_nid, page_index)) {
            page_index = uvm_va_block_next_unset_page_in_mask(populate_region, allocated_mask, page_index) - 1;
            continue;
        }

-        UVM_ASSERT(!uvm_va_block_cpu_is_page_resident_on(block, preferred_nid, page_index));
-
        allocation_sizes = block_calculate_largest_alloc_size(block,
                                                              page_index,
                                                              allocated_mask,
@@ -3843,6 +3848,7 @@ static void conf_computing_block_copy_push_gpu_to_cpu(uvm_va_block_t *block,
    uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
    uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
    uvm_gpu_address_t src_address = block_copy_get_address(block, &copy_state->src, page_index, gpu);
+    NvU32 key_version = uvm_channel_pool_key_version(push->channel->pool);

    UVM_ASSERT(UVM_ID_IS_GPU(copy_state->src.id));
    UVM_ASSERT(UVM_ID_IS_CPU(copy_state->dst.id));
@@ -3860,7 +3866,8 @@ static void conf_computing_block_copy_push_gpu_to_cpu(uvm_va_block_t *block,
    // crypto-operations and it only guarantees PAGE_SIZE contiguity, all
    // encryptions and decryptions must happen on a PAGE_SIZE basis.
    for_each_va_block_page_in_region(page_index, region) {
-        uvm_conf_computing_log_gpu_encryption(push->channel, &dma_buffer->decrypt_iv[page_index]);
+        uvm_conf_computing_log_gpu_encryption(push->channel, PAGE_SIZE, &dma_buffer->decrypt_iv[page_index]);
+        dma_buffer->key_version[page_index] = key_version;

        // All but the first encryption can be pipelined. The first encryption
        // uses the caller's pipelining settings.
@@ -3919,7 +3926,8 @@ static NV_STATUS conf_computing_copy_pages_finish(uvm_va_block_t *block,
        status = uvm_conf_computing_cpu_decrypt(push->channel,
                                                cpu_page_address,
                                                staging_buffer,
-                                                &dma_buffer->decrypt_iv[page_index],
+                                                dma_buffer->decrypt_iv + page_index,
+                                                dma_buffer->key_version[page_index],
                                                PAGE_SIZE,
                                                auth_tag_buffer);
        kunmap(dst_page);
@@ -4037,7 +4045,7 @@ static NV_STATUS block_copy_pages(uvm_va_block_t *va_block,

        UVM_ASSERT(dst_chunk);
        UVM_ASSERT(uvm_cpu_chunk_get_size(src_chunk) >= uvm_va_block_region_size(region));
-        UVM_ASSERT(uvm_cpu_chunk_get_size(src_chunk) <= uvm_cpu_chunk_get_size(dst_chunk));
+        UVM_ASSERT(uvm_va_block_region_size(region) <= uvm_cpu_chunk_get_size(dst_chunk));

        // CPU-to-CPU copies using memcpy() don't have any inherent ordering with
        // copies using GPU CEs. So, we have to make sure that all previously
@@ -5132,7 +5140,7 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
    uvm_page_mask_t *dst_resident_mask;
    uvm_page_mask_t *migrated_pages;
    uvm_page_mask_t *staged_pages;
-    uvm_page_mask_t *first_touch_mask;
+    uvm_page_mask_t *scratch_residency_mask;

    // TODO: Bug 3660922: need to implement HMM read duplication support.
    UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
@@ -5151,6 +5159,10 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
    uvm_assert_mutex_locked(&va_block->lock);
    UVM_ASSERT(!uvm_va_block_is_dead(va_block));

+    scratch_residency_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS);
+    if (!scratch_residency_mask)
+        return NV_ERR_NO_MEMORY;
+
    // For pages that are entering read-duplication we need to unmap remote
    // mappings and revoke RW and higher access permissions.
    //
@@ -5177,12 +5189,12 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,

        status = block_prep_read_duplicate_mapping(va_block, va_block_context, src_id, region, preprocess_page_mask);
        if (status != NV_OK)
-            return status;
+            goto out;
    }

    status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask);
    if (status != NV_OK)
-        return status;
+        goto out;

    status = block_copy_resident_pages(va_block,
                                       va_block_context,
@@ -5192,22 +5204,17 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
                                       prefetch_page_mask,
                                       UVM_VA_BLOCK_TRANSFER_MODE_COPY);
    if (status != NV_OK)
-        return status;
+        goto out;

    // Pages that weren't resident anywhere else were populated at the
    // destination directly. Mark them as resident now, since there were no
    // errors from block_copy_resident_pages() above.
-    // Note that va_block_context->scratch_page_mask is passed to
-    // block_copy_set_first_touch_residency() which is generally unsafe but in
-    // this case, block_copy_set_first_touch_residency() copies page_mask
-    // before scratch_page_mask could be clobbered.
    migrated_pages = &va_block_context->make_resident.pages_migrated;
-    first_touch_mask = &va_block_context->scratch_page_mask;
-    uvm_page_mask_init_from_region(first_touch_mask, region, page_mask);
-    uvm_page_mask_andnot(first_touch_mask, first_touch_mask, migrated_pages);
+    uvm_page_mask_init_from_region(scratch_residency_mask, region, page_mask);
+    uvm_page_mask_andnot(scratch_residency_mask, scratch_residency_mask, migrated_pages);

-    if (!uvm_page_mask_empty(first_touch_mask))
-        block_copy_set_first_touch_residency(va_block, va_block_context, dest_id, region, first_touch_mask);
+    if (!uvm_page_mask_empty(scratch_residency_mask))
+        block_copy_set_first_touch_residency(va_block, va_block_context, dest_id, region, scratch_residency_mask);

    staged_pages = &va_block_context->make_resident.pages_staged;
    if (!UVM_ID_IS_CPU(dest_id) && !uvm_page_mask_empty(staged_pages)) {
@@ -5219,6 +5226,18 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,

    if (!uvm_page_mask_empty(migrated_pages)) {
        if (UVM_ID_IS_CPU(dest_id)) {
+            // Check if the CPU is already in the resident set of processors.
+            // We need to do this since we can't have multiple NUMA nodes with
+            // resident pages.
+            // If any of the migrated pages were already resident on the CPU, the
+            // residency has to be switched to the destination NUMA node.
+            if (uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) &&
+                uvm_page_mask_and(scratch_residency_mask,
+                                  uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE),
+                                  migrated_pages)) {
+                uvm_va_block_cpu_clear_resident_all_chunks(va_block, va_block_context, scratch_residency_mask);
+            }
+
            uvm_va_block_cpu_set_resident_all_chunks(va_block, va_block_context, migrated_pages);
        }
        else {
@@ -5247,7 +5266,9 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
    // Check state of all chunks after residency change.
    // TODO: Bug 4207783: Check both CPU and GPU chunks.
    UVM_ASSERT(block_check_cpu_chunks(va_block));
-    return NV_OK;
+out:
+    kmem_cache_free(g_uvm_page_mask_cache, scratch_residency_mask);
+    return status;
 }

 // Looks up the current CPU mapping state of page from the
@@ -5532,13 +5553,15 @@ static bool block_check_mappings_page(uvm_va_block_t *block,
                   *block->read_duplicated_pages.bitmap);

    // Test read_duplicated_pages mask
-    UVM_ASSERT_MSG((uvm_processor_mask_get_count(resident_processors) <= 1 &&
-                     !uvm_page_mask_test(&block->read_duplicated_pages, page_index)) ||
-                   (uvm_processor_mask_get_count(resident_processors) > 1 &&
-                     uvm_page_mask_test(&block->read_duplicated_pages, page_index)),
+    UVM_ASSERT_MSG((!uvm_page_mask_test(&block->read_duplicated_pages, page_index) &&
+                    uvm_processor_mask_get_count(resident_processors) <= 1) ||
+                   (uvm_page_mask_test(&block->read_duplicated_pages, page_index) &&
+                    uvm_processor_mask_get_count(resident_processors) >= 1),
                   "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
                   *resident_processors->bitmap,
-                   *read_mappings->bitmap, *write_mappings->bitmap, *atomic_mappings->bitmap,
+                   *read_mappings->bitmap,
+                   *write_mappings->bitmap,
+                   *atomic_mappings->bitmap,
                   *va_space->system_wide_atomics_enabled_processors.bitmap,
                   *block->read_duplicated_pages.bitmap);

@@ -6022,7 +6045,7 @@ static bool block_has_remote_mapping_gpu(uvm_va_block_t *block,
        if (uvm_page_mask_empty(mapped_pages))
            return false;

-        return !uvm_id_equal(uvm_va_range_get_policy(block->va_range)->preferred_location, gpu_id);
+        return !uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(block->va_range), gpu_id, NUMA_NO_NODE);
    }

    // Remote pages are pages which are mapped but not resident locally
@@ -8365,6 +8388,7 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
                                  uvm_va_block_context_t *block_context,
                                  uvm_gpu_t *gpu,
                                  uvm_processor_id_t resident_id,
+                                  int resident_nid,
                                  uvm_page_mask_t *map_page_mask,
                                  uvm_prot_t new_prot,
                                  uvm_tracker_t *out_tracker)
@@ -8374,7 +8398,7 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
    uvm_push_t push;
    NV_STATUS status;
    uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask;
-    const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id, NUMA_NO_NODE);
+    const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id, resident_nid);
    uvm_pte_bits_gpu_t pte_bit;
    uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(new_prot);
    uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
@@ -8383,8 +8407,10 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
    UVM_ASSERT(map_page_mask);
    UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], gpu->id));

-    if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id))
-        UVM_ASSERT(uvm_id_equal(resident_id, uvm_va_range_get_policy(va_block->va_range)->preferred_location));
+    if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id)) {
+        uvm_va_policy_t *policy = uvm_va_range_get_policy(va_block->va_range);
+        UVM_ASSERT(uvm_va_policy_preferred_location_equal(policy, resident_id, policy->preferred_nid));
+    }

    UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask,
                                  map_page_mask,
@@ -8486,18 +8512,27 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
    return uvm_tracker_add_push_safe(out_tracker, &push);
 }

+// allowed_nid_mask is only valid if the CPU is set in allowed_mask.
 static void map_get_allowed_destinations(uvm_va_block_t *block,
                                         uvm_va_block_context_t *va_block_context,
                                         const uvm_va_policy_t *policy,
                                         uvm_processor_id_t id,
-                                         uvm_processor_mask_t *allowed_mask)
+                                         uvm_processor_mask_t *allowed_mask,
+                                         nodemask_t *allowed_nid_mask)
 {
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);

+    *allowed_nid_mask = node_possible_map;
+
    if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), id)) {
        // UVM-Lite can only map resident pages on the preferred location
        uvm_processor_mask_zero(allowed_mask);
        uvm_processor_mask_set(allowed_mask, policy->preferred_location);
+        if (UVM_ID_IS_CPU(policy->preferred_location) &&
+            !uvm_va_policy_preferred_location_equal(policy, UVM_ID_CPU, NUMA_NO_NODE)) {
+            nodes_clear(*allowed_nid_mask);
+            node_set(policy->preferred_nid, *allowed_nid_mask);
+        }
    }
    else if ((uvm_va_policy_is_read_duplicate(policy, va_space) ||
              (uvm_id_equal(policy->preferred_location, id) &&
@@ -8540,6 +8575,7 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
    uvm_page_mask_t *running_page_mask = &va_block_context->mapping.map_running_page_mask;
    NV_STATUS status = NV_OK;
    const uvm_va_policy_t *policy = uvm_va_policy_get_region(va_block, region);
+    nodemask_t *allowed_nid_destinations;

    va_block_context->mapping.cause = cause;

@@ -8589,10 +8625,20 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
    if (!allowed_destinations)
        return NV_ERR_NO_MEMORY;

+    allowed_nid_destinations = uvm_kvmalloc(sizeof(*allowed_nid_destinations));
+    if (!allowed_nid_destinations) {
+        uvm_processor_mask_cache_free(allowed_destinations);
+        return NV_ERR_NO_MEMORY;
+    }
+
    // Map per resident location so we can more easily detect physically-
    // contiguous mappings.
-    map_get_allowed_destinations(va_block, va_block_context, policy, id, allowed_destinations);
-
+    map_get_allowed_destinations(va_block,
+                                 va_block_context,
+                                 policy,
+                                 id,
+                                 allowed_destinations,
+                                 allowed_nid_destinations);
    for_each_closest_id(resident_id, allowed_destinations, id, va_space) {
        if (UVM_ID_IS_CPU(id)) {
            status = block_map_cpu_to(va_block,
@@ -8603,11 +8649,30 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
                                      new_prot,
                                      out_tracker);
        }
+        else if (UVM_ID_IS_CPU(resident_id)) {
+            int nid;
+
+            // map_get_allowed_destinations() will set the mask of CPU NUMA
+            // nodes that should be mapped.
+            for_each_node_mask(nid, *allowed_nid_destinations) {
+                status = block_map_gpu_to(va_block,
+                                          va_block_context,
+                                          gpu,
+                                          resident_id,
+                                          nid,
+                                          running_page_mask,
+                                          new_prot,
+                                          out_tracker);
+                if (status != NV_OK)
+                    break;
+            }
+        }
        else {
            status = block_map_gpu_to(va_block,
                                      va_block_context,
                                      gpu,
                                      resident_id,
+                                      NUMA_NO_NODE,
                                      running_page_mask,
                                      new_prot,
                                      out_tracker);
@@ -8622,6 +8687,7 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
    }

    uvm_processor_mask_cache_free(allowed_destinations);
+    uvm_kvfree(allowed_nid_destinations);

    return status;
 }
@@ -11175,8 +11241,8 @@ NV_STATUS uvm_va_block_add_mappings_after_migration(uvm_va_block_t *va_block,
    // so uvm_va_block_map will be a no-op.
    uvm_processor_mask_and(map_uvm_lite_gpus, map_other_processors, block_get_uvm_lite_gpus(va_block));
    if (!uvm_processor_mask_empty(map_uvm_lite_gpus) &&
-        uvm_id_equal(new_residency, preferred_location)) {
-        for_each_id_in_mask(map_processor_id, map_uvm_lite_gpus) {
+        uvm_va_policy_preferred_location_equal(policy, new_residency, va_block_context->make_resident.dest_nid)) {
+        for_each_id_in_mask (map_processor_id, map_uvm_lite_gpus) {
            status = uvm_va_block_map(va_block,
                                      va_block_context,
                                      map_processor_id,
@@ -11637,6 +11703,10 @@ static int block_select_node_residency(uvm_va_block_t *va_block,
    // For GPU faults, the bottom half is pinned to CPUs closest to their GPU.
    // Therefore, in both cases, we can use numa_mem_id() to get the NUMA node
    // ID of the faulting processor.
+    // Note that numa_mem_id() returns the nearest node with memory. In most
+    // cases, this will be the current NUMA node. However, in the case that the
+    // current node does not have any memory, we probably want the nearest node
+    // with memory, anyway.
    int current_nid = numa_mem_id();
    bool may_read_duplicate = can_read_duplicate(va_block, page_index, policy, thrashing_hint);

@@ -11660,7 +11730,12 @@ static int block_select_node_residency(uvm_va_block_t *va_block,
    // If read duplication is enabled and the page is also resident on the CPU,
    // keep its current NUMA node residency.
    if (may_read_duplicate && uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index))
-        return block_get_page_node_residency(va_block, page_index);
+        return NUMA_NO_NODE;
+
+    // The new_residency processor is the CPU and the preferred location is not
+    // the CPU. If the page is resident on the CPU, keep its current residency.
+    if (uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index))
+        return NUMA_NO_NODE;

    return current_nid;
 }
@@ -12564,125 +12639,6 @@ NV_STATUS uvm_va_block_find_create(uvm_va_space_t *va_space,
        return uvm_hmm_va_block_find_create(va_space, addr, hmm_vma, out_block);
 }

-// Launch a synchronous, encrypted copy between GPU and CPU.
-//
-// The copy entails a GPU-side encryption (relying on the Copy Engine), and a
-// CPU-side decryption step, such that the destination CPU buffer pointed by
-// dst_plain will contain the unencrypted (plain text) contents. The destination
-// buffer can be in protected or unprotected sysmem, while the source buffer
-// must be in protected vidmem.
-//
-// The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
-//
-// The input tracker, if not NULL, is internally acquired by the push
-// responsible for the encrypted copy.
-__attribute__ ((format(printf, 6, 7)))
-static NV_STATUS encrypted_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
-                                              void *dst_plain,
-                                              uvm_gpu_address_t src_gpu_address,
-                                              size_t size,
-                                              uvm_tracker_t *tracker,
-                                              const char *format,
-                                              ...)
-{
-    NV_STATUS status;
-    UvmCslIv decrypt_iv;
-    uvm_push_t push;
-    uvm_conf_computing_dma_buffer_t *dma_buffer;
-    uvm_gpu_address_t dst_gpu_address, auth_tag_gpu_address;
-    void *src_cipher, *auth_tag;
-    va_list args;
-
-    UVM_ASSERT(g_uvm_global.conf_computing_enabled);
-    UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
-
-    status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
-    if (status != NV_OK)
-        return status;
-
-    va_start(args, format);
-    status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, tracker, &push, format, args);
-    va_end(args);
-
-    if (status != NV_OK)
-        goto out;
-
-    uvm_conf_computing_log_gpu_encryption(push.channel, &decrypt_iv);
-
-    dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
-    auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
-    gpu->parent->ce_hal->encrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
-
-    status = uvm_push_end_and_wait(&push);
-    if (status != NV_OK)
-        goto out;
-
-    src_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
-    auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
-    status = uvm_conf_computing_cpu_decrypt(push.channel, dst_plain, src_cipher, &decrypt_iv, size, auth_tag);
-
- out:
-    uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
-    return status;
-}
-
-// Launch a synchronous, encrypted copy between CPU and GPU.
-//
-// The source CPU buffer pointed by src_plain contains the unencrypted (plain
-// text) contents; the function internally performs a CPU-side encryption step
-// before launching the GPU-side CE decryption. The source buffer can be in
-// protected or unprotected sysmem, while the destination buffer must be in
-// protected vidmem.
-//
-// The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
-//
-// The input tracker, if not NULL, is internally acquired by the push
-// responsible for the encrypted copy.
-__attribute__ ((format(printf, 6, 7)))
-static NV_STATUS encrypted_memcopy_cpu_to_gpu(uvm_gpu_t *gpu,
-                                              uvm_gpu_address_t dst_gpu_address,
-                                              void *src_plain,
-                                              size_t size,
-                                              uvm_tracker_t *tracker,
-                                              const char *format,
-                                              ...)
-{
-    NV_STATUS status;
-    uvm_push_t push;
-    uvm_conf_computing_dma_buffer_t *dma_buffer;
-    uvm_gpu_address_t src_gpu_address, auth_tag_gpu_address;
-    void *dst_cipher, *auth_tag;
-    va_list args;
-
-    UVM_ASSERT(g_uvm_global.conf_computing_enabled);
-    UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
-
-    status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
-    if (status != NV_OK)
-        return status;
-
-    va_start(args, format);
-    status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, tracker, &push, format, args);
-    va_end(args);
-
-    if (status != NV_OK)
-        goto out;
-
-    dst_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
-    auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
-    uvm_conf_computing_cpu_encrypt(push.channel, dst_cipher, src_plain, NULL, size, auth_tag);
-
-    src_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
-    auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
-    gpu->parent->ce_hal->decrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
-
-    status = uvm_push_end_and_wait(&push);
-
-out:
-    uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
-    return status;
-}
-
 static NV_STATUS va_block_write_cpu_to_gpu(uvm_va_block_t *va_block,
                                           uvm_gpu_t *gpu,
                                           uvm_gpu_address_t dst_gpu_address,
@@ -12695,14 +12651,14 @@ static NV_STATUS va_block_write_cpu_to_gpu(uvm_va_block_t *va_block,
    uvm_gpu_address_t src_gpu_address;

    if (g_uvm_global.conf_computing_enabled) {
-        return encrypted_memcopy_cpu_to_gpu(gpu,
-                                            dst_gpu_address,
-                                            uvm_mem_get_cpu_addr_kernel(src_mem),
-                                            size,
-                                            &va_block->tracker,
-                                            "Encrypted write to [0x%llx, 0x%llx)",
-                                            dst,
-                                            dst + size);
+        return uvm_conf_computing_util_memcopy_cpu_to_gpu(gpu,
+                                                          dst_gpu_address,
+                                                          uvm_mem_get_cpu_addr_kernel(src_mem),
+                                                          size,
+                                                          &va_block->tracker,
+                                                          "Encrypted write to [0x%llx, 0x%llx)",
+                                                          dst,
+                                                          dst + size);
    }

    status = uvm_push_begin_acquire(gpu->channel_manager,
@@ -12799,14 +12755,14 @@ static NV_STATUS va_block_read_gpu_to_cpu(uvm_va_block_t *va_block,
    uvm_gpu_address_t dst_gpu_address;

    if (g_uvm_global.conf_computing_enabled) {
-        return encrypted_memcopy_gpu_to_cpu(gpu,
-                                            uvm_mem_get_cpu_addr_kernel(dst_mem),
-                                            src_gpu_address,
-                                            size,
-                                            &va_block->tracker,
-                                            "Encrypted read from [0x%llx, 0x%llx)",
-                                            src,
-                                            src + size);
+        return uvm_conf_computing_util_memcopy_gpu_to_cpu(gpu,
+                                                          uvm_mem_get_cpu_addr_kernel(dst_mem),
+                                                          src_gpu_address,
+                                                          size,
+                                                          &va_block->tracker,
+                                                          "Encrypted read from [0x%llx, 0x%llx)",
+                                                          src,
+                                                          src + size);
    }

    status = uvm_push_begin_acquire(gpu->channel_manager,
--- a/kernel-open/nvidia-uvm/uvm_va_policy.c
+++ b/kernel-open/nvidia-uvm/uvm_va_policy.c
@@ -105,6 +105,12 @@ bool uvm_va_policy_preferred_location_equal(const uvm_va_policy_t *policy, uvm_p
 {
    bool equal = uvm_id_equal(policy->preferred_location, proc);

+    if (!UVM_ID_IS_CPU(policy->preferred_location))
+        UVM_ASSERT(policy->preferred_nid == NUMA_NO_NODE);
+
+    if (!UVM_ID_IS_CPU(proc))
+        UVM_ASSERT(cpu_numa_id == NUMA_NO_NODE);
+
    if (equal && UVM_ID_IS_CPU(policy->preferred_location))
        equal = uvm_numa_id_eq(policy->preferred_nid, cpu_numa_id);

@@ -656,7 +662,7 @@ const uvm_va_policy_t *uvm_va_policy_set_preferred_location(uvm_va_block_t *va_b
        // and that the policy is changing.
        UVM_ASSERT(node->node.start >= start);
        UVM_ASSERT(node->node.end <= end);
-        UVM_ASSERT(!uvm_id_equal(node->policy.preferred_location, processor_id));
+        UVM_ASSERT(!uvm_va_policy_preferred_location_equal(&node->policy, processor_id, cpu_node_id));
    }

    node->policy.preferred_location = processor_id;
--- a/kernel-open/nvidia-uvm/uvm_va_range.c
+++ b/kernel-open/nvidia-uvm/uvm_va_range.c
@@ -868,9 +868,9 @@ static void uvm_va_range_disable_peer_managed(uvm_va_range_t *va_range, uvm_gpu_
        // preferred location. If peer mappings are being disabled to the
        // preferred location, then unmap the other GPU.
        // Nothing to do otherwise.
-        if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu0->id))
+        if (uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), gpu0->id, NUMA_NO_NODE))
            uvm_lite_gpu_to_unmap = gpu1;
-        else if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu1->id))
+        else if (uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), gpu1->id, NUMA_NO_NODE))
            uvm_lite_gpu_to_unmap = gpu0;
        else
            return;
@@ -951,7 +951,7 @@ static void va_range_unregister_gpu_managed(uvm_va_range_t *va_range, uvm_gpu_t
    // Reset preferred location and accessed-by of VA ranges if needed
    // Note: ignoring the return code of uvm_va_range_set_preferred_location since this
    // will only return on error when setting a preferred location, not on a reset
-    if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu->id))
+    if (uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), gpu->id, NUMA_NO_NODE))
        (void)uvm_va_range_set_preferred_location(va_range, UVM_ID_INVALID, NUMA_NO_NODE, mm, NULL);

    uvm_va_range_unset_accessed_by(va_range, gpu->id, NULL);
@@ -1683,7 +1683,7 @@ void uvm_va_range_unset_accessed_by(uvm_va_range_t *va_range,
    // If a UVM-Lite GPU is being removed from the accessed_by mask, it will
    // also stop being a UVM-Lite GPU unless it's also the preferred location.
    if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, processor_id) &&
-        !uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, processor_id)) {
+        !uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), processor_id, NUMA_NO_NODE)) {
        range_unmap(va_range, processor_id, out_tracker);
    }

--- a/kernel-open/nvidia/libspdm_internal_crypt_lib.c
+++ b/kernel-open/nvidia/libspdm_internal_crypt_lib.c
@@ -0,0 +1,42 @@
+/*
+* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+* SPDX-License-Identifier: MIT
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+* Comments, prototypes and checks taken from DMTF: Copyright 2021-2022 DMTF. All rights reserved.
+* License: BSD 3-Clause License. For full text see link: https://github.com/DMTF/libspdm/blob/main/LICENSE.md
+*/
+
+#include "os-interface.h"
+#include "internal_crypt_lib.h"
+#include "library/cryptlib.h"
+
+bool libspdm_check_crypto_backend(void) // Reports whether this build was compiled with the LKCA wrappers (USE_LKCA).
+{
+#ifdef USE_LKCA
+    nv_printf(NV_DBG_INFO, "libspdm_check_crypto_backend: LKCA wrappers found.\n");
+    nv_printf(NV_DBG_INFO, "libspdm_check_crypto_backend: LKCA calls may still fail if modules have not been loaded!\n");
+    return true; // Compile-time answer only; as the message above says, calls can still fail at run time.
+#else // !USE_LKCA
+    nv_printf(NV_DBG_ERRORS, "libspdm_check_crypto_backend: Error - libspdm expects LKCA but found stubs!\n");
+    return false; // Built without USE_LKCA; logged at error level above.
+#endif // USE_LKCA
+}
+
--- a/kernel-open/nvidia/nv-kthread-q.c
+++ b/kernel-open/nvidia/nv-kthread-q.c
@@ -201,7 +201,7 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),

        // Ran out of attempts - return thread even if its stack may not be
        // allocated on the preferred node
-        if ((i == (attempts - 1)))
+        if (i == (attempts - 1))
            break;

        // Get the NUMA node where the first page of the stack is resident. If
--- a/kernel-open/nvidia/nv-pci.c
+++ b/kernel-open/nvidia/nv-pci.c
@@ -37,6 +37,10 @@
 #include <linux/kernfs.h>
 #endif

+#if !defined(NV_BUS_TYPE_HAS_IOMMU_OPS)
+#include <linux/iommu.h>
+#endif
+
 static void
 nv_check_and_exclude_gpu(
    nvidia_stack_t *sp,
@@ -530,35 +534,21 @@ nv_pci_probe
    if (pci_dev->is_virtfn)
    {
 #if defined(NV_VGPU_KVM_BUILD)
-        nvl = pci_get_drvdata(pci_dev->physfn);
-        if (!nvl)
+
+#if defined(NV_BUS_TYPE_HAS_IOMMU_OPS)
+        if (pci_dev->dev.bus->iommu_ops == NULL) 
+#else
+        if ((pci_dev->dev.iommu != NULL) && (pci_dev->dev.iommu->iommu_dev != NULL) &&
+            (pci_dev->dev.iommu->iommu_dev->ops == NULL))
+#endif
        {
            nv_printf(NV_DBG_ERRORS, "NVRM: Aborting probe for VF %04x:%02x:%02x.%x "
-                      "since PF is not bound to nvidia driver.\n",
+                      "since IOMMU is not present on the system.\n",
                       NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev),
                       NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn));
            goto failed;
        }

-        if (pci_dev->dev.bus->iommu_ops == NULL) 
-        {
-            nv = NV_STATE_PTR(nvl);
-            if (rm_is_iommu_needed_for_sriov(sp, nv))
-            {
-                nv_printf(NV_DBG_ERRORS, "NVRM: Aborting probe for VF %04x:%02x:%02x.%x "
-                          "since IOMMU is not present on the system.\n",
-                           NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev),
-                           NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn));
-                goto failed;
-            }
-        }
-
-        if (nvidia_vgpu_vfio_probe(pci_dev) != NV_OK)
-        {
-            nv_printf(NV_DBG_ERRORS, "NVRM: Failed to register device to vGPU VFIO module");
-            goto failed;
-        }
-
        nv_kmem_cache_free_stack(sp);
        return 0;
 #else
--- a/kernel-open/nvidia/nv_gpu_ops.h
+++ b/kernel-open/nvidia/nv_gpu_ops.h
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2013-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2013-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@@ -45,6 +45,11 @@ typedef struct gpuObject        *gpuObjectHandle;

 typedef struct gpuRetainedChannel_struct gpuRetainedChannel;

+
+NV_STATUS calculatePCIELinkRateMBps(NvU32 lanes,
+                                    NvU32 pciLinkMaxSpeed,
+                                    NvU32 *pcieLinkRate);
+
 NV_STATUS nvGpuOpsCreateSession(struct gpuSession **session);

 NV_STATUS nvGpuOpsDestroySession(struct gpuSession *session);
@@ -286,11 +291,11 @@ NV_STATUS nvGpuOpsTogglePrefetchFaults(gpuFaultInfo *pFaultInfo,
                                       NvBool bEnable);

 // Interface used for CCSL
-
 NV_STATUS nvGpuOpsCcslContextInit(struct ccslContext_t **ctx,
                                  gpuChannelHandle channel);
 NV_STATUS nvGpuOpsCcslContextClear(struct ccslContext_t *ctx);
-NV_STATUS nvGpuOpsCcslContextUpdate(struct ccslContext_t *ctx);
+NV_STATUS nvGpuOpsCcslRotateKey(UvmCslContext *contextList[],
+                                NvU32 contextListCount);
 NV_STATUS nvGpuOpsCcslRotateIv(struct ccslContext_t *ctx,
                               NvU8 direction);
 NV_STATUS nvGpuOpsCcslEncrypt(struct ccslContext_t *ctx,
@@ -308,6 +313,7 @@ NV_STATUS nvGpuOpsCcslDecrypt(struct ccslContext_t *ctx,
                              NvU32 bufferSize,
                              NvU8 const *inputBuffer,
                              NvU8 const *decryptIv,
+                              NvU32 keyRotationId,
                              NvU8 *outputBuffer,
                              NvU8 const *addAuthData,
                              NvU32 addAuthDataSize,
@@ -323,7 +329,8 @@ NV_STATUS nvGpuOpsIncrementIv(struct ccslContext_t *ctx,
                              NvU8 direction,
                              NvU64 increment,
                              NvU8 *iv);
-NV_STATUS nvGpuOpsLogDeviceEncryption(struct ccslContext_t *ctx,
-                                      NvU32 bufferSize);
+NV_STATUS nvGpuOpsLogEncryption(struct ccslContext_t *ctx,
+                                NvU8 direction,
+                                NvU32 bufferSize);

 #endif /* _NV_GPU_OPS_H_*/
--- a/kernel-open/nvidia/nv_uvm_interface.c
+++ b/kernel-open/nvidia/nv_uvm_interface.c
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2013-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2013-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@@ -1516,16 +1516,23 @@ void nvUvmInterfaceDeinitCslContext(UvmCslContext *uvmCslContext)
 }
 EXPORT_SYMBOL(nvUvmInterfaceDeinitCslContext);

-NV_STATUS nvUvmInterfaceCslUpdateContext(UvmCslContext *uvmCslContext)
+NV_STATUS nvUvmInterfaceCslRotateKey(UvmCslContext *contextList[],
+                                     NvU32 contextListCount)
 {
    NV_STATUS status;
-    nvidia_stack_t *sp = uvmCslContext->nvidia_stack;
+    nvidia_stack_t *sp; // assigned from contextList[0] only after the argument checks below

-    status = rm_gpu_ops_ccsl_context_update(sp, uvmCslContext->ctx);
+    if ((contextList == NULL) || (contextListCount == 0) || (contextList[0] == NULL))
+    {
+        return NV_ERR_INVALID_ARGUMENT; // no list, no entries, or no first context to take a stack from
+    }
+
+    sp = contextList[0]->nvidia_stack; // only entry 0 is NULL-checked here; the rest of the list is passed on as-is
+    status = rm_gpu_ops_ccsl_rotate_key(sp, contextList, contextListCount);

    return status;
 }
-EXPORT_SYMBOL(nvUvmInterfaceCslUpdateContext);
+EXPORT_SYMBOL(nvUvmInterfaceCslRotateKey);

 NV_STATUS nvUvmInterfaceCslRotateIv(UvmCslContext *uvmCslContext,
                                    UvmCslOperation operation)
@@ -1562,6 +1569,7 @@ NV_STATUS nvUvmInterfaceCslDecrypt(UvmCslContext *uvmCslContext,
                                   NvU32 bufferSize,
                                   NvU8 const *inputBuffer,
                                   UvmCslIv const *decryptIv,
+                                   NvU32 keyRotationId,
                                   NvU8 *outputBuffer,
                                   NvU8 const *addAuthData,
                                   NvU32 addAuthDataSize,
@@ -1575,6 +1583,7 @@ NV_STATUS nvUvmInterfaceCslDecrypt(UvmCslContext *uvmCslContext,
                                     bufferSize,
                                     inputBuffer,
                                     (NvU8 *)decryptIv,
+                                     keyRotationId,
                                     outputBuffer,
                                     addAuthData,
                                     addAuthDataSize,
@@ -1625,17 +1634,18 @@ NV_STATUS nvUvmInterfaceCslIncrementIv(UvmCslContext *uvmCslContext,
 }
 EXPORT_SYMBOL(nvUvmInterfaceCslIncrementIv);

-NV_STATUS nvUvmInterfaceCslLogExternalEncryption(UvmCslContext *uvmCslContext,
-                                                 NvU32 bufferSize)
+NV_STATUS nvUvmInterfaceCslLogEncryption(UvmCslContext *uvmCslContext,
+                                         UvmCslOperation operation,
+                                         NvU32 bufferSize)
 {
    NV_STATUS status;
    nvidia_stack_t *sp = uvmCslContext->nvidia_stack;

-    status = rm_gpu_ops_ccsl_log_device_encryption(sp, uvmCslContext->ctx, bufferSize);
+    status = rm_gpu_ops_ccsl_log_encryption(sp, uvmCslContext->ctx, operation, bufferSize); // thin forwarder: uvmCslContext is dereferenced without a NULL check, so it must be valid

    return status;
 }
-EXPORT_SYMBOL(nvUvmInterfaceCslLogExternalEncryption);
+EXPORT_SYMBOL(nvUvmInterfaceCslLogEncryption);

 #else // NV_UVM_ENABLE

--- a/kernel-open/nvidia/nvidia-sources.Kbuild
+++ b/kernel-open/nvidia/nvidia-sources.Kbuild
@@ -41,6 +41,7 @@ NVIDIA_SOURCES += nvidia/libspdm_rsa.c
 NVIDIA_SOURCES += nvidia/libspdm_aead_aes_gcm.c
 NVIDIA_SOURCES += nvidia/libspdm_sha.c
 NVIDIA_SOURCES += nvidia/libspdm_hmac_sha.c
+NVIDIA_SOURCES += nvidia/libspdm_internal_crypt_lib.c
 NVIDIA_SOURCES += nvidia/libspdm_hkdf_sha.c
 NVIDIA_SOURCES += nvidia/libspdm_ec.c
 NVIDIA_SOURCES += nvidia/libspdm_x509.c
--- a/kernel-open/nvidia/nvidia.Kbuild
+++ b/kernel-open/nvidia/nvidia.Kbuild
@@ -161,7 +161,7 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += pci_enable_atomic_ops_to_root
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += vga_tryget
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += cc_platform_has
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += seq_read_iter
-NV_CONFTEST_FUNCTION_COMPILE_TESTS += unsafe_follow_pfn
+NV_CONFTEST_FUNCTION_COMPILE_TESTS += follow_pfn
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_gem_object_get
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_gem_object_put_unlocked
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += add_memory_driver_managed
@@ -228,6 +228,7 @@ NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_tsec_comms_alloc_me
 NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_tsec_comms_free_gscco_mem
 NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_memory_block_size_bytes
 NV_CONFTEST_SYMBOL_COMPILE_TESTS += crypto
+NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_follow_pte

 NV_CONFTEST_TYPE_COMPILE_TESTS += dma_ops
 NV_CONFTEST_TYPE_COMPILE_TESTS += swiotlb_dma_ops
@@ -251,6 +252,7 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += pci_driver_has_driver_managed_dma
 NV_CONFTEST_TYPE_COMPILE_TESTS += vm_area_struct_has_const_vm_flags
 NV_CONFTEST_TYPE_COMPILE_TESTS += memory_failure_has_trapno_arg
 NV_CONFTEST_TYPE_COMPILE_TESTS += foll_longterm_present
+NV_CONFTEST_TYPE_COMPILE_TESTS += bus_type_has_iommu_ops

 NV_CONFTEST_GENERIC_COMPILE_TESTS += dom0_kernel_present
 NV_CONFTEST_GENERIC_COMPILE_TESTS += nvidia_vgpu_kvm_build
--- a/kernel-open/nvidia/nvspdm_cryptlib_extensions.h
+++ b/kernel-open/nvidia/nvspdm_cryptlib_extensions.h
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@@ -38,4 +38,4 @@ bool libspdm_aead_aes_gcm_decrypt_prealloc(void *context,
        const uint8_t *data_in, size_t data_in_size,
        const uint8_t *tag, size_t tag_size,
        uint8_t *data_out, size_t *data_out_size);
-
+bool libspdm_check_crypto_backend(void);
--- a/kernel-open/nvidia/os-mlock.c
+++ b/kernel-open/nvidia/os-mlock.c
@@ -36,10 +36,28 @@ static inline int nv_follow_pfn(struct vm_area_struct *vma,
                                unsigned long address,
                                unsigned long *pfn)
 {
-#if defined(NV_UNSAFE_FOLLOW_PFN_PRESENT)
-    return unsafe_follow_pfn(vma, address, pfn);
-#else
+#if defined(NV_FOLLOW_PFN_PRESENT)
    return follow_pfn(vma, address, pfn);
+#else // no follow_pfn(): look the PFN up through follow_pte() when that symbol is exported
+#if NV_IS_EXPORT_SYMBOL_PRESENT_follow_pte
+    int status; // follow_pte() result; any non-zero value is handed back to the caller
+    spinlock_t *ptl;
+    pte_t *ptep;
+
+    if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+        return -EINVAL; // was "return status" with status == 0: success reported with *pfn never written
+
+    status = follow_pte(vma, address, &ptep, &ptl);
+    if (status)
+        return status;
+    *pfn = pte_pfn(ptep_get(ptep));
+
+    // follow_pte() acquired ptl on success; drop it (and the PTE mapping) now that *pfn is read
+    pte_unmap_unlock(ptep, ptl);
+    return 0;
+#else // NV_IS_EXPORT_SYMBOL_PRESENT_follow_pte
+    return -1; // neither follow_pfn() nor follow_pte() is available: always fail
+#endif // NV_IS_EXPORT_SYMBOL_PRESENT_follow_pte
 #endif
 }